
Commit

Merge #1263
1263: dropout function always active r=DhairyaLGandhi a=CarloLucibello

Fix #1084 according to the suggestions of @AzamatB and @jondeuce

### PR Checklist

- [x] Tests are added
- [x] Entry in NEWS.md
- [ ] Documentation, if applicable
- [ ] Final review from `@MikeInnes` or `@dhairyagandhi96` (for API changes).


Co-authored-by: CarloLucibello <carlo.lucibello@gmail.com>
bors[bot] and CarloLucibello authored Jul 9, 2020
2 parents 630d6c5 + 5b4b19e commit f8001cf
Showing 3 changed files with 47 additions and 7 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
@@ -17,6 +17,8 @@
* Testing suite improvements now test for gradients of all layers along with GPU support.
* Functors have now moved to [Functors.jl](https://github.com/FluxML/Flux.jl/pull/1174) to allow for their use outside of Flux.
* Added [helper functions](https://github.com/FluxML/Flux.jl/pull/873) `Flux.convfilter` and `Flux.depthwiseconvfilter` to construct weight arrays for convolutions outside of layer constructors so as to not have to depend on the default layers for custom implementations.
* `dropout` function now has a mandatory [active](https://github.com/FluxML/Flux.jl/pull/1263)
  keyword argument. The `Dropout` struct (whose behavior is left unchanged) is the recommended choice for common usage.
* and many more fixes and additions...

# v0.10.1 - v0.10.4
35 changes: 28 additions & 7 deletions src/layers/normalise.jl
@@ -9,22 +9,43 @@ _dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(size(s)))...)

_dropout_kernel(y::T, p, q) where {T} = y > p ? T(1 / q) : T(0)

# TODO set active's default to true in v0.12
# or deprecate the keyword altogether
"""
dropout(x, p; dims = :)
dropout(x, p; dims=:, active::Bool)
The dropout function. For each input, either sets that input to `0` (with probability
The dropout function. If `active` is `true`,
for each input, either sets that input to `0` (with probability
`p`) or scales it by `1 / (1 - p)`. `dims` specifies the unbroadcasted dimensions,
e.g. `dims=1` applies dropout along columns and `dims=2` along rows.
This is used as a regularisation, i.e. it reduces overfitting during training.
See also the [`Dropout`](@ref) layer.
If `active` is `false`, it just returns the input `x`
Warning: when using this function, you have to manually manage the activation
state. Usually in fact, dropout is used while training
but is deactivated in the inference phase. This can be
automatically managed using the [`Dropout`](@ref) layer instead of the
`dropout` function.
The [`Dropout`](@ref) layer is what you should use in most scenarios.
"""
function dropout(x, p; dims=:, active::Bool)
  active || return x
  y = dropout_mask(x, p, dims=dims)
  return x .* y
end

@adjoint function dropout(x, p; dims=:, active::Bool)
  active || return x, Δ -> (Δ, nothing)
  y = dropout_mask(x, p, dims=dims)
  return x .* y, Δ -> (Δ .* y, nothing)
end

function dropout_mask(x, p; dims=:)
  y = rand!(similar(x, _dropout_shape(x, dims)))
  y .= _dropout_kernel.(y, p, 1 - p)
  return y
end

"""
@@ -50,7 +71,7 @@ end

function (a::Dropout)(x)
  _isactive(a) || return x
  return dropout(x, a.p; dims=a.dims, active=true)
end

testmode!(m::Dropout, mode = true) =
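For reference, a minimal usage sketch of the functional and layer forms changed above (assuming Flux with this patch; the mask is random, so the exact counts vary between runs):

using Flux, Statistics

x = ones(Float32, 10_000)

# Functional form: `active` is now mandatory and must be managed by the caller.
y = Flux.dropout(x, 0.3; active=true)
mean(y)                         # ≈ 1.0: survivors are scaled by 1 / (1 - p)
count(iszero, y) / length(y)    # ≈ 0.3: fraction of inputs set to zero

Flux.dropout(x, 0.3; active=false) == x   # true: identity when inactive

# With dims=1 the mask is drawn once per row and broadcast across columns,
# so each row of a matrix is either zeroed entirely or kept and rescaled.
X = ones(Float32, 4, 6)
Y = Flux.dropout(X, 0.5; dims=1, active=true)

# The custom adjoint reuses the forward mask, so dropped entries also get a
# zero gradient and kept entries a gradient of 1 / (1 - p).
g, = gradient(x -> sum(Flux.dropout(x, 0.3; active=true)), x)
count(iszero, g) / length(g)    # ≈ 0.3

# Layer form (behaviour unchanged by this PR): the activation state is handled
# automatically, or explicitly via trainmode!/testmode!.
m = Dropout(0.3)
trainmode!(m); m(x)   # dropout applied
testmode!(m);  m(x)   # identity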
17 changes: 17 additions & 0 deletions test/layers/normalisation.jl
@@ -38,6 +38,23 @@ evalwgrad(f, x...) = pullback(f, x...)[1]
y = m(x)
c = map(i->count(a->a==0, @view y[:, i]), 1:50)
@test minimum(c) == maximum(c)

# issue #1084
m = Dropout(0.9)
x = rand(100)

testmode!(m)
y = m(x)
@test count(a->a == 0, y) == 0
trainmode!(m)
y = m(x)
@test count(a->a == 0, y) > 50

y = Flux.dropout(x, 0.9, active=true)
@test count(a->a == 0, y) > 50

y = Flux.dropout(x, 0.9, active=false)
@test count(a->a == 0, y) == 0
end

@testset "BatchNorm" begin
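Since the functional form leaves the activation state to the caller (see the docstring warning above), a custom layer built on `Flux.dropout` has to thread that state through explicitly. A sketch under that assumption; `MyBlock`, its fields, and the `training` keyword are illustrative and not part of Flux:

using Flux

# Hypothetical block that calls the functional form directly; the caller
# decides when dropout is active instead of relying on the Dropout layer.
struct MyBlock{W}
  weight::W
  p::Float64
end

function (m::MyBlock)(x; training::Bool)
  h = m.weight * x
  return Flux.dropout(h, m.p; active=training)
end

m = MyBlock(randn(Float32, 3, 5), 0.5)
x = randn(Float32, 5, 16)

y_train = m(x; training=true)    # inputs dropped and survivors rescaled
y_infer = m(x; training=false)   # identity, as checked in the tests above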
