Fix AlphaDropout implementation and add tests #1781

Merged · 3 commits · Nov 26, 2021
9 changes: 6 additions & 3 deletions NEWS.md
@@ -1,5 +1,8 @@
# Flux Release Notes

## v0.12.9
* Fixed incorrect output and added GPU compatibility for [AlphaDropout](https://github.com/FluxML/Flux.jl/pull/1781).

## v0.12.8
* Optimized inference and gradient calculation of `OneHotMatrix` ([PR](https://github.com/FluxML/Flux.jl/pull/1756))

@@ -12,7 +15,7 @@
* REPL printing via [`show`](https://github.com/FluxML/Flux.jl/pull/1467) displays parameter counts.

## v0.12.4
* Implemented an [`Embedding layer`](https://github.com/FluxML/Flux.jl/pull/1516)
based on `NNlib.gather` and `NNlib.scatter`.

## v0.12.1 - v0.12.3
@@ -37,8 +40,8 @@
* New [`Parallel` layer](https://github.com/FluxML/Flux.jl/pull/1462) adds inception module-like building blocks.
* Feature additions and bug fixes for BatchNorm, LayerNorm, InstanceNorm, and GroupNorm [normalization layers](https://github.com/FluxML/Flux.jl/pull/1397)
* Added [Upsample and PixelShuffle layers](https://github.com/FluxML/Flux.jl/pull/1468)
* End of deprecation cycle: loss functions cannot be accessed directly from `Flux` anymore, they live in the `Flux.Losses` module.
  All loss functions perform `mean` aggregation by default.

## v0.11.2

21 changes: 11 additions & 10 deletions src/layers/normalise.jl
@@ -101,17 +101,18 @@ mutable struct AlphaDropout{F}
end
end

function (a::AlphaDropout)(x)
function (a::AlphaDropout)(x::AbstractArray{T}) where T
_isactive(a) || return x
λ = eltype(x)(1.0507009873554804934193349852946)
α = eltype(x)(1.6732632423543772848170429916717)
α1 = eltype(x)(-λ*α)
noise = randn(eltype(x), size(x))
x = @. x*(noise > (1 - a.p)) + α1 * (noise < (1 - a.p))
A = sqrt(a.p + a.p * (1 - a.p) * α1^2)
B = -A * α1 * (1 - a.p)
x = @. A * x + B
return x
p = a.p
iszero(p) && return x
isone(p) && return sign.(x) .* T(0)
Member: Is this correct? Are we intentionally creating -0.0 values here?

Member Author: Yes, see point 2 in #1781 (comment).
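
For reference, a short REPL-style sketch (not from the PR) of the signed zeros in question: multiplying a zero by a negative sign flips its sign bit, yet the result still compares equal to `zero(x)`.

```julia
x = [1.0, -2.0, 3.0]
y = sign.(x) .* 0.0   # [0.0, -0.0, 0.0]
signbit.(y)           # [false, true, false] -- the middle element is a negative zero
y == zero(x)          # true, because -0.0 == 0.0
```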


α′ = T(-1.7580993408473766) # selu(-Inf) == -λα
Member: We can use oftype here.

Member Author (@ToucheSir, Nov 26, 2021): oftype requires a value of type T, but the only way to get that in this function would be to extract an element of x (which would fail if x is empty and might also trigger the getindex adjoint unnecessarily). Since this function should be specializing on eltype anyhow, it makes sense to take advantage of that for casting.
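
A hedged sketch (not from the PR) of the trade-off described above: `oftype` needs an existing value of the target type, which here would mean indexing into `x`, while dispatching on the element type parameter also covers empty arrays.

```julia
# Illustrative only: casting a literal via the array's element type parameter
# versus via `oftype` on an element of the array.
cast_eltype(x::AbstractArray{T}) where T = T(-1.7580993408473766)
cast_oftype(x) = oftype(x[1], -1.7580993408473766)  # needs at least one element

cast_eltype(rand(Float32, 3))   # -1.7580993f0
cast_eltype(Float32[])          # still works on an empty array
# cast_oftype(Float32[])        # would throw a BoundsError
```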

A = T(inv(sqrt((1 - p) * (1 + p * α′^2))))
B = T(-A * α′ * p)

noise = rand!(similar(x))
return A .* ifelse.(noise .> p, x, α′) .+ B
end

testmode!(m::AlphaDropout, mode=true) =
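
To make the constants in the new forward pass concrete: for standard-normal input, an element is kept with probability 1 - p and otherwise replaced by α′, so the masked value has mean p*α′ and variance (1 - p)*(1 + p*α′^2); `A` and `B` are the affine correction that restores zero mean and unit variance. A small self-contained check (illustrative, not code from the PR):

```julia
using Statistics

# Numerically verify that the A/B affine correction restores zero mean and
# unit variance for standard-normal input.
p  = 0.5
α′ = -1.7580993408473766                  # selu(-Inf) == -λα
A  = inv(sqrt((1 - p) * (1 + p * α′^2)))
B  = -A * α′ * p

x     = randn(10^6)
noise = rand(length(x))
y     = A .* ifelse.(noise .> p, x, α′) .+ B

mean(y), var(y)   # both close to 0 and 1, up to sampling error
```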
9 changes: 2 additions & 7 deletions test/cuda/layers.jl
@@ -10,13 +10,8 @@
@test gradient(x -> sum(cpu(x)), gpu(rand(3,3))) isa Tuple
end

# TODO: These layers get into scalar indexing
# `AlphaDropout` throws a compilation error on GPUs,
# whereas, the rest are scalar indexing issues.
# The norm layers behave differently on the CPU and
# the GPU too.
const BROKEN_LAYERS = Union{DepthwiseConv,
AlphaDropout}
# TODO: These layers get into scalar indexing issues.
const BROKEN_LAYERS = Union{DepthwiseConv}
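
For context, `BROKEN_LAYERS` is a `Union` of layer types used to mark GPU tests as known-broken; dropping `AlphaDropout` from it means its GPU gradient tests now run for real. A rough sketch of how such a gate can be used (the helper `run_gpu_gradtest` is hypothetical, standing in for the actual harness in this file):

```julia
using Test

# Illustrative only: gate a GPU gradient test on membership in BROKEN_LAYERS.
# `run_gpu_gradtest` is a hypothetical helper, not the real test harness.
function check_gpu_layer(layer, x)
  if layer isa BROKEN_LAYERS
    @test_broken run_gpu_gradtest(layer, x)
  else
    @test run_gpu_gradtest(layer, x)
  end
end
```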

const ACTIVATIONS = [identity, relu, tanh,
sigmoid, exp, softplus,
28 changes: 28 additions & 0 deletions test/layers/normalisation.jl
@@ -57,6 +57,34 @@ evalwgrad(f, x...) = pullback(f, x...)[1]
@test count(a->a == 0, y) == 0
end

@testset "AlphaDropout" begin
x = [1., 2., 3.]
@test x == AlphaDropout(0.1)(x)
@test x == evalwgrad(AlphaDropout(0), x)
@test zero(x) == evalwgrad(AlphaDropout(1), x)

x = randn(1000) # large enough to prevent flaky test
m = AlphaDropout(0.5)

y = evalwgrad(m, x)
# Should preserve unit mean and variance
@test mean(y) ≈ 0 atol=0.1
@test var(y) ≈ 1 atol=0.1

testmode!(m, true) # should override istraining
@test evalwgrad(m, x) == x

testmode!(m, false)
y = evalwgrad(m, x)
@test mean(y) ≈ 0 atol=0.1
@test var(y) ≈ 1 atol=0.1

# Known good value ranges
# Values taken from https://github.com/pytorch/pytorch/blob/v1.10.0/test/cpp/api/modules.cpp#L1337-L1338
x = ones(100)
@test 40 < sum(evalwgrad(m, x)) < 130
end

@testset "BatchNorm" begin
let m = BatchNorm(2), x = [1.0 3.0 5.0;
2.0 4.0 6.0]