diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index 0b5e04fb14..fe15f26b38 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -113,7 +113,7 @@ LayerNorm(h::Integer) =
 
 @functor LayerNorm
 
-(a::LayerNorm)(x) = a.diag(normalise(x))
+(a::LayerNorm)(x) = a.diag(normalise(x, dims=1))
 
 function Base.show(io::IO, l::LayerNorm)
   print(io, "LayerNorm(", length(l.diag.α), ")")
diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl
index f652b912fa..233384922c 100644
--- a/src/layers/stateless.jl
+++ b/src/layers/stateless.jl
@@ -286,35 +286,14 @@ end
 # TODO normalise over last dimension is typically what you want to do.
 # Deprecation path: `normalise(x; dims=1)` -> `normalise(x; dims)` -> `normalise(x; dims=size(x)[end])`
 """
-    normalise(x; dims, ϵ=1e-6)
+    normalise(x; dims, ϵ=1e-5)
 
 Normalise `x` to mean 0 and standard deviation 1 across the dimensions given by `dims`.
-Defaults to normalising over columns.
 `ϵ` is a small additive factor added to the denominator for numerical stability.
-
-```jldoctest
-julia> a = reshape(collect(1:9), 3, 3)
-3×3 Array{Int64,2}:
- 1  4  7
- 2  5  8
- 3  6  9
-
-julia> Flux.normalise(a, dims=1)
-3×3 Array{Float64,2}:
- -1.22474  -1.22474  -1.22474
-  0.0       0.0       0.0
-  1.22474   1.22474   1.22474
-
-julia> Flux.normalise(a, dims=2)
-3×3 Array{Float64,2}:
- -1.22474  0.0  1.22474
- -1.22474  0.0  1.22474
- -1.22474  0.0  1.22474
-```
 """
-function normalise(x::AbstractArray; dims, ϵ=ofeltype(x, 1e-6))
-  μ′ = mean(x, dims=dims)
-  # σ′ = std(x, dims=dims, mean=μ′, corrected=false) # use this when #478 gets merged
-  σ′ = std(x, dims=dims, corrected=false)
-  return (x .- μ′) ./ (σ′.+ ϵ)
+function normalise(x::AbstractArray; dims, ϵ=ofeltype(x, 1e-5))
+  μ = mean(x, dims=dims)
+  # σ = std(x, dims=dims, mean=μ, corrected=false) # use this when #478 gets merged
+  σ = std(x, dims=dims, corrected=false)
+  return (x .- μ) ./ (σ .+ ϵ)
 end
\ No newline at end of file
diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl
index 28fc22ac52..83cdf10222 100644
--- a/test/cuda/cuda.jl
+++ b/test/cuda/cuda.jl
@@ -1,6 +1,7 @@
 using Flux, Test
 using Flux.CuArrays
 using Flux: gpu
+using Statistics: mean
 
 @info "Testing GPU Support"
 
@@ -27,9 +28,9 @@ cm = gpu(m)
 x = [1.,2.,3.]
 cx = gpu(x)
 
-@test Flux.crossentropy(x,x) ≈ Flux.crossentropy(cx,cx)
-@test Flux.crossentropy(x,x, weight=1.0) ≈ Flux.crossentropy(cx,cx, weight=1.0)
-@test Flux.crossentropy(x,x, weight=[1.0;2.0;3.0]) ≈ Flux.crossentropy(cx,cx, weight=cu([1.0;2.0;3.0]))
+@test Flux.crossentropy(x,x) ≈ Flux.crossentropy(cx,cx)
+@test Flux.crossentropy(x,x, agg=identity) ≈ Flux.crossentropy(cx,cx, agg=identity) |> cpu
+@test Flux.crossentropy(x,x, agg=x->mean([1.0;2.0;3.0].*x)) ≈ Flux.crossentropy(cx,cx, agg=x->mean(cu([1.0;2.0;3.0]).*x))
 
 x = [-1.1491, 0.8619, 0.3127]
 y = [1, 1, 0.]
diff --git a/test/cuda/layers.jl b/test/cuda/layers.jl
index 56b40c7b3b..4e8fbd41df 100644
--- a/test/cuda/layers.jl
+++ b/test/cuda/layers.jl
@@ -90,7 +90,7 @@ end
 for layer in stateless_layers
   if layer == Flux.normalise
-    stateless_gradtest(layer, x)
+    stateless_gradtest(x -> layer(x, dims=1), x)
   else
     stateless_gradtest(layer, x, y)
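
For reviewers, a minimal sketch of the API after this patch (not part of the diff; the toy arrays `a`, `ŷ`, `y` are made up for illustration, and exact values depend on the new `ϵ = 1e-5`):

```julia
using Flux
using Statistics: mean, std

a = reshape(collect(Float32, 1:9), 3, 3)

# `dims` is a required keyword of `normalise`, which is why the LayerNorm
# change above passes dims=1 explicitly (normalise over columns/features).
x = Flux.normalise(a, dims=1)
mean(x, dims=1)                  # each column ≈ 0
std(x, dims=1, corrected=false)  # each column ≈ 1 (slightly below, due to ϵ)

# Losses take an `agg` function in place of `weight`, as in the updated
# tests: `agg` is applied to the un-aggregated loss, so `identity` returns
# it raw and a closure can recover a weighted mean.
ŷ = [0.2f0, 0.5f0, 0.3f0]; y = [0f0, 1f0, 0f0]
Flux.crossentropy(ŷ, y)                                      # default: agg = mean
Flux.crossentropy(ŷ, y, agg=identity)                        # un-aggregated
Flux.crossentropy(ŷ, y, agg=l -> mean([1f0, 2f0, 3f0] .* l)) # weighted mean
```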