diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index 0b5e04fb14..fe15f26b38 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -113,7 +113,7 @@ LayerNorm(h::Integer) =
 
 @functor LayerNorm
 
-(a::LayerNorm)(x) = a.diag(normalise(x))
+(a::LayerNorm)(x) = a.diag(normalise(x, dims=1))
 
 function Base.show(io::IO, l::LayerNorm)
   print(io, "LayerNorm(", length(l.diag.α), ")")
diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl
index f652b912fa..233384922c 100644
--- a/src/layers/stateless.jl
+++ b/src/layers/stateless.jl
@@ -286,35 +286,14 @@ end
 # TODO normalise over last dimension is typically what you want to do.
 # Deprecation path: `normalise(x; dims=1)` -> `normalise(x; dims)` -> `normalise(x; dims=size(x)[end])`
 """
-    normalise(x; dims, ϵ=1e-6)
+    normalise(x; dims, ϵ=1e-5)
 
 Normalise `x` to mean 0 and standard deviation 1 across the dimensions given by `dims`.
-Defaults to normalising over columns.
 `ϵ` is a small additive factor added to the denominator for numerical stability.
-
-```jldoctest
-julia> a = reshape(collect(1:9), 3, 3)
-3×3 Array{Int64,2}:
- 1  4  7
- 2  5  8
- 3  6  9
-
-julia> Flux.normalise(a, dims=1)
-3×3 Array{Float64,2}:
- -1.22474  -1.22474  -1.22474
-  0.0       0.0       0.0
-  1.22474   1.22474   1.22474
-
-julia> Flux.normalise(a, dims=2)
-3×3 Array{Float64,2}:
- -1.22474  0.0  1.22474
- -1.22474  0.0  1.22474
- -1.22474  0.0  1.22474
-```
 """
-function normalise(x::AbstractArray; dims, ϵ=ofeltype(x, 1e-6))
-  μ′ = mean(x, dims=dims)
-  # σ′ = std(x, dims=dims, mean=μ′, corrected=false) # use this when #478 gets merged
-  σ′ = std(x, dims=dims, corrected=false)
-  return (x .- μ′) ./ (σ′.+ ϵ)
+function normalise(x::AbstractArray; dims, ϵ=ofeltype(x, 1e-5))
+  μ = mean(x, dims=dims)
+  # σ = std(x, dims=dims, mean=μ, corrected=false) # use this when #478 gets merged
+  σ = std(x, dims=dims, corrected=false)
+  return (x .- μ) ./ (σ .+ ϵ)
 end
\ No newline at end of file
diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl
index 28fc22ac52..83cdf10222 100644
--- a/test/cuda/cuda.jl
+++ b/test/cuda/cuda.jl
@@ -1,6 +1,7 @@
 using Flux, Test
 using Flux.CuArrays
 using Flux: gpu
+using Statistics: mean
 
 @info "Testing GPU Support"
 
@@ -27,9 +28,9 @@ cm = gpu(m)
 x = [1.,2.,3.]
 cx = gpu(x)
 
-@test Flux.crossentropy(x,x) ≈ Flux.crossentropy(cx,cx)
-@test Flux.crossentropy(x,x, weight=1.0) ≈ Flux.crossentropy(cx,cx, weight=1.0)
-@test Flux.crossentropy(x,x, weight=[1.0;2.0;3.0]) ≈ Flux.crossentropy(cx,cx, weight=cu([1.0;2.0;3.0]))
+@test Flux.crossentropy(x,x) ≈ Flux.crossentropy(cx,cx)
+@test Flux.crossentropy(x,x, agg=identity) ≈ Flux.crossentropy(cx,cx, agg=identity) |> cpu
+@test Flux.crossentropy(x,x, agg=x->mean([1.0;2.0;3.0].*x)) ≈ Flux.crossentropy(cx,cx, agg=x->mean(cu([1.0;2.0;3.0]).*x))
 
 x = [-1.1491, 0.8619, 0.3127]
 y = [1, 1, 0.]
diff --git a/test/cuda/layers.jl b/test/cuda/layers.jl
index 56b40c7b3b..4e8fbd41df 100644
--- a/test/cuda/layers.jl
+++ b/test/cuda/layers.jl
@@ -90,7 +90,7 @@ end
 for layer in stateless_layers
   if layer == Flux.normalise
-    stateless_gradtest(layer, x)
+    stateless_gradtest(x -> layer(x, dims=1), x)
   else
     stateless_gradtest(layer, x, y)
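
For reviewers, a minimal sketch of the API after this patch (not part of the diff; the toy arrays `a`, `ŷ`, `y` are made up for illustration, and exact values depend on the new `ϵ = 1e-5`):

```julia
using Flux
using Statistics: mean, std

a = reshape(collect(Float32, 1:9), 3, 3)

# `dims` is a required keyword of `normalise`, which is why the LayerNorm
# change above passes dims=1 explicitly (normalise over columns/features).
x = Flux.normalise(a, dims=1)
mean(x, dims=1)                  # each column ≈ 0
std(x, dims=1, corrected=false)  # each column ≈ 1 (slightly below, due to ϵ)

# Losses take an `agg` function in place of `weight`, as in the updated
# tests: `agg` is applied to the un-aggregated loss, so `identity` returns
# it raw and a closure can recover a weighted mean.
ŷ = [0.2f0, 0.5f0, 0.3f0]; y = [0f0, 1f0, 0f0]
Flux.crossentropy(ŷ, y)                                      # default: agg = mean
Flux.crossentropy(ŷ, y, agg=identity)                        # un-aggregated
Flux.crossentropy(ŷ, y, agg=l -> mean([1f0, 2f0, 3f0] .* l)) # weighted mean
```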