From e33de0ce11c50b1625162bdbd81f1a2685f35fae Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Tue, 31 Jan 2023 20:32:56 -0500 Subject: [PATCH] Move `dropout` to NNlib (#2150) * use NNlib.dropout, deprecate Flux.dropout * improve Dropout's docstring * make Dropout(0) === identity, cannot mutate * NNlibCUDA = 0.2.5 * NNlibCUDA = 0.2.6 * simplify default_rng etc * Revert "simplify default_rng etc" This reverts commit 0e396a6f09b97c23d4b16ffcc2cd06ccd29353ae. * un-revert the removal of the active=true method * avoid a branch * Update src/layers/normalise.jl Co-authored-by: Carlo Lucibello * Apply suggestions from code review Co-authored-by: Kyle Daruwalla --------- Co-authored-by: Carlo Lucibello Co-authored-by: Kyle Daruwalla --- Project.toml | 4 +- docs/src/models/layers.md | 2 +- src/deprecations.jl | 1 + src/layers/normalise.jl | 121 +++++++++++++---------------------- test/layers/normalisation.jl | 6 +- 5 files changed, 50 insertions(+), 84 deletions(-) diff --git a/Project.toml b/Project.toml index b4c3dc6d75..d189304f78 100644 --- a/Project.toml +++ b/Project.toml @@ -30,8 +30,8 @@ ChainRulesCore = "1.12" Functors = "0.3, 0.4" MLUtils = "0.2, 0.3.1, 0.4" MacroTools = "0.5" -NNlib = "0.8.14" -NNlibCUDA = "0.2.4" +NNlib = "0.8.15" +NNlibCUDA = "0.2.6" OneHotArrays = "0.1, 0.2" Optimisers = "0.2.12" ProgressLogging = "0.1" diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index b8345c3f78..c0e1c57307 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -123,7 +123,7 @@ LayerNorm InstanceNorm GroupNorm Flux.normalise -Flux.dropout +NNlib.dropout ``` ### Test vs. Train diff --git a/src/deprecations.jl b/src/deprecations.jl index 8625c458e0..a99297649b 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -185,6 +185,7 @@ function update!(opt::Optimise.AbstractOptimiser, ::Params, grads::Union{Tuple, """) end + # v0.14 deprecations # Enable these when 0.14 is released, and delete const ClipGrad = Optimise.ClipValue etc: diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 7ee28f64e9..e9362313ab 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -1,112 +1,77 @@ - +# Internal function, used only for layers defined in this file. _isactive(m, x) = isnothing(m.active) ? NNlib.within_gradient(x) : m.active -_dropout_shape(s, ::Colon) = size(s) -_dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(size(s)))...) - -_dropout_kernel(y::T, p, q) where {T} = y > p ? T(1 / q) : T(0) - -""" - dropout([rng = rng_from_array(x)], x, p; dims=:, active=true) - -The dropout function. If `active` is `true`, -for each input, either sets that input to `0` (with probability -`p`) or scales it by `1 / (1 - p)`. `dims` specifies the unbroadcasted dimensions, -e.g. `dims=1` applies dropout along columns and `dims=2` along rows. -If `active` is `false`, it just returns the input `x`. - -Specify `rng` for custom RNGs instead of the default RNG. -Note that custom RNGs are only supported on the CPU. - -Warning: when using this function, you have to manually manage the activation -state. Usually in fact, dropout is used while training -but is deactivated in the inference phase. This can be -automatically managed using the [`Dropout`](@ref) layer instead of the -`dropout` function. - -The [`Dropout`](@ref) layer is what you should use in most scenarios. 
-""" -function dropout(rng, x, p; dims=:, active::Bool=true) - active || return x - y = dropout_mask(rng, x, p, dims=dims) - return x .* y -end -dropout(x, p; kwargs...) = dropout(rng_from_array(x), x, p; kwargs...) - -dropout_mask(rng::CUDA.RNG, x::CuArray, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...) -dropout_mask(rng, x::CuArray, p; kwargs...) = - throw(ArgumentError("x isa CuArray, but rng isa $(typeof(rng)). dropout_mask only support CUDA.RNG for CuArrays.")) -dropout_mask(rng, x, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...) -function _dropout_mask(rng, x, p; dims=:) - realfptype = float(real(eltype(x))) - y = rand!(rng, similar(x, realfptype, _dropout_shape(x, dims))) - y .= _dropout_kernel.(y, p, 1 - p) - return y -end - -# TODO move this to NNlib -ChainRulesCore.@non_differentiable dropout_mask(::Any, ::Any, ::Any) - """ - Dropout(p; dims=:, rng = default_rng_value()) + Dropout(p; [dims, rng]) -Dropout layer. +Layer implementing [dropout](https://arxiv.org/abs/1207.0580) with the given probability. +This is used as a regularisation, i.e. to reduce overfitting. -While training, for each input, this layer either sets that input to `0` (with probability -`p`) or scales it by `1 / (1 - p)`. To apply dropout along certain dimension(s), specify the -`dims` keyword. e.g. `Dropout(p; dims = 3)` will randomly zero out entire channels on WHCN input -(also called 2D dropout). This is used as a regularisation, i.e. it reduces overfitting during -training. +While training, it sets each input to `0` (with probability `p`) +or else scales it by `1 / (1 - p)`, using the [`NNlib.dropout`](@ref) function. +While testing, it has no effect. -In the forward pass, this layer applies the [`Flux.dropout`](@ref) function. See that for more -details. +By default the mode will switch automatically, but it can also +be controlled manually via [`Flux.testmode!`](@ref). -Specify `rng` to use a custom RNG instead of the default. -Custom RNGs are only supported on the CPU. +By default every input is treated independently. With the `dims` keyword, +instead it takes a random choice only along that dimension. +For example `Dropout(p; dims = 3)` will randomly zero out entire channels on WHCN input +(also called 2D dropout). -Does nothing to the input once [`Flux.testmode!`](@ref) is `true`. +Keyword `rng` lets you specify a custom random number generator. +(Only supported on the CPU.) 
# Examples -```jldoctest -julia> m = Chain(Dense(1 => 1), Dropout(1)); +```julia +julia> m = Chain(Dense(ones(3,2)), Dropout(0.4)) +Chain( + Dense(2 => 3), # 9 parameters + Dropout(0.4), +) -julia> Flux.trainmode!(m); +julia> m(ones(2, 7)) # test mode, no effect +3×7 Matrix{Float64}: + 2.0 2.0 2.0 2.0 2.0 2.0 2.0 + 2.0 2.0 2.0 2.0 2.0 2.0 2.0 + 2.0 2.0 2.0 2.0 2.0 2.0 2.0 -julia> y = m([1]); +julia> Flux.trainmode!(m); # would happen within gradient -julia> y == [0] -true +julia> m(ones(2, 7)) +3×7 Matrix{Float64}: + 0.0 0.0 3.33333 0.0 0.0 0.0 0.0 + 3.33333 0.0 3.33333 0.0 3.33333 0.0 3.33333 + 3.33333 3.33333 0.0 3.33333 0.0 0.0 3.33333 -julia> m = Chain(Dense(1000 => 1000), Dropout(0.5)); +julia> y = m(ones(2, 10_000)); -julia> Flux.trainmode!(m); +julia> using Statistics -julia> y = m(ones(1000)); +julia> mean(y) # is about 2.0, as for test mode +1.9892222222222182 -julia> isapprox(count(==(0), y) / length(y), 0.5, atol=0.1) -true +julia> mean(iszero, y) # is about 0.4 +0.40323333333333333 ``` """ -mutable struct Dropout{F,D,R<:AbstractRNG} +mutable struct Dropout{F<:Real,D,R<:AbstractRNG} p::F dims::D active::Union{Bool, Nothing} rng::R end -Dropout(p, dims, active) = Dropout(p, dims, active, default_rng_value()) +Dropout(p::Real, dims, active) = Dropout(p, dims, active, default_rng_value()) -function Dropout(p; dims=:, rng = default_rng_value()) - @assert 0 ≤ p ≤ 1 +function Dropout(p::Real; dims=:, rng = default_rng_value()) + 0 ≤ p ≤ 1 || throw(ArgumentError("Dropout expects 0 ≤ p ≤ 1, got p = $p")) Dropout(p, dims, nothing, rng) end @functor Dropout trainable(a::Dropout) = (;) -function (a::Dropout)(x) - _isactive(a, x) || return x - return dropout(a.rng, x, a.p; dims=a.dims, active=true) -end +(a::Dropout)(x) = dropout(a.rng, x, a.p * _isactive(a, x); dims=a.dims) testmode!(m::Dropout, mode=true) = (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index 2aa26bb2a7..6a3d85756d 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -1,4 +1,4 @@ -using Flux, Test, Statistics +using Flux, Test, Statistics, Random using Zygote: pullback, ForwardDiff evalwgrad(f, x...) = pullback(f, x...)[1] @@ -56,10 +56,10 @@ evalwgrad(f, x...) = pullback(f, x...)[1] y = m(x) @test count(a->a == 0, y) > 50 - y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active=true) + y = Flux.dropout(values(rng_kwargs)..., x, 0.9) # , active=true) @test count(a->a == 0, y) > 50 - y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active=false) + y = Flux.dropout(values(rng_kwargs)..., x, 0.9 * 0) # , active=false) @test count(a->a == 0, y) == 0 # CPU RNGs map onto CPU ok
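
A minimal usage sketch of the API this patch settles on, for readers trying it out. This is not part of the diff itself; it assumes Flux with this change applied plus NNlib ≥ 0.8.15 (a Flux dependency), where the functional `dropout(x, p; dims=:)` now lives. Variable names here are illustrative only.

```julia
using Flux, NNlib, Statistics, Random

x = ones(Float32, 3, 100)

# Functional form, now provided by NNlib rather than Flux itself.
y = NNlib.dropout(x, 0.4)          # each element is either 0 or x[i] / (1 - 0.4)
mean(iszero, y)                    # roughly 0.4

# Optional explicit RNG (CPU only), and `dims` to zero out whole slices at once.
y1 = NNlib.dropout(MersenneTwister(1), x, 0.4; dims=1)

# The layer form switches itself on and off; here the mode is forced by hand.
m = Dropout(0.4)
Flux.trainmode!(m)                 # normally happens automatically inside gradient calls
any(iszero, m(x))                  # true: some inputs were zeroed, the rest rescaled
Flux.testmode!(m)
any(iszero, m(x))                  # false: no effect at test time
```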