
Use float(η), and fixed eps #120

Closed · wants to merge 1 commit
3 changes: 3 additions & 0 deletions src/interface.jl
@@ -206,3 +206,6 @@ Broadcast.materialize(x::Lazy) = Broadcast.instantiate(x.bc)
 
 onevalue(λ::T, x::AbstractArray{T}) where T = map(_ -> λ, x)
 onevalue(λ, x::AbstractArray{T}) where T = onevalue(convert(float(T), λ), x)
+
+float_typeof(η) = float(typeof(η))
+float_typeof(::Integer) = Float32
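
For context: the new helper keeps floating-point learning rates at their own precision, but promotes integers to `Float32` (matching the `1f-3`-style defaults used throughout rules.jl) rather than the `Float64` that plain `float(typeof(η))` would give. A minimal sketch of the intended behaviour, not part of the diff:

```julia
# Hypothetical REPL session illustrating the two methods above.
float_typeof(1e-3)   # Float64 — float(Float64) is Float64
float_typeof(1f-3)   # Float32 — float(Float32) is Float32
float_typeof(1)      # Float32 — the Integer method overrides float(Int) == Float64
```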
49 changes: 25 additions & 24 deletions src/rules.jl
@@ -20,6 +20,7 @@ struct Descent{T} <: AbstractRule
   eta::T
 end
 Descent() = Descent(1f-1)
+Descent(η::Integer) = Descent(Float32(η))  # float ensures that adjust! can change this
 
 init(o::Descent, x::AbstractArray) = nothing
 
@@ -44,7 +45,7 @@ struct Momentum{T} <: AbstractRule
   eta::T
   rho::T
 end
-Momentum(η = 1f-2, ρ = 9f-1) = Momentum{typeof(η)}(η, ρ)
+Momentum(η = 1f-2, ρ = 9f-1) = Momentum{float_typeof(η)}(η, ρ)
 
 init(o::Momentum, x::AbstractArray) = zero(x)
 
@@ -70,7 +71,7 @@ struct Nesterov{T} <: AbstractRule
   eta::T
   rho::T
 end
-Nesterov(η = 1f-3, ρ = 9f-1) = Nesterov{typeof(η)}(η, ρ)
+Nesterov(η = 1f-3, ρ = 9f-1) = Nesterov{float_typeof(η)}(η, ρ)
 
 init(o::Nesterov, x::AbstractArray) = zero(x)
 
@@ -84,7 +85,7 @@ function apply!(o::Nesterov, state, x, dx)
 end
 
 """
-    RMSProp(η = 1f-3, ρ = 9f-1, ϵ = eps(typeof(η)); centred = false)
+    RMSProp(η = 1f-3, ρ = 9f-1, ϵ = 1f-7; centred = false)
 
 Optimizer using the
 [RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
@@ -111,8 +112,8 @@ struct RMSProp{T} <: AbstractRule
   centred::Bool
 end
 
-RMSProp(η = 1f-3, ρ = 9f-1, ϵ = eps(typeof(η)); centred::Bool = false, centered::Bool = false) =
-  RMSProp{typeof(η)}(η, ρ, ϵ, centred | centered)
+RMSProp(η = 1f-3, ρ = 9f-1, ϵ = 1f-7; centred::Bool = false, centered::Bool = false) =
+  RMSProp{float_typeof(η)}(η, ρ, ϵ, centred | centered)
 
 init(o::RMSProp, x::AbstractArray) = (zero(x), o.centred ? zero(x) : false)
 
@@ -163,7 +164,7 @@ struct Rprop{T} <: AbstractRule
   gamma::Tuple{T,T}
 end
 
-Rprop(η = 1f-3, ℓ = (5f-1, 1.2f0), Γ = (1f-6, 50f0)) = Rprop{typeof(η)}(η, ℓ, Γ)
+Rprop(η = 1f-3, ℓ = (5f-1, 1.2f0), Γ = (1f-6, 50f0)) = Rprop{float_typeof(η)}(η, ℓ, Γ)
 
 init(o::Rprop, x::AbstractArray) = (zero(x), onevalue(o.eta, x))
 
@@ -185,7 +186,7 @@ function apply!(o::Rprop, state, x, dx)
 end
 
 """
-    Adam(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = eps(typeof(η)))
+    Adam(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = 1f-7)
 
 [Adam](https://arxiv.org/abs/1412.6980) optimiser.
 
@@ -202,7 +203,7 @@ struct Adam{T} <: AbstractRule
   beta::Tuple{T, T}
   epsilon::T
 end
-Adam(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = eps(typeof(η))) = Adam{typeof(η)}(η, β, ϵ)
+Adam(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = 1f-7) = Adam{float_typeof(η)}(η, β, ϵ)
 
 init(o::Adam, x::AbstractArray) = (zero(x), zero(x), o.beta)
 
@@ -218,7 +219,7 @@ function apply!(o::Adam, state, x, dx)
 end
 
 """
-    RAdam(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = eps(typeof(η)))
+    RAdam(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = 1f-7)
 
 [Rectified Adam](https://arxiv.org/abs/1908.03265) optimizer.
 
@@ -235,7 +236,7 @@ struct RAdam{T} <: AbstractRule
   beta::Tuple{T, T}
   epsilon::T
 end
-RAdam(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = eps(typeof(η))) = RAdam{typeof(η)}(η, β, ϵ)
+RAdam(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = 1f-7) = RAdam{float_typeof(η)}(η, β, ϵ)
 
 init(o::RAdam, x::AbstractArray) = (zero(x), zero(x), o.beta, 1)
 
@@ -259,7 +260,7 @@ function apply!(o::RAdam, state, x, dx)
 end
 
 """
-    AdaMax(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = eps(typeof(η)))
+    AdaMax(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = 1f-7)
 
 [AdaMax](https://arxiv.org/abs/1412.6980) is a variant of Adam based on the ∞-norm.
 
@@ -276,7 +277,7 @@ struct AdaMax{T} <: AbstractRule
   beta::Tuple{T, T}
   epsilon::T
 end
-AdaMax(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = eps(typeof(η))) = AdaMax{typeof(η)}(η, β, ϵ)
+AdaMax(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = 1f-7) = AdaMax{float_typeof(η)}(η, β, ϵ)
 
 init(o::AdaMax, x::AbstractArray) = (zero(x), zero(x), o.beta)
 
@@ -292,7 +293,7 @@ function apply!(o::AdaMax, state, x, dx)
 end
 
 """
-    OAdam(η = 1f-3, β = (5f-1, 9f-1), ϵ = eps(typeof(η)))
+    OAdam(η = 1f-3, β = (5f-1, 9f-1), ϵ = 1f-7)
 
 [OAdam](https://arxiv.org/abs/1711.00141) (Optimistic Adam)
 is a variant of Adam adding an "optimistic" term suitable for adversarial training.
@@ -310,7 +311,7 @@ struct OAdam{T} <: AbstractRule
   beta::Tuple{T, T}
   epsilon::T
 end
-OAdam(η = 1f-3, β = (5f-1, 9f-1), ϵ = eps(typeof(η))) = OAdam{typeof(η)}(η, β, ϵ)
+OAdam(η = 1f-3, β = (5f-1, 9f-1), ϵ = 1f-7) = OAdam{float_typeof(η)}(η, β, ϵ)
 
 init(o::OAdam, x::AbstractArray) = (zero(x), zero(x), o.beta, zero(x))
 
@@ -328,7 +329,7 @@ function apply!(o::OAdam, state, x, dx)
 end
 
 """
-    AdaGrad(η = 1f-1, ϵ = eps(typeof(η)))
+    AdaGrad(η = 1f-1, ϵ = 1f-7)
 
 [AdaGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimizer. It has
 parameter specific learning rates based on how frequently it is updated.
@@ -344,7 +345,7 @@ struct AdaGrad{T} <: AbstractRule
   eta::T
   epsilon::T
 end
-AdaGrad(η = 1f-1, ϵ = eps(typeof(η))) = AdaGrad{typeof(η)}(η, ϵ)
+AdaGrad(η = 1f-1, ϵ = 1f-7) = AdaGrad{float_typeof(η)}(η, ϵ)
 
 init(o::AdaGrad, x::AbstractArray) = onevalue(o.epsilon, x)
 
@@ -391,7 +392,7 @@ function apply!(o::AdaDelta, state, x, dx)
 end
 
 """
-    AMSGrad(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = eps(typeof(η)))
+    AMSGrad(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = 1f-7)
 
 The [AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) version of the Adam
 optimiser. Parameters don't need tuning.
@@ -409,7 +410,7 @@ struct AMSGrad{T} <: AbstractRule
   beta::Tuple{T, T}
   epsilon::T
 end
-AMSGrad(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = eps(typeof(η))) = AMSGrad{typeof(η)}(η, β, ϵ)
+AMSGrad(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = 1f-7) = AMSGrad{float_typeof(η)}(η, β, ϵ)
 
 init(o::AMSGrad, x::AbstractArray) =
   (onevalue(o.epsilon, x), onevalue(o.epsilon, x), onevalue(o.epsilon, x))
@@ -427,7 +428,7 @@ function apply!(o::AMSGrad, state, x, dx)
 end
 
 """
-    NAdam(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = eps(typeof(η)))
+    NAdam(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = 1f-7)
 
 [NAdam](https://openreview.net/forum?id=OM0jvwB8jIp57ZJjtNEZ) is a Nesterov variant of Adam.
 Parameters don't need tuning.
@@ -445,7 +446,7 @@ struct NAdam{T} <: AbstractRule
   beta::Tuple{T, T}
   epsilon::T
 end
-NAdam(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = eps(typeof(η))) = NAdam{typeof(η)}(η, β, ϵ)
+NAdam(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = 1f-7) = NAdam{float_typeof(η)}(η, β, ϵ)
 
 init(o::NAdam, x::AbstractArray) = (zero(x), zero(x), o.beta)
 
@@ -463,7 +464,7 @@ function apply!(o::NAdam, state, x, dx)
 end
 
 """
-    AdamW(η = 1f-3, β = (9f-1, 9.99f-1), γ = 0, ϵ = eps(typeof(η)))
+    AdamW(η = 1f-3, β = (9f-1, 9.99f-1), γ = 0, ϵ = 1f-7)
 
 [AdamW](https://arxiv.org/abs/1711.05101) is a variant of Adam fixing (as in repairing) its
 weight decay regularization.
@@ -477,8 +478,8 @@ weight decay regularization.
 - Machine epsilon (`ϵ`): Constant to prevent division by zero
   (no need to change default)
 """
-AdamW(η = 1f-3, β = (9f-1, 9.99f-1), γ = 0, ϵ = eps(typeof(η))) =
-  OptimiserChain(Adam{typeof(η)}(η, β, ϵ), WeightDecay{typeof(η)}(γ))
+AdamW(η = 1f-3, β = (9f-1, 9.99f-1), γ = 0, ϵ = 1f-7) =
+  OptimiserChain(Adam{float_typeof(η)}(η, β, ϵ), WeightDecay{float_typeof(η)}(γ))
 
 """
     AdaBelief(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = 1e-16)
@@ -499,7 +500,7 @@ struct AdaBelief{T} <: AbstractRule
   beta::Tuple{T, T}
   epsilon::T
 end
-AdaBelief(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = oftype(η, 1e-16)) = AdaBelief{typeof(η)}(η, β, ϵ)
+AdaBelief(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = oftype(η, 1e-16)) = AdaBelief{float_typeof(η)}(η, β, ϵ)
 
 init(o::AdaBelief, x::AbstractArray) = (zero(x), zero(x), o.beta)
 
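
Taken together, every constructor in rules.jl now parameterises its rule by `float_typeof(η)` instead of `typeof(η)`, and the fixed `ϵ = 1f-7` replaces `eps(typeof(η))`, which is undefined for integer `η`. A hedged before/after sketch (illustrative, not taken from the diff):

```julia
using Optimisers

# Before: Momentum{typeof(1)} meant Momentum{Int64}(1, 0.9f0), and
# convert(Int64, 0.9f0) throws an InexactError, so Momentum(1) failed.
# After: the Integer η is promoted, so construction succeeds.
opt = Momentum(1)   # Momentum{Float32}(1.0f0, 0.9f0) under this PR
opt.eta             # 1.0f0 — stored as a float, so adjust! can change it
```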
7 changes: 7 additions & 0 deletions test/rules.jl
@@ -229,3 +229,10 @@ end
     @test static_loss(static_model) < 1.9
   end
 end
+
+@testset "integer constructors: $rule" for rule in (Descent, Adam)
+  # Adam(0) was an error, https://github.com/FluxML/Optimisers.jl/issues/119
+  st = Optimisers.setup(rule(0), rand(3))
+  st2 = Optimisers.adjust(st, 0.1)
+  @test st2.rule.eta ≈ 0.1
+end
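
The added test pins down the motivating bug from issue #119: an optimiser built from an integer learning rate must still let `adjust` install a fractional one. Roughly, at the REPL (a sketch; printed forms may differ):

```julia
using Optimisers

st  = Optimisers.setup(Adam(0), rand(3))  # Adam(0) used to throw, since eps(Int64) is undefined
st2 = Optimisers.adjust(st, 0.1)          # replace η without rebuilding the state
st2.rule.eta ≈ 0.1                        # true — eta is now a Float32, not an Int
```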