Use conjugates in optimizers to better learn on complex-valued inputs #1776
Changes to the optimiser definitions:
@@ -141,7 +141,7 @@ RMSProp(η = 0.001, ρ = 0.9) = RMSProp(η, ρ, IdDict())
 function apply!(o::RMSProp, x, Δ)
   η, ρ = o.eta, o.rho
   acc = get!(() -> zero(x), o.acc, x)::typeof(x)
-  @. acc = ρ * acc + (1 - ρ) * Δ^2
+  @. acc = ρ * acc + (1 - ρ) * Δ * conj(Δ)
Review thread on this line:
- "Isn't this the same as `abs2(Δ)`?"
- "No, they are not the same: `abs2` returns a real value."
- "Yeah, I didn't look into these deeply enough to figure out whether it returning a real value would cause type instability or whatnot, so I just left it in this format."
- "Should be fine to use `abs2`. Surely not a huge effect though. I don't think these broadcasts will ever be seen by Zygote, for instance."
   @. Δ *= η / (√acc + ϵ)
 end
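As a side note (not part of the PR), here is a minimal Julia sketch of why the distinction under discussion matters, using an arbitrary value for a single complex gradient entry Δ: squaring keeps the phase, while `Δ * conj(Δ)` and `abs2(Δ)` both give the squared magnitude and differ only in element type.

    using Test

    Δ = 1.0f0 + 2.0f0im                    # an arbitrary complex gradient entry

    @test Δ^2 == -3.0f0 + 4.0f0im          # squaring keeps the phase: not a usable second moment
    @test Δ * conj(Δ) == 5.0f0 + 0.0f0im   # squared magnitude, still a ComplexF32
    @test abs2(Δ) == 5.0f0                 # same value, but as a real Float32

    # The accumulators above are created with zero(x), so for complex parameters
    # they are complex arrays; broadcasting Δ * conj(Δ) into them keeps the
    # element types matched, which is the format the author chose here.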
@@ -179,7 +179,7 @@ function apply!(o::ADAM, x, Δ)
   end :: Tuple{typeof(x),typeof(x),Vector{Float64}}

   @. mt = β[1] * mt + (1 - β[1]) * Δ
-  @. vt = β[2] * vt + (1 - β[2]) * Δ^2
+  @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ)
   @. Δ = mt / (1 - βp[1]) / (√(vt / (1 - βp[2])) + ϵ) * η
   βp .= βp .* β
@@ -221,7 +221,7 @@ function apply!(o::RADAM, x, Δ)
   end :: Tuple{typeof(x),typeof(x),Vector{Float64},Ref{Int}}

   @. mt = β[1] * mt + (1 - β[1]) * Δ
-  @. vt = β[2] * vt + (1 - β[2]) * Δ^2
+  @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ)
   ρ = ρ∞ - 2t[] * βp[2] / (1 - βp[2])
   if ρ > 4
     r = sqrt((ρ-4)*(ρ-2)*ρ∞/((ρ∞-4)*(ρ∞-2)*ρ))
@@ -311,7 +311,7 @@ function apply!(o::OADAM, x, Δ)
   end :: Tuple{typeof(x),typeof(x),typeof(x),Vector{Float64}}

   @. mt = β[1] * mt + (1 - β[1]) * Δ
-  @. vt = β[2] * vt + (1 - β[2]) * Δ^2
+  @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ)
   @. Δ = -Δ_
   @. Δ_ = η * mt / (1 - βp[1]) / (√(vt / (1 - βp[2])) + ϵ)
   @. Δ += 2Δ_
@@ -348,7 +348,7 @@ ADAGrad(η = 0.1) = ADAGrad(η, IdDict())
 function apply!(o::ADAGrad, x, Δ)
   η = o.eta
   acc = get!(() -> fill!(similar(x), ϵ), o.acc, x)::typeof(x)
-  @. acc += Δ^2
+  @. acc += Δ * conj(Δ)
   @. Δ *= η / (√acc + ϵ)
 end
@@ -379,11 +379,11 @@ ADADelta(ρ = 0.9) = ADADelta(ρ, IdDict())
 function apply!(o::ADADelta, x, Δ)
   ρ = o.rho
   acc, Δacc = get!(() -> (zero(x), zero(x)), o.state, x)::NTuple{2,typeof(x)}
-  @. acc = ρ * acc + (1 - ρ) * Δ^2
+  @. acc = ρ * acc + (1 - ρ) * Δ * conj(Δ)
   # DON'T remove epsilon from numerator
   # or even out of the square roots
   @. Δ *= √(Δacc + ϵ) / √(acc + ϵ)
-  @. Δacc = ρ * Δacc + (1 - ρ) * Δ^2
+  @. Δacc = ρ * Δacc + (1 - ρ) * Δ * conj(Δ)
   return Δ
 end
@@ -463,7 +463,7 @@ function apply!(o::NADAM, x, Δ)
   β1p, β2p = βp

   @. mt = β[1] * mt + (1 - β[1]) * Δ
-  @. vt = β[2] * vt + (1 - β[2]) * Δ^2
+  @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ)
   @. Δ = (β[1] * mt / (1 - β[1] * β1p) + (1 - β[1]) * Δ / (1 - β1p)) / (√(vt * β[2] / (1 - β2p)) + ϵ) * η
   βp .= βp .* β
@@ -524,7 +524,7 @@ function apply!(o::AdaBelief, x, Δ)
   η, β = o.eta, o.beta
   mt, st = get!(() -> (zero(x), zero(x)), o.state, x)::NTuple{2,typeof(x)}
   @. mt = β[1] * mt + (1 - β[1]) * Δ
-  @. st = β[2] * st + (1 - β[2]) * (Δ - mt)^2
+  @. st = β[2] * st + (1 - β[2]) * (Δ - mt) * conj(Δ - mt)
   @. Δ = η * mt / (√(st) + ϵ)
   return Δ
 end
Changes to the optimiser tests:
@@ -190,3 +190,40 @@ end
   Flux.update!(opt, θ, gs)
   @test w ≈ wold .- 0.1
 end
+
+# Flux PR #1776
+# We need to test that optimisers like ADAM that maintain an internal momentum
+# estimate properly calculate the second-order statistics on the gradients as
+# they flow backward through the model. Previously, we would calculate second-
+# order statistics via `Δ^2` rather than the complex-aware `Δ * conj(Δ)`, which
+# wreaks all sorts of havoc on our training loops. This test ensures that
+# a simple optimization is monotonically decreasing (up to learning step effects).
+@testset "Momentum Optimisers and complex values" begin
+  # Test every optimizer that has momentum internally
+  for opt_ctor in [ADAM, RMSProp, RADAM, OADAM, ADAGrad, ADADelta, NADAM, AdaBelief]
+    # Our "model" is just a complex number
+    w = zeros(ComplexF32, 1)
+
+    # Our model attempts to learn `f(x) = conj(x)` where `f(x) = w*x`
+    function loss()
+      # Deterministic training data is the best training data
+      x = ones(1, 1) + 1im*ones(1, 1)
+
+      # Manually implement `mse()` to allow demonstration of brokenness
+      # on older Flux builds that don't have a fixed `mse()`
+      return sum(abs2.(w * x .- conj(x)))
+    end
+
+    params = Flux.Params([w])
+    opt = opt_ctor(1e-2)
Review comment on this line: This uses the same parameter for all. But ADADelta's first parameter wants to be close to 1, not 0. (A sketch of one possible adjustment follows the diff below.)
+
+    # Train for 10 iterations, enforcing that loss is monotonically decreasing
+    last_loss = Inf
+    for idx in 1:10
+      grads = Flux.gradient(loss, params)
+      @test loss() < last_loss
+      last_loss = loss()
+      Flux.update!(opt, params, grads)
+    end
+  end
+end
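Following up on the ADADelta comment above: as the first diff shows, ADADelta's single constructor argument is the decay ρ (default 0.9) rather than a step size, so passing 1e-2 to every constructor treats it inconsistently. A hypothetical one-line adjustment to the test loop, not part of this diff, could special-case it:

    # Hypothetical tweak: give ADADelta a decay close to 1, the others a step size.
    # `opt_ctor` is the constructor taken from the loop in the test above.
    opt = opt_ctor === ADADelta ? opt_ctor(0.9) : opt_ctor(1e-2)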
Review thread:
- "@DhairyaLGandhi We can simplify this by using `agg(abs2.(ŷ .- y))` instead, as `abs2()` (as noted elsewhere) always returns a `Real`. Would you prefer that?"
- "Yeah I think that would be good."
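For illustration only, a tiny sketch of the suggested `agg(abs2.(ŷ .- y))` form, assuming `mean` as the aggregation `agg` and made-up complex predictions and targets; it shows the result is always real-valued:

    using Statistics: mean

    ŷ = ComplexF32[1 + 2im, 0 + 1im]   # hypothetical complex predictions
    y = ComplexF32[1 + 1im, 1 + 1im]   # hypothetical complex targets

    # abs2 of each complex residual is a real Float32, so the aggregate is too.
    loss = mean(abs2.(ŷ .- y))          # == (1 + 1) / 2 == 1.0f0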