Description
When training a neural network with L2 regularization, it is often advised not to regularize the bias parameters (in contrast to the weight parameters).
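Concretely, the idea is that only the weight matrices and convolution kernels enter the penalty term, while the biases are left out:

$$\mathcal{L}_{\text{total}} = \mathcal{L}_{\text{data}} + \lambda \sum_{W \in \text{weights}} \lVert W \rVert_2^2$$

where $\lambda$ denotes the regularization strength.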
I implemented this as follows in AlphaZero.jl:
```julia
using Flux
import Functors

# Parameters of a layer that should be L2-regularized
# (weight matrices / convolution kernels only, never biases).
regularized_params_(l) = []
regularized_params_(l::Flux.Dense) = [l.W]
regularized_params_(l::Flux.Conv) = [l.weight]

# Apply `f` to every node of the model tree, visiting each node at most once.
function foreach_flux_node(f::Function, x, seen = IdDict())
  Functors.isleaf(x) && return
  haskey(seen, x) && return
  seen[x] = true
  f(x)
  for child in Flux.trainable(x)
    foreach_flux_node(f, child, seen)
  end
end

# Collect all parameters subject to regularization, avoiding duplicates.
function regularized_params(net::FluxNetwork)
  ps = Flux.Params()
  foreach_flux_node(net) do p
    for r in regularized_params_(p)
      any(x -> x === r, ps) || push!(ps, r)
    end
  end
  return ps
end

# Sum of squared weights over all regularized parameters.
regularization_term(nn) = sum(sum(w .* w) for w in regularized_params(nn))
```
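For context, this term is then added to the training loss, roughly as follows (a minimal sketch, not the actual AlphaZero.jl loss; `λ` and the MSE data loss are placeholders):

```julia
# Hypothetical usage: add the weight-only L2 penalty to a data loss.
λ = 1f-4  # placeholder regularization strength
loss(nn, x, y) = Flux.mse(nn(x), y) + λ * regularization_term(nn)
```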
This feels a bit hackish, though, and it relies on Flux internals, so it tends to break with every new Flux release.
Do you see any better way? Shouldn't we make this easier?