10 changes: 6 additions & 4 deletions python/paddle/fluid/tests/unittests/test_adamw_op.py
@@ -29,10 +29,12 @@ def test_adamw_op_dygraph(self):
parameters=linear.parameters(),
apply_decay_param_fun=lambda name: True,
weight_decay=0.01)
out = linear(a)
out.backward()
adam.step()
adam.clear_gradients()

for _ in range(2):
out = linear(a)
out.backward()
adam.step()
adam.clear_gradients()

def test_adamw_op_coverage(self):
paddle.disable_static()
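For context, the updated test now drives the optimizer through two full forward/backward/step/clear_gradients iterations instead of one, so a second call to step() on the same optimizer is exercised as well. A minimal standalone sketch of that dygraph pattern (layer sizes and input values are illustrative, not copied from the test file):

import numpy as np
import paddle

paddle.disable_static()
linear = paddle.nn.Linear(13, 5)
a = paddle.to_tensor(np.random.uniform(-1.0, 1.0, (2, 13)).astype('float32'))
adam = paddle.optimizer.AdamW(
    learning_rate=0.01,
    parameters=linear.parameters(),
    apply_decay_param_fun=lambda name: True,  # decay every parameter
    weight_decay=0.01)
for _ in range(2):
    out = linear(a)
    out.backward()
    adam.step()
    adam.clear_gradients()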
2 changes: 2 additions & 0 deletions python/paddle/optimizer/adam.py
@@ -16,6 +16,7 @@
from ..fluid import core
from ..fluid import framework
from ..fluid.framework import Variable
from ..fluid.dygraph import base as imperative_base

import paddle

@@ -247,6 +248,7 @@ def _append_optimize_op(self, block, param_and_grad):

return adam_op

@imperative_base.no_grad
@framework.dygraph_only
def step(self):
"""
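Adam.step() is now wrapped in the imperative no_grad decorator, presumably so that the eager ops issued while parameters are updated (including the weight-decay multiply/assign that AdamW appends below) are not recorded on the autograd tape. A rough user-level equivalent of that guard, sketched with the public paddle.no_grad API rather than the internal imperative_base decorator; the helper and its decay factor are made up for illustration:

import paddle

def scale_params_in_place(params, factor=0.99):
    # Nothing executed under no_grad is traced for autograd, which is
    # what @imperative_base.no_grad provides for the whole of step().
    with paddle.no_grad():
        for p in params:
            p.set_value(p.numpy() * factor)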
119 changes: 36 additions & 83 deletions python/paddle/optimizer/adamw.py
@@ -129,6 +129,7 @@ def __init__(self,
self._params_name = set()
self._apply_decay_param_fun = apply_decay_param_fun
self._coeff = coeff
self._lr_to_coeff = dict()
super(AdamW, self).__init__(
learning_rate=learning_rate,
parameters=parameters,
@@ -139,96 +140,48 @@ def __init__(self,
name=name,
lazy_mode=lazy_mode)

def _scale_parameters(self, params_and_grads):
def _append_decoupled_weight_decay(self, block, param_and_grad):
"""
Adds weight decay ops.
scaled_parameter = parameter * coeff
Add decoupled weight decay op.
parameter = parameter - parameter * coeff * lr

Args:
params_and_grads: A list of (parameters, gradients) pairs,
block: block in which variable is to be created
param_and_grad: (parameters, gradients) pairs,
the parameters need to decay.
Raises:
Exception: The type of coeff and parameter is not consistent.
"""

scaled_params = []
for param, grad in params_and_grads:
# If no gradient then we don't need to do anything
if grad is None:
continue
if self._apply_decay_param_fun is not None \
and not self._apply_decay_param_fun(param.name):
continue

if isinstance(self._coeff, float):
assert param.dtype is not paddle.fluid.core.VarDesc.VarType.FP32, \
"the type of coeff(float) and parameter(%s) is not consistent."%(self._coeff.dtype)
else:
assert self._coeff.dtype == param.dtype, \
"the type of coeff(%s) and parameter(%s) is not consistent."%(self._coeff.dtype, param.dtype)
if isinstance(self._learning_rate, float):
learning_rate = self._learning_rate
else:
learning_rate = self._learning_rate()
with param.block.program._optimized_guard(
[param, grad]), framework.name_scope('weight decay'):
scaled_params.append(
(param, grad, param * self._coeff * learning_rate))
if param.name not in self._params_name:
self._params_name.add(param.name)
param = param * self._coeff
return scaled_params

@imperative_base.no_grad
def minimize(self,
loss,
startup_program=None,
parameters=None,
no_grad_set=None):
parameters = parameters if parameters \
else self._parameter_list

params_grads = self.backward(
loss=loss,
startup_program=startup_program,
parameters=parameters,
no_grad_set=no_grad_set)
scaled_params = self._scale_parameters(params_grads)
for p_grad_sgrad in scaled_params:
param, grad, scaled_param = p_grad_sgrad
with param.block.program._optimized_guard(
[param, grad]), framework.name_scope('weight decay'):
updated_param = paddle.fluid.layers.elementwise_sub(
x=param, y=scaled_param)
paddle.fluid.layers.assign(input=updated_param, output=param)

optimize_ops = self._apply_optimize(
loss=loss,
params_grads=params_grads,
startup_program=startup_program)
return optimize_ops, params_grads

@framework.dygraph_only
@imperative_base.no_grad
def step(self):
params_grads = []
for param in self._parameter_list:
if not param.trainable:
continue
if param._grad_ivar() is not None:
grad_var = param._grad_ivar()
params_grads.append((param, grad_var))

scaled_params = self._scale_parameters(params_grads)
for p_grad_sgrad in scaled_params:
param, grad, scaled_param = p_grad_sgrad
with param.block.program._optimized_guard(
[param, grad]), framework.name_scope('weight decay'):
updated_param = paddle.fluid.layers.elementwise_sub(
x=param, y=scaled_param)
paddle.fluid.layers.assign(input=updated_param, output=param)
self._apply_optimize(
loss=None, startup_program=None, params_grads=params_grads)
param, grad = param_and_grad

if self._apply_decay_param_fun is not None \
and not self._apply_decay_param_fun(param.name):
return

if isinstance(self._learning_rate, float):
learning_rate = self._learning_rate
else:
# NOTE. We add this function to the _append_optimize_op(),
# for we must make sure _create_param_lr() be called after
# optimizer._create_global_learning_rate().
learning_rate = self._create_param_lr(param_and_grad)

with block.program._optimized_guard(
[param, grad]), framework.name_scope('weight decay'):
self._params_name.add(param.name)

# If it has been calculated, the result will be reused
decay_coeff = self._lr_to_coeff.get(learning_rate, None)
if decay_coeff is None:
decay_coeff = 1.0 - learning_rate * self._coeff
self._lr_to_coeff[learning_rate] = decay_coeff

scaled_param = param * decay_coeff
paddle.fluid.layers.assign(input=scaled_param, output=param)

def _append_optimize_op(self, block, param_and_grad):
self._append_decoupled_weight_decay(block, param_and_grad)
return super(AdamW, self)._append_optimize_op(block, param_and_grad)

def __str__(self):
return " ".join(["Weight Decay, params:", ",".join(self._params_name)])