Skip to content

Commit

Permalink
Adding the interface for the momentum optimizer (#4919)
Browse files Browse the repository at this point in the history
* Adding the interface for the momentum optimizer
* Adding a comment about accumulators
  • Loading branch information
abhinavarora authored Oct 20, 2017
1 parent 37bfd03 commit 0e31d7d
Show file tree
Hide file tree
Showing 2 changed files with 213 additions and 21 deletions.
188 changes: 169 additions & 19 deletions python/paddle/v2/framework/optimizer.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,104 @@
import paddle.v2.framework.framework as framework
from collections import defaultdict

__all__ = ['SGDOptimizer']
__all__ = ['SGDOptimizer', 'MomentumOptimizer']


class Optimizer(object):
"""Optimizer Base class.
Define the common interface of an optimizer.
User should not use this class directly, but need to use one of it's implementation.
User should not use this class directly,
but need to use one of it's implementation.
"""

def __init__(self):
pass
# Dictionary of accumulators. Some optimizer subclasses need to
# allocate and manage extra variables associated with the parameters
# to train. These variables are called accumulators.
# {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...}
self._accumulators = defaultdict(lambda: dict())

def _append_optimize_op(self, block, param_and_grad):
""" append optimize operator to block and return all the added optimize_op
"""
raise NotImplementedError()

def create_backward_pass(self, loss, parameter_list=None, no_grad_set=None):
def _initialize_tensors(self, block):
"""Create all necessary tensors, that will be shared for all parameter updates.
Tensors like learning rate should be initialized here.
Args:
block: the block in which the loss variable is present
"""
pass

def _create_accumulators(self, block, parameters):
"""Create all accumulators needed by the parameters
Args:
block: the block in which the loss variable is present
parameters: list of parameter variables for the optimizer
"""
create and add gradient Operators in BlockDesc to Compute gradients of `loss`
for parameters in parameter_list
pass

def _add_accumulator(self, block, name, param, dtype=None, fill_value=0.0):
"""Utility function to add an accumulator for a parameter
Args:
block: the block in which the loss variable is present
name: name of the accumulator
param: parameter variable for which accumulator is to be added
dtype: data type of the accumulator variable
fill_value: value to initialize the accumulator variable
"""
if (name in self._accumulators and
param.name in self._accumulators[name]):
raise Exception("Accumulator {} already exists for parmeter {}".
format(name, param.name))
global_block = block.program.global_block()
param_shape = list(param.shape)
param_acc = global_block.create_var(
dtype=dtype, shape=param_shape, lod_level=0)

# Initialize the accumulator with fill_value
# FIXME: Fix when Initialization design has been implemented
# https://github.com/PaddlePaddle/Paddle/pull/4852
global_block.append_op(
type="fill_constant",
outputs={"Out": param_acc},
attrs={"shape": param_shape,
"value": fill_value})

# Add to accumulators dict
self._accumulators[name][param.name] = param_acc

def _get_accumulator(self, name, param):
"""Utility function to fetch an accumulator for a parameter
Args:
name: name of the accumulator
param: parameter variable for which accumulator is to be fetched
Returns:
accumulator variable for the parameter
"""
if (name not in self._accumulators or
param.name not in self._accumulators[name]):
raise Exception("Accumulator {} does not exist for parameter {}".
format(name, param.name))
return self._accumulators[name][param.name]

def create_backward_pass(self, loss, parameter_list=None, no_grad_set=None):
"""Create and add gradient Operators in BlockDesc to compute
gradients of `loss` for parameters in parameter_list
Args:
loss: an variable generated by cost function.
no_grad_set: variable that should not create gradient
parameter_list: parameters that need to compute gradient and update to optimize the lost.
parameter_list: parameters that need to compute gradient and
update to optimize the lost.
Returns:
list of (parameters, gradients) pair.
Expand All @@ -48,7 +120,8 @@ def create_backward_pass(self, loss, parameter_list=None, no_grad_set=None):
if not grad_block.has_var(grad_info[0]):
raise Exception("grad block[%d] did not have grad var %s" %
grad_info[1], grad_info[0])
param_var = loss.block.var(param)
# Get the param var from the global block
param_var = loss.block.program.global_block().var(param)
grad_var = grad_block.var(grad_info[0])
if loss.block.has_var(grad_info[0]):
params_and_grads.append((param_var, grad_var))
Expand All @@ -64,14 +137,29 @@ def create_optimization_pass(self, parameters_and_grads, loss):
parameters_and_grads: a list of (variable, gradient) pair to update.
Returns:
optmization_op_list: a list of optimization operator that will update parameter using gradient.
optmization_op_list: a list of optimization operator that will update
parameter using gradient.
"""
# This is a default implementation of create_optimization_pass that
# can be shared by most optimizers. This implementation assumes that
# the subclass will implement the _append_optimize_op method and the
# _initialize_tensors method. The subclass can extend the
# _create_accumulators method if it needs to create accumulators
# for parameters.

# Create any accumulators
self._create_accumulators(loss.block,
[p[0] for p in parameters_and_grads])
# Create any necessary tensors
self._initialize_tensors(loss.block)

optimize_ops = []
for param_and_grad in parameters_and_grads:
if param_and_grad[1] is not None:
optimize_op = self._append_optimize_op(loss.block,
param_and_grad)
optimize_ops.append(optimize_op)

return optimize_ops

def minimize(self, loss, parameter_list=None, no_grad_set=None):
Expand All @@ -92,33 +180,95 @@ class SGDOptimizer(Optimizer):

def __init__(self, learning_rate):
assert learning_rate is not None
super(Optimizer, self).__init__()
super(SGDOptimizer, self).__init__()
self.type = "sgd"
self._learning_rate = learning_rate

def _append_optimize_op(self, block, param_and_grad):
def _initialize_tensors(self, block):
assert isinstance(block, framework.Block)
lr_shape = [1]
# create a var for learning_rate
lr = block.create_var(dtype="float32", shape=lr_shape, lod_level=0)
# create a variable for learning_rate
self._lr = block.create_var(
dtype="float32", shape=lr_shape, lod_level=0)

# create an op to init the learning_rate
init_op = block.append_op(
# FIXME: Fix when Initialization design has been implemented
# https://github.com/PaddlePaddle/Paddle/pull/4852
block.append_op(
type="fill_constant",
outputs={"Out": lr},
outputs={"Out": self._lr},
attrs={"shape": lr_shape,
"value": self._learning_rate})

def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)

# create the optimize op
sgd_op = block.append_op(
type=self.type,
inputs={
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"LearningRate": lr
"LearningRate": self._lr
},
outputs={"ParamOut": param_and_grad[0]},
attrs={"shape": [1],
"value": self._learning_rate})
outputs={"ParamOut": param_and_grad[0]})

return sgd_op


class MomentumOptimizer(Optimizer):
"""Simple Momentum optimizer with velocity state
"""
_velocity_acc_str = "velocity"

def __init__(self, learning_rate, momentum):
assert learning_rate is not None
assert momentum is not None
super(MomentumOptimizer, self).__init__()
self.type = "momentum"
self._learning_rate = learning_rate
self._momentum = momentum

def _initialize_tensors(self, block):
assert isinstance(block, framework.Block)
lr_shape = [1]
# create a variable for learning_rate
self._lr = block.create_var(
dtype="float32", shape=lr_shape, lod_level=0)

# create an op to init the learning_rate
# FIXME: Fix when Initialization design has been implemented
# https://github.com/PaddlePaddle/Paddle/pull/4852
block.append_op(
type="fill_constant",
outputs={"Out": self._lr},
attrs={"shape": lr_shape,
"value": self._learning_rate})

def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block)

for p in parameters:
self._add_accumulator(block, self._velocity_acc_str, p, 'float32')

def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)

velocity_acc = self._get_accumulator(self._velocity_acc_str,
param_and_grad[0])
# create the momentum optimize op
momentum_op = block.append_op(
type=self.type,
inputs={
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"Velocity": velocity_acc,
"LearningRate": self._lr
},
outputs={
"ParamOut": param_and_grad[0],
"VelocityOut": velocity_acc
},
attrs={"mu": self._momentum})

return momentum_op
46 changes: 44 additions & 2 deletions python/paddle/v2/framework/tests/test_optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@

class TestOptimizer(unittest.TestCase):
def test_sgd_optimizer(self):
program = framework.g_program
program = framework.Program()
block = program.global_block()
mul_x = block.create_parameter(
dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
mul_y = block.create_var(
dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
mul_out = block.create_var(
dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
mul_op = block.append_op(
block.append_op(
type="mul",
inputs={"X": mul_x,
"Y": mul_y},
Expand All @@ -27,5 +27,47 @@ def test_sgd_optimizer(self):
self.assertEqual(sgd_op.type, "sgd")


class TestMomentumOptimizer(unittest.TestCase):
class MockMomentum(optimizer.MomentumOptimizer):
def get_accumulators(self):
return self._accumulators

def get_velocity_str(self):
return self._velocity_acc_str

def test_momentum_optimizer(self):
program = framework.Program()
block = program.global_block()
mul_x = block.create_parameter(
dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
mul_y = block.create_var(
dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
mul_out = block.create_var(
dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
block.append_op(
type="mul",
inputs={"X": mul_x,
"Y": mul_y},
outputs={"Out": mul_out},
attrs={"x_num_col_dims": 1})
momentum_optimizer = self.MockMomentum(learning_rate=0.01, momentum=0.2)
params_grads = momentum_optimizer.create_backward_pass(mul_out)
self.assertEqual(len(params_grads), 1)
self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
opts = momentum_optimizer.create_optimization_pass(params_grads,
mul_out)
self.assertEqual(len(opts), 1)
sgd_op = opts[0]
self.assertEqual(sgd_op.type, "momentum")

# Check accumulators
accumulators = momentum_optimizer.get_accumulators()
self.assertEqual(len(accumulators), 1)
self.assertTrue(momentum_optimizer.get_velocity_str() in accumulators)
velocity_acc = accumulators[momentum_optimizer.get_velocity_str()]
self.assertEqual(len(velocity_acc), 1)
self.assertTrue(mul_x.name in velocity_acc)


if __name__ == '__main__':
unittest.main()

0 comments on commit 0e31d7d

Please sign in to comment.