diff --git a/python/paddle/v2/framework/optimizer.py b/python/paddle/v2/framework/optimizer.py index e356a7aadb8d6..f992a42c40a6e 100644 --- a/python/paddle/v2/framework/optimizer.py +++ b/python/paddle/v2/framework/optimizer.py @@ -1,32 +1,104 @@ import paddle.v2.framework.framework as framework +from collections import defaultdict -__all__ = ['SGDOptimizer'] +__all__ = ['SGDOptimizer', 'MomentumOptimizer'] class Optimizer(object): """Optimizer Base class. Define the common interface of an optimizer. - User should not use this class directly, but need to use one of it's implementation. + User should not use this class directly, + but need to use one of its implementations. """ def __init__(self): - pass + # Dictionary of accumulators. Some optimizer subclasses need to + # allocate and manage extra variables associated with the parameters + # to train. These variables are called accumulators. + # {accum_name : { parameter_name : accumulator_for_parameter, ...}, ...} + self._accumulators = defaultdict(lambda: dict()) def _append_optimize_op(self, block, param_and_grad): """ append optimize operator to block and return all the added optimize_op """ raise NotImplementedError() - def create_backward_pass(self, loss, parameter_list=None, no_grad_set=None): + def _initialize_tensors(self, block): + """Create all necessary tensors, that will be shared for all parameter updates. + + Tensors like learning rate should be initialized here. 
+ + Args: + block: the block in which the loss variable is present + """ + pass + + def _create_accumulators(self, block, parameters): + """Create all accumulators needed by the parameters + + Args: + block: the block in which the loss variable is present + parameters: list of parameter variables for the optimizer """ - create and add gradient Operators in BlockDesc to Compute gradients of `loss` - for parameters in parameter_list + pass + + def _add_accumulator(self, block, name, param, dtype=None, fill_value=0.0): + """Utility function to add an accumulator for a parameter + + Args: + block: the block in which the loss variable is present + name: name of the accumulator + param: parameter variable for which accumulator is to be added + dtype: data type of the accumulator variable + fill_value: value to initialize the accumulator variable + """ + if (name in self._accumulators and + param.name in self._accumulators[name]): + raise Exception("Accumulator {} already exists for parmeter {}". 
+ format(name, param.name)) + global_block = block.program.global_block() + param_shape = list(param.shape) + param_acc = global_block.create_var( + dtype=dtype, shape=param_shape, lod_level=0) + + # Initialize the accumulator with fill_value + # FIXME: Fix when Initialization design has been implemented + # https://github.com/PaddlePaddle/Paddle/pull/4852 + global_block.append_op( + type="fill_constant", + outputs={"Out": param_acc}, + attrs={"shape": param_shape, + "value": fill_value}) + + # Add to accumulators dict + self._accumulators[name][param.name] = param_acc + + def _get_accumulator(self, name, param): + """Utility function to fetch an accumulator for a parameter + + Args: + name: name of the accumulator + param: parameter variable for which accumulator is to be fetched + + Returns: + accumulator variable for the parameter + """ + if (name not in self._accumulators or + param.name not in self._accumulators[name]): + raise Exception("Accumulator {} does not exist for parameter {}". + format(name, param.name)) + return self._accumulators[name][param.name] + + def create_backward_pass(self, loss, parameter_list=None, no_grad_set=None): + """Create and add gradient Operators in BlockDesc to compute + gradients of `loss` for parameters in parameter_list Args: loss: an variable generated by cost function. no_grad_set: variable that should not create gradient - parameter_list: parameters that need to compute gradient and update to optimize the lost. + parameter_list: parameters that need to compute gradient and + update to optimize the loss. Returns: list of (parameters, gradients) pair. 
@@ -48,7 +120,8 @@ def create_backward_pass(self, loss, parameter_list=None, no_grad_set=None): if not grad_block.has_var(grad_info[0]): raise Exception("grad block[%d] did not have grad var %s" % grad_info[1], grad_info[0]) - param_var = loss.block.var(param) + # Get the param var from the global block + param_var = loss.block.program.global_block().var(param) grad_var = grad_block.var(grad_info[0]) if loss.block.has_var(grad_info[0]): params_and_grads.append((param_var, grad_var)) @@ -64,14 +137,29 @@ def create_optimization_pass(self, parameters_and_grads, loss): parameters_and_grads: a list of (variable, gradient) pair to update. Returns: - optmization_op_list: a list of optimization operator that will update parameter using gradient. + optimization_op_list: a list of optimization operators that will update + parameter using gradient. """ + # This is a default implementation of create_optimization_pass that + # can be shared by most optimizers. This implementation assumes that + # the subclass will implement the _append_optimize_op method and the + # _initialize_tensors method. The subclass can extend the + # _create_accumulators method if it needs to create accumulators + # for parameters. 
+ + # Create any accumulators + self._create_accumulators(loss.block, + [p[0] for p in parameters_and_grads]) + # Create any necessary tensors + self._initialize_tensors(loss.block) + optimize_ops = [] for param_and_grad in parameters_and_grads: if param_and_grad[1] is not None: optimize_op = self._append_optimize_op(loss.block, param_and_grad) optimize_ops.append(optimize_op) + return optimize_ops def minimize(self, loss, parameter_list=None, no_grad_set=None): @@ -92,33 +180,95 @@ class SGDOptimizer(Optimizer): def __init__(self, learning_rate): assert learning_rate is not None - super(Optimizer, self).__init__() + super(SGDOptimizer, self).__init__() self.type = "sgd" self._learning_rate = learning_rate - def _append_optimize_op(self, block, param_and_grad): + def _initialize_tensors(self, block): assert isinstance(block, framework.Block) lr_shape = [1] - # create a var for learning_rate - lr = block.create_var(dtype="float32", shape=lr_shape, lod_level=0) + # create a variable for learning_rate + self._lr = block.create_var( + dtype="float32", shape=lr_shape, lod_level=0) # create an op to init the learning_rate - init_op = block.append_op( + # FIXME: Fix when Initialization design has been implemented + # https://github.com/PaddlePaddle/Paddle/pull/4852 + block.append_op( type="fill_constant", - outputs={"Out": lr}, + outputs={"Out": self._lr}, attrs={"shape": lr_shape, "value": self._learning_rate}) + def _append_optimize_op(self, block, param_and_grad): + assert isinstance(block, framework.Block) + # create the optimize op sgd_op = block.append_op( type=self.type, inputs={ "Param": param_and_grad[0], "Grad": param_and_grad[1], - "LearningRate": lr + "LearningRate": self._lr }, - outputs={"ParamOut": param_and_grad[0]}, - attrs={"shape": [1], - "value": self._learning_rate}) + outputs={"ParamOut": param_and_grad[0]}) return sgd_op + + +class MomentumOptimizer(Optimizer): + """Simple Momentum optimizer with velocity state + """ + _velocity_acc_str = "velocity" 
+ + def __init__(self, learning_rate, momentum): + assert learning_rate is not None + assert momentum is not None + super(MomentumOptimizer, self).__init__() + self.type = "momentum" + self._learning_rate = learning_rate + self._momentum = momentum + + def _initialize_tensors(self, block): + assert isinstance(block, framework.Block) + lr_shape = [1] + # create a variable for learning_rate + self._lr = block.create_var( + dtype="float32", shape=lr_shape, lod_level=0) + + # create an op to init the learning_rate + # FIXME: Fix when Initialization design has been implemented + # https://github.com/PaddlePaddle/Paddle/pull/4852 + block.append_op( + type="fill_constant", + outputs={"Out": self._lr}, + attrs={"shape": lr_shape, + "value": self._learning_rate}) + + def _create_accumulators(self, block, parameters): + assert isinstance(block, framework.Block) + + for p in parameters: + self._add_accumulator(block, self._velocity_acc_str, p, 'float32') + + def _append_optimize_op(self, block, param_and_grad): + assert isinstance(block, framework.Block) + + velocity_acc = self._get_accumulator(self._velocity_acc_str, + param_and_grad[0]) + # create the momentum optimize op + momentum_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "Velocity": velocity_acc, + "LearningRate": self._lr + }, + outputs={ + "ParamOut": param_and_grad[0], + "VelocityOut": velocity_acc + }, + attrs={"mu": self._momentum}) + + return momentum_op diff --git a/python/paddle/v2/framework/tests/test_optimizer.py b/python/paddle/v2/framework/tests/test_optimizer.py index 3d6fa70737bf3..e6a142ac361b5 100644 --- a/python/paddle/v2/framework/tests/test_optimizer.py +++ b/python/paddle/v2/framework/tests/test_optimizer.py @@ -6,7 +6,7 @@ class TestOptimizer(unittest.TestCase): def test_sgd_optimizer(self): - program = framework.g_program + program = framework.Program() block = program.global_block() mul_x = block.create_parameter( 
dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") @@ -14,7 +14,7 @@ def test_sgd_optimizer(self): dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") mul_out = block.create_var( dtype="float32", shape=[5, 8], lod_level=0, name="mul.out") - mul_op = block.append_op( + block.append_op( type="mul", inputs={"X": mul_x, "Y": mul_y}, @@ -27,5 +27,47 @@ def test_sgd_optimizer(self): self.assertEqual(sgd_op.type, "sgd") +class TestMomentumOptimizer(unittest.TestCase): + class MockMomentum(optimizer.MomentumOptimizer): + def get_accumulators(self): + return self._accumulators + + def get_velocity_str(self): + return self._velocity_acc_str + + def test_momentum_optimizer(self): + program = framework.Program() + block = program.global_block() + mul_x = block.create_parameter( + dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") + mul_y = block.create_var( + dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") + mul_out = block.create_var( + dtype="float32", shape=[5, 8], lod_level=0, name="mul.out") + block.append_op( + type="mul", + inputs={"X": mul_x, + "Y": mul_y}, + outputs={"Out": mul_out}, + attrs={"x_num_col_dims": 1}) + momentum_optimizer = self.MockMomentum(learning_rate=0.01, momentum=0.2) + params_grads = momentum_optimizer.create_backward_pass(mul_out) + self.assertEqual(len(params_grads), 1) + self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) + opts = momentum_optimizer.create_optimization_pass(params_grads, + mul_out) + self.assertEqual(len(opts), 1) + sgd_op = opts[0] + self.assertEqual(sgd_op.type, "momentum") + + # Check accumulators + accumulators = momentum_optimizer.get_accumulators() + self.assertEqual(len(accumulators), 1) + self.assertTrue(momentum_optimizer.get_velocity_str() in accumulators) + velocity_acc = accumulators[momentum_optimizer.get_velocity_str()] + self.assertEqual(len(velocity_acc), 1) + self.assertTrue(mul_x.name in velocity_acc) + + if __name__ == '__main__': unittest.main()