【paddle.fleet】parameter_server_optimizer support auto_strategy (PaddlePaddle#27181)

* parameter_server_optimizer support auto_strategy
123malin authored Sep 10, 2020
1 parent fde5cfe commit 60c3ef3
Showing 17 changed files with 251 additions and 157 deletions.
8 changes: 4 additions & 4 deletions python/paddle/distributed/fleet/base/fleet_base.py
@@ -231,7 +231,7 @@ def worker_num(self):
Returns:
int: worker numbers
Examples:
.. code-block:: python
@@ -737,7 +737,7 @@ def set_lr(self, value):
"""
Set the value of the learning rate manually in the optimizer.
Only work in dygraph mode
Args:
value (float|Tensor): the value of learning rate
@@ -877,7 +877,7 @@ def clear_grad(self):
"""
Execute the optimizer once.
Only work in dygraph mode
Returns: None
Examples:
@@ -1019,7 +1019,7 @@ def minimize(self,
if self.user_defined_strategy._is_strict_auto():
# turn on all the strategy for each optimizer
for opt in distributed_optimizer_list:
opt._enable_strategy(self.user_defined_strategy)
opt._enable_strategy(self.user_defined_strategy, context)

valid_optimizer_list = []
valid_graph_optimizer_list = []
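For orientation, this strict-auto loop is the entry point the rest of the commit serves: with strategy.auto turned on, every registered meta optimizer has its _enable_strategy(dist_strategy, context) called before the valid optimizers are selected. A hedged user-side sketch, assuming the paddle.distributed.fleet static-graph API of this period (fleet.init, DistributedStrategy, distributed_optimizer) and a placeholder network; it is meant to run under a fleet launcher such as fleetrun, which supplies the role environment:

import paddle
import paddle.distributed.fleet as fleet

paddle.enable_static()
fleet.init()  # role (trainer / pserver) comes from the launcher environment

# Placeholder network; any loss tensor works here.
x = paddle.static.data(name="x", shape=[-1, 13], dtype="float32")
y = paddle.static.data(name="y", shape=[-1, 1], dtype="float32")
pred = paddle.static.nn.fc(x, size=1)
loss = paddle.mean(paddle.nn.functional.square_error_cost(pred, y))

strategy = fleet.DistributedStrategy()
strategy.auto = True  # strict auto: each meta optimizer decides via _enable_strategy

optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(loss)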
(next changed file: AMP meta optimizer)
@@ -34,6 +34,9 @@ def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
loss, role_maker, user_defined_optimizer, user_defined_strategy)

def _can_apply(self):
if not self.role_maker._is_collective:
return False

if self.user_defined_strategy.amp:
return True
return False
@@ -42,7 +45,7 @@ def _disable_strategy(self, dist_strategy):
dist_strategy.amp = False
dist_strategy.amp_configs = {}

def _enable_strategy(self, dist_strategy):
def _enable_strategy(self, dist_strategy, context):
dist_strategy.amp = True
dist_strategy.amp_configs = {
"init_loss_scaling": 32768.0,
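The AMP meta optimizer now bails out on non-collective (parameter-server) roles and, under strict auto, switches mixed precision on with the default shown in this hunk. A hedged sketch of requesting the same thing explicitly; the DistributedStrategy construction and distributed_optimizer call are assumptions about the surrounding fleet API, while the config value is taken from the diff:

import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.amp = True
# Same default that _enable_strategy applies in auto mode above.
strategy.amp_configs = {"init_loss_scaling": 32768.0}
# dist_opt = fleet.distributed_optimizer(inner_optimizer, strategy=strategy)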
(next changed file: DGC meta optimizer)
@@ -53,6 +53,9 @@ def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
name=opt._name)

def _can_apply(self):
if not self.role_maker._is_collective:
return False

if self.user_defined_strategy.dgc:
if not isinstance(self.inner_opt, Momentum):
logging.warn("dgc only works on Momentum optimizer")
@@ -69,7 +72,7 @@ def _disable_strategy(self, dist_strategy):
dist_strategy.dgc = False
dist_strategy.dgc_configs = {}

def _enable_strategy(self, dist_strategy):
def _enable_strategy(self, dist_strategy, context):
dist_strategy.dgc = True
dist_strategy.dgc_configs = {"rampup_begin_step": 0, "rampup_step": 1}

@@ -89,5 +92,5 @@ def minimize_impl(self,
no_grad_set=None):
optimize_ops, params_grads = \
self.dgc_opt.minimize(loss, startup_program,
parameter_list, no_grad_set)
parameter_list, no_grad_set)
return optimize_ops, params_grads
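DGC follows the same pattern: collective roles only, and it falls back with a warning unless the inner optimizer is Momentum. A hedged sketch using the ramp-up values the auto path sets above; the fleet and fluid API names outside this diff are assumptions:

import paddle
import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.dgc = True
# Values _enable_strategy uses when auto mode turns DGC on.
strategy.dgc_configs = {"rampup_begin_step": 0, "rampup_step": 1}

# _can_apply above warns and skips DGC unless the inner optimizer is Momentum.
inner_opt = paddle.fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
dist_opt = fleet.distributed_optimizer(inner_opt, strategy=strategy)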
(next changed file: gradient merge meta optimizer)
@@ -37,15 +37,18 @@ def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
self.user_defined_strategy.gradient_merge_configs["avg"])

def _can_apply(self):
if not self.role_maker._is_collective:
return False

can_apply = (self.user_defined_strategy.gradient_merge == True) and \
self.user_defined_strategy.gradient_merge_configs["k_steps"] > 1
self.user_defined_strategy.gradient_merge_configs["k_steps"] > 1
return can_apply

def _disable_strategy(self, dist_strategy):
dist_strategy.gradient_merge = False
dist_strategy.gradient_merge_configs = {}

def _enable_strategy(self, dist_strategy):
def _enable_strategy(self, dist_strategy, context):
# we currently do not support auto-enable gradient merge
return

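Gradient merge is the one strategy auto mode deliberately leaves alone; _can_apply also insists on k_steps > 1, so enabling it is always an explicit choice. A hedged sketch, with the fleet API names assumed and the key names taken from the diff:

import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.gradient_merge = True
# k_steps must be greater than 1 for _can_apply above to accept the strategy.
strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}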
(next changed file: graph execution meta optimizer)
@@ -48,7 +48,7 @@ def backward(self,
callbacks=None):
pass

# should fix the variable
# should fix the variable
def _setup_nccl_op(self, startup_program, main_program, build_strategy):
trainer_endpoints = self.role_maker.get_trainer_endpoints()
trainers = trainer_endpoints
@@ -94,31 +94,31 @@ def _try_to_compile(self, startup_program, main_program, loss):
dist_strategy = self.user_defined_strategy
local_build_strategy = paddle.fluid.BuildStrategy()
local_build_strategy.enable_sequential_execution = \
dist_strategy.build_strategy.enable_sequential_execution
dist_strategy.build_strategy.enable_sequential_execution
local_build_strategy.fuse_elewise_add_act_ops = \
dist_strategy.build_strategy.fuse_elewise_add_act_ops
dist_strategy.build_strategy.fuse_elewise_add_act_ops
local_build_strategy.fuse_bn_act_ops = \
dist_strategy.build_strategy.fuse_bn_act_ops
dist_strategy.build_strategy.fuse_bn_act_ops
local_build_strategy.enable_auto_fusion = \
dist_strategy.build_strategy.enable_auto_fusion
dist_strategy.build_strategy.enable_auto_fusion
local_build_strategy.fuse_relu_depthwise_conv = \
dist_strategy.build_strategy.fuse_relu_depthwise_conv
dist_strategy.build_strategy.fuse_relu_depthwise_conv
local_build_strategy.fuse_broadcast_ops = \
dist_strategy.build_strategy.fuse_broadcast_ops
dist_strategy.build_strategy.fuse_broadcast_ops
local_build_strategy.fuse_all_optimizer_ops = \
dist_strategy.build_strategy.fuse_all_optimizer_ops
dist_strategy.build_strategy.fuse_all_optimizer_ops
local_build_strategy.enable_inplace = \
dist_strategy.build_strategy.enable_inplace
dist_strategy.build_strategy.enable_inplace
local_build_strategy.use_hierarchical_allreduce = \
dist_strategy.use_hierarchical_allreduce
dist_strategy.use_hierarchical_allreduce
local_build_strategy.hierarchical_allreduce_inter_nranks = \
dist_strategy.hierarchical_allreduce_inter_nranks
dist_strategy.hierarchical_allreduce_inter_nranks
local_build_strategy.sync_batch_norm = \
dist_strategy.sync_batch_norm
dist_strategy.sync_batch_norm
local_build_strategy.fuse_all_reduce_ops = \
dist_strategy.fuse_all_reduce_ops
dist_strategy.fuse_all_reduce_ops
local_build_strategy.nccl_comm_num = \
dist_strategy.nccl_comm_num
dist_strategy.nccl_comm_num

if self.user_defined_strategy.recompute == True:
logging.warn(
@@ -190,7 +190,7 @@ def _disable_strategy(self, dist_strategy):
# TODO(guru4elephant): should close all PE related flags here
return

def _enable_strategy(self, dist_strategy):
def _enable_strategy(self, dist_strategy, context):
# by default, graph execution strategy is enabled
return

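The block above is a field-by-field copy from the user's DistributedStrategy into the local BuildStrategy used for the compiled program. A hedged sketch of setting a few of those source fields; the build_strategy setter and the attribute assignments outside this hunk are assumptions about the era's API:

import paddle
import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()

build_strategy = paddle.fluid.BuildStrategy()
build_strategy.fuse_elewise_add_act_ops = True
build_strategy.enable_inplace = True
strategy.build_strategy = build_strategy

# These are copied straight off DistributedStrategy in _try_to_compile above.
strategy.nccl_comm_num = 2
strategy.sync_batch_norm = False
strategy.fuse_all_reduce_ops = True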
(next changed file: LAMB meta optimizer)
@@ -62,6 +62,9 @@ def exclude_fn(param):
name=opt._name)

def _can_apply(self):
if not self.role_maker._is_collective:
return False

if self.user_defined_strategy.lamb:
if not isinstance(self.inner_opt, AdamOptimizer):
logging.warn(
@@ -75,7 +78,7 @@ def _disable_strategy(self, dist_strategy):
dist_strategy.lamb = False
dist_strategy.lamb_configs = {}

def _enable_strategy(self, dist_strategy):
def _enable_strategy(self, dist_strategy, context):
dist_strategy.lamb = True
dist_strategy.lamb_configs = {
"lamb_weight_decay": 0.01,
@@ -102,5 +105,5 @@ def minimize_impl(self,
no_grad_set=None):
optimize_ops, params_grads = \
self.lamb_opt.minimize(loss, startup_program,
parameter_list, no_grad_set)
parameter_list, no_grad_set)
return optimize_ops, params_grads
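LAMB mirrors DGC: collective roles only, skipped with a warning unless the inner optimizer is Adam, and enabled under auto with a 0.01 weight decay. A hedged sketch; the API calls outside this diff are assumptions:

import paddle
import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.lamb = True
strategy.lamb_configs = {"lamb_weight_decay": 0.01}

# _can_apply above requires an Adam inner optimizer for LAMB.
inner_opt = paddle.fluid.optimizer.Adam(learning_rate=0.001)
dist_opt = fleet.distributed_optimizer(inner_opt, strategy=strategy)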
(next changed file: LARS meta optimizer)
@@ -49,6 +49,9 @@ def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
epsilon=configs['epsilon'])

def _can_apply(self):
if not self.role_maker._is_collective:
return False

if self.user_defined_strategy.lars:
if not isinstance(self.inner_opt, Momentum):
logging.warn(
@@ -62,7 +65,7 @@ def _disable_strategy(self, dist_strategy):
dist_strategy.lars = False
dist_strategy.lars_configs = {}

def _enable_strategy(self, dist_strategy):
def _enable_strategy(self, dist_strategy, context):
dist_strategy.lars = True
dist_strategy.lars_configs = {
"lars_coeff": 0.01,
@@ -89,5 +92,5 @@ def minimize_impl(self,
no_grad_set=None):
optimize_ops, params_grads = \
self.lars_opt.minimize(loss, startup_program,
parameter_list, no_grad_set)
parameter_list, no_grad_set)
return optimize_ops, params_grads
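LARS is the Momentum-flavoured counterpart of the LAMB change above; the auto default is lars_coeff = 0.01. A short hedged sketch, with the fleet API assumed:

import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.lars = True
# Auto default from _enable_strategy above; LARS also requires a Momentum inner optimizer.
strategy.lars_configs = {"lars_coeff": 0.01}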
(next changed file: LocalSGD meta optimizer)
@@ -29,22 +29,25 @@ def __init__(self, optimizer):
self.snapshot_key = '@SNAPSHOT'

def _can_apply(self):
if not self.role_maker._is_collective:
return False

if not self.user_defined_strategy.localsgd:
return False

if self.role_maker.worker_num() <= 1:
return False

return isinstance(self.inner_opt, paddle.optimizer.momentum.Momentum) \
or isinstance(self.inner_opt, paddle.fluid.optimizer.Momentum) \
or isinstance(self.inner_opt, paddle.optimizer.sgd.SGD) \
or isinstance(self.inner_opt, paddle.fluid.optimizer.SGD)
or isinstance(self.inner_opt, paddle.fluid.optimizer.Momentum) \
or isinstance(self.inner_opt, paddle.optimizer.sgd.SGD) \
or isinstance(self.inner_opt, paddle.fluid.optimizer.SGD)

def _disable_strategy(self, dist_strategy):
dist_strategy.localsgd = False
dist_strategy.localsgd_configs = {}

def _enable_strategy(self, dist_strategy):
def _enable_strategy(self, dist_strategy, context):
dist_strategy.localsgd = True
dist_strategy.localsgd_configs = {"k_steps": 1}

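LocalSGD adds two extra gates beyond the collective-role check: more than one worker and an SGD or Momentum inner optimizer; auto mode enables it with k_steps = 1. A short hedged sketch, with the fleet API assumed:

import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.localsgd = True
# Auto default from _enable_strategy above; _can_apply additionally requires
# worker_num() > 1 and an SGD/Momentum inner optimizer.
strategy.localsgd_configs = {"k_steps": 1}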
(next changed file: meta optimizer base class)
@@ -48,7 +48,7 @@ def _disable_strategy(self, dist_strategy):
raise NotImplementedError("you should implement disable strategy in {}".
format(type(self).__name__))

def _enable_strategy(self, dist_strategy):
def _enable_strategy(self, dist_strategy, context=None):
raise NotImplementedError("you should implement enable strategy in {}".
format(type(self).__name__))

(next changed file: parameter server meta optimizer)
@@ -24,6 +24,9 @@ def __init__(self, optimizer):
self.meta_optimizers_white_list = []

def _can_apply(self):
if self.role_maker._is_collective:
return False

k_steps = self.user_defined_strategy.a_sync_configs["k_steps"]
if k_steps < 0:
return False
@@ -37,12 +40,11 @@ def _can_apply(self):
return True

def _disable_strategy(self, dist_strategy):
dist_strategy.a_sync_configs = {}
return

def _enable_strategy(self, dist_strategy):
def _enable_strategy(self, dist_strategy, context):
# only open up the async mode for auto-parallel
dist_strategy.a_sync = True
dist_strategy.a_sync_configs = {}
return

def _is_graph_out(self):
return True
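This hunk is what the commit title refers to: on a non-collective (parameter-server) role, strict auto now switches the job to asynchronous training by setting a_sync with default configs, while _disable_strategy no longer touches a_sync_configs. A hedged sketch of the equivalent explicit setting; the k_steps semantics noted in the comments are an assumption about the era's API, not something this diff states:

import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
# What auto mode now does for parameter-server jobs.
strategy.a_sync = True
# Assumption: k_steps = 0 means fully asynchronous updates; k_steps > 0
# would request GEO-style communication every k steps.
strategy.a_sync_configs = {"k_steps": 0}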