
Commit 57aabba

gradient scale (#33862)
1 parent 3fc56aa commit 57aabba

5 files changed: +119 -0 lines changed

paddle/fluid/framework/distributed_strategy.proto

Lines changed: 11 additions & 0 deletions
@@ -119,6 +119,16 @@ message ExecutionStrategy {
   optional bool use_thread_barrier = 4 [ default = false ];
 }

+message GradientScaleConfig {
+  // Optional value ['avg', 'sum', 'customized']
+  // If avg, loss@grad will be divided by the number of devices,
+  // that is, the gradient will be accumulated and averaged among
+  // multiple devices.
+  // Else if sum, the gradient will be accumulated among multiple
+  // devices.
+  optional string scale_strategy = 1 [ default = 'avg' ];
+}
+
 message AsyncConfig {
   optional int32 k_steps = 1 [ default = -1 ];
   optional int32 max_merge_var_num = 2 [ default = 1 ];
@@ -195,6 +205,7 @@ message DistributedStrategy {
   optional TensorParallelConfig tensor_parallel_configs = 113;
   optional BuildStrategy build_strategy = 201;
   optional ExecutionStrategy execution_strategy = 202;
+  optional GradientScaleConfig gradient_scale_configs = 203;
 }

 message DistributedJobInfo {
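
For intuition, here is a minimal NumPy sketch (hypothetical per-device gradients, not Paddle API) of what the two built-in strategies amount to: 'sum' only accumulates the gradient across devices, while 'avg' additionally divides the accumulated gradient by the device count.

import numpy as np

# Hypothetical gradients of one parameter computed on 4 devices.
num_devices = 4
per_device_grads = [np.random.rand(3).astype('float32') for _ in range(num_devices)]

# 'sum': gradients are only accumulated across devices.
sum_grad = np.sum(per_device_grads, axis=0)

# 'avg': the accumulated gradient is divided by the number of devices.
avg_grad = sum_grad / num_devices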

python/paddle/distributed/fleet/base/distributed_strategy.py

Lines changed: 22 additions & 0 deletions
@@ -254,6 +254,28 @@ def build_strategy(self, strategy):
                 getattr(self.strategy.build_strategy,
                         f.name).extend(getattr(strategy, f.name))

+    @property
+    def gradient_scale_configs(self):
+        """
+        Set the strategy of gradient scaling.
+        Examples:
+
+          .. code-block:: python
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.gradient_scale_configs = {'scale_strategy': 'avg'}
+
+        Note that the strategy must be one of 'avg', 'sum' or 'customized'.
+        """
+        return get_msg_dict(self.strategy.gradient_scale_configs)
+
+    @gradient_scale_configs.setter
+    @is_strict_auto
+    def gradient_scale_configs(self, config):
+        check_configs_key(self.strategy.gradient_scale_configs, config,
+                          'gradient_scale_configs')
+        assign_configs_value(self.strategy.gradient_scale_configs, config)
+
     @property
     def a_sync(self):
         """

python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py

Lines changed: 12 additions & 0 deletions
@@ -18,6 +18,7 @@
 from .meta_optimizer_base import MetaOptimizerBase
 from ..base.private_helper_function import wait_server_ready
 import logging
+from paddle.static import BuildStrategy

 __all__ = []

@@ -147,6 +148,17 @@ def _try_to_compile(self, startup_program, main_program, loss):
         local_build_strategy.nccl_comm_num = \
             dist_strategy.nccl_comm_num

+        gradient_scale_configs = self.user_defined_strategy.gradient_scale_configs
+        scale_strategys = {
+            'avg': BuildStrategy.GradientScaleStrategy.CoeffNumDevice,
+            'sum': BuildStrategy.GradientScaleStrategy.One,
+            'customized': BuildStrategy.GradientScaleStrategy.Customized,
+        }
+        assert gradient_scale_configs['scale_strategy'] in scale_strategys, \
+            "gradient_scale_configs.scale_strategy must be 'avg', 'sum' or 'customized'"
+        local_build_strategy.gradient_scale_strategy = \
+            scale_strategys[gradient_scale_configs['scale_strategy']]
+
         if self.user_defined_strategy.recompute == True:
             logging.warn(
                 "set enable_sequential_execution=True since you have enable the recompute strategy"

python/paddle/fluid/tests/unittests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -107,6 +107,7 @@ if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32)
     LIST(REMOVE_ITEM TEST_OPS test_collective_wait)
     LIST(REMOVE_ITEM TEST_OPS test_memcpy_op)
     LIST(REMOVE_ITEM TEST_OPS test_raw_program_optimizer)
+    LIST(REMOVE_ITEM TEST_OPS test_fleet_gradient_scale)
 endif()

 if(WIN32)
python/paddle/fluid/tests/unittests/test_fleet_gradient_scale.py

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+
+import paddle
+import paddle.fluid as fluid
+import paddle.distributed.fleet as fleet
+import numpy as np
+import os
+
+
+class TestGradientScale(unittest.TestCase):
+    def setUp(self):
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
+
+    def mlp(self, input_x, input_y, hid_dim=128, label_dim=2):
+        fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim, activation='tanh')
+        fc_2 = paddle.static.nn.fc(x=fc_1, size=hid_dim, activation='tanh')
+        prediction = paddle.static.nn.fc(x=[fc_2],
+                                         size=label_dim,
+                                         activation='softmax')
+        cost = paddle.nn.functional.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.mean(x=cost)
+        return avg_cost
+
+    def gen_data(self):
+        return {
+            "x": np.random.random(size=(128, 32)).astype('float32'),
+            "y": np.random.randint(
+                2, size=(128, 1)).astype('int64')
+        }
+
+    def test_single_gpu(self):
+        paddle.enable_static()
+        fleet.init(is_collective=True)
+        main_program = paddle.static.Program()
+        startup_program = paddle.static.Program()
+        strategy = fleet.DistributedStrategy()
+        strategy.gradient_scale_configs = {'scale_strategy': 'sum'}
+        with fluid.program_guard(main_program, startup_program):
+            with fluid.unique_name.guard():
+                input_x = paddle.static.data(
+                    name="x", shape=[None, 32], dtype='float32')
+                input_y = paddle.static.data(
+                    name="y", shape=[None, 1], dtype='int64')
+                cost = self.mlp(input_x=input_x, input_y=input_y)
+                output_name = cost.name
+                optimizer = fleet.distributed_optimizer(fluid.optimizer.Adam(),
+                                                        strategy)
+                optimizer.minimize(cost)
+
+        final_strategy = fleet._final_strategy()
+        assert final_strategy.gradient_scale_configs['scale_strategy'] == 'sum'
+
+
+if __name__ == "__main__":
+    unittest.main()
