[remove fluid] under fleet meta_optimizers (#47864)
* [remove fluid] under fleet meta_optimizers
wangzhen38 authored Nov 16, 2022
1 parent 9fba1e7 commit a2a97cb
Showing 19 changed files with 350 additions and 51 deletions.
3 changes: 2 additions & 1 deletion python/paddle/distributed/__init__.py
@@ -12,9 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from . import io
from .spawn import spawn # noqa: F401
from .launch.main import launch # noqa: F401

from .parallel import init_parallel_env # noqa: F401
from .parallel import get_rank # noqa: F401
from .parallel import get_world_size # noqa: F401
@@ -74,6 +74,7 @@
from . import rpc

__all__ = [ # noqa
"io",
"spawn",
"launch",
"scatter",
@@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle.fluid.optimizer import Optimizer
import paddle.fluid.core as core
from paddle.optimizer import Optimizer
import paddle.framework.core as core
from . import ascend_parser
from paddle.distributed import fleet
import hccl.manage.api as hccl
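This hunk is representative of the whole commit: `paddle.fluid.*` import paths are swapped for their public `paddle.*` equivalents. A minimal sketch of the new import surface, assuming Paddle 2.4+ where these relocated modules are available:

# Sketch only: the new public paths adopted in this commit.
from paddle.optimizer import Optimizer   # was: from paddle.fluid.optimizer import Optimizer
import paddle.framework.core as core     # was: import paddle.fluid.core as core

# The compiled core module is now reached through paddle.framework; e.g. the
# VarDesc enum used later in this commit lives at core.VarDesc.VarType.
print(core.VarDesc.VarType.SELECTED_ROWS)
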
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.core as core
import paddle.framework.core as core
import numpy as np
from functools import reduce

@@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle.fluid.dygraph import base as imperative_base
from paddle.fluid import framework
import paddle.autograd as imperative_base
from paddle import framework

__all__ = []

@@ -41,13 +41,13 @@ def __init__(self, optimizer, strategy):
# NOTE(liubo48): In pure DataParallel mode,
# the gradient synchronization is achieved through reducer.

@imperative_base.no_grad
@imperative_base.no_grad()
@framework.dygraph_only
def step(self):
parameters_list = _obtain_optimizer_parameters_list(self._inner_opt)
self._inner_opt.step()

@imperative_base.no_grad
@imperative_base.no_grad()
def minimize(
self, loss, startup_program=None, parameters=None, no_grad_set=None
):
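The decorator change in this file (bare `@no_grad` becoming `@no_grad()`) reflects that `paddle.autograd.no_grad` is instantiated before being applied. A small sketch of the pattern; `WrappedOptimizer` is a toy stand-in, not the fleet class:

import paddle
import paddle.autograd as imperative_base


class WrappedOptimizer:
    """Toy stand-in for the fleet wrapper: runs the inner step without grad tracking."""

    def __init__(self, inner_opt):
        self._inner_opt = inner_opt

    @imperative_base.no_grad()   # note the call: the relocated API is used as no_grad()
    def step(self):
        self._inner_opt.step()


linear = paddle.nn.Linear(4, 4)
opt = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters())
loss = paddle.mean(linear(paddle.randn([2, 4])))
loss.backward()
WrappedOptimizer(opt).step()
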
@@ -13,7 +13,7 @@
# limitations under the License.

from ...base.topology import ParallelMode
from paddle.fluid.dygraph import base as imperative_base
import paddle.autograd as imperative_base
import paddle
from paddle import _legacy_C_ops

@@ -51,7 +51,7 @@ def minimize(self, optimizer, *args, **kwargs):

return optimize_ops, params_grads

@imperative_base.no_grad
@imperative_base.no_grad()
def _unscale(self, optimizer):
if not self._enable:
return
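The same `no_grad()` call style is applied to the scaler's `_unscale` hook. For context, a hedged sketch of how a gradient scaler of this kind is typically driven from user code, using the standard `paddle.amp.GradScaler` API rather than the fleet-internal class:

import paddle

model = paddle.nn.Linear(8, 8)
opt = paddle.optimizer.Adam(parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

data = paddle.randn([4, 8])
with paddle.amp.auto_cast():
    loss = paddle.mean(model(data))

scaled = scaler.scale(loss)    # scale the loss to avoid fp16 gradient underflow
scaled.backward()
scaler.minimize(opt, scaled)   # unscales gradients (the role of _unscale above), then steps
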
@@ -19,10 +19,10 @@
sharding_reduce_gradients,
)
from ...base.topology import ParallelMode
from paddle.fluid.dygraph import base as imperative_base
from paddle.fluid import framework
from paddle.autograd import no_grad
from paddle import framework
from ...utils.log_util import logger
from paddle.fluid import core
from paddle.framework import core
from paddle.fluid import layers

__all__ = []
@@ -47,7 +47,7 @@ def __init__(self, clip, hcg):
self._clip = clip
self._hcg = hcg

@imperative_base.no_grad
@no_grad()
def _dygraph_clip(self, params_grads):
sum_square_dist_fp16 = []
sum_square_dist_fp32 = []
@@ -63,8 +63,8 @@ def _dygraph_clip(self, params_grads):
if g.type == core.VarDesc.VarType.SELECTED_ROWS:
merge_grad = layers.merge_selected_rows(g)
merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
square = layers.square(merge_grad)
sum_square = layers.reduce_sum(square)
square = paddle.square(merge_grad)
sum_square = paddle.sum(square)

not_shared_enable = (not hasattr(p, 'is_firstly_shared')) or (
hasattr(p, 'is_firstly_shared')
@@ -89,8 +89,8 @@ def _dygraph_clip(self, params_grads):
[0.0], dtype=paddle.float32
)
else:
global_norm_dist_fp16 = layers.concat(sum_square_dist_fp16)
global_norm_dist_fp16 = layers.reduce_sum(global_norm_dist_fp16)
global_norm_dist_fp16 = paddle.concat(sum_square_dist_fp16)
global_norm_dist_fp16 = paddle.sum(global_norm_dist_fp16)
global_norm_dist_fp16 = paddle.cast(
global_norm_dist_fp16, dtype=paddle.float32
)
@@ -101,29 +101,27 @@ def _dygraph_clip(self, params_grads):
[0.0], dtype=paddle.float32
)
else:
global_norm_not_dist_fp16 = layers.concat(sum_square_not_dist_fp16)
global_norm_not_dist_fp16 = layers.reduce_sum(
global_norm_not_dist_fp16
)
global_norm_not_dist_fp16 = paddle.concat(sum_square_not_dist_fp16)
global_norm_not_dist_fp16 = paddle.sum(global_norm_not_dist_fp16)
global_norm_not_dist_fp16 = paddle.cast(
global_norm_not_dist_fp16, dtype=paddle.float32
)

# global norm of distributed FP32 params_and_grads
global_norm_dist_fp32 = (
layers.concat(sum_square_dist_fp32)
paddle.concat(sum_square_dist_fp32)
if len(sum_square_dist_fp32) != 0
else paddle.to_tensor([0.0], dtype=paddle.float32)
)
global_norm_dist_fp32 = layers.reduce_sum(global_norm_dist_fp32)
global_norm_dist_fp32 = paddle.sum(global_norm_dist_fp32)

# global norm of non-distributed FP32 params_and_grads
global_norm_not_dist_fp32 = (
layers.concat(sum_square_not_dist_fp32)
paddle.concat(sum_square_not_dist_fp32)
if len(sum_square_not_dist_fp32) != 0
else paddle.to_tensor([0.0], dtype=paddle.float32)
)
global_norm_not_dist_fp32 = layers.reduce_sum(global_norm_not_dist_fp32)
global_norm_not_dist_fp32 = paddle.sum(global_norm_not_dist_fp32)

global_norm_var_dist = global_norm_dist_fp16 + global_norm_dist_fp32
global_norm_var_not_dist = (
@@ -151,14 +149,16 @@ def _dygraph_clip(self, params_grads):
group=self._hcg.get_sharding_parallel_group(),
)

global_norm_var_fp32 = layers.sqrt(
global_norm_var_fp32 = paddle.sqrt(
global_norm_var_dist + global_norm_var_not_dist
)

max_global_norm = layers.fill_constant(
shape=[1], dtype=global_norm_var_fp32.dtype, value=self.clip_norm
max_global_norm = paddle.full(
shape=[1],
dtype=global_norm_var_fp32.dtype,
fill_value=self.clip_norm,
)
clip_var = layers.elementwise_div(
clip_var = paddle.divide(
x=max_global_norm,
y=paddle.maximum(x=global_norm_var_fp32, y=max_global_norm),
)
@@ -229,7 +229,7 @@ def __init__(self, optimizer, hcg, strategy):
self._inner_opt._grad_clip, hcg
)

@imperative_base.no_grad
@no_grad()
@framework.dygraph_only
def step(self):
parameters_list = _obtain_optimizer_parameters_list(self._inner_opt)
@@ -241,7 +241,7 @@ def step(self):

self._inner_opt.step()

@imperative_base.no_grad
@no_grad()
def minimize(
self, loss, startup_program=None, parameters=None, no_grad_set=None
):
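The bulk of this file's changes replace `layers.*` math ops with their `paddle.*` equivalents inside the sharded global-norm clip. A self-contained sketch of that clipping math using the new ops — single process, no hcg or sharding groups, and `clip_by_global_norm` is an illustrative name, not the fleet implementation:

import paddle

def clip_by_global_norm(grads, clip_norm=1.0):
    # sum of squares over all gradients, as in the _dygraph_clip hunks above
    sum_squares = [paddle.sum(paddle.square(g)) for g in grads]
    global_norm = paddle.sqrt(
        paddle.sum(paddle.concat([s.reshape([1]) for s in sum_squares]))
    )
    max_norm = paddle.full(
        shape=[1], dtype=global_norm.dtype, fill_value=clip_norm
    )
    # scale factor mirrors the paddle.divide / paddle.maximum pair in the diff
    scale = paddle.divide(x=max_norm, y=paddle.maximum(x=global_norm, y=max_norm))
    return [g * scale for g in grads]

grads = [paddle.randn([4, 4]), paddle.randn([8])]
clipped = clip_by_global_norm(grads, clip_norm=0.5)
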
@@ -28,7 +28,7 @@

import paddle
import paddle.distributed as dist
from paddle.fluid import core
from paddle.framework import core
from paddle.optimizer import Optimizer
from paddle.fluid.clip import ClipGradByGlobalNorm
from paddle.distributed.collective import (
@@ -18,7 +18,7 @@
OpRole,
)

from paddle.fluid import core
from paddle.framework import core

__all__ = []

@@ -13,7 +13,8 @@
# limitations under the License.

from ..common import is_optimizer_op, OP_ROLE_KEY, OpRole, is_update_op
from paddle.fluid import core, unique_name
from paddle.framework import core
from paddle.utils import unique_name

__all__ = []

@@ -12,7 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle.fluid import core, unique_name
from paddle.framework import core
from paddle.utils import unique_name
from functools import reduce
from paddle.distributed.fleet.meta_optimizers.common import (
is_loss_grad_op,
@@ -1046,11 +1047,11 @@ def sharding_predicate(var):
)

if int(os.environ.get('PADDLE_TRAINER_ID', 0)) == 0:
paddle.fluid.io.save_persistables(
exe, dirname, main_program=main_program, filename=None
paddle.distributed.io.save_persistables(
exe, dirname, main_program=main_program, filename=filename
)
else:
paddle.fluid.io.save_vars(
paddle.static.save_vars(
exe,
dirname,
main_program=main_program,
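The last visible hunk reroutes checkpoint saving from `paddle.fluid.io` to `paddle.distributed.io.save_persistables` (rank 0) and `paddle.static.save_vars` (other ranks). A hedged, single-process sketch of those two calls under static graph mode; the directory name and the parameter are illustrative:

import os
import paddle
from paddle import static
import paddle.distributed.io as dist_io

paddle.enable_static()
main_program = static.default_main_program()
with static.program_guard(main_program, static.default_startup_program()):
    weight = static.create_parameter(shape=[4, 4], dtype="float32", name="w")  # illustrative parameter

exe = static.Executor(paddle.CPUPlace())
exe.run(static.default_startup_program())

dirname = "./sharding_ckpt"  # hypothetical output directory
if int(os.environ.get("PADDLE_TRAINER_ID", 0)) == 0:
    # rank 0 saves all persistables, as in the diff above
    dist_io.save_persistables(exe, dirname, main_program=main_program, filename=None)
else:
    # other ranks save only the variables selected by a predicate
    static.save_vars(exe, dirname, main_program=main_program,
                     predicate=lambda var: var.persistable)
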