
Commit

add arg
haohongxiang committed Mar 21, 2024
1 parent f005084 commit 03577c2
Showing 1 changed file with 14 additions and 0 deletions.
14 changes: 14 additions & 0 deletions paddlenlp/trainer/training_args.py
@@ -259,6 +259,7 @@ class TrainingArguments:
enable_stage1_tensor_fusion, fuse small tensors into larger tensor chunks to accelerate communication; may increase memory occupation.
enable_stage1_overlap, fuse small tensors into larger tensor chunks to accelerate communication and overlap it with the backward computation; may slow down the backward pass.
enable_stage2_overlap, overlap stage2 NCCL communication with computation. The overlap has some constraints: logging_step must be greater than 1 for broadcast overlap, and no other synchronization may be called during training while broadcast overlap is enabled.
disable_stage1_reduce_avg, replace reduce_avg with the original reduce_sum + scale in stage1; can be used for accuracy verification.
recompute (`bool`, *optional*, defaults to `False`):
Recompute the forward pass to calculate gradients. Used for saving memory.
Only supported for networks with transformer blocks.
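
For context, here is a minimal usage sketch of the new option. It assumes the space-separated sharding_parallel_config string and the sharding field handled elsewhere in this diff; the output_dir value is purely illustrative.

```python
# Minimal sketch: request the reduce_sum + scale fallback through TrainingArguments.
# Assumes sharding_parallel_config takes a space-separated option string, as the
# parsing code later in this diff suggests.
from paddlenlp.trainer import TrainingArguments

args = TrainingArguments(
    output_dir="./checkpoints",                            # illustrative path
    sharding="stage1",                                     # the flag is only valid for sharding stage1
    sharding_parallel_config="disable_stage1_reduce_avg",  # fall back to reduce_sum + scale
)
```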
@@ -626,6 +627,7 @@ class TrainingArguments:
"following config is support: \n"
"enable_stage1_tensor_fusion, fuse small tensors into big tensor chunks to accelerate communications, may increase memory occupation\n"
"enable_stage1_overlap, fuse small tensors into big tensor chunks to accelerate communications and do communication overlap with backward computation, may harm the backward speed\n"
"disable_stage1_reduce_avg, replace reduce_avg with original reduce_sum+scale in stage1, which can be used for accuracy verification.\n"
"enable_stage2_overlap, overlap stage2 NCCL communication with computation. There are some constraints for the overlap, such as the logging_step should be bigger than 1 for broadcast overlap and no other sync could be called during the training for broadcast overlap"
)
},
@@ -1136,11 +1138,23 @@ def is_segment_parallel_supported():
"enable_stage1_overlap",
"enable_stage2_overlap",
"split_param",
"disable_stage1_reduce_avg",
]:
raise ValueError(
f"Found unknown pipeline mode config {x}, "
f"accpet config is enable_stage1_tensor_fusion, enable_stage1_overlap, enable_stage2_overlap."
)
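# disable_stage1_reduce_avg: switch the stage1 gradient collective back to reduce_sum + scale (the accuracy-verification path described in the docstring above).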
if "disable_stage1_reduce_avg" in sharding_parallel_config:
assert (
self.sharding == "stage1"
), "Only sharding stage1 supports to disable reduce_avg strategy."
try:
strategy.hybrid_configs["sharding_configs"].use_reduce_avg = False
except Exception:
warnings.warn(
"The reduce_avg strategy is not supported by current version of Paddle so you don't need to disable it. The nccl comm in sharding still use reduce_sum and scale of gradients."
)

try:
if "split_param" in sharding_parallel_config:
strategy.hybrid_configs["sharding_configs"].split_param = True
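
For readers unfamiliar with the flag, the sketch below (plain Python, hypothetical gradient values, no distributed launch) shows what it toggles between: a single reduce_avg collective versus the original reduce_sum followed by a 1/world_size scale. The two are mathematically equivalent, although the floating-point rounding of the fused collective may differ slightly from sum-then-scale, which is why the two-step path is kept available for accuracy verification.

```python
# Hypothetical per-rank gradients for one parameter; no real collective is launched here.
grads_per_rank = [0.9, 1.1, 1.0, 1.0]
world_size = len(grads_per_rank)

# What use_reduce_avg=True computes in a single collective op.
reduce_avg = sum(grads_per_rank) / world_size

# The original two-step path that disable_stage1_reduce_avg restores.
reduce_sum_then_scale = sum(grads_per_rank) * (1.0 / world_size)

assert abs(reduce_avg - reduce_sum_then_scale) < 1e-12
```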
