From e0dae27832bb9541aebf0119ebdd33c632de8b56 Mon Sep 17 00:00:00 2001
From: haohongxiang
Date: Thu, 21 Mar 2024 09:27:43 +0000
Subject: [PATCH] Add disable_stage1_reduce_avg option to sharding_parallel_config

---
 paddlenlp/trainer/training_args.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/paddlenlp/trainer/training_args.py b/paddlenlp/trainer/training_args.py
index df2cbb359a21..79c7efde0575 100644
--- a/paddlenlp/trainer/training_args.py
+++ b/paddlenlp/trainer/training_args.py
@@ -259,6 +259,7 @@ class TrainingArguments:
             enable_stage1_tensor_fusion, fuse small tensors into big tensor chunks to accelerate communications, may increase memory occupation
             enable_stage1_overlap, fuse small tensors into big tensor chunks to accelerate communications and do communication overlap with backward computation, may harm the backward speed
             enable_stage2_overlap, overlap stage2 NCCL communication with computation. There are some constraints for the overlap, such as the logging_step should be bigger than 1 for broadcast overlap and no other sync could be called during the training for broadcast overlap.
+            disable_stage1_reduce_avg, replace reduce_avg with the original reduce_sum + scale in stage1, which can be used for accuracy verification.
         recompute (`bool`, *optional*, defaults to `False`):
             Recompute the forward pass to calculate gradients. Used for saving memory.
             Only support for networks with transformer blocks.
@@ -626,6 +627,7 @@ class TrainingArguments:
                 "following config is support: \n"
                 "enable_stage1_tensor_fusion, fuse small tensors into big tensor chunks to accelerate communications, may increase memory occupation\n"
                 "enable_stage1_overlap, fuse small tensors into big tensor chunks to accelerate communications and do communication overlap with backward computation, may harm the backward speed\n"
+                "disable_stage1_reduce_avg, replace reduce_avg with the original reduce_sum + scale in stage1, which can be used for accuracy verification.\n"
                 "enable_stage2_overlap, overlap stage2 NCCL communication with computation. There are some constraints for the overlap, such as the logging_step should be bigger than 1 for broadcast overlap and no other sync could be called during the training for broadcast overlap"
             )
         },
@@ -1136,11 +1138,24 @@ def is_segment_parallel_supported():
                         "enable_stage1_overlap",
                         "enable_stage2_overlap",
                         "split_param",
+                        "disable_stage1_reduce_avg",
                     ]:
                         raise ValueError(
                             f"Found unknown pipeline mode config {x}, "
                             f"accpet config is enable_stage1_tensor_fusion, enable_stage1_overlap, enable_stage2_overlap."
                         )
+                if "disable_stage1_reduce_avg" in sharding_parallel_config:
+                    assert (
+                        self.sharding == "stage1"
+                    ), "Only sharding stage1 supports disabling the reduce_avg strategy."
+                    try:
+                        strategy.hybrid_configs["sharding_configs"].use_reduce_avg = False
+                    except Exception:
+                        warnings.warn(
+                            "disable_stage1_reduce_avg is not supported by the current "
+                            "version of Paddle. Please try the latest develop version of Paddle."
+                        )
+
                 try:
                     if "split_param" in sharding_parallel_config:
                         strategy.hybrid_configs["sharding_configs"].split_param = True
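
Usage note (not part of the patch): a minimal sketch of how the new option might be enabled from user code, assuming the existing sharding and sharding_parallel_config fields of TrainingArguments referenced above; the output_dir value is illustrative only.

    from paddlenlp.trainer import TrainingArguments

    # Hypothetical example: run sharding stage1 but replace reduce_avg with
    # reduce_sum + scale for accuracy verification.
    args = TrainingArguments(
        output_dir="./checkpoints",  # illustrative path
        sharding="stage1",
        sharding_parallel_config="disable_stage1_reduce_avg",
    )

When launching through a script that parses these fields with PdArgumentParser, the equivalent command-line form would be --sharding "stage1" --sharding_parallel_config "disable_stage1_reduce_avg".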