[sharding] Add arg to disable sharding reduce_avg for accuracy verification #8168

Merged 1 commit on Mar 22, 2024
15 changes: 15 additions & 0 deletions paddlenlp/trainer/training_args.py
@@ -259,6 +259,7 @@
enable_stage1_tensor_fusion, fuse small tensors into big tensor chunks to accelerate communication; may increase memory occupation
enable_stage1_overlap, fuse small tensors into big tensor chunks to accelerate communication and overlap it with backward computation; may harm backward speed
enable_stage2_overlap, overlap stage2 NCCL communication with computation. There are some constraints for the overlap, such as logging_step should be bigger than 1 for broadcast overlap and no other sync can be called during training for broadcast overlap.
disable_stage1_reduce_avg, replace reduce_avg with the original reduce_sum+scale in stage1; can be used for accuracy verification.
recompute (`bool`, *optional*, defaults to `False`):
Recompute the forward pass to calculate gradients. Used for saving memory.
Only supported for networks with transformer blocks.
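
A minimal usage sketch for the new flag (hedged: the values, output_dir, and the stage1 setting are illustrative assumptions, not taken from this PR):

from paddlenlp.trainer import TrainingArguments

# Illustrative configuration; only sharding stage1 honors disable_stage1_reduce_avg.
args = TrainingArguments(
    output_dir="./checkpoints",
    sharding="stage1",
    sharding_parallel_config="disable_stage1_reduce_avg",
)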
@@ -626,6 +627,7 @@
"following config is support: \n"
"enable_stage1_tensor_fusion, fuse small tensors into big tensor chunks to accelerate communications, may increase memory occupation\n"
"enable_stage1_overlap, fuse small tensors into big tensor chunks to accelerate communications and do communication overlap with backward computation, may harm the backward speed\n"
"disable_stage1_reduce_avg, replace reduce_avg with original reduce_sum+scale in stage1, which can be used for accuracy verification.\n"
"enable_stage2_overlap, overlap stage2 NCCL communication with computation. There are some constraints for the overlap, such as the logging_step should be bigger than 1 for broadcast overlap and no other sync could be called during the training for broadcast overlap"
)
},
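
Why this matters for accuracy verification: averaging inside the collective (reduce_avg) and summing then scaling (reduce_sum + 1/n) are mathematically equal but can round differently in floating point. A hedged NumPy sketch (not PR code; the gradient values are made up):

import numpy as np

grads = [np.float32(0.1), np.float32(0.2), np.float32(0.7)]  # per-rank gradients, made up
n = np.float32(len(grads))

divide_then_add = np.float32(sum(g / n for g in grads))  # ordering a fused AVG reduction might use
add_then_divide = np.float32(sum(grads)) / n             # reduce_sum followed by an explicit scale

print(divide_then_add, add_then_divide)  # equal in exact math, may differ in the last bits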
@@ -1036,6 +1038,7 @@
"In pipeline model, the evaluation also shares same setting with training. "
"We will enforce that per_device_eval_batch_size=per_device_train_batch_size * gradient_accumulation_steps."
)

self.per_device_eval_batch_size = (
    self.per_device_train_batch_size * self.gradient_accumulation_steps
)
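
A tiny worked example of the enforcement above (numbers assumed, not from the PR):

per_device_train_batch_size = 2
gradient_accumulation_steps = 4
# Pipeline evaluation is forced onto the same micro-batch schedule as training:
per_device_eval_batch_size = per_device_train_batch_size * gradient_accumulation_steps  # 8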
@@ -1136,11 +1139,23 @@
"enable_stage1_overlap",
"enable_stage2_overlap",
"split_param",
"disable_stage1_reduce_avg",
]:
raise ValueError(
f"Found unknown pipeline mode config {x}, "
f"accpet config is enable_stage1_tensor_fusion, enable_stage1_overlap, enable_stage2_overlap."
)
if "disable_stage1_reduce_avg" in sharding_parallel_config:
assert self.sharding == [

Check warning on line 1149 in paddlenlp/trainer/training_args.py

View check run for this annotation

Codecov / codecov/patch

paddlenlp/trainer/training_args.py#L1148-L1149

Added lines #L1148 - L1149 were not covered by tests
ShardingOption.SHARD_OP
], "Only sharding stage1 supports to disable reduce_avg strategy."
try:
strategy.hybrid_configs["sharding_configs"].use_reduce_avg = False
except:
warnings.warn(

Check warning on line 1155 in paddlenlp/trainer/training_args.py

View check run for this annotation

Codecov / codecov/patch

paddlenlp/trainer/training_args.py#L1152-L1155

Added lines #L1152 - L1155 were not covered by tests
"The reduce_avg strategy is not supported by current version of Paddle so you don't need to disable it. The nccl comm in sharding still use reduce_sum and scale of gradients."
)
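
Conceptually, disabling reduce_avg falls back to an explicit sum-then-scale over ranks. A hedged sketch of that pattern in raw paddle.distributed (illustrative only; the PR itself just flips use_reduce_avg on the fleet strategy):

import paddle.distributed as dist

def allreduce_mean_via_sum(grad):
    # reduce_sum across ranks, then scale by 1/world_size: the "original"
    # behavior that disable_stage1_reduce_avg restores for verification.
    dist.all_reduce(grad, op=dist.ReduceOp.SUM)
    grad.scale_(1.0 / dist.get_world_size())
    return grad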

try:
    if "split_param" in sharding_parallel_config:
        strategy.hybrid_configs["sharding_configs"].split_param = True