From e0dae27832bb9541aebf0119ebdd33c632de8b56 Mon Sep 17 00:00:00 2001
From: haohongxiang
Date: Thu, 21 Mar 2024 09:27:43 +0000
Subject: [PATCH] Add disable_stage1_reduce_avg option to sharding_parallel_config

---
 paddlenlp/trainer/training_args.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/paddlenlp/trainer/training_args.py b/paddlenlp/trainer/training_args.py
index df2cbb359a21..79c7efde0575 100644
--- a/paddlenlp/trainer/training_args.py
+++ b/paddlenlp/trainer/training_args.py
@@ -259,6 +259,7 @@ class TrainingArguments:
             enable_stage1_tensor_fusion, fuse small tensors into big tensor chunks to accelerate communications, may increase memory occupation
             enable_stage1_overlap, fuse small tensors into big tensor chunks to accelerate communications and do communication overlap with backward computation, may harm the backward speed
             enable_stage2_overlap, overlap stage2 NCCL communication with computation. There are some constraints for the overlap, such as the logging_step should be bigger than 1 for broadcast overlap and no other sync could be called during the training for broadcast overlap.
+            disable_stage1_reduce_avg, replace reduce_avg with the original reduce_sum + scale in stage1, which can be used for accuracy verification.
         recompute (`bool`, *optional*, defaults to `False`):
             Recompute the forward pass to calculate gradients. Used for saving memory.
             Only support for networks with transformer blocks.
@@ -626,6 +627,7 @@ class TrainingArguments:
                 "following config is support: \n"
                 "enable_stage1_tensor_fusion, fuse small tensors into big tensor chunks to accelerate communications, may increase memory occupation\n"
                 "enable_stage1_overlap, fuse small tensors into big tensor chunks to accelerate communications and do communication overlap with backward computation, may harm the backward speed\n"
+                "disable_stage1_reduce_avg, replace reduce_avg with the original reduce_sum + scale in stage1, which can be used for accuracy verification.\n"
                 "enable_stage2_overlap, overlap stage2 NCCL communication with computation. There are some constraints for the overlap, such as the logging_step should be bigger than 1 for broadcast overlap and no other sync could be called during the training for broadcast overlap"
             )
         },
@@ -1136,11 +1138,24 @@ def is_segment_parallel_supported():
                         "enable_stage1_overlap",
                         "enable_stage2_overlap",
                         "split_param",
+                        "disable_stage1_reduce_avg",
                     ]:
                         raise ValueError(
                             f"Found unknown pipeline mode config {x}, "
                             f"accpet config is enable_stage1_tensor_fusion, enable_stage1_overlap, enable_stage2_overlap."
                         )
+                if "disable_stage1_reduce_avg" in sharding_parallel_config:
+                    assert (
+                        self.sharding == "stage1"
+                    ), "Only sharding stage1 supports disabling the reduce_avg strategy."
+                    try:
+                        strategy.hybrid_configs["sharding_configs"].use_reduce_avg = False
+                    except Exception:
+                        warnings.warn(
+                            "disable_stage1_reduce_avg is not supported by the current "
+                            "version of Paddle. Please try the latest develop version of Paddle."
+                        )
+
                 try:
                     if "split_param" in sharding_parallel_config:
                         strategy.hybrid_configs["sharding_configs"].split_param = True
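
Usage note (not part of the patch): a minimal sketch of how the new option might be enabled from user code, assuming the existing sharding and sharding_parallel_config fields of TrainingArguments referenced above; the output_dir value is illustrative only.

    from paddlenlp.trainer import TrainingArguments

    # Hypothetical example: run sharding stage1 but replace reduce_avg with
    # reduce_sum + scale for accuracy verification.
    args = TrainingArguments(
        output_dir="./checkpoints",  # illustrative path
        sharding="stage1",
        sharding_parallel_config="disable_stage1_reduce_avg",
    )

When launching through a script that parses these fields with PdArgumentParser, the equivalent command-line form would be --sharding "stage1" --sharding_parallel_config "disable_stage1_reduce_avg".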