
Commit

[BugFix] is_matched behaves inconsistently across dp ranks under shard-reshard; use dp0's value as the source of truth (#9404)
bo-ke authored Nov 13, 2024
1 parent 3e54b85 commit bada65d
Showing 1 changed file with 6 additions and 1 deletion.
7 changes: 6 additions & 1 deletion paddlenlp/trainer/utils/sharding_io.py
@@ -214,7 +214,7 @@ def _load_one_state_dict_from_checkpoint(self, resume_from_checkpoint, base_weig
     if not os.path.isfile(file_path):
         raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}, no {file_path}")

-    logger.info(f"Loading model from {resume_from_checkpoint} .")
+    logger.info(f"Loading model from {file_path}.")
     # We load the model state dict on the CPU to avoid an OOM error.
     state_dict = paddle.load(file_path, return_numpy=True)
     return state_dict
@@ -315,6 +315,11 @@ def load_optimizer_state_with_reshard(self, checkpoint, base_opt_name, model_wra
     is_matched = reshard_util.sharding_v2.is_matched_optimizer_state_dict(
         one_shard_opt_state_dict, self.optimizer, model_wrapped
     )
+    is_matched = paddle.to_tensor([is_matched], dtype=paddle.int32)
+    dp_group = fleet.get_hybrid_communicate_group().get_data_parallel_group()
+    dp_src_rank = fleet.get_hybrid_communicate_group().get_data_parallel_group_src_rank()
+    dist.broadcast(is_matched, src=dp_src_rank, group=dp_group)
+    is_matched = bool(is_matched[0])
 else:
     is_matched = True

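For context, a minimal sketch of the pattern this commit applies: each data-parallel rank may compute a different local is_matched, so the value computed on the dp source rank (dp0) is broadcast to the rest of the group and overrides the local one, ensuring all ranks take the same reshard path. The standalone helper name sync_flag_across_dp is illustrative and not part of the repository; it assumes fleet has already been initialized with a hybrid (dp/sharding) configuration.

import paddle
import paddle.distributed as dist
from paddle.distributed import fleet


def sync_flag_across_dp(local_is_matched: bool) -> bool:
    # Fetch the data-parallel group and its source rank (dp0) from the
    # hybrid communicate group set up by fleet.
    hcg = fleet.get_hybrid_communicate_group()
    dp_group = hcg.get_data_parallel_group()
    dp_src_rank = hcg.get_data_parallel_group_src_rank()

    # A bare Python bool cannot be broadcast, so pack it into an int32 tensor.
    flag = paddle.to_tensor([int(local_is_matched)], dtype=paddle.int32)

    # Every rank in the dp group overwrites its local value with dp0's value.
    dist.broadcast(flag, src=dp_src_rank, group=dp_group)
    return bool(flag[0])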
