
Commit

[BugFix] is_matched behaves inconsistently across dp ranks under shard-reshard; use dp0's value as the source of truth (#9404)
bo-ke authored Nov 13, 2024
1 parent 3e54b85 commit bada65d
Showing 1 changed file with 6 additions and 1 deletion.
7 changes: 6 additions & 1 deletion paddlenlp/trainer/utils/sharding_io.py
@@ -214,7 +214,7 @@ def _load_one_state_dict_from_checkpoint(self, resume_from_checkpoint, base_weig
     if not os.path.isfile(file_path):
         raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}, no {file_path}")

-    logger.info(f"Loading model from {resume_from_checkpoint} .")
+    logger.info(f"Loading model from {file_path}.")
     # We load the model state dict on the CPU to avoid an OOM error.
     state_dict = paddle.load(file_path, return_numpy=True)
     return state_dict
@@ -315,6 +315,11 @@ def load_optimizer_state_with_reshard(self, checkpoint, base_opt_name, model_wra
     is_matched = reshard_util.sharding_v2.is_matched_optimizer_state_dict(
         one_shard_opt_state_dict, self.optimizer, model_wrapped
     )
+    is_matched = paddle.to_tensor([is_matched], dtype=paddle.int32)
+    dp_group = fleet.get_hybrid_communicate_group().get_data_parallel_group()
+    dp_src_rank = fleet.get_hybrid_communicate_group().get_data_parallel_group_src_rank()
+    dist.broadcast(is_matched, src=dp_src_rank, group=dp_group)
+    is_matched = bool(is_matched[0])
 else:
     is_matched = True

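For context, a minimal sketch of the pattern this commit applies: each data-parallel rank may compute a different local is_matched, so the value computed on the dp source rank (dp0) is broadcast to the rest of the group and overrides the local one, ensuring all ranks take the same reshard path. The standalone helper name sync_flag_across_dp is illustrative and not part of the repository; it assumes fleet has already been initialized with a hybrid (dp/sharding) configuration.

import paddle
import paddle.distributed as dist
from paddle.distributed import fleet


def sync_flag_across_dp(local_is_matched: bool) -> bool:
    # Fetch the data-parallel group and its source rank (dp0) from the
    # hybrid communicate group set up by fleet.
    hcg = fleet.get_hybrid_communicate_group()
    dp_group = hcg.get_data_parallel_group()
    dp_src_rank = hcg.get_data_parallel_group_src_rank()

    # A bare Python bool cannot be broadcast, so pack it into an int32 tensor.
    flag = paddle.to_tensor([int(local_is_matched)], dtype=paddle.int32)

    # Every rank in the dp group overwrites its local value with dp0's value.
    dist.broadcast(flag, src=dp_src_rank, group=dp_group)
    return bool(flag[0])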
