diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py
index 2a82cdb33140..9f34589bc94d 100644
--- a/paddlenlp/trainer/trainer.py
+++ b/paddlenlp/trainer/trainer.py
@@ -19,6 +19,7 @@
 import collections
 import contextlib
 import inspect
+import json
 import math
 import os
 import random
@@ -2475,6 +2476,24 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None, merge_tensor_
         # Save a trained model and configuration using `save_pretrained()`.
         # They can then be reloaded using `from_pretrained()`

+        local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", 0))
+        if (
+            strtobool(os.getenv("FLAG_LLM_PDC", "False"))
+            and local_rank == 0
+            and self.args.unified_checkpoint
+            and "async_save" in self.args.unified_checkpoint_config
+        ):
+            os.makedirs(self.args.logging_dir, exist_ok=True)
+            world_size = paddle.distributed.get_world_size()
+            save_info = {
+                "world_size": world_size,
+                "ignore_save_lr_and_optim": self.args.ignore_save_lr_and_optim,
+                "skip_save_model_weight": "skip_save_model_weight" in self.args.unified_checkpoint_config,
+            }
+            if not os.path.exists(os.path.join(self.args.logging_dir, "async_save_info.json")):
+                with open(os.path.join(self.args.logging_dir, "async_save_info.json"), "w") as f:
+                    json.dump(save_info, f)
+
         if self.args.should_save:
             if self.tokenizer is not None:
                 self.tokenizer.save_pretrained(output_dir)
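
The added block writes a one-time `async_save_info.json` manifest into `logging_dir` from the node-local rank-0 process, recording the world size and which checkpoint components were skipped. Below is a minimal, self-contained sketch of how a downstream recovery or cleanup job might read that manifest back; the `logging_dir` value, the `load_async_save_info` helper, and the `WORLD_SIZE` comparison are illustrative assumptions, not part of this PR.

```python
import json
import os


def load_async_save_info(logging_dir: str) -> dict:
    """Read the async-save manifest dumped by the trainer, if present."""
    path = os.path.join(logging_dir, "async_save_info.json")
    if not os.path.exists(path):
        return {}
    with open(path) as f:
        return json.load(f)


if __name__ == "__main__":
    # Hypothetical logging_dir; the trainer writes the manifest into
    # self.args.logging_dir.
    info = load_async_save_info("./logs")
    # A recovery job could compare the recorded world size against the
    # current one before reusing an asynchronously saved checkpoint.
    if info and info["world_size"] != int(os.getenv("WORLD_SIZE", "1")):
        print("world size changed since the checkpoint was written")
```

Because the trainer only writes the file when it does not already exist, the manifest reflects the first save of the run; a consumer should treat it as run-level metadata rather than per-checkpoint state.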