
[Unified Checkpoint] Update async save info (#8982)
* [Unified checkpoint] update optimizer async save signal

* [Unified Checkpoint] Update async save info
DesmonDay authored Aug 29, 2024
1 parent ae691e2 commit 5956822
Showing 1 changed file with 19 additions and 0 deletions.
19 changes: 19 additions & 0 deletions paddlenlp/trainer/trainer.py
@@ -19,6 +19,7 @@
 import collections
 import contextlib
 import inspect
+import json
 import math
 import os
 import random
@@ -2475,6 +2476,24 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None, merge_tensor_
         # Save a trained model and configuration using `save_pretrained()`.
         # They can then be reloaded using `from_pretrained()`
 
+        local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", 0))
+        if (
+            strtobool(os.getenv("FLAG_LLM_PDC", "False"))
+            and local_rank == 0
+            and self.args.unified_checkpoint
+            and "async_save" in self.args.unified_checkpoint_config
+        ):
+            os.makedirs(self.args.logging_dir, exist_ok=True)
+            world_size = paddle.distributed.get_world_size()
+            save_info = {
+                "world_size": world_size,
+                "ignore_save_lr_and_optim": self.args.ignore_save_lr_and_optim,
+                "skip_save_model_weight": "skip_save_model_weight" in self.args.unified_checkpoint_config,
+            }
+            if not os.path.exists(os.path.join(self.args.logging_dir, "async_save_info.json")):
+                with open(os.path.join(self.args.logging_dir, "async_save_info.json"), "w") as f:
+                    json.dump(save_info, f)
+
         if self.args.should_save:
             if self.tokenizer is not None:
                 self.tokenizer.save_pretrained(output_dir)
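For context, this change has the node-local rank-0 process write a small async_save_info.json sidecar into logging_dir before an async unified-checkpoint save, recording the world size and which optimizer/weight components were skipped. Because the write is guarded by os.path.exists, the file reflects the first save of the run and is not overwritten by later checkpoints. Below is a minimal sketch of how a downstream tool might read that file; the "./logs" path is a hypothetical stand-in for self.args.logging_dir and is not part of the commit:

    import json
    import os

    # Hypothetical location; in the trainer this is
    # os.path.join(self.args.logging_dir, "async_save_info.json").
    info_path = os.path.join("./logs", "async_save_info.json")

    with open(info_path) as f:
        save_info = json.load(f)

    # The commit writes exactly these three keys.
    print("saved with world_size:", save_info["world_size"])
    print("LR/optimizer state skipped:", save_info["ignore_save_lr_and_optim"])
    print("model weight save skipped:", save_info["skip_save_model_weight"])

A consumer comparing save_info["world_size"] against its current world size could, for example, decide whether the checkpoint shards can be loaded directly or need re-sharding.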
