diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py index cc06c78aafaf..5befe333b58a 100644 --- a/paddlenlp/trainer/trainer.py +++ b/paddlenlp/trainer/trainer.py @@ -2225,7 +2225,8 @@ def _rotate_checkpoints(self, use_mtime=False, output_dir=None) -> None: checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete] for checkpoint in checkpoints_to_be_deleted: logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit") - shutil.rmtree(checkpoint) + # Ignore errors: on disks shared between training nodes, another node may have already deleted this checkpoint. + shutil.rmtree(checkpoint, ignore_errors=True) def _save(self, output_dir: Optional[str] = None, state_dict=None, merge_tensor_parallel=False): output_dir = output_dir if output_dir is not None else self.args.output_dir