
Commit 1c7e5e2

fix fsdp checkpointing issues (#24926)
* fix fsdp load
* Update trainer.py
* remove saving duplicate state_dict
1 parent 9ef5256 commit 1c7e5e2

File tree

1 file changed: +11 −7 lines changed


src/transformers/trainer.py

Lines changed: 11 additions & 7 deletions
@@ -2115,7 +2115,7 @@ def _load_best_model(self):
                 state_dict["_smp_is_partial"] = False
                 load_result = model.load_state_dict(state_dict, strict=True)
             elif self.is_fsdp_enabled:
-                load_fsdp_model(
+                load_result = load_fsdp_model(
                     self.accelerator.state.fsdp_plugin, self.accelerator, model, self.state.best_model_checkpoint
                 )
             else:
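
The change in `_load_best_model` captures the return value of `load_fsdp_model`, matching the other branches, which already keep `load_result` so load problems can be reported. A minimal standalone sketch (not Trainer code) of why that value is worth keeping; it assumes the loader returns the same NamedTuple as `torch.nn.Module.load_state_dict`, with `missing_keys` and `unexpected_keys`:

import logging

logger = logging.getLogger(__name__)

def report_load_result(load_result) -> None:
    # Assumes `load_result` mirrors torch.nn.Module.load_state_dict's
    # return type: a NamedTuple with `missing_keys` and `unexpected_keys`.
    if load_result is None:
        logger.warning("Loader returned no result; cannot verify the load.")
        return
    if load_result.missing_keys:
        logger.warning("Missing keys while loading best model: %s", load_result.missing_keys)
    if load_result.unexpected_keys:
        logger.warning("Unexpected keys while loading best model: %s", load_result.unexpected_keys)

Silently discarding the result, as the old line did, hides partially loaded best checkpoints.
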
@@ -2298,6 +2298,7 @@ def _save_checkpoint(self, model, trial, metrics=None):
                 # Needs to be called on all ranks to gather all states.
                 # full_optim_state_dict will be deprecated after Pytorch 2.2!
                 full_osd = self.model.__class__.full_optim_state_dict(self.model, self.optimizer)
+                torch.save(full_osd, os.path.join(output_dir, OPTIMIZER_NAME))

         if is_torch_tpu_available():
             xm.rendezvous("saving_optimizer_states")
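
This hunk writes the gathered optimizer state to disk right where `full_optim_state_dict` produces it, instead of relying on the generic save branch further down (which the next hunk stops using for FSDP). A hedged sketch of the underlying PyTorch pattern; the rank-0 guard and the `optimizer.pt` filename (the assumed value of `OPTIMIZER_NAME`) are my additions:

import os

import torch
import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

def save_full_optimizer_state(model: FSDP, optimizer, output_dir: str) -> None:
    # Must be called on every rank: it all-gathers the sharded optimizer
    # state. Per the comment in the diff, this API is slated for
    # deprecation after PyTorch 2.2 in favor of the state_dict_type APIs.
    full_osd = FSDP.full_optim_state_dict(model, optimizer)
    # By default only rank 0 receives the full dict, so only rank 0 writes.
    if not dist.is_initialized() or dist.get_rank() == 0:
        torch.save(full_osd, os.path.join(output_dir, "optimizer.pt"))
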
@@ -2321,12 +2322,9 @@ def _save_checkpoint(self, model, trial, metrics=None):
             reissue_pt_warnings(caught_warnings)
         if self.do_grad_scaling:
             torch.save(self.scaler.state_dict(), os.path.join(output_dir, SCALER_NAME))
-        elif self.args.should_save and not self.is_deepspeed_enabled:
+        elif self.args.should_save and not self.is_deepspeed_enabled and not (self.fsdp or self.is_fsdp_enabled):
             # deepspeed.save_checkpoint above saves model/optim/sched
-            if self.fsdp and not self.is_fsdp_enabled:
-                torch.save(full_osd, os.path.join(output_dir, OPTIMIZER_NAME))
-            else:
-                torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME))
+            torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME))

         with warnings.catch_warnings(record=True) as caught_warnings:
             torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME))
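
With the optimizer state now saved inside the FSDP-specific branch above, the fallback `torch.save` would write a second, redundant copy; the new condition skips it for both the legacy `self.fsdp` path and the accelerate-backed `self.is_fsdp_enabled` path, and the old `full_osd` special case becomes dead code. The resulting decision, restated as a standalone sketch (the function and its parameters are mine, mirroring the attributes in the diff):

def saves_plain_optimizer_state(should_save: bool, is_deepspeed_enabled: bool,
                                fsdp: object, is_fsdp_enabled: bool) -> bool:
    # DeepSpeed checkpoints model/optim/sched itself, and the FSDP branches
    # above already wrote the gathered optimizer state, so the plain
    # torch.save only runs for ordinary (non-sharded) training.
    return should_save and not is_deepspeed_enabled and not (fsdp or is_fsdp_enabled)
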
@@ -2731,10 +2729,16 @@ def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False):
             or self.fsdp is not None
             or self.is_fsdp_enabled
         ):
-            state_dict = self.model.state_dict()
+            state_dict = self.model.state_dict() if not self.is_fsdp_enabled else {}
             if self.args.should_save:
                 self._save(output_dir, state_dict=state_dict)
             if self.is_fsdp_enabled:
+                # remove the dummy state_dict saved above
+                if self.args.should_save:
+                    for filename in [WEIGHTS_NAME, SAFE_WEIGHTS_NAME]:
+                        file = os.path.join(output_dir, filename)
+                        if os.path.isfile(file):
+                            os.remove(file)
                 save_fsdp_model(self.accelerator.state.fsdp_plugin, self.accelerator, self.model, output_dir)

         elif self.is_deepspeed_enabled:
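
When accelerate-backed FSDP is enabled, materializing `self.model.state_dict()` would give `_save` a state dict to write out, duplicating the checkpoint that `save_fsdp_model` writes immediately afterwards (the "duplicate state_dict" in the commit message). Passing an empty dict lets `_save` still emit config and tokenizer files, at the cost of also writing an essentially empty weights file, which the new block deletes. A hedged standalone sketch of that cleanup; the constant values are assumed to match `transformers.utils` ("pytorch_model.bin" and "model.safetensors"):

import os

# Assumed values of the constants referenced in the diff.
WEIGHTS_NAME = "pytorch_model.bin"
SAFE_WEIGHTS_NAME = "model.safetensors"

def remove_dummy_weight_files(output_dir: str) -> None:
    # _save(output_dir, state_dict={}) writes config/tokenizer files but
    # also a dummy weights file; drop it so only the FSDP checkpoint
    # written afterwards remains in the output directory.
    for filename in (WEIGHTS_NAME, SAFE_WEIGHTS_NAME):
        path = os.path.join(output_dir, filename)
        if os.path.isfile(path):
            os.remove(path)
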
