neuralmagic · Satrat · Mar 18, 2024 · Mar 15, 2024 · Mar 18, 2024
diff --git a/src/sparseml/transformers/finetune/runner.py b/src/sparseml/transformers/finetune/runner.py
@@ -40,7 +40,11 @@
 )
 from sparseml.transformers.finetune.model_args import ModelArguments
 from sparseml.transformers.finetune.training_args import TrainingArguments
-from sparseml.utils.fsdp.helpers import is_fsdp_model, unwrap_and_export_model
+from sparseml.utils.fsdp.helpers import (
+    find_and_move_state_dicts_to_cpu,
+    is_fsdp_model,
+    unwrap_and_export_model,
+)
 
 
 _LOGGER: logging.Logger = logging.getLogger(__name__)
@@ -175,6 +179,15 @@ def one_shot(self, stage: Optional[str] = None):
                     output_dir=self._output_dir,
                     tokenizer=self.tokenizer,
                 )
+                # only allow the main process to move the state
+                # dicts to cpu
+                if self.trainer.accelerator.is_main_process:
+                    # assuming quantization is the last step
+                    # we no longer need the original model
+                    # and can safely delete it to save memory
+                    del self.trainer.model
+                    find_and_move_state_dicts_to_cpu(self._output_dir)
+
         else:
             save_model_and_recipe(
                 model=self.trainer.model,

diff --git a/src/sparseml/utils/fsdp/helpers.py b/src/sparseml/utils/fsdp/helpers.py
@@ -12,7 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import logging
 import operator
+from pathlib import Path
 from typing import Optional, Union
 
 
@@ -25,6 +27,7 @@
 except ImportError:
     FullyShardedDataParallel = None
 
+import torch
 from torch.nn import Module
 
 from sparseml.core.model import ModifiableModel
@@ -39,8 +42,11 @@
     "unwrap_and_export_model",
     "save_pretrained_fsdp",
     "get_fsdp_parent",
+    "find_and_move_state_dicts_to_cpu",
 ]
 
+_LOGGER = logging.getLogger(__name__)
+
 
 def is_fsdp_model(model: Module) -> bool:
     """
@@ -113,7 +119,28 @@ def unwrap_and_export_model(model, accelerator, output_dir, tokenizer):
         )
 
 
-def save_pretrained_fsdp(model, accelerator, output_dir):
+def find_and_move_state_dicts_to_cpu(output_dir: str):
+    """
+    Looks for state dicts in the output directory and overwrites them
+    with cpu state dicts.
+
+    This is needed for quantized models trained with FSDP, as the state dict
+    contains device information which can cause issues when loading the model
+    using transformers AutoModel.from_pretrained(...) if it is not removed.
+    Assumes the state dicts are named pytorch_model*.bin
+
+    :param output_dir: directory searched recursively for state dict files
+    """
+
+    for model_file in Path(output_dir).rglob("pytorch_model*.bin"):
+        # map_location="cpu" remaps all tensors (nested ones too) at load time
+        # and avoids needing the original device just to rewrite the file
+        loaded_dict = torch.load(model_file, map_location="cpu")
+        torch.save(loaded_dict, model_file)
+        _LOGGER.info(f"Moved state dict {model_file} to cpu")
+
+
+def save_pretrained_fsdp(model, accelerator, output_dir, save_safetensors: bool = True):
     full_state_dict_config = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
     """
     Gathers the full FSDP state dict of the model onto rank0 GPU, then uses it to save

diff --git a/src/sparseml/version.py b/src/sparseml/version.py
@@ -19,7 +19,7 @@
 from datetime import date
 
 
-version_base = "1.7.0"
+version_base = "1.7.1"
 is_release = False  # change to True to set the generated version as a release version