
Commit beed9b1

Merge branch 'main' into release/v0.32.0
2 parents 912e412 + 0e59217

File tree

6 files changed (+64, -30 lines)


composer/distributed/prepare_distributed.py

Lines changed: 23 additions & 1 deletion
@@ -42,6 +42,27 @@ def log_execution_time(logger: logging.Logger, operation_name: str):
         logger.info(f'{operation_name} took {end_time - start_time:.2f} seconds')
 
 
+@contextmanager
+def get_full_state_dict(model: torch.nn.Module):
+    """Context manager to temporarily get full state dict regardless of should_save_peft_only setting for huggingface models.
+
+    PEFT models with lora have an updated state_dict fn (in composer/models/huggingface.py) that
+    returns the state_dict with only the lora params if should_save_peft_only is True.
+    But when we're syncing module states, we need the full state dict, so we temporarily set
+    should_save_peft_only to False.
+    """
+    # TODO: Since sharding peft/lora weights can be inefficient due to their small sizes (leading to communication overhead
+    # outweighing memory savings), we should provide an interface that allows users to avoid sharding these weights.
+    original_setting = getattr(model, 'should_save_peft_only', None)
+    if original_setting is not None:
+        model.should_save_peft_only = False  # type: ignore
+    try:
+        yield
+    finally:
+        if original_setting is not None:
+            model.should_save_peft_only = original_setting  # type: ignore
+
+
 def _check_duplicate_modules(model: torch.nn.Module) -> None:
     """Checks whether the model has duplicate module references.
@@ -98,7 +119,8 @@ def _parallelize_model_helper(
            full_state_dict=True,
            cpu_offload=True,
        )
-       full_state_dict = get_model_state_dict(model, options=options)
+       with get_full_state_dict(model):
+           full_state_dict = get_model_state_dict(model, options=options)
 
    with log_execution_time(log, 'Prepare FSDP2'):
        prepare_fully_shard(model, config, precision, fsdp_wrap_policy)
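
For context, here is a minimal standalone sketch of the toggle-and-restore pattern the new get_full_state_dict helper relies on. _StubPeftModel is a stand-in for illustration only, not Composer's HuggingFaceModel; the real helper operates on torch.nn.Module as shown in the diff above.

from contextlib import contextmanager


class _StubPeftModel:
    """Stand-in for a PEFT-wrapped HuggingFaceModel (illustration only)."""

    def __init__(self):
        self.should_save_peft_only = True

    def state_dict(self):
        # Composer's HuggingFaceModel filters to adapter-only params when
        # should_save_peft_only is True; the stub just records the flag.
        return {'filtered_to_peft_params': self.should_save_peft_only}


@contextmanager
def get_full_state_dict(model):
    # Same toggle-and-restore logic as the new helper in prepare_distributed.py.
    original_setting = getattr(model, 'should_save_peft_only', None)
    if original_setting is not None:
        model.should_save_peft_only = False
    try:
        yield
    finally:
        if original_setting is not None:
            model.should_save_peft_only = original_setting


model = _StubPeftModel()
with get_full_state_dict(model):
    assert model.state_dict() == {'filtered_to_peft_params': False}  # full weights visible here
assert model.should_save_peft_only is True  # original flag restored, even if an exception is raised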

composer/trainer/trainer.py

Lines changed: 6 additions & 0 deletions
@@ -1701,6 +1701,12 @@ def __init__(
             log.info('No previous autoresume checkpoint found')
         # Actually load the checkpoint from potentially updated arguments
         if load_path is not None:
+            # If we are using FSDP and load_monolith_rank0_only is True, then the state_dict must be `full`
+            # when we are loading a checkpoint
+            if self.state.fsdp_config and self.state.fsdp_config.load_monolith_rank0_only:  # type: ignore
+                err_msg = 'state_dict_type must be `full` when load_monolith_rank0_only is True when loading a checkpoint'
+                assert self.state.fsdp_config.state_dict_type == 'full', err_msg  # type: ignore
+
             log.info(f'Loading checkpoint from {load_path}')
             if load_object_store is None:
                 load_object_store = maybe_create_object_store_from_uri(load_path)
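
A standalone sketch of the guard's behavior, using a stub config object rather than Composer's real Trainer/State (the names and checkpoint path below are illustrative): the combination is validated only when a checkpoint is actually loaded.

from dataclasses import dataclass
from typing import Optional


@dataclass
class _StubFSDPConfig:
    # Mirrors the two fields the guard inspects on state.fsdp_config.
    state_dict_type: str = 'sharded'
    load_monolith_rank0_only: bool = False


def _validate_load(fsdp_config: Optional[_StubFSDPConfig], load_path: Optional[str]) -> None:
    if load_path is None:
        return  # nothing to load, so nothing to validate
    if fsdp_config and fsdp_config.load_monolith_rank0_only:
        assert fsdp_config.state_dict_type == 'full', (
            'state_dict_type must be `full` when load_monolith_rank0_only is True when loading a checkpoint'
        )


# OK: monolithic rank-0 loading with a full state dict.
_validate_load(_StubFSDPConfig(state_dict_type='full', load_monolith_rank0_only=True), 's3://bucket/ckpt.pt')

# OK: the sharded + monolithic combination is tolerated when no checkpoint is loaded.
_validate_load(_StubFSDPConfig(load_monolith_rank0_only=True), None)

# AssertionError: a sharded state dict cannot be loaded monolithically on rank 0.
# _validate_load(_StubFSDPConfig(load_monolith_rank0_only=True), 's3://bucket/ckpt.pt')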

composer/utils/parallelism.py

Lines changed: 5 additions & 10 deletions
@@ -73,8 +73,12 @@ class FSDP2Config:
         reshard_after_forward (Union[bool, int]): Controls parameter behavior after forward.
         activation_checkpointing (bool): Whether to use activation checkpointing. Defaults to False.
         activation_cpu_offload (bool): Whether to use activation CPU offloading. Defaults to False.
-        load_monolith_rank0_only (bool): Whether to load monolithic checkpoints on rank 0 only. Defaults to False.
         state_dict_type (str): Type of state dict to use. Can be 'full' or 'sharded'. Defaults to 'sharded'.
+            - Note: In cases where `load_path` is not set in Trainer, `state_dict_type` indicates how a model will be saved.
+            - Note: In cases where `load_path` is set in Trainer, `state_dict_type` indicates how a model will be loaded and also saved.
+        load_monolith_rank0_only (bool): Whether to load monolithic checkpoints on rank 0 only. Defaults to False.
+            - Note: when `load_monolith_rank0_only` is True and `load_path` is set in `Trainer`, `state_dict_type` must be 'full'.
+        mixed_precision (str): Mixed precision to use. Can be 'DEFAULT', 'PURE', or 'FULL'. Defaults to 'DEFAULT'.
         verbose (bool): Whether to print verbose output. Defaults to False.
     """
 
@@ -169,15 +173,6 @@ def use_orig_params(self) -> bool:
     def __post_init__(self):
         warnings.warn('FSDP2 Config/APIs are experimental and subject to heavy changes', UserWarning)
 
-        # TODO: We might not need `load_monolith_rank0_only` as we can theoretically use
-        # self.monolith_rank0_only = self.state_dict_type == 'full' assuming that saving
-        # the model doesn't get affected by `load_monolith_rank0_only`
-        if self.load_monolith_rank0_only and self.state_dict_type != 'full':
-            raise ValueError(
-                'load_monolith_rank0_only=True requires state_dict_type="full". '
-                f'Got state_dict_type="{self.state_dict_type}"',
-            )
-
 
 @dataclass
 class TPConfig:
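
A short sketch of what moves where, assuming a composer build that includes this change: the config below used to raise ValueError in __post_init__, but now constructs cleanly, and the constraint is enforced by the Trainer guard above only when load_path is set.

import warnings

from composer.utils.parallelism import FSDP2Config

with warnings.catch_warnings():
    warnings.simplefilter('ignore', UserWarning)  # FSDP2 APIs are experimental
    # Previously rejected at construction time; now only rejected by Trainer
    # when this config is combined with load_path (monolithic checkpoint load).
    config = FSDP2Config(
        state_dict_type='sharded',
        load_monolith_rank0_only=True,
    )

assert config.state_dict_type == 'sharded'
assert config.load_monolith_rank0_only is True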

setup.py

Lines changed: 1 addition & 1 deletion
@@ -174,7 +174,7 @@ def package_files(prefix: str, directory: str, extension: str):
 
 extra_deps['nlp'] = [
     'transformers>=4.11,!=4.34.0,<4.54',
-    'datasets>=2.4,<4',
+    'datasets>=2.4,<5',
     'huggingface-hub>=0.21.2,<0.34',
 ]
 
tests/trainer/test_fsdp2.py

Lines changed: 29 additions & 0 deletions
@@ -10,8 +10,10 @@
 import torch.distributed.fsdp
 from torch.distributed._tensor import DTensor
 from torch.utils.data import DataLoader
+from transformers.models.gpt2.modeling_gpt2 import GPT2Block
 
 from composer.models import ComposerClassifier
+from composer.models.huggingface import HuggingFaceModel
 from composer.trainer.trainer import Trainer
 from composer.utils import dist, load_checkpoint
 from composer.utils.parallelism import FSDP2Config, FSDPConfig, ParallelismConfig
@@ -815,3 +817,30 @@ def validate_reduce_dtype(module):
 
     for handle in hook_handles:
         handle.remove()
+
+
+@pytest.mark.gpu
+@world_size(2)
+def test_fsdp2_with_peft_model_and_mixed_init(
+    world_size: int,
+    tiny_gpt2_model,
+    tiny_gpt2_tokenizer,
+    gpt2_peft_config,
+):
+    del world_size
+    resolved_device = 'cuda' if dist.get_local_rank() == 0 else 'meta'
+    model = HuggingFaceModel(
+        tiny_gpt2_model,
+        tokenizer=tiny_gpt2_tokenizer,
+        peft_config=gpt2_peft_config,
+        should_save_peft_only=True,
+    )
+    for module in model.model.modules():
+        if isinstance(module, GPT2Block):
+            module._fsdp_wrap = True  # type: ignore
+    model.to(resolved_device)
+
+    create_trainer_with_model(
+        model=model,  # type: ignore
+        use_fsdp2=True,
+    )
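
As background for the mixed-init pattern the new test exercises, here is a plain-PyTorch sketch (no Composer, single process, rank hard-coded for illustration; full_state_dict_from_rank0 is a hypothetical placeholder) of why non-rank-0 workers start on the meta device and later need a full state dict to materialize real weights.

import torch
import torch.nn as nn

rank = 1  # stand-in for dist.get_local_rank(); the test uses 'cuda' on rank 0
device = 'cpu' if rank == 0 else 'meta'

with torch.device(device):
    model = nn.Linear(8, 8)  # on 'meta', parameters are placeholders with no storage

if rank != 0:
    # Allocate real (uninitialized) storage first, then fill it from the full
    # state dict synced from rank 0 -- which is why get_full_state_dict above
    # must not hand back a PEFT-only subset of the weights.
    model = model.to_empty(device='cpu')
    # model.load_state_dict(full_state_dict_from_rank0)  # synced/broadcast elsewhere

print(next(model.parameters()).device)  # meta at init, cpu after to_empty()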

tests/trainer/test_fsdp2_config.py

Lines changed: 0 additions & 18 deletions
@@ -87,21 +87,3 @@ def test_fsdp2config_from_fsdp1_multiple_invalid_attributes():
     assert any('invalid_attribute2: value2' in msg for msg in warning_messages)
     assert any('auto_wrap: True' in msg for msg in warning_messages)
     assert any('sync_module_states: True' in msg for msg in warning_messages)
-
-
-def test_fsdp2_config_monolithic_validation():
-    """Test FSDP2Config validation for monolithic checkpointing."""
-    # Test valid monolithic config
-    config = FSDP2Config(
-        state_dict_type='full',
-        load_monolith_rank0_only=True,
-    )
-    assert config.state_dict_type == 'full'
-    assert config.load_monolith_rank0_only is True
-
-    # Test invalid monolithic config
-    with pytest.raises(ValueError, match='load_monolith_rank0_only=True requires state_dict_type="full"'):
-        FSDP2Config(
-            state_dict_type='sharded',
-            load_monolith_rank0_only=True,
-        )
