Flexible and easy to use HSDP setting #19504

Merged: 29 commits merged into master from hybrid_fsdp_stage on Jun 6, 2024
Commits (29); the diff below shows changes from 24 of them:
312caee  Add fsdp_size for FSDPStrategy (Liyang90, Jan 17, 2024)
45c1123  fix import (Liyang90, Jan 17, 2024)
0ddc51d  Merge branch 'Lightning-AI:master' into hybrid_fsdp_stage (Liyang90, Feb 20, 2024)
c952536  Add flexible HSDP in fabric (Liyang90, Feb 20, 2024)
8fc2404  minor update (Liyang90, Feb 20, 2024)
da3900f  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Feb 20, 2024)
8311be1  Merge branch 'Lightning-AI:master' into hybrid_fsdp_stage (Liyang90, Mar 1, 2024)
d1d719a  Use device_mesh arg to set flexible HSDP with a Tuple (Liyang90, Mar 4, 2024)
3315893  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Mar 5, 2024)
4652b74  Merge branch 'Lightning-AI:master' into hybrid_fsdp_stage (Liyang90, Mar 5, 2024)
4049f60  minor fix (Liyang90, Mar 5, 2024)
9c14afe  add simple docs (awaelchli, Mar 8, 2024)
1f2c3ff  correct doc string (Liyang90, Apr 1, 2024)
07f7c1b  set as explicit args in FSDPStrategy (Liyang90, Apr 4, 2024)
2ab0423  Merge branch 'Lightning-AI:master' into hybrid_fsdp_stage (Liyang90, Apr 4, 2024)
899e032  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Apr 4, 2024)
4259df2  update fsdp tests (Liyang90, Apr 18, 2024)
dbe22f3  Type check error (Liyang90, Apr 18, 2024)
2320a4e  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Apr 18, 2024)
b0d4783  merge (Liyang90, Apr 18, 2024)
9d7dfbe  type check (Liyang90, Apr 18, 2024)
483f745  Merge branch 'master' into hybrid_fsdp_stage (Liyang90, May 16, 2024)
ba0b10b  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], May 16, 2024)
d2d9fe8  Merge branch 'Lightning-AI:master' into hybrid_fsdp_stage (Liyang90, Jun 5, 2024)
bd03b05  simplify imports (awaelchli, Jun 5, 2024)
11bc4ee  extend test (awaelchli, Jun 5, 2024)
c6a052c  add changelog (awaelchli, Jun 5, 2024)
00efbcf  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jun 5, 2024)
949d36f  Merge branch 'master' into hybrid_fsdp_stage (awaelchli, Jun 5, 2024)

23 changes: 22 additions & 1 deletion src/lightning/fabric/strategies/fsdp.py
@@ -80,6 +80,11 @@
_POLICY = Union[Set[Type[Module]], Callable[[Module, bool, int], bool], ModuleWrapPolicy]
_SHARDING_STRATEGY = Union[ShardingStrategy, Literal["FULL_SHARD", "SHARD_GRAD_OP", "NO_SHARD", "HYBRID_SHARD"]]

if _TORCH_GREATER_EQUAL_2_2:
from torch.distributed._tensor import DeviceMesh
else:
DeviceMesh = None # type: ignore

_FSDP_ALIASES = ("fsdp", "fsdp_cpu_offload")


@@ -117,10 +122,14 @@ class FSDPStrategy(ParallelStrategy, _Sharded):
- ``"SHARD_GRAD_OP"``: Shards gradients and optimizer states only. Model parameters get replicated.
- ``"NO_SHARD"``: No sharding (identical to regular DDP).
- ``"HYBRID_SHARD"``: Shards model parameters, gradients, and optimizer states within a single machine, but
replicates across machines.
replicates across machines. See also the `device_mesh` parameter below.

Also accepts a :class:`torch.distributed.fsdp.ShardingStrategy` enum value.

device_mesh: A tuple `(replication size, sharding size)` that defines over how many devices to shard and
replicate the model. The product of the two numbers must equal the world size. Only valid in combination
with the `HYBRID_SHARD` sharding strategy.

state_dict_type: The format in which the state of the model and optimizers gets saved into the checkpoint.

- ``"full"``: The full weights and optimizer states get assembled on rank 0 and saved to a single file.
@@ -146,6 +155,7 @@ def __init__(
activation_checkpointing_policy: Optional["_POLICY"] = None,
sharding_strategy: "_SHARDING_STRATEGY" = "FULL_SHARD",
state_dict_type: Literal["full", "sharded"] = "sharded",
device_mesh: Optional[Union[Tuple[int], "DeviceMesh"]] = None,
**kwargs: Any,
) -> None:
super().__init__(
@@ -163,6 +173,11 @@ def __init__(
# Enables joint setup of model and optimizer, multiple optimizer param groups, and `torch.compile()`
self._fsdp_kwargs.setdefault("use_orig_params", True)

if device_mesh is not None:
if not _TORCH_GREATER_EQUAL_2_2:
raise ValueError("The device_mesh argument is only supported in torch >= 2.2.")
self._fsdp_kwargs["device_mesh"] = device_mesh

self._activation_checkpointing_kwargs = _activation_checkpointing_kwargs(
activation_checkpointing, activation_checkpointing_policy
)
@@ -244,6 +259,12 @@ def setup_environment(self) -> None:
super().setup_environment()
self._setup_distributed()

# if `device_mesh` in `_fsdp_kwargs` was provided as a tuple, convert it into a `DeviceMesh` object here
if isinstance(self._fsdp_kwargs.get("device_mesh"), tuple):
from torch.distributed.device_mesh import init_device_mesh

self._fsdp_kwargs["device_mesh"] = init_device_mesh("cuda", self._fsdp_kwargs["device_mesh"])

@override
def setup_module_and_optimizers(
self, module: Module, optimizers: List[Optimizer]
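For context, the `setup_environment` change above only expands a tuple into a real mesh before FSDP ever sees it. A rough sketch of what that expansion amounts to (assuming torch >= 2.2 and a world size of 8; following the `(replication size, sharding size)` convention in the docstring, the first mesh dimension replicates and the second one shards):

# Illustrative sketch of the tuple-to-DeviceMesh expansion done in setup_environment.
from torch.distributed.device_mesh import init_device_mesh

# (2, 4) on 8 ranks: a 2 x 4 mesh; dim 0 = replication groups, dim 1 = sharding groups.
mesh = init_device_mesh("cuda", (2, 4))
# The resulting mesh is what FSDP receives via the `device_mesh` keyword argument.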
45 changes: 42 additions & 3 deletions src/lightning/pytorch/strategies/fsdp.py
@@ -16,7 +16,21 @@
from contextlib import contextmanager, nullcontext
from datetime import timedelta
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Dict, Generator, List, Literal, Mapping, Optional, Set, Type, Union
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Generator,
List,
Literal,
Mapping,
Optional,
Set,
Tuple,
Type,
Union,
)

import torch
from lightning_utilities.core.rank_zero import rank_zero_only as utils_rank_zero_only
@@ -53,7 +67,10 @@
_sync_ddp_if_available,
)
from lightning.fabric.utilities.distributed import group as _group
from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1
from lightning.fabric.utilities.imports import (
_TORCH_GREATER_EQUAL_2_1,
_TORCH_GREATER_EQUAL_2_2,
)
from lightning.fabric.utilities.init import _EmptyInit, _has_meta_device_parameters_or_buffers
from lightning.fabric.utilities.load import _lazy_load, _materialize_tensors
from lightning.fabric.utilities.optimizer import _optimizers_to_device
@@ -76,6 +93,11 @@
_POLICY = Union[Set[Type[Module]], Callable[[Module, bool, int], bool], ModuleWrapPolicy]
_SHARDING_STRATEGY = Union[ShardingStrategy, Literal["FULL_SHARD", "SHARD_GRAD_OP", "NO_SHARD", "HYBRID_SHARD"]]

if _TORCH_GREATER_EQUAL_2_2:
from torch.distributed._tensor import DeviceMesh
else:
DeviceMesh = None # type: ignore


log = logging.getLogger(__name__)

@@ -114,10 +136,14 @@ class FSDPStrategy(ParallelStrategy):
- ``"SHARD_GRAD_OP"``: Shards gradients and optimizer states only. Model parameters get replicated.
- ``"NO_SHARD"``: No sharding (identical to regular DDP).
- ``"HYBRID_SHARD"``: Shards model parameters, gradients, and optimizer states within a single machine, but
replicates across machines.
replicates across machines. See also the `device_mesh` parameter below.

Also accepts a :class:`torch.distributed.fsdp.ShardingStrategy` enum value.

device_mesh: A tuple `(replication size, sharding size)` that defines over how many devices to shard and
replicate the model. The product of the two numbers must equal the world size. Only valid in combination
with the `HYBRID_SHARD` sharding strategy.

state_dict_type: The format in which the state of the model and optimizers gets saved into the checkpoint.

- ``"full"``: The full weights and optimizer states get assembled on rank 0 and saved to a single file.
@@ -147,6 +173,7 @@ def __init__(
activation_checkpointing_policy: Optional["_POLICY"] = None,
sharding_strategy: "_SHARDING_STRATEGY" = "FULL_SHARD",
state_dict_type: Literal["full", "sharded"] = "full",
device_mesh: Optional[Union[Tuple[int], "DeviceMesh"]] = None,
**kwargs: Any,
) -> None:
super().__init__(
@@ -162,6 +189,12 @@ def __init__(
self.cpu_offload = _init_cpu_offload(cpu_offload)
self.mixed_precision = mixed_precision
self.kwargs = _auto_wrap_policy_kwargs(auto_wrap_policy, kwargs)

if device_mesh is not None:
if not _TORCH_GREATER_EQUAL_2_2:
raise ValueError("The device_mesh argument is only supported in torch >= 2.2.")
self.kwargs["device_mesh"] = device_mesh

self.sharding_strategy = _init_sharding_strategy(sharding_strategy, self.kwargs)

# Avoids the need for user to reference params in `configure_optimizers` via
@@ -242,6 +275,12 @@ def setup_environment(self) -> None:
assert self.cluster_environment is not None
_init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout)

# if `device_mesh` in `kwargs` was provided as a tuple, convert it into a `DeviceMesh` object here
if isinstance(self.kwargs.get("device_mesh"), tuple):
from torch.distributed.device_mesh import init_device_mesh

self.kwargs["device_mesh"] = init_device_mesh("cuda", self.kwargs["device_mesh"])

def _get_process_group_backend(self) -> str:
return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device)

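Because the argument is annotated as `Optional[Union[Tuple[int], "DeviceMesh"]]`, a pre-built mesh can also be passed instead of a tuple, in which case the `setup_environment` conversion above is skipped. A hedged sketch (assumes torch >= 2.2 and a launcher such as torchrun, so the process group can be created up front):

# Illustrative sketch: passing an already-constructed DeviceMesh instead of a tuple.
from torch.distributed.device_mesh import init_device_mesh
from lightning.pytorch.strategies import FSDPStrategy

mesh = init_device_mesh("cuda", (2, 4))  # 2 replica groups x 4 shards per group
strategy = FSDPStrategy(sharding_strategy="HYBRID_SHARD", device_mesh=mesh)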
15 changes: 8 additions & 7 deletions tests/tests_fabric/strategies/test_fsdp.py
@@ -85,13 +85,14 @@ def test_hybrid_shard_configuration(sharding_strategy):
assert strategy.sharding_strategy.name == sharding_strategy
assert strategy._fsdp_kwargs["process_group"] is process_group

device_mesh = Mock()
strategy = FSDPStrategy(sharding_strategy=sharding_strategy, device_mesh=device_mesh)
assert strategy.sharding_strategy.name == sharding_strategy
assert strategy._fsdp_kwargs["device_mesh"] is device_mesh

with pytest.raises(ValueError, match="process_group.* device_mesh=.* are mutually exclusive"):
FSDPStrategy(sharding_strategy=sharding_strategy, process_group=process_group, device_mesh=device_mesh)
with mock.patch("lightning.fabric.strategies.fsdp._TORCH_GREATER_EQUAL_2_2", True):
device_mesh = Mock()
strategy = FSDPStrategy(sharding_strategy=sharding_strategy, device_mesh=device_mesh)
assert strategy.sharding_strategy.name == sharding_strategy
assert strategy._fsdp_kwargs["device_mesh"] is device_mesh

with pytest.raises(ValueError, match="process_group.* device_mesh=.* are mutually exclusive"):
FSDPStrategy(sharding_strategy=sharding_strategy, process_group=process_group, device_mesh=device_mesh)


def test_checkpoint_io_unsupported():
15 changes: 8 additions & 7 deletions tests/tests_pytorch/strategies/test_fsdp.py
@@ -514,13 +514,14 @@ def test_hybrid_sharding_strategy(sharding_strategy):
assert strategy.sharding_strategy.name == sharding_strategy
assert strategy.kwargs["process_group"] is process_group

device_mesh = Mock()
strategy = FSDPStrategy(sharding_strategy=sharding_strategy, device_mesh=device_mesh)
assert strategy.sharding_strategy.name == sharding_strategy
assert strategy.kwargs["device_mesh"] is device_mesh

with pytest.raises(ValueError, match="process_group.* device_mesh=.* are mutually exclusive"):
FSDPStrategy(sharding_strategy=sharding_strategy, process_group=process_group, device_mesh=device_mesh)
with mock.patch("lightning.pytorch.strategies.fsdp._TORCH_GREATER_EQUAL_2_2", True):
device_mesh = Mock()
strategy = FSDPStrategy(sharding_strategy=sharding_strategy, device_mesh=device_mesh)
assert strategy.sharding_strategy.name == sharding_strategy
assert strategy.kwargs["device_mesh"] is device_mesh

with pytest.raises(ValueError, match="process_group.* device_mesh=.* are mutually exclusive"):
FSDPStrategy(sharding_strategy=sharding_strategy, process_group=process_group, device_mesh=device_mesh)


def test_use_orig_params():