
Commit 9200e74

ManagedDeviceMesh: support TP (#155)
Parent: 2c3383f

3 files changed: +115 -11 lines

torchft/device_mesh_test.py

Lines changed: 30 additions & 0 deletions
@@ -75,10 +75,40 @@ def _test_init_device_mesh(world_size: int, rank: int) -> None:
         torch.load(buffer, weights_only=False)
 
     def test_init_device_mesh(self) -> None:
+        if dist.is_initialized():
+            dist.destroy_process_group()
+
         with ProcessPoolExecutor(max_workers=4) as executor:
             futures = []
             for i in range(4):
                 future = executor.submit(self._test_init_device_mesh, 4, i)
                 futures.append(future)
             for f in futures:
                 f.result()
+
+    def test_repr_hash(self) -> None:
+        if dist.is_initialized():
+            dist.destroy_process_group()
+
+        os.environ["MASTER_ADDR"] = "127.0.0.1"
+        os.environ["MASTER_PORT"] = str(12346)
+        os.environ["RANK"] = str(0)
+        os.environ["WORLD_SIZE"] = str(1)
+
+        manager = Mock(spec=Manager)
+        manager._pg = ProcessGroupGloo()
+
+        device_mesh = ft_init_device_mesh(
+            device_type="cpu",
+            mesh_shape=(1, 1),
+            mesh_dim_names=("dp_replicate", "dp_shard"),
+            replicate_dim=0,
+            manager=manager,
+        )
+
+        self.assertIsInstance(repr(device_mesh), str)
+        self.assertIsInstance(str(device_mesh), str)
+        self.assertEqual(hash(device_mesh), hash(device_mesh))
+        self.assertIsInstance(hash(device_mesh), int)
+
+        dist.destroy_process_group()
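
The new test_repr_hash covers the __repr__ and __hash__ methods added to ManagedDeviceMesh in this commit. As a hedged illustration of why hashing matters (hypothetical usage, reusing a device_mesh built exactly as in the test above), the mesh and its slices can now act as dictionary keys, which DeviceMesh bookkeeping relies on:

# Hypothetical usage sketch, assuming `device_mesh` was created with
# ft_init_device_mesh exactly as in test_repr_hash above.
cache = {device_mesh: device_mesh["dp_replicate"]}  # needs __hash__
print(repr(device_mesh))                            # "ManagedDeviceMesh(mesh=...)"
assert hash(device_mesh) == hash(device_mesh)       # stable: the hash is cached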

torchft/fsdp_test.py

Lines changed: 52 additions & 6 deletions
@@ -30,17 +30,34 @@
 )
 from torch.distributed._composable.fsdp import fully_shard
 from torch.distributed.device_mesh import init_device_mesh
+from torch.distributed.tensor.parallel import (
+    ColwiseParallel,
+    PrepareModuleInput,
+    RowwiseParallel,
+    SequenceParallel,
+    parallelize_module,
+)
 
 from torchft.manager import Manager
-from torchft.process_group import ManagedProcessGroup, ft_init_device_mesh
+from torchft.process_group import (
+    ManagedProcessGroup,
+    ProcessGroupGloo,
+    ft_init_device_mesh,
+)
 
 
 class FSDPTest(unittest.TestCase):
     @staticmethod
-    def _test_fsdp(world_size: int, rank: int) -> None:
+    def _test_fsdp(
+        world_size: int,
+        rank: int,
+        dp_replicate: int = 2,
+        dp_shard: int = 2,
+        tp: int = 1,
+    ) -> None:
         torch.cuda.set_device(rank)
 
-        group_size = world_size // 2
+        group_size = world_size // dp_replicate
         group = rank // group_size
         group_rank = rank % group_size
 
@@ -50,17 +67,28 @@ def _test_fsdp(world_size: int, rank: int) -> None:
         os.environ["WORLD_SIZE"] = str(group_size)
 
         manager = Mock(spec=Manager)
+        manager._pg = ProcessGroupGloo()
         device_mesh = ft_init_device_mesh(
             device_type="cuda",
-            mesh_shape=(2, 2),
-            mesh_dim_names=("dp_replicate", "dp_shard"),
+            mesh_shape=(dp_replicate, dp_shard, tp),
+            mesh_dim_names=("dp_replicate", "dp_shard", "tp"),
             replicate_dim=0,
             manager=manager,
         )
         manager.num_participants.return_value = 1
         model = nn.Linear(128, 128).cuda()
         batch = torch.randn(4, 128).cuda()
-        shard_model = fully_shard(model, mesh=device_mesh)
+
+        fsdp_mesh = device_mesh["dp_replicate", "dp_shard"]
+
+        if tp > 1:
+            tp_mesh = device_mesh["tp"]
+            model = parallelize_module(
+                model,
+                tp_mesh,
+                ColwiseParallel(),
+            )
+        shard_model = fully_shard(model, mesh=fsdp_mesh)
         shard_model(batch).mean().backward()
 
         # pyre-ignore[56]: Pyre was not able to infer the type of argument
@@ -72,3 +100,21 @@ def test_fsdp(self) -> None:
             for i in range(4):
                 future = executor.submit(self._test_fsdp, 4, i)
                 futures.append(future)
+
+            for fut in futures:
+                fut.result()
+
+    # pyre-ignore[56]: Pyre was not able to infer the type of argument
+    @unittest.skipIf(torch.cuda.device_count() < 4, "Not enough GPUs")
+    def test_fsdp_tp(self) -> None:
+        context = multiprocessing.get_context("spawn")
+        with ProcessPoolExecutor(max_workers=4, mp_context=context) as executor:
+            futures = []
+            for i in range(4):
+                future = executor.submit(
+                    self._test_fsdp, 4, i, dp_replicate=1, dp_shard=2, tp=2
+                )
+                futures.append(future)
+
+            for fut in futures:
+                fut.result()
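
The new test_fsdp_tp path exercises the standard TP-then-FSDP composition on a 3D mesh. Below is a minimal standalone sketch of that pattern under stated assumptions: a 4-GPU host launched with torchrun, the stock init_device_mesh used in place of torchft's managed mesh (ft_init_device_mesh would be substituted for fault tolerance), and shard_model_3d as an illustrative name that is not part of the test.

# Minimal sketch of the TP + FSDP composition exercised by test_fsdp_tp.
# Assumes `torchrun --nproc_per_node=4` on a 4-GPU host and uses the plain
# init_device_mesh rather than torchft's managed mesh.
import os

import torch
import torch.nn as nn
from torch.distributed._composable.fsdp import fully_shard
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor.parallel import ColwiseParallel, parallelize_module


def shard_model_3d() -> nn.Module:
    # Each torchrun worker drives one GPU.
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))

    # 3D layout: replica groups x FSDP shards x tensor-parallel ranks.
    mesh = init_device_mesh(
        "cuda",
        (1, 2, 2),
        mesh_dim_names=("dp_replicate", "dp_shard", "tp"),
    )

    model = nn.Linear(128, 128).cuda()

    # 1) Apply tensor parallelism over the "tp" sub-mesh.
    model = parallelize_module(model, mesh["tp"], ColwiseParallel())

    # 2) Then shard with FSDP over the data-parallel dims only.
    return fully_shard(model, mesh=mesh["dp_replicate", "dp_shard"])

The ordering mirrors _test_fsdp above: parallelize_module runs before fully_shard, so FSDP shards the already TP-partitioned parameters.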

torchft/process_group.py

Lines changed: 33 additions & 5 deletions
@@ -64,6 +64,7 @@
     ReduceScatterOptions,
     Work,
 )
+from torch.distributed.tensor.device_mesh import _mesh_resources
 from torch.futures import Future
 from torch.utils._pytree import tree_any
 
@@ -1790,6 +1791,7 @@ def __init__(
         self.device_type = parent.device_type
         self._flatten_mesh_list: Tuple[DeviceMesh, ...] = tuple()
         self._thread_id: Optional[int] = None
+        self._hash: Optional[int] = None
 
     def __getstate__(self) -> Dict[str, Any]:
         state = self.__dict__.copy()
@@ -1804,36 +1806,43 @@ def __setstate__(self, state: Dict[str, Any]) -> None:
     def __getitem__(self, mesh_dim_names: Union[str, Tuple[str, ...]]) -> DeviceMesh:
         if isinstance(mesh_dim_names, str):
             if mesh_dim_names == self.replicate_dim_name:
-                return ManagedDeviceMesh(
+                res_submesh = ManagedDeviceMesh(
                     mesh=None,
                     mesh_dim_names=(mesh_dim_names,),
                     replicate_pg=self.replicate_pg,
                     replicate_dim=0,
                     parent=self,
                 )
             elif mesh_dim_names in self.flatten_meshes:
-                return self.flatten_meshes[mesh_dim_names]
+                res_submesh = self.flatten_meshes[mesh_dim_names]
             else:
                 assert self.mesh is not None
-                return self.mesh[mesh_dim_names]
+                res_submesh = self.mesh[mesh_dim_names]
         else:
             assert isinstance(mesh_dim_names, tuple)
             if self.replicate_dim_name not in mesh_dim_names:
                 assert self.mesh is not None
-                return self.mesh[mesh_dim_names]
+                res_submesh = self.mesh[mesh_dim_names]
             else:
                 mesh_dim_names_wo_replicate = tuple(
                     n for n in mesh_dim_names if n != self.replicate_dim_name
                 )
                 assert self.mesh is not None
-                return ManagedDeviceMesh(
+                res_submesh = ManagedDeviceMesh(
                     self.mesh[mesh_dim_names_wo_replicate],
                     mesh_dim_names,
                     self.replicate_pg,
                     mesh_dim_names.index(self.replicate_dim_name),
                     parent=self,
                 )
 
+        # TODO: find a better way to do this that doesn't depend on device mesh
+        # internals
+        root = _mesh_resources.get_root_mesh(self)
+        _mesh_resources.child_to_root_mapping[res_submesh] = root
+
+        return res_submesh
+
     def _real_mesh_dim(self, mesh_dim: int) -> int:
         return mesh_dim - 1 if mesh_dim > self.replicate_dim else mesh_dim
 
@@ -1937,6 +1946,25 @@ def get_coordinate(self) -> Optional[List[int]]:
     def get_all_groups(self) -> List[BaseProcessGroup]:
         raise NotImplementedError
 
+    def __repr__(self) -> str:
+        return f"ManagedDeviceMesh(mesh={self.mesh})"
+
+    def __hash__(self) -> int:
+        # lazily compute hash
+        if not self._hash:
+            self._hash = hash(
+                (
+                    self.mesh,
+                    self.mesh_dim_names,
+                    self.replicate_pg,
+                    self.replicate_dim,
+                    self.replicate_dim_name,
+                    self.parent,
+                    self.device_type,
+                )
+            )
+        return self._hash
+
 
 class _FlattenDeviceMesh(DeviceMesh):
     def __init__(self, managed_mesh: ManagedDeviceMesh) -> None:
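
Two patterns in the process_group.py changes are worth calling out: every sub-mesh returned by __getitem__ is now registered in DeviceMesh's private child-to-root mapping (flagged with a TODO because it leans on _mesh_resources internals), and __hash__ caches the tuple hash after the first call. Below is a generic sketch of that lazy-hash idiom; the class and field names are illustrative only, not torchft API.

# Generic lazy-hash caching sketch; LazilyHashed is an illustrative name,
# not part of torchft or PyTorch.
from typing import Optional, Tuple


class LazilyHashed:
    def __init__(self, fields: Tuple[object, ...]) -> None:
        self._fields = fields
        self._hash: Optional[int] = None  # filled in on first hash() call

    def __hash__(self) -> int:
        # Compare against None so a value that legitimately hashes to 0
        # is still cached rather than recomputed on every call.
        if self._hash is None:
            self._hash = hash(self._fields)
        return self._hash

Caching matters because a ManagedDeviceMesh gets hashed repeatedly once it appears as a key in DeviceMesh's internal mappings.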
