
Commit d950f45

Revert "[Functional Collectives] Migrate DeviceMesh::all_reduce to use functional all_reduce. (pytorch#95009)"
This reverts commit 0765dbc. Reverted pytorch#95009 on behalf of https://github.com/jeanschmidt because this PR is causing internal breakages. See https://fburl.com/diff/me41urq8
1 parent 1cf11c1 · commit d950f45

6 files changed (+26, -21 lines changed)
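For background, a hedged sketch of the two all_reduce styles involved in this revert: the functional collective that pytorch#95009 routed DeviceMesh.all_reduce through is out-of-place and hands back a new tensor, while the c10d collective restored here mutates its input and returns at most a Work handle. The device, mesh shape, and initialized process group below are illustrative assumptions, not part of the diff.

    import torch
    import torch.distributed as dist
    import torch.distributed._functional_collectives as funcol
    from torch.distributed._tensor import DeviceMesh

    mesh = DeviceMesh("cuda", torch.arange(dist.get_world_size()))
    x = torch.ones(3, 3, device="cuda") * dist.get_rank()

    # functional form (what the reverted PR used): out-of-place, returns a fresh
    # tensor; mirrors the group=(mesh, dim) call shape seen in the diff below
    reduced = funcol.all_reduce(x, reduceOp="SUM", group=(mesh, 0))

    # c10d form (what this commit restores underneath DeviceMesh.all_reduce):
    # reduces x in place and returns None, or a Work handle when async_op=True
    dist.all_reduce(x, op=dist.ReduceOp.SUM)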

test/distributed/_spmd/test_tracing.py

Lines changed: 3 additions & 2 deletions

@@ -47,9 +47,10 @@ def _test_tracing_all_reduce_nd(self, mesh_tensor):
         ]
 
         def fn(tensor: torch.Tensor):
-            tensor = mesh.all_reduce(tensor, mesh_dim=dim)
+            tensor_to_reduce = CommTensor(tensor.clone())
+            mesh.all_reduce(tensor_to_reduce, mesh_dim=dim)
             # multiply with 1 to trigger wait on read during tracing.
-            return tensor * 1
+            return tensor_to_reduce * 1
 
         # use a local_tensor + 1 for tracing to make sure that we are not
         # simply replaying recorded tensor value
test/distributed/_tensor/test_device_mesh.py

Lines changed: 6 additions & 5 deletions

@@ -13,7 +13,6 @@
     is_initialized,
     new_group,
     ProcessGroup,
-    get_process_group_ranks
 )
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed._tensor.common_dtensor import (

@@ -240,8 +239,7 @@ def world_size(self):
     def test_all_reduce_1d(self):
         mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
         local_tensor = torch.ones(3, 3, device=self.device_type) * self.rank
-        # We have to clone the result tensor because assertEqual fails to compare AsyncTensor with plain tensor.
-        local_tensor = mesh.all_reduce(local_tensor, mesh_dim=0).clone()
+        mesh.all_reduce(local_tensor, mesh_dim=0)
         res_num = ((0 + self.world_size - 1) * self.world_size) / 2
         self.assertEqual(local_tensor, torch.ones(3, 3) * res_num)
 
@@ -481,9 +479,12 @@ def test_all_reduce_nd(self):
         # check all dim groups
         dim_to_subgroups = mesh.get_dim_groups()
         for dim, dim_group in enumerate(dim_to_subgroups):
-            global_ranks = get_process_group_ranks(dim_group)
+            dim_group_size = get_world_size(dim_group)
+            global_ranks = [
+                get_global_rank(dim_group, i) for i in range(dim_group_size)
+            ]
             cloned_local_tensor = local_tensor.clone()
-            cloned_local_tensor = mesh.all_reduce(cloned_local_tensor, mesh_dim=dim).clone()
+            mesh.all_reduce(cloned_local_tensor, mesh_dim=dim)
             res_num = sum(global_ranks)
             self.assertEqual(cloned_local_tensor, torch.ones(3, 3) * res_num)
 
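For reference, a hedged sketch of the rank bookkeeping the nd test now does by hand: the global ranks of each mesh-dimension group are recovered with get_world_size plus get_global_rank instead of get_process_group_ranks, and the expected all_reduce result is simply their sum. The 2-D mesh shape and device are illustrative; an initialized process group is assumed.

    import torch
    import torch.distributed as dist
    from torch.distributed.distributed_c10d import get_global_rank, get_world_size
    from torch.distributed._tensor import DeviceMesh

    mesh = DeviceMesh("cuda", torch.arange(dist.get_world_size()).reshape(2, -1))
    for dim, dim_group in enumerate(mesh.get_dim_groups()):
        dim_group_size = get_world_size(dim_group)
        global_ranks = [get_global_rank(dim_group, i) for i in range(dim_group_size)]
        # a tensor seeded with the local rank and all-reduced over this dimension
        # ends up holding sum(global_ranks) on every member of the group
        print(dim, global_ranks, sum(global_ranks))

In the 1-D test above, the same sum collapses to the arithmetic-series form ((0 + world_size - 1) * world_size) / 2.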

torch/distributed/_functional_collectives.py

Lines changed: 1 addition & 1 deletion

@@ -145,7 +145,7 @@ def _all_reduce(self, reduceOp, tag, ranks, group_size):
     group = c10d._find_or_create_pg_by_ranks_and_tag(tag, ranks, group_size)
     assert group is not None
 
-    inplace_tensor = self.clone(memory_format=torch.contiguous_format)
+    inplace_tensor = self.clone()
     work = dist.all_reduce(inplace_tensor, op=op, group=group, async_op=True)
     _register_tensor_work(inplace_tensor, work)
 
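The only change here is dropping memory_format=torch.contiguous_format from the clone. As a small, self-contained illustration (not taken from this file) of what that argument controls: clone() defaults to torch.preserve_format, so a non-contiguous layout survives the copy unless a contiguous clone is requested explicitly.

    import torch

    x = torch.randn(2, 3, 4, 5).to(memory_format=torch.channels_last)
    print(x.is_contiguous())          # False: channels_last layout
    print(x.clone().is_contiguous())  # False: clone preserves the layout
    print(x.clone(memory_format=torch.contiguous_format).is_contiguous())  # True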

torch/distributed/_spmd/distribute.py

Lines changed: 1 addition & 1 deletion

@@ -249,7 +249,7 @@ def _convert_output(
 
     traced_dispatch, result_obj = _build_dummy_add_graph(dt, node_to_obj)
 
-    wait = [n for n in traced_dispatch.graph.nodes if n.name == "wait_comm" or n.name == "wait_tensor"]
+    wait = [n for n in traced_dispatch.graph.nodes if n.name == "wait_comm"]
     add = [n for n in traced_dispatch.graph.nodes if n.name == "add"]
     assert len(wait) == 1 and len(add) == 1
 
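This change just narrows an FX-graph scan back to wait_comm nodes only. As a self-contained illustration of the filtering idiom (a toy function, not the SPMD machinery), filtering torch.fx graph nodes by name looks like this:

    import torch
    import torch.fx

    def add_fn(x):
        return torch.add(x, x)

    gm = torch.fx.symbolic_trace(add_fn)
    # collect the call_function node that torch.fx named "add"
    add_nodes = [n for n in gm.graph.nodes if n.name == "add"]
    assert len(add_nodes) == 1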

torch/distributed/_tensor/device_mesh.py

Lines changed: 9 additions & 10 deletions

@@ -7,6 +7,7 @@
 from torch.distributed.distributed_c10d import (
     _get_default_group,
     all_gather,
+    all_reduce,
     all_to_all,
     broadcast,
     get_global_rank,

@@ -22,9 +23,6 @@
     scatter,
     Work,
 )
-import torch.distributed.distributed_c10d as c10d
-
-import torch.distributed._functional_collectives as funcol
 
 _global_device_mesh: Optional["DeviceMesh"] = None
 
@@ -420,7 +418,8 @@ def all_reduce(
         tensor: torch.Tensor,
         op: ReduceOp = ReduceOp.SUM,  # type: ignore[assignment]
         mesh_dim: int = 0,
-    ) -> torch.Tensor:
+        async_op: bool = False,
+    ) -> Optional[Work]:
         """
         all_reduce the tensor on each rank on a device mesh dimension, and
         return an output tensor on each rank after all_reduce.

@@ -433,10 +432,10 @@
                 to reduce on.
 
         Returns:
-            A :class:`torch.Tensor` object
+            A :class:`Work` object
         """
-        op_name: str = op.name  # type: ignore[attr-defined]
-        return funcol.all_reduce(tensor, reduceOp=op_name, group=(self, mesh_dim,))
+        dim_group = self._dim_groups[mesh_dim]
+        return all_reduce(tensor, op=op, group=dim_group, async_op=async_op)
 
     def reduce_scatter(
         self,

@@ -494,9 +493,9 @@ def reduce_scatter(
         flat_tensor = torch.cat(flattened_list).clone(
             memory_format=torch.contiguous_format
         )
-        dim_group = self._dim_groups[mesh_dim]
-        fut = c10d.all_reduce(flat_tensor, op=op, group=dim_group, async_op=async_op)
-
+        fut = self.all_reduce(
+            flat_tensor, op=op, mesh_dim=mesh_dim, async_op=async_op
+        )
         # scatter the tensor
         output_offset = offset_list[my_coordinate]
         output.copy_(
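A hedged usage sketch of the signature this revert restores: DeviceMesh.all_reduce once again mutates the given tensor along one mesh dimension and returns None, or a Work handle when async_op=True. The device, mesh shape, and initialized process group are illustrative assumptions.

    import torch
    import torch.distributed as dist
    from torch.distributed._tensor import DeviceMesh

    mesh = DeviceMesh("cuda", torch.arange(dist.get_world_size()))
    t = torch.ones(3, 3, device="cuda") * dist.get_rank()

    # synchronous: reduces t in place across mesh dim 0, returns None
    mesh.all_reduce(t, mesh_dim=0)

    # asynchronous: returns a Work handle that must be waited on before reading t
    work = mesh.all_reduce(t, mesh_dim=0, async_op=True)
    if work is not None:
        work.wait()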

torch/distributed/_tensor/placement_types.py

Lines changed: 6 additions & 2 deletions

@@ -250,9 +250,13 @@ def __init__(self, reduce_op: c10d.ReduceOp = c10d.ReduceOp.SUM):  # type: ignor
     def _to_replicate(
         self, tensor: torch.Tensor, mesh: DeviceMesh, mesh_dim: int
     ) -> torch.Tensor:
-        return mesh.all_reduce(
-            tensor, self.reduce_op, mesh_dim=mesh_dim  # type: ignore[call-arg]
+        # out-of-place all_reduce to replicate, since the current partial DTensor
+        # might get used by other ops as well, so we can't inplace modify it
+        cloned_local = CommTensor(tensor.clone(memory_format=torch.contiguous_format))
+        mesh.all_reduce(
+            cloned_local, self.reduce_op, mesh_dim=mesh_dim  # type: ignore[call-arg]
         )
+        return cloned_local
 
     def _to_shard(
         self,
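The restored _to_replicate body spells out the contract its comment describes: the partial local tensor may still feed other ops, so the reduction runs in place on a contiguous clone (wrapped in CommTensor so traced runs record the collective) and the clone is what gets returned. A minimal sketch of the pattern; the helper name partial_to_replicate and the CommTensor import path are assumptions:

    import torch
    from torch.distributed._spmd.comm_tensor import CommTensor  # assumed path
    from torch.distributed._tensor import DeviceMesh

    def partial_to_replicate(tensor: torch.Tensor, mesh: DeviceMesh, mesh_dim: int) -> torch.Tensor:
        # reduce a contiguous clone in place; the original partial tensor stays intact
        cloned_local = CommTensor(tensor.clone(memory_format=torch.contiguous_format))
        mesh.all_reduce(cloned_local, mesh_dim=mesh_dim)
        return cloned_local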
