Commit 26ac567

Revert "Fix CompiledDDP failure when the gradient is not contiguous (pytorch#138174)"
This reverts commit 0ecafda. Reverted pytorch#138174 on behalf of https://github.com/huydhn: "Sorry for reverting your PR, but I think it fails test_compute_comm_reordering in trunk for the ROCm and multi-GPU setups" (see the comment on pytorch#138174).
1 parent 98856f7 commit 26ac567

2 files changed: +8, -16 lines

test/distributed/test_c10d_functional_native.py

Lines changed: 8 additions & 15 deletions
@@ -597,11 +597,14 @@ def func(arg: torch.Tensor) -> torch.Tensor:
         (
             FileCheck()
             .check("buf0 = empty")
-            # We always call .contiguous() on the input to all_reduce_,
-            # so input will not be a view anymore.
-            .check("torch.ops._c10d_functional.all_reduce_.default(buf0")
-            .check("torch.ops._c10d_functional.wait_tensor.default(buf0")
-            .check("return (buf0")
+            # Ensure the all_reduce_ input is a view
+            .check(
+                "torch.ops._c10d_functional.all_reduce_.default(reinterpret_tensor(buf0"
+            )
+            .check(
+                "torch.ops._c10d_functional.wait_tensor.default(reinterpret_tensor(buf0"
+            )
+            .check("return (reinterpret_tensor(buf0")
             .run(code)
         )

@@ -621,16 +624,6 @@ def func(arg: torch.Tensor) -> torch.Tensor:
         # clone induced by non contig input
         assert "torch.ops._c10d_functional.wait_tensor.default" in code
 
-        def func2(arg: torch.Tensor) -> torch.Tensor:
-            torch.ops._c10d_functional.all_reduce_(arg, "avg", "0")
-            return arg
-
-        compiled = torch.compile(func)
-
-        code = run_and_get_triton_code(compiled, arg)
-        # clone induced by non contig input
-        assert "torch.ops._c10d_functional.wait_tensor.default" in code
-
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     @fresh_inductor_cache()
     def test_inductor_reuse_buffer_after_inplace_collective(self):
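
For context, the restored test above expects Inductor to leave the collective's input as a view (reinterpret_tensor) rather than forcing a copy. Below is a minimal single-process sketch of the op sequence the test compiles. This is my reconstruction, not part of the commit: it assumes a "gloo" default group (which PyTorch registers under the group name "0", the string the test passes) and the CPU Inductor backend, whereas the test itself requires a GPU.

    import os
    import torch
    import torch.distributed as dist

    # Single-process "gloo" group; the default group is registered under the
    # group name "0", the group_name string the test passes to all_reduce_.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29501")
    dist.init_process_group("gloo", rank=0, world_size=1)

    def func(arg: torch.Tensor) -> torch.Tensor:
        # In-place functional collective followed by a wait, as in the test.
        torch.ops._c10d_functional.all_reduce_(arg, "avg", "0")
        return torch.ops._c10d_functional.wait_tensor(arg)

    print(func(torch.ones(4, 4)))  # eager: with world_size=1, "avg" is a no-op

    # torch.compile routes the same op through the _all_reduce_ lowering
    # shown in torch/_inductor/lowering.py below.
    compiled = torch.compile(func)
    print(compiled(torch.ones(4, 4)))

    dist.destroy_process_group()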

torch/_inductor/lowering.py

Lines changed: 0 additions & 1 deletion
@@ -6470,7 +6470,6 @@ def _all_reduce(inp, reduce_op, group_name):
 
 @register_lowering(_c10d_functional.all_reduce_)
 def _all_reduce_(inp, reduce_op, group_name):
-    inp = ir.ExternKernel.require_contiguous(inp)
     ir._CollectiveKernel.create_inplace(
         _c10d_functional.all_reduce_.default, inp, reduce_op, group_name
     )
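
The removed line is the substance of the revert: ir.ExternKernel.require_contiguous had forced the input of the in-place all_reduce_ lowering to be contiguous, which is why the old test expected plain buf0 rather than a reinterpret_tensor view in the generated code. A small eager-mode sketch of the contiguity property in question (illustrative only, not part of the commit; the tensor names are mine):

    import torch

    x = torch.arange(16, dtype=torch.float32).reshape(4, 4)
    grad_view = x.t()  # transposed view: same storage, swapped strides
    print(grad_view.is_contiguous())               # False: the non-contiguous-gradient case
    print(grad_view.contiguous().is_contiguous())  # True: what require_contiguous forced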
