@@ -613,8 +613,7 @@ def input_creation_fn():
     @skipIfRocm
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     def test_nested_fully_shard_backend_aot_eager(self):
-        # TODO: fix fwd_fullgraph=False case
-        for fwd_fullgraph in [True]:
+        for fwd_fullgraph in [True, False]:
             self._test_traceable_fsdp(
                 *self._create_nested_fully_shard_factory_fns(
                     fwd_fullgraph=fwd_fullgraph
@@ -626,8 +625,7 @@ def test_nested_fully_shard_backend_aot_eager(self):
     @skipIfRocm
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     def test_nested_fully_shard_backend_aot_eager_decomp_partition(self):
-        # TODO: fix fwd_fullgraph=False case
-        for fwd_fullgraph in [True]:
+        for fwd_fullgraph in [True, False]:
             self._test_traceable_fsdp(
                 *self._create_nested_fully_shard_factory_fns(
                     fwd_fullgraph=fwd_fullgraph
@@ -732,7 +730,6 @@ def test_nested_fully_shard_backend_inductor_fullgraph_True(self):
         )
         file_check.run(bwd_code)

-    @unittest.skip("TODO: fix fwd_fullgraph=False case")
     @skipIfRocm
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     def test_nested_fully_shard_backend_inductor_fullgraph_False(self):
@@ -813,9 +810,8 @@ def _sdpa_with_graph_break(*args, **kwargs):
     @skipIfRocm
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     def test_transformer_backend_aot_eager(self):
-        # TODO: fix fwd_fullgraph=False case
         for fwd_fullgraph, all_requires_grad in itertools.product(
-            [True], [True, False]
+            [True, False], [True, False]
         ):
             with self._maybe_add_graph_break_to_sdpa(
                 fwd_fullgraph
@@ -833,9 +829,8 @@ def test_transformer_backend_aot_eager(self):
     # TODO: native_dropout has worse accuracy after decomp, need to figure out why
     @torch._inductor.config.patch(fallback_random=True)
     def test_transformer_backend_aot_eager_decomp_partition(self):
-        # TODO: fix fwd_fullgraph=False case
         for fwd_fullgraph, all_requires_grad in itertools.product(
-            [True], [True, False]
+            [True, False], [True, False]
         ):
             with self._maybe_add_graph_break_to_sdpa(fwd_fullgraph):
                 self._test_traceable_fsdp(
@@ -951,7 +946,6 @@ def test_transformer_backend_inductor_fullgraph_True(self):
         )
         file_check.run(bwd_code)

-    @unittest.skip("TODO: fix fwd_fullgraph=False case")
     @skipIfRocm
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     # TODO: native_dropout causes CUDA IMA error, need to figure out why