
Commit bbf514f

feat: Add feature to toggle partitioner
1 parent 3e948a6 commit bbf514f

8 files changed, +182 −38 lines changed

py/torch_tensorrt/dynamo/_defaults.py

+1
@@ -10,3 +10,4 @@
 OPTIMIZATION_LEVEL = None
 USE_PYTHON_RUNTIME = None
 TRUNCATE_LONG_AND_DOUBLE = False
+USE_FAST_PARTITIONER = True

py/torch_tensorrt/dynamo/_settings.py

+2
@@ -12,6 +12,7 @@
     OPTIMIZATION_LEVEL,
     USE_PYTHON_RUNTIME,
     TRUNCATE_LONG_AND_DOUBLE,
+    USE_FAST_PARTITIONER,
 )
 
 
@@ -28,3 +29,4 @@ class CompilationSettings:
     optimization_level: Optional[int] = OPTIMIZATION_LEVEL
     use_python_runtime: Optional[bool] = USE_PYTHON_RUNTIME
     truncate_long_and_double: bool = TRUNCATE_LONG_AND_DOUBLE
+    use_fast_partitioner: bool = USE_FAST_PARTITIONER
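
For reference, a minimal sketch of how the new flag slots into CompilationSettings, with the remaining fields of the real dataclass elided and defaults mirroring _defaults.py:

from dataclasses import dataclass
from typing import Optional

# Mirrors the new constant added to _defaults.py above.
USE_FAST_PARTITIONER = True


@dataclass
class CompilationSettings:
    # Only the fields touched by this commit's hunks; the real class has more.
    optimization_level: Optional[int] = None
    use_python_runtime: Optional[bool] = None
    truncate_long_and_double: bool = False
    use_fast_partitioner: bool = USE_FAST_PARTITIONER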

py/torch_tensorrt/dynamo/backend/backends.py

+17 −12
@@ -11,10 +11,7 @@
 from torch_tensorrt.dynamo.lowering._pre_aot_lowering import (
     pre_aot_substitutions,
 )
-from torch_tensorrt.dynamo.partitioning import (
-    partition,
-    get_submod_inputs,
-)
+from torch_tensorrt.dynamo import partitioning
 from torch_tensorrt.dynamo.utils import parse_dynamo_kwargs
 from torch_tensorrt.dynamo.conversion import (
     convert_module,
@@ -118,12 +115,20 @@ def _compile_module(
         Compiled FX GraphModule
     """
     # Partition module into components that can be TRT-accelerated
-    partitioned_module = partition(
-        gm,
-        verbose=settings.debug,
-        min_block_size=settings.min_block_size,
-        torch_executed_ops=settings.torch_executed_ops,
-    )
+    if settings.use_fast_partitioner:
+        partitioned_module = partitioning.fast_partition(
+            gm,
+            verbose=settings.debug,
+            min_block_size=settings.min_block_size,
+            torch_executed_ops=settings.torch_executed_ops,
+        )
+    else:
+        partitioned_module = partitioning.global_partition(
+            gm,
+            verbose=settings.debug,
+            min_block_size=settings.min_block_size,
+            torch_executed_ops=settings.torch_executed_ops,
+        )
 
     # Store TRT replicas of Torch subgraphs
     trt_modules = {}
@@ -133,13 +138,13 @@ def _compile_module(
     for name, _ in partitioned_module.named_children():
 
         # Criteria for a module to be convertible to TRT
-        if "_run_on_acc" not in name:
+        if settings.use_fast_partitioner and "_run_on_acc" not in name:
             continue
 
         submodule = getattr(partitioned_module, name)
 
         # Get submodule inputs
-        submodule_inputs = get_submod_inputs(
+        submodule_inputs = partitioning.get_submod_inputs(
             partitioned_module, submodule, sample_inputs
         )
 
py/torch_tensorrt/dynamo/compile.py

+6 −8
@@ -31,6 +31,7 @@
     OPTIMIZATION_LEVEL,
     USE_PYTHON_RUNTIME,
     TRUNCATE_LONG_AND_DOUBLE,
+    USE_FAST_PARTITIONER,
 )
 
 
@@ -64,6 +65,7 @@ def compile(
     version_compatible=VERSION_COMPATIBLE,
     optimization_level=OPTIMIZATION_LEVEL,
     use_python_runtime=USE_PYTHON_RUNTIME,
+    use_fast_partitioner=USE_FAST_PARTITIONER,
     **kwargs,
 ):
     if debug:
@@ -73,7 +75,7 @@ def compile(
         "The Dynamo backend is an experimental feature, for which only the "
         + "following arguments are supported: "
         + "{enabled_precisions, debug, workspace_size, min_block_size, "
-        + "torch_executed_ops, pass_through_build_failures}"
+        + "torch_executed_ops, pass_through_build_failures, use_fast_partitioner}"
     )
 
     if not isinstance(inputs, collections.abc.Sequence):
@@ -111,17 +113,13 @@ def compile(
         "optimization_level": optimization_level,
         "use_python_runtime": use_python_runtime,
         "truncate_long_and_double": truncate_long_and_double,
+        "use_fast_partitioner": use_fast_partitioner,
     }
 
     settings = CompilationSettings(**compilation_options)
-    if kwargs.get("use_capability_partitioner", None):
-        model = lower_model(gm, torch_inputs)
-        return _compile_module(model, torch_inputs, settings)
-    else:
-        split_result = lower_model_using_trt_splitter(gm, torch_inputs)
-        trt_module = _compile_graph(split_result, torch_inputs, settings)
 
-        return trt_module
+    model = lower_model(gm, torch_inputs)
+    return _compile_module(model, torch_inputs, settings)
 
 
 def _compile_graph(
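
A usage sketch of the toggle through this compile entry point; ToyModel, the inputs, and passing an nn.Module straight in are placeholders for whatever the dynamo front-end normally hands to compile, and precision/device options are left at their defaults:

import torch
import torch_tensorrt


class ToyModel(torch.nn.Module):
    def forward(self, x, y):
        return torch.relu(x + y)


model = ToyModel().eval().cuda()
inputs = [torch.randn(4, 8).cuda(), torch.randn(4, 8).cuda()]

# Default behavior: the fast adjacency-based partitioner (USE_FAST_PARTITIONER = True).
trt_mod_fast = torch_tensorrt.dynamo.compile(model, inputs, min_block_size=1)

# Opt back into the global, capability-based partitioner.
trt_mod_global = torch_tensorrt.dynamo.compile(
    model, inputs, min_block_size=1, use_fast_partitioner=False
)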
py/torch_tensorrt/dynamo/partitioning/__init__.py

+2 −3
@@ -1,4 +1,3 @@
 from .common import get_submod_inputs
-from ._adjacency_partitioner import (
-    partition,
-)
+from ._adjacency_partitioner import partition as fast_partition
+from ._global_partitioner import partition as global_partition
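
The renamed exports can also be exercised directly, mirroring the pattern used in the test suite below; TwoOps is an illustrative stand-in module:

import torch
from copy import deepcopy
from torch_tensorrt.dynamo import partitioning


class TwoOps(torch.nn.Module):
    def forward(self, x, y):
        return torch.ops.aten.relu.default(torch.ops.aten.add.Tensor(x, y))


gm = torch.fx.symbolic_trace(TwoOps())

# New default: adjacency-based ("fast") partitioner.
fast_mod = partitioning.fast_partition(deepcopy(gm), min_block_size=2)

# Previous behavior: capability-based ("global") partitioner.
global_mod = partitioning.global_partition(deepcopy(gm), min_block_size=2)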

tests/py/dynamo/backend/test_backend_compiler.py

+3 −3
@@ -1,6 +1,6 @@
 import torch
 import torch_tensorrt
-from torch_tensorrt.dynamo.partitioning import partition
+from torch_tensorrt.dynamo.partitioning import fast_partition
 from torch.testing._internal.common_utils import run_tests, TestCase
 from copy import deepcopy
 from utils import lower_graph_testing, DECIMALS_OF_AGREEMENT
@@ -17,7 +17,7 @@ def forward(self, x, y):
                 return torch.mean(out, dim=1)
 
         fx_graph = torch.fx.symbolic_trace(FullySupportedMultiOp())
-        partitioned_graph = partition(deepcopy(fx_graph), min_block_size=3)
+        partitioned_graph = fast_partition(deepcopy(fx_graph), min_block_size=3)
 
         self.assertEquals(
             len(
@@ -192,7 +192,7 @@ def forward(self, x, y):
         )
 
         fx_graph = torch.fx.symbolic_trace(FullySupportedMultiOp())
-        partitioned_graph = partition(deepcopy(fx_graph), min_block_size=3)
+        partitioned_graph = fast_partition(deepcopy(fx_graph), min_block_size=3)
 
         self.assertEquals(
             len(list(partitioned_graph.named_children())),

tests/py/dynamo/backend/test_partitioning.py

+129 −5
@@ -1,12 +1,12 @@
-from torch_tensorrt.dynamo.partitioning import partition
+from torch_tensorrt.dynamo import partitioning
 from torch.testing._internal.common_utils import run_tests, TestCase
 from utils import lower_graph_testing
 import torch
 from copy import deepcopy
 import numpy as np
 
 
-class TestPartitioning(TestCase):
+class TestFastPartitioning(TestCase):
     def test_partition_fully_supported_one_op(self):
         class FullySupportedOneOp(torch.nn.Module):
             def __init__(self, *args, **kwargs) -> None:
@@ -16,7 +16,7 @@ def forward(self, x, y):
                 return torch.ops.aten.add.Tensor(x, y)
 
         fx_graph = torch.fx.symbolic_trace(FullySupportedOneOp())
-        partitioned_graph = partition(deepcopy(fx_graph))
+        partitioned_graph = partitioning.fast_partition(deepcopy(fx_graph))
         self.assertEquals(
             len(
                 [
@@ -42,7 +42,9 @@ def forward(self, x, y):
             return pow_
 
         fx_graph = torch.fx.symbolic_trace(FullySupportedMultiOp())
-        partitioned_graph = partition(deepcopy(fx_graph), min_block_size=2)
+        partitioned_graph = partitioning.fast_partition(
+            deepcopy(fx_graph), min_block_size=2
+        )
         self.assertEquals(
             len(
                 [
@@ -69,7 +71,9 @@ def forward(self, x, y):
             return pow_
 
         fx_graph = torch.fx.symbolic_trace(PartiallySupportedMultiOp())
-        partitioned_graph = partition(deepcopy(fx_graph), min_block_size=2)
+        partitioned_graph = partitioning.fast_partition(
+            deepcopy(fx_graph), min_block_size=2
+        )
         self.assertEquals(
             len(
                 [
@@ -118,6 +122,7 @@ def forward(self, x, y):
             min_block_size=2,
             torch_executed_ops={"torch.ops.aten.add.Tensor"},
             testing_partitioning=True,
+            use_fast_partitioner=True,
         )
 
         self.assertEquals(
@@ -144,5 +149,124 @@ def forward(self, x, y):
         )
 
 
+class TestGlobalPartitioning(TestCase):
+    def test_partition_fully_supported_one_op(self):
+        class FullySupportedOneOp(torch.nn.Module):
+            def __init__(self, *args, **kwargs) -> None:
+                super().__init__(*args, **kwargs)
+
+            def forward(self, x, y):
+                return torch.ops.aten.add.Tensor(x, y)
+
+        fx_graph = torch.fx.symbolic_trace(FullySupportedOneOp())
+        partitioned_graph = partitioning.global_partition(deepcopy(fx_graph))
+        self.assertEquals(
+            len(list(partitioned_graph.named_children())),
+            0,
+            "Single operators should not be segmented",
+        )
+
+    def test_partition_fully_supported_multi_op(self):
+        class FullySupportedMultiOp(torch.nn.Module):
+            def __init__(self, *args, **kwargs) -> None:
+                super().__init__(*args, **kwargs)
+
+            def forward(self, x, y):
+                sum_ = torch.ops.aten.sub.Tensor(x, y)
+                concat_ = torch.ops.aten.cat.default(x, sum_)
+                relu_ = torch.ops.aten.relu.default(concat_)
+                pow_ = torch.ops.aten.pow.Tensor_Scalar(relu_, 2)
+                return pow_
+
+        fx_graph = torch.fx.symbolic_trace(FullySupportedMultiOp())
+        partitioned_graph = partitioning.global_partition(
+            deepcopy(fx_graph), min_block_size=2
+        )
+        self.assertEquals(
+            len(list(partitioned_graph.named_children())),
+            1,
+            "All operators are supported, there should be one segment",
+        )
+
+    def test_partition_partially_supported_multi_op(self):
+        class PartiallySupportedMultiOp(torch.nn.Module):
+            def __init__(self, *args, **kwargs) -> None:
+                super().__init__(*args, **kwargs)
+
+            def forward(self, x, y):
+                sum_1 = torch.ops.aten.add.Tensor(x, y)
+                sum_2 = torch.ops.aten.add.Tensor(x, sum_1)
+                sum_ = np.sum(sum_1) + np.sum(sum_2)
+                relu_ = torch.ops.aten.relu.default(sum_)
+                pow_ = torch.ops.aten.pow.Tensor_Scalar(relu_, 2)
+                return pow_
+
+        fx_graph = torch.fx.symbolic_trace(PartiallySupportedMultiOp())
+        partitioned_graph = partitioning.global_partition(
+            deepcopy(fx_graph), min_block_size=2
+        )
+        self.assertEquals(
+            len(list(partitioned_graph.named_children())),
+            2,
+            "Unsupported operators interleave supported ones, expected 2 segments",
+        )
+
+    def test_partition_partially_supported_with_torch_executed_ops(self):
+        class PartiallySupportedMultiOp(torch.nn.Module):
+            def __init__(self, *args, **kwargs) -> None:
+                super().__init__(*args, **kwargs)
+
+            def forward(self, x, y):
+                sum_1 = torch.ops.aten.add.Tensor(x, y)
+                sum_2 = torch.ops.aten.add.Tensor(x, sum_1)
+                sum_ = torch.ops.aten.add.Tensor(sum_1, sum_2)
+                relu_ = torch.ops.aten.relu.default(sum_)
+                pow_ = torch.ops.aten.pow.Tensor_Scalar(relu_, 2)
+                return pow_
+
+        unexpected_ops = {torch.ops.aten.add.Tensor}
+
+        inputs = [
+            torch.randint(
+                1,
+                10,
+                (5,),
+            ),
+            torch.randint(
+                1,
+                10,
+                (5,),
+            ),
+        ]
+
+        fx_graph = torch.fx.symbolic_trace(PartiallySupportedMultiOp())
+        (unexpected_ops_seen, _, partitioned_graphs,) = lower_graph_testing(
+            fx_graph,
+            inputs,
+            unexpected_ops=unexpected_ops,
+            min_block_size=2,
+            torch_executed_ops={"torch.ops.aten.add.Tensor"},
+            testing_partitioning=True,
+            use_fast_partitioner=False,
+        )
+
+        self.assertEquals(
+            len(unexpected_ops_seen),
+            0,
+            f"The following unexpected ops were encountered: {unexpected_ops_seen}",
+        )
+
+        self.assertEquals(
+            len(partitioned_graphs),
+            1,
+            "Without control flow breaks, there should only be a single graph",
+        )
+        self.assertEquals(
+            len(list(partitioned_graphs[0].named_children())),
+            1,
+            "Certain operators are set to run in Torch, expected 1 segment",
+        )
+
+
 if __name__ == "__main__":
     run_tests()
