
Commit d718464

refactor
1 parent 271143c commit d718464

File tree

13 files changed: +202 -65 lines changed


py/torch_tensorrt/dynamo/_engine_cache.py

Lines changed: 4 additions & 4 deletions
@@ -107,7 +107,7 @@ def pack(
         input_specs: Sequence[Input],
         compilation_settings: CompilationSettings,
         weight_name_map: Optional[Dict[Any, Any]],
-        engine_is_dds: bool,
+        requires_output_allocator: bool,
     ) -> bytes:
         """Pack serialized engine, input names, output names, and weight map into a single blob

@@ -118,7 +118,7 @@ def pack(
             input_specs (Sequence[Input]): input specs of TRT engine
             compilation_settings (CompilationSettings): compilation settings of TRT engine
             weight_name_map (Optional[Dict[Any, Any]]): weight name map for refitting
-            engine_is_dds (bool): whether the engine is data-dependent shape
+            requires_output_allocator (bool): whether the engine requires output allocator
         Returns:
             bytes: packed blob
         """
@@ -132,7 +132,7 @@ def pack(
                 "input_specs": input_specs,
                 "compilation_settings": settings,
                 "weight_name_map": weight_name_map,
-                "engine_is_dds": engine_is_dds,
+                "requires_output_allocator": requires_output_allocator,
             }
         )

@@ -154,7 +154,7 @@ def unpack(packed_obj: bytes) -> UnpackedCacheHit:
             unpacked["input_specs"],
             unpacked["compilation_settings"],
             unpacked["weight_name_map"],
-            unpacked["engine_is_dds"],
+            unpacked["requires_output_allocator"],
         )

     def insert(
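For orientation, a minimal sketch of how the renamed field rides through the cache blob. The first three positional parameters are inferred from the pack() docstring above; all values here are placeholders for illustration and are not part of the commit.

from torch_tensorrt import Input
from torch_tensorrt.dynamo import CompilationSettings
from torch_tensorrt.dynamo._engine_cache import BaseEngineCache

serialized_engine = b""  # placeholder for a real serialized TRT engine

blob = BaseEngineCache.pack(
    serialized_engine,
    ["x"],                            # input names
    ["output0"],                      # output names
    [Input(shape=(1, 3, 224, 224))],  # input specs
    CompilationSettings(),            # compilation settings
    None,                             # weight name map (no refitting)
    True,                             # requires_output_allocator (was engine_is_dds)
)

# unpack() returns the same fields; requires_output_allocator is now the last element.
*_, requires_output_allocator = BaseEngineCache.unpack(blob)
assert requires_output_allocator is True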

py/torch_tensorrt/dynamo/conversion/_ConversionContext.py

Lines changed: 2 additions & 0 deletions
@@ -11,9 +11,11 @@ class ConversionContext:
     Args:
         net: TensorRT Network being built
         compilation_settings: Settings selected by the user for compilation
+        requires_output_allocator: Whether the network requires output allocator
     """

     net: TRTNetwork
     compilation_settings: CompilationSettings = field(
         default_factory=CompilationSettings
     )
+    requires_output_allocator: bool = False

py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py

Lines changed: 16 additions & 5 deletions
@@ -18,6 +18,7 @@
     cast,
 )

+import tensorrt as trt
 import torch
 from torch import SymBool, SymFloat, SymInt
 from torch._ops import OpOverloadPacket
@@ -26,8 +27,6 @@
 from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext
 from torch_tensorrt.fx.converter_registry import CONVERTERS as FX_CONVERTERS

-import tensorrt as trt
-
 logger = logging.getLogger(__name__)

 LegacyConverterImplSignature = Callable[
@@ -81,13 +80,15 @@ class ConverterSupport:
             whether that node can be supported by its companion converter. Note that
             this function must not modify the node or its graph
         supports_dynamic_shapes: Boolean flag indicating if the converter has support for dynamic inputs.
+        requires_output_allocator: Boolean flag indicating if the converter requires to run in output allocator.
     """

     converter_implementation: ConverterImplSignature
     capability_validator: Callable[[Node, CompilationSettings], bool] = field(
         default=lambda node, compilation_settings: True
     )
     supports_dynamic_shapes: bool = False
+    requires_output_allocator: bool = False


 # Dictionary representing Dynamo aten-only converters
@@ -197,6 +198,7 @@ def dynamo_tensorrt_converter(
     capability_validator: Optional[Callable[[Node, CompilationSettings], bool]] = None,
     priority: ConverterPriority = ConverterPriority.STANDARD,
     supports_dynamic_shapes: bool = False,
+    requires_output_allocator: bool = False,
 ) -> Callable[[ConverterImplSignature], ConverterImplSignature]:
     """Decorator for Dynamo TensorRT Converter

@@ -212,6 +214,8 @@ def dynamo_tensorrt_converter(
             this means all nodes of "key" kind can be supported by this converter
         priority: Converter's level of priority relative to other converters with the
             same target
+        supports_dynamic_shapes: Boolean flag indicating if the converter has support for dynamic shapes.
+        requires_output_allocator: Boolean flag indicating if the converter requires to run in output allocator.
     Returns:
         The converter being decorated
     """
@@ -225,6 +229,7 @@ def register_converter(converter: ConverterImplSignature) -> ConverterImplSignat
             converter_support = ConverterSupport(
                 converter_implementation=converter,
                 supports_dynamic_shapes=supports_dynamic_shapes,
+                requires_output_allocator=requires_output_allocator,
             )
         else:
             assert callable(
@@ -234,6 +239,7 @@ def register_converter(converter: ConverterImplSignature) -> ConverterImplSignat
                 converter_implementation=converter,
                 capability_validator=capability_validator,
                 supports_dynamic_shapes=supports_dynamic_shapes,
+                requires_output_allocator=requires_output_allocator,
             )

     # OpOverloadPackets are only valid if they have a single overload, or
@@ -404,7 +410,7 @@ def __getitem_without_validation__(
     def __getitem__(
         self, node: Node
     ) -> Tuple[
-        Any, CallingConvention
+        Any, CallingConvention, bool
     ]:  # TODO: Narrow to ConverterImplSignature this when we can remove FX converters
         """Get the first-found validated converter in any registry

@@ -462,6 +468,7 @@ def __getitem__(
                             return (
                                 candidate.converter_implementation,
                                 calling_convention,
+                                candidate.requires_output_allocator,
                             )
                         else:
                             logger.debug(
@@ -471,7 +478,11 @@ def __getitem__(
                 else:
                     # Assuming FX converters don't have dynamic shapes supported
                     if not node_has_dynamic_shapes(node):
-                        return converters, calling_convention
+                        return (
+                            converters,
+                            calling_convention,
+                            candidate.requires_output_allocator,
+                        )

         raise KeyError(
             f"None of the converter registries have a validated entry for {key}, with node {node}"
@@ -495,7 +506,7 @@ def get_unvalidated(
     def get(
         self, node: Node, value: Optional[ConverterImplSignature] = None
     ) -> Union[
-        Any, Tuple[Any, CallingConvention]
+        Any, Tuple[Any, CallingConvention, bool]
     ]:  # TODO: Narrow to ConverterImplSignature this when we can remove FX converters
         """Get validated converter for input node with a default return"""
         try:
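Call sites of the registry now unpack a three-element tuple. A minimal sketch of the new contract, assuming the module-level DYNAMO_CONVERTERS registry instance and an already-validated torch.fx node; the ctx handling mirrors what _TRTInterpreter.py does below.

from torch_tensorrt.dynamo.conversion._ConverterRegistry import DYNAMO_CONVERTERS

# node: a torch.fx.Node whose target has a validated converter (assumed to exist here).
converter, calling_convention, requires_output_allocator = DYNAMO_CONVERTERS[node]

if requires_output_allocator:
    # The interpreter latches this onto its ConversionContext so the runtime
    # module is later built with requires_output_allocator=True.
    ctx.requires_output_allocator = True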

py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py

Lines changed: 11 additions & 23 deletions
@@ -64,7 +64,7 @@ class TRTInterpreterResult(NamedTuple):
     input_names: Sequence[str]
     output_names: Sequence[str]
     weight_name_map: Optional[dict[Any, Any]]
-    engine_is_dds: bool
+    requires_output_allocator: bool


 class TRTInterpreter(torch.fx.Interpreter):  # type: ignore[misc]
@@ -139,9 +139,6 @@ def __init__(
         # Engine cache for storing and reusing TRT engines
         self.engine_cache = engine_cache

-        # Whether the engine is data-dependent shape (dds)
-        self.engine_is_dds: bool = False
-
     def validate_conversion(self) -> Set[str]:
         missing_converters: Set[str] = set()

@@ -581,7 +578,7 @@ def _insert_engine_to_cache(self, hash_val: str, serialized_engine: bytes) -> No
                 self.input_specs,
                 self.compilation_settings,
                 self.weight_name_map,
-                self.engine_is_dds,
+                self.ctx.requires_output_allocator,
             ),
         )

@@ -596,7 +593,7 @@ def _pull_cached_engine(self, hash_val: str) -> Optional[TRTInterpreterResult]:
                 cached_engine_input_specs,
                 engine_compilation_settings,
                 self.weight_name_map,
-                self.engine_is_dds,
+                self.ctx.requires_output_allocator,
             ) = cached_data

             setting_compatiblity, incompattible_settings = settings_are_compatible(
@@ -658,20 +655,10 @@ def _pull_cached_engine(self, hash_val: str) -> Optional[TRTInterpreterResult]:
                 self._input_names,
                 self._output_names,
                 self.weight_name_map,
-                self.engine_is_dds,
+                self.ctx.requires_output_allocator,
             )
         return None

-    def check_dds(self, serialized_engine: bytes, output_names: List[str]) -> bool:
-        runtime = trt.Runtime(TRT_LOGGER)
-        engine = runtime.deserialize_cuda_engine(serialized_engine)
-
-        for output_name in output_names:
-            output_shape = engine.get_tensor_shape(output_name)
-            if -1 in output_shape:
-                return True
-        return False
-
     def run(
         self,
         strict_type_constraints: bool = False,
@@ -728,8 +715,6 @@ def run(
             )
             assert serialized_engine

-            self.engine_is_dds = self.check_dds(serialized_engine, self._output_names)
-
             _LOGGER.info(
                 f"Build TRT engine elapsed time: {datetime.now() - build_engine_start_time}"
             )
@@ -756,7 +741,7 @@ def run(
             self._input_names,
             self._output_names,
             self.weight_name_map,
-            self.engine_is_dds,
+            self.ctx.requires_output_allocator,
         )

     def run_node(self, n: torch.fx.Node) -> torch.fx.Node:
@@ -850,7 +835,7 @@ def call_module(
                 f"Conversion of module of type {submod_type} not currently supported!"
             )

-        converter, calling_convention = converter_packet
+        converter, calling_convention, requires_output_allocator = converter_packet

         assert self._cur_node_name is not None

@@ -867,7 +852,10 @@ def call_function(self, target: str, args: Any, kwargs: Any) -> Any:
                 f"Conversion of function {torch.typename(target)} not currently supported!"
             )

-        converter, calling_convention = converter_packet
+        converter, calling_convention, requires_output_allocator = converter_packet
+        if requires_output_allocator:
+            self.ctx.requires_output_allocator = True
+            _LOGGER.debug(f"{target} requires output allocator")

         if calling_convention is CallingConvention.LEGACY:
             return converter(self.ctx.net, target, args, kwargs, self._cur_node_name)
@@ -897,7 +885,7 @@ def call_method(self, target: str, args: Any, kwargs: Any) -> Any:
             raise UnsupportedOperatorException(
                 f"Conversion of method {target} not currently supported!"
             )
-        converter, calling_convention = converter_packet
+        converter, calling_convention, requires_output_allocator = converter_packet

         if calling_convention is CallingConvention.LEGACY:
             return converter(self.ctx.net, target, args, kwargs, self._cur_node_name)
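With check_dds() gone, the flag is set during conversion and surfaces on TRTInterpreterResult instead of being recomputed from the serialized engine. A short sketch of consuming it; interpreter construction is omitted and assumed to have succeeded.

# interpreter: a TRTInterpreter built for a partitioned submodule (setup omitted).
result = interpreter.run()

# requires_output_allocator replaces the old engine_is_dds field on the NamedTuple.
if result.requires_output_allocator:
    print("This engine will be executed through a DynamicOutputAllocator.")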

py/torch_tensorrt/dynamo/conversion/_conversion.py

Lines changed: 1 addition & 1 deletion
@@ -112,5 +112,5 @@ def convert_module(
         name=name,
         settings=settings,
         weight_name_map=interpreter_result.weight_name_map,
-        engine_is_dds=interpreter_result.engine_is_dds,
+        requires_output_allocator=interpreter_result.requires_output_allocator,
     )

py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py

Lines changed: 5 additions & 1 deletion
@@ -3554,7 +3554,11 @@ def aten_ops_full(
     )


-@dynamo_tensorrt_converter(torch.ops.aten.nonzero.default, supports_dynamic_shapes=True)
+@dynamo_tensorrt_converter(
+    torch.ops.aten.nonzero.default,
+    supports_dynamic_shapes=True,
+    requires_output_allocator=True,
+)
 def aten_ops_nonzero(
     ctx: ConversionContext,
     target: Target,

py/torch_tensorrt/dynamo/partitioning/common.py

Lines changed: 4 additions & 2 deletions
@@ -31,8 +31,10 @@ def construct_dynamic_input(
         if isinstance(dim, torch.SymInt):
             min_max_opt = extract_var_range_info(dim)
             min_shape.append(min_max_opt["min"])
-            # opt might not exist
-            opt_shape.append(min_max_opt.get("opt"))
+            # if opt not exist, set it to the mean of min and max
+            opt_shape.append(
+                min_max_opt.get("opt", int(min_max_opt["min"] + min_max_opt["max"] / 2))
+            )
             max_shape.append(min_max_opt["max"])
         else:
             min_shape.append(dim)
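Per the new comment, the fallback is meant to be the midpoint of the symbolic range when no opt value is recorded. A standalone sketch of that intent (note the sum must be parenthesized before dividing to get the true mean); the helper name is illustrative, not part of the commit.

def opt_dim_fallback(min_max_opt: dict) -> int:
    # Prefer the recorded opt value; otherwise fall back to the midpoint of
    # [min, max], i.e. (min + max) / 2 with the sum parenthesized.
    return min_max_opt.get(
        "opt", int((min_max_opt["min"] + min_max_opt["max"]) / 2)
    )

# Example: a symbolic dim constrained to [1, 9] with no opt hint resolves to 5.
assert opt_dim_fallback({"min": 1, "max": 9}) == 5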

py/torch_tensorrt/dynamo/runtime/_CudaGraphsTorchTensorRTModule.py

Lines changed: 0 additions & 5 deletions
@@ -79,11 +79,6 @@ def set_output_allocator_outputs(self, enable: bool) -> None:

     def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, ...]:
         cudagraphs_enabled = torch_tensorrt.runtime.get_whole_cudagraphs_mode()
-        if cudagraphs_enabled and self.use_output_allocator_outputs:
-            raise RuntimeError(
-                "There are non-TRT submodules in the module. OutputAllocator is not compatible with modules with non-TRT submodules."
-            )
-
         if cudagraphs_enabled:
             shape_changed = self.validate_input_shapes(inputs)
             need_cudagraphs_record = shape_changed or self.is_weight_streaming_set

py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py

Lines changed: 8 additions & 10 deletions
@@ -127,7 +127,7 @@ def __init__(
         name: str = "",
         settings: CompilationSettings = CompilationSettings(),
         weight_name_map: Optional[dict[Any, Any]] = None,
-        engine_is_dds: bool = False,
+        requires_output_allocator: bool = False,
     ):
         """Takes a name, target device, serialized TensorRT engine, and binding names / order and constructs
         a PyTorch ``torch.nn.Module`` around it. Uses TensorRT Python APIs to run the engine
@@ -141,7 +141,7 @@ def __init__(
             name (str): Name for module
             settings (torch_tensorrt.dynamo.CompilationSettings): Settings used to compile engine, assumes engine was built with default compilation settings if object not passed
             weight_name_map (dict): Mapping of engine weight name to state_dict weight name
-            engine_is_dds (bool): Whether the engine is Data Dependent Shape
+            requires_output_allocator (bool): Whether the engine requires an output allocator

         Example:

@@ -206,7 +206,7 @@ def __init__(
         self.pre_allocated_outputs: List[torch.Tensor] = []
         self.use_pre_allocated_outputs = False

-        self.engine_is_dds = engine_is_dds
+        self.requires_output_allocator = requires_output_allocator
         self.output_allocator: Optional[DynamicOutputAllocator] = None
         self.use_output_allocator_outputs = False

@@ -281,7 +281,7 @@ def setup_engine(self) -> None:
             for output_name in self.output_names
         ]

-        if self.engine_is_dds:
+        if self.requires_output_allocator:
             self.create_output_allocator()

         if torch_tensorrt.runtime.get_cudagraphs_mode():
@@ -678,22 +678,20 @@ def run_output_allocator() -> torch.Tensor | Tuple[torch.Tensor, ...]:
                 ]
                 logger.warning(f"Moved all input Tensors to cuda:{device_id}")

-            if self.engine_is_dds:
+            if self.requires_output_allocator:
                 if self.cudagraphs_enabled:
                     raise RuntimeError(
-                        "The module is Data-Dependent Shape (DDS). It has to be handled by OutputAllocator which is not compatible with CUDA Graphs. Please disable CUDA Graphs."
+                        "This module requires OutputAllocator which is not compatible with CUDA Graphs. Please disable CUDA Graphs."
                     )
-                logger.debug(
-                    "The module is Data-Dependent Shape (DDS). Using output allocator."
-                )
+                logger.debug("Using OutputAllocator in runtime.")
                 return run_output_allocator()
             else:
                 if self.cudagraphs_enabled and self.use_output_allocator_outputs:
                     raise RuntimeError(
                         "Both CUDA Graphs and OutputAllocator are enabled. Please disable either one."
                     )
                 if self.use_output_allocator_outputs:
-                    logger.debug("Using output allocator.")
+                    logger.debug("Using OutputAllocator in runtime.")
                     return run_output_allocator()
                 logger.debug(
                     f"Using standard execution with cudagraphs={self.cudagraphs_enabled}."

0 commit comments
