pytorch · lanluo-nvidia · Oct 6, 2024 · Oct 6, 2024 · Oct 6, 2024 · Oct 6, 2024
diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py
@@ -37,6 +37,7 @@
 )
 from torch_tensorrt.dynamo.utils import (
     get_flat_args_with_check,
+    get_output_metadata,
     parse_graph_io,
     prepare_inputs,
     set_log_level,
@@ -352,7 +353,6 @@ def compile(
 
     settings = CompilationSettings(**compilation_options)
     logger.info("Compilation Settings: %s\n", settings)
-
     exported_program = pre_export_lowering(exported_program, settings)
     exported_program = exported_program.run_decompositions(
         get_decompositions(enable_experimental_decompositions)
@@ -483,6 +483,12 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool:
     if not settings.use_fast_partitioner:
         dryrun_tracker.to_run_in_torch.extend(parse_non_trt_nodes(partitioned_module))
 
+    submodule_node_dict = {}
+    for node in partitioned_module.graph.nodes:
+        if "_run_on_acc" not in node.name:
+            continue
+        submodule_node_dict[node.name] = node
+
     # Store TRT replicas of Torch subgraphs
     trt_modules = {}
     # Iterate over all components that can be accelerated
@@ -502,6 +508,23 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool:
             )
             continue
 
+        if name not in submodule_node_dict:
+            raise ValueError(
+                f"node_name: {name} does not exist in the submodule node dictionary"
+            )
+
+        # Copy the submodule's output "val" metadata onto the parent graph node that calls it (only if that node has no "val" yet)
+        metadata_list = get_output_metadata(submodule)
+        assert len(metadata_list) > 0
+        if "val" not in submodule_node_dict[name].meta:
+            meta_val_list = [
+                metadata["val"] for metadata in metadata_list if "val" in metadata
+            ]
+            submodule_node_dict[name].meta["val"] = meta_val_list
+            logger.debug(
+                f"Update submodule output metadata back to the parent trt_module_node: {name}"
+            )
+
         subgraph_data = PerSubgraphData()
         subgraph_data.subgraph_name = name
         subgraph_data.subgraph_op_count = len(
@@ -762,7 +785,7 @@ def convert_exported_program_to_serialized_trt_engine(
     CONVERTERS.set_compilation_settings(settings)
 
     try:
-        interpreter_result, _ = interpret_module_to_result(
+        interpreter_result = interpret_module_to_result(
             gm,
             inputs=flattened_input_list,
             arg_inputs=arg_input_list,

diff --git a/py/torch_tensorrt/dynamo/_exporter.py b/py/torch_tensorrt/dynamo/_exporter.py
@@ -389,7 +389,11 @@ def inline_trt_modules(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
         trt_module_node = trt_module_node[0]
         assert trt_module_node.args
 
-        num_outputs = len(trt_module.output_shapes)
+        if "val" not in trt_module_node.meta:
+            raise ValueError(
+                f"trt_module_node: {trt_module_node.name} does not have the metadata which should be set during dynamo compile_module step."
+            )
+        num_outputs = len(trt_module_node.meta["val"])
         # Insert a call_function node to perform inference on TRT engine
         with gm.graph.inserting_before(trt_module_node):
             engine_name = f"{name}_engine"
@@ -400,19 +404,9 @@ def inline_trt_modules(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
                 torch.ops.tensorrt.execute_engine.default,
                 (trt_module_node.args, engine_node),
             )
-            trt_node.meta["val"] = []
+            # Reuse trt_module_node.meta["val"] as the output metadata of the new trt_node (assigned below)
             assert num_outputs > 0
-            # Generate meta data for TRT node (a FakeTensor with corresponding output shape)
-            for idx in range(num_outputs):
-                trt_node.meta["val"].append(
-                    cast(
-                        FakeTensor,
-                        torch.empty_strided(
-                            tuple(trt_module.output_shapes[idx]),
-                            tuple([1] * len(trt_module.output_shapes[idx])),
-                        ),
-                    )
-                )
+            trt_node.meta["val"] = trt_module_node.meta["val"]
 
             # meta["val"] should be a lighter version of a tensor. For eg: it should be a FakeTensor (with output shape and dtype properties)
             # Lighter version of a custom_obj is not defined clearly. meta["val"] does not have any type expectations but

diff --git a/py/torch_tensorrt/dynamo/_refit.py b/py/torch_tensorrt/dynamo/_refit.py
@@ -14,7 +14,7 @@
 from torch_tensorrt.dynamo import partitioning
 from torch_tensorrt.dynamo._exporter import inline_torch_modules
 from torch_tensorrt.dynamo._settings import CompilationSettings
-from torch_tensorrt.dynamo.conversion._conversion import infer_module_outputs
+from torch_tensorrt.dynamo.conversion._conversion import infer_module_output_dtypes
 from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
     DYNAMO_CONVERTERS as CONVERTERS,
 )
@@ -72,10 +72,8 @@ def construct_refit_mapping(
         "CONSTANT": (trt.IConstantLayer, [("weights", "CONSTANT")]),
     }
 
-    _, output_dtypes = infer_module_outputs(
+    output_dtypes = infer_module_output_dtypes(
         module,
-        inputs,
-        settings.device,
         truncate_double=settings.truncate_double,
     )
 

diff --git a/py/torch_tensorrt/dynamo/_tracer.py b/py/torch_tensorrt/dynamo/_tracer.py
@@ -115,6 +115,9 @@ def get_dynamic_shapes_args(mod: torch.nn.Module, inputs: Any) -> dict[str, Any]
     args = list(signature(mod.forward).parameters.keys())
     dynamic_shapes = {}
     for input, input_name in zip(inputs, args[: len(inputs)]):
+        # Prefer the user-provided input.name over the positional forward() parameter name when it is a non-empty string
+        if input.name is not None and len(input.name) > 0 and input.name != input_name:
+            input_name = input.name
         dynamic_shapes[input_name] = get_dynamic_shapes(input)
     return dynamic_shapes
 

diff --git a/py/torch_tensorrt/dynamo/conversion/_conversion.py b/py/torch_tensorrt/dynamo/conversion/_conversion.py
@@ -1,12 +1,10 @@
 from __future__ import annotations
 
 import logging
-from typing import Any, List, Optional, Sequence, Tuple
+from typing import Any, List, Optional, Sequence
 
 import tensorrt as trt
 import torch
-from torch.fx.experimental.proxy_tensor import unset_fake_temporarily
-from torch_tensorrt._Device import Device
 from torch_tensorrt._enums import dtype
 from torch_tensorrt._features import ENABLED_FEATURES
 from torch_tensorrt._Input import Input
@@ -17,64 +15,22 @@
     TRTInterpreterResult,
 )
 from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule, TorchTensorRTModule
-from torch_tensorrt.dynamo.utils import (
-    get_model_device,
-    get_torch_inputs,
-    unwrap_tensor_shape,
-)
+from torch_tensorrt.dynamo.utils import get_output_dtypes
 
 logger = logging.getLogger(__name__)
 
 
-def infer_module_outputs(
+def infer_module_output_dtypes(
     module: torch.fx.GraphModule,
-    inputs: Sequence[Input],
-    device: Device,
-    kwarg_inputs: Optional[dict[str, Any]] = None,
     truncate_double: bool = False,
-) -> Tuple[List[Tuple[int]], List[dtype]]:
+) -> List[dtype]:
     """
-    This function performs model inference to determine the output shapes and output dtypes
-    and truncates them accordingly. inputs can be either arg_inputs or flattened input list.
-    If it is flattened list, kwarg_inputs should be None, as it is already included in the flattened input.
+    This function gets the output dtypes from node.meta['val'], which is set during the dynamo compile_module step,
+    and truncates them accordingly.
     """
-    # TODO: We can also determine output dtypes from the module.graph based on node metadata.
-    # However, our converter tests use fx.symbolic_trace which sometimes does not provide metadata,
-    # so we stick to the model inference approach currently.
-    with unset_fake_temporarily():
-        # Get the device on which the model exists
-        # For large models, this can be done on CPU to save GPU memory allocation for TRT.
-        device = get_model_device(module)
-        torch_inputs = get_torch_inputs(inputs, device)
-        if kwarg_inputs is None:
-            kwarg_inputs = {}
-        torch_kwarg_inputs = get_torch_inputs(kwarg_inputs, device)
-        module_outputs = module(*torch_inputs, **torch_kwarg_inputs)
-        if not isinstance(module_outputs, (list, tuple)):
-            module_outputs = [module_outputs]
-
-    # Int64 outputs can sometimes be generated from within other operators
-    # such as aten.sum - such outputs can be truncated
-    output_dtypes = []
-    output_shapes = []
-    for output in module_outputs:
-        output_ = output
-        # We don't need to check if output is nested here because the input module will be flattened
-        if not isinstance(output, torch.Tensor):
-            if isinstance(output, str):
-                raise ValueError(
-                    f"Received an output type {type(output)} that's not in the acceptable datatypes (https://pytorch.org/docs/stable/tensor_attributes.html#torch.dtype)"
-                )
-            else:
-                output_ = torch.tensor(output)
-
-        output_shapes.append(unwrap_tensor_shape(output_))
-        if truncate_double and output_.dtype == dtype.float64:
-            output_dtypes.append(dtype.float32)
-        else:
-            output_dtypes.append(dtype._from(output_.dtype))
-
-    return output_shapes, output_dtypes
+    outputs = [node for node in module.graph.nodes if node.op == "output"]
+    outputs = outputs[0].args
+    return get_output_dtypes(outputs, truncate_double)
 
 
 def interpret_module_to_result(
@@ -84,8 +40,8 @@ def interpret_module_to_result(
     arg_inputs: Optional[Sequence[Input]] = None,
     kwarg_inputs: Optional[dict[str, Any]] = None,
     engine_cache: Optional[BaseEngineCache] = None,
-) -> Tuple[TRTInterpreterResult, List[Tuple[int]]]:
-    """Interpret an FX module to the output shapes and a TRTInterpreterResult
+) -> TRTInterpreterResult:
+    """Interpret an FX module to a TRTInterpreterResult
     Args:
         module: FX GraphModule to interpret
         inputs: Sequence of FLATTENED Tensors representing inputs to the module. It should include both
@@ -95,24 +51,11 @@ def interpret_module_to_result(
         settings: Compilation settings
         engine_cache: Engine cache instance
     Returns:
-        (TRTInterpreterResult, List[Tuple[int]])
+        TRTInterpreterResult
     """
-    if arg_inputs is not None:
-        output_shapes, output_dtypes = infer_module_outputs(
-            module,
-            arg_inputs,
-            settings.device,
-            kwarg_inputs=kwarg_inputs,
-            truncate_double=settings.truncate_double,
-        )
-    else:
-        # args and kwargs are combined and flattened to one list
-        output_shapes, output_dtypes = infer_module_outputs(
-            module,
-            inputs,
-            settings.device,
-            truncate_double=settings.truncate_double,
-        )
+    output_dtypes = infer_module_output_dtypes(
+        module, truncate_double=settings.truncate_double
+    )
 
     interpreter = TRTInterpreter(
         module,
@@ -124,7 +67,7 @@ def interpret_module_to_result(
     )
 
     interpreter_result = interpreter.run()
-    return interpreter_result, output_shapes
+    return interpreter_result
 
 
 def convert_module(
@@ -144,8 +87,7 @@ def convert_module(
     Returns:
         PythonTorchTensorRTModule or TorchTensorRTModule
     """
-
-    interpreter_result, output_shapes = interpret_module_to_result(
+    interpreter_result = interpret_module_to_result(
         module, inputs, settings, engine_cache=engine_cache
     )
 
@@ -169,7 +111,6 @@ def convert_module(
         serialized_engine=interpreter_result.serialized_engine,
         input_binding_names=list(interpreter_result.input_names),
         output_binding_names=list(interpreter_result.output_names),
-        output_shapes=output_shapes,
         name=name,
         settings=settings,
         weight_name_map=interpreter_result.weight_name_map,

diff --git a/py/torch_tensorrt/dynamo/lowering/passes/lower_linear.py b/py/torch_tensorrt/dynamo/lowering/passes/lower_linear.py
@@ -1,11 +1,11 @@
 import logging
-from typing import Callable, Tuple
 
 import torch
 from torch_tensorrt.dynamo._settings import CompilationSettings
 from torch_tensorrt.dynamo.lowering.passes.pass_utils import (
     clean_up_graph_after_modifications,
 )
+from torch_tensorrt.dynamo.utils import get_metadata, set_metadata
 
 logger = logging.getLogger(__name__)
 
@@ -14,33 +14,29 @@ def lower_linear(
     gm: torch.fx.GraphModule, settings: CompilationSettings
 ) -> torch.fx.GraphModule:
     """Replace aten.linear with an equivalent implementation which can be easily converted to TRT"""
-    orig, replacement = linear_replacement()
-
-    if torch.fx.subgraph_rewriter.replace_pattern(gm, orig, replacement):
-        gm = clean_up_graph_after_modifications(gm)
-        logger.debug(f"Graph after lowering linear:\n{gm.graph}")
-
-    return gm
-
-
-def linear_replacement() -> Tuple[
-    torch.fx.GraphModule,
-    Callable[[torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor],
-]:
-    """Constructs the original and replacement functions for linear"""
+    orig_op = torch.ops.aten.addmm.default
+    replacement_op = torch.ops.aten.linear.default
 
     # Original graph
     def orig(
         input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor
     ) -> torch.Tensor:
         W_T = torch.ops.aten.permute.default(weight, [1, 0])
-        out = torch.ops.aten.addmm.default(bias, input, W_T)
+        out = orig_op(bias, input, W_T)
         return out
 
     # Replacement graph
     def replacement(
         input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor
     ) -> torch.Tensor:
-        return torch.ops.aten.linear.default(input, weight, bias)
+        return replacement_op(input, weight, bias)
+
+    metadata = get_metadata(gm, orig_op)
+    replaced_nodes = torch.fx.subgraph_rewriter.replace_pattern(gm, orig, replacement)
+
+    if len(replaced_nodes) > 0:
+        gm = clean_up_graph_after_modifications(gm)
+        set_metadata(gm, replacement_op, metadata)
+        logger.debug(f"Graph after lowering linear:\n{gm.graph}")
 
-    return orig, replacement
+    return gm
diff --git a/py/torch_tensorrt/dynamo/lowering/passes/pass_utils.py b/py/torch_tensorrt/dynamo/lowering/passes/pass_utils.py
@@ -1,4 +1,4 @@
-from typing import Any, List
+from typing import List
 
 import torch
 
@@ -29,24 +29,3 @@ def get_tensor_placeholders(
     ]
 
     return placeholders
-
-
-def get_metadata(
-    gm: torch.fx.GraphModule, target_op: Any
-) -> List[torch._ops.OpOverload]:
-    """
-    Return the list which has the metadata of all the target_op nodes present in the graph.
-    """
-    return [node.meta for node in gm.graph.nodes if node.target == target_op]
-
-
-def set_metadata(
-    gm: torch.fx.GraphModule, target_op: Any, metadata: List[torch._ops.OpOverload]
-) -> None:
-    """
-    Return the list which has the metadata of all the target_op nodes present in the graph.
-    """
-    target_nodes = [node for node in gm.graph.nodes if node.target == target_op]
-    assert len(target_nodes) == len(metadata)
-    for idx, node in enumerate(target_nodes):
-        node.meta = metadata[idx]
diff --git a/py/torch_tensorrt/dynamo/lowering/passes/view_to_reshape.py b/py/torch_tensorrt/dynamo/lowering/passes/view_to_reshape.py
@@ -5,9 +5,8 @@
 from torch_tensorrt.dynamo._settings import CompilationSettings
 from torch_tensorrt.dynamo.lowering.passes.pass_utils import (
     clean_up_graph_after_modifications,
-    get_metadata,
-    set_metadata,
 )
+from torch_tensorrt.dynamo.utils import get_metadata, set_metadata
 
 logger = logging.getLogger(__name__)
 

diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
@@ -35,7 +35,6 @@ def __init__(
         serialized_engine: Optional[bytes] = None,
         input_binding_names: Optional[List[str]] = None,
         output_binding_names: Optional[List[str]] = None,
-        output_shapes: Optional[List[Tuple[int]]] = None,
         *,
         name: str = "",
         settings: CompilationSettings = CompilationSettings(),
@@ -93,7 +92,6 @@ def __init__(
         self.output_names = (
             output_binding_names if output_binding_names is not None else []
         )
-        self.output_shapes = output_shapes
         self.initialized = False
         self.target_device_id = (
             settings.device.gpu_id