Qualcomm AI Engine Direct - Support tile op for different I/O rank

DannyYuyang-quic · DannyYuyang-quic · commit fc5c30bee96b · 2025-04-10T12:21:41.000+08:00
Summary:

- Support the case where the rank of the input tensor is less than the rank of the output tensor.
- Align make_quantizer kwargs: rename submodule_qconfig_list to callback_qconfig_list.
- Remove module.eval() since calling eval() is not supported for exported models.
diff --git a/backends/qualcomm/_passes/expand_broadcast_tensor_shape.py b/backends/qualcomm/_passes/expand_broadcast_tensor_shape.py
@@ -5,29 +5,38 @@
 # LICENSE file in the root directory of this source tree.
 
 import torch
+
+from executorch.backends.qualcomm.utils.constants import QCOM_QUANT_ATTRS
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
 from executorch.exir.passes import dead_code_elimination_pass
 
+from .utils import dq_ops, get_quant_attrs
+
 
 class ExpandBroadcastTensorShape(ExportPass):
     """
     Make tensors have same rank for layout-transform to work properly.
     """
 
-    def __init__(self):
+    def __init__(self, edge_program):
         super(ExpandBroadcastTensorShape, self).__init__()
         self.broadcast_op_targets = [
             exir_ops.edge.aten.add.Tensor,
             exir_ops.edge.aten.sub.Tensor,
             exir_ops.edge.aten.mul.Tensor,
             exir_ops.edge.aten.div.Tensor,
+            # Handle expand_copy when the input tensor's rank is less than the output tensor's rank.
+            exir_ops.edge.aten.expand_copy.default,
         ]
+        self.edge_program = edge_program
 
     def traverse_broadcast_node(self, graph_module: torch.fx.GraphModule):
         for node in graph_module.graph.nodes:
             if node.target in self.broadcast_op_targets:
                 for arg in node.args:
+                    if not isinstance(arg, torch.fx.Node):
+                        continue
                     input_rank = len(arg.meta["val"].shape)
                     output_rank = len(node.meta["val"].shape)
                     if input_rank != output_rank:
@@ -45,6 +54,9 @@ def traverse_broadcast_node(self, graph_module: torch.fx.GraphModule):
                             # to be updated correctly and not affect meta of arg
                             for k, v in arg.meta.items():
                                 reshape_node.meta[k] = v
+                            if arg.target in dq_ops:
+                                quant_attrs = get_quant_attrs(self.edge_program, arg)
+                                reshape_node.meta[QCOM_QUANT_ATTRS] = quant_attrs
                             reshape_node.meta["val"] = reshape_node.meta["val"].reshape(
                                 new_rank
                             )
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -69,7 +69,11 @@
 from collections import defaultdict
 from typing import List
 
-from executorch.backends.qualcomm._passes import FoldQDQ, TagQuantIO
+from executorch.backends.qualcomm._passes import (
+    ExpandBroadcastTensorShape,
+    FoldQDQ,
+    TagQuantIO,
+)
 from executorch.backends.qualcomm.builders.node_visitor import get_node_visitors
 from executorch.backends.qualcomm.debugger.utils import DrawGraph
 from executorch.examples.models.deeplab_v3 import DeepLabV3ResNet101Model
@@ -430,10 +434,20 @@ def test_qnn_backend_equal(self):
 
     def test_qnn_backend_expand(self):
         modules = [ExpandAs(), ExpandCopy()]  # noqa: F405
-        sample_input = (torch.randn([3, 1]),)
-        for i, module in enumerate(modules):
-            with self.subTest(i=i):
-                self.lower_module_and_test_output(module, sample_input)
+        sample_inputs = [
+            (torch.randn([3, 1]),),
+            (torch.randn([4]),),
+        ]
+        passes_job = get_capture_program_passes()
+        passes_job[ExpandBroadcastTensorShape][QCOM_PASS_ACTIVATE_KEY] = True
+        index = 0
+        for module in modules:
+            for sample_input in sample_inputs:
+                with self.subTest(i=index):
+                    self.lower_module_and_test_output(
+                        module, sample_input, passes_job=passes_job
+                    )
+                    index += 1
 
     def test_qnn_backend_expm1(self):
         sample_input = (torch.randn(3, 4, 5),)
@@ -1506,11 +1520,21 @@ def test_qnn_backend_equal(self):
 
     def test_qnn_backend_expand(self):
         modules = [ExpandAs(), ExpandCopy()]  # noqa: F405
-        sample_input = (torch.randn([3, 1]),)
-        for i, module in enumerate(modules):
-            with self.subTest(i=i):
-                module = self.get_qdq_module(module, sample_input)
-                self.lower_module_and_test_output(module, sample_input)
+        sample_inputs = [
+            (torch.randn([3, 1]),),
+            (torch.randn([4]),),
+        ]
+        passes_job = get_capture_program_passes()
+        passes_job[ExpandBroadcastTensorShape][QCOM_PASS_ACTIVATE_KEY] = True
+        index = 0
+        for module in modules:
+            for sample_input in sample_inputs:
+                with self.subTest(i=index):
+                    module = self.get_qdq_module(module, sample_input)
+                    self.lower_module_and_test_output(
+                        module, sample_input, passes_job=passes_job
+                    )
+                    index += 1
 
     def test_qnn_backend_expm1(self):
         sample_input = (torch.randn(3, 4, 5),)
diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py
@@ -9,7 +9,7 @@
 import subprocess
 import tempfile
 import unittest
-from typing import Callable, Dict, List, Optional, Tuple
+from typing import Callable, Dict, List, Optional, OrderedDict, Tuple
 
 import numpy as np
 import torch
@@ -435,6 +435,7 @@ def lower_module_and_test_output(
         expected_profile_events: int = -1,
         expected_intermediate_events: int = -1,
         assert_output_equal: bool = True,
+        passes_job: Optional[OrderedDict] = None,
         skip_node_id_set: set = None,
         skip_node_op_set: set = None,
         dynamic_shapes: Dict = None,
@@ -444,6 +445,7 @@ def lower_module_and_test_output(
             sample_inputs,
             self.compiler_specs,
             dynamic_shapes=dynamic_shapes,
+            passes_job=passes_job,
             skip_node_id_set=skip_node_id_set,
             skip_node_op_set=skip_node_op_set,
         )
@@ -504,9 +506,8 @@ def get_qdq_module(
         dynamic_shapes: Dict = None,
         bypass_check: bool = False,
         block_size_map: Dict[str, Tuple] = None,
-        submodule_qconfig_list: Optional[List[Tuple[Callable, ModuleQConfig]]] = None,
+        callback_qconfig_list: Optional[List[Tuple[Callable, ModuleQConfig]]] = None,
     ) -> torch.fx.GraphModule:
-        module = module.eval()
         m = torch.export.export(
             module, inputs, dynamic_shapes=dynamic_shapes, strict=True
         ).module()
@@ -516,7 +517,7 @@ def get_qdq_module(
             custom_annotations=custom_quant_annotations,
             per_channel_conv=is_conv_per_channel,
             per_channel_linear=is_linear_per_channel,
-            submodule_qconfig_list=submodule_qconfig_list,
+            callback_qconfig_list=callback_qconfig_list,
         )
         if block_size_map is not None:
             quantizer.set_block_size_map(block_size_map)
@@ -544,7 +545,7 @@ def get_prepared_qat_module(
         is_linear_per_channel: Optional[bool] = False,
         custom_quant_annotations: Tuple[Callable] = (),
         quant_dtype: QuantDtype = QuantDtype.use_8a8w,
-        submodule_qconfig_list: Optional[List[Tuple[Callable, ModuleQConfig]]] = None,
+        callback_qconfig_list: Optional[List[Tuple[Callable, ModuleQConfig]]] = None,
     ) -> torch.fx.GraphModule:
         m = torch.export.export_for_training(module, inputs, strict=True).module()
 
@@ -554,11 +555,11 @@ def get_prepared_qat_module(
             per_channel_conv=is_conv_per_channel,
             per_channel_linear=is_linear_per_channel,
             is_qat=True,
-            submodule_qconfig_list=submodule_qconfig_list,
+            callback_qconfig_list=callback_qconfig_list,
         )
 
-        submodule_qconfig_list = submodule_qconfig_list or []
-        quantizer.set_submodule_qconfig_list(submodule_qconfig_list)
+        callback_qconfig_list = callback_qconfig_list or []
+        quantizer.set_submodule_qconfig_list(callback_qconfig_list)
 
         prepared = prepare_qat_pt2e(m, quantizer)
         return torch.ao.quantization.move_exported_model_to_train(prepared)