Qualcomm AI Engine Direct - oss model enablement (EfficientSAM)

DannyYuyang-quic · DannyYuyang-quic · commit 1f614ea50dee · 2025-03-14T16:56:32.000+08:00
- Add an end-to-end (e2e) script for https://github.com/yformer/EfficientSAM
- Fix the FastViT breakage
- Correct the order of passes
- Add support for cum_sum
diff --git a/backends/qualcomm/_passes/layout_transform.py b/backends/qualcomm/_passes/layout_transform.py
@@ -53,6 +53,7 @@ class LayoutTransform(ExportPass):
         exir_ops.edge.aten.ceil.default,
         exir_ops.edge.aten.clamp.default,
         exir_ops.edge.aten.constant_pad_nd.default,
+        exir_ops.edge.aten.cumsum.default,
         exir_ops.edge.aten.div.Tensor,
         exir_ops.edge.aten.eq.Tensor,
         exir_ops.edge.aten.full.default,
diff --git a/backends/qualcomm/_passes/lift_constant_scalar_operands.py b/backends/qualcomm/_passes/lift_constant_scalar_operands.py
@@ -46,6 +46,7 @@ class TensorOpInfo:
     aten.pow.Tensor_Scalar: TensorOpInfo(aten.pow.Tensor_Tensor, False),
     # The scalar number arg[1] is missing when using default. Result in a corner case to deal
     aten.leaky_relu.default: TensorOpInfo(aten.prelu.default, True),
+    aten.where.ScalarOther: TensorOpInfo(aten.where.self, False),
 }
 
 
diff --git a/backends/qualcomm/_passes/recompose_pixel_unshuffle.py b/backends/qualcomm/_passes/recompose_pixel_unshuffle.py
@@ -45,13 +45,11 @@ def call(self, graph_module: torch.fx.GraphModule):
                         continue
 
                     view_node = premute_node.args[0]
-                    if any(
-                        [
-                            view_node.op != "call_function",
-                            view_node.target != self.view_target,
-                            len(view_node.args[1]) != 6,
-                            len(premute_node.args[1]) != 6,
-                        ]
+                    if (
+                        view_node.op != "call_function"
+                        or view_node.target != self.view_target
+                        or len(view_node.args[1]) != 6
+                        or len(premute_node.args[1]) != 6
                     ):
                         continue
 
diff --git a/backends/qualcomm/_passes/utils.py b/backends/qualcomm/_passes/utils.py
@@ -93,7 +93,7 @@ def get_passes_dependency_for_capture_program():
         ConvertToLinear: [RecomposePixelUnshuffle],
         DecomposeAny: [RemoveRedundancy],
         DecomposeLinalgVectorNorm: [RemoveRedundancy],
-        ExpandBroadcastTensorShape: [RemoveRedundancy],
+        ExpandBroadcastTensorShape: [ConstantI64toI32, TensorI64toI32],
         FoldQDQ: [AnnotateQuantAttrs, AnnotateDecomposed],
         LayoutTransform: [
             AnnotateQuantAttrs,
diff --git a/backends/qualcomm/builders/__init__.py b/backends/qualcomm/builders/__init__.py
@@ -19,6 +19,7 @@
     op_clamp,
     op_conv2d,
     op_cos,
+    op_cum_sum,
     op_depth_to_space,
     op_dequantize,
     op_div,
@@ -99,6 +100,7 @@
     op_clamp,
     op_conv2d,
     op_cos,
+    op_cum_sum,
     op_depth_to_space,
     op_dequantize,
     op_div,
diff --git a/backends/qualcomm/builders/op_cos.py b/backends/qualcomm/builders/op_cos.py
@@ -3,7 +3,6 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
-
 from typing import Dict
 
 import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
diff --git a/backends/qualcomm/builders/op_cum_sum.py b/backends/qualcomm/builders/op_cum_sum.py
@@ -0,0 +1,84 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import cast, Dict
+
+import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
+
+import numpy as np
+import torch
+from executorch.backends.qualcomm.utils.constants import QCOM_AXIS_ORDER, QCOM_DATA
+
+from .node_visitor import NodeVisitor, register_node_visitor
+from .qnn_constants import OpCumulativeSum, QNN_OP_PACKAGE_NAME_QTI_AISW
+
+
+@register_node_visitor
+class CumulativeSum(NodeVisitor):
+    """Node visitor that builds a QNN CumulativeSum op for ``aten.cumsum.default``."""
+
+    target = ["aten.cumsum.default"]
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def get_param(self, node, input_tensor):
+        """Return the axis to accumulate over as an ``np.uint32``.
+
+        The dimension is read from ``node.args[1]``. A negative value is
+        wrapped by the rank of ``input_tensor``; when ``QCOM_AXIS_ORDER`` is
+        present in ``node.meta`` the dimension is replaced by its position
+        inside that axis order.
+        """
+        dim = node.args[1]
+
+        if dim < 0:
+            dim = dim % len(input_tensor.shape)
+        if QCOM_AXIS_ORDER in node.meta:
+            dim = node.meta[QCOM_AXIS_ORDER].index(dim)
+
+        # typing.cast() only informs the type checker and returns its argument
+        # unchanged, so build the uint32 explicitly to match the
+        # QNN_DATATYPE_UINT_32 declared for the axis parameter below.
+        return np.uint32(dim)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper],
+    ) -> PyQnnWrapper.PyQnnOpWrapper:
+        """Create the op wrapper for ``node``.
+
+        Defines one input tensor (from ``node.args[0]``) and one output tensor
+        (from ``node`` itself), both as native tensors, and attaches the
+        ``axis``, ``exclusive`` and ``reverse`` scalar parameters.
+        """
+        input_node = node.args[0]
+        input_tensor = self.get_tensor(input_node, node)
+        input_tensor_wrapper = self.define_tensor(
+            input_node,
+            node,
+            input_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+
+        dim = self.get_param(node, input_tensor)
+
+        output_tensor = self.get_tensor(node, node)
+        output_tensor_wrapper = self.define_tensor(
+            node,
+            node,
+            output_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+
+        cumsum_op = PyQnnWrapper.PyQnnOpWrapper(
+            node.name,
+            QNN_OP_PACKAGE_NAME_QTI_AISW,
+            OpCumulativeSum.op_name,
+        )
+        cumsum_op.AddInputTensors([input_tensor_wrapper])
+        cumsum_op.AddOutputTensors([output_tensor_wrapper])
+        cumsum_op.AddScalarParam(
+            OpCumulativeSum.param_axis,
+            PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32,
+            {QCOM_DATA: dim},
+        )
+        # NOTE(review): exclusive/reverse are hard-coded to False, presumably
+        # to select the inclusive, forward scan that aten.cumsum computes;
+        # confirm against the QNN CumulativeSum op definition.
+        cumsum_op.AddScalarParam(
+            OpCumulativeSum.param_exclusive,
+            PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_BOOL_8,
+            {QCOM_DATA: False},
+        )
+        cumsum_op.AddScalarParam(
+            OpCumulativeSum.param_reverse,
+            PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_BOOL_8,
+            {QCOM_DATA: False},
+        )
+
+        return cumsum_op
diff --git a/backends/qualcomm/builders/op_sin.py b/backends/qualcomm/builders/op_sin.py
@@ -3,7 +3,6 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
-
 from typing import Dict
 
 import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
diff --git a/backends/qualcomm/builders/qnn_constants.py b/backends/qualcomm/builders/qnn_constants.py
@@ -50,6 +50,14 @@ class OpConvert:
     op_name: str = "Convert"
 
 
+@dataclass(init=False, frozen=True)
+class OpCumulativeSum:
+    # Op name and scalar-parameter keys used when building a CumulativeSum op.
+    # Annotated as `str` like the neighbouring Op* classes so that they are
+    # real dataclass fields.
+    op_name: str = "CumulativeSum"
+    param_axis: str = "axis"
+    param_exclusive: str = "exclusive"
+    param_reverse: str = "reverse"
+
+
 @dataclass(init=False, frozen=True)
 class OpDepthToSpace:
     op_name: str = "DepthToSpace"
diff --git a/backends/qualcomm/quantizer/annotators.py b/backends/qualcomm/quantizer/annotators.py
@@ -925,6 +925,11 @@ def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None
     )
 
 
+@register_annotator([torch.ops.aten.cumsum.default])
+def annotate_cumsum(node: Node, quantization_config: QuantizationConfig) -> None:
+    """Annotate ``aten.cumsum.default`` by delegating to ``annotate_single_in_single_out``."""
+    annotate_single_in_single_out(node, quantization_config)
+
+
 @register_annotator([torch.ops.aten.linear.default])
 def annotate_linear(node: Node, quantization_config: QuantizationConfig) -> None:
     act_node = node.args[0]
diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py
@@ -529,6 +529,14 @@ def forward(self, x):
         return torch.cos(x)
 
 
+class CumSum(torch.nn.Module):
+    """Test module that returns the cumulative sum of its input along dim 0."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return torch.cumsum(x, dim=0)
+
+
 class Div(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -1469,3 +1477,11 @@ def __init__(self, pos, neg):
 
     def forward(self, x):
         return torch.where(x >= torch.zeros(x.shape), self.pos, self.neg)
+
+
+class WhereConstantOther(torch.nn.Module):
+    """Test module calling ``torch.where`` with a Python scalar as the ``other`` operand."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        # Yields 1 where x >= 0 and the scalar 0 elsewhere. The scalar is
+        # passed as a literal on purpose; presumably this is what exercises
+        # the aten.where.ScalarOther overload (confirm against the export).
+        return torch.where(x >= 0, torch.ones(x.shape), 0)
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -208,6 +208,11 @@ def test_qnn_backend_cos(self):
         sample_input = (torch.randn(2, 5, 1, 3),)
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_cumsum(self):
+        """Lower the CumSum module with a 1-D random input and check its output."""
+        cumsum_module = CumSum()  # noqa: F405
+        inputs = (torch.randn(4),)
+        self.lower_module_and_test_output(cumsum_module, inputs)
+
     def test_qnn_backend_einsum_outer_product(self):
         module = EinsumOuterProduct()  # noqa: F405
         x = torch.randn(5)
@@ -790,10 +795,12 @@ def test_qnn_backend_where(self):
         modules = [
             Where(),  # noqa: F405
             WhereConstant(torch.randn(3, 2), torch.randn(3, 2)),  # noqa: F405
+            WhereConstantOther(),  # noqa: F405
         ]
         sample_inputs = [
             (torch.randn(3, 2), torch.randn(3, 2), torch.randn(3, 2)),
             (torch.randn(3, 2),),
+            (torch.randn(3, 2),),
         ]
         for i, module in enumerate(modules):
             self.lower_module_and_test_output(module, sample_inputs[i])
@@ -1165,6 +1172,12 @@ def test_qnn_backend_cos(self):
         module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_cumsum(self):
+        """Convert CumSum with ``get_qdq_module``, then lower it and check its output."""
+        inputs = (torch.randn(4),)
+        qdq_module = self.get_qdq_module(CumSum(), inputs)  # noqa: F405
+        self.lower_module_and_test_output(qdq_module, inputs)
+
     def test_qnn_backend_einsum_outer_product(self):
         module = EinsumOuterProduct()  # noqa: F405
         x = torch.randn(5)
@@ -1826,10 +1839,12 @@ def test_qnn_backend_where(self):
         modules = [
             Where(),  # noqa: F405
             WhereConstant(torch.randn(3, 2), torch.randn(3, 2)),  # noqa: F405
+            WhereConstantOther(),  # noqa: F405
         ]
         sample_inputs = [
             (torch.randn(3, 2), torch.randn(3, 2), torch.randn(3, 2)),
             (torch.randn(3, 2),),
+            (torch.randn(3, 2),),
         ]
         for i, module in enumerate(modules):
             module = self.get_qdq_module(module, sample_inputs[i])
@@ -3421,6 +3436,46 @@ def test_dino_v2(self):
                 self.assertGreaterEqual(msg["top_1"], 70)
                 self.assertGreaterEqual(msg["top_5"], 85)
 
+    def test_efficientSAM(self):
+        """Run the EfficientSAM example script and check the reported MIoU.
+
+        The test is skipped when ``self.required_envs`` rejects the image
+        dataset, pretrained weight and OSS repo settings. The script is
+        launched as a subprocess, and the JSON message received on the
+        listener socket is checked: an "Error" entry fails the test,
+        otherwise "MIoU" must be at least 0.55.
+        """
+        if not self.required_envs(
+            [self.image_dataset, self.pretrained_weight, self.oss_repo]
+        ):
+            self.skipTest("missing required envs")
+        cmds = [
+            "python",
+            # The script lives inside the efficientSAM/ package directory.
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/efficientSAM/efficientSAM.py",
+            "--dataset",
+            self.image_dataset,
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--device",
+            self.device,
+            "--model",
+            self.model,
+            "--oss_repo",
+            self.oss_repo,
+            "--pretrained_weight",
+            self.pretrained_weight,
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+        ]
+        if self.host:
+            cmds.extend(["--host", self.host])
+
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            # Wait for the script to finish before reading its result message.
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                self.assertGreaterEqual(msg["MIoU"], 0.55)
+
     def test_esrgan(self):
         if not self.required_envs():
             self.skipTest("missing required envs")
diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py
@@ -459,12 +459,13 @@ def lower_module_and_test_output(
         skip_node_id_set: set = None,
         skip_node_op_set: set = None,
         dynamic_shapes: Dict = None,
+        passes_job: collections.OrderedDict = None,
     ):
         qnn_partitioner = QnnPartitioner(
             self.compiler_specs, skip_node_id_set, skip_node_op_set
         )
         delegated_program = capture_program(
-            module, sample_inputs, dynamic_shapes=dynamic_shapes
+            module, sample_inputs, dynamic_shapes=dynamic_shapes, passes_job=passes_job
         )
 
         # this is needed for the ETRecord as lowering modifies the graph in-place
diff --git a/examples/qualcomm/oss_scripts/efficientSAM/efficientSAM.py b/examples/qualcomm/oss_scripts/efficientSAM/efficientSAM.py
diff --git a/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/__init__.py b/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/__init__.py
diff --git a/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/mask_decoder.py b/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/mask_decoder.py
diff --git a/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/pos_emb.py b/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/pos_emb.py
diff --git a/examples/qualcomm/oss_scripts/fastvit.py b/examples/qualcomm/oss_scripts/fastvit.py
diff --git a/examples/qualcomm/utils.py b/examples/qualcomm/utils.py

Original file line number	Diff line number	Diff line change
`@@ -46,6 +46,7 @@ class TensorOpInfo:`
`46`	`46`	`aten.pow.Tensor_Scalar: TensorOpInfo(aten.pow.Tensor_Tensor, False),`
`47`	`47`	`# The scalar number arg[1] is missing when using default. Result in a corner case to deal`
`48`	`48`	`aten.leaky_relu.default: TensorOpInfo(aten.prelu.default, True),`
	`49`	`+ aten.where.ScalarOther: TensorOpInfo(aten.where.self, False),`
`49`	`50`	`}`
`50`	`51`
`51`	`52`
Original file line number	Diff line number	Diff line change
`@@ -3,7 +3,6 @@`
`3`	`3`	`#`
`4`	`4`	`# This source code is licensed under the BSD-style license found in the`
`5`	`5`	`# LICENSE file in the root directory of this source tree.`
`6`		`-`
`7`	`6`	`from typing import Dict`
`8`	`7`
`9`	`8`	`import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper`
Original file line number	Diff line number	Diff line change
`@@ -459,12 +459,13 @@ def lower_module_and_test_output(`
`459`	`459`	`skip_node_id_set: set = None,`
`460`	`460`	`skip_node_op_set: set = None,`
`461`	`461`	`dynamic_shapes: Dict = None,`
	`462`	`+ passes_job: collections.OrderedDict = None,`
`462`	`463`	`):`
`463`	`464`	`qnn_partitioner = QnnPartitioner(`
`464`	`465`	`self.compiler_specs, skip_node_id_set, skip_node_op_set`
`465`	`466`	`)`
`466`	`467`	`delegated_program = capture_program(`
`467`		`- module, sample_inputs, dynamic_shapes=dynamic_shapes`
	`468`	`+ module, sample_inputs, dynamic_shapes=dynamic_shapes, passes_job=passes_job`
`468`	`469`	`)`
`469`	`470`
`470`	`471`	`# this is needed for the ETRecord as lowering modifies the graph in-place`