Enable int16 rsqrt on Ethos-U55/U85 (#14770)

3l1 · meta-codesync[bot] · commit 3a754b3bd385 · 2025-10-22T11:33:29.000-07:00
Summary: Pull Request resolved: #14770. Fix the Rsqrt op for int16 and add unit tests. bypass-github-export-checks bypass-github-pytorch-ci-checks bypass-github-executorch-ci-checks Reviewed By: Ninja91, digantdesai Differential Revision: D83802158
diff --git a/backends/arm/_passes/insert_table_ops.py b/backends/arm/_passes/insert_table_ops.py
@@ -185,18 +185,27 @@ def f(x: torch.Tensor) -> torch.Tensor:
             )
             # Dont use the 7 LSBs.
             x = in_quantargs.dequantize_value((x & ~0x7F))
+            # x = in_quantargs.dequantize_value(x) // (1 << 7)
             x = torch_op(x)
+            # x = x * (1 << 7)
             return out_quantargs.quantize_value(x)
 
-        lut_values = f(
-            torch.linspace(
-                start=in_quantargs.qmin,
-                end=in_quantargs.qmax + 1,
-                steps=513,
-                # use torch.int32 to avoid overflow for end=in_quantargs.qmax + 1.
-                dtype=torch.int32,
-            )
+        # Create the 9.7 fixed-point value
+        r = torch.linspace(
+            start=in_quantargs.qmin,
+            end=in_quantargs.qmax + 1,
+            steps=513,
+            # use torch.int32 to avoid overflow for end=in_quantargs.qmax + 1.
+            dtype=torch.int32,
         )
+        # # Cast input to a wider type (int32)
+        # r_int32 = r.to(torch.int32)
+        # # Extract most significant 9 bits
+        # index = (r_int32 >> 7) & 0x1FF
+        # # Extract the fractional 7 bits
+        # fraction = r_int32 & 0x7F
+
+        lut_values = f(r)
         # Calculate how much we need to shift table values to fit in 16 signed bits
         # ceil(log2(max absolute table value)) + 1 bit for signedness - 16
         # Example:
diff --git a/backends/arm/test/ops/test_rsqrt.py b/backends/arm/test/ops/test_rsqrt.py
@@ -9,16 +9,21 @@
 from typing import Tuple
 
 import torch
+from executorch.backends.arm.quantizer.arm_quantizer import (
+    get_symmetric_a16w8_quantization_config,
+    TOSAQuantizer,
+)
+from executorch.backends.arm.test import common, conftest
 
-from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.test_pipeline import (
     EthosU55PipelineINT,
     EthosU85PipelineINT,
     TosaPipelineFP,
     TosaPipelineINT,
     VgfPipeline,
 )
-
+from executorch.backends.arm.tosa import TosaSpecification
+from executorch.backends.xnnpack.test.tester import Quantize
 
 aten_op = "torch.ops.aten.rsqrt.default"
 input_t1 = Tuple[torch.Tensor]  # Input x
@@ -29,7 +34,7 @@ class Rsqrt(torch.nn.Module):
         "ones_4d": lambda: (torch.ones(1, 10, 10, 10),),
         "rand_4d_1": lambda: (torch.rand(1, 10, 10, 10),),
         "rand_4d_2": lambda: (torch.rand(1, 5, 10, 20),),
-        "rand_3d": lambda: (torch.rand(5, 10, 20),),
+        "rand_3d": lambda: (torch.rand(5, 10, 20) + 1.0,),
     }
 
     def forward(self, x: torch.Tensor):
@@ -104,3 +109,102 @@ def test_rsqrt_vgf_INT(test_tensor: torch.Tensor):
         tosa_version="TOSA-1.0+INT",
     )
     pipeline.run()
+
+
+def get_symmetric_a16w8_rsqrt_quantizer(
+    u55_config=False, per_channel_quantization=False
+):
+    tosa_version = conftest.get_option("tosa_version")
+    tosa_profiles = {
+        "1.0": TosaSpecification.create_from_string("TOSA-1.0+INT+int16"),
+    }
+
+    quantizer = TOSAQuantizer(tosa_profiles[tosa_version])
+    quantizer.set_global(
+        get_symmetric_a16w8_quantization_config(is_per_channel=per_channel_quantization)
+    )
+    quantizer.set_module_type(
+        torch.nn.Linear,
+        get_symmetric_a16w8_quantization_config(
+            is_per_channel=per_channel_quantization
+        ),
+    )
+    
+    return Quantize(
+        quantizer,
+        get_symmetric_a16w8_quantization_config(
+            is_per_channel=per_channel_quantization
+        ),
+    )
+
+
+@common.parametrize("test_tensor", Rsqrt.test_parameters)
+def test_rsqrt_int16_tosa_INT(test_tensor: torch.Tensor):
+    """Test rsqrt operation with int16 quantization"""
+    pipeline = TosaPipelineINT[input_t1](
+        Rsqrt(),
+        test_tensor(),
+        aten_op,
+        exir_op=[],
+        per_channel_quantization=False,
+        use_to_edge_transform_and_lower=True,
+        tosa_extensions=["int16"],
+    )
+
+    pipeline.change_args(
+        "quantize",
+        get_symmetric_a16w8_rsqrt_quantizer(
+            per_channel_quantization=False
+        ),
+    )
+    # Run the pipeline
+    pipeline.run()
+
+
+@common.parametrize("test_tensor", Rsqrt.test_parameters)
+@common.XfailIfNoCorstone300
+def test_rsqrt_int16_u55_INT16(test_tensor: torch.Tensor):
+    """Test rsqrt operation with int16 quantization on U55"""
+    pipeline = EthosU55PipelineINT[input_t1](
+        Rsqrt(),
+        test_tensor(),
+        aten_op,
+        exir_ops=[],
+        per_channel_quantization=True,
+        use_to_edge_transform_and_lower=True,
+        atol=1e-02,
+        rtol=1e-02,
+        run_on_fvp=True,
+    )
+
+    pipeline.change_args(
+        "quantize",
+        get_symmetric_a16w8_rsqrt_quantizer(
+            per_channel_quantization=True
+        ),
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_tensor", Rsqrt.test_parameters)
+@common.XfailIfNoCorstone320
+def test_rsqrt_int16_u85_INT16(test_tensor: torch.Tensor):
+    """Test rsqrt operation with int16 quantization on U85"""
+    pipeline = EthosU85PipelineINT[input_t1](
+        Rsqrt(),
+        test_tensor(),
+        aten_op,
+        exir_ops=[],
+        use_to_edge_transform_and_lower=True,
+        atol=1e-02,
+        rtol=1e-02,
+        run_on_fvp=True,
+    )
+
+    pipeline.change_args(
+        "quantize",
+        get_symmetric_a16w8_rsqrt_quantizer(
+            per_channel_quantization=False
+        ),
+    )
+    pipeline.run()
diff --git a/backends/arm/test/targets.bzl b/backends/arm/test/targets.bzl
@@ -24,6 +24,7 @@ def define_arm_tests():
         "ops/test_linear.py", 
         "ops/test_mul.py",
         "ops/test_permute.py",
+        "ops/test_rsqrt.py",
         "ops/test_slice.py",
         "ops/test_sigmoid.py",
         "ops/test_sub.py",