
Commit b4115d3

Merge branch 'main' into rocm_swizzle_reland2

2 parents f4ec46d + 5549da8

File tree

15 files changed: +390, -27 lines changed


.github/workflows/torchao_experimental_test.yml

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ jobs:
1414
test-cpu-ops:
1515
strategy:
1616
matrix:
17-
runner: [macos-14]
17+
runner: [macos-14, linux.arm64.2xlarge]
1818
runs-on: ${{matrix.runner}}
1919
defaults:
2020
run:
@@ -30,7 +30,8 @@ jobs:
3030
python-version: "3.10"
3131
miniconda-version: "latest"
3232
activate-environment: venv
33-
- name: Install requirements
33+
- name: Install requirements mac
34+
if: runner.os == 'macOS'
3435
run: |
3536
conda activate venv
3637
# Install executorch first because it installs its own version
@@ -39,27 +40,37 @@ jobs:
3940
pip install torch==2.7.0 --index-url https://download.pytorch.org/whl/cpu --force-reinstall
4041
pip install -r dev-requirements.txt
4142
USE_CPP=1 TORCHAO_BUILD_KLEIDIAI=1 pip install .
43+
- name: Install requirements linux
44+
if: runner.os == 'Linux'
45+
run: |
46+
conda activate venv
47+
pip install torch==2.7.0 --index-url https://download.pytorch.org/whl/cpu --force-reinstall
48+
pip install -r dev-requirements.txt
49+
BUILD_TORCHAO_EXPERIMENTAL=1 TORCHAO_BUILD_CPU_AARCH64=1 TORCHAO_BUILD_KLEIDIAI=1 TORCHAO_ENABLE_ARM_NEON_DOT=1 TORCHAO_PARALLEL_BACKEND=OPENMP pip install .
4250
- name: Run python tests
4351
run: |
4452
conda activate venv
4553
pytest torchao/experimental/tests/test_int8_dynamic_activation_intx_weight.py
4654
python torchao/experimental/tests/test_embedding_xbit_quantizer.py
4755
python torchao/experimental/tests/test_quant_passes.py
4856
- name: Run kernels/cpu/aarch64/tests
57+
if: runner.os == 'macOS'
4958
run: |
5059
conda activate venv
5160
pushd torchao/experimental/kernels/cpu/aarch64/tests
5261
sh build_and_run_tests.sh
5362
rm -rf /tmp/cmake-out
5463
popd
5564
- name: Run torchao/experimental/ops/tests
65+
if: runner.os == 'macOS'
5666
run: |
5767
conda activate venv
5868
pushd torchao/experimental/ops/tests
5969
sh build_and_run_tests.sh
6070
rm -rf /tmp/cmake-out
6171
popd
6272
- name: ET ops build
73+
if: runner.os == 'macOS'
6374
run: |
6475
conda activate venv
6576
pushd torchao/experimental
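The new Linux ARM job opts into the experimental build explicitly because the automatic macOS detection in setup.py does not fire there. A minimal sketch of that gating logic, using the names introduced in the setup.py diff below (the real build does more work around this check):

import os
import platform


def is_arm64() -> bool:
    # Linux ARM runners report "aarch64"; Apple silicon reports "arm64".
    return platform.machine().startswith("arm64") or platform.machine() == "aarch64"


def is_macos() -> bool:
    return platform.system() == "Darwin"


# macOS arm64 builds the experimental kernels automatically when USE_CPP=1;
# other platforms (such as linux.arm64.2xlarge) must opt in explicitly.
build_macos_arm_auto = os.getenv("USE_CPP") == "1" and is_arm64() and is_macos()
build_experimental = build_macos_arm_auto or os.getenv("BUILD_TORCHAO_EXPERIMENTAL") == "1"
print(f"build experimental kernels: {build_experimental}")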

setup.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def read_version(file_path="version.txt"):
4949

5050
import platform
5151

52-
build_torchao_experimental = (
52+
build_macos_arm_auto = (
5353
use_cpp == "1"
5454
and platform.machine().startswith("arm64")
5555
and platform.system() == "Darwin"
@@ -117,8 +117,33 @@ def __init__(self):
117117
"TORCHAO_BUILD_EXPERIMENTAL_MPS requires MPS be available"
118118
)
119119

120+
# TORCHAO_PARALLEL_BACKEND specifies which parallel backend to use
121+
# Possible values: aten_openmp, executorch, openmp, pthreadpool, single_threaded
122+
self.parallel_backend = os.getenv("TORCHAO_PARALLEL_BACKEND", "aten_openmp")
123+
124+
# TORCHAO_ENABLE_ARM_NEON_DOT enable ARM NEON Dot Product extension
125+
# Enabled by default on macOS silicon
126+
self.enable_arm_neon_dot = self._os_bool_var(
127+
"TORCHAO_ENABLE_ARM_NEON_DOT",
128+
default=(self._is_arm64() and self._is_macos()),
129+
)
130+
if self.enable_arm_neon_dot:
131+
assert self.build_cpu_aarch64, (
132+
"TORCHAO_ENABLE_ARM_NEON_DOT requires TORCHAO_BUILD_CPU_AARCH64 be set"
133+
)
134+
135+
# TORCHAO_ENABLE_ARM_I8MM enable ARM 8-bit Integer Matrix Multiply instructions
136+
# Not enabled by default on macOS as not all silicon mac supports it
137+
self.enable_arm_i8mm = self._os_bool_var(
138+
"TORCHAO_ENABLE_ARM_I8MM", default=False
139+
)
140+
if self.enable_arm_i8mm:
141+
assert self.build_cpu_aarch64, (
142+
"TORCHAO_ENABLE_ARM_I8MM requires TORCHAO_BUILD_CPU_AARCH64 be set"
143+
)
144+
120145
def _is_arm64(self) -> bool:
121-
return platform.machine().startswith("arm64")
146+
return platform.machine().startswith("arm64") or platform.machine() == "aarch64"
122147

123148
def _is_macos(self) -> bool:
124149
return platform.system() == "Darwin"
@@ -468,7 +493,8 @@ def get_extensions():
468493
)
469494
)
470495

471-
if build_torchao_experimental:
496+
# Build CMakeLists from /torchao/experimental - additional options become available : TORCHAO_BUILD_CPU_AARCH64, TORCHAO_BUILD_KLEIDIAI, TORCHAO_BUILD_MPS_OPS, TORCHAO_PARALLEL_BACKEND
497+
if build_macos_arm_auto or os.getenv("BUILD_TORCHAO_EXPERIMENTAL") == "1":
472498
build_options = BuildOptions()
473499

474500
def bool_to_on_off(value):
@@ -488,6 +514,9 @@ def bool_to_on_off(value):
488514
f"-DTORCHAO_BUILD_CPU_AARCH64={bool_to_on_off(build_options.build_cpu_aarch64)}",
489515
f"-DTORCHAO_BUILD_KLEIDIAI={bool_to_on_off(build_options.build_kleidi_ai)}",
490516
f"-DTORCHAO_BUILD_MPS_OPS={bool_to_on_off(build_options.build_experimental_mps)}",
517+
f"-DTORCHAO_ENABLE_ARM_NEON_DOT={bool_to_on_off(build_options.enable_arm_neon_dot)}",
518+
f"-DTORCHAO_ENABLE_ARM_I8MM={bool_to_on_off(build_options.enable_arm_i8mm)}",
519+
f"-DTORCHAO_PARALLEL_BACKEND={build_options.parallel_backend}",
491520
"-DTorch_DIR=" + torch_dir,
492521
]
493522
+ (
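For context, a condensed sketch of how the new environment variables flow into CMake definitions (simplified from the BuildOptions and get_extensions code in this diff; the real BuildOptions parses booleans via _os_bool_var and validates more combinations):

import os


def bool_to_on_off(value: bool) -> str:
    return "ON" if value else "OFF"


# Simplified stand-ins for the BuildOptions fields added in this commit.
enable_arm_neon_dot = os.getenv("TORCHAO_ENABLE_ARM_NEON_DOT", "0") == "1"
enable_arm_i8mm = os.getenv("TORCHAO_ENABLE_ARM_I8MM", "0") == "1"
parallel_backend = os.getenv("TORCHAO_PARALLEL_BACKEND", "aten_openmp")

cmake_args = [
    f"-DTORCHAO_ENABLE_ARM_NEON_DOT={bool_to_on_off(enable_arm_neon_dot)}",
    f"-DTORCHAO_ENABLE_ARM_I8MM={bool_to_on_off(enable_arm_i8mm)}",
    f"-DTORCHAO_PARALLEL_BACKEND={parallel_backend}",
]
print(cmake_args)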

test/prototype/mx_formats/test_mx_linear.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from torchao.prototype.mx_formats.mx_subclass import MXFPInferenceConfig
2929
from torchao.quantization import quantize_
3030
from torchao.quantization.utils import compute_error
31+
from torchao.testing.utils import skip_if_rocm
3132
from torchao.utils import (
3233
TORCH_VERSION_AT_LEAST_2_8,
3334
is_sm_at_least_89,
@@ -396,18 +397,25 @@ def test_inference_print_str():
396397
@pytest.mark.skipif(
397398
not TORCH_VERSION_AT_LEAST_2_8, reason="torch.compile requires PyTorch 2.8+"
398399
)
399-
@pytest.mark.skipif(not is_sm_at_least_100, reason="Reqs sm100")
400400
@pytest.mark.parametrize("elem_dtype", [torch.float8_e4m3fn, torch.float4_e2m1fn_x2])
401401
@pytest.mark.parametrize("bias", [True, False])
402402
@pytest.mark.parametrize("compile", [True, False])
403403
@torch.no_grad()
404+
@skip_if_rocm(
405+
"ROCm float4 gemm require gfx950"
406+
) # TODO(future): deploy gfx950 in ROCM CI
404407
def test_inference_subclass(elem_dtype, bias: bool, compile: bool):
405408
"""
406409
Smoke test for inference compile
407410
"""
411+
# TODO(future): figure out why these CUDA capability conditions are not properly
412+
# applied when inside `pytest.mark.skipif` for this test
408413
if elem_dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
409414
if not is_sm_at_least_89():
410415
pytest.skip("CUDA capability >= 8.9 required for float8 in triton")
416+
elif elem_dtype == torch.float4_e2m1fn_x2:
417+
if not is_sm_at_least_100():
418+
pytest.skip("CUDA capability >= 10.0 required for float4 gemm")
411419

412420
m = nn.Linear(32, 128, bias=bias, dtype=torch.bfloat16, device="cuda")
413421
m_mx = copy.deepcopy(m)
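The test now relies on skip_if_rocm from torchao.testing.utils. As a rough illustration only (an assumed stand-in, not the actual torchao helper), such a decorator can detect a ROCm build via torch.version.hip:

import functools

import pytest
import torch


def skip_if_rocm_sketch(reason: str):
    # Hypothetical stand-in for torchao.testing.utils.skip_if_rocm.
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            # torch.version.hip is non-None on ROCm builds of PyTorch.
            if torch.version.hip is not None:
                pytest.skip(reason)
            return fn(*args, **kwargs)

        return wrapper

    return decorator


@skip_if_rocm_sketch("ROCm float4 gemm requires gfx950")
def test_placeholder():
    assert True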

test/prototype/mx_formats/test_mx_tensor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def assert_sqnr_gt_threshold(orig, new, threshold):
6666
if elem_dtype is torch.float8_e4m3fn:
6767
assert_sqnr_gt_threshold(data_hp, data_mx_dq, 18.0)
6868
else:
69-
assert_sqnr_gt_threshold(data_hp, data_mx_dq, 14.0)
69+
assert_sqnr_gt_threshold(data_hp, data_mx_dq, 13.0)
7070

7171

7272
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
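The relaxed threshold is a signal-to-quantization-noise ratio (SQNR) bound in dB. A small self-contained sketch of such a check, assuming the usual definition 20 * log10(||x|| / ||x - x_hat||) (the test itself uses compute_error and assert_sqnr_gt_threshold from this file):

import torch


def sqnr_db(orig: torch.Tensor, new: torch.Tensor) -> torch.Tensor:
    # SQNR in dB: 20 * log10(||orig|| / ||orig - new||).
    return 20 * torch.log10(torch.linalg.norm(orig) / torch.linalg.norm(orig - new))


data_hp = torch.randn(1024)
data_mx_dq = data_hp + 0.05 * torch.randn(1024)  # stand-in for quantize/dequantize error
assert sqnr_db(data_hp, data_mx_dq) > 13.0  # threshold used above for non-fp8 element dtypes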

test/quantization/pt2e/test_quantize_pt2e.py

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2385,6 +2385,192 @@ def validate(self, model: torch.fx.GraphModule) -> None:
23852385
node_list,
23862386
)
23872387

2388+
def test_conv3d_bn_relu(self):
2389+
class BackendAQuantizer(Quantizer):
2390+
def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
2391+
act_qspec = QuantizationSpec(
2392+
dtype=torch.uint8,
2393+
quant_min=0,
2394+
quant_max=255,
2395+
qscheme=torch.per_tensor_affine,
2396+
is_dynamic=False,
2397+
observer_or_fake_quant_ctr=observer.default_observer,
2398+
)
2399+
weight_qspec = QuantizationSpec(
2400+
dtype=torch.int8,
2401+
quant_min=-128,
2402+
quant_max=127,
2403+
qscheme=torch.per_tensor_affine,
2404+
is_dynamic=False,
2405+
observer_or_fake_quant_ctr=observer.default_weight_observer,
2406+
)
2407+
bias_qspec = QuantizationSpec(
2408+
dtype=torch.float32,
2409+
is_dynamic=False,
2410+
observer_or_fake_quant_ctr=observer.PlaceholderObserver,
2411+
)
2412+
# conv_transpose + bn is fused automatically in PTQ (not configurable)
2413+
# so we just need to annotate conv + relu for conv + bn + relu
2414+
# pattern
2415+
for n in model.graph.nodes:
2416+
if (
2417+
n.op != "call_function"
2418+
or n.target != torch.ops.aten.relu.default
2419+
):
2420+
continue
2421+
relu_node = n
2422+
n = n.args[0]
2423+
if (
2424+
n.op != "call_function"
2425+
and n.target != torch.ops.aten.conv3d.input
2426+
):
2427+
continue
2428+
conv_t_node = n
2429+
input_act = conv_t_node.args[0]
2430+
weight = conv_t_node.args[1]
2431+
bias = conv_t_node.args[2]
2432+
conv_t_node.meta["quantization_annotation"] = (
2433+
QuantizationAnnotation(
2434+
input_qspec_map={
2435+
input_act: act_qspec,
2436+
weight: weight_qspec,
2437+
bias: bias_qspec,
2438+
},
2439+
_annotated=True,
2440+
)
2441+
)
2442+
relu_node.meta["quantization_annotation"] = QuantizationAnnotation(
2443+
output_qspec=act_qspec,
2444+
_annotated=True,
2445+
)
2446+
2447+
def validate(self, model: torch.fx.GraphModule) -> None:
2448+
pass
2449+
2450+
class M(torch.nn.Module):
2451+
def __init__(self):
2452+
super().__init__()
2453+
self.conv = torch.nn.Conv3d(2, 2, 3, padding=1)
2454+
self.bn = torch.nn.BatchNorm3d(2)
2455+
2456+
def forward(self, x):
2457+
return torch.nn.functional.relu(self.bn(self.conv(x)))
2458+
2459+
example_inputs = (torch.randn(1, 2, 2, 5, 5),)
2460+
node_occurrence = {
2461+
# two for input of the first conv, one for output for the first conv
2462+
torch.ops.quantized_decomposed.quantize_per_tensor.default: 2,
2463+
torch.ops.quantized_decomposed.dequantize_per_tensor.default: 3,
2464+
}
2465+
node_list = [
2466+
torch.ops.quantized_decomposed.dequantize_per_tensor.default,
2467+
torch.ops.quantized_decomposed.dequantize_per_tensor.default,
2468+
torch.ops.aten.conv3d.default,
2469+
torch.ops.aten.relu.default,
2470+
torch.ops.quantized_decomposed.quantize_per_tensor.default,
2471+
]
2472+
model = M().eval()
2473+
self._test_quantizer(
2474+
model,
2475+
example_inputs,
2476+
BackendAQuantizer(),
2477+
node_occurrence,
2478+
node_list,
2479+
)
2480+
2481+
def test_conv_transpose3d_bn_relu(self):
2482+
class BackendAQuantizer(Quantizer):
2483+
def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
2484+
act_qspec = QuantizationSpec(
2485+
dtype=torch.uint8,
2486+
quant_min=0,
2487+
quant_max=255,
2488+
qscheme=torch.per_tensor_affine,
2489+
is_dynamic=False,
2490+
observer_or_fake_quant_ctr=observer.default_observer,
2491+
)
2492+
weight_qspec = QuantizationSpec(
2493+
dtype=torch.int8,
2494+
quant_min=-128,
2495+
quant_max=127,
2496+
qscheme=torch.per_tensor_affine,
2497+
is_dynamic=False,
2498+
observer_or_fake_quant_ctr=observer.default_weight_observer,
2499+
)
2500+
bias_qspec = QuantizationSpec(
2501+
dtype=torch.float32,
2502+
is_dynamic=False,
2503+
observer_or_fake_quant_ctr=observer.PlaceholderObserver,
2504+
)
2505+
# conv_transpose + bn is fused automatically in PTQ (not configurable)
2506+
# so we just need to annotate conv_transpose + relu for conv_transpose + bn + relu
2507+
# pattern
2508+
for n in model.graph.nodes:
2509+
if (
2510+
n.op != "call_function"
2511+
or n.target != torch.ops.aten.relu.default
2512+
):
2513+
continue
2514+
relu_node = n
2515+
n = n.args[0]
2516+
if (
2517+
n.op != "call_function"
2518+
and n.target != torch.ops.aten.conv_transposed3d.input
2519+
):
2520+
continue
2521+
conv_t_node = n
2522+
input_act = conv_t_node.args[0]
2523+
weight = conv_t_node.args[1]
2524+
bias = conv_t_node.args[2]
2525+
conv_t_node.meta["quantization_annotation"] = (
2526+
QuantizationAnnotation(
2527+
input_qspec_map={
2528+
input_act: act_qspec,
2529+
weight: weight_qspec,
2530+
bias: bias_qspec,
2531+
},
2532+
_annotated=True,
2533+
)
2534+
)
2535+
relu_node.meta["quantization_annotation"] = QuantizationAnnotation(
2536+
output_qspec=act_qspec,
2537+
_annotated=True,
2538+
)
2539+
2540+
def validate(self, model: torch.fx.GraphModule) -> None:
2541+
pass
2542+
2543+
class M(torch.nn.Module):
2544+
def __init__(self):
2545+
super().__init__()
2546+
self.conv_t = torch.nn.ConvTranspose3d(2, 2, 3, padding=1)
2547+
self.bn = torch.nn.BatchNorm3d(2)
2548+
2549+
def forward(self, x):
2550+
return torch.nn.functional.relu(self.bn(self.conv_t(x)))
2551+
2552+
example_inputs = (torch.randn(1, 2, 2, 5, 5),)
2553+
node_occurrence = {
2554+
# two for input of the first conv, one for output for the first conv
2555+
torch.ops.quantized_decomposed.quantize_per_tensor.default: 2,
2556+
torch.ops.quantized_decomposed.dequantize_per_tensor.default: 3,
2557+
}
2558+
node_list = [
2559+
torch.ops.quantized_decomposed.dequantize_per_tensor.default,
2560+
torch.ops.quantized_decomposed.dequantize_per_tensor.default,
2561+
torch.ops.aten.conv_transpose3d.input,
2562+
torch.ops.aten.relu.default,
2563+
torch.ops.quantized_decomposed.quantize_per_tensor.default,
2564+
]
2565+
model = M().eval()
2566+
self._test_quantizer(
2567+
model,
2568+
example_inputs,
2569+
BackendAQuantizer(),
2570+
node_occurrence,
2571+
node_list,
2572+
)
2573+
23882574
def test_multi_users_without_output_observer(self):
23892575
"""
23902576
Test the case in which a node is used by multiple users,

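For reference, a sketch of how a quantizer like the BackendAQuantizer above is exercised end to end in the PT2E flow (assumptions: M and BackendAQuantizer as defined in the new test, plus the prepare/convert helpers from torch.ao.quantization.quantize_pt2e; the export entry point has changed across PyTorch versions, export_for_training being one option):

import torch
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

model = M().eval()  # M and BackendAQuantizer as in the diff above
example_inputs = (torch.randn(1, 2, 2, 5, 5),)

# Export to an FX graph that the quantizer's annotate() can walk.
exported = torch.export.export_for_training(model, example_inputs).module()

prepared = prepare_pt2e(exported, BackendAQuantizer())  # insert observers per annotations
prepared(*example_inputs)                               # calibration pass
quantized = convert_pt2e(prepared)                      # rewrite to quantize/dequantize ops
quantized(*example_inputs)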