    quantize_per_channel_group,
)

-from torchao.dtypes import PlainLayout
from torchao.quantization.granularity import (
    PerGroup,
    PerRow,
@@ -516,7 +515,7 @@ def quantize(self, model: nn.Module) -> nn.Module:
@dataclass
class Int8DynamicActivationIntxWeightConfig(AOBaseConfig):
    """
-    Configuration for dynamically quantizes activations with 8-bits and weights with a low-bit value for linear layers.
+    Configuration for dynamically quantizing activations with 8-bits and quantizing weights with a low-bit value.
    More specifically, activations are dynamically quantized to 8-bits in a channelwise manner with scales and zeros.
    Weights are quantized with scales and optionally zeros (controlled by has_weight_zeros) in a groupwise or channelwise
    manner using the number of bits specified by weight_dtype.
@@ -527,20 +526,17 @@ class Int8DynamicActivationIntxWeightConfig(AOBaseConfig):
        has_weight_zeros: Whether or not to include zeros in the weight quantization.
        weight_mapping_type: The type of mapping to use for the weight quantization. Must be one of MappingType.ASYMMETRIC or MappingType.SYMMETRIC.
        act_mapping_type: The type of mapping to use for the activation quantization. Must be one of MappingType.ASYMMETRIC or MappingType.SYMMETRIC.
-        layout: The layout to use for the packed weight tensor. Must be PackedLinearInt8DynamicActivationIntxWeightLayout (default) or PlainLayout.
-            The layout does not affect the quantization numerically and both layouts will give the same results. PlainLayout is a generic layout
-            that works on all devices, but it is much slower than PackedLinearInt8DynamicActivationIntxWeightLayout on CPU.
-            PackedLinearInt8DynamicActivationIntxWeightLayout is a specialized layout for CPU performance.
-            When using PackedLinearInt8DynamicActivationIntxWeightLayout,
-            - The weight tensor must have device=CPU
-            - The weight tensor must have dtype=float32 (note that after applying quantization, the weights will no longer be float32)
-            - act_mapping_type must be MappingType.ASYMMETRIC
+        layout: The layout to use for the packed weight tensor. The layout does not affect the quantization numerically, and different
+            layouts will give similar results. The following layouts are available:
+            - PackedLinearInt8DynamicActivationIntxWeightLayout: This layout is optimized for CPU performance.
+            - QDQLayout: This layout is designed for export to ExecuTorch.
+            - PlainLayout: This layout is a simple Python-based layout. It has low performance, but can be used
+              when PackedLinearInt8DynamicActivationIntxWeightLayout is unavailable.
    """

    weight_dtype: torch.dtype = torch.int4
    granularity: Union[PerRow, PerGroup] = PerRow()
    has_weight_zeros: bool = False
-    has_bias: bool = False
    weight_mapping_type: MappingType = MappingType.ASYMMETRIC
    act_mapping_type: MappingType = MappingType.ASYMMETRIC
    layout: Layout = PackedLinearInt8DynamicActivationIntxWeightLayout(target="native")
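
To make the configuration concrete, here is a minimal usage sketch. The import paths and the `quantize_` entry point are assumptions about torchao's public API rather than something stated in this diff; adjust them to match the installed version.

# Minimal usage sketch of the config above. Import locations and the quantize_
# entry point are assumptions, not taken verbatim from this diff.
import torch
import torch.nn as nn

from torchao.quantization import quantize_  # assumed entry point
from torchao.quantization.granularity import PerGroup

# Location of the config is an assumption; it may live elsewhere in your build.
from torchao.experimental.quant_api import Int8DynamicActivationIntxWeightConfig

# A toy float32 CPU model, as required by the default packed layout.
model = nn.Sequential(nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, 64)).eval()

# Dynamic 8-bit activations + 4-bit groupwise weights with the default
# PackedLinearInt8DynamicActivationIntxWeightLayout(target="native").
config = Int8DynamicActivationIntxWeightConfig(
    weight_dtype=torch.int4,
    granularity=PerGroup(32),
    has_weight_zeros=False,
)
quantize_(model, config)

# Activations are quantized on the fly, so inference is unchanged for the caller.
with torch.no_grad():
    out = model(torch.randn(1, 256))
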
@@ -559,27 +555,10 @@ def _int8_dynamic_activation_intx_weigh_transform(
    weight_dtype = config.weight_dtype
    granularity = config.granularity
    has_weight_zeros = config.has_weight_zeros
-    has_bias = config.has_bias
    weight_mapping_type = config.weight_mapping_type
    act_mapping_type = config.act_mapping_type
    layout = config.layout

-    def is_torchao_op_skippable(layout):
-        return isinstance(layout, PlainLayout) or (
-            isinstance(layout, PackedLinearInt8DynamicActivationIntxWeightLayout)
-            and layout.target == Target.ATEN
-        )
-
-    if not is_torchao_op_skippable(layout):
-        try:
-            torch.ops.torchao._pack_8bit_act_4bit_weight
-        except AttributeError:
-            raise Exception(
-                "TorchAO experimental kernels are not loaded. To install the kernels, run `USE_CPP=1 pip install .` from ao on a machine with an ARM CPU."
-                + " You can also set target to 'aten' if you are using ARM CPU."
-                + " Alternatively, use layout=PlainLayout() with int8_dynamic_activation_intx_weight, but note that doing so will result in much slower performance."
-            )
-
    dtype_to_bit_width = {
        torch.int1: 1,
        torch.int2: 2,
@@ -603,7 +582,18 @@ def is_torchao_op_skippable(layout):
    else:
        raise ValueError(f"granularity must be PerGroup or PerRow, got {granularity}")

+    tensor_impl_ctr_kwargs = None
    if isinstance(layout, PackedLinearInt8DynamicActivationIntxWeightLayout):
+        # We need to create a new layout object for each module because when
+        # granularity is PerRow, the layout objects cannot share the group_size
+        layout = PackedLinearInt8DynamicActivationIntxWeightLayout(layout.target)
+        layout.set_params(
+            bit_width=bit_width,
+            group_size=group_size,
+            has_weight_zeros=has_weight_zeros,
+            has_bias=False,
+        )
+
        assert (
            weight.device == torch.device("cpu")
        ), "PackedLinearInt8DynamicActivationIntxWeightLayout requires weight.device=CPU"
@@ -613,20 +603,24 @@ def is_torchao_op_skippable(layout):
        assert (
            act_mapping_type == MappingType.ASYMMETRIC
        ), "PackedLinearInt8DynamicActivationIntxWeightLayout requires act_mapping_type=MappingType.ASYMMETRIC"
-        assert not layout.has_params_set(), "PackedLinearInt8DynamicActivationIntxWeightLayout params should not already be set"
-        layout = PackedLinearInt8DynamicActivationIntxWeightLayout(
-            bit_width=bit_width,
-            group_size=group_size,
-            has_weight_zeros=has_weight_zeros,
-            has_bias=has_bias,
-            target="aten" if layout.target == Target.ATEN else "native",
-        )

-        # ATEN KleidiAI kernel
-        # TODO: long term, we want to disfavor this kernel and instead use KleidiAI kernels in torchao
-        # that are vailable via PackedLinearInt8DynamicActivationIntxWeightLayout(target="native")
-        # where applicable
-        if layout.target == Target.ATEN:
+        tensor_impl_ctr_kwargs = {"bias": bias}
+
+        if layout.target == Target.NATIVE:
+            # Check kernels are installed/loaded
+            try:
+                torch.ops.torchao._pack_8bit_act_4bit_weight
+            except AttributeError:
+                raise Exception(
+                    "TorchAO experimental kernels are not loaded. To install the kernels, run `USE_CPP=1 pip install .` from ao on a machine with an ARM CPU."
+                    + " You can also set target to 'aten' if you are using ARM CPU."
+                )
+        elif layout.target == Target.ATEN:
+            # TODO: long term, we want to disfavor this route for using KleidiAI in torchao.
+            # KleidiAI kernels are accessible via Target.NATIVE if torchao is built
+            # with TORCHAO_BUILD_KLEIDIAI=1. The Target.NATIVE route has the advantage
+            # of automatically dispatching to different kernel libraries based on the CPU
+            # capability and the desired quantization.
            assert (
                TORCH_VERSION_AT_LEAST_2_6
            ), "ATEN target requires torch version > 2.6.0"
@@ -657,7 +651,7 @@ def is_torchao_op_skippable(layout):
        else ZeroPointDomain.NONE,
        _layout=layout,
        use_hqq=False,
-        tensor_impl_ctr_kwargs={"bias": bias} if has_bias else None,
+        tensor_impl_ctr_kwargs=tensor_impl_ctr_kwargs,
    )

    # Note that PackedLinearInt8DynamicActivationIntxWeightLayout has dynamic activation quantization fused
@@ -678,7 +672,10 @@ def is_torchao_op_skippable(layout):
    module.weight = torch.nn.Parameter(weight, requires_grad=False)

    # If bias was packed with weights, set bias to None on module
-    if has_bias:
+    if (
+        isinstance(layout, PackedLinearInt8DynamicActivationIntxWeightLayout)
+        and layout.has_bias
+    ):
        module.bias = None

    return module
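
As a companion to the kernel guard added in the transform above, the following standalone probe is one way to check up front whether the experimental CPU kernels are loaded before choosing a target. The op name torch.ops.torchao._pack_8bit_act_4bit_weight comes from this diff; the helper function and its messages are illustrative only.

# Standalone availability probe mirroring the guard in the transform above.
import torch
import torchao  # importing torchao is assumed to register any compiled kernels


def native_kernels_available() -> bool:
    """Return True if the experimental CPU kernels used by
    PackedLinearInt8DynamicActivationIntxWeightLayout(target="native") are loaded."""
    try:
        torch.ops.torchao._pack_8bit_act_4bit_weight
        return True
    except AttributeError:
        return False


if __name__ == "__main__":
    if native_kernels_available():
        print("torchao experimental kernels are loaded; target='native' should work")
    else:
        print("kernels missing; build with `USE_CPP=1 pip install .` or use target='aten' on an ARM CPU")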