1
+ from typing import Callable
1
2
import torch
2
3
import torchao
3
4
from torchao .quantization .quant_primitives import (
@@ -500,7 +501,7 @@ class AQFloat8PerRowScalingDynamicallyQuantizedLinearWeight(AQMixin, LinearActiv
500
501
"""
501
502
AutoQuantizable version of Float8DynamicallyQuantizedLinearWeight using per row scaling
502
503
"""
503
- activation_granularity : str = PerRow ()
504
+ activation_granularity = PerRow ()
504
505
@classmethod
505
506
def from_float (cls , weight ):
506
507
@@ -537,6 +538,42 @@ def get_per_token_block_size(x):
537
538
weight = super (AQFloat8PerRowScalingDynamicallyQuantizedLinearWeight , cls ).from_float (weight , input_quant_func )
538
539
return weight
539
540
541
class AQFloat8PerTensorScalingDynamicallyQuantizedLinearWeight(AQMixin, LinearActivationQuantizedTensor):
    """
    AutoQuantizable version of Float8DynamicallyQuantizedLinearWeight using per tensor scaling.

    Both the weight and the (dynamically quantized) activation use a single
    float8_e4m3fn scale for the whole tensor, in contrast to the per-row
    sibling class AQFloat8PerRowScalingDynamicallyQuantizedLinearWeight.
    """

    # Per-tensor granularity: one scale for the entire activation tensor.
    activation_granularity = PerTensor()

    @classmethod
    def from_float(cls, weight):
        """Quantize a 2D float weight to float8 with per-tensor scaling.

        Args:
            weight: a 2D floating-point weight tensor.

        Returns:
            An instance of this class wrapping the float8-quantized weight
            together with the dynamic float8 activation quantization function.
        """
        # Imported here to avoid a circular dependency at module import time.
        from torchao.dtypes import to_affine_quantized_floatx
        from torchao.quantization.quant_api import _input_activation_quant_func_fp8

        # Weight settings: one scale block covering the whole 2D tensor.
        def get_weight_block_size(x):
            assert x.ndim == 2, "Only works for 2D tensors"
            return x.shape

        target_dtype = torch.float8_e4m3fn
        input_target_dtype = torch.float8_e4m3fn
        _layout = Float8Layout(mm_config=Float8MMConfig(use_fast_accum=True))

        # def instead of lambda-assigned-to-name (PEP 8); binds cls's
        # per-tensor granularity into the activation quantizer.
        def input_quant_func(x):
            return _input_activation_quant_func_fp8(
                x=x,
                activation_granularity=cls.activation_granularity,
                activation_dtype=input_target_dtype,
            )

        block_size = get_weight_block_size(weight)
        weight = to_affine_quantized_floatx(
            input_float=weight,
            block_size=block_size,
            target_dtype=target_dtype,
            _layout=_layout,
            scale_dtype=torch.float32,
        )
        # NOTE(review): the per-row variant imports _is_rowwise_scaled here to
        # assert rowwise scaling; that import was copied over but never used
        # for per-tensor scaling, so it has been removed.
        weight = super().from_float(weight, input_quant_func)
        return weight
540
577
541
578
# here we don't include int4 quantization in since int8 tends to be a better apples to apples comparison
542
579
DEFAULT_AUTOQUANT_CLASS_LIST = [
@@ -557,6 +594,7 @@ def get_per_token_block_size(x):
557
594
# Float8 candidate weight subclasses tried by autoquant when the "other"
# class list is selected; order is preserved as the candidate search order.
OTHER_AUTOQUANT_CLASS_LIST = [
    AQFloat8WeightOnlyQuantizedLinearWeight,
    AQFloat8PerRowScalingDynamicallyQuantizedLinearWeight,
    AQFloat8PerTensorScalingDynamicallyQuantizedLinearWeight,
]
561
599
562
600
0 commit comments