
Commit 4d7c98f (parent: a776b1f)

Fix Per Row scaling for inference

stack-info: PR: #2253, branch: drisspg/stack/56
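
For context: per-row (rowwise) float8 scaling keeps one scale per output row of the weight instead of a single tensorwise scale, which is what this commit fixes for the inference path. A minimal sketch of the idea, purely illustrative and not the torchao implementation:

import torch

# Illustrative per-row float8 scaling for a weight of shape (out_features, in_features).
weight = torch.randn(1024, 512, dtype=torch.bfloat16)
f8_max = torch.finfo(torch.float8_e4m3fn).max  # 448.0

# One scale per row, shaped (out_features, 1) so it broadcasts over in_features.
scale = weight.abs().amax(dim=-1, keepdim=True).float() / f8_max
weight_f8 = (weight.float() / scale).clamp(-f8_max, f8_max).to(torch.float8_e4m3fn)

# Dequantization broadcasts the per-row scale back over each row.
weight_dq = weight_f8.to(torch.float32) * scale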

5 files changed: +118 −39 lines


test/dtypes/test_affine_quantized_float.py

Lines changed: 45 additions & 13 deletions
@@ -297,21 +297,53 @@ def test_fp8_weight_dimension_warning(self):
     @unittest.skipIf(
         not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9"
     )
-    def test_mm_float8dq(self):
+    @common_utils.parametrize(
+        "in_features,out_features", [(512, 1024), (256, 768), (1024, 512)]
+    )
+    @common_utils.parametrize(
+        "leading_shape", [(1,), (8,), (16,), (2, 8,), (2, 2, 16,)]
+    )  # fmt: skip
+    @common_utils.parametrize("bias", [True, False])
+    def test_mm_float8dq(self, in_features, out_features, leading_shape, bias: bool):
         device = "cuda"
         dtype = torch.bfloat16
-        weight = torch.randn(512, 1024).to(device).to(dtype)
-        weight = weight.t()
-
-        l = torch.nn.Linear(512, 1024).to(device).to(dtype)
-        l.weight = torch.nn.Parameter(weight)
-        quantize_(l, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))
-        # weight shape: 1024 x 512
-        weight = l.weight
-
-        input = torch.randn(1, 512, device=device, dtype=dtype)
-        # make sure it runs
-        torch.nn.functional.linear(input, weight)
+        input_shape = leading_shape + (in_features,)
+
+        ref_linear = (
+            torch.nn.Linear(in_features, out_features, bias=bias).to(device).to(dtype)
+        )
+        test_linear = copy.deepcopy(ref_linear)
+        quantize_(
+            test_linear, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
+        )
+
+        quant_weight = test_linear.weight
+
+        self.assertTrue(hasattr(quant_weight, "original_weight_tensor"))
+        weight_impl = quant_weight.original_weight_tensor.tensor_impl
+
+        self.assertTrue(hasattr(weight_impl, "float8_data"))
+        self.assertTrue(hasattr(weight_impl, "scale"))
+        self.assertFalse(weight_impl.transposed)
+
+        # Verify scale shape for row-wise quantization
+        expected_scale_shape = (out_features, 1)
+        actual_scale_shape = weight_impl.scale.shape
+        self.assertEqual(actual_scale_shape, expected_scale_shape)
+
+        self.assertEqual(weight_impl.float8_data.shape, (out_features, in_features))
+
+        input_tensor = torch.randn(*input_shape, device=device, dtype=dtype)
+
+        with torch.no_grad():
+            ref_output = ref_linear(input_tensor)
+            quant_output = torch.nn.functional.linear(input_tensor, quant_weight)
+
+        expected_output_shape = input_tensor.shape[:-1] + (out_features,)
+        self.assertEqual(quant_output.shape, expected_output_shape)
+
+        error = compute_error(ref_output, quant_output)
+        assert error > 20, f"Quantization error is too high got a SQNR of {error}"


 common_utils.instantiate_parametrized_tests(TestAffineQuantizedFloat8Compile)
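
The test gates on compute_error returning an SQNR above 20 dB for the quantized linear against the bf16 reference. For reference, a standard SQNR computation looks roughly like the sketch below; torchao's compute_error may differ in details:

import torch

def sqnr_db(ref: torch.Tensor, test: torch.Tensor) -> float:
    # Signal-to-quantization-noise ratio in decibels.
    signal_power = ref.float().pow(2).mean()
    noise_power = (ref.float() - test.float()).pow(2).mean()
    return (10 * torch.log10(signal_power / noise_power)).item()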

torchao/dtypes/affine_quantized_tensor.py

Lines changed: 3 additions & 4 deletions
@@ -462,10 +462,10 @@ def from_hp_to_floatx(
         if target_dtype in FP8_TYPES:
             original_shape = input_float.shape
             input_float = _layout.pre_process(input_float)
-
-            scale = choose_qparams_affine_float8(input_float, float8_dtype=target_dtype)
+            scale = choose_qparams_affine_float8(
+                input_float, float8_dtype=target_dtype, block_size=block_size
+            )
             data = quantize_affine_float8(input_float, scale, target_dtype)
-
             data, scale, zero_point = _layout.post_process(
                 data, scale, None, block_size
             )
@@ -503,7 +503,6 @@ def from_hp_to_floatx_static(
                 input_float,
                 scale,
                 target_dtype,
-                scale_dtype,
             )

             data, scale, zero_point = _layout.post_process(
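
The from_hp_to_floatx change forwards block_size into choose_qparams_affine_float8, so the float8 scale now follows the layout's block size instead of always being tensorwise. For a 2-D weight with PerRow granularity the block covers a full row, giving one scale per output row; a hedged sketch of that mapping (the helper below is illustrative, not a torchao API):

import torch

def rowwise_block_size(shape: tuple) -> tuple:
    # Illustrative: per-row granularity reduces over the last dim only.
    return (1,) * (len(shape) - 1) + (shape[-1],)

weight = torch.randn(1024, 512)
block_size = rowwise_block_size(weight.shape)  # (1, 512)
# choose_qparams_affine_float8(weight, block_size=block_size) would then
# produce a scale of shape (1024, 1): one scale per output row.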

torchao/dtypes/floatx/float8_layout.py

Lines changed: 15 additions & 12 deletions
@@ -195,28 +195,32 @@ def __torch_dispatch__(cls, func, types, args, kwargs):
        elif func is aten.slice.Tensor:
            self, dim, start, end, step = fill_defaults(args, 5, [0, None, None, 1])
            if dim == 0:
-                # TODO: scale replecation should be dependent on block size
-                if self.scale.ndim == 1:
+                if self.scale.ndim == 0 or (
+                    self.scale.ndim == 1 and self.scale.size(0) == 1
+                ):
+                    # Per Tensor
                    return return_and_correct_aliasing(
                        func,
                        args,
                        kwargs,
-                        args[0]._apply_fn_to_data(
-                            lambda x: aten.slice.Tensor(x, dim, start, end, step)
+                        Float8AQTTensorImpl(
+                            aten.slice.Tensor(self.float8_data, dim, start, end, step),
+                            self.scale,
+                            False,
+                            self._layout,
                        ),
                    )
-                elif self.scale.ndim == 0:
+                elif self.scale.ndim == 2:
+                    # TODO: scale replecation should be dependent on block size
                    return return_and_correct_aliasing(
                        func,
                        args,
                        kwargs,
-                        Float8AQTTensorImpl(
-                            aten.slice.Tensor(self.float8_data, dim, start, end, step),
-                            self.scale,
-                            None,
-                            self._layout,
+                        args[0]._apply_fn_to_data(
+                            lambda x: aten.slice.Tensor(x, dim, start, end, step)
                        ),
                    )
+
                else:
                    raise NotImplementedError(
                        f"Float8AQTTensorImpl dispatch: attempting to run {func}, with scale ndim={dim}, that is not supported"
@@ -333,13 +337,12 @@ def _linear_fp8_act_fp8_weight_impl(
    input_scale = input_tensor.tensor_impl.scale
    # Handle case where input tensor is more than 2D
    inpt_data = inpt_data.reshape(-1, inpt_data.shape[-1])
-
    # Handle rowwise case
    if _is_rowwise_scaled(weight_tensor):
        assert _is_rowwise_scaled(input_tensor), (
            "Input tensor must be rowwise block size"
        )
-        w_scale = w_scale.unsqueeze(-1).T
+        w_scale = w_scale.T
        input_scale = preprocess_scale(input_scale, input_tensor.shape)

    # Preprocess data
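
In the rowwise matmul path the weight scale now arrives as a 2-D (out_features, 1) tensor (matching the choose_qparams output above), so a plain transpose is enough to get the (1, out_features) layout that broadcasts across the output columns; the old unsqueeze(-1) assumed a 1-D scale. A small shape sketch, illustrative only:

import torch

out_features, batch = 1024, 8
w_scale = torch.rand(out_features, 1)   # per-row weight scale, already 2-D
acc = torch.randn(batch, out_features)  # unscaled matmul output (illustrative)

# (out_features, 1).T -> (1, out_features): broadcasts over the batch dimension.
rescaled = acc * w_scale.T
print(rescaled.shape)  # torch.Size([8, 1024])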

torchao/quantization/quant_api.py

Lines changed: 12 additions & 4 deletions
@@ -1434,18 +1434,21 @@ def _float8_weight_only_transform(
 _fp8_granularities = Union[PerTensor, PerRow]


-# Validate and process granularity input
 def _normalize_granularity(
     granularity: Optional[
-        Union[_fp8_granularities, Tuple[_fp8_granularities, _fp8_granularities]]
+        Union[
+            _fp8_granularities,
+            Tuple[_fp8_granularities, _fp8_granularities],
+            list[_fp8_granularities],
+        ]
     ],
 ) -> Tuple[_fp8_granularities, _fp8_granularities]:
     processed_granularity = None
     if granularity is None:
         processed_granularity = (PerTensor(), PerTensor())
     elif isinstance(granularity, (PerTensor, PerRow)):
         processed_granularity = (granularity, granularity)
-    elif isinstance(granularity, tuple) and len(granularity) == 2:
+    elif isinstance(granularity, (tuple, list)) and len(granularity) == 2:
         if not (
             isinstance(granularity[0], (PerTensor, PerRow))
             and isinstance(granularity[1], (PerTensor, PerRow))
@@ -1457,7 +1460,7 @@ def _normalize_granularity(
             raise ValueError(
                 f"Different granularities for activation and weight are not supported: {granularity}, only PerTensor or PerRow are supported."
             )
-        processed_granularity = granularity
+        processed_granularity = tuple(granularity)
     else:
         raise ValueError(
             f"Invalid granularity specification: {granularity}, only PerTensor or PerRow are supported."
@@ -1576,6 +1579,11 @@ def __post_init__(self):
         if self.mm_config is None:
             self.mm_config = Float8MMConfig(use_fast_accum=True)

+        activation_granularity, weight_granularity = _normalize_granularity(
+            self.granularity
+        )
+        self.granularity = (activation_granularity, weight_granularity)
+

 # for bc
 float8_dynamic_activation_float8_weight = Float8DynamicActivationFloat8WeightConfig
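
With this change a two-element list of granularities is accepted and normalized to a tuple when the config is constructed. Usage sketch, assuming a CUDA device and the same public imports the tests use:

import torch
from torchao.quantization import (
    Float8DynamicActivationFloat8WeightConfig,
    PerRow,
    quantize_,
)

model = torch.nn.Linear(512, 1024).cuda().to(torch.bfloat16)
# A list is now accepted and normalized to (activation_granularity, weight_granularity).
config = Float8DynamicActivationFloat8WeightConfig(granularity=[PerRow(), PerRow()])
assert isinstance(config.granularity, tuple)
quantize_(model, config)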

torchao/quantization/quant_primitives.py

Lines changed: 43 additions & 6 deletions
@@ -1970,20 +1970,38 @@ def choose_qparams_affine_float8(
     tensor: torch.Tensor,
     float8_dtype: torch.dtype = torch.float8_e4m3fn,
     scale_dtype: torch.dtype = torch.float32,
+    block_size: Optional[Tuple[int, ...]] = None,
 ) -> torch.Tensor:
     """
     Calculates float8 scaling factor for the given high precision tensor, using tensorwise granularity.

     Args:
         tensor (torch.Tensor): Input tensor to be quantized.
         float8_dtype (torch.dtype): Data type of the quantized tensor (e.g., torch.float8_e4m3fn, torch.float8_e5m2).
+        scale_dtype (torch.dtype): Data type of the scaling factor (e.g., torch.float32).
+        block_size (Optional[Tuple[int, ...]]): Block size for block-wise quantization. If None, tensorwise quantization is used.
     """
+    quant_max = torch.finfo(float8_dtype).max
     # only tensorwise scaling is supported for now:
-    quant_min, quant_max = torch.finfo(float8_dtype).min, torch.finfo(float8_dtype).max
-    min_val_neg = torch.min(tensor)
-    max_val_pos = torch.max(tensor)
-    max_val_pos = torch.max(-min_val_neg, max_val_pos)
-    scale = max_val_pos / (float(quant_max - quant_min) / 2)
+    if block_size is None:
+        max_abs = tensor.abs().max()
+        scale = max_abs / quant_max
+    else:
+        shape_for_reduction, reduction_dims = _get_reduction_params(
+            block_size, tensor.shape
+        )
+        tensor_reshaped = tensor.view(shape_for_reduction)
+        max_abs = tensor_reshaped.abs().amax(dim=reduction_dims, keepdim=True)
+
+        scale = max_abs / quant_max
+        # Reshape scale back to match the expected output shape
+        # The scale tensor should have the same shape as the input divided by block_size
+        output_shape = [
+            input_size // block_size[i] if block_size[i] > 1 else input_size
+            for i, input_size in enumerate(tensor.shape)
+        ]
+        scale = scale.reshape(output_shape)
+
     return scale.to(dtype=scale_dtype)

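Concretely, torch.finfo(torch.float8_e4m3fn).max is 448, so in the block-wise branch each block's scale is its max absolute value divided by 448. A small worked example for the per-row case:

import torch

tensor = torch.tensor([[1.0, -4.0], [0.5, 2.0]])
quant_max = torch.finfo(torch.float8_e4m3fn).max       # 448.0
row_max_abs = tensor.abs().amax(dim=-1, keepdim=True)  # [[4.0], [2.0]]
scale = row_max_abs / quant_max                        # shape (2, 1): one scale per row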

@@ -2027,5 +2045,24 @@ def dequantize_affine_float8(
     # upcasted to `float32` to divide by the scale, since scale is a fp32 for float8 quantization.
     # In order to match numerics between eager and compile, we upcast manually here.
     fp8_tensor = tensor.to(torch.float32)
-    hp_tensor = fp8_tensor * scale
+    # For block-wise quantization, we need to broadcast the scale to match tensor dimensions
+    if scale.shape != tensor.shape:
+        # Calculate the block size from the shape difference
+        block_size = tuple(
+            tensor.shape[i] // scale.shape[i]
+            if scale.shape[i] != tensor.shape[i]
+            else 1
+            for i in range(len(tensor.shape))
+        )
+
+        scale_expanded = scale
+        for i in range(len(tensor.shape)):
+            if block_size[i] > 1:
+                # Repeat the scale values for each block
+                scale_expanded = scale_expanded.repeat_interleave(block_size[i], dim=i)
+    else:
+        # Tensor-wise quantization - scale already matches
+        scale_expanded = scale
+
+    hp_tensor = fp8_tensor * scale_expanded
     return hp_tensor.to(output_dtype)
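
The dequant path now expands a block-wise scale back to the tensor's shape before multiplying. For the per-row case that just repeats each (N, 1) scale entry across its row; a minimal sketch of the same broadcast:

import torch

f8_data = torch.randn(4, 8).to(torch.float8_e4m3fn)  # pretend-quantized data
scale = torch.rand(4, 1)                             # per-row scale, inferred block_size = (1, 8)

# Block size along dim 1 is 8 -> repeat each row's scale 8 times to match the data shape.
scale_expanded = scale.repeat_interleave(8, dim=1)   # shape (4, 8)
hp = f8_data.to(torch.float32) * scale_expanded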
