Commit a761549

rowwise scaling test passing
1 parent 0a90f0b commit a761549

3 files changed (+46, -16 lines changed)


torchao/float8/float8_ops.py

Lines changed: 0 additions & 3 deletions
@@ -151,9 +151,6 @@ def float8_transpose(aten_op, args, kwargs=None):
     else:
         new_scale = args[0]._scale
 
-    if aten_op == aten.transpose.int:
-        _assert_tensorwise_scale(aten_op, args[0]._scale)
-
     old_axiswise_dim = args[0]._axiswise_dim
     new_axiswise_dim = old_axiswise_dim
     if old_axiswise_dim is not None:
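For context: the deleted assertion restricted aten.transpose.int to tensorwise-scaled tensors. With axiswise (rowwise) scaling, the scale can simply be transposed along with the data, which is what the surrounding _axiswise_dim handling does. A minimal sketch of that idea with plain torch tensors (the data and scale names here are illustrative, not the Float8Tensor internals):

import torch

# A rowwise-scaled (M, K) tensor keeps one scale per row, shape (M, 1).
M, K = 4, 8
data = torch.randn(M, K)
scale = data.abs().amax(dim=-1, keepdim=True)  # (M, 1)

# Transposing swaps the scaled axis; the scale transposes right along with the data.
data_t = data.transpose(0, 1)    # (K, M)
scale_t = scale.transpose(0, 1)  # (1, M)

# The dequantized values agree either way.
assert torch.allclose((data_t / scale_t).transpose(0, 1), data / scale)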

torchao/prototype/grouped_mm/__init__.py

Lines changed: 20 additions & 5 deletions
@@ -2,6 +2,7 @@
 from typing import Optional
 
 import torch
+from torchao import float8
 from torchao.float8.float8_scaling_utils import hp_tensor_to_float8_dynamic, get_maybe_axiswise_dim
 from torchao.float8.config import Float8LinearConfig, Float8LinearRecipeName
 from torchao.float8.float8_tensor import GemmInputRole
@@ -36,7 +37,9 @@ def forward(
         out_dtype: Optional[torch.dtype] = None,
         use_fast_accum: bool = False,
     ) -> torch.Tensor:
-
+        # torch._scaled_grouped_mm only supports rowwise scaling currently.
+        assert float8_recipe_name == Float8LinearRecipeName.ROWWISE, "Only rowwise scaling is supported by torch._scaled_grouped_mm."
+
         # perform dynamic float8 quantization using the given recipe, if specified
         assert 2 <= A.ndim <= 3, "A must be 2D or 3D"
         assert 2 <= B.ndim <= 3, "B must be 2D or 3D"
@@ -68,19 +71,31 @@ def forward(
                 -1, float8_config.cast_config_input.scaling_granularity
             ),
             round_scales_to_power_of_2=float8_config.round_scales_to_power_of_2,
-        )
+        )
+        B_fp8_t = B_fp8.transpose(-2, -1)
 
         # Store what we need for backward.
         ctx.save_for_backward(A, B)
         ctx.float_config = float8_config
         ctx.offs = offs
 
+        # Scale shape adjustments for compatibility with torch._scaled_grouped_mm.
+        # For tensorwise scaling, torch._scaled_grouped_mm requires 1D scales, not 0D.
+        if float8_recipe_name == Float8LinearRecipeName.TENSORWISE:
+            A_fp8._scale = A_fp8._scale.unsqueeze(0)
+            B_fp8_t._scale = B_fp8_t._scale.unsqueeze(0)
+
+        # For rowwise scaling, torch._scaled_grouped_mm requires scales without any empty dims.
+        elif float8_recipe_name == Float8LinearRecipeName.ROWWISE:
+            A_fp8._scale = A_fp8._scale.squeeze()
+            B_fp8_t._scale = B_fp8_t._scale.squeeze()
+
         # Perform scaled grouped GEMM and return result.
         return torch._scaled_grouped_mm(
             A_fp8._data,
-            B_fp8._data,
-            A_fp8._scale,
-            B_fp8._scale,
+            B_fp8_t._data,
+            A_fp8._scale,
+            B_fp8_t._scale,
             offs,
             out_dtype=out_dtype,
             use_fast_accum=use_fast_accum,
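For context on the scale-shape fixups above: with rowwise scaling, the dynamic cast keeps a singleton dimension in the scale (roughly (M, 1) for a 2D A and (n_groups, 1, N) for the transposed 3D B; the exact layout here is an assumption for illustration), while torch._scaled_grouped_mm, per the comment in the diff, wants scales without empty dims. A small shape-only sketch of what the squeeze() calls accomplish:

import torch

M, K, N, n_groups = 64, 16, 32, 4

# Assumed rowwise scale layouts before the fixup (singleton dims included):
A_scale = torch.rand(M, 1)               # one scale per row of A
B_t_scale = torch.rand(n_groups, 1, N)   # one scale per output column of each group

# squeeze() drops the singleton dims, giving the flat shapes the kernel expects.
assert A_scale.squeeze().shape == (M,)
assert B_t_scale.squeeze().shape == (n_groups, N)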

torchao/prototype/grouped_mm/test_grouped_mm.py

Lines changed: 26 additions & 8 deletions
@@ -5,19 +5,37 @@
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-@pytest.mark.parametrize("float8_recipe", [Float8LinearRecipeName.TENSORWISE, Float8LinearRecipeName.ROWWISE])
 @pytest.mark.parametrize("use_fast_accum", [True, False])
-def test_grouped_gemm(float8_recipe, use_fast_accum):
+@pytest.mark.parametrize("strided", [True, False])
+def test_grouped_gemm_2d_3d(use_fast_accum, strided):
+    # unit test ensuring parity between torchao and pytorch core grouped_gemm
+    # https://github.com/pytorch/pytorch/blob/87bfd66c3c7061db6d36d8daa62f08f507f90e39/test/test_matmul_cuda.py#L1204
     device = "cuda"
-    m, n, k, n_groups = 16, 16, 16, 4
-    a = torch.randn(m, k * n_groups + k, device=device)
-    b = torch.randn(n, k * n_groups + k, device=device)
-    offs = torch.arange(k, n_groups * k + 1, k, device=device, dtype=torch.int32)
+    s_int = int(strided)
+    m, n, k, n_groups = 16, 32, 16, 4
+    a = torch.randn(m * n_groups, k * (1 + s_int), device=device)[:, :k]
+    b = torch.randn(n_groups * (1 + s_int), n, k * (1 + s_int), device=device)[::(1 + s_int), :, :k]
+    offs = torch.arange(m, n_groups * m + 1, m, device="cuda", dtype=torch.int32)
     result = grouped_mm(
-        a, b.t(),
+        a, b,
         offs=offs,
-        float8_recipe=float8_recipe,
+        float8_recipe=Float8LinearRecipeName.ROWWISE,
         out_dtype=torch.bfloat16,
         use_fast_accum=use_fast_accum
     )
     assert isinstance(result, torch.Tensor)
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+def test_tensorwise_scaling_not_supported():
+    device = "cuda"
+    m, n, k, n_groups = 16, 32, 16, 4
+    a = torch.randn(m * n_groups, k, device=device)[:, :k]
+    b = torch.randn(n_groups, n, k, device=device)[::1, :, :k]
+    offs = torch.arange(m, n_groups * m + 1, m, device="cuda", dtype=torch.int32)
+    with pytest.raises(AssertionError):
+        result = grouped_mm(
+            a, b,
+            offs=offs,
+            float8_recipe=Float8LinearRecipeName.TENSORWISE,
+            out_dtype=torch.bfloat16,
+        )
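A note on the new strided parametrization in the 2D-3D test: allocating an oversized buffer and slicing it produces non-contiguous views, so the stride handling of the grouped GEMM gets exercised alongside the contiguous path. A small shape-only sketch of the strided=True case (runs on CPU, no float8 casting involved):

import torch

m, n, k, n_groups = 16, 32, 16, 4
s_int = 1  # strided=True

a = torch.randn(m * n_groups, k * (1 + s_int))[:, :k]
b = torch.randn(n_groups * (1 + s_int), n, k * (1 + s_int))[::(1 + s_int), :, :k]

print(a.shape, a.is_contiguous())  # torch.Size([64, 16]) False
print(b.shape, b.is_contiguous())  # torch.Size([4, 32, 16]) False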
