
Commit 0a45ccd

y-sq authored and facebook-github-bot committed
test rowwise fp32
Summary: Running rowwise scaling on fp32 tensors fails with the error below (P1794222725):

```
RuntimeError: Only bf16 high precision output types are supported for row-wise scaling.
```

This PR adds an option to explicitly use bfloat16 as the output of the rowwise-scaled mm and cast it back to the original precision. It can be enabled by setting:

```
config = dataclasses.replace(config, convert_dtypes_for_rowwise_scaled_mm=True)
```

Differential Revision: D73552660
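To make the new flag concrete, here is a minimal end-to-end sketch of enabling it for an fp32 model. The recipe lookup via `Float8LinearConfig.from_recipe_name("rowwise")`, the `convert_to_float8_training` entry point, and the toy model are assumptions for illustration and may vary across torchao versions:

```python
import dataclasses

import torch
import torch.nn as nn

from torchao.float8 import Float8LinearConfig, convert_to_float8_training

# Rowwise-scaling recipe; from_recipe_name("rowwise") is assumed here and may
# differ depending on the torchao version.
config = Float8LinearConfig.from_recipe_name("rowwise")

# Enable the workaround added by this commit: run the rowwise _scaled_mm with a
# bf16 output and cast the result back to the module's original dtype.
config = dataclasses.replace(config, convert_dtypes_for_rowwise_scaled_mm=True)

# Hypothetical fp32 model; without the flag, rowwise scaling on fp32 raises
# "Only bf16 high precision output types are supported for row-wise scaling."
model = nn.Sequential(nn.Linear(1024, 1024)).cuda()
convert_to_float8_training(model, config=config)

out = model(torch.randn(16, 1024, device="cuda"))
print(out.dtype)  # torch.float32 -- the result is cast back from bf16
```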
1 parent d506cc7 commit 0a45ccd

4 files changed: +22 -3 lines changed

torchao/float8/config.py

Lines changed: 5 additions & 0 deletions
@@ -204,6 +204,11 @@ class Float8LinearConfig:
     # same value in the forward pass as the backward passes.
     round_scales_to_power_of_2: bool = False
 
+    # This is a workaround for using rowwise_scaled_mm with non-bf16 tensors.
+    # Currently, rowwise_scaled_mm only supports bf16 outputs.
+    # We work around this by using bf16 as the rowwise_scaled_mm output and casting back to the original precision.
+    convert_dtypes_for_rowwise_scaled_mm: bool = False
+
     def __post_init__(self):
         # Populate the additional cast overrides, if the user did not specify them
         # Note: this hacks around the frozen-ness of this dataclass

torchao/float8/float8_linear.py

Lines changed: 3 additions & 0 deletions
@@ -281,20 +281,23 @@ def __init__(self, *args, **kwargs):
                 self.config.gemm_config_output.use_fast_accum,
                 False,
                 self.config.pad_inner_dim,
+                config.convert_dtypes_for_rowwise_scaled_mm,
             ),
             # grad_input
             ScaledMMConfig(
                 config.emulate,
                 self.config.gemm_config_grad_input.use_fast_accum,
                 False,
                 self.config.pad_inner_dim,
+                config.convert_dtypes_for_rowwise_scaled_mm,
             ),
             # grad_weight
             ScaledMMConfig(
                 config.emulate,
                 self.config.gemm_config_grad_weight.use_fast_accum,
                 False,
                 self.config.pad_inner_dim,
+                config.convert_dtypes_for_rowwise_scaled_mm,
             ),
         )
 
torchao/float8/float8_ops.py

Lines changed: 10 additions & 0 deletions
@@ -31,6 +31,7 @@ def addmm_float8_unwrapped(
     output_scale: Optional[torch.Tensor] = None,
     bias: Optional[torch.Tensor] = None,
     use_fast_accum: bool = False,
+    convert_dtypes_for_rowwise_scaled_mm: bool = False,
 ) -> torch.Tensor:
     """
     This is the unwrapped version of addmm_float8, which does not take in Float8Tensors
@@ -54,6 +55,11 @@ def addmm_float8_unwrapped(
         a_inverse_scale = a_inverse_scale.new_ones(())
         b_inverse_scale = a_inverse_scale.new_ones(())
 
+    orig_dtype = output_dtype
+
+    if convert_dtypes_for_rowwise_scaled_mm and is_rowwise_scaling:
+        output_dtype = torch.bfloat16
+
     post_bias = None
     if output_dtype == torch.float32:
         # Bias is not supported by _scaled_mm when output is fp32
@@ -76,6 +82,9 @@ def addmm_float8_unwrapped(
     if post_bias is not None:
         output += post_bias
 
+    if convert_dtypes_for_rowwise_scaled_mm and is_rowwise_scaling:
+        output = output.to(orig_dtype)
+
     return output
 
 
@@ -379,6 +388,7 @@ def float8_mm(aten_op, args, kwargs=None):
         output_scale=None,
         bias=None,
         use_fast_accum=scaled_mm_config.use_fast_accum,
+        convert_dtypes_for_rowwise_scaled_mm=scaled_mm_config.convert_dtypes_for_rowwise_scaled_mm,
     )
     return tensor_out
 
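The float8_ops.py change above amounts to a dtype round-trip around the scaled mm. Below is a minimal sketch of that logic in isolation; `run_scaled_mm` is a hypothetical stand-in for the underlying scaled-mm call, not a torchao or PyTorch API. Note that when the round-trip is taken, an fp32 result only carries bf16 precision, since the matmul itself produced a bf16 output:

```python
import torch

def bf16_roundtrip(run_scaled_mm, output_dtype, is_rowwise_scaling,
                   convert_dtypes_for_rowwise_scaled_mm):
    """Sketch of the dtype handling added to addmm_float8_unwrapped."""
    orig_dtype = output_dtype
    if convert_dtypes_for_rowwise_scaled_mm and is_rowwise_scaling:
        # rowwise _scaled_mm only supports bf16 high-precision outputs
        output_dtype = torch.bfloat16
    # run_scaled_mm stands in for the actual scaled mm, invoked with the
    # (possibly overridden) output dtype
    output = run_scaled_mm(output_dtype)
    if convert_dtypes_for_rowwise_scaled_mm and is_rowwise_scaling:
        # cast back to the dtype the caller originally requested (e.g. fp32)
        output = output.to(orig_dtype)
    return output
```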

torchao/float8/float8_tensor.py

Lines changed: 4 additions & 3 deletions
@@ -59,6 +59,7 @@ class ScaledMMConfig(NamedTuple):
     use_fast_accum: bool = False
     fp8_output: bool = False
     pad_inner_dim: bool = False
+    convert_dtypes_for_rowwise_scaled_mm: bool = False
 
 
 class LinearMMConfig(NamedTuple):
@@ -75,9 +76,9 @@ class LinearMMConfig(NamedTuple):
         grad_weight (ScaledMMConfig): Configuration for the grad_weight gemm.
     """
 
-    output: ScaledMMConfig = ScaledMMConfig(False, True, False, False)
-    grad_input: ScaledMMConfig = ScaledMMConfig(False, False, False, False)
-    grad_weight: ScaledMMConfig = ScaledMMConfig(False, False, False, False)
+    output: ScaledMMConfig = ScaledMMConfig(False, True, False, False, False)
+    grad_input: ScaledMMConfig = ScaledMMConfig(False, False, False, False, False)
+    grad_weight: ScaledMMConfig = ScaledMMConfig(False, False, False, False, False)
 
 
 class GemmInputRole(enum.Enum):
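Since ScaledMMConfig is a NamedTuple, the new field rides along as the fifth positional entry. A small sketch of building the per-gemm configs with the flag enabled (the values are illustrative, not the library defaults):

```python
from torchao.float8.float8_tensor import LinearMMConfig, ScaledMMConfig

# Positional order: emulate, use_fast_accum, fp8_output, pad_inner_dim,
# convert_dtypes_for_rowwise_scaled_mm
output_cfg = ScaledMMConfig(False, True, False, False, True)
grad_input_cfg = ScaledMMConfig(False, False, False, False, True)
grad_weight_cfg = ScaledMMConfig(False, False, False, False, True)

linear_mm_config = LinearMMConfig(output_cfg, grad_input_cfg, grad_weight_cfg)
assert linear_mm_config.output.convert_dtypes_for_rowwise_scaled_mm
```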
