# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.
from collections import namedtuple
+ import enum
from typing import Dict, Optional

import torch
...

aten = torch.ops.aten

+ #
+ # A note on configuration of float8 logic in a linear
+ # TODO(future): move all the configs to a separate file
+ #
+ # There are three gemms in a forward + backward of a Linear layer:
+ #
+ # 1. x @ w_t = y (forward pass)
+ # 2. dL_dY @ w = dL_dX (backward pass)
+ # 3. x_t @ dL_dY = dL_dW (backward pass)
+ #
+ # In the formulas above, there are:
+ # A. six input tensors (x, x_t, w, w_t, dL_dY, dL_dY_t).
+ #    - Note that dL_dY_t is implied because of the memory format
+ #      requirements of float8 gemms
+ # B. three output tensors (y, dL_dX, dL_dW)
+ #
+ # We want each input tensor, gemm, and output tensor to be configurable.
+ # The state of this configuration today is:
+ #
+ # i. pairs of input tensors (the non-t and t variants) have their scaling
+ #    configurable via the scaling_type_{x_w_dL_dY} arguments to Float8Linear
+ # ii. each gemm + output is configurable via ScaledMMConfig, which is not
+ #     user facing
+ # iii. LinearMMConfig is a container for the three ScaledMMConfig objects
+ #      needed to configure all three gemms, also not user facing
+

# ScaledMMConfig is a namedtuple that defines the configuration for the scaled_mm in the forward and backward pass.
# emulate: whether to emulate the matmuls in fp32
...
    defaults=[False, False, False, False],
)

+ # The object below exists for convenience, to allow Float8Tensor to use
+ # the right config based on which of the gemms `y`, `dL_dX`, `dL_dW` is
+ # being called.
+ LinearMMConfig = namedtuple(
+     "LinearMMConfig",
+     ["y", "dL_dX", "dL_dW"],
+     defaults=[
+         ScaledMMConfig(False, True, False, False),
+         ScaledMMConfig(False, False, False, False),
+         ScaledMMConfig(False, False, False, False),
+     ],
+ )
+
+ # Given a Float8Tensor, the enum below describes the expected role of this
+ # tensor in the three gemms present in the fw + bw pass of a Linear layer.
+ # This is used to choose the right config for a float8 gemm when the
+ # gemm is performed.
+ class GemmInputRole(enum.Enum):
+     X = "x"
+     W = "w"
+     DL_DY = "dL_dY"
+
+ # Choose which ScaledMMConfig to use based on the roles of the two gemm inputs.
+ def choose_scaled_mm_config(
+     a_role: GemmInputRole,
+     a_linear_mm_config: LinearMMConfig,
+     b_role: GemmInputRole,
+     b_linear_mm_config: LinearMMConfig,
+ ):
+     if a_role is GemmInputRole.X and b_role is GemmInputRole.W:
+         assert a_linear_mm_config.y == b_linear_mm_config.y
+         return a_linear_mm_config.y
+     elif a_role is GemmInputRole.DL_DY and b_role is GemmInputRole.W:
+         assert a_linear_mm_config.dL_dX == b_linear_mm_config.dL_dX
+         return a_linear_mm_config.dL_dX
+     else:
+         assert a_role is GemmInputRole.X and b_role is GemmInputRole.DL_DY, \
+             f"unexpected a_role {a_role} and b_role {b_role}"
+         assert a_linear_mm_config.dL_dW == b_linear_mm_config.dL_dW
+         return a_linear_mm_config.dL_dW
+
+

def merge_mm_configs(
    a_mm_config: ScaledMMConfig, b_mm_config: ScaledMMConfig
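To ground the note above: the three gemms have the shapes sketched below, and the default `LinearMMConfig` added in this hunk treats the forward gemm differently from the two backward gemms. This is a minimal sketch, not part of the diff; it uses plain torch tensors (no float8), and the meaning of the second `ScaledMMConfig` field (the one set to `True` for `y`) is assumed to be fast accumulation, which this excerpt does not spell out.

    import torch

    x = torch.randn(16, 32)      # input, (batch, in_features)
    w = torch.randn(64, 32)      # weight, (out_features, in_features)
    dL_dY = torch.randn(16, 64)  # incoming gradient, same shape as y

    y = x @ w.t()          # 1. forward:  (16, 64)
    dL_dX = dL_dY @ w      # 2. backward: (16, 32), gradient w.r.t. x
    dL_dW = x.t() @ dL_dY  # 3. backward: (32, 64), gradient w.r.t. w,
                           #    produced in the transposed layout that the
                           #    note attributes to float8 memory format needs

    # Default config: only the forward gemm's entry differs; its second field
    # is True (assumed here to toggle fast accumulation, not shown above).
    cfg = LinearMMConfig()
    assert cfg.y == ScaledMMConfig(False, True, False, False)
    assert cfg.dL_dX == cfg.dL_dW == ScaledMMConfig(False, False, False, False)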
@@ -194,15 +262,18 @@ class Float8Tensor(torch.Tensor):
    _data: torch.Tensor
    _scale: torch.Tensor
    _orig_dtype: torch.dtype
-     _mm_config: ScaledMMConfig
+     # TODO(before land): change this to _linear_mm_config, wanted to do that
+     # after initial review
+     _mm_config: LinearMMConfig
    __slots__ = ["_data", "_scale", "_orig_dtype", "_mm_config"]

    def __new__(
        cls,
        data: torch.Tensor,
        scale: torch.Tensor,
        orig_dtype: torch.dtype,
-         mm_config: Optional[ScaledMMConfig],
+         mm_config: Optional[LinearMMConfig],
+         gemm_input_role: Optional[GemmInputRole] = GemmInputRole.X,
    ):
        assert (
            scale.numel() == 1
@@ -223,7 +294,8 @@ def __new__(
        self._data = data
        self._scale = scale
        self._orig_dtype = orig_dtype
-         self._mm_config = mm_config if mm_config is not None else ScaledMMConfig()
+         self._mm_config = mm_config if mm_config is not None else LinearMMConfig()
+         self._gemm_input_role = gemm_input_role

        return self
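Since each Float8Tensor now carries both a LinearMMConfig and a GemmInputRole, a float8 matmul override can recover the right per-gemm config from its two operands at call time. Below is a standalone sketch of that dispatch, not part of the diff; `FakeF8` is a hypothetical stand-in that carries the same two attributes, since constructing a real Float8Tensor needs more machinery.

    # Hypothetical stand-in for Float8Tensor, holding only the two
    # attributes added by this diff.
    class FakeF8:
        def __init__(self, role, cfg):
            self._gemm_input_role = role
            self._mm_config = cfg

    cfg = LinearMMConfig()
    a = FakeF8(GemmInputRole.X, cfg)  # activation operand
    b = FakeF8(GemmInputRole.W, cfg)  # weight operand

    # x @ w_t is the forward gemm, so the `y` config is selected
    picked = choose_scaled_mm_config(
        a._gemm_input_role, a._mm_config,
        b._gemm_input_role, b._mm_config,
    )
    assert picked == cfg.y

    # dL_dY @ w is the grad-input gemm, so the `dL_dX` config is selected
    g = FakeF8(GemmInputRole.DL_DY, cfg)
    assert choose_scaled_mm_config(
        g._gemm_input_role, g._mm_config,
        b._gemm_input_role, b._mm_config,
    ) == cfg.dL_dX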
@@ -257,7 +329,8 @@ def to_float8(
        scale: torch.Tensor,
        float8_dtype: torch.dtype,
        amax_buffer: Optional[torch.Tensor] = None,
-         mm_config: Optional[ScaledMMConfig] = None,
+         mm_config: Optional[LinearMMConfig] = None,
+         gemm_input_role: Optional[GemmInputRole] = GemmInputRole.X,
    ):
        """Converts a higher precision tensor to float8 in a differentiable way.
@@ -272,7 +345,7 @@ def to_float8(
            Float8Tensor: a float8 tensor
        """
        return ToFloat8ConstrFunc.apply(
-             tensor, scale, float8_dtype, amax_buffer, mm_config
+             tensor, scale, float8_dtype, amax_buffer, mm_config, gemm_input_role,
        )

    @classmethod