This repository was archived by the owner on Aug 7, 2024. It is now read-only.

Commit 146c6a1

still one failing test, but I think it's a good failure

1 parent b383662 commit 146c6a1

File tree

5 files changed: +95, -41 lines changed


float8_experimental/float8_linear.py

Lines changed: 10 additions & 2 deletions

@@ -302,9 +302,17 @@ def forward(self, x):
 
         x_fp8 = self.cast_x_to_float8(x, self.is_amax_initialized)
         w_fp8 = self.cast_w_to_float8(self.weight, self.is_amax_initialized)
-        y = self.float8_mm(x_fp8, w_fp8, self.is_amax_initialized)
-        y = self.cast_y_to_float8_in_bw(y)
+        # y = self.float8_mm(x_fp8, w_fp8, self.is_amax_initialized)
+        if self.emulate:
+            y = self.float8_mm(x_fp8, w_fp8, self.is_amax_initialized)
+        else:
+            orig_shape = x_fp8.shape
+            x_fp8 = x_fp8.reshape(-1, orig_shape[-1])
+            y = torch.mm(x_fp8, w_fp8.t())
+            y = y.reshape(*orig_shape[:-1], y.shape[-1])
 
+        y = self.cast_y_to_float8_in_bw(y)
+        # breakpoint()
         if self.bias is not None:
             y = y + self.bias.to(x_fp8._orig_dtype)
 
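Note: the new non-emulated branch flattens the input to 2D before calling torch.mm and then restores the leading dimensions, since torch.mm only accepts 2D operands. A minimal sketch of that shape handling, using plain fp32 tensors x and w as stand-ins for x_fp8 and w_fp8:

import torch

x = torch.randn(2, 3, 16)  # stand-in for x_fp8: (batch, seq, in_features)
w = torch.randn(32, 16)    # stand-in for w_fp8: (out_features, in_features)

orig_shape = x.shape
x2d = x.reshape(-1, orig_shape[-1])           # (6, 16): collapse leading dims
y = torch.mm(x2d, w.t())                      # (6, 32): torch.mm is 2D-only
y = y.reshape(*orig_shape[:-1], y.shape[-1])  # (2, 3, 32): restore leading dims
print(y.shape)  # torch.Size([2, 3, 32])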

float8_experimental/float8_ops.py

Lines changed: 74 additions & 0 deletions

@@ -0,0 +1,74 @@
+from typing import Any, Dict
+
+import torch
+
+from float8_experimental.float8_python_api import mm_float8_unwrapped
+from float8_experimental.float8_tensor import Float8Tensor
+from float8_experimental.float8_utils import (is_row_major, tensor_to_amax,
+                                              to_fp8_saturated)
+
+aten = torch.ops.aten
+FLOAT8_OPS_TABLE: Dict[Any, Any] = {}
+
+
+def implements(aten_ops):
+    """Register aten ops to the float8 op table"""
+
+    def decorator(func):
+        for op in aten_ops:
+            FLOAT8_OPS_TABLE[op] = func
+        return func
+
+    return decorator
+
+
+@implements(
+    [
+        aten.view.default,
+        aten._unsafe_view.default,
+        aten.t.default,
+        aten.as_strided.default,
+        aten.clone.default,
+        aten.detach.default,
+    ]
+)
+def float8_desugar_op(aten_op, args, kwargs=None):
+    # assert is_fake(args[0]), "Float8Tensor.__torch_dispatch__ for user code is not supported"
+    new_data = aten_op(args[0]._data, *args[1:], **kwargs)
+    return Float8Tensor(new_data, args[0]._scale, args[0]._orig_dtype)
+
+
+@implements([aten.mm.default])
+def float8_mm(aten_op, args, kwargs=None):
+    assert isinstance(args[0], Float8Tensor) and isinstance(args[1], Float8Tensor)
+    a = args[0]
+    b = args[1]
+    a_data = a._data
+    a_scale = a._scale
+    b_data = b._data
+
+    if not is_row_major(a_data.stride()):
+        a_data = a_data.contiguous()
+    if is_row_major(b_data.stride()):
+        b_data = b_data.t().contiguous().t()
+
+    b_scale = b._scale
+    output_dtype = a._orig_dtype
+    tensor_out, amax = mm_float8_unwrapped(
+        a_data, a_scale, b_data, b_scale, output_dtype, output_scale=None
+    )
+    return tensor_out
+
+
+@implements([aten.is_same_size.default])
+def float8_is_same_size(aten_op, args, kwargs=None):
+    return args[0].shape == args[1].shape
+
+
+@implements([aten._to_copy.default])
+def autocast_to_copy(aten_op, args, kwargs=None):
+    # This is needed for auto cast behavior
+    # TODO Also feels kind of sketch....
+    assert isinstance(args[0], Float8Tensor)
+    assert len(kwargs) == 1 and "dtype" in kwargs, "Only support dtype kwarg for autocast"
+    return Float8Tensor(args[0]._data, args[0]._scale, kwargs["dtype"])
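Note: the @implements decorator fills FLOAT8_OPS_TABLE at import time, so the registrations can be sanity-checked by probing the table directly. A quick sketch, assuming the package at this commit is importable; it only inspects the table and does not construct any Float8Tensor:

import torch
from float8_experimental.float8_ops import FLOAT8_OPS_TABLE, float8_mm

aten = torch.ops.aten
assert FLOAT8_OPS_TABLE[aten.mm.default] is float8_mm  # registered by @implements
assert aten.t.default in FLOAT8_OPS_TABLE              # handled by float8_desugar_op
print(len(FLOAT8_OPS_TABLE), "aten ops registered")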

float8_experimental/float8_python_api.py

Lines changed: 2 additions & 3 deletions

@@ -9,7 +9,6 @@
 
 import float8_experimental.float8_aten_api
 import torch
-from float8_experimental.float8_tensor import Float8Tensor
 
 
 def mm_float8_unwrapped(
@@ -45,8 +44,8 @@ def mm_float8_unwrapped(
 # For a,b going from fp8 -> fp32 we multiple by the inverse of the scale
 # For output going from fp32 -> fp8 we multiply by the scale
 def mm_float8(
-    a: Float8Tensor,  # input 1
-    b: Float8Tensor,  # input 2
+    a: "Float8Tensor",  # input 1
+    b: "Float8Tensor",  # input 2
     output_dtype: torch.dtype,  # output dtype
     output_scale: Optional[torch.Tensor] = None,  # output scale, precomputed
     emulate: bool = False,  # whether to emulate the operation using fp32
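Note: quoting the annotations turns them into forward references that are never evaluated at import time, which is what lets this module drop its import of Float8Tensor and break the circular dependency. A tiny, self-contained illustration with a hypothetical type name ("Widget" is not from the repo):

# "Widget" is never imported or defined here; the string annotation is only
# metadata, so defining and calling the function still works.
def describe(w: "Widget") -> str:
    return f"got {w!r}"

print(describe(42))                   # got 42
print(describe.__annotations__["w"])  # 'Widget'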

float8_experimental/float8_tensor.py

Lines changed: 6 additions & 36 deletions

@@ -1,45 +1,11 @@
-from typing import Any, Dict
+from typing import Dict
 
 import torch
+
 from float8_experimental.float8_utils import tensor_to_amax, to_fp8_saturated
-from torch._subclasses.fake_tensor import is_fake
 
 aten = torch.ops.aten
 
-FLOAT8_OPS_TABLE: Dict[Any, Any] = {}
-
-
-def implements(aten_ops):
-    """Register aten ops to the float8 op table"""
-
-    def decorator(func):
-        for op in aten_ops:
-            FLOAT8_OPS_TABLE[op] = func
-        return func
-
-    return decorator
-
-
-@implements(
-    [
-        aten.view.default,
-        aten._unsafe_view.default,
-        aten.t.default,
-        aten.as_strided.default,
-        aten.clone.default,
-        aten.detach.default,
-    ]
-)
-def float8_desugar_op(aten_op, args, kwargs=None):
-    assert is_fake(args[0]), "Float8Tensor.__torch_dispatch__ for user code is not supported"
-    new_data = aten_op(args[0]._data, *args[1:], **kwargs)
-    return Float8Tensor(new_data, args[0]._scale, args[0]._orig_dtype)
-
-
-@implements([aten.is_same_size.default])
-def float8_is_same_size(aten_op, args, kwargs=None):
-    return args[0].shape == args[1].shape
-
 
 class ToFloat8ConstrFunc(torch.autograd.Function):
     """
@@ -166,6 +132,10 @@ def __torch_dispatch__(cls, func, types, args, kwargs=None):
         # PT2.0, so we explicitly disallow it here for callsites from user code.
        # 2. We do need to handle a couple of ops in order for
         # TorchDynamo tracing to succeed.
+
+        # Lazy import to avoid circular dependency
+        from float8_experimental.float8_ops import FLOAT8_OPS_TABLE
+
         if func in FLOAT8_OPS_TABLE:
             return FLOAT8_OPS_TABLE[func](func, args, kwargs)
         raise NotImplementedError(f"attempting to run {func}, this is not supported")
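Note: the FLOAT8_OPS_TABLE import now happens inside __torch_dispatch__ rather than at module top level. A function-local import only runs when the function is first called, which is a common way to break an import cycle. A toy, self-contained illustration of that timing, using the stdlib fractions module in place of float8_ops:

import sys

def to_ratio(x):
    # Not imported until the function is actually called.
    from fractions import Fraction
    return Fraction(x).limit_denominator(100)

print("fractions" in sys.modules)  # typically False before the first call
print(to_ratio(0.25))              # 1/4
print("fractions" in sys.modules)  # True after the call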

float8_experimental/float8_utils.py

Lines changed: 3 additions & 0 deletions

@@ -82,3 +82,6 @@ def compute_error(x, y):
     Ps = torch.norm(x)
     Pn = torch.norm(x - y)
     return 20 * torch.log10(Ps / Pn)
+
+def is_row_major(stride):
+    return stride[0] > stride[1] and stride[1] == 1
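Note: a quick check of the new helper on 2D strides (the only case the diff exercises): a freshly allocated contiguous matrix is row-major, its transpose is not. Assumes the package at this commit is importable:

import torch

from float8_experimental.float8_utils import is_row_major  # added above

a = torch.empty(4, 8)
print(a.stride(), is_row_major(a.stride()))          # (8, 1) True
print(a.t().stride(), is_row_major(a.t().stride()))  # (1, 8) False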
