This repository was archived by the owner on Aug 7, 2024. It is now read-only.

Commit 3d9231d

Merge pull request #4 from pytorch-labs/real_dtypes
switch from emulated to real float8 dtypes
2 parents 00a649c + dcc039d commit 3d9231d

5 files changed: +70 -359 lines changed
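The common thread in the diffs below is that the emulated float8 helpers (float32_to_float8 / float8_to_float32 plus the E4M3 / E5M2 "flavor" constants) are replaced by PyTorch's native torch.float8_e4m3fn and torch.float8_e5m2 dtypes, so conversions become plain .to(dtype) and .float() casts. A minimal round-trip sketch, assuming a PyTorch build that ships the native float8 dtypes (tensors and values here are illustrative only):

import torch

# Illustrative only; requires a PyTorch build with native float8 dtypes.
x = torch.randn(4, 4)

# "forward" flavor: e4m3 (more mantissa bits, less range)
x_e4m3 = x.to(torch.float8_e4m3fn)
# "gradient" flavor: e5m2 (fewer mantissa bits, more exponent range)
g_e5m2 = torch.randn(4, 4).to(torch.float8_e5m2)

# casting back replaces the old float8_to_float32 helper
x_roundtrip = x_e4m3.float()
print(x_e4m3.dtype, g_e5m2.dtype, x_roundtrip.dtype)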

float8_playground/float8_aten_api.py

Lines changed: 24 additions & 31 deletions
@@ -7,52 +7,51 @@
 from torch.library import Library
 
 from float8_utils import (
-    float32_to_float8,
-    float8_to_float32,
-    E4M3,
-    E5M2,
     tensor_to_scale,
 )
 
 
-def mm_float8(m1, s1, flavor1, m2, s2, flavor2, s3, flavor3):
+def mm_float8(m1, s1, m2, s2, s3, dtype3):
     # naive implementation: dq -> op -> q
     # TODO(future): hook up to real kernel
-    m1_fp32 = float8_to_float32(m1, flavor1) / s1
-    m2_fp32 = float8_to_float32(m2, flavor2) / s2
+    m1_fp32 = m1.float() / s1
+    m2_fp32 = m2.float() / s2
     m3_fp32 = torch.mm(m1_fp32, m2_fp32)
     # TODO(future): switch to delayed scaling
-    s3.fill_(tensor_to_scale(m3_fp32, flavor3))
+    s3.fill_(tensor_to_scale(m3_fp32, dtype3))
     m3_fp32_scaled = m3_fp32 * s3
-    return float32_to_float8(m3_fp32_scaled, flavor3)
+    if dtype3 == torch.float8_e4m3fn:
+        return m3_fp32_scaled.to(torch.float8_e4m3fn)
+    else:
+        return m3_fp32_scaled.to(torch.float8_e5m2)
 
 def add_float8_e5m2(m1, s1, m2, s2, s3):
     # for now this is only implemented for e5m2 because we only care about
     # this for adding gradients
     # naive implementation: dq -> op -> q
     # TODO(future): hook up to real kernel
-    # TODO(future): make this more accurate, accuracy is pretty low,
-    # can probably just calculate s3 dynamically since this is an edge case
-    # unlikely to affect e2e performance
-    m1_float32 = float8_to_float32(m1, E5M2) / s1
-    m2_float32 = float8_to_float32(m2, E5M2) / s2
+    m1_float32 = m1.float() / s1
+    m2_float32 = m2.float() / s2
     m3_float32 = m1_float32 + m2_float32
-    return float32_to_float8(m3_float32 * s3, E5M2)
+    s3_val = tensor_to_scale(m3_float32, torch.float8_e5m2)
+    s3.fill_(s3_val)
+    return (m3_float32 * s3).to(torch.float8_e5m2)
 
 # TODO naming of these vars is weird
-def addmm_float8(
-    inp1, inp_s1, inp_flavor1, m1, s1, flavor1, m2, s2, flavor2,
-    s3, flavor3):
+def addmm_float8(inp1, inp_s1, m1, s1, m2, s2, s3, dtype3):
     # naive implementation: dq -> op -> q
     # TODO(future): hook up to real kernel
-    inp1_fp32 = float8_to_float32(inp1, inp_flavor1) / inp_s1
-    m1_fp32 = float8_to_float32(m1, flavor1) / s1
-    m2_fp32 = float8_to_float32(m2, flavor2) / s2
+    inp1_fp32 = inp1.float() / inp_s1
+    m1_fp32 = m1.float() / s1
+    m2_fp32 = m2.float() / s2
     m3_fp32 = torch.addmm(inp1_fp32, m1_fp32, m2_fp32)
     # TODO(future): switch to delayed scaling
-    s3.fill_(tensor_to_scale(m3_fp32, flavor3))
+    s3.fill_(tensor_to_scale(m3_fp32, dtype3))
     m3_fp32_scaled = m3_fp32 * s3
-    return float32_to_float8(m3_fp32_scaled, flavor3)
+    if dtype3 == torch.float8_e4m3fn:
+        return m3_fp32_scaled.to(torch.float8_e4m3fn)
+    else:
+        return m3_fp32_scaled.to(torch.float8_e5m2)
 
 
 #
@@ -65,17 +64,11 @@ def addmm_float8(
 
 # For now register on CPU,
 # TODO(future) add GPU and test there
-lib.define("float32_to_float8(Tensor t, int flavor) -> Tensor")
-lib.impl("float32_to_float8", float32_to_float8, "CPU")
-
-lib.define("float8_to_float32(Tensor t, int flavor) -> Tensor")
-lib.impl("float8_to_float32", float8_to_float32, "CPU")
-
-lib.define("mm_float8(Tensor m1, Tensor s1, int flavor1, Tensor m2, Tensor s2, int flavor2, Tensor s3, int flavor3) -> Tensor")
+lib.define("mm_float8(Tensor m1, Tensor s1, Tensor m2, Tensor s2, Tensor s3, int dtype3) -> Tensor")
 lib.impl("mm_float8", mm_float8, "CPU")
 
 lib.define("add_float8_e5m2(Tensor m1, Tensor s1, Tensor m2, Tensor s2, Tensor s3) -> Tensor")
 lib.impl("add_float8_e5m2", add_float8_e5m2, "CPU")
 
-lib.define("addmm_float8(Tensor inp1, Tensor inp_s1, int inp_flavor1, Tensor m1, Tensor s1, int flavor1, Tensor m2, Tensor s2, int flavor2, Tensor s3, int flavor3) -> Tensor")
+lib.define("addmm_float8(Tensor inp1, Tensor inp_s1, Tensor m1, Tensor s1, Tensor m2, Tensor s2, Tensor s3, int dtype3) -> Tensor")
 lib.impl("addmm_float8", addmm_float8, "CPU")

float8_playground/float8_linear.py

Lines changed: 26 additions & 28 deletions
@@ -11,7 +11,7 @@
 
 import float8_aten_api
 
-from float8_utils import E4M3, E5M2, tensor_to_scale
+from float8_utils import tensor_to_scale
 from float8_tensor import Float8Tensor
 
 class float8_linear(torch.autograd.Function):
@@ -33,19 +33,18 @@ def forward(
         ctx.save_for_backward(
             x_fp8, w_fp8, b_fp8, fp8_s_dL_dX, fp8_s_dL_dW, fp8_s_dL_dY)
         if b_fp8 is not None:
-            # TODO add this
             res_bits = torch.ops.aten.addmm_float8(
-                b_fp8._data, b_fp8._scale, b_fp8._flavor,
-                x_fp8._data, x_fp8._scale, x_fp8._flavor,
-                w_fp8._data.t(), w_fp8._scale, w_fp8._flavor,
-                fp8_s_out, E4M3)
+                b_fp8._data, b_fp8._scale,
+                x_fp8._data, x_fp8._scale,
+                w_fp8._data.t(), w_fp8._scale,
+                fp8_s_out, torch.float8_e4m3fn)
         else:
             res_bits = torch.ops.aten.mm_float8(
-                x_fp8._data, x_fp8._scale, x_fp8._flavor,
-                w_fp8._data.t(), w_fp8._scale, w_fp8._flavor,
-                fp8_s_out, E4M3)
+                x_fp8._data, x_fp8._scale,
+                w_fp8._data.t(), w_fp8._scale,
+                fp8_s_out, torch.float8_e4m3fn)
 
-        res = Float8Tensor(res_bits, fp8_s_out, E4M3)
+        res = Float8Tensor(res_bits, fp8_s_out)
         # scale update would also happen here, for now no-op
         return res
 
@@ -56,25 +55,24 @@ def backward(ctx, go):
 
         if not isinstance(go, Float8Tensor):
             # TODO(future): switch to delayed scaling
-            fp8_s_dL_dY.fill_(tensor_to_scale(go, E5M2))
+            fp8_s_dL_dY.fill_(tensor_to_scale(go, torch.float8_e5m2))
             go_fp8 = Float8Tensor(
-                torch.ops.aten.float32_to_float8(go * fp8_s_dL_dY, E5M2),
-                fp8_s_dL_dY,
-                E5M2)
+                (go * fp8_s_dL_dY).to(torch.float8_e5m2),
+                fp8_s_dL_dY)
         else:
             go_fp8 = go
 
         dL_dX_bits = torch.ops.aten.mm_float8(
-            go_fp8._data, go_fp8._scale, go_fp8._flavor,
-            w_fp8._data, w_fp8._scale, w_fp8._flavor,
-            fp8_s_dL_dX, E5M2)
-        dL_dX_fp8 = Float8Tensor(dL_dX_bits, fp8_s_dL_dX, E5M2)
+            go_fp8._data, go_fp8._scale,
+            w_fp8._data, w_fp8._scale,
+            fp8_s_dL_dX, torch.float8_e5m2)
+        dL_dX_fp8 = Float8Tensor(dL_dX_bits, fp8_s_dL_dX)
 
         dL_dW_bits = torch.ops.aten.mm_float8(
-            x_fp8._data.t(), x_fp8._scale, x_fp8._flavor,
-            go_fp8._data, go_fp8._scale, go_fp8._flavor,
-            fp8_s_dL_dW, E5M2).t()
-        dL_dW_fp8 = Float8Tensor(dL_dW_bits, fp8_s_dL_dW, E5M2)
+            x_fp8._data.t(), x_fp8._scale,
+            go_fp8._data, go_fp8._scale,
+            fp8_s_dL_dW, torch.float8_e5m2).t()
+        dL_dW_fp8 = Float8Tensor(dL_dW_bits, fp8_s_dL_dW)
 
         # scale update would also happen here, for now no-op
         if b_fp8 is not None:
@@ -106,18 +104,18 @@ def __init__(self, *args, **kwargs):
     def forward(self, x):
         if not isinstance(x, Float8Tensor):
             # TODO(future): switch to delayed scaling
-            self.fp8_s_in.fill_(tensor_to_scale(x, E4M3))
-            x_fp8 = Float8Tensor.from_float32(x, self.fp8_s_in, E4M3)
+            self.fp8_s_in.fill_(tensor_to_scale(x, torch.float8_e4m3fn))
+            x_fp8 = Float8Tensor.from_float32(x, self.fp8_s_in, torch.float8_e4m3fn)
         else:
             x_fp8 = x
 
         # TODO(future): switch to delayed scaling
-        self.fp8_s_weight.fill_(tensor_to_scale(self.weight, E4M3))
-        w_fp8 = Float8Tensor.from_float32(self.weight, self.fp8_s_weight, E4M3)
+        self.fp8_s_weight.fill_(tensor_to_scale(self.weight, torch.float8_e4m3fn))
+        w_fp8 = Float8Tensor.from_float32(self.weight, self.fp8_s_weight, torch.float8_e4m3fn)
         maybe_b_fp8 = None
         if self.bias is not None:
-            self.fp8_s_bias.fill_(tensor_to_scale(self.bias, E4M3))
-            maybe_b_fp8 = Float8Tensor.from_float32(self.bias, self.fp8_s_bias, E4M3)
+            self.fp8_s_bias.fill_(tensor_to_scale(self.bias, torch.float8_e4m3fn))
+            maybe_b_fp8 = Float8Tensor.from_float32(self.bias, self.fp8_s_bias, torch.float8_e4m3fn)
 
         y_fp8 = float8_linear.apply(
             x_fp8, w_fp8, maybe_b_fp8, self.fp8_s_out, self.fp8_s_dL_dX,
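The pattern in forward() above is dynamic scaling: compute a fresh scale from the incoming tensor, fill the pre-allocated scale buffer, then wrap the scaled-and-cast payload in a Float8Tensor. torch.float8_e4m3fn is used for activations, weights, and bias on the forward path, while the backward path uses torch.float8_e5m2 for gradients. A stripped-down sketch of that pattern; the weight tensor and scale buffer names here are illustrative stand-ins, not the module's actual attributes:

import torch
from float8_utils import tensor_to_scale
from float8_tensor import Float8Tensor

w = torch.randn(8, 16)          # stand-in for self.weight
fp8_s_weight = torch.empty(1)   # stand-in for the module's scale buffer

# dynamic scaling, as in the source (the TODO is to switch to delayed scaling)
fp8_s_weight.fill_(tensor_to_scale(w, torch.float8_e4m3fn))
w_fp8 = Float8Tensor.from_float32(w, fp8_s_weight, torch.float8_e4m3fn)

# after this commit, the payload dtype doubles as the old "flavor"
assert w_fp8._data.dtype == torch.float8_e4m3fn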

float8_playground/float8_tensor.py

Lines changed: 12 additions & 18 deletions
@@ -2,8 +2,6 @@
 import torch
 from torch.utils._pytree import tree_map
 
-from float8_utils import E4M3, E5M2
-
 aten = torch.ops.aten
 
 class Float8ConstrFunc(torch.autograd.Function):
@@ -12,15 +10,15 @@ class Float8ConstrFunc(torch.autograd.Function):
     TODO(future): split into two for cleaner code
     """
     @staticmethod
-    def forward(ctx, tensor, scale: float=None, flavor=E4M3):
+    def forward(ctx, tensor, scale: float=None, dtype=torch.float8_e4m3fn):
         if isinstance(tensor, Float8Tensor):
             ctx.inp_is_float8 = True
-            return torch.ops.aten.float8_to_float32(tensor._data, tensor._flavor) / tensor._scale
+            return tensor._data.to(torch.float32) / tensor._scale
         else:
             ctx.inp_is_float8 = False
             tensor_scaled = tensor * scale
-            bits_fp8 = torch.ops.aten.float32_to_float8(tensor_scaled, flavor)
-            return Float8Tensor(bits_fp8, scale, flavor)
+            bits_fp8 = tensor_scaled.to(dtype)
+            return Float8Tensor(bits_fp8, scale)
 
     @staticmethod
     def backward(ctx, g):
@@ -41,7 +39,6 @@ class Float8Tensor(torch.Tensor):
     * `_scale`: the scale used to scale the original fp32 tensor. We multiply
       by scale to go from fp32 range to fp8 range, and divide by scale to go
      from fp8 range to fp32 range.
-    * `_flavor`: either E4M3 or E5M2
 
     The current purpose of this object is 99% to bundle raw data + fp8 metadata
     together for easy passing through PyTorch systems, and 1% to implement
@@ -57,11 +54,9 @@ class Float8Tensor(torch.Tensor):
     to fp32 for them.
     """
 
-    def __new__(cls, data, scale, flavor):
+    def __new__(cls, data, scale):
         # This is a non-differentiable constructor!
         assert not data.requires_grad
-        # TODO(future): make bits8 easier to work with and switch to using it
-        # assert data.dtype == torch.bits8
        assert scale.dtype == torch.float32
         assert scale.nelement() == 1
 
@@ -77,19 +72,18 @@ def __new__(cls, data, scale, flavor):
         )
         self._data = data
         self._scale = scale
-        self._flavor = flavor
 
         return self
 
     def __repr__(self):
-        return f"Float8Tensor(flavor={self._flavor}, scale={self._scale}, as_float32={self.to_float32()}"
+        return f"Float8Tensor(dtype={self._data.dtype}, scale={self._scale}, as_float32={self.to_float32()}"
 
     def to_float32(self):
         return Float8ConstrFunc.apply(self)
 
     @classmethod
-    def from_float32(cls, tensor, scale, flavor):
-        return Float8ConstrFunc.apply(tensor, scale, flavor)
+    def from_float32(cls, tensor, scale, dtype):
+        return Float8ConstrFunc.apply(tensor, scale, dtype)
 
     @classmethod
     def __torch_dispatch__(cls, func, types, args, kwargs=None):
@@ -113,14 +107,14 @@ def __torch_dispatch__(cls, func, types, args, kwargs=None):
             and isinstance(args[1], Float8Tensor)
         ):
             x1_fp8, x2_fp8 = args[0], args[1]
-            assert x1_fp8._flavor == E5M2 and x2_fp8._flavor == E5M2
-            # naive scale calculation: max of incoming two scales
-            x3_scale = torch.max(x1_fp8._scale, x2_fp8._scale)
+            assert x1_fp8._data.dtype == torch.float8_e5m2 and x2_fp8._data.dtype == torch.float8_e5m2
+            # scale will be filled in by the kernel, not using delayed scaling
+            x3_scale = torch.empty(1)
             res_bits = torch.ops.aten.add_float8_e5m2(
                 x1_fp8._data, x1_fp8._scale,
                 x2_fp8._data, x2_fp8._scale,
                 x3_scale)
-            res = Float8Tensor(res_bits, x3_scale, x1_fp8._flavor)
+            res = Float8Tensor(res_bits, x3_scale)
             return res
 
     # for all other ops, fall back to fp32
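With `_flavor` gone, the element dtype of `_data` is the single source of truth for which float8 format a Float8Tensor holds. A small round-trip sketch under that assumption; the scale value is chosen arbitrarily for illustration:

import torch
from float8_tensor import Float8Tensor

x = torch.randn(4, 4)
scale = torch.tensor([1.0])  # must be a 1-element float32 tensor per __new__

x_fp8 = Float8Tensor.from_float32(x, scale, torch.float8_e4m3fn)
print(x_fp8._data.dtype)          # torch.float8_e4m3fn, no separate flavor field

x_back = x_fp8.to_float32()
print((x - x_back).abs().max())   # small, nonzero quantization error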
