@@ -25,25 +25,34 @@ def forward(
         ctx,
         x_fp8,
         w_fp8,
+        b_fp8,
         fp8_s_out,
         fp8_s_dL_dX,
         fp8_s_dL_dW,
         fp8_s_dL_dY,
     ):
-        ctx.save_for_backward(x_fp8, w_fp8, fp8_s_dL_dX, fp8_s_dL_dW, fp8_s_dL_dY)
-
-        res_bits = torch.ops.aten.mm_float8(
-            x_fp8._data, x_fp8._scale, x_fp8._flavor,
-            w_fp8._data.t(), w_fp8._scale, w_fp8._flavor,
-            fp8_s_out, E4M3)
+        ctx.save_for_backward(
+            x_fp8, w_fp8, b_fp8, fp8_s_dL_dX, fp8_s_dL_dW, fp8_s_dL_dY)
+        if b_fp8 is not None:
+            # TODO add this
+            res_bits = torch.ops.aten.addmm_float8(
+                b_fp8._data, b_fp8._scale, b_fp8._flavor,
+                x_fp8._data, x_fp8._scale, x_fp8._flavor,
+                w_fp8._data.t(), w_fp8._scale, w_fp8._flavor,
+                fp8_s_out, E4M3)
+        else:
+            res_bits = torch.ops.aten.mm_float8(
+                x_fp8._data, x_fp8._scale, x_fp8._flavor,
+                w_fp8._data.t(), w_fp8._scale, w_fp8._flavor,
+                fp8_s_out, E4M3)
 
         res = Float8Tensor(res_bits, fp8_s_out, E4M3)
         # scale update would also happen here, for now no-op
         return res
 
     @staticmethod
     def backward(ctx, go):
-        x_fp8, w_fp8, fp8_s_dL_dX, fp8_s_dL_dW, fp8_s_dL_dY = \
+        x_fp8, w_fp8, b_fp8, fp8_s_dL_dX, fp8_s_dL_dW, fp8_s_dL_dY = \
             ctx.saved_tensors
 
         if not isinstance(go, Float8Tensor):
@@ -69,7 +78,10 @@ def backward(ctx, go):
         dL_dW_fp8 = Float8Tensor(dL_dW_bits, fp8_s_dL_dW, E5M2)
 
         # scale update would also happen here, for now no-op
-        return dL_dX_fp8, dL_dW_fp8, None, None, None, None
+        if b_fp8 is not None:
+            return dL_dX_fp8, dL_dW_fp8, go_fp8, None, None, None, None
+        else:
+            return dL_dX_fp8, dL_dW_fp8, None, None, None, None, None
 
 
 class Float8Linear(torch.nn.Linear):
@@ -86,6 +98,7 @@ def __init__(self, *args, **kwargs):
         # or PTQ calibration.
         self.register_buffer('fp8_s_in', torch.tensor(1.0))
         self.register_buffer('fp8_s_weight', torch.tensor(1.0))
+        self.register_buffer('fp8_s_bias', torch.tensor(1.0))
         self.register_buffer('fp8_s_out', torch.tensor(1.0))
         self.register_buffer('fp8_s_dL_dX', torch.tensor(1.0))
         self.register_buffer('fp8_s_dL_dW', torch.tensor(1.0))
@@ -102,9 +115,13 @@ def forward(self, x):
         # TODO(future): switch to delayed scaling
         self.fp8_s_weight.fill_(tensor_to_scale(self.weight, E4M3))
         w_fp8 = Float8Tensor.from_float32(self.weight, self.fp8_s_weight, E4M3)
+        maybe_b_fp8 = None
+        if self.bias is not None:
+            self.fp8_s_bias.fill_(tensor_to_scale(self.bias, E4M3))
+            maybe_b_fp8 = Float8Tensor.from_float32(self.bias, self.fp8_s_bias, E4M3)
 
         y_fp8 = float8_linear_no_bias.apply(
-            x_fp8, w_fp8, self.fp8_s_out, self.fp8_s_dL_dX,
+            x_fp8, w_fp8, maybe_b_fp8, self.fp8_s_out, self.fp8_s_dL_dX,
             self.fp8_s_dL_dW, self.fp8_s_dL_dY)
 
         # For now, hardcode returning Float8Tensor (propagate as much as we can).
@@ -116,7 +133,7 @@ def from_float(cls, mod):
         """
         Create an nn.Linear with fp8 compute from a regular nn.Linear
         """
-        assert mod.bias is None, 'bias support not implemented yet'
         new_mod = cls(mod.in_features, mod.out_features, bias=False)
         new_mod.weight = mod.weight
+        new_mod.bias = mod.bias
         return new_mod
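
A note on why the tuple returned by backward grows from six to seven entries: torch.autograd.Function.backward must return exactly one gradient per argument of forward (excluding ctx), with None for inputs that need no gradient, and forward now takes the extra b_fp8 argument. Below is a minimal, self-contained sketch of that contract with the same optional-bias signature; it is a toy fp32 Function written for illustration, not the prototype's fp8 code.

import torch

# Toy Function mirroring the (x, w, b, *scales) signature of the patched
# float8_linear_no_bias above; it computes in plain fp32 and exists only to
# show the one-gradient-per-forward-input contract.
class toy_linear(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, w, b, s_out, s_dL_dX, s_dL_dW, s_dL_dY):
        ctx.save_for_backward(x, w, b)
        out = x @ w.t()
        if b is not None:
            out = out + b
        return out

    @staticmethod
    def backward(ctx, go):
        x, w, b = ctx.saved_tensors
        dX = go @ w          # gradient w.r.t. x
        dW = go.t() @ x      # gradient w.r.t. w
        dB = go.sum(0) if b is not None else None  # gradient w.r.t. b
        # Seven outputs: x, w, b, plus None for the four scale arguments.
        return dX, dW, dB, None, None, None, None

x = torch.randn(4, 16, requires_grad=True)
w = torch.randn(32, 16, requires_grad=True)
b = torch.randn(32, requires_grad=True)
s = torch.tensor(1.0)
y = toy_linear.apply(x, w, b, s, s, s, s)
y.sum().backward()
print(x.grad.shape, w.grad.shape, b.grad.shape)  # (4, 16), (32, 16), (32,)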
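A sketch, under assumptions, of how the new bias path would be exercised after this change: it assumes the prototype module defining Float8Linear is importable (the import path below is hypothetical), and the addmm_float8 branch is still marked TODO in the patch, so the bias path may not run end to end yet.

import torch
from float8_linear import Float8Linear  # hypothetical import path

# from_float no longer asserts that bias is None; the bias is copied onto the
# new module and quantized to E4M3 inside forward() when present.
m = Float8Linear.from_float(torch.nn.Linear(16, 32, bias=True))
y = m(torch.randn(4, 16))  # takes the addmm_float8 branch since bias is not None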