Commit 3a7bcb2

[Float8] Fix serialization of dynamic activation fp8
1 parent aac19a1 · commit 3a7bcb2

File tree

2 files changed: +88 -17 lines changed

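Background on the failure this commit addresses: torch.save pickles the model's state dict, and pickle cannot serialize a function that is defined inside another function. Before this change, the dynamic-activation path stored such a nested input_quant_func on the quantized weight tensor, so saving a dynamically quantized model failed; hoisting that function to module level and binding its arguments with functools.partial keeps it picklable. Below is a minimal sketch of the underlying pickle behavior, independent of torchao (the names scale, make_closure, and scale_local are illustrative, not from the codebase):

import functools
import pickle


def scale(x, factor):
    # A module-level function: pickle stores a reference to it by qualified name.
    return x * factor


def make_closure(factor):
    # A nested function has no importable qualified name, so pickle rejects it.
    def scale_local(x):
        return x * factor

    return scale_local


# A partial over a module-level function round-trips fine.
bound = functools.partial(scale, factor=2.0)
restored = pickle.loads(pickle.dumps(bound))
print(restored(3.0))  # 6.0

# The closure fails to pickle.
try:
    pickle.dumps(make_closure(2.0))
except (pickle.PicklingError, AttributeError) as exc:
    print(f"closure is not picklable: {exc}")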

test/dtypes/test_affine_quantized_float.py

Lines changed: 62 additions & 0 deletions
@@ -24,6 +24,7 @@
 from functools import partial
 from typing import Tuple
 from contextlib import nullcontext
+import io
 
 
 random.seed(0)
@@ -142,6 +143,67 @@ def test_per_row_with_float32(self):
             model, float8_dynamic_activation_float8_weight(granularity=PerRow())
         )
 
+    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not is_cuda_8_9, "Requires GPU with compute capability >= 8.9")
+    @common_utils.parametrize("mode", ["dynamic", "weight-only"])
+    def test_serialization(self, mode: str):
+        # Create and quantize the model
+        model = ToyLinearModel(16, 32).to(device="cuda")
+        if mode == "dynamic":
+            factory = float8_dynamic_activation_float8_weight()
+        else:
+            factory = float8_weight_only()
+        quantize_(model, factory)
+
+        # Save the state dict to an in-memory buffer
+        buffer = io.BytesIO()
+        torch.save(model.state_dict(), buffer)
+
+        # Reset the buffer position
+        buffer.seek(0)
+
+        # Load the state dict from the buffer
+        loaded_state_dict = torch.load(buffer)
+
+        # Create a new model and load the state dict
+        with torch.device("meta"):
+            new_model = ToyLinearModel(16, 32)
+        new_model.load_state_dict(loaded_state_dict, assign=True)
+
+        # Compare the original and loaded models
+        if mode == "weight-only":
+            model_weight_1 = model.linear1.weight.layout_tensor.float8_data.to(
+                torch.float32
+            )
+            new_model_weight_1 = new_model.linear1.weight.layout_tensor.float8_data.to(
+                torch.float32
+            )
+
+            model_weight_2 = model.linear2.weight.layout_tensor.float8_data.to(
+                torch.float32
+            )
+            new_model_weight_2 = new_model.linear2.weight.layout_tensor.float8_data.to(
+                torch.float32
+            )
+
+        else:
+            model_weight_1 = model.linear1.weight.original_weight_tensor.layout_tensor.float8_data.to(
+                torch.float32
+            )
+            new_model_weight_1 = new_model.linear1.weight.original_weight_tensor.layout_tensor.float8_data.to(
+                torch.float32
+            )
+
+            model_weight_2 = model.linear2.weight.original_weight_tensor.layout_tensor.float8_data.to(
+                torch.float32
+            )
+            new_model_weight_2 = new_model.linear2.weight.original_weight_tensor.layout_tensor.float8_data.to(
+                torch.float32
+            )
+
+        assert torch.allclose(model_weight_1, new_model_weight_1)
+        assert torch.allclose(model_weight_2, new_model_weight_2)
+
 
 common_utils.instantiate_parametrized_tests(TestAffineQuantizedFloat8Compile)

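Two details of the test worth noting: building new_model under torch.device("meta") allocates no real storage, and load_state_dict(..., assign=True) swaps the deserialized tensors in directly rather than copying into the existing parameters, which is what lets the quantized tensor subclasses survive the round trip intact. On newer PyTorch releases, where torch.load defaults to weights_only=True, loading these tensor subclasses would additionally require weights_only=False or allowlisting them via torch.serialization.add_safe_globals.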
torchao/quantization/quant_api.py

Lines changed: 26 additions & 17 deletions
@@ -671,6 +671,27 @@ def _validate_granularity(
         raise ValueError(f"Invalid granularity specification: {granularity}, only PerTensor or PerRow are supported.")
 
 
+def _input_quant_func_dyanmic_fp8(
+    x: torch.Tensor,
+    activation_granularity: _fp8_granularities,
+    activation_dtype: torch.dtype,
+):
+    if isinstance(activation_granularity, PerRow):
+        assert (
+            x.dtype == torch.bfloat16
+        ), "PerRow quantization only works for bfloat16 precision input activation"
+
+    block_size = get_block_size(x, activation_granularity)
+    activation = to_affine_quantized_floatx(
+        input_float=x,
+        block_size=block_size,
+        target_dtype=activation_dtype,
+        scale_dtype=torch.float32,
+        layout_type=Float8LayoutType(mm_config=None),  # Config is stored on weight
+    )
+    return activation
+
+
 def float8_dynamic_activation_float8_weight(
     activation_dtype: torch.dtype = torch.float8_e4m3fn,
     weight_dtype: torch.dtype = torch.float8_e4m3fn,
@@ -723,23 +744,11 @@ def apply_float8_dynamic_activation_quant(weight: torch.Tensor):
             layout_type=Float8LayoutType(mm_config=mm_config),
         )
 
-        def input_quant_func(x: torch.Tensor):
-            if isinstance(activation_granularity, PerRow):
-                assert (
-                    x.dtype == torch.bfloat16
-                ), "PerRow quantization only works for bfloat16 precision input activation"
-
-            block_size = get_block_size(x, activation_granularity)
-            activation = to_affine_quantized_floatx(
-                input_float=x,
-                block_size=block_size,
-                target_dtype=activation_dtype,
-                scale_dtype=torch.float32,
-                layout_type=Float8LayoutType(
-                    mm_config=None
-                ),  # Config is stored on weight
-            )
-            return activation
+        input_quant_func = partial(
+            _input_quant_func_dyanmic_fp8,
+            activation_granularity=activation_granularity,
+            activation_dtype=activation_dtype,
+        )
 
         quantized_weight = to_linear_activation_quantized(
             quantized_weight, input_quant_func
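Design note on the change above: pickle serializes a functools.partial by reference to its module-level target plus the bound keyword arguments, while the nested input_quant_func it replaces could not be pickled at all. Swapping one for the other is the whole fix; the quantization math performed at call time is unchanged.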
