Commit 450f140

Author: Daniel Vega-Myhre
Commit message: address comments
1 parent 2af8c14 commit 450f140

File tree: 3 files changed (+22 additions, -74 deletions)

torchao/prototype/float8nocompile/float8nocompile_linear.py

Lines changed: 9 additions & 47 deletions
@@ -41,20 +41,11 @@ def __init__(self, *args, **kwargs):
         Additional arguments on top of `torch.nn.Linear`'s arguments:
         * `config`: Float8LinearConfig
         """
-
-        # Amax scales should always be kept as float32.
-        self.always_float32_buffers = set()
         config = kwargs.pop("config")
         emulate = config.emulate
         super().__init__(*args, **kwargs)
 
-        # Defines the scaling behavior of input, weight, grad_output
-        self.scaling_type_input = config.cast_config_input.scaling_type
-        self.scaling_type_weight = config.cast_config_weight.scaling_type
-        self.scaling_type_grad_output = config.cast_config_grad_output.scaling_type
-
         self.config = config
-        self.is_amax_initialized = not self.config.enable_amax_init
 
         self.linear_mm_config = LinearMMConfig(
             # output
@@ -81,31 +72,18 @@ def __init__(self, *args, **kwargs):
         )
 
     def forward(self, input: torch.Tensor) -> torch.Tensor:
-        # TODO(danielvegamyhre): modify to support for FSDP once dependencies are implemented
-        output = self.forward_fp8_matmul(input)
-        if self.bias is not None:
-            output = output + self.bias.to(output.dtype)
-        return output
-
-    def forward_fp8_matmul(self, input: torch.Tensor) -> torch.Tensor:
-        # perform hp to fp8 conversions
-        # TODO(danielvegamyhre): replace conversion with triton kernels
-        input_fp8 = self.cast_input_to_float8(input, self.is_amax_initialized)
-        weight_scale = self.get_weight_scale(self.weight)
-        weight_fp8_t = self.cast_weight_to_float8_t(
-            self.weight, self.is_amax_initialized, weight_scale
-        )
+        # TODO(danielvegamyhre): replace conversions with triton kernels
+        # TODO(danielvegamyhre): support for FSDP once dependencies are implemented
+        input_fp8 = self.cast_input_to_float8(input)
+        weight_fp8_t = self.cast_weight_to_float8_t(self.weight)
 
         # compute fp8 matmul
         output = manual_float8_matmul_with_args_in_float8.apply(input_fp8, weight_fp8_t)
 
         # cast grad_output to float8_e5m2 during backward
-        # TODO(danielvegamyhre): replace with triton kernel
         return self.cast_output_to_float8_in_bw(output)
 
-    def cast_input_to_float8(
-        self, input: torch.Tensor, is_amax_initialized: bool
-    ) -> torch.Tensor:
+    def cast_input_to_float8(self, input: torch.Tensor) -> torch.Tensor:
         # Duplicate the autocast logic for F.linear, so that the output
         # of our module has the right original precision
         if torch.is_autocast_enabled():
@@ -122,32 +100,21 @@ def cast_input_to_float8(
             gemm_input_role=GemmInputRole.INPUT,
         )
 
-    def get_weight_scale(self, weight: torch.Tensor) -> Optional[torch.Tensor]:
-        # TODO(danielvegamyhre): replace scale calculation with triton kernel
-        if tensor_already_casted_to_fp8(weight):
-            return None
-        return tensor_to_scale(weight, self.config.cast_config_weight.target_dtype)
-
     def cast_weight_to_float8_t(
         self,
         weight: torch.Tensor,
-        is_amax_initialized: bool,
-        weight_scale: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        if tensor_already_casted_to_fp8(weight):
-            return weight.t()
-
         # TODO(danielvegamyhre): replace conversion with triton kernel
-        weight_fp8 = hp_tensor_and_scale_to_float8(
+        weight_fp8 = hp_tensor_to_float8nocompile_dynamic(
             weight,
-            weight_scale,
             self.config.cast_config_weight.target_dtype,
             self.linear_mm_config,
             gemm_input_role=GemmInputRole.WEIGHT,
         )
         return weight_fp8.t()
 
     def cast_output_to_float8_in_bw(self, output: torch.Tensor) -> torch.Tensor:
+        # casts grad_output to float8_e5m2 for backward
         # TODO(danielvegamyhre): replace conversion with triton kernel
         return NoopFwToFloat8BwDynamic.apply(
             output,
@@ -156,20 +123,15 @@ def cast_output_to_float8_in_bw(self, output: torch.Tensor) -> torch.Tensor:
         )
 
     @classmethod
-    def from_float(
-        cls,
-        mod,
-        config: Optional[Float8LinearConfig] = None,
-    ):
+    def from_float(cls, mod):
         """
         Create an nn.Linear with fp8 compute from a regular nn.Linear
 
         Args:
             mod (torch.nn.Linear): nn.Linear to convert
             config (Optional[Float8LinearConfig]): configuration for conversion to float8
         """
-        if config is None:
-            config = Float8LinearConfig()
+        config = Float8LinearConfig()
         with torch.device("meta"):
             new_mod = cls(
                 mod.in_features,
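
For readers following the API change above: after this commit, Float8LinearNoCompile.from_float takes only the source module and always builds a default Float8LinearConfig internally. The snippet below is a minimal usage sketch, not part of the commit; it assumes a CUDA device with float8 support and that the prototype import path matches this revision.

import torch
from torch import nn

# assumed import path for this prototype at the time of this commit
from torchao.prototype.float8nocompile.float8nocompile_linear import Float8LinearNoCompile

# start from a regular high-precision linear layer (bias omitted, since the
# simplified forward in this diff does not add a bias term)
lin = nn.Linear(256, 512, bias=False, device="cuda", dtype=torch.bfloat16)

# config can no longer be passed in; the default Float8LinearConfig is used
fp8_lin = Float8LinearNoCompile.from_float(lin)

x = torch.randn(32, 256, device="cuda", dtype=torch.bfloat16, requires_grad=True)
y = fp8_lin(x)       # input and weight are dynamically cast to float8 in the forward pass
y.sum().backward()   # grad_output is cast to float8_e5m2 during backward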

torchao/prototype/float8nocompile/float8nocompile_linear_utils.py

Lines changed: 1 addition & 7 deletions
@@ -24,7 +24,6 @@ def convert_to_float8_nocompile_training(
     module: nn.Module,
     *,
     module_filter_fn: Optional[Callable[[nn.Module, str], bool]] = None,
-    config: Float8LinearConfig = None,
 ) -> nn.Module:
     """
     Swaps `torch.nn.Linear` in `module` with `Float8LinearNoCompile`.
@@ -39,12 +38,7 @@ def convert_to_float8_nocompile_training(
     Returns:
         nn.Module: The modified module with swapped linear layers.
     """
-    if config is None:
-        config = Float8LinearConfig()
-    from_float = lambda m: Float8LinearNoCompile.from_float(
-        m,
-        config=config,
-    )
+    from_float = lambda m: Float8LinearNoCompile.from_float(m)
     return swap_linear_layers(
         module,
         from_float,
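
The conversion helper now mirrors that simplification: no config argument, only an optional module_filter_fn. Below is a hedged usage sketch, not from the commit; it assumes a CUDA device with float8 support.

import torch
from torch import nn

from torchao.prototype.float8nocompile.float8nocompile_linear_utils import (
    convert_to_float8_nocompile_training,
)

model = nn.Sequential(
    nn.Linear(1024, 4096, bias=False),
    nn.ReLU(),
    nn.Linear(4096, 1024, bias=False),
).to(device="cuda", dtype=torch.bfloat16)

# module_filter_fn receives (module, fully_qualified_name) and returns True for
# layers that should be swapped; here the last projection ("2") is left in bf16
def module_filter_fn(mod: nn.Module, fqn: str) -> bool:
    return fqn != "2"

model = convert_to_float8_nocompile_training(model, module_filter_fn=module_filter_fn)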

torchao/prototype/float8nocompile/float8nocompile_scaling_utils.py

Lines changed: 12 additions & 20 deletions
@@ -15,23 +15,23 @@
 from torchao.float8.config import ScalingGranularity
 from torchao.float8.distributed_utils import tensor_already_casted_to_fp8
 from torchao.float8.float8_tensor import (
+    _ToFloat8ConstrFunc,
     Float8Tensor,
     GemmInputRole,
-    hp_tensor_and_scale_to_float8,
     LinearMMConfig,
 )
 from torchao.float8.float8_utils import tensor_to_scale
 
+# avoid division by zero when calculating scale
+# TODO: align this value with NVIDIA's assumptions (current value is a guess)
+EPS = 1e-12
+
 
 def hp_tensor_to_float8nocompile_dynamic(
     hp_tensor: torch.Tensor,
     float8_dtype: torch.dtype,
     linear_mm_config: LinearMMConfig,
-    reduce_amax: bool = False,
     gemm_input_role: GemmInputRole = GemmInputRole.INPUT,
-    device_mesh=None,
-    scaling_granularity: ScalingGranularity = ScalingGranularity.TENSORWISE,
-    axiswise_dim: Optional[int] = None,
 ) -> Float8Tensor:
     """
     Given a high precision tensor `hp_tensor`,
@@ -42,28 +42,20 @@ def hp_tensor_to_float8nocompile_dynamic(
         float8_dtype: the float8 dtype to use
         linear_mm_config: Defines the configuration for the scaled_mm for
           the 3 fwd/bwd gemms of linear
-        reduce_amax: whether to reduce the max(abs(hp_tensor)) value across distributed ranks
         gemm_input_role: Defines the role of this tensor (input, weight or grad_output) in
           the 3 fwd/bwd gemms of linear
-        scaling_granularity: Defines the scaling granularity
-        axiswise_dim: if axiswise granularity is used, defines the dim to scale across
     """
     # TODO(danielvegamyhre): replace this torch implementation with custom triton kernel
-    if tensor_already_casted_to_fp8(hp_tensor):
-        return hp_tensor
-    scale = tensor_to_scale(
-        hp_tensor,
-        float8_dtype,
-        reduce_amax,
-        device_mesh,
-        scaling_granularity,
-        axiswise_dim,
-    )
-    return hp_tensor_and_scale_to_float8(
+    # torch.compile and eager show different numerics for 1.0 / float32,
+    # upcast to float64 to ensure same numeric between compile and eager
+    amax = torch.max(torch.abs(hp_tensor)).to(torch.float64)
+    scale = torch.finfo(float8_dtype).max / torch.clamp(amax, min=EPS)
+    scale = scale.to(torch.float32)  # scale must be fp32
+    return _ToFloat8ConstrFunc.apply(
         hp_tensor,
         scale,
         float8_dtype,
         linear_mm_config,
         gemm_input_role,
-        axiswise_dim,
+        None,
     )
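
To see what the new scaling path computes, here is a small self-contained sketch, not from the commit, of tensorwise dynamic scaling with the same formula: amax is taken over the whole tensor and upcast to float64, the scale is finfo(float8_dtype).max / clamp(amax, min=EPS) kept in float32, and the tensor is quantized with that scale. It round-trips through float8 with plain torch ops instead of _ToFloat8ConstrFunc/Float8Tensor, so it only illustrates the numerics.

import torch

EPS = 1e-12  # same guard against division by zero as in the commit
float8_dtype = torch.float8_e4m3fn
fp8_max = torch.finfo(float8_dtype).max

x = torch.randn(64, 64, dtype=torch.bfloat16)

# tensorwise dynamic scale, mirroring hp_tensor_to_float8nocompile_dynamic
amax = torch.max(torch.abs(x)).to(torch.float64)
scale = (fp8_max / torch.clamp(amax, min=EPS)).to(torch.float32)

# quantize (with a saturating clamp to the representable range) and dequantize
x_fp8 = (x.float() * scale).clamp(min=-fp8_max, max=fp8_max).to(float8_dtype)
x_roundtrip = x_fp8.float() / scale

print("scale:", scale.item())
print("max abs rounding error:", (x.float() - x_roundtrip).abs().max().item())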
