
Commit ce27def

Revert "Nvfp4 static gs (#61)"

This reverts commit c4ef813.

1 parent 310c908
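Context for the revert: "Nvfp4 static gs (#61)" threaded a precomputed ("static") per-tensor global scale, calibrated offline and stored with the checkpoint, through the activation quantize-dequantize path (qdq_nvfp4_with_gs). Reverting restores the dynamic path, where qdq_nvfp4 recomputes the global scale from each activation tensor's absolute maximum. A minimal sketch of the difference, assuming the usual NVFP4 convention for per_tensor_amax_to_scale (the FP8-E4M3 and FP4-E2M1 range constants below are assumptions, not taken from this diff):

import torch

# Assumed NVFP4 convention: the global scale maps a tensor's amax into the
# range representable by FP4-E2M1 blocks with FP8-E4M3 block scales.
FLOAT4_E2M1_MAX = 6.0    # largest finite E2M1 value
FLOAT8_E4M3_MAX = 448.0  # largest finite E4M3 value

def per_tensor_amax_to_scale(amax: torch.Tensor) -> torch.Tensor:
    return FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / amax

x = torch.randn(4, 64)

# Dynamic global scale (restored by this revert): derived from x itself,
# so it tracks the live activation range on every call.
dynamic_gs = per_tensor_amax_to_scale(x.abs().max())

# Static global scale (the reverted #61 behavior): calibrated offline and
# loaded as a checkpoint tensor such as layer.input_global_scale, so the
# same value is reused for every batch. The 100.0 here is hypothetical.
static_gs = torch.tensor(100.0)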

4 files changed: +16 additions, −71 deletions

examples/offline_inference/basic/basic_hpu.py
Lines changed: 0 additions & 4 deletions

@@ -41,9 +41,6 @@
 # model_path = "/software/users/yiliu4/deepseek-ai/DeepSeek-R1-NVFP4-OFFLINE"
 model_path = "/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-NVFP4-RTN"
 
-# model_path = "/software/users/yiliu4/HF_HOME/Yi30/DeepSeek-V2-Lite-NVFP4-W4A4-RTN-GLOBAL-SCALE-WW"
-
-
 import os
 
 os.environ["PT_HPU_ENABLE_LAZY_COLLECTIVES"] = "true"

@@ -123,7 +120,6 @@ def main(args):
     # Create a sampling params object.
     max_model_len = 2048
     model_path = args.model_path
-    print(f"model_path: {model_path}")
     llm = LLM(
         model=model_path,
         # quantization="inc",

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
Lines changed: 8 additions & 18 deletions

@@ -88,19 +88,13 @@ def get_moe_method(
 
 
 def nvfp4_unpacked_weight_gemm(
-    x,
-    weight,
-    weight_scale,
-    weight_global_scale,
-    input_global_scale=None,
+    x, weight_unpacked, weight_scale, weight_global_scale
 ):
-    weight_unpacked = weight
     # return self.run_nvfp4_emulations(x, layer)
     from vllm.model_executor.layers.quantization.utils.nvfp4_qdq import (
         unpacked_nvfp4_to_fp8,
         dequant_nvfp4,
         qdq_nvfp4,
-        qdq_nvfp4_with_gs,
     )
 
     # bs, seq_len, hidden_size = x.shape

@@ -113,7 +107,8 @@ def nvfp4_unpacked_weight_gemm(
         packed=False,
     )
 
-    x = qdq_nvfp4_with_gs(x, input_global_scale)
+    # breakpoint()
+    x = qdq_nvfp4(x)
     out = x @ hp_weight.t()
     # out = out.reshape(bs, seq_len, -1)
     return out

@@ -290,7 +285,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
                 layer.w13_weight_global_scale[:, 1]):
             logger.warning_once(
                 "w1_weight_global_scale must match w3_weight_global_scale. "
-                f"Accuracy may be affected. {getattr(layer, 'layer_name', '')}")
+                "Accuracy may be affected.")
 
         # Take inverse of global scale saved to disk
         layer.w13_weight_scale_2 = torch.nn.Parameter(

@@ -480,19 +475,16 @@ def apply(
             ]
             local_w3_global_scale = local_w13_global_scale[1]
             local_w3_input_global_scale = local_w13_input_global_scale[1]
-
-            # breakpoint()
+
             local_w1_out = nvfp4_unpacked_weight_gemm(
                 x=current_state_static,
-                input_global_scale=local_w1_input_global_scale,
-                weight=local_w1_unpacked,
+                weight_unpacked=local_w1_unpacked,
                 weight_scale=local_w1_scale,
                 weight_global_scale=local_w1_global_scale,
             )
             local_w3_out = nvfp4_unpacked_weight_gemm(
                 x=current_state_static,
-                input_global_scale=local_w3_input_global_scale,
-                weight=local_w3_unpacked,
+                weight_unpacked=local_w3_unpacked,
                 weight_scale=local_w3_scale,
                 weight_global_scale=local_w3_global_scale,
             )

@@ -501,12 +493,10 @@ def apply(
 
             local_w2_out = nvfp4_unpacked_weight_gemm(
                 x=w13_out,
-                input_global_scale=local_w2_input_global_scale,
-                weight=local_w2_unpacked,
+                weight_unpacked=local_w2_unpacked,
                 weight_scale=local_w2_scale,
                 weight_global_scale=local_w2_global_scale,
             )
-
             padded_weight = experts_mask[expert_index + ep_shift].unsqueeze(
                 1
             )
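After the revert, nvfp4_unpacked_weight_gemm takes an already-unpacked weight and no activation scale; the activation is quantize-dequantized dynamically inside the call. A runnable sketch of that control flow, with trivial stand-ins for the dequant/QDQ helpers from nvfp4_qdq.py (the stand-ins are assumptions; the real helpers reconstruct high-precision tensors from FP4 data):

import torch

# Stand-in for dequant_nvfp4: the real helper rebuilds a high-precision
# weight from unpacked FP4 codes, block scales, and the global scale.
def dequant_nvfp4_stub(w, w_scale, w_gs, original_dtype, packed=False):
    return (w * w_scale / w_gs).to(original_dtype)

# Stand-in for qdq_nvfp4: the real helper quantizes x to NVFP4 with a
# dynamically computed global scale, then dequantizes it back.
def qdq_nvfp4_stub(x):
    return x

def nvfp4_unpacked_weight_gemm(x, weight_unpacked, weight_scale,
                               weight_global_scale):
    # Post-revert signature: no input_global_scale parameter.
    hp_weight = dequant_nvfp4_stub(weight_unpacked, weight_scale,
                                   weight_global_scale,
                                   original_dtype=x.dtype)
    x = qdq_nvfp4_stub(x)  # dynamic activation QDQ
    return x @ hp_weight.t()

out = nvfp4_unpacked_weight_gemm(
    x=torch.randn(8, 64),
    weight_unpacked=torch.randn(32, 64),  # toy stand-in for FP4 codes
    weight_scale=torch.tensor(0.5),
    weight_global_scale=torch.tensor(2.0),
)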

vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
Lines changed: 2 additions & 2 deletions

@@ -212,7 +212,6 @@ def apply_weights(self,
         from vllm.model_executor.layers.quantization.utils.nvfp4_qdq import (
             dequant_nvfp4,
             qdq_nvfp4,
-            qdq_nvfp4_with_gs
         )
 
         need_reshape = False

@@ -230,7 +229,8 @@ def apply_weights(self,
             packed=False,
         )
 
-        x = qdq_nvfp4_with_gs(x, layer.input_global_scale)
+        # breakpoint()
+        x = qdq_nvfp4(x)
         out = x @ hp_weight.t()
         if need_reshape:
             out = out.reshape(bs, seq_len, -1)

vllm/model_executor/layers/quantization/utils/nvfp4_qdq.py
Lines changed: 6 additions & 47 deletions

@@ -265,30 +265,18 @@ def nvfp4_quantize(
     return out_scales, data_lp
 
 
-def to_nvfp4(x, x_global_scale=None, do_pack=True):
-    if x_global_scale is None:
-        tensor_amax = torch.max(torch.abs(x))
-        per_tensor_scale = per_tensor_amax_to_scale(tensor_amax)
-        x_global_scale = per_tensor_scale
+def to_nvfp4(x, do_pack=True):
+    tensor_amax = torch.max(torch.abs(x))
+    per_tensor_scale = per_tensor_amax_to_scale(tensor_amax)
     out_scales, data_lp = nvfp4_quantize(
         data_hp=x,
         block_size=16,
-        per_tensor_scale=x_global_scale,
+        per_tensor_scale=per_tensor_scale,
         do_pack=do_pack,
     )
     return data_lp, out_scales, per_tensor_scale
 
 
-def to_nvfp4_with_gs(x, x_global_scale, do_pack=True):
-    out_scales, data_lp = nvfp4_quantize(
-        data_hp=x,
-        block_size=16,
-        per_tensor_scale=x_global_scale,
-        do_pack=do_pack,
-    )
-    return data_lp, out_scales
-
-
 def dequant_nvfp4(
     data_lp,
     out_scales,

@@ -329,26 +317,11 @@ def check_nan(x):
     return torch.isnan(x).any() or torch.isinf(x).any()
 
 
-def qdq_nvfp4(x, x_global_scale=None):
-    if envs.VLLM_DISABLE_INPUT_QDQ:
-        return x
-
-    data_lp, x_scale = to_nvfp4(x, x_global_scale, do_pack=False)
-    x_dq = dequant_nvfp4(
-        data_lp,
-        x_scale,
-        x_global_scale,
-        original_dtype=x.dtype,
-        packed=False,
-    )
-    return x_dq
-
-
-def qdq_nvfp4(x, x_global_scale=None):
+def qdq_nvfp4(x):
     if envs.VLLM_DISABLE_INPUT_QDQ:
         return x
 
-    data_lp, x_scale = to_nvfp4_with_gs(x, x_global_scale, do_pack=False)
+    data_lp, x_scale, x_global_scale = to_nvfp4(x, do_pack=False)
     x_dq = dequant_nvfp4(
         data_lp,
         x_scale,

@@ -359,20 +332,6 @@ def qdq_nvfp4(x, x_global_scale=None):
     return x_dq
 
 
-def qdq_nvfp4_with_gs(x, x_global_scale):
-    if envs.VLLM_DISABLE_INPUT_QDQ:
-        return x
-
-    data_lp, x_scale = to_nvfp4_with_gs(x, x_global_scale, do_pack=False)
-    x_dq = dequant_nvfp4(
-        data_lp,
-        x_scale,
-        x_global_scale,
-        original_dtype=x.dtype,
-        packed=False,
-    )
-    return x_dq
-
 class NVFP4Linear(torch.nn.Module):
     def __init__(self, in_features, out_features):
         super().__init__()
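For reference, the restored dynamic round trip can be emulated end to end. The sketch below is a self-contained toy, not this file's implementation: it mirrors the to_nvfp4 → dequant_nvfp4 flow of qdq_nvfp4, but simulates the FP4-E2M1 cast by rounding to the eight representable magnitudes instead of packing 4-bit payloads, and the range constants are assumptions:

import torch

FLOAT4_E2M1_MAX = 6.0    # assumed largest finite E2M1 value
FLOAT8_E4M3_MAX = 448.0  # assumed largest finite E4M3 value
E2M1_VALUES = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])

def per_tensor_amax_to_scale(amax):
    return FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / amax

def round_to_e2m1(t):
    # Snap each magnitude to the nearest representable E2M1 value.
    diffs = (t.abs().unsqueeze(-1) - E2M1_VALUES).abs()
    return E2M1_VALUES[diffs.argmin(dim=-1)] * torch.sign(t)

def qdq_nvfp4_toy(x, block_size=16):
    orig_dtype = x.dtype
    x = x.float()
    gs = per_tensor_amax_to_scale(x.abs().max())  # dynamic global scale
    blocks = x.reshape(-1, block_size)
    block_amax = blocks.abs().amax(dim=-1, keepdim=True)
    # Block scales live in FP8-E4M3 in the real format; emulate the cast.
    block_scale = (block_amax / FLOAT4_E2M1_MAX) * gs
    block_scale = block_scale.to(torch.float8_e4m3fn).float().clamp(min=1e-12)
    q = round_to_e2m1(blocks * gs / block_scale)  # quantize to the E2M1 grid
    dq = q * block_scale / gs                     # dequantize back
    return dq.reshape(x.shape).to(orig_dtype)

x = torch.randn(2, 64, dtype=torch.bfloat16)
max_err = (qdq_nvfp4_toy(x).float() - x.float()).abs().max()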
