vllm-project · tlrmchlsmth · May 8, 2025 · Apr 29, 2025 · rasmith · Apr 29, 2025
@@ -124,11 +124,12 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         # These are used in the final Attention.forward()
         layer._q_scale.copy_(q_scale)
         layer._prob_scale.copy_(prob_scale)
-        if q_scale == 1.0 or prob_scale == 1.0:
+        if layer.kv_cache_dtype == "fp8" and (q_scale == 1.0
+                                              or prob_scale == 1.0):
             logger.warning_once(
-                f"Using Q scale {q_scale} and prob scale {prob_scale} "
-                "with fp8 attention. This may cause accuracy issues. "
-                "Please make sure Q/prob scaling factors are "
+                f"Using uncalibrated q_scale {q_scale} and/or prob_scale "
+                f"{prob_scale} with fp8 attention. This may cause accuracy "
+                "issues. Please make sure q/prob scaling factors are "
                 "available in the fp8 checkpoint.")
 
         del layer.k_scale