
Commit b30425b

rework
Signed-off-by: Jonas Kuebler <kuebj@amazon.com>
1 parent 296050a commit b30425b

File tree: 3 files changed (+27, -24 lines)


vllm/attention/backends/abstract.py

Lines changed: 6 additions & 0 deletions

@@ -34,6 +34,12 @@ class AttentionBackend(ABC):
     # makes sure the output tensor is allocated inside the cudagraph.
     accept_output_buffer: bool = False
 
+    # Whether this backend supports receiving pre-quantized query input.
+    # If True, the attention layer will handle query quantization instead
+    # of the backend, allowing torch.compile to fuse quantization with
+    # previous operations.
+    supports_quant_query_input: bool = False
+
     @staticmethod
     @abstractmethod
     def get_name() -> str:
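To make the new contract concrete, here is a minimal sketch (not part of the commit; the helper name is illustrative) of the gate this flag enables: the attention layer only takes over query quantization when the KV cache uses an fp8 dtype and the selected backend has opted in, which is exactly the condition added to layer.py below.

# Illustrative sketch only -- this helper does not exist in vLLM.
def layer_handles_query_quant(kv_cache_dtype: str, backend_cls) -> bool:
    # Mirrors the check added to Attention.__init__ in layer.py below:
    # fp8 KV cache AND a backend that declares supports_quant_query_input.
    return (kv_cache_dtype.startswith("fp8")
            and getattr(backend_cls, "supports_quant_query_input", False))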

vllm/attention/layer.py

Lines changed: 19 additions & 16 deletions

@@ -22,7 +22,10 @@
 from vllm.model_executor.layers.linear import UnquantizedLinearMethod
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
+from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    GroupShape)
 from vllm.model_executor.models.vision import get_vit_attn_backend
 from vllm.platforms import _Backend, current_platform
 from vllm.utils import GiB_bytes, direct_register_custom_op
@@ -247,6 +250,13 @@ def __init__(
                 "This may be caused by insufficient memory to allocate "
                 "kv cache.") from e
 
+        # for attn backends supporting query quantization
+        self.query_quant = None
+        if self.kv_cache_dtype.startswith(
+                "fp8") and self.attn_backend.supports_quant_query_input:
+            self.query_quant = QuantFP8(static=True,
+                                        group_shape=GroupShape.PER_TENSOR)
+
     def forward(
         self,
         query: torch.Tensor,
@@ -270,6 +280,15 @@ def forward(
             attn_metadata = get_forward_context().attn_metadata
             if attn_metadata.enable_kv_scales_calculation:
                 self.calc_kv_scales(query, key, value)
+
+        if self.query_quant is not None:
+            # quantizing with a simple torch operation enables
+            # torch.compile to fuse this into previous ops
+            # which reduces overheads during decoding.
+            # Otherwise queries are quantized using custom ops
+            # which causes decoding overheads
+            query, _ = self.query_quant.forward_native(query, self._q_scale)
+
         if self.use_output:
             output_shape = (output_shape
                             if output_shape is not None else query.shape)
@@ -278,22 +297,6 @@ def forward(
                                  device=query.device)
             hidden_size = output_shape[-1]
 
-            if envs.VLLM_FUSE_QUERY_QUANT and self.kv_cache_dtype != "auto":
-                # quantizing with a simple torch operation enables
-                # torch.compile to fuse this into previous ops
-                # which reduces overheads during decoding.
-                # Otherwise queries are quantized using custom ops
-                # which causes decoding overheads
-                assert self._q_scale.numel() == 1
-                if self.kv_cache_dtype in ["fp8", "fp8_e4m3"]:
-                    query = (query / self._q_scale).to(torch.float8_e4m3fn)
-                elif self.kv_cache_dtype == "fp8_e5m2":
-                    query = (query / self._q_scale).to(torch.float8_e5m2)
-                else:
-                    raise NotImplementedError(
-                        "VLLM_FUSE_QUERY_QUANT only supported for fp8_e4m3 "
-                        "and fp8_e5m2")
-
             # We skip reshaping query, key and value tensors for the MLA
             # backend since these tensors have different semantics and are
             # processed differently.
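For intuition, below is a rough standalone sketch of the static per-tensor FP8 quantization that QuantFP8.forward_native performs here, assuming the e4m3 dtype as in the removed inline path; the real implementation also returns the scale and may differ in details. Because these are plain torch ops, torch.compile can fuse them into the preceding projection, whereas an opaque custom quantization op cannot be fused and adds per-step overhead during decode.

import torch

def quantize_query_per_tensor(query: torch.Tensor,
                              q_scale: torch.Tensor) -> torch.Tensor:
    # Rough sketch (assumed e4m3), mirroring the inline code removed from
    # forward() above; not the actual QuantFP8 implementation.
    assert q_scale.numel() == 1  # one static scale for the whole tensor
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    # Plain torch ops: divide by the scale, clamp to the representable
    # range, then cast. torch.compile can fuse this with earlier ops.
    scaled = (query / q_scale).clamp(min=-fp8_max, max=fp8_max)
    return scaled.to(torch.float8_e4m3fn)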

vllm/v1/attention/backends/flash_attn.py

Lines changed: 2 additions & 8 deletions

@@ -7,7 +7,6 @@
 import numpy as np
 import torch
 
-from vllm import _custom_ops as ops
 from vllm import envs
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionMetadata, AttentionType,
@@ -38,6 +37,7 @@
 class FlashAttentionBackend(AttentionBackend):
 
     accept_output_buffer: bool = True
+    supports_quant_query_input: bool = True
 
     @classmethod
     def get_supported_dtypes(cls) -> list[torch.dtype]:
@@ -506,17 +506,11 @@ def forward(
         )
 
         if self.kv_cache_dtype.startswith("fp8"):
+            # queries are quantized in the attention layer
            dtype = FlashAttentionBackend.get_fp8_dtype_for_flashattn(
                self.kv_cache_dtype)
            key_cache = key_cache.view(dtype)
            value_cache = value_cache.view(dtype)
-            if not envs.VLLM_FUSE_QUERY_QUANT:
-                num_tokens, num_heads, head_size = query.shape
-                query, _ = ops.scaled_fp8_quant(
-                    query.reshape(
-                        (num_tokens, num_heads * head_size)).contiguous(),
-                    layer._q_scale)
-                query = query.reshape((num_tokens, num_heads, head_size))
 
         if not attn_metadata.use_cascade:
             cu_seqlens_q = attn_metadata.query_start_loc
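For contrast, a hedged sketch of the backend-side path this diff removes versus what the backend now assumes; the function names are illustrative, and the custom-op call is copied from the deleted lines above.

import torch

def query_for_backend_old(query: torch.Tensor,
                          q_scale: torch.Tensor) -> torch.Tensor:
    # Removed path (sketch): quantize inside the FlashAttention backend via
    # a custom op. The reshape/contiguous and the opaque op run on every
    # decode step and cannot be fused by torch.compile.
    from vllm import _custom_ops as ops
    num_tokens, num_heads, head_size = query.shape
    query_2d = query.reshape((num_tokens, num_heads * head_size)).contiguous()
    query_fp8, _ = ops.scaled_fp8_quant(query_2d, q_scale)
    return query_fp8.reshape((num_tokens, num_heads, head_size))

def query_for_backend_new(query: torch.Tensor) -> torch.Tensor:
    # New path (sketch): when the KV cache is fp8, the attention layer has
    # already quantized the query via QuantFP8, so the backend only views
    # the KV cache as the matching fp8 dtype and passes the query through.
    return query

The design choice is to let the layer own query quantization whenever a backend declares supports_quant_query_input, so the quantization stays visible to torch.compile instead of hiding behind a backend-local custom op gated by VLLM_FUSE_QUERY_QUANT.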
