4 changes: 4 additions & 0 deletions vllm/platforms/cuda.py
@@ -530,6 +530,10 @@ def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str,
                 supported = flash_attn_supports_fp8()
             else:
                 supported = True
+        elif attention_backend == "FLASHINFER":
+            supported = True
+        elif attention_backend == "TRITON_ATTN_VLLM_V1":
+            supported = cls.supports_fp8()
         return supported
 
     @classmethod
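This hunk adds two backend branches to the fp8 KV-cache capability check: FLASHINFER always reports support, while TRITON_ATTN_VLLM_V1 defers to the platform-level fp8 capability. A minimal sketch of the resulting dispatch follows; the FLASH_ATTN_VLLM_V1 guard, the simplified signature, and the helper stubs are assumptions reconstructed from the hunk's context, not copied from the file.

def flash_attn_supports_fp8() -> bool:
    # Stub for the assumed helper referenced in the hunk's context lines.
    return True

class CudaPlatformSketch:
    @classmethod
    def supports_fp8(cls) -> bool:
        # Stub for the assumed platform-level fp8 capability check.
        return True

    @classmethod
    def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str,
                                    attention_backend: str) -> bool:
        supported = False
        if attention_backend == "FLASH_ATTN_VLLM_V1":
            # fp8 caches need kernel support; other dtypes always pass.
            if kv_cache_dtype.startswith("fp8"):
                supported = flash_attn_supports_fp8()
            else:
                supported = True
        elif attention_backend == "FLASHINFER":
            # New branch: FlashInfer accepts fp8 KV caches outright.
            supported = True
        elif attention_backend == "TRITON_ATTN_VLLM_V1":
            # New branch: Triton defers to the platform fp8 check.
            supported = cls.supports_fp8()
        return supported

# e.g. CudaPlatformSketch.is_kv_cache_dtype_supported("fp8", "FLASHINFER") -> True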
6 changes: 5 additions & 1 deletion vllm/v1/attention/backends/flashinfer.py
@@ -202,7 +202,11 @@ def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
         else:
             assert self.kv_cache_spec.dtype == self.model_config.dtype
             self.kv_cache_dtype = self.kv_cache_spec.dtype
-        self.q_data_type = self.kv_cache_dtype
+
+        if supports_trtllm_attention()[0]:
+            self.q_data_type = self.kv_cache_dtype
+        else:
+            self.q_data_type = self.model_config.dtype
 
         self._cascade_wrapper = None  # Wrapper for cascade attention
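This hunk stops forcing the query dtype to the KV-cache dtype unconditionally: the two now match only when TRTLLM attention kernels are available (supports_trtllm_attention() is read here as returning a tuple whose first element is that flag); otherwise queries stay in the model's compute dtype. A small hypothetical helper illustrating the selection; select_q_data_type and its arguments are illustrative names, not vLLM API.

import torch

def select_q_data_type(kv_cache_dtype: torch.dtype,
                       model_dtype: torch.dtype,
                       trtllm_available: bool) -> torch.dtype:
    # Mirror of the new branch: only pair queries with a (possibly fp8)
    # KV-cache dtype when TRTLLM attention can consume fp8 queries.
    return kv_cache_dtype if trtllm_available else model_dtype

# With an fp8 KV cache but no TRTLLM support, queries stay in bf16.
assert select_q_data_type(torch.float8_e4m3fn, torch.bfloat16,
                          trtllm_available=False) is torch.bfloat16

The apparent motivation is to avoid casting queries to fp8 on configurations whose attention kernels cannot exploit fp8 query inputs.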