
Commit 4f6593b

[HybridKVCache][Platform] Add support_hybrid_kv_cache for platform (#24646)
Signed-off-by: MengqingCao <cmq0113@163.com>
1 parent: 94e6b2d

File tree

5 files changed: +20 -2 lines changed

  vllm/config/__init__.py
  vllm/platforms/cpu.py
  vllm/platforms/cuda.py
  vllm/platforms/interface.py
  vllm/platforms/rocm.py

vllm/config/__init__.py

Lines changed: 1 addition & 2 deletions
@@ -3529,8 +3529,7 @@ def __post_init__(self):
         # logger should only print warning message for hybrid models. As we
         # can't know whether the model is hybrid or not now, so we don't log
         # warning message here and will log it later.
-        if not (current_platform.is_cuda() or current_platform.is_rocm()
-                or current_platform.is_cpu()):
+        if not current_platform.support_hybrid_kv_cache():
             # Hybrid KV cache manager is not supported on non-GPU platforms.
             self.scheduler_config.disable_hybrid_kv_cache_manager = True
         if self.kv_transfer_config is not None:
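
For context, the hook is invoked through the platform object that vLLM resolves at import time. A minimal sketch of querying it directly, assuming a build that includes this commit:

from vllm.platforms import current_platform

# Expected to return True on CUDA, ROCm, and CPU after this change, and
# False on any platform that keeps the base-class default in interface.py.
print(current_platform.support_hybrid_kv_cache())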

vllm/platforms/cpu.py

Lines changed: 4 additions & 0 deletions
@@ -347,3 +347,7 @@ def default_v1(cls, model_config) -> bool:
     @classmethod
     def opaque_attention_op(cls) -> bool:
         return True
+
+    @classmethod
+    def support_hybrid_kv_cache(cls) -> bool:
+        return True

vllm/platforms/cuda.py

Lines changed: 4 additions & 0 deletions
@@ -571,6 +571,10 @@ def check_if_supports_dtype(cls, torch_dtype: torch.dtype):
                 "You can use float16 instead by explicitly setting the "
                 "`dtype` flag in CLI, for example: --dtype=half.")
 
+    @classmethod
+    def support_hybrid_kv_cache(cls) -> bool:
+        return True
+
 
 # NVML utils
 # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,

vllm/platforms/interface.py

Lines changed: 7 additions & 0 deletions
@@ -586,6 +586,13 @@ def check_if_supports_dtype(cls, torch_dtype: torch.dtype):
         """
         raise NotImplementedError
 
+    @classmethod
+    def support_hybrid_kv_cache(cls) -> bool:
+        """
+        Returns if the hybrid kv cache is supported by the current platform.
+        """
+        return False
+
 
 class UnspecifiedPlatform(Platform):
     _enum = PlatformEnum.UNSPECIFIED
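
Since the base class defaults to False, an out-of-tree platform opts in by overriding the classmethod. A sketch under that assumption (MyPlatform is hypothetical, not part of this commit):

from vllm.platforms.interface import Platform, PlatformEnum

class MyPlatform(Platform):
    # PlatformEnum.OOT is the enum value vLLM uses for out-of-tree platforms.
    _enum = PlatformEnum.OOT

    @classmethod
    def support_hybrid_kv_cache(cls) -> bool:
        # Opt in once the hybrid KV cache manager has been validated on
        # this backend; the inherited default is False.
        return True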

vllm/platforms/rocm.py

Lines changed: 4 additions & 0 deletions
@@ -498,3 +498,7 @@ def check_if_supports_dtype(cls, torch_dtype: torch.dtype):
             f"Your {gpu_name} GPU {compute_str}. "
             "You can use float16 instead by explicitly setting the "
             "`dtype` flag in CLI, for example: --dtype=half.")
+
+    @classmethod
+    def support_hybrid_kv_cache(cls) -> bool:
+        return True
