From 4f6593b058dc7ba66d887442ba5763c6c1b3886e Mon Sep 17 00:00:00 2001
From: Mengqing Cao
Date: Thu, 11 Sep 2025 21:47:58 +0800
Subject: [PATCH] [HybridKVCache][Platform] Add support_hybrid_kv_cache for
 platform (#24646)

Signed-off-by: MengqingCao
---
 vllm/config/__init__.py     | 3 +--
 vllm/platforms/cpu.py       | 4 ++++
 vllm/platforms/cuda.py      | 4 ++++
 vllm/platforms/interface.py | 7 +++++++
 vllm/platforms/rocm.py      | 4 ++++
 5 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index 8a75b28f38a5..24eaf2e360ab 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -3529,8 +3529,7 @@ class VllmConfig:
             # logger should only print warning message for hybrid models. As we
             # can't know whether the model is hybrid or not now, so we don't log
             # warning message here and will log it later.
-            if not (current_platform.is_cuda() or current_platform.is_rocm()
-                    or current_platform.is_cpu()):
+            if not current_platform.support_hybrid_kv_cache():
                 # Hybrid KV cache manager is not supported on non-GPU platforms.
                 self.scheduler_config.disable_hybrid_kv_cache_manager = True
             if self.kv_transfer_config is not None:
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 31d626a9e966..c5b6d91a62b6 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -347,3 +347,7 @@ class CpuPlatform(Platform):
     @classmethod
     def opaque_attention_op(cls) -> bool:
         return True
+
+    @classmethod
+    def support_hybrid_kv_cache(cls) -> bool:
+        return True
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index dc94cfcc3ce8..52b33849dae6 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -571,6 +571,10 @@ class CudaPlatformBase(Platform):
                 "You can use float16 instead by explicitly setting the "
                 "`dtype` flag in CLI, for example: --dtype=half.")
 
+    @classmethod
+    def support_hybrid_kv_cache(cls) -> bool:
+        return True
+
 
 # NVML utils
 # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index ab0eaa82ef20..59aa46818569 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -586,6 +586,13 @@ class Platform:
         """
         raise NotImplementedError
 
+    @classmethod
+    def support_hybrid_kv_cache(cls) -> bool:
+        """
+        Returns if the hybrid kv cache is supported by the current platform.
+        """
+        return False
+
 
 class UnspecifiedPlatform(Platform):
     _enum = PlatformEnum.UNSPECIFIED
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index fd899dcc9a65..f4d136c5e0aa 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -498,3 +498,7 @@ class RocmPlatform(Platform):
                 f"Your {gpu_name} GPU {compute_str}. "
                 "You can use float16 instead by explicitly setting the "
                 "`dtype` flag in CLI, for example: --dtype=half.")
+
+    @classmethod
+    def support_hybrid_kv_cache(cls) -> bool:
+        return True
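
For anyone wiring a platform against the new hook: the sketch below shows how
a platform class would opt in by overriding the classmethod added in this
patch. `MyAcceleratorPlatform` is a hypothetical example, not part of vLLM or
of this change, and it omits the other attributes and methods a real platform
defines; the base-class default stays False, so platforms that do not
override the method keep the hybrid KV cache manager disabled.

# Minimal sketch of a hypothetical out-of-tree platform opting in to the
# hybrid KV cache manager. Only the new hook is shown; a real platform also
# fills in device names, attention backends, memory queries, etc.
from vllm.platforms.interface import Platform, PlatformEnum


class MyAcceleratorPlatform(Platform):
    # Out-of-tree platforms are represented by the OOT enum value.
    _enum = PlatformEnum.OOT

    @classmethod
    def support_hybrid_kv_cache(cls) -> bool:
        # With this returning True, VllmConfig no longer forces
        # scheduler_config.disable_hybrid_kv_cache_manager = True.
        return True


# The config check added in this patch reduces to a single call like this:
assert MyAcceleratorPlatform.support_hybrid_kv_cache()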