From 4f6593b058dc7ba66d887442ba5763c6c1b3886e Mon Sep 17 00:00:00 2001
From: Mengqing Cao
Date: Thu, 11 Sep 2025 21:47:58 +0800
Subject: [PATCH] [HybridKVCache][Platform] Add support_hybrid_kv_cache for
 platform (#24646)

Signed-off-by: MengqingCao
---
 vllm/config/__init__.py     | 3 +--
 vllm/platforms/cpu.py       | 4 ++++
 vllm/platforms/cuda.py      | 4 ++++
 vllm/platforms/interface.py | 7 +++++++
 vllm/platforms/rocm.py      | 4 ++++
 5 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index 8a75b28f38a5..24eaf2e360ab 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -3529,8 +3529,7 @@ class VllmConfig:
             # logger should only print warning message for hybrid models. As we
             # can't know whether the model is hybrid or not now, so we don't log
             # warning message here and will log it later.
-            if not (current_platform.is_cuda() or current_platform.is_rocm()
-                    or current_platform.is_cpu()):
+            if not current_platform.support_hybrid_kv_cache():
                 # Hybrid KV cache manager is not supported on non-GPU platforms.
                 self.scheduler_config.disable_hybrid_kv_cache_manager = True
             if self.kv_transfer_config is not None:
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 31d626a9e966..c5b6d91a62b6 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -347,3 +347,7 @@ class CpuPlatform(Platform):
     @classmethod
     def opaque_attention_op(cls) -> bool:
         return True
+
+    @classmethod
+    def support_hybrid_kv_cache(cls) -> bool:
+        return True
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index dc94cfcc3ce8..52b33849dae6 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -571,6 +571,10 @@ class CudaPlatformBase(Platform):
                 "You can use float16 instead by explicitly setting the "
                 "`dtype` flag in CLI, for example: --dtype=half.")
 
+    @classmethod
+    def support_hybrid_kv_cache(cls) -> bool:
+        return True
+
 
 # NVML utils
 # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index ab0eaa82ef20..59aa46818569 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -586,6 +586,13 @@ class Platform:
         """
         raise NotImplementedError
 
+    @classmethod
+    def support_hybrid_kv_cache(cls) -> bool:
+        """
+        Returns if the hybrid kv cache is supported by the current platform.
+        """
+        return False
+
 
 class UnspecifiedPlatform(Platform):
     _enum = PlatformEnum.UNSPECIFIED
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index fd899dcc9a65..f4d136c5e0aa 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -498,3 +498,7 @@ class RocmPlatform(Platform):
                 f"Your {gpu_name} GPU {compute_str}. "
                 "You can use float16 instead by explicitly setting the "
                 "`dtype` flag in CLI, for example: --dtype=half.")
+
+    @classmethod
+    def support_hybrid_kv_cache(cls) -> bool:
+        return True
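
For anyone wiring a platform against the new hook: the sketch below shows how
a platform class would opt in by overriding the classmethod added in this
patch. `MyAcceleratorPlatform` is a hypothetical example, not part of vLLM or
of this change, and it omits the other attributes and methods a real platform
defines; the base-class default stays False, so platforms that do not
override the method keep the hybrid KV cache manager disabled.

# Minimal sketch of a hypothetical out-of-tree platform opting in to the
# hybrid KV cache manager. Only the new hook is shown; a real platform also
# fills in device names, attention backends, memory queries, etc.
from vllm.platforms.interface import Platform, PlatformEnum


class MyAcceleratorPlatform(Platform):
    # Out-of-tree platforms are represented by the OOT enum value.
    _enum = PlatformEnum.OOT

    @classmethod
    def support_hybrid_kv_cache(cls) -> bool:
        # With this returning True, VllmConfig no longer forces
        # scheduler_config.disable_hybrid_kv_cache_manager = True.
        return True


# The config check added in this patch reduces to a single call like this:
assert MyAcceleratorPlatform.support_hybrid_kv_cache()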