[Misc][Hybrid allocator + kv connector] Optionally enable hybrid allocator + KV cache connector (#29805)

Signed-off-by: NickLucche <nlucches@redhat.com>
This commit is contained in:
Nicolò Lucchesi 2025-12-15 12:17:58 +01:00 committed by GitHub
parent e4806d973a
commit 185c22bf2f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 64 additions and 38 deletions

View File

@ -122,10 +122,12 @@ class SchedulerConfig:
the default scheduler. Can be a class directly or the path to a class of the default scheduler. Can be a class directly or the path to a class of
form "mod.custom_class".""" form "mod.custom_class"."""
disable_hybrid_kv_cache_manager: bool = False disable_hybrid_kv_cache_manager: bool | None = None
"""If set to True, KV cache manager will allocate the same size of KV cache """If set to True, KV cache manager will allocate the same size of KV cache
for all attention layers even if there are multiple type of attention layers for all attention layers even if there are multiple type of attention layers
like full attention and sliding window attention. like full attention and sliding window attention.
If set to None, the default value will be determined based on the environment
and starting configuration.
""" """
async_scheduling: bool = False async_scheduling: bool = False

View File

@ -887,17 +887,48 @@ class VllmConfig:
if not self.instance_id: if not self.instance_id:
self.instance_id = random_uuid()[:5] self.instance_id = random_uuid()[:5]
if not self.scheduler_config.disable_hybrid_kv_cache_manager: # Hybrid KV cache manager (HMA) runtime rules:
# logger should only print warning message for hybrid models. As we # - Explicit enable (--no-disable-hybrid-kv-cache-manager): error if runtime
# can't know whether the model is hybrid or not now, so we don't log # disables it
# warning message here and will log it later. # - No preference: auto-disable for unsupported features (e.g. kv connector)
if not current_platform.support_hybrid_kv_cache(): # - Explicit disable (--disable-hybrid-kv-cache-manager): always respect it
# Hybrid KV cache manager is not supported on non-GPU platforms. need_disable_hybrid_kv_cache_manager = False
self.scheduler_config.disable_hybrid_kv_cache_manager = True # logger should only print warning message for hybrid models. As we
# can't know whether the model is hybrid or not yet, we don't log the
# warning message here and will log it later.
if not current_platform.support_hybrid_kv_cache():
# Hybrid KV cache manager is not supported on non-GPU platforms.
need_disable_hybrid_kv_cache_manager = True
if self.kv_events_config is not None:
# Hybrid KV cache manager is not compatible with KV events.
need_disable_hybrid_kv_cache_manager = True
if (
self.model_config is not None
and self.model_config.attention_chunk_size is not None
):
if (
self.speculative_config is not None
and self.speculative_config.use_eagle()
):
# Hybrid KV cache manager is not yet supported with chunked
# local attention + eagle.
need_disable_hybrid_kv_cache_manager = True
elif not envs.VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE:
logger.warning(
"There is a latency regression when using chunked local"
" attention with the hybrid KV cache manager. Disabling"
" it, by default. To enable it, set the environment "
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE=1."
)
# Hybrid KV cache manager is not yet supported with chunked
# local attention.
need_disable_hybrid_kv_cache_manager = True
if self.scheduler_config.disable_hybrid_kv_cache_manager is None:
# Default to disable HMA, but only if the user didn't express a preference.
if self.kv_transfer_config is not None: if self.kv_transfer_config is not None:
# NOTE(Kuntai): turn HMA off for connector for now. # NOTE(Kuntai): turn HMA off for connector unless specifically enabled.
# TODO(Kuntai): have a more elegant solution to check and need_disable_hybrid_kv_cache_manager = True
# turn off HMA for connector that does not support HMA.
logger.warning( logger.warning(
"Turning off hybrid kv cache manager because " "Turning off hybrid kv cache manager because "
"`--kv-transfer-config` is set. This will reduce the " "`--kv-transfer-config` is set. This will reduce the "
@ -905,33 +936,26 @@ class VllmConfig:
"or Mamba attention. If you are a developer of kv connector" "or Mamba attention. If you are a developer of kv connector"
", please consider supporting hybrid kv cache manager for " ", please consider supporting hybrid kv cache manager for "
"your connector by making sure your connector is a subclass" "your connector by making sure your connector is a subclass"
" of `SupportsHMA` defined in kv_connector/v1/base.py." " of `SupportsHMA` defined in kv_connector/v1/base.py and"
" use --no-disable-hybrid-kv-cache-manager to start vLLM."
) )
self.scheduler_config.disable_hybrid_kv_cache_manager = True self.scheduler_config.disable_hybrid_kv_cache_manager = (
if self.kv_events_config is not None: need_disable_hybrid_kv_cache_manager
# Hybrid KV cache manager is not compatible with KV events. )
self.scheduler_config.disable_hybrid_kv_cache_manager = True elif (
if ( self.scheduler_config.disable_hybrid_kv_cache_manager is False
self.model_config is not None and need_disable_hybrid_kv_cache_manager
and self.model_config.attention_chunk_size is not None ):
): raise ValueError(
if ( "Hybrid KV cache manager was explicitly enabled but is not "
self.speculative_config is not None "supported in this configuration. Consider omitting the "
and self.speculative_config.use_eagle() "--no-disable-hybrid-kv-cache-manager flag to let vLLM decide"
): " automatically."
# Hybrid KV cache manager is not yet supported with chunked )
# local attention + eagle.
self.scheduler_config.disable_hybrid_kv_cache_manager = True if self.scheduler_config.disable_hybrid_kv_cache_manager is None:
elif not envs.VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: # Default to enable HMA if not explicitly disabled by user or logic above.
logger.warning( self.scheduler_config.disable_hybrid_kv_cache_manager = False
"There is a latency regression when using chunked local"
" attention with the hybrid KV cache manager. Disabling"
" it, by default. To enable it, set the environment "
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE=1."
)
# Hybrid KV cache manager is not yet supported with chunked
# local attention.
self.scheduler_config.disable_hybrid_kv_cache_manager = True
if self.compilation_config.debug_dump_path: if self.compilation_config.debug_dump_path:
self.compilation_config.debug_dump_path = ( self.compilation_config.debug_dump_path = (

View File

@ -491,7 +491,7 @@ class EngineArgs:
enable_chunked_prefill: bool | None = None enable_chunked_prefill: bool | None = None
disable_chunked_mm_input: bool = SchedulerConfig.disable_chunked_mm_input disable_chunked_mm_input: bool = SchedulerConfig.disable_chunked_mm_input
disable_hybrid_kv_cache_manager: bool = ( disable_hybrid_kv_cache_manager: bool | None = (
SchedulerConfig.disable_hybrid_kv_cache_manager SchedulerConfig.disable_hybrid_kv_cache_manager
) )