diff --git a/vllm/config.py b/vllm/config.py
index 6bfe94b761dd4..3bcbbe60652b7 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -4769,12 +4769,23 @@ class VllmConfig:
             # Hybrid KV cache manager is not compatible with KV events.
             self.scheduler_config.disable_hybrid_kv_cache_manager = True
         if self.model_config is not None and \
-            self.model_config.attention_chunk_size is not None and \
-            self.speculative_config is not None and \
-            self.speculative_config.use_eagle():
-            # Hybrid KV cache manager is not yet supported with chunked
-            # local attention + eagle.
-            self.scheduler_config.disable_hybrid_kv_cache_manager = True
+            self.model_config.attention_chunk_size is not None:
+            if self.speculative_config is not None and \
+                self.speculative_config.use_eagle():
+                # Hybrid KV cache manager is not yet supported with chunked
+                # local attention + eagle.
+                self.scheduler_config.disable_hybrid_kv_cache_manager = True
+            elif \
+                not envs.VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE:
+                logger.warning(
+                    "There is a latency regression when using chunked local"
+                    " attention with the hybrid KV cache manager. Disabling it"
+                    " by default. To enable it, set the environment variable "
+                    "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE=1."
+                )
+                # Hybrid KV cache manager is not yet supported with chunked
+                # local attention.
+                self.scheduler_config.disable_hybrid_kv_cache_manager = True
 
     def update_sizes_for_sequence_parallelism(self,
                                               possible_sizes: list) -> list:
diff --git a/vllm/envs.py b/vllm/envs.py
index 0eff741519ae5..fcfad4eec1621 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -143,6 +143,7 @@ if TYPE_CHECKING:
     VLLM_USE_CUDNN_PREFILL: bool = False
     VLLM_ENABLE_CUDAGRAPH_GC: bool = False
     VLLM_LOOPBACK_IP: str = ""
+    VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False
 
 
 def get_default_cache_root():
@@ -991,6 +992,17 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # The default value is "VLLM".
     "VLLM_PROCESS_NAME_PREFIX":
     lambda: os.getenv("VLLM_PROCESS_NAME_PREFIX", "VLLM"),
+
+    # Allow chunked local attention with the hybrid KV cache manager.
+    # Currently, using the hybrid KV cache manager with chunked local attention
+    # in the Llama4 models (the only models currently using chunked local attn)
+    # causes a latency regression. For this reason, we disable it by default.
+    # This flag is used to allow users to enable it if they want to (to save on
+    # kv-cache memory usage and enable longer contexts).
+    # TODO(lucas): Remove this flag once the latency regression is resolved.
+    "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE":
+    lambda: bool(int(os.getenv(
+        "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE", "0"))),
 }
 
 # --8<-- [end:env-vars-definition]
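
Note (not part of the patch): a minimal usage sketch of the new escape hatch, assuming the flag lands as written above. The model id below is illustrative; any model that uses chunked local attention (i.e. the Llama4 family referenced in the envs.py comment) would be affected the same way.

    import os

    # Opt back into the hybrid KV cache manager with chunked local attention,
    # accepting the known latency regression in exchange for lower KV cache
    # memory usage and longer contexts. vllm.envs evaluates the lambdas lazily,
    # but setting the variable before the engine is constructed is the safe
    # ordering.
    os.environ["VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE"] = "1"

    from vllm import LLM

    # Illustrative model id (a chunked-local-attention model).
    llm = LLM(model="meta-llama/Llama-4-Scout-17B-16E-Instruct")

Equivalently, the variable can be exported in the shell before launching `vllm serve`; leaving it unset keeps the default behavior introduced here (hybrid KV cache manager disabled for chunked local attention).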