Move the kv_sharing_fast_prefill validation from CacheConfig to VllmConfig.

* vllm/config/__init__.py: when cache_config.kv_sharing_fast_prefill is set,
  raise NotImplementedError if envs.VLLM_USE_V1 is false or if
  speculative_config is set and use_eagle() is true; otherwise emit a
  logger.warning_once message that model-side changes are required.
* vllm/config/cache.py: remove CacheConfig._verify_kv_sharing_fast_prefill
  and the call to it, since the same V0 check is now raised from VllmConfig.

NOTE(review): this patch was recovered from a copy in which all newlines and
leading whitespace had been collapsed. Line breaks follow the hunk headers
(line counts verified), but indentation -- and therefore block nesting -- is
reconstructed from Python structure; confirm the context lines against the
target tree before applying.

diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index 941aff8919a92..8bdc22acf380e 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -3665,6 +3665,24 @@ class VllmConfig:
                 " Disabling `torch.compile`.")
             self.compilation_config.level = CompilationLevel.NO_COMPILATION
 
+        if self.cache_config.kv_sharing_fast_prefill:
+            if not envs.VLLM_USE_V1:
+                raise NotImplementedError(
+                    "Fast prefill optimization for KV sharing is not supported "
+                    "in V0 currently.")
+
+            if self.speculative_config is not None and \
+                self.speculative_config.use_eagle():
+                raise NotImplementedError(
+                    "Fast prefill optimization for KV sharing is not "
+                    "compatible with EAGLE as EAGLE requires correct logits "
+                    "for all tokens while fast prefill gives incorrect logits "
+                    "for prompt tokens.")
+
+            logger.warning_once(
+                "--kv-sharing-fast-prefill requires changes on model side for "
+                "correctness and to realize prefill savings. ")
+
         if ((not envs.VLLM_USE_V1) and self.lora_config is not None
                 and self.compilation_config.level
                 != CompilationLevel.NO_COMPILATION):
diff --git a/vllm/config/cache.py b/vllm/config/cache.py
index 79761e7844859..6f8f962fe7cad 100644
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -145,19 +145,12 @@ class CacheConfig:
 
         self._verify_cache_dtype()
         self._verify_prefix_caching()
-        self._verify_kv_sharing_fast_prefill()
 
     def metrics_info(self):
         # convert cache_config to dict(key: str, value: str) for prometheus
         # metrics info
         return {key: str(value) for key, value in self.__dict__.items()}
 
-    def _verify_kv_sharing_fast_prefill(self) -> None:
-        if self.kv_sharing_fast_prefill and not envs.VLLM_USE_V1:
-            raise NotImplementedError(
-                "Fast prefill optimization for KV sharing is not supported "
-                "in V0 currently.")
-
     @model_validator(mode='after')
     def _verify_args(self) -> Self:
         if self.cpu_offload_gb < 0: