[KV Sharing] Raise error if using eagle with fast prefill (#24350)

Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-12-22 18:45:01 +08:00 · 2025-09-05 20:22:40 -07:00 · 2025-09-05 20:22:40 -07:00 · 3c529fc994
commit 3c529fc994
parent 35bf193864
2 changed files with 18 additions and 7 deletions
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -3665,6 +3665,24 @@ class VllmConfig:
                " Disabling `torch.compile`.")
            self.compilation_config.level = CompilationLevel.NO_COMPILATION
        if self.cache_config.kv_sharing_fast_prefill:
            if not envs.VLLM_USE_V1:
                raise NotImplementedError(
                    "Fast prefill optimization for KV sharing is not supported "
                    "in V0 currently.")
            if self.speculative_config is not None and \
                self.speculative_config.use_eagle():
                raise NotImplementedError(
                    "Fast prefill optimization for KV sharing is not "
                    "compatible with EAGLE as EAGLE requires correct logits "
                    "for all tokens while fast prefill gives incorrect logits "
                    "for prompt tokens.")
            logger.warning_once(
                "--kv-sharing-fast-prefill requires changes on model side for "
                "correctness and to realize prefill savings. ")
        if ((not envs.VLLM_USE_V1) and self.lora_config is not None
                and self.compilation_config.level
                != CompilationLevel.NO_COMPILATION):
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -145,19 +145,12 @@ class CacheConfig:
        self._verify_cache_dtype()
        self._verify_prefix_caching()
        self._verify_kv_sharing_fast_prefill()
    def metrics_info(self):
        """Return the instance attributes as a dict of stringified values.

        Each entry of the instance ``__dict__`` is kept under its original
        key, with the value converted through ``str()``. The resulting
        mapping is intended for prometheus metrics info.
        """
        info = {}
        for attr_name, attr_value in vars(self).items():
            info[attr_name] = str(attr_value)
        return info
    def _verify_kv_sharing_fast_prefill(self) -> None:
        """Reject the KV-sharing fast prefill option when V1 is not in use.

        Raises:
            NotImplementedError: if ``self.kv_sharing_fast_prefill`` is
                truthy while ``envs.VLLM_USE_V1`` is falsy.
        """
        # Nothing to validate unless the option was requested; the V1 flag
        # is only consulted after this check, as in a short-circuit `and`.
        if not self.kv_sharing_fast_prefill:
            return
        if envs.VLLM_USE_V1:
            return
        raise NotImplementedError(
            "Fast prefill optimization for KV sharing is not supported "
            "in V0 currently.")
    @model_validator(mode='after')
    def _verify_args(self) -> Self:
        if self.cpu_offload_gb < 0: