Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-22 18:45:01 +08:00)
[KV Sharing] Raise error if using eagle with fast prefill (#24350)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
This commit is contained in:
Parent: 35bf193864
Commit: 3c529fc994
@@ -3665,6 +3665,24 @@ class VllmConfig:
|
|||||||
" Disabling `torch.compile`.")
|
" Disabling `torch.compile`.")
|
||||||
self.compilation_config.level = CompilationLevel.NO_COMPILATION
|
self.compilation_config.level = CompilationLevel.NO_COMPILATION
|
||||||
|
|
||||||
|
if self.cache_config.kv_sharing_fast_prefill:
|
||||||
|
if not envs.VLLM_USE_V1:
|
||||||
|
raise NotImplementedError(
|
||||||
|
"Fast prefill optimization for KV sharing is not supported "
|
||||||
|
"in V0 currently.")
|
||||||
|
|
||||||
|
if self.speculative_config is not None and \
|
||||||
|
self.speculative_config.use_eagle():
|
||||||
|
raise NotImplementedError(
|
||||||
|
"Fast prefill optimization for KV sharing is not "
|
||||||
|
"compatible with EAGLE as EAGLE requires correct logits "
|
||||||
|
"for all tokens while fast prefill gives incorrect logits "
|
||||||
|
"for prompt tokens.")
|
||||||
|
|
||||||
|
logger.warning_once(
|
||||||
|
"--kv-sharing-fast-prefill requires changes on model side for "
|
||||||
|
"correctness and to realize prefill savings. ")
|
||||||
|
|
||||||
if ((not envs.VLLM_USE_V1) and self.lora_config is not None
|
if ((not envs.VLLM_USE_V1) and self.lora_config is not None
|
||||||
and self.compilation_config.level
|
and self.compilation_config.level
|
||||||
!= CompilationLevel.NO_COMPILATION):
|
!= CompilationLevel.NO_COMPILATION):
|
||||||
|
|||||||
@@ -145,19 +145,12 @@ class CacheConfig:
|
|||||||
|
|
||||||
self._verify_cache_dtype()
|
self._verify_cache_dtype()
|
||||||
self._verify_prefix_caching()
|
self._verify_prefix_caching()
|
||||||
self._verify_kv_sharing_fast_prefill()
|
|
||||||
|
|
||||||
def metrics_info(self):
|
def metrics_info(self):
|
||||||
# convert cache_config to dict(key: str, value: str) for prometheus
|
# convert cache_config to dict(key: str, value: str) for prometheus
|
||||||
# metrics info
|
# metrics info
|
||||||
return {key: str(value) for key, value in self.__dict__.items()}
|
return {key: str(value) for key, value in self.__dict__.items()}
|
||||||
|
|
||||||
def _verify_kv_sharing_fast_prefill(self) -> None:
|
|
||||||
if self.kv_sharing_fast_prefill and not envs.VLLM_USE_V1:
|
|
||||||
raise NotImplementedError(
|
|
||||||
"Fast prefill optimization for KV sharing is not supported "
|
|
||||||
"in V0 currently.")
|
|
||||||
|
|
||||||
@model_validator(mode='after')
|
@model_validator(mode='after')
|
||||||
def _verify_args(self) -> Self:
|
def _verify_args(self) -> Self:
|
||||||
if self.cpu_offload_gb < 0:
|
if self.cpu_offload_gb < 0:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user