Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-20 10:05:54 +08:00)
Enable prefix caching with full cuda graphs (#19617)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
parent 3d330c4c09
commit 055915e6ce
@@ -4495,7 +4495,6 @@ class VllmConfig:
                     "full_cuda_graph is not supported with "
                     "cascade attention. Disabling cascade attention.")
                 self.model_config.disable_cascade_attn = True
-                self.cache_config.enable_prefix_caching = False
 
         if (self.kv_events_config is not None
                 and self.kv_events_config.enable_kv_cache_events
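The net effect of the hunk: when full CUDA graphs are requested, cascade attention is still disabled, but prefix caching is no longer forced off. Below is a minimal, self-contained sketch of that behavior change; the classes and field names are simplified stand-ins for illustration, not vLLM's actual config API.

from dataclasses import dataclass, field


@dataclass
class ModelConfig:
    disable_cascade_attn: bool = False


@dataclass
class CacheConfig:
    enable_prefix_caching: bool = True


@dataclass
class SimplifiedVllmConfig:
    """Hypothetical stand-in for the config check this hunk touches."""
    full_cuda_graph: bool = False
    model_config: ModelConfig = field(default_factory=ModelConfig)
    cache_config: CacheConfig = field(default_factory=CacheConfig)

    def __post_init__(self) -> None:
        if self.full_cuda_graph:
            # Cascade attention remains incompatible with full CUDA graphs.
            self.model_config.disable_cascade_attn = True
            # Before this commit, the removed line also ran here, forcing
            # prefix caching off:
            #     self.cache_config.enable_prefix_caching = False
            # After this commit, prefix caching keeps whatever value the
            # user configured.


cfg = SimplifiedVllmConfig(full_cuda_graph=True)
assert cfg.model_config.disable_cascade_attn is True
assert cfg.cache_config.enable_prefix_caching is True  # no longer forced off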