Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-16 12:35:01 +08:00)
Enable prefix caching with full cuda graphs (#19617)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
commit 055915e6ce
parent 3d330c4c09
@@ -4495,7 +4495,6 @@ class VllmConfig:
                 "full_cuda_graph is not supported with "
                 "cascade attention. Disabling cascade attention.")
             self.model_config.disable_cascade_attn = True
-            self.cache_config.enable_prefix_caching = False

         if (self.kv_events_config is not None
                 and self.kv_events_config.enable_kv_cache_events
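For context, here is a self-contained sketch of the resulting behavior. The small stand-in dataclasses (ModelConfig, CacheConfig, CompilationConfig) are simplified assumptions, not vLLM's real definitions; only the three attribute names and the warning text come from the hunk above. After this commit, enabling full CUDA graphs still disables cascade attention, but it no longer force-disables prefix caching.

    # Sketch only: stand-in configs modeling the post-#19617 check.
    # Field names mirror the diff; everything else is illustrative.
    import logging
    from dataclasses import dataclass, field

    logger = logging.getLogger(__name__)

    @dataclass
    class ModelConfig:
        disable_cascade_attn: bool = False

    @dataclass
    class CacheConfig:
        enable_prefix_caching: bool = True

    @dataclass
    class CompilationConfig:
        full_cuda_graph: bool = False

    @dataclass
    class VllmConfig:
        model_config: ModelConfig = field(default_factory=ModelConfig)
        cache_config: CacheConfig = field(default_factory=CacheConfig)
        compilation_config: CompilationConfig = field(
            default_factory=CompilationConfig)

        def __post_init__(self):
            if self.compilation_config.full_cuda_graph:
                logger.warning(
                    "full_cuda_graph is not supported with "
                    "cascade attention. Disabling cascade attention.")
                self.model_config.disable_cascade_attn = True
                # Pre-#19617 behavior, removed by this commit:
                #     self.cache_config.enable_prefix_caching = False

    cfg = VllmConfig(
        compilation_config=CompilationConfig(full_cuda_graph=True))
    assert cfg.model_config.disable_cascade_attn          # still forced off
    assert cfg.cache_config.enable_prefix_caching         # now left enabled

Running the sketch confirms the new invariant: with full_cuda_graph=True, disable_cascade_attn flips to True while enable_prefix_caching stays True, which is exactly the effect of the one-line removal shown in the hunk.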