Enable prefix caching with full cuda graphs (#19617)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Woosuk Kwon, 2025-06-15 01:05:05 -07:00, committed by GitHub
parent 3d330c4c09
commit 055915e6ce


@@ -4495,7 +4495,6 @@ class VllmConfig:
                 "full_cuda_graph is not supported with "
                 "cascade attention. Disabling cascade attention.")
             self.model_config.disable_cascade_attn = True
-            self.cache_config.enable_prefix_caching = False
         if (self.kv_events_config is not None
                 and self.kv_events_config.enable_kv_cache_events
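
With the line that force-disabled prefix caching removed, prefix caching and full CUDA graph capture can be requested together. Below is a minimal sketch of such a launch, assuming a vLLM build that contains this commit and accepts compilation_config as a dict of CompilationConfig fields; the model name and sampling settings are illustrative and not part of the commit.

from vllm import LLM, SamplingParams

# Sketch: request full CUDA graph capture together with prefix caching,
# which this commit no longer force-disables. Cascade attention is still
# turned off automatically when full_cuda_graph is set, per the warning above.
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model, not from this commit
    enable_prefix_caching=True,
    compilation_config={"full_cuda_graph": True},
)

# Prompts sharing a common prefix can now reuse cached KV blocks.
prompts = [
    "You are a helpful assistant. Summarize the theory of relativity.",
    "You are a helpful assistant. Summarize plate tectonics.",
]
for out in llm.generate(prompts, SamplingParams(max_tokens=64)):
    print(out.outputs[0].text)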