From 055915e6ce0bdd3a9f7222cd23084197faaed408 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 15 Jun 2025 01:05:05 -0700
Subject: [PATCH] Enable prefix caching with full cuda graphs (#19617)

Signed-off-by: Woosuk Kwon
---
 vllm/config.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/config.py b/vllm/config.py
index b36bae806c3e7..7217a659a5595 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -4495,7 +4495,6 @@ class VllmConfig:
                     "full_cuda_graph is not supported with "
                     "cascade attention. Disabling cascade attention.")
                 self.model_config.disable_cascade_attn = True
-                self.cache_config.enable_prefix_caching = False
 
         if (self.kv_events_config is not None
                 and self.kv_events_config.enable_kv_cache_events
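
Not part of the patch, for context: a minimal usage sketch of the combination this change permits, namely prefix caching together with full CUDA graph capture. It assumes vLLM's LLM constructor accepts enable_prefix_caching and a compilation_config dict carrying a full_cuda_graph flag; the model name and prompts below are placeholders.

from vllm import LLM, SamplingParams

# Before this patch, turning on full_cuda_graph also forced
# enable_prefix_caching back to False in VllmConfig's post-init checks;
# with the removed line gone, both settings can be active together.
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",      # placeholder model name
    enable_prefix_caching=True,                    # no longer overridden
    compilation_config={"full_cuda_graph": True},  # full CUDA graph capture
)

# Prompts sharing a common prefix can reuse cached KV blocks for that prefix.
prompts = [
    "You are a helpful assistant. Summarize the plot of Hamlet.",
    "You are a helpful assistant. Summarize the plot of Macbeth.",
]
for out in llm.generate(prompts, SamplingParams(max_tokens=64)):
    print(out.outputs[0].text)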