From 4c2e10ea19b9053924d66f30f3d7121fbd9684f8 Mon Sep 17 00:00:00 2001
From: PatrykSaffer
Date: Wed, 10 Dec 2025 01:47:07 +0100
Subject: [PATCH] [Bugfix] Fix cuda graph sizes when running with speculative
 decoding (#30330)

Signed-off-by: Patryk Saffer
Signed-off-by: PatrykSaffer
Co-authored-by: Patryk Saffer
---
 vllm/config/vllm.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 614a3226cb711..8f27db0013305 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -1047,8 +1047,14 @@ class VllmConfig:
             self.compilation_config.max_cudagraph_capture_size
         )
         if max_cudagraph_capture_size is None:
+            decode_query_len = 1
+            if (
+                self.speculative_config
+                and self.speculative_config.num_speculative_tokens
+            ):
+                decode_query_len += self.speculative_config.num_speculative_tokens
             max_cudagraph_capture_size = min(
-                self.scheduler_config.max_num_seqs * 2, 512
+                self.scheduler_config.max_num_seqs * decode_query_len * 2, 512
             )
         max_num_tokens = self.scheduler_config.max_num_batched_tokens
         max_cudagraph_capture_size = min(max_num_tokens, max_cudagraph_capture_size)
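
A minimal sketch of the sizing rule this patch changes, for illustration only: the helper name default_capture_size and the sample values (128 sequences, 3 speculative tokens) are hypothetical, and the real code in VllmConfig additionally clamps the result by max_num_batched_tokens afterward.

    # Sketch of the new default capture-size rule; helper name and sample
    # values are hypothetical, only the formula mirrors the patch.
    def default_capture_size(max_num_seqs: int, num_speculative_tokens: int = 0) -> int:
        # A decode step carries 1 token per sequence, plus any speculative
        # (draft) tokens verified in the same step.
        decode_query_len = 1 + num_speculative_tokens
        return min(max_num_seqs * decode_query_len * 2, 512)

    # Without speculative decoding the default is unchanged: 128 * 1 * 2 = 256.
    assert default_capture_size(128) == 256
    # With 3 speculative tokens a decode batch can reach 128 * 4 = 512 tokens,
    # which exceeded the old default of 256; the new default covers it.
    assert default_capture_size(128, num_speculative_tokens=3) == 512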