[Bugfix] Fix cuda graph sizes when running with speculative decoding (#30330)
Signed-off-by: Patryk Saffer <patryk.saffer99@gmail.com>
Signed-off-by: PatrykSaffer <patryk.saffer@mistral.ai>
Co-authored-by: Patryk Saffer <patryk.saffer99@gmail.com>
This commit is contained in:
parent 03b5f940fd
commit 4c2e10ea19
@@ -1047,8 +1047,14 @@ class VllmConfig:
             self.compilation_config.max_cudagraph_capture_size
         )
         if max_cudagraph_capture_size is None:
+            decode_query_len = 1
+            if (
+                self.speculative_config
+                and self.speculative_config.num_speculative_tokens
+            ):
+                decode_query_len += self.speculative_config.num_speculative_tokens
             max_cudagraph_capture_size = min(
-                self.scheduler_config.max_num_seqs * 2, 512
+                self.scheduler_config.max_num_seqs * decode_query_len * 2, 512
             )
             max_num_tokens = self.scheduler_config.max_num_batched_tokens
             max_cudagraph_capture_size = min(max_num_tokens, max_cudagraph_capture_size)
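For intuition, here is a minimal sketch of the sizing arithmetic after this change. The helper name capture_size and the concrete numbers (max_num_seqs=128, num_speculative_tokens=4, max_num_batched_tokens=8192) are illustrative assumptions, not values from the commit: with speculative decoding, each decode step carries 1 + num_speculative_tokens query tokens per sequence, so the CUDA graph capture size must scale by that factor.

# Minimal sketch of the capture-size computation after this commit.
# Names and values below are illustrative, not taken from vLLM itself.
def capture_size(
    max_num_seqs: int,
    num_speculative_tokens: int,
    max_num_batched_tokens: int,
) -> int:
    # Each decode step covers the verified token plus any draft tokens.
    decode_query_len = 1 + num_speculative_tokens
    size = min(max_num_seqs * decode_query_len * 2, 512)
    # Never capture graphs larger than the scheduler can batch.
    return min(max_num_batched_tokens, size)

# Without speculative decoding: min(128 * 1 * 2, 512) = 256
print(capture_size(128, 0, 8192))  # 256
# With 4 speculative tokens: min(128 * 5 * 2, 512) = 512
print(capture_size(128, 4, 8192))  # 512

Before the fix, the capture size ignored decode_query_len, so with speculative decoding enabled the captured graphs could be too small to cover a full decode batch.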