[BugFix] Fix Eagle IndexError: list index out of range for even num_speculative_tokens (#29102)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
parent e9af6ba62a
commit 30d6466238
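The failure being fixed: when num_speculative_tokens is even, cudagraph capture
sizes are rounded to a multiple of num_speculative_tokens + 1 (an odd number),
and every configured size can be rounded away. The old CompilationConfig path
merely warned and returned in that case, and EagleProposer later indexed the
last element of the resulting empty list. A minimal sketch of that failure mode
(illustrative values, not vLLM code):

    cudagraph_batch_sizes: list[int] = []  # nothing survived rounding
    num_tokens = 8
    try:
        # the old bound check, as removed in the hunks below
        ok = num_tokens <= cudagraph_batch_sizes[-1]
    except IndexError as exc:
        print(exc)  # list index out of range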
@@ -748,6 +748,14 @@ class VllmRunner:
             # being captured which can trigger edge cases that we don't handle yet.
             kwargs["compilation_config"] = {"cudagraph_capture_sizes": [4]}
+
+            # Make sure we have at least one cudagraph large enough for a single decode.
+            if (speculative_config := kwargs.get("speculative_config")) and (
+                num_speculative_tokens := speculative_config["num_speculative_tokens"]
+            ):
+                kwargs["compilation_config"]["cudagraph_capture_sizes"].append(
+                    num_speculative_tokens + 1
+                )

         with init_ctx:
             self.llm = LLM(
                 model=model_name,
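A quick check of the new test plumbing, as a standalone sketch (it assumes
speculative_config is passed as a plain dict, which is what the hunk suggests):
with an even num_speculative_tokens of 4, a single uniform decode needs
4 + 1 = 5 query tokens, so a capture size of 5 is appended next to the
default [4].

    kwargs = {
        "speculative_config": {"num_speculative_tokens": 4},
        "compilation_config": {"cudagraph_capture_sizes": [4]},
    }
    if (sc := kwargs.get("speculative_config")) and (
        n := sc["num_speculative_tokens"]
    ):
        kwargs["compilation_config"]["cudagraph_capture_sizes"].append(n + 1)
    assert kwargs["compilation_config"]["cudagraph_capture_sizes"] == [4, 5]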
@@ -950,14 +950,18 @@ class CompilationConfig:
                 )
             )
+
+            if len(rounded_sizes) == 0 and multiple_of <= self.max_cudagraph_capture_size:
+                # if no configured size survived rounding but multiple_of itself fits, use it
+                rounded_sizes = [multiple_of]

             if len(rounded_sizes) == 0:
-                logger.warning(
-                    "No valid cudagraph sizes after rounding to multiple of "
-                    " num_speculative_tokens + 1 (%d); please adjust num_speculative_tokens"
-                    " or max_cudagraph_capture_size (or cudagraph_capture_sizes)",
-                    multiple_of,
+                raise ValueError(
+                    f"No valid cudagraph sizes after rounding to multiple of {multiple_of} "
+                    f"(num_speculative_tokens + 1, or tp if sequence parallelism is enabled);"
+                    f" please adjust num_speculative_tokens ({uniform_decode_query_len - 1}"
+                    f") or max_cudagraph_capture_size ({self.max_cudagraph_capture_size})"
+                    f" or cudagraph_capture_sizes ({self.cudagraph_capture_sizes})"
                 )
-                return

             self.max_cudagraph_capture_size = rounded_sizes[-1]
             self.cudagraph_capture_sizes = rounded_sizes
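The new validation flow, reduced to a standalone sketch (finalize and
max_capture are hypothetical names for illustration; multiple_of is
num_speculative_tokens + 1, or tp when sequence parallelism is enabled):

    def finalize(rounded_sizes: list[int], multiple_of: int, max_capture: int) -> list[int]:
        if len(rounded_sizes) == 0 and multiple_of <= max_capture:
            # no configured size survived rounding, but multiple_of itself fits
            rounded_sizes = [multiple_of]
        if len(rounded_sizes) == 0:
            # previously a warning plus an early return, which left consumers
            # holding an empty size list; now the config fails fast instead
            raise ValueError(
                f"No valid cudagraph sizes after rounding to multiple of {multiple_of}"
            )
        return rounded_sizes

    assert finalize([], multiple_of=5, max_capture=512) == [5]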
@@ -86,9 +86,9 @@ class EagleProposer:

         self.use_cuda_graph = False

-        compilation_config = self.vllm_config.compilation_config
-        if compilation_config.mode == CompilationMode.VLLM_COMPILE:
-            cudagraph_mode = compilation_config.cudagraph_mode
+        self.compilation_config = self.vllm_config.compilation_config
+        if self.compilation_config.mode == CompilationMode.VLLM_COMPILE:
+            cudagraph_mode = self.compilation_config.cudagraph_mode
             if cudagraph_mode != CUDAGraphMode.NONE and not cudagraph_mode.has_mode(
                 CUDAGraphMode.PIECEWISE
             ):
@@ -103,13 +103,6 @@ class EagleProposer:
             and not self.speculative_config.enforce_eager
         )

-        self.cudagraph_batch_sizes = (
-            (sorted(self.vllm_config.compilation_config.cudagraph_capture_sizes))
-            if self.use_cuda_graph
-            else []
-        )
-
-        self.use_cuda_graph = self.use_cuda_graph and bool(self.cudagraph_batch_sizes)
         # persistent buffers for cuda graph
         self.input_ids = torch.zeros(
             self.max_num_tokens, dtype=torch.int32, device=device
@@ -276,7 +269,10 @@ class EagleProposer:
             per_layer_attn_metadata[layer_name] = draft_indexer_metadata

         cudagraph_runtime_mode = CUDAGraphMode.NONE
-        if self.use_cuda_graph and num_tokens <= self.cudagraph_batch_sizes[-1]:
+        if (
+            self.use_cuda_graph
+            and num_tokens <= self.compilation_config.max_cudagraph_capture_size
+        ):
             num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
             cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
         else:
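This and the three hunks that follow apply one pattern: bound the cudagraph
path by the config's max_cudagraph_capture_size (a plain integer) instead of
indexing the last element of a possibly-empty list. A before/after sketch with
illustrative values:

    use_cuda_graph, num_tokens, max_capture = True, 8, 512
    batch_sizes: list[int] = []  # the state that used to crash

    # old: use_cuda_graph and num_tokens <= batch_sizes[-1]   -> IndexError
    # new: an integer comparison that is safe regardless of the list
    if use_cuda_graph and num_tokens <= max_capture:
        pass  # pad the batch and run PIECEWISE cudagraphs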
@@ -366,7 +362,10 @@ class EagleProposer:
         # Generate the remaining draft tokens.
         draft_token_ids_list = [draft_token_ids]

-        if self.use_cuda_graph and batch_size <= self.cudagraph_batch_sizes[-1]:
+        if (
+            self.use_cuda_graph
+            and batch_size <= self.compilation_config.max_cudagraph_capture_size
+        ):
             input_batch_size = self.vllm_config.pad_for_cudagraph(batch_size)
             cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
         else:
@@ -777,7 +776,10 @@ class EagleProposer:
         self.positions[:num_tokens] = tree_positions.view(-1)
         self.hidden_states[:num_tokens] = tree_hidden_states.view(num_tokens, -1)

-        if self.use_cuda_graph and num_tokens <= self.cudagraph_batch_sizes[-1]:
+        if (
+            self.use_cuda_graph
+            and num_tokens <= self.compilation_config.max_cudagraph_capture_size
+        ):
             num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
             cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
         else:
@@ -1114,7 +1116,10 @@ class EagleProposer:
     ) -> None:
         # Determine if CUDA graphs should be used for this run.
         cudagraphs_enabled = use_cudagraphs and self.use_cuda_graph
-        if cudagraphs_enabled and num_tokens <= self.cudagraph_batch_sizes[-1]:
+        if (
+            cudagraphs_enabled
+            and num_tokens <= self.compilation_config.max_cudagraph_capture_size
+        ):
             num_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)

         with set_forward_context(
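Each of these sites then pads with self.vllm_config.pad_for_cudagraph(num_tokens).
Assuming that call rounds a token count up to the nearest captured graph size
(an assumption about its semantics, not something this diff shows), the size
num_speculative_tokens + 1 appended in the test hunk is exactly what lets a
single uniform decode land on a captured graph:

    captured = [4, 5]  # the sizes from the test hunk, num_speculative_tokens = 4

    def pad_for_cudagraph(n: int) -> int:  # hypothetical stand-in
        return min(s for s in captured if s >= n)

    assert pad_for_cudagraph(5) == 5  # one decode per request fits the new graph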