From 30d64662387aaa74abcee294f27b83043f2d1ae6 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson
Date: Fri, 21 Nov 2025 19:47:05 -0500
Subject: [PATCH] [BugFix] Fix Eagle `IndexError: list index out of range` for
 even `num_speculative_tokens` (#29102)

Signed-off-by: Lucas Wilkinson
---
 tests/conftest.py            |  8 ++++++++
 vllm/config/compilation.py   | 16 ++++++++++------
 vllm/v1/spec_decode/eagle.py | 33 +++++++++++++++++++--------------
 3 files changed, 37 insertions(+), 20 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index b17081352edcf..5afdb225b8923 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -748,6 +748,14 @@ class VllmRunner:
         # being captured which can trigger edge cases that we don't handle yet.
         kwargs["compilation_config"] = {"cudagraph_capture_sizes": [4]}
 
+        # Make sure we have at least one cudagraph large enough for a single decode.
+        if (speculative_config := kwargs.get("speculative_config")) and (
+            num_speculative_tokens := speculative_config["num_speculative_tokens"]
+        ):
+            kwargs["compilation_config"]["cudagraph_capture_sizes"].append(
+                num_speculative_tokens + 1
+            )
+
         with init_ctx:
             self.llm = LLM(
                 model=model_name,
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index abdae49106120..9b5309598d0e2 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -950,14 +950,18 @@ class CompilationConfig:
                 )
             )
 
+        if len(rounded_sizes) == 0 and multiple_of <= self.max_cudagraph_capture_size:
+            # If multiple_of itself is a valid size but would round down to 0, use it.
+            rounded_sizes = [multiple_of]
+
         if len(rounded_sizes) == 0:
-            logger.warning(
-                "No valid cudagraph sizes after rounding to multiple of "
-                " num_speculative_tokens + 1 (%d); please adjust num_speculative_tokens"
-                " or max_cudagraph_capture_size (or cudagraph_capture_sizes)",
-                multiple_of,
+            raise ValueError(
+                f"No valid cudagraph sizes after rounding to multiple of {multiple_of} "
+                f"(num_speculative_tokens + 1, or tp if sequence parallelism is enabled);"
+                f" please adjust num_speculative_tokens ({uniform_decode_query_len - 1})"
+                f" or max_cudagraph_capture_size ({self.max_cudagraph_capture_size})"
+                f" or cudagraph_capture_sizes ({self.cudagraph_capture_sizes})"
             )
-            return
 
         self.max_cudagraph_capture_size = rounded_sizes[-1]
         self.cudagraph_capture_sizes = rounded_sizes
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index 0df9cd3214e53..3de418f1d13c8 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -86,9 +86,9 @@ class EagleProposer:
 
         self.use_cuda_graph = False
 
-        compilation_config = self.vllm_config.compilation_config
-        if compilation_config.mode == CompilationMode.VLLM_COMPILE:
-            cudagraph_mode = compilation_config.cudagraph_mode
+        self.compilation_config = self.vllm_config.compilation_config
+        if self.compilation_config.mode == CompilationMode.VLLM_COMPILE:
+            cudagraph_mode = self.compilation_config.cudagraph_mode
             if cudagraph_mode != CUDAGraphMode.NONE and not cudagraph_mode.has_mode(
                 CUDAGraphMode.PIECEWISE
             ):
@@ -103,13 +103,6 @@ class EagleProposer:
             and not self.speculative_config.enforce_eager
         )
-        self.cudagraph_batch_sizes = (
-            (sorted(self.vllm_config.compilation_config.cudagraph_capture_sizes))
-            if self.use_cuda_graph
-            else []
-        )
-
-        self.use_cuda_graph = self.use_cuda_graph and bool(self.cudagraph_batch_sizes)
         # persistent buffers for cuda graph
         self.input_ids = torch.zeros(
             self.max_num_tokens, dtype=torch.int32, device=device
         )
@@ -276,7 +269,10 @@ class EagleProposer:
             per_layer_attn_metadata[layer_name] = draft_indexer_metadata
 
         cudagraph_runtime_mode = CUDAGraphMode.NONE
-        if self.use_cuda_graph and num_tokens <= self.cudagraph_batch_sizes[-1]:
+        if (
+            self.use_cuda_graph
+            and num_tokens <= self.compilation_config.max_cudagraph_capture_size
+        ):
             num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
             cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
         else:
@@ -366,7 +362,10 @@ class EagleProposer:
         # Generate the remaining draft tokens.
         draft_token_ids_list = [draft_token_ids]
 
-        if self.use_cuda_graph and batch_size <= self.cudagraph_batch_sizes[-1]:
+        if (
+            self.use_cuda_graph
+            and batch_size <= self.compilation_config.max_cudagraph_capture_size
+        ):
             input_batch_size = self.vllm_config.pad_for_cudagraph(batch_size)
             cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
         else:
@@ -777,7 +776,10 @@ class EagleProposer:
         self.positions[:num_tokens] = tree_positions.view(-1)
         self.hidden_states[:num_tokens] = tree_hidden_states.view(num_tokens, -1)
 
-        if self.use_cuda_graph and num_tokens <= self.cudagraph_batch_sizes[-1]:
+        if (
+            self.use_cuda_graph
+            and num_tokens <= self.compilation_config.max_cudagraph_capture_size
+        ):
             num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
             cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
         else:
@@ -1114,7 +1116,10 @@
     ) -> None:
         # Determine if CUDA graphs should be used for this run.
         cudagraphs_enabled = use_cudagraphs and self.use_cuda_graph
-        if cudagraphs_enabled and num_tokens <= self.cudagraph_batch_sizes[-1]:
+        if (
+            cudagraphs_enabled
+            and num_tokens <= self.compilation_config.max_cudagraph_capture_size
+        ):
             num_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
 
         with set_forward_context(
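
Note for reviewers (not part of the patch to apply): a minimal standalone
sketch of the rounding behavior the vllm/config/compilation.py hunk guards
against. The names below (round_capture_sizes, capture_sizes,
max_capture_size) are hypothetical stand-ins for illustration, not vLLM's
actual API; only the fallback logic mirrors the added lines above.

    # Hypothetical sketch; names are illustrative, not vLLM's API.
    def round_capture_sizes(
        capture_sizes: list[int], multiple_of: int, max_capture_size: int
    ) -> list[int]:
        # Round each capture size down to a multiple of `multiple_of`
        # (num_speculative_tokens + 1); sizes that round to 0 or exceed
        # the cap are dropped.
        rounded_sizes = sorted(
            {
                (size // multiple_of) * multiple_of
                for size in capture_sizes
                if 0 < (size // multiple_of) * multiple_of <= max_capture_size
            }
        )
        # The fallback added by this patch: if every size rounded away but
        # multiple_of itself fits under the cap, keep that one size so a
        # single graph can cover a full speculative decode step.
        if not rounded_sizes and multiple_of <= max_capture_size:
            rounded_sizes = [multiple_of]
        if not rounded_sizes:
            raise ValueError("no valid cudagraph sizes; adjust the config")
        return rounded_sizes

    # num_speculative_tokens = 4 (even) -> one decode step spans 5 tokens.
    # With the old test fixture's lone capture size of 4, every size rounded
    # down to 0, the list came back empty, and EagleProposer later indexed
    # cudagraph_batch_sizes[-1] -> IndexError: list index out of range.
    print(round_capture_sizes([4], multiple_of=5, max_capture_size=8))     # [5]
    print(round_capture_sizes([4, 8], multiple_of=5, max_capture_size=8))  # [5]

The eagle.py hunks close the same hole from the other side: instead of
indexing a possibly-empty cudagraph_batch_sizes list, the proposer now
compares token counts against compilation_config.max_cudagraph_capture_size,
which is always a plain scalar once the config validation above succeeds.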