From b2ea5ba6776fdf7343f24acfee9de8604e1d5d6d Mon Sep 17 00:00:00 2001
From: 7mile
Date: Tue, 7 Oct 2025 02:24:22 +0800
Subject: [PATCH] [Bugfix][Spec Decode] Fix wrong valid_mask for padded
 speculation when chunked prefill occurs (#26231)

Signed-off-by: seven-mile
Signed-off-by: Benjamin Chislett
Co-authored-by: Benjamin Chislett
---
 vllm/v1/spec_decode/eagle.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index 5d4822a6279b2..1e1161727be1e 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -522,13 +522,9 @@ class EagleProposer:
         )
 
         # Generate a mask for all valid tokens within those requests
-        max_gen_len = sampled_token_ids.shape[-1]
-        if max_gen_len == 1:
-            valid_mask = torch.ones_like(valid_sampled_token_ids_gpu, dtype=torch.bool)
-        else:
-            valid_mask = (valid_sampled_token_ids_gpu != -1) & (
-                valid_sampled_token_ids_gpu < gpu_input_batch.vocab_size
-            )
+        valid_mask = (valid_sampled_token_ids_gpu != -1) & (
+            valid_sampled_token_ids_gpu < gpu_input_batch.vocab_size
+        )
 
         # Count the number of valid tokens in each request
         valid_sampled_tokens_count = valid_mask.sum(dim=1)
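
Why the removed special case was wrong, as a minimal standalone sketch (not
vLLM code): when chunked prefill occurs, a request that is still prefilling
produces no sampled token for the step, encoded as the placeholder -1, even
when max_gen_len == 1. The old max_gen_len == 1 fast path assumed every slot
held a valid token, so that placeholder was counted as real. The toy tensor,
variable names, and vocab size below are illustrative assumptions, not the
actual vLLM data.

import torch

vocab_size = 32000  # hypothetical vocab size for illustration

# Shape [num_reqs, max_gen_len] with max_gen_len == 1: a single decode step
# with no speculative tokens. Request 1 is mid chunked-prefill, so its slot
# holds the placeholder -1 instead of a sampled token id.
valid_sampled_token_ids_gpu = torch.tensor([[123], [-1], [456]])

# Old behavior: the max_gen_len == 1 branch marked every slot as valid.
old_mask = torch.ones_like(valid_sampled_token_ids_gpu, dtype=torch.bool)
print(old_mask.sum(dim=1))  # tensor([1, 1, 1]) -- request 1 wrongly counted

# New behavior: always reject placeholders and padded ids >= vocab_size.
new_mask = (valid_sampled_token_ids_gpu != -1) & (
    valid_sampled_token_ids_gpu < vocab_size
)
print(new_mask.sum(dim=1))  # tensor([1, 0, 1]) -- request 1 excluded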