From be493e0b3cfb5810d254e9845217878a39a4853b Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 27 Nov 2025 16:45:38 -0500 Subject: [PATCH] [BugFix] Fix new nightly failures (#29578) Signed-off-by: Lucas Wilkinson --- vllm/v1/attention/backends/utils.py | 26 ++++++++++++++++++++++++++ vllm/v1/worker/gpu_model_runner.py | 12 +++++++++++- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 6e0d84e4fb4ac..27f07218d9b2e 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -100,6 +100,32 @@ class CommonAttentionMetadata: dcp_local_seq_lens_cpu: torch.Tensor | None = None """Sequence lengths of the local rank in decode context parallelism world""" + # TODO(lucas): remove once we have FULL-CG spec-decode support + def unpadded( + self, num_actual_tokens: int, num_actual_reqs: int + ) -> "CommonAttentionMetadata": + maybe_slice_reqs = lambda x: x[:num_actual_reqs] if x is not None else None + return CommonAttentionMetadata( + query_start_loc=self.query_start_loc[: num_actual_reqs + 1], + query_start_loc_cpu=self.query_start_loc_cpu[: num_actual_reqs + 1], + seq_lens=self.seq_lens[:num_actual_reqs], + seq_lens_cpu=self.seq_lens_cpu[:num_actual_reqs], + num_computed_tokens_cpu=self.num_computed_tokens_cpu[:num_actual_reqs], + num_reqs=num_actual_reqs, + num_actual_tokens=num_actual_tokens, + max_query_len=self.max_query_len, + max_seq_len=self.max_seq_len, + block_table_tensor=self.block_table_tensor[:num_actual_reqs], + slot_mapping=self.slot_mapping[:num_actual_tokens], + causal=self.causal, + logits_indices_padded=self.logits_indices_padded, + num_logits_indices=self.num_logits_indices, + encoder_seq_lens=maybe_slice_reqs(self.encoder_seq_lens), + encoder_seq_lens_cpu=maybe_slice_reqs(self.encoder_seq_lens_cpu), + dcp_local_seq_lens=maybe_slice_reqs(self.dcp_local_seq_lens), + 
dcp_local_seq_lens_cpu=maybe_slice_reqs(self.dcp_local_seq_lens_cpu), + ) + def slice_query_start_locs( query_start_loc: torch.Tensor, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0ae4eb48acf22..6bff83658b45a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1551,7 +1551,7 @@ class GPUModelRunner( # Encoder-only layers do not have KV cache, so we need to # create a dummy block table and slot mapping for them. blk_table_tensor = torch.zeros( - (num_tokens_padded, 1), + (num_reqs_padded, 1), dtype=torch.int32, device=self.device, ) @@ -1652,6 +1652,16 @@ class GPUModelRunner( for layer_name in attn_group.layer_names: attn_metadata[layer_name] = attn_metadata_i + if spec_decode_common_attn_metadata is not None and ( + num_reqs != num_reqs_padded or num_tokens != num_tokens_padded + ): + # Currently the drafter still only uses piecewise cudagraphs (and modifies + # the attention metadata directly), and therefore does not want to use + # padded attention metadata. + spec_decode_common_attn_metadata = ( + spec_decode_common_attn_metadata.unpadded(num_tokens, num_reqs) + ) + return attn_metadata, spec_decode_common_attn_metadata def _compute_cascade_attn_prefix_lens(