From 78555c06a20fff084b9cc4f80a444928b36ed18f Mon Sep 17 00:00:00 2001 From: zhuhaoran Date: Fri, 12 Dec 2025 00:02:47 +0800 Subject: [PATCH] fix bug Signed-off-by: zhuhaoran --- vllm/v1/worker/gpu_input_batch.py | 20 +++++++++++++++++--- vllm/v1/worker/gpu_model_runner.py | 13 +++++-------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 10f5df96f9535..786dfe95106e6 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -952,7 +952,18 @@ class InputBatch: sampled_token_ids = self.sampled_token_ids_cpu.tolist() # Replace placeholder token id(s) with actual sampled id(s). if sampled_ids := sampled_token_ids[prev_index]: - req_output_token_ids[-len(sampled_ids) :] = sampled_ids + num_placeholders = 0 + for t in reversed(req_output_token_ids): + if t == -1: + num_placeholders += 1 + else: + break + if num_placeholders == 0: + continue + assert num_placeholders <= len(sampled_ids) + req_output_token_ids[-num_placeholders:] = sampled_ids[ + :num_placeholders + ] def update_async_spec_token_ids( self, draft_token_ids_cpu: list[list[int]] | None @@ -971,13 +982,16 @@ class InputBatch: return for index, req_id in enumerate(self.req_ids): - prev_index = self.prev_req_id_to_index.get(req_id, default=None) + prev_index = self.prev_req_id_to_index.get(req_id) if prev_index is None: continue assert prev_index < len(draft_token_ids_cpu) draft_ids = draft_token_ids_cpu[prev_index] + if not draft_ids: + continue assert index < len(spec_token_ids) - spec_token_ids[index] = draft_ids + spec_token_ids[index].clear() + spec_token_ids[index].extend(draft_ids) @property def num_reqs(self) -> int: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 47f9ac90b7183..5c85a9bf399d9 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3423,14 +3423,13 @@ class GPUModelRunner( for async scheduling with spec decode + penalty/bad_words. Returns None if no draft tokens were copied in previous step. """ + if not self._has_draft_tokens: + return None + if isinstance(self._draft_token_ids, list): return self._draft_token_ids - if ( - self.draft_token_ids_copy_event is None - or self.draft_token_ids_cpu is None - or not self._has_draft_tokens - ): + if self.draft_token_ids_copy_event is None or self.draft_token_ids_cpu is None: return None self._has_draft_tokens = False @@ -3600,9 +3599,7 @@ class GPUModelRunner( mm_embed_inputs=mm_embed_inputs, ) - self._copy_draft_token_ids( - self._draft_token_ids, self.input_batch.num_reqs - ) + self._copy_draft_token_ids(draft_token_ids, self.input_batch.num_reqs) return draft_token_ids def update_config(self, overrides: dict[str, Any]) -> None: