From 78555c06a20fff084b9cc4f80a444928b36ed18f Mon Sep 17 00:00:00 2001
From: zhuhaoran <zhuhaoran.zhr@alibaba-inc.com>
Date: Fri, 12 Dec 2025 00:02:47 +0800
Subject: [PATCH] Fix async spec decode placeholder and draft token id updates

Signed-off-by: zhuhaoran <zhuhaoran.zhr@alibaba-inc.com>
---
 vllm/v1/worker/gpu_input_batch.py  | 20 +++++++++++++++++---
 vllm/v1/worker/gpu_model_runner.py | 13 +++++--------
 2 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 10f5df96f9535..786dfe95106e6 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -952,7 +952,18 @@ class InputBatch:
                 sampled_token_ids = self.sampled_token_ids_cpu.tolist()
             # Replace placeholder token id(s) with actual sampled id(s).
             if sampled_ids := sampled_token_ids[prev_index]:
-                req_output_token_ids[-len(sampled_ids) :] = sampled_ids
+                num_placeholders = 0
+                for t in reversed(req_output_token_ids):
+                    if t == -1:
+                        num_placeholders += 1
+                    else:
+                        break
+                if num_placeholders == 0:
+                    continue
+                assert num_placeholders <= len(sampled_ids)
+                req_output_token_ids[-num_placeholders:] = sampled_ids[
+                    :num_placeholders
+                ]
 
     def update_async_spec_token_ids(
         self, draft_token_ids_cpu: list[list[int]] | None
@@ -971,13 +982,16 @@ class InputBatch:
             return
 
         for index, req_id in enumerate(self.req_ids):
-            prev_index = self.prev_req_id_to_index.get(req_id, default=None)
+            prev_index = self.prev_req_id_to_index.get(req_id)
             if prev_index is None:
                 continue
             assert prev_index < len(draft_token_ids_cpu)
             draft_ids = draft_token_ids_cpu[prev_index]
+            if not draft_ids:
+                continue
             assert index < len(spec_token_ids)
-            spec_token_ids[index] = draft_ids
+            spec_token_ids[index].clear()
+            spec_token_ids[index].extend(draft_ids)
 
     @property
     def num_reqs(self) -> int:
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 47f9ac90b7183..5c85a9bf399d9 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -3423,14 +3423,13 @@ class GPUModelRunner(
         for async scheduling with spec decode + penalty/bad_words.
         Returns None if no draft tokens were copied in previous step.
         """
+        if not self._has_draft_tokens:
+            return None
+
         if isinstance(self._draft_token_ids, list):
             return self._draft_token_ids
 
-        if (
-            self.draft_token_ids_copy_event is None
-            or self.draft_token_ids_cpu is None
-            or not self._has_draft_tokens
-        ):
+        if self.draft_token_ids_copy_event is None or self.draft_token_ids_cpu is None:
             return None
 
         self._has_draft_tokens = False
@@ -3600,9 +3599,7 @@ class GPUModelRunner(
                 mm_embed_inputs=mm_embed_inputs,
             )
 
-        self._copy_draft_token_ids(
-            self._draft_token_ids, self.input_batch.num_reqs
-        )
+        self._copy_draft_token_ids(draft_token_ids, self.input_batch.num_reqs)
         return draft_token_ids
 
     def update_config(self, overrides: dict[str, Any]) -> None: