[BugFix][Spec Decode] Fix spec token ids in model runner (#20530)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Woosuk Kwon 2025-07-06 12:44:52 -07:00 committed by GitHub
parent 9fb52e523a
commit 9528e3a05e

@@ -528,19 +528,20 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                 start_token_index:end_token_index] = new_token_ids
             self.input_batch.num_tokens_no_spec[
                 req_index] = end_token_index
-            # Add spec_token_ids to token_ids_cpu.
-            spec_token_ids = (
-                scheduler_output.scheduled_spec_decode_tokens.get(
-                    req_id, ()))
-            if spec_token_ids:
-                start_index = end_token_index
-                end_token_index += len(spec_token_ids)
-                self.input_batch.token_ids_cpu[
-                    req_index,
-                    start_index:end_token_index] = spec_token_ids
-            # NOTE(woosuk): `num_tokens` here may include spec tokens.
-            self.input_batch.num_tokens[req_index] = end_token_index
+            # Add spec_token_ids to token_ids_cpu.
+            spec_token_ids = (
+                scheduler_output.scheduled_spec_decode_tokens.get(req_id, ()))
+            if spec_token_ids:
+                num_spec_tokens = len(spec_token_ids)
+                start_index = self.input_batch.num_tokens_no_spec[req_index]
+                end_token_index = start_index + num_spec_tokens
+                self.input_batch.token_ids_cpu[
+                    req_index, start_index:end_token_index] = spec_token_ids
+                # NOTE(woosuk): `num_tokens` here may include spec tokens.
+                self.input_batch.num_tokens[req_index] += num_spec_tokens
         # Add the new or resumed requests to the persistent batch.
         # The smaller empty indices are filled first.
         for req_id in req_ids_to_add:
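
The gist of the change: the slice that receives the spec token ids now starts at `num_tokens_no_spec[req_index]` instead of the `end_token_index` left over from writing the new (non-spec) tokens, and `num_tokens` is incremented by the spec token count rather than overwritten. Below is a minimal, self-contained sketch of the fixed indexing. The state here (plain lists and a NumPy array standing in for `token_ids_cpu`, `num_tokens_no_spec`, `num_tokens`) and the helper name `append_spec_tokens` are hypothetical simplifications for illustration, not the real `GPUModelRunner` / `InputBatch` code.

    # Sketch (not vLLM code) of the fixed spec-token indexing.
    import numpy as np

    MAX_TOKENS = 16
    token_ids_cpu = np.zeros((1, MAX_TOKENS), dtype=np.int64)  # one request row
    num_tokens_no_spec = [5]  # non-spec tokens already stored for request 0
    num_tokens = [5]          # total tokens, may later include spec tokens

    def append_spec_tokens(req_index: int, spec_token_ids: list[int]) -> None:
        if not spec_token_ids:
            return
        num_spec_tokens = len(spec_token_ids)
        # Start right after the non-spec tokens, mirroring the fixed logic.
        start_index = num_tokens_no_spec[req_index]
        end_index = start_index + num_spec_tokens
        token_ids_cpu[req_index, start_index:end_index] = spec_token_ids
        # Bump the total by the spec count instead of overwriting it.
        num_tokens[req_index] += num_spec_tokens

    append_spec_tokens(0, [101, 102, 103])
    print(token_ids_cpu[0, :num_tokens[0]])  # last 3 entries are the spec tokens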