From 70189d8eb0863d499014616cf90cd3b7a9bbc9e5 Mon Sep 17 00:00:00 2001
From: zhuhaoran
Date: Fri, 5 Dec 2025 15:51:43 +0800
Subject: [PATCH] fix: copy actual sampled_token_ids to req_output_token_ids

Signed-off-by: zhuhaoran
---
 vllm/v1/worker/gpu_input_batch.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index ead7a3619dea5..0f523eacfa4bf 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -949,9 +949,11 @@ class InputBatch:
             if sampled_token_ids is None:
                 assert self.async_copy_ready_event is not None
                 self.async_copy_ready_event.synchronize()
-                sampled_token_ids = self.sampled_token_ids_cpu.squeeze(-1).tolist()
+                sampled_token_ids = self.sampled_token_ids_cpu.tolist()
             # Replace placeholder token id with actual sampled id.
-            req_output_token_ids[-1] = sampled_token_ids[prev_index]
+            req_output_token_ids[-len(sampled_token_ids[prev_index]) :] = (
+                sampled_token_ids[prev_index]
+            )
 
     @property
     def num_reqs(self) -> int: