diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ec824f6d6bf5..f9bf6b720e40 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2056,7 +2056,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): pooler_output.append(output) return ModelRunnerOutput( - req_ids=self.input_batch.req_ids, + # NOTE(woosuk): input_batch.req_ids may include requests that are + # not scheduled in this step. Therefore, we truncate it here. + req_ids=self.input_batch.req_ids[: self.input_batch.num_reqs], req_id_to_index=self.input_batch.req_id_to_index, sampled_token_ids=[], logprobs=None, @@ -2269,7 +2271,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Copy some objects so they don't get modified after returning. # This is important when using async scheduling. - req_ids_output_copy = self.input_batch.req_ids.copy() + # NOTE(woosuk): input_batch.req_ids may include requests that are + # not scheduled in this step. Therefore, we truncate it here. + num_reqs = self.input_batch.num_reqs + req_ids_output_copy = self.input_batch.req_ids[:num_reqs].copy() req_id_to_index_output_copy = self.input_batch.req_id_to_index.copy() # NOTE: GPU -> CPU Sync happens here.