mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-13 12:46:07 +08:00
add truncation
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
parent
fbc2cc8217
commit
3a8990743e
@@ -2056,7 +2056,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
pooler_output.append(output)
|
pooler_output.append(output)
|
||||||
|
|
||||||
return ModelRunnerOutput(
|
return ModelRunnerOutput(
|
||||||
req_ids=self.input_batch.req_ids,
|
# NOTE(woosuk): input_batch.req_ids may include requests that are
|
||||||
|
# not scheduled in this step. Therefore, we truncate it here.
|
||||||
|
req_ids=self.input_batch.req_ids[: self.input_batch.num_reqs],
|
||||||
req_id_to_index=self.input_batch.req_id_to_index,
|
req_id_to_index=self.input_batch.req_id_to_index,
|
||||||
sampled_token_ids=[],
|
sampled_token_ids=[],
|
||||||
logprobs=None,
|
logprobs=None,
|
||||||
@@ -2269,7 +2271,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
|
|
||||||
# Copy some objects so they don't get modified after returning.
|
# Copy some objects so they don't get modified after returning.
|
||||||
# This is important when using async scheduling.
|
# This is important when using async scheduling.
|
||||||
req_ids_output_copy = self.input_batch.req_ids.copy()
|
# NOTE(woosuk): input_batch.req_ids may include requests that are
|
||||||
|
# not scheduled in this step. Therefore, we truncate it here.
|
||||||
|
num_reqs = self.input_batch.num_reqs
|
||||||
|
req_ids_output_copy = self.input_batch.req_ids[:num_reqs].copy()
|
||||||
req_id_to_index_output_copy = self.input_batch.req_id_to_index.copy()
|
req_id_to_index_output_copy = self.input_batch.req_id_to_index.copy()
|
||||||
|
|
||||||
# NOTE: GPU -> CPU Sync happens here.
|
# NOTE: GPU -> CPU Sync happens here.
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user