perf: Avoid copying inputs_embeds tensors to GPU unless prompt_embeds is enabled (#25739)
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
commit d48f4d6daf (parent e84e0735c7)
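
In short, the host-to-device copies of the inputs_embeds and is_token_ids staging buffers are now gated on enable_prompt_embeds, since those buffers are only consumed when prompt embeddings are in use. The sketch below is a minimal illustration of the pattern, using a hypothetical CpuGpuBuffer helper; only the attribute and flag names are taken from the diff, the rest is illustrative and not vLLM's actual implementation.

    import torch


    class CpuGpuBuffer:
        """Hypothetical paired CPU/GPU staging buffer (a stand-in, not vLLM's helper)."""

        def __init__(self, size: int, dtype: torch.dtype):
            cuda = torch.cuda.is_available()
            # Pinned host memory when CUDA is present, so H2D copies can be async.
            self.cpu = torch.zeros(size, dtype=dtype, pin_memory=cuda)
            self.gpu = torch.zeros(size, dtype=dtype,
                                   device="cuda" if cuda else "cpu")

        def copy_to_gpu(self, n: int) -> None:
            # Asynchronous host-to-device copy of the first n elements.
            self.gpu[:n].copy_(self.cpu[:n], non_blocking=True)


    def prepare_input_ids(runner, total_num_scheduled_tokens: int) -> None:
        # Token ids are needed on the GPU in every mode.
        runner.input_ids.copy_to_gpu(total_num_scheduled_tokens)
        # inputs_embeds and is_token_ids are only read when prompt embeddings
        # are enabled, so the H2D copies can be skipped otherwise.
        if runner.enable_prompt_embeds:
            runner.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
            runner.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)

Skipping the copies also skips the copy launches and pinned-memory traffic behind them, which is presumably where the perf win comes from; the diff below applies this guard at each call site.
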
@@ -836,8 +836,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         if self.input_batch.prev_sampled_token_ids is None:
             # Normal scheduling case
             self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
-            self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
-            self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
+            if self.enable_prompt_embeds:
+                self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
+                self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
             return
 
         # Async scheduling case, where some decode requests from the previous
@@ -863,8 +864,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             # If not all requests are decodes from the last iteration,
             # We need to copy the input_ids_cpu to the GPU first.
             self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
-            self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
-            self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
+            if self.enable_prompt_embeds:
+                self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
+                self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
         if num_commmon_tokens == 0:
             # No requests in common with the previous iteration
             # So input_ids_cpu will have all the input ids.
@@ -878,7 +880,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 self.input_batch.prev_sampled_token_ids[:num_commmon_tokens,
                                                         0],
                 non_blocking=True)
-            self.is_token_ids.gpu[:num_commmon_tokens] = True
+            if self.enable_prompt_embeds:
+                self.is_token_ids.gpu[:num_commmon_tokens] = True
             return
         # Upload the index tensors asynchronously
         # so the scatter can be non-blocking.
@@ -978,12 +981,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             0,
             token_indices_tensor,
             out=self.input_ids.cpu[:total_num_scheduled_tokens])
-        is_token_ids = self.input_batch.is_token_ids.flatten()
-        torch.index_select(
-            is_token_ids,
-            0,
-            token_indices_tensor,
-            out=self.is_token_ids.cpu[:total_num_scheduled_tokens])
+        if self.enable_prompt_embeds:
+            is_token_ids = self.input_batch.is_token_ids.flatten()
+            torch.index_select(
+                is_token_ids,
+                0,
+                token_indices_tensor,
+                out=self.is_token_ids.cpu[:total_num_scheduled_tokens])
 
         # Because we did not pre-allocate a massive prompt_embeds CPU tensor on
         # the InputBatch, we need to fill in the prompt embeds into the expected
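
The last hunk applies the same guard on the CPU side: the torch.index_select gather that fills the is_token_ids CPU staging buffer is skipped when prompt embeds are off. A rough, self-contained illustration of that gather pattern follows; the buffer size, function name, and example data are assumptions made for this sketch, not vLLM code.

    import torch

    # Hypothetical staging buffer size, for illustration only.
    MAX_TOKENS = 4096
    is_token_ids_cpu = torch.zeros(MAX_TOKENS, dtype=torch.bool)

    def gather_is_token_ids(is_token_ids: torch.Tensor,
                            token_indices: torch.Tensor,
                            num_scheduled: int,
                            enable_prompt_embeds: bool) -> None:
        # The gathered mask is only consumed when prompt embeds are enabled,
        # so the gather itself is skipped otherwise.
        if not enable_prompt_embeds:
            return
        torch.index_select(is_token_ids.flatten(), 0, token_indices,
                           out=is_token_ids_cpu[:num_scheduled])

    # Example: a (num_reqs, max_len) boolean mask and flat token indices.
    mask = torch.tensor([[True, True, False], [True, False, False]])
    indices = torch.tensor([0, 1, 3])  # flat indices into mask.flatten()
    gather_is_token_ids(mask, indices, indices.numel(), enable_prompt_embeds=True)
    print(is_token_ids_cpu[:3])  # tensor([True, True, True])
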