[Core][Bookkeeping Optimization] Update against numpy view of is_token_ids tensor (#27618)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
parent d34f5fe939
commit b46e4a06f1
@@ -108,9 +108,10 @@ class InputBatch:
             pin_memory=False,
         )
         self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()
-        self.is_token_ids = torch.zeros(
+        self.is_token_ids_tensor = torch.zeros(
             (max_num_reqs, max_model_len), device="cpu", dtype=bool, pin_memory=False
         )
+        self.is_token_ids = self.is_token_ids_tensor.numpy()
         # Store prompt embeddings per request to avoid OOM from large upfront
         # allocation if max_model_len is big.
         # Maps req_index -> tensor of shape (num_prompt_tokens, hidden_size)
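The net effect of this hunk: InputBatch keeps both a torch tensor (is_token_ids_tensor) and a zero-copy numpy view of it (is_token_ids), so hot-path bookkeeping can write through numpy while torch ops still read the same storage. A minimal standalone sketch of the pattern, with illustrative shapes rather than the real vLLM constructor arguments:

import torch

# Illustrative shapes only; the real values come from the vLLM config.
max_num_reqs, max_model_len = 4, 8

# One CPU allocation backs both views.
is_token_ids_tensor = torch.zeros(
    (max_num_reqs, max_model_len), device="cpu", dtype=bool, pin_memory=False
)
# Tensor.numpy() on a CPU tensor is zero-copy: the ndarray shares the
# tensor's storage, so writes on either side are visible to the other.
is_token_ids = is_token_ids_tensor.numpy()

# Per-token bookkeeping writes go through numpy indexing, which has much
# lower per-call dispatch overhead than torch indexing for small updates.
is_token_ids[0, :3] = True
assert bool(is_token_ids_tensor[0, 2]) and not bool(is_token_ids_tensor[0, 3])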
@@ -1103,7 +1103,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             out=self.input_ids.cpu[:total_num_scheduled_tokens],
         )
         if self.enable_prompt_embeds:
-            is_token_ids = self.input_batch.is_token_ids.flatten()
+            is_token_ids = self.input_batch.is_token_ids_tensor.flatten()
             torch.index_select(
                 is_token_ids,
                 0,
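The second hunk updates the read side in GPUModelRunner to match: torch.index_select requires a torch.Tensor, so it must flatten the underlying tensor rather than the numpy view. A sketch of that read path, continuing the illustrative setup above (names are not the vLLM API):

import torch

# Bookkeeping write goes through the zero-copy numpy view, as in hunk 1.
is_token_ids_tensor = torch.zeros((4, 8), device="cpu", dtype=bool)
is_token_ids_tensor.numpy()[0, :3] = True

# The torch read path operates on the tensor, observing the numpy writes.
flat = is_token_ids_tensor.flatten()
token_positions = torch.tensor([0, 2, 3])  # hypothetical scheduled positions
selected = torch.index_select(flat, 0, token_positions)
assert selected.tolist() == [True, True, False]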