mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-18 03:35:01 +08:00
[V1][Minor] Set pin_memory=False for token_ids_cpu tensor (#11581)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
parent
59d6bb4c86
commit
42bb201fd6
@@ -57,11 +57,13 @@ class InputBatch:
|
|||||||
|
|
||||||
# TODO(woosuk): This buffer could be too large if max_model_len is big.
|
# TODO(woosuk): This buffer could be too large if max_model_len is big.
|
||||||
# Find a way to reduce the CPU memory usage.
|
# Find a way to reduce the CPU memory usage.
|
||||||
|
# This buffer is not directly transferred to the GPU, so it does not
|
||||||
|
# need to be pinned.
|
||||||
self.token_ids_cpu_tensor = torch.zeros(
|
self.token_ids_cpu_tensor = torch.zeros(
|
||||||
(max_num_reqs, max_model_len),
|
(max_num_reqs, max_model_len),
|
||||||
device="cpu",
|
device="cpu",
|
||||||
dtype=torch.int32,
|
dtype=torch.int32,
|
||||||
pin_memory=pin_memory,
|
pin_memory=False,
|
||||||
)
|
)
|
||||||
self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()
|
self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()
|
||||||
self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32)
|
self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user