[V1][Minor] Set pin_memory=False for token_ids_cpu tensor (#11581)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-12-18 03:35:01 +08:00 · 2024-12-28 22:33:12 +09:00 · 2024-12-28 22:33:12 +09:00 · 42bb201fd6
commit 42bb201fd6
parent 59d6bb4c86
1 changed files with 3 additions and 1 deletions
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -57,11 +57,13 @@ class InputBatch:
        # TODO(woosuk): This buffer could be too large if max_model_len is big.
        # Find a way to reduce the CPU memory usage.
+        # This buffer is not directly transferred to the GPU, so it does not
+        # need to be pinned.
        self.token_ids_cpu_tensor = torch.zeros(
            (max_num_reqs, max_model_len),
            device="cpu",
            dtype=torch.int32,
-            pin_memory=pin_memory,
+            pin_memory=False,
        )
        self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()
        self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32)