diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 476c3edefb84..bc7578cbd97c 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -108,9 +108,10 @@ class InputBatch:
             pin_memory=False,
         )
         self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()
-        self.is_token_ids = torch.zeros(
+        self.is_token_ids_tensor = torch.zeros(
             (max_num_reqs, max_model_len), device="cpu", dtype=bool, pin_memory=False
         )
+        self.is_token_ids = self.is_token_ids_tensor.numpy()
         # Store prompt embeddings per request to avoid OOM from large upfront
         # allocation if max_model_len is big.
         # Maps req_index -> tensor of shape (num_prompt_tokens, hidden_size)
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index a110ad54a05e..129d7e54466a 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1103,7 +1103,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 out=self.input_ids.cpu[:total_num_scheduled_tokens],
             )
             if self.enable_prompt_embeds:
-                is_token_ids = self.input_batch.is_token_ids.flatten()
+                is_token_ids = self.input_batch.is_token_ids_tensor.flatten()
                 torch.index_select(
                     is_token_ids,
                     0,
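
The diff keeps both a `torch.Tensor` (`is_token_ids_tensor`) and a NumPy view of it (`is_token_ids`), mirroring the existing `token_ids_cpu_tensor` / `token_ids_cpu` pair. Below is a minimal standalone sketch (not the vLLM code itself; the sizes are illustrative) of the memory-sharing pattern this relies on: a CPU tensor and the array returned by `.numpy()` share storage, so per-request bookkeeping can go through the NumPy side while torch ops such as `flatten()` / `index_select()` use the tensor side.

```python
import torch

# Illustrative sizes, not the real vLLM configuration values.
max_num_reqs, max_model_len = 4, 8

# CPU bool tensor plus a NumPy view over the same storage,
# analogous to is_token_ids_tensor / is_token_ids in the diff.
is_token_ids_tensor = torch.zeros(
    (max_num_reqs, max_model_len), device="cpu", dtype=bool, pin_memory=False
)
is_token_ids = is_token_ids_tensor.numpy()

# Write through the NumPy view (cheap per-request bookkeeping)...
is_token_ids[0, :3] = True

# ...and the torch side observes the same data, so tensor ops
# like flatten() see the update without any copy.
flat = is_token_ids_tensor.flatten()
print(flat[:3])  # tensor([True, True, True])
```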