From 19c0dfc4694c4d64148bbd39b633649f433fadf7 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Thu, 28 Aug 2025 13:08:07 -0700
Subject: [PATCH] minor

Signed-off-by: Woosuk Kwon
---
 vllm/v1/worker/gpu_model_runner.py  | 24 +++++++++---------------
 vllm/v1/worker/gpu_worker_states.py |  1 +
 2 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 37df18b231057..ad8312c86d2e7 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -269,6 +269,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             device=self.device,
             pin_memory=self.pin_memory,
         )
+        self.idx_mapping = self._make_buffer(self.max_num_reqs,
+                                             dtype=torch.int32)

         # OPTIMIZATION: Cache the tensors rather than creating them every step.
         # Keep in int64 to avoid overflow with long context
@@ -277,13 +279,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                                            self.max_num_tokens),
                                           dtype=np.int64)

-        self.index_mapping_cpu = torch.zeros(self.max_num_reqs,
-                                             dtype=torch.int32,
-                                             device="cpu",
-                                             pin_memory=self.pin_memory)
-        self.index_mapping_np = self.index_mapping_cpu.numpy()
-        self.index_mapping = self.index_mapping_cpu.to(self.device)
-
         # Layer pairings for cross-layer KV sharing.
         # If an Attention layer `layer_name` is in the keys of this dict, it
         # means this layer will perform attention using the keys and values
@@ -572,10 +567,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         idx_mapping_list = [
             self.requests.req_id_to_index[req_id] for req_id in req_ids
         ]
-        self.index_mapping_np[:num_reqs] = idx_mapping_list
-        index_mapping_np = self.index_mapping_np[:num_reqs]
-        idx_mapping = self.index_mapping[:num_reqs].copy_(
-            self.index_mapping_cpu[:num_reqs], non_blocking=True)
+        self.idx_mapping.np[:num_reqs] = idx_mapping_list
+        idx_mapping_np = self.idx_mapping.np[:num_reqs]
+        idx_mapping = self.idx_mapping.copy_to_gpu(num_reqs)

         # OPTIMIZATION: Start copying the block table first.
         # This way, we can overlap the copy with the following CPU operations.
@@ -587,7 +581,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         max_num_scheduled_tokens = max(tokens)

         prepare_inputs(
-            idx_mapping=index_mapping_np,
+            idx_mapping=idx_mapping_np,
             token_ids=self.requests.token_ids.np,
             num_computed_tokens=self.requests.num_computed_tokens.np,
             num_scheduled_tokens=num_scheduled_tokens,
@@ -774,7 +768,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             num_scheduled_tokens=num_scheduled_tokens,
             req_id_to_batch_idx=req_id_to_batch_idx,
             idx_mapping=idx_mapping,
-            idx_mapping_np=index_mapping_np,
+            idx_mapping_np=idx_mapping_np,
             num_reqs=num_reqs,
             total_num_tokens=total_num_scheduled_tokens,
             max_num_tokens=max_num_scheduled_tokens,
@@ -1378,8 +1372,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens)
         num_input_tokens += num_pad

-        # _prepare_inputs may reorder the batch, so we must gather multi
-        # modal outputs after that to ensure the correct order
+        # _prepare_inputs decides the order of the requests, so we must gather
+        # multimodal outputs after that.
         if self.supports_mm_inputs:
             # Run the multimodal encoder if any.
             self._execute_mm_encoder(scheduler_output)
diff --git a/vllm/v1/worker/gpu_worker_states.py b/vllm/v1/worker/gpu_worker_states.py
index 9e7189e3bd40a..4aa656ce9ab89 100644
--- a/vllm/v1/worker/gpu_worker_states.py
+++ b/vllm/v1/worker/gpu_worker_states.py
@@ -249,6 +249,7 @@ class RequestState:
         batch_idx_to_req_idx: torch.Tensor,
     ) -> SamplingMetadata:
         batch_size = batch_idx_to_req_idx.shape[0]
+        # TODO(woosuk): Use UVA to optimize CPU -> GPU copy.
         _make_sampling_metadata_kernel[(batch_size, )](
             batch_idx_to_req_idx,
             self.temperature.mirror_to_gpu(),
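
Note for reviewers: the removed index_mapping_cpu / index_mapping_np / index_mapping trio is the classic pinned-buffer pattern that _make_buffer now encapsulates behind the .np view and copy_to_gpu() used above. Below is a minimal sketch of such a paired CPU/GPU buffer, assuming a CUDA device; the class name CpuGpuBuffer and its internals are illustrative, inferred from the removed code, not vLLM's actual implementation.

    # Illustrative sketch only: a paired CPU/GPU buffer matching the
    # interface this patch relies on (.np view + copy_to_gpu()). The
    # internals mirror the index_mapping_* code the patch removes.
    from typing import Optional

    import torch


    class CpuGpuBuffer:

        def __init__(self, *size: int, dtype: torch.dtype,
                     device: torch.device, pin_memory: bool) -> None:
            # Pinned (page-locked) host memory is required for truly
            # asynchronous, non-blocking H2D copies.
            self.cpu = torch.zeros(*size, dtype=dtype, device="cpu",
                                   pin_memory=pin_memory)
            # Zero-copy NumPy view of the host tensor: cheap slice
            # writes without torch indexing overhead on the hot path.
            self.np = self.cpu.numpy()
            self.gpu = self.cpu.to(device)

        def copy_to_gpu(self, n: Optional[int] = None) -> torch.Tensor:
            # Copy only the first n elements; non_blocking=True lets the
            # transfer overlap the CPU work that follows on this stream.
            if n is None:
                return self.gpu.copy_(self.cpu, non_blocking=True)
            self.gpu[:n].copy_(self.cpu[:n], non_blocking=True)
            return self.gpu[:n]


    # Usage mirroring the hot path above: write via .np, then ship to GPU.
    max_num_reqs, num_reqs = 8, 3
    buf = CpuGpuBuffer(max_num_reqs, dtype=torch.int32,
                       device=torch.device("cuda"), pin_memory=True)
    buf.np[:num_reqs] = [2, 0, 1]
    idx_mapping = buf.copy_to_gpu(num_reqs)  # GPU int32 tensor [2, 0, 1]

Staging writes through the NumPy view and issuing one non-blocking slice copy is also why the hunk kicks off copy_to_gpu() before the block-table copy and prepare_inputs(): the H2D transfer can overlap the CPU-side input preparation, as the in-code OPTIMIZATION comment notes.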