minimize fill_

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2026-06-26 15:17:23 +08:00 · 2025-02-04 22:03:51 +00:00 · 2025-02-04 22:03:51 +00:00 · 1244c25908
commit 1244c25908
parent 34fb0cbbd0
1 changed file with 2 additions and 7 deletions
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -457,15 +457,10 @@ class GPUModelRunner:
            self.slot_mapping_cpu[:total_num_scheduled_tokens],
            non_blocking=True)

-        # TODO: Which of these are needed?
+        # Fill unused with -1. Needed for reshape_and_cache
+        self.slot_mapping[total_num_scheduled_tokens:].fill_(-1)
        self.seq_lens[num_reqs:].fill_(0)
        self.query_start_loc[num_reqs + 1:].fill_(-1)
-        self.positions[total_num_scheduled_tokens:].fill_(0)
-        self.input_batch.block_table.get_device_tensor()[num_reqs:].fill_(-1)
-
-        # Fill with -1s -- needed for reshape_and_cache
-        self.slot_mapping[total_num_scheduled_tokens:].fill_(
-            -1)  # Definitely needed

        # Prepare for cascade attention if needed.
        common_prefix_len = (scheduler_output.num_common_prefix_blocks *