Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
Woosuk Kwon 2025-01-01 03:16:56 -08:00
parent a6e5d7b5b7
commit 1260e43230
2 changed files with 3 additions and 5 deletions

View File

@@ -88,7 +88,7 @@ class BlockTable:
# Clear the source row.
self.block_table_diff_np[src].fill(0)
def apply_diff(self, num_reqs: int) -> None:
def commit(self, num_reqs: int) -> None:
if self.use_uva:
# Only copy the diff to the GPU.
ops.copy_subranges(
@@ -103,6 +103,7 @@ class BlockTable:
# table is large.
self.block_table[:num_reqs].copy_(self.block_table_cpu[:num_reqs],
non_blocking=True)
self.clear_diff()
def clear(self) -> None:
self.block_table.fill_(0)

View File

@@ -163,9 +163,6 @@ class GPUModelRunner:
self.seq_start_loc_np = self.seq_start_loc_cpu.numpy()
def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
# Clean up diffs.
self.input_batch.block_table.clear_diff()
# Remove stopped requests from the cached states.
# Keep the states of the pre-empted requests.
for req_id in scheduler_output.finished_req_ids:
@@ -270,7 +267,7 @@ class GPUModelRunner:
# OPTIMIZATION: Start copying the block table first.
# This way, we can overlap the copy with the following CPU operations.
self.input_batch.block_table.apply_diff(num_reqs)
self.input_batch.block_table.commit(num_reqs)
# Get the number of scheduled tokens for each request.
# TODO: The Python loop can be slow. Optimize.