From 1260e432307247a0f0cf7d85fba164265f0bd327 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Wed, 1 Jan 2025 03:16:56 -0800
Subject: [PATCH] Minor

Signed-off-by: Woosuk Kwon
---
 vllm/v1/worker/gpu_block_table.py  | 3 ++-
 vllm/v1/worker/gpu_model_runner.py | 5 +----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/vllm/v1/worker/gpu_block_table.py b/vllm/v1/worker/gpu_block_table.py
index dd92ab4c3a592..5152394346ed9 100644
--- a/vllm/v1/worker/gpu_block_table.py
+++ b/vllm/v1/worker/gpu_block_table.py
@@ -88,7 +88,7 @@ class BlockTable:
         # Clear the source row.
         self.block_table_diff_np[src].fill(0)
 
-    def apply_diff(self, num_reqs: int) -> None:
+    def commit(self, num_reqs: int) -> None:
         if self.use_uva:
             # Only copy the diff to the GPU.
             ops.copy_subranges(
@@ -103,6 +103,7 @@ class BlockTable:
             # table is large.
             self.block_table[:num_reqs].copy_(self.block_table_cpu[:num_reqs],
                                               non_blocking=True)
+        self.clear_diff()
 
     def clear(self) -> None:
         self.block_table.fill_(0)
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 0ab8118167765..80a1aac667eb5 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -163,9 +163,6 @@ class GPUModelRunner:
         self.seq_start_loc_np = self.seq_start_loc_cpu.numpy()
 
     def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
-        # Clean up diffs.
-        self.input_batch.block_table.clear_diff()
-
         # Remove stopped requests from the cached states.
         # Keep the states of the pre-empted requests.
         for req_id in scheduler_output.finished_req_ids:
@@ -270,7 +267,7 @@ class GPUModelRunner:
 
         # OPTIMIZATION: Start copying the block table first.
         # This way, we can overlap the copy with the following CPU operations.
-        self.input_batch.block_table.apply_diff(num_reqs)
+        self.input_batch.block_table.commit(num_reqs)
 
         # Get the number of scheduled tokens for each request.
         # TODO: The Python loop can be slow. Optimize.