Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
Woosuk Kwon 2025-01-01 03:16:56 -08:00
parent a6e5d7b5b7
commit 1260e43230
2 changed files with 3 additions and 5 deletions

View File

@@ -88,7 +88,7 @@ class BlockTable:
# Clear the source row.
self.block_table_diff_np[src].fill(0)
def apply_diff(self, num_reqs: int) -> None:
def commit(self, num_reqs: int) -> None:
if self.use_uva:
# Only copy the diff to the GPU.
ops.copy_subranges(
@@ -103,6 +103,7 @@ class BlockTable:
# table is large.
self.block_table[:num_reqs].copy_(self.block_table_cpu[:num_reqs],
non_blocking=True)
self.clear_diff()
def clear(self) -> None:
self.block_table.fill_(0)

View File

@@ -163,9 +163,6 @@ class GPUModelRunner:
self.seq_start_loc_np = self.seq_start_loc_cpu.numpy()
def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
# Clean up diffs.
self.input_batch.block_table.clear_diff()
# Remove stopped requests from the cached states.
# Keep the states of the pre-empted requests.
for req_id in scheduler_output.finished_req_ids:
@@ -270,7 +267,7 @@ class GPUModelRunner:
# OPTIMIZATION: Start copying the block table first.
# This way, we can overlap the copy with the following CPU operations.
self.input_batch.block_table.apply_diff(num_reqs)
self.input_batch.block_table.commit(num_reqs)
# Get the number of scheduled tokens for each request.
# TODO: The Python loop can be slow. Optimize.