From 0f199f197b4e7a835ccc5b4d15363f8faa7824c8 Mon Sep 17 00:00:00 2001 From: JialinOuyang-Meta Date: Fri, 18 Jul 2025 12:34:40 -0700 Subject: [PATCH] [Core] Avoid KVCacheBlock.__eq__ invocations in FreeKVCacheBlockQueue (#21005) Signed-off-by: Jialin Ouyang --- benchmarks/kv_cache/benchmark_block_pool.py | 108 ++++++++++++++++++++ tests/v1/core/test_kv_cache_utils.py | 28 ++--- tests/v1/core/test_prefix_caching.py | 26 ++--- vllm/v1/core/kv_cache_utils.py | 104 +++++++++++++------ 4 files changed, 209 insertions(+), 57 deletions(-) create mode 100644 benchmarks/kv_cache/benchmark_block_pool.py diff --git a/benchmarks/kv_cache/benchmark_block_pool.py b/benchmarks/kv_cache/benchmark_block_pool.py new file mode 100644 index 0000000000000..134551bb61285 --- /dev/null +++ b/benchmarks/kv_cache/benchmark_block_pool.py @@ -0,0 +1,108 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import gc +import time +from typing import Optional + +from tabulate import tabulate + +from vllm.utils import FlexibleArgumentParser +from vllm.v1.core.block_pool import BlockPool + + +class Metric: + def __init__(self) -> None: + self.cnt: int = 0 + self.sum_v: int = 0 + self.max_v: Optional[int] = None + + def update(self, v: int) -> None: + self.cnt += 1 + self.sum_v += v + if self.max_v is None: + self.max_v = v + else: + self.max_v = max(self.max_v, v) + + def avg_v(self) -> float: + return self.sum_v * 1.0 / self.cnt + + +def main(args): + rows = [] + for allocate_block in args.allocate_blocks: + # Enforce a GC collect ahead to minimize the impact among runs + gc.collect() + block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True) + + get_blocks_metric: Metric = Metric() + free_blocks_metric: Metric = Metric() + for _ in range(args.num_iteration): + t1 = time.monotonic_ns() + blocks = block_pool.get_new_blocks(allocate_block) + t2 = time.monotonic_ns() + block_pool.free_blocks(blocks) + t3 = time.monotonic_ns() + get_blocks_metric.update(t2 - t1) + free_blocks_metric.update(t3 - t2) + + if get_blocks_metric.max_v is not None and free_blocks_metric.max_v is not None: + rows.append( + [ + get_blocks_metric.cnt, + args.num_gpu_blocks, + allocate_block, + get_blocks_metric.avg_v() / 1000000, + get_blocks_metric.max_v / 1000000.0, + free_blocks_metric.avg_v() / 1000000, + free_blocks_metric.max_v / 1000000.0, + ] + ) + else: + print( + "No valid metrics found." + f" {get_blocks_metric.max_v=} {free_blocks_metric.max_v=}" + ) + + print( + tabulate( + rows, + headers=[ + "Iterations", + "Total\nBlocks", + "Allocated\nBlocks", + "Get Blocks\nAvg (ms)", + "Get Blocks\nMax (ms)", + "Free Blocks\nAvg (ms)", + "Free Blocks\nMax (ms)", + ], + tablefmt="grid", + floatfmt=".6f", + ) + ) + + +def invoke_main() -> None: + parser = FlexibleArgumentParser( + description="Benchmark the performance of BlockPool for KV Cache." + ) + parser.add_argument("--num-gpu-blocks", type=int, default=100000) + parser.add_argument( + "--num-iteration", + type=int, + default=1000, + help="Number of iterations to run to stablize final data readings", + ) + parser.add_argument( + "--allocate-blocks", + type=int, + nargs="*", + default=[10, 50, 100, 500, 1000], + help="Number of blocks to allocate", + ) + args = parser.parse_args() + main(args) + + +if __name__ == "__main__": + invoke_main() # pragma: no cover diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 0676cb3eb65d0..68b0601569012 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -132,8 +132,8 @@ def test_free_kv_cache_block_queue_initialization(): block = KVCacheBlock(block_id=0) queue = FreeKVCacheBlockQueue([block]) assert queue.num_free_blocks == 1 - assert queue.free_list_head == block - assert queue.free_list_tail == block + assert queue.fake_free_list_head.next_free_block is block + assert queue.fake_free_list_tail.prev_free_block is block def test_free_kv_cache_block_queue_operations(): @@ -145,36 +145,38 @@ def test_free_kv_cache_block_queue_operations(): # Check initial state assert queue.num_free_blocks == 5 - assert queue.free_list_head == blocks[0] - assert queue.free_list_tail == blocks[4] + assert queue.fake_free_list_head.next_free_block is blocks[0] + assert queue.fake_free_list_tail.prev_free_block is blocks[4] # Pop the first block block1 = queue.popleft() assert block1 == blocks[0] assert queue.num_free_blocks == 4 - assert queue.free_list_head == blocks[1] - assert queue.free_list_tail == blocks[4] + assert queue.fake_free_list_head.next_free_block is blocks[1] + assert queue.fake_free_list_tail.prev_free_block is blocks[4] # Remove a block from the middle block_to_remove = blocks[2] queue.remove(block_to_remove) assert queue.num_free_blocks == 3 - assert blocks[1].next_free_block == blocks[3] - assert blocks[3].prev_free_block == blocks[1] + assert blocks[1].next_free_block is blocks[3] + assert blocks[3].prev_free_block is blocks[1] # Append a block back queue.append(block_to_remove) assert queue.num_free_blocks == 4 - assert queue.free_list_tail == block_to_remove - assert block_to_remove.prev_free_block == blocks[4] - assert block_to_remove.next_free_block is None + assert queue.fake_free_list_tail.prev_free_block is block_to_remove + assert block_to_remove.prev_free_block is blocks[4] + assert block_to_remove.next_free_block is queue.fake_free_list_tail # Pop blocks until empty for _ in range(4): queue.popleft() assert queue.num_free_blocks == 0 - assert queue.free_list_head is None - assert queue.free_list_tail is None + assert (queue.fake_free_list_head.next_free_block + is queue.fake_free_list_tail) + assert (queue.fake_free_list_tail.prev_free_block + is queue.fake_free_list_head) # Attempt to pop from an empty queue with pytest.raises(ValueError) as e: diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index f31bdf74f4a67..b7f583de1f631 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -155,13 +155,14 @@ def test_prefill(hash_algo): assert block.ref_cnt == 2 # At this point, we should have 5 free blocks left. - assert manager.block_pool.free_block_queue.num_free_blocks == 5 + free_block_queue = manager.block_pool.free_block_queue + assert free_block_queue.num_free_blocks == 5 manager.free(req0) manager.free(req1) # All blocks should be available. - assert manager.block_pool.free_block_queue.num_free_blocks == 10 + assert free_block_queue.num_free_blocks == 10 # The order should be # [unallocated (6, 7, 8, 9, 10)] # [unique_req0 (4)] @@ -188,14 +189,10 @@ def test_prefill(hash_algo): # Although we only have 6 free blocks, we have 8 blocks in # the free block queue due to lazy removal. - assert manager.block_pool.free_block_queue.num_free_blocks == 6 - assert all([ - b.ref_cnt == 0 - for b in manager.block_pool.free_block_queue.get_all_free_blocks() - ]) - assert len([ - b for b in manager.block_pool.free_block_queue.get_all_free_blocks() - ]) == 6 + assert free_block_queue.num_free_blocks == 6 + assert all( + [b.ref_cnt == 0 for b in free_block_queue.get_all_free_blocks()]) + assert len([b for b in free_block_queue.get_all_free_blocks()]) == 6 manager.free(req2) @@ -209,9 +206,12 @@ def test_prefill(hash_algo): computed_blocks) # This block ID order also checks the eviction order. assert blocks.get_block_ids() == ([7, 8, 9, 10, 4, 5, 6, 3, 2, 1], ) - assert manager.block_pool.free_block_queue.num_free_blocks == 0 - assert manager.block_pool.free_block_queue.free_list_head is None - assert manager.block_pool.free_block_queue.free_list_tail is None + + assert free_block_queue.num_free_blocks == 0 + assert (free_block_queue.fake_free_list_head.next_free_block + is free_block_queue.fake_free_list_tail) + assert (free_block_queue.fake_free_list_tail.prev_free_block + is free_block_queue.fake_free_list_head) def test_prefill_hybrid_model(): diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 6067a127e97fa..b1fab0d34de4c 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -212,27 +212,65 @@ class FreeKVCacheBlockQueue: def __init__(self, blocks: list[KVCacheBlock]) -> None: self.num_free_blocks = len(blocks) - # Initialize the doubly linked list of free blocks. - self.free_list_head: Optional[KVCacheBlock] = blocks[0] - self.free_list_tail: Optional[KVCacheBlock] = blocks[-1] + # Initialize doubly links of consecutive blocks for i in range(self.num_free_blocks): if i > 0: blocks[i].prev_free_block = blocks[i - 1] if i < self.num_free_blocks - 1: blocks[i].next_free_block = blocks[i + 1] + # Create a fake head and a tail block for the doubly linked list to + # reduce branching in the code + # + # The implementation garenteed that the fake head and tail + # are NEVER got popped, so we could safely assume each real blocks + # in the queue has prev and next blocks. + self.fake_free_list_head = KVCacheBlock(block_id=-1) + self.fake_free_list_tail = KVCacheBlock(block_id=-1) + if self.num_free_blocks > 0: + # Connect fake_head and fake_tail to the first and last block + # respectively. + self.fake_free_list_head.next_free_block = blocks[0] + blocks[0].prev_free_block = self.fake_free_list_head + self.fake_free_list_tail.prev_free_block = blocks[-1] + blocks[-1].next_free_block = self.fake_free_list_tail + else: + # For empty list, simply connect the fake head and tail. + self.fake_free_list_head.next_free_block = self.fake_free_list_tail + self.fake_free_list_tail.prev_free_block = self.fake_free_list_head + def popleft(self) -> KVCacheBlock: """Pop the first free block and reduce num_free_blocks by 1. Returns: The first free block. """ - if not self.free_list_head: + if (self.fake_free_list_head.next_free_block + is self.fake_free_list_tail + or self.fake_free_list_head.next_free_block is None): + assert self.num_free_blocks == 0, ( + f"num_free_blocks ({self.num_free_blocks}) is out of sync " + "with the free list.") raise ValueError("No free blocks available") - block = self.free_list_head - self.remove(block) - return block + first_block: KVCacheBlock = self.fake_free_list_head.next_free_block + + if first_block.next_free_block is None: + # This should not happen if the block is from the free list. + # It indicates a bug in the caller's logic. + raise RuntimeError("Invalid block found in popleft() " + "which doesn't have a valid next_free_block") + + # Connect fake_head and the next block of first_block (i.e. second block + # or fake tail). + self.fake_free_list_head.next_free_block = first_block.next_free_block + first_block.next_free_block.prev_free_block = self.fake_free_list_head + + # Remove the block from the linked list. + first_block.prev_free_block = first_block.next_free_block = None + + self.num_free_blocks -= 1 + return first_block def remove(self, block: KVCacheBlock) -> None: """Remove a block in the free list and reduce num_free_blocks by 1. @@ -240,19 +278,15 @@ class FreeKVCacheBlockQueue: Args: block: The block to remove. """ - if block.prev_free_block is not None: - # Link the previous block to the next block. - block.prev_free_block.next_free_block = block.next_free_block - if block.next_free_block is not None: - # Link the next block to the previous block. - block.next_free_block.prev_free_block = block.prev_free_block + if block.prev_free_block is None or block.next_free_block is None: + # This should not happen if the block is from the free list. + # It indicates a bug in the caller's logic. + raise RuntimeError(f"remove() called on an invalid block: {block}") - if block == self.free_list_head: - # Update the head if the block is the head. - self.free_list_head = block.next_free_block - if block == self.free_list_tail: - # Update the tail if the block is the tail. - self.free_list_tail = block.prev_free_block + # Link the previous block to the next block. + block.prev_free_block.next_free_block = block.next_free_block + # Link the next block to the previous block. + block.next_free_block.prev_free_block = block.prev_free_block # Remove the block from the linked list. block.prev_free_block = block.next_free_block = None @@ -265,17 +299,19 @@ class FreeKVCacheBlockQueue: Args: block: The block to append. """ - if self.free_list_tail is not None: - # Link the last block to the new block. - self.free_list_tail.next_free_block = block - block.prev_free_block = self.free_list_tail - self.free_list_tail = block - else: - # The free list is empty. - assert self.free_list_head is None - self.free_list_head = self.free_list_tail = block + if self.fake_free_list_tail.prev_free_block is None: + raise RuntimeError( + "prev_free_block of fake_free_list_tail should always exist") + last_block: KVCacheBlock = self.fake_free_list_tail.prev_free_block + + # Connect the new block after the last block. + last_block.next_free_block = block + block.prev_free_block = last_block + + # Connect the fake tail after the new block. + block.next_free_block = self.fake_free_list_tail + self.fake_free_list_tail.prev_free_block = block - block.next_free_block = None self.num_free_blocks += 1 def get_all_free_blocks(self) -> list[KVCacheBlock]: @@ -285,8 +321,14 @@ class FreeKVCacheBlockQueue: A list of free blocks. """ ret = [] - curr_block = self.free_list_head - while curr_block is not None: + if self.fake_free_list_head.next_free_block is None: + raise RuntimeError( + "next_free_block of fake_free_list_head should always exist") + # Start from the first block + curr_block: KVCacheBlock = self.fake_free_list_head.next_free_block + # As long as next_free_block is available, we haven't reached to + # the fake tail yet. + while curr_block.next_free_block is not None: ret.append(curr_block) curr_block = curr_block.next_free_block return ret