[Core] Avoid KVCacheBlock.__eq__ invocations in FreeKVCacheBlockQueue (#21005)

Signed-off-by: Jialin Ouyang <jialino@meta.com>
This commit is contained in:
JialinOuyang-Meta 2025-07-18 12:34:40 -07:00 committed by GitHub
parent b2eb2b5ad7
commit 0f199f197b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 209 additions and 57 deletions

View File

@ -0,0 +1,108 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
import time
from typing import Optional
from tabulate import tabulate
from vllm.utils import FlexibleArgumentParser
from vllm.v1.core.block_pool import BlockPool
class Metric:
def __init__(self) -> None:
self.cnt: int = 0
self.sum_v: int = 0
self.max_v: Optional[int] = None
def update(self, v: int) -> None:
self.cnt += 1
self.sum_v += v
if self.max_v is None:
self.max_v = v
else:
self.max_v = max(self.max_v, v)
def avg_v(self) -> float:
return self.sum_v * 1.0 / self.cnt
def main(args):
rows = []
for allocate_block in args.allocate_blocks:
# Enforce a GC collect ahead to minimize the impact among runs
gc.collect()
block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True)
get_blocks_metric: Metric = Metric()
free_blocks_metric: Metric = Metric()
for _ in range(args.num_iteration):
t1 = time.monotonic_ns()
blocks = block_pool.get_new_blocks(allocate_block)
t2 = time.monotonic_ns()
block_pool.free_blocks(blocks)
t3 = time.monotonic_ns()
get_blocks_metric.update(t2 - t1)
free_blocks_metric.update(t3 - t2)
if get_blocks_metric.max_v is not None and free_blocks_metric.max_v is not None:
rows.append(
[
get_blocks_metric.cnt,
args.num_gpu_blocks,
allocate_block,
get_blocks_metric.avg_v() / 1000000,
get_blocks_metric.max_v / 1000000.0,
free_blocks_metric.avg_v() / 1000000,
free_blocks_metric.max_v / 1000000.0,
]
)
else:
print(
"No valid metrics found."
f" {get_blocks_metric.max_v=} {free_blocks_metric.max_v=}"
)
print(
tabulate(
rows,
headers=[
"Iterations",
"Total\nBlocks",
"Allocated\nBlocks",
"Get Blocks\nAvg (ms)",
"Get Blocks\nMax (ms)",
"Free Blocks\nAvg (ms)",
"Free Blocks\nMax (ms)",
],
tablefmt="grid",
floatfmt=".6f",
)
)
def invoke_main() -> None:
parser = FlexibleArgumentParser(
description="Benchmark the performance of BlockPool for KV Cache."
)
parser.add_argument("--num-gpu-blocks", type=int, default=100000)
parser.add_argument(
"--num-iteration",
type=int,
default=1000,
help="Number of iterations to run to stablize final data readings",
)
parser.add_argument(
"--allocate-blocks",
type=int,
nargs="*",
default=[10, 50, 100, 500, 1000],
help="Number of blocks to allocate",
)
args = parser.parse_args()
main(args)
if __name__ == "__main__":
invoke_main() # pragma: no cover

View File

@ -132,8 +132,8 @@ def test_free_kv_cache_block_queue_initialization():
block = KVCacheBlock(block_id=0)
queue = FreeKVCacheBlockQueue([block])
assert queue.num_free_blocks == 1
assert queue.free_list_head == block
assert queue.free_list_tail == block
assert queue.fake_free_list_head.next_free_block is block
assert queue.fake_free_list_tail.prev_free_block is block
def test_free_kv_cache_block_queue_operations():
@ -145,36 +145,38 @@ def test_free_kv_cache_block_queue_operations():
# Check initial state
assert queue.num_free_blocks == 5
assert queue.free_list_head == blocks[0]
assert queue.free_list_tail == blocks[4]
assert queue.fake_free_list_head.next_free_block is blocks[0]
assert queue.fake_free_list_tail.prev_free_block is blocks[4]
# Pop the first block
block1 = queue.popleft()
assert block1 == blocks[0]
assert queue.num_free_blocks == 4
assert queue.free_list_head == blocks[1]
assert queue.free_list_tail == blocks[4]
assert queue.fake_free_list_head.next_free_block is blocks[1]
assert queue.fake_free_list_tail.prev_free_block is blocks[4]
# Remove a block from the middle
block_to_remove = blocks[2]
queue.remove(block_to_remove)
assert queue.num_free_blocks == 3
assert blocks[1].next_free_block == blocks[3]
assert blocks[3].prev_free_block == blocks[1]
assert blocks[1].next_free_block is blocks[3]
assert blocks[3].prev_free_block is blocks[1]
# Append a block back
queue.append(block_to_remove)
assert queue.num_free_blocks == 4
assert queue.free_list_tail == block_to_remove
assert block_to_remove.prev_free_block == blocks[4]
assert block_to_remove.next_free_block is None
assert queue.fake_free_list_tail.prev_free_block is block_to_remove
assert block_to_remove.prev_free_block is blocks[4]
assert block_to_remove.next_free_block is queue.fake_free_list_tail
# Pop blocks until empty
for _ in range(4):
queue.popleft()
assert queue.num_free_blocks == 0
assert queue.free_list_head is None
assert queue.free_list_tail is None
assert (queue.fake_free_list_head.next_free_block
is queue.fake_free_list_tail)
assert (queue.fake_free_list_tail.prev_free_block
is queue.fake_free_list_head)
# Attempt to pop from an empty queue
with pytest.raises(ValueError) as e:

View File

@ -155,13 +155,14 @@ def test_prefill(hash_algo):
assert block.ref_cnt == 2
# At this point, we should have 5 free blocks left.
assert manager.block_pool.free_block_queue.num_free_blocks == 5
free_block_queue = manager.block_pool.free_block_queue
assert free_block_queue.num_free_blocks == 5
manager.free(req0)
manager.free(req1)
# All blocks should be available.
assert manager.block_pool.free_block_queue.num_free_blocks == 10
assert free_block_queue.num_free_blocks == 10
# The order should be
# [unallocated (6, 7, 8, 9, 10)]
# [unique_req0 (4)]
@ -188,14 +189,10 @@ def test_prefill(hash_algo):
# Although we only have 6 free blocks, we have 8 blocks in
# the free block queue due to lazy removal.
assert manager.block_pool.free_block_queue.num_free_blocks == 6
assert all([
b.ref_cnt == 0
for b in manager.block_pool.free_block_queue.get_all_free_blocks()
])
assert len([
b for b in manager.block_pool.free_block_queue.get_all_free_blocks()
]) == 6
assert free_block_queue.num_free_blocks == 6
assert all(
[b.ref_cnt == 0 for b in free_block_queue.get_all_free_blocks()])
assert len([b for b in free_block_queue.get_all_free_blocks()]) == 6
manager.free(req2)
@ -209,9 +206,12 @@ def test_prefill(hash_algo):
computed_blocks)
# This block ID order also checks the eviction order.
assert blocks.get_block_ids() == ([7, 8, 9, 10, 4, 5, 6, 3, 2, 1], )
assert manager.block_pool.free_block_queue.num_free_blocks == 0
assert manager.block_pool.free_block_queue.free_list_head is None
assert manager.block_pool.free_block_queue.free_list_tail is None
assert free_block_queue.num_free_blocks == 0
assert (free_block_queue.fake_free_list_head.next_free_block
is free_block_queue.fake_free_list_tail)
assert (free_block_queue.fake_free_list_tail.prev_free_block
is free_block_queue.fake_free_list_head)
def test_prefill_hybrid_model():

View File

@ -212,27 +212,65 @@ class FreeKVCacheBlockQueue:
def __init__(self, blocks: list[KVCacheBlock]) -> None:
self.num_free_blocks = len(blocks)
# Initialize the doubly linked list of free blocks.
self.free_list_head: Optional[KVCacheBlock] = blocks[0]
self.free_list_tail: Optional[KVCacheBlock] = blocks[-1]
# Initialize doubly links of consecutive blocks
for i in range(self.num_free_blocks):
if i > 0:
blocks[i].prev_free_block = blocks[i - 1]
if i < self.num_free_blocks - 1:
blocks[i].next_free_block = blocks[i + 1]
# Create a fake head and a tail block for the doubly linked list to
# reduce branching in the code
#
# The implementation garenteed that the fake head and tail
# are NEVER got popped, so we could safely assume each real blocks
# in the queue has prev and next blocks.
self.fake_free_list_head = KVCacheBlock(block_id=-1)
self.fake_free_list_tail = KVCacheBlock(block_id=-1)
if self.num_free_blocks > 0:
# Connect fake_head and fake_tail to the first and last block
# respectively.
self.fake_free_list_head.next_free_block = blocks[0]
blocks[0].prev_free_block = self.fake_free_list_head
self.fake_free_list_tail.prev_free_block = blocks[-1]
blocks[-1].next_free_block = self.fake_free_list_tail
else:
# For empty list, simply connect the fake head and tail.
self.fake_free_list_head.next_free_block = self.fake_free_list_tail
self.fake_free_list_tail.prev_free_block = self.fake_free_list_head
def popleft(self) -> KVCacheBlock:
"""Pop the first free block and reduce num_free_blocks by 1.
Returns:
The first free block.
"""
if not self.free_list_head:
if (self.fake_free_list_head.next_free_block
is self.fake_free_list_tail
or self.fake_free_list_head.next_free_block is None):
assert self.num_free_blocks == 0, (
f"num_free_blocks ({self.num_free_blocks}) is out of sync "
"with the free list.")
raise ValueError("No free blocks available")
block = self.free_list_head
self.remove(block)
return block
first_block: KVCacheBlock = self.fake_free_list_head.next_free_block
if first_block.next_free_block is None:
# This should not happen if the block is from the free list.
# It indicates a bug in the caller's logic.
raise RuntimeError("Invalid block found in popleft() "
"which doesn't have a valid next_free_block")
# Connect fake_head and the next block of first_block (i.e. second block
# or fake tail).
self.fake_free_list_head.next_free_block = first_block.next_free_block
first_block.next_free_block.prev_free_block = self.fake_free_list_head
# Remove the block from the linked list.
first_block.prev_free_block = first_block.next_free_block = None
self.num_free_blocks -= 1
return first_block
def remove(self, block: KVCacheBlock) -> None:
"""Remove a block in the free list and reduce num_free_blocks by 1.
@ -240,19 +278,15 @@ class FreeKVCacheBlockQueue:
Args:
block: The block to remove.
"""
if block.prev_free_block is not None:
# Link the previous block to the next block.
block.prev_free_block.next_free_block = block.next_free_block
if block.next_free_block is not None:
# Link the next block to the previous block.
block.next_free_block.prev_free_block = block.prev_free_block
if block.prev_free_block is None or block.next_free_block is None:
# This should not happen if the block is from the free list.
# It indicates a bug in the caller's logic.
raise RuntimeError(f"remove() called on an invalid block: {block}")
if block == self.free_list_head:
# Update the head if the block is the head.
self.free_list_head = block.next_free_block
if block == self.free_list_tail:
# Update the tail if the block is the tail.
self.free_list_tail = block.prev_free_block
# Link the previous block to the next block.
block.prev_free_block.next_free_block = block.next_free_block
# Link the next block to the previous block.
block.next_free_block.prev_free_block = block.prev_free_block
# Remove the block from the linked list.
block.prev_free_block = block.next_free_block = None
@ -265,17 +299,19 @@ class FreeKVCacheBlockQueue:
Args:
block: The block to append.
"""
if self.free_list_tail is not None:
# Link the last block to the new block.
self.free_list_tail.next_free_block = block
block.prev_free_block = self.free_list_tail
self.free_list_tail = block
else:
# The free list is empty.
assert self.free_list_head is None
self.free_list_head = self.free_list_tail = block
if self.fake_free_list_tail.prev_free_block is None:
raise RuntimeError(
"prev_free_block of fake_free_list_tail should always exist")
last_block: KVCacheBlock = self.fake_free_list_tail.prev_free_block
# Connect the new block after the last block.
last_block.next_free_block = block
block.prev_free_block = last_block
# Connect the fake tail after the new block.
block.next_free_block = self.fake_free_list_tail
self.fake_free_list_tail.prev_free_block = block
block.next_free_block = None
self.num_free_blocks += 1
def get_all_free_blocks(self) -> list[KVCacheBlock]:
@ -285,8 +321,14 @@ class FreeKVCacheBlockQueue:
A list of free blocks.
"""
ret = []
curr_block = self.free_list_head
while curr_block is not None:
if self.fake_free_list_head.next_free_block is None:
raise RuntimeError(
"next_free_block of fake_free_list_head should always exist")
# Start from the first block
curr_block: KVCacheBlock = self.fake_free_list_head.next_free_block
# As long as next_free_block is available, we haven't reached to
# the fake tail yet.
while curr_block.next_free_block is not None:
ret.append(curr_block)
curr_block = curr_block.next_free_block
return ret