From 0f199f197b4e7a835ccc5b4d15363f8faa7824c8 Mon Sep 17 00:00:00 2001
From: JialinOuyang-Meta <jialino@meta.com>
Date: Fri, 18 Jul 2025 12:34:40 -0700
Subject: [PATCH] [Core] Avoid KVCacheBlock.__eq__ invocations in
 FreeKVCacheBlockQueue (#21005)

Signed-off-by: Jialin Ouyang <jialino@meta.com>
---
 benchmarks/kv_cache/benchmark_block_pool.py | 108 ++++++++++++++++++++
 tests/v1/core/test_kv_cache_utils.py        |  28 ++---
 tests/v1/core/test_prefix_caching.py        |  26 ++---
 vllm/v1/core/kv_cache_utils.py              | 104 +++++++++++++------
 4 files changed, 209 insertions(+), 57 deletions(-)
 create mode 100644 benchmarks/kv_cache/benchmark_block_pool.py

diff --git a/benchmarks/kv_cache/benchmark_block_pool.py b/benchmarks/kv_cache/benchmark_block_pool.py
new file mode 100644
index 0000000000000..134551bb61285
--- /dev/null
+++ b/benchmarks/kv_cache/benchmark_block_pool.py
@@ -0,0 +1,108 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import gc
+import time
+from typing import Optional
+
+from tabulate import tabulate
+
+from vllm.utils import FlexibleArgumentParser
+from vllm.v1.core.block_pool import BlockPool
+
+
+class Metric:
+    def __init__(self) -> None:
+        self.cnt: int = 0
+        self.sum_v: int = 0
+        self.max_v: Optional[int] = None
+
+    def update(self, v: int) -> None:
+        self.cnt += 1
+        self.sum_v += v
+        if self.max_v is None:
+            self.max_v = v
+        else:
+            self.max_v = max(self.max_v, v)
+
+    def avg_v(self) -> float:
+        return self.sum_v * 1.0 / self.cnt
+
+
+def main(args):
+    rows = []
+    for allocate_block in args.allocate_blocks:
+        # Enforce a GC collect ahead to minimize the impact among runs
+        gc.collect()
+        block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True)
+
+        get_blocks_metric: Metric = Metric()
+        free_blocks_metric: Metric = Metric()
+        for _ in range(args.num_iteration):
+            t1 = time.monotonic_ns()
+            blocks = block_pool.get_new_blocks(allocate_block)
+            t2 = time.monotonic_ns()
+            block_pool.free_blocks(blocks)
+            t3 = time.monotonic_ns()
+            get_blocks_metric.update(t2 - t1)
+            free_blocks_metric.update(t3 - t2)
+
+        if get_blocks_metric.max_v is not None and free_blocks_metric.max_v is not None:
+            rows.append(
+                [
+                    get_blocks_metric.cnt,
+                    args.num_gpu_blocks,
+                    allocate_block,
+                    get_blocks_metric.avg_v() / 1000000,
+                    get_blocks_metric.max_v / 1000000.0,
+                    free_blocks_metric.avg_v() / 1000000,
+                    free_blocks_metric.max_v / 1000000.0,
+                ]
+            )
+        else:
+            print(
+                "No valid metrics found."
+                f" {get_blocks_metric.max_v=} {free_blocks_metric.max_v=}"
+            )
+
+    print(
+        tabulate(
+            rows,
+            headers=[
+                "Iterations",
+                "Total\nBlocks",
+                "Allocated\nBlocks",
+                "Get Blocks\nAvg (ms)",
+                "Get Blocks\nMax (ms)",
+                "Free Blocks\nAvg (ms)",
+                "Free Blocks\nMax (ms)",
+            ],
+            tablefmt="grid",
+            floatfmt=".6f",
+        )
+    )
+
+
+def invoke_main() -> None:
+    parser = FlexibleArgumentParser(
+        description="Benchmark the performance of BlockPool for KV Cache."
+    )
+    parser.add_argument("--num-gpu-blocks", type=int, default=100000)
+    parser.add_argument(
+        "--num-iteration",
+        type=int,
+        default=1000,
+        help="Number of iterations to run to stablize final data readings",
+    )
+    parser.add_argument(
+        "--allocate-blocks",
+        type=int,
+        nargs="*",
+        default=[10, 50, 100, 500, 1000],
+        help="Number of blocks to allocate",
+    )
+    args = parser.parse_args()
+    main(args)
+
+
+if __name__ == "__main__":
+    invoke_main()  # pragma: no cover
diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
index 0676cb3eb65d0..68b0601569012 100644
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -132,8 +132,8 @@ def test_free_kv_cache_block_queue_initialization():
     block = KVCacheBlock(block_id=0)
     queue = FreeKVCacheBlockQueue([block])
     assert queue.num_free_blocks == 1
-    assert queue.free_list_head == block
-    assert queue.free_list_tail == block
+    assert queue.fake_free_list_head.next_free_block is block
+    assert queue.fake_free_list_tail.prev_free_block is block
 
 
 def test_free_kv_cache_block_queue_operations():
@@ -145,36 +145,38 @@ def test_free_kv_cache_block_queue_operations():
 
     # Check initial state
     assert queue.num_free_blocks == 5
-    assert queue.free_list_head == blocks[0]
-    assert queue.free_list_tail == blocks[4]
+    assert queue.fake_free_list_head.next_free_block is blocks[0]
+    assert queue.fake_free_list_tail.prev_free_block is blocks[4]
 
     # Pop the first block
     block1 = queue.popleft()
     assert block1 == blocks[0]
     assert queue.num_free_blocks == 4
-    assert queue.free_list_head == blocks[1]
-    assert queue.free_list_tail == blocks[4]
+    assert queue.fake_free_list_head.next_free_block is blocks[1]
+    assert queue.fake_free_list_tail.prev_free_block is blocks[4]
 
     # Remove a block from the middle
     block_to_remove = blocks[2]
     queue.remove(block_to_remove)
     assert queue.num_free_blocks == 3
-    assert blocks[1].next_free_block == blocks[3]
-    assert blocks[3].prev_free_block == blocks[1]
+    assert blocks[1].next_free_block is blocks[3]
+    assert blocks[3].prev_free_block is blocks[1]
 
     # Append a block back
     queue.append(block_to_remove)
     assert queue.num_free_blocks == 4
-    assert queue.free_list_tail == block_to_remove
-    assert block_to_remove.prev_free_block == blocks[4]
-    assert block_to_remove.next_free_block is None
+    assert queue.fake_free_list_tail.prev_free_block is block_to_remove
+    assert block_to_remove.prev_free_block is blocks[4]
+    assert block_to_remove.next_free_block is queue.fake_free_list_tail
 
     # Pop blocks until empty
     for _ in range(4):
         queue.popleft()
     assert queue.num_free_blocks == 0
-    assert queue.free_list_head is None
-    assert queue.free_list_tail is None
+    assert (queue.fake_free_list_head.next_free_block
+            is queue.fake_free_list_tail)
+    assert (queue.fake_free_list_tail.prev_free_block
+            is queue.fake_free_list_head)
 
     # Attempt to pop from an empty queue
     with pytest.raises(ValueError) as e:
diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index f31bdf74f4a67..b7f583de1f631 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -155,13 +155,14 @@ def test_prefill(hash_algo):
         assert block.ref_cnt == 2
 
     # At this point, we should have 5 free blocks left.
-    assert manager.block_pool.free_block_queue.num_free_blocks == 5
+    free_block_queue = manager.block_pool.free_block_queue
+    assert free_block_queue.num_free_blocks == 5
 
     manager.free(req0)
     manager.free(req1)
 
     # All blocks should be available.
-    assert manager.block_pool.free_block_queue.num_free_blocks == 10
+    assert free_block_queue.num_free_blocks == 10
     # The order should be
     # [unallocated (6, 7, 8, 9, 10)]
     # [unique_req0 (4)]
@@ -188,14 +189,10 @@ def test_prefill(hash_algo):
 
     # Although we only have 6 free blocks, we have 8 blocks in
     # the free block queue due to lazy removal.
-    assert manager.block_pool.free_block_queue.num_free_blocks == 6
-    assert all([
-        b.ref_cnt == 0
-        for b in manager.block_pool.free_block_queue.get_all_free_blocks()
-    ])
-    assert len([
-        b for b in manager.block_pool.free_block_queue.get_all_free_blocks()
-    ]) == 6
+    assert free_block_queue.num_free_blocks == 6
+    assert all(
+        [b.ref_cnt == 0 for b in free_block_queue.get_all_free_blocks()])
+    assert len([b for b in free_block_queue.get_all_free_blocks()]) == 6
 
     manager.free(req2)
 
@@ -209,9 +206,12 @@ def test_prefill(hash_algo):
                                     computed_blocks)
     # This block ID order also checks the eviction order.
     assert blocks.get_block_ids() == ([7, 8, 9, 10, 4, 5, 6, 3, 2, 1], )
-    assert manager.block_pool.free_block_queue.num_free_blocks == 0
-    assert manager.block_pool.free_block_queue.free_list_head is None
-    assert manager.block_pool.free_block_queue.free_list_tail is None
+
+    assert free_block_queue.num_free_blocks == 0
+    assert (free_block_queue.fake_free_list_head.next_free_block
+            is free_block_queue.fake_free_list_tail)
+    assert (free_block_queue.fake_free_list_tail.prev_free_block
+            is free_block_queue.fake_free_list_head)
 
 
 def test_prefill_hybrid_model():
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 6067a127e97fa..b1fab0d34de4c 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -212,27 +212,65 @@ class FreeKVCacheBlockQueue:
     def __init__(self, blocks: list[KVCacheBlock]) -> None:
         self.num_free_blocks = len(blocks)
 
-        # Initialize the doubly linked list of free blocks.
-        self.free_list_head: Optional[KVCacheBlock] = blocks[0]
-        self.free_list_tail: Optional[KVCacheBlock] = blocks[-1]
+        # Initialize doubly links of consecutive blocks
         for i in range(self.num_free_blocks):
             if i > 0:
                 blocks[i].prev_free_block = blocks[i - 1]
             if i < self.num_free_blocks - 1:
                 blocks[i].next_free_block = blocks[i + 1]
 
+        # Create a fake head and a tail block for the doubly linked list to
+        # reduce branching in the code
+        #
+        # The implementation garenteed that the fake head and tail
+        # are NEVER got popped, so we could safely assume each real blocks
+        # in the queue has prev and next blocks.
+        self.fake_free_list_head = KVCacheBlock(block_id=-1)
+        self.fake_free_list_tail = KVCacheBlock(block_id=-1)
+        if self.num_free_blocks > 0:
+            # Connect fake_head and fake_tail to the first and last block
+            # respectively.
+            self.fake_free_list_head.next_free_block = blocks[0]
+            blocks[0].prev_free_block = self.fake_free_list_head
+            self.fake_free_list_tail.prev_free_block = blocks[-1]
+            blocks[-1].next_free_block = self.fake_free_list_tail
+        else:
+            # For empty list, simply connect the fake head and tail.
+            self.fake_free_list_head.next_free_block = self.fake_free_list_tail
+            self.fake_free_list_tail.prev_free_block = self.fake_free_list_head
+
     def popleft(self) -> KVCacheBlock:
         """Pop the first free block and reduce num_free_blocks by 1.
 
         Returns:
             The first free block.
         """
-        if not self.free_list_head:
+        if (self.fake_free_list_head.next_free_block
+                is self.fake_free_list_tail
+                or self.fake_free_list_head.next_free_block is None):
+            assert self.num_free_blocks == 0, (
+                f"num_free_blocks ({self.num_free_blocks}) is out of sync "
+                "with the free list.")
             raise ValueError("No free blocks available")
 
-        block = self.free_list_head
-        self.remove(block)
-        return block
+        first_block: KVCacheBlock = self.fake_free_list_head.next_free_block
+
+        if first_block.next_free_block is None:
+            # This should not happen if the block is from the free list.
+            # It indicates a bug in the caller's logic.
+            raise RuntimeError("Invalid block found in popleft() "
+                               "which doesn't have a valid next_free_block")
+
+        # Connect fake_head and the next block of first_block (i.e. second block
+        # or fake tail).
+        self.fake_free_list_head.next_free_block = first_block.next_free_block
+        first_block.next_free_block.prev_free_block = self.fake_free_list_head
+
+        # Remove the block from the linked list.
+        first_block.prev_free_block = first_block.next_free_block = None
+
+        self.num_free_blocks -= 1
+        return first_block
 
     def remove(self, block: KVCacheBlock) -> None:
         """Remove a block in the free list and reduce num_free_blocks by 1.
@@ -240,19 +278,15 @@ class FreeKVCacheBlockQueue:
         Args:
             block: The block to remove.
         """
-        if block.prev_free_block is not None:
-            # Link the previous block to the next block.
-            block.prev_free_block.next_free_block = block.next_free_block
-        if block.next_free_block is not None:
-            # Link the next block to the previous block.
-            block.next_free_block.prev_free_block = block.prev_free_block
+        if block.prev_free_block is None or block.next_free_block is None:
+            # This should not happen if the block is from the free list.
+            # It indicates a bug in the caller's logic.
+            raise RuntimeError(f"remove() called on an invalid block: {block}")
 
-        if block == self.free_list_head:
-            # Update the head if the block is the head.
-            self.free_list_head = block.next_free_block
-        if block == self.free_list_tail:
-            # Update the tail if the block is the tail.
-            self.free_list_tail = block.prev_free_block
+        # Link the previous block to the next block.
+        block.prev_free_block.next_free_block = block.next_free_block
+        # Link the next block to the previous block.
+        block.next_free_block.prev_free_block = block.prev_free_block
 
         # Remove the block from the linked list.
         block.prev_free_block = block.next_free_block = None
@@ -265,17 +299,19 @@ class FreeKVCacheBlockQueue:
         Args:
             block: The block to append.
         """
-        if self.free_list_tail is not None:
-            # Link the last block to the new block.
-            self.free_list_tail.next_free_block = block
-            block.prev_free_block = self.free_list_tail
-            self.free_list_tail = block
-        else:
-            # The free list is empty.
-            assert self.free_list_head is None
-            self.free_list_head = self.free_list_tail = block
+        if self.fake_free_list_tail.prev_free_block is None:
+            raise RuntimeError(
+                "prev_free_block of fake_free_list_tail should always exist")
+        last_block: KVCacheBlock = self.fake_free_list_tail.prev_free_block
+
+        # Connect the new block after the last block.
+        last_block.next_free_block = block
+        block.prev_free_block = last_block
+
+        # Connect the fake tail after the new block.
+        block.next_free_block = self.fake_free_list_tail
+        self.fake_free_list_tail.prev_free_block = block
 
-        block.next_free_block = None
         self.num_free_blocks += 1
 
     def get_all_free_blocks(self) -> list[KVCacheBlock]:
@@ -285,8 +321,14 @@ class FreeKVCacheBlockQueue:
             A list of free blocks.
         """
         ret = []
-        curr_block = self.free_list_head
-        while curr_block is not None:
+        if self.fake_free_list_head.next_free_block is None:
+            raise RuntimeError(
+                "next_free_block of fake_free_list_head should always exist")
+        # Start from the first block
+        curr_block: KVCacheBlock = self.fake_free_list_head.next_free_block
+        # As long as next_free_block is available, we haven't reached to
+        # the fake tail yet.
+        while curr_block.next_free_block is not None:
             ret.append(curr_block)
             curr_block = curr_block.next_free_block
         return ret