[V1] Simplify prefix caching logic by removing num_evictable_computed_blocks (#11310)

2025-12-16 10:06:18 +08:00 · 2024-12-18 20:17:12 -08:00 · 2024-12-18 20:17:12 -08:00 · c6b0a7d3ba
commit c6b0a7d3ba
parent a30482f054
1 changed files with 2 additions and 11 deletions
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@ -201,23 +201,15 @@ class KVCacheManager:
                f"num_tokens must be greater than 0, got {num_tokens}")

        # Touch the computed blocks to make sure they won't be evicted.
-        num_evictable_computed_blocks = 0
        if self.enable_caching:
            self._touch(computed_blocks)
-
-            # If a computed block of a request is an eviction candidate (in the
-            # free queue and ref_cnt == 0), it cannot be counted as a free block
-            # when allocating this request.
-            num_evictable_computed_blocks = len(
-                [blk for blk in computed_blocks if blk.ref_cnt == 0])
        else:
            assert not computed_blocks, (
                "Computed blocks should be empty when "
                "prefix caching is disabled")

        num_required_blocks = cdiv(num_tokens, self.block_size)
-        if (num_required_blocks > self.free_block_queue.num_free_blocks -
-                num_evictable_computed_blocks):
+        if (num_required_blocks > self.free_block_queue.num_free_blocks):
            # Cannot allocate new blocks.
            return None

@ -225,8 +217,7 @@ class KVCacheManager:
        # preallocated blocks.
        num_new_blocks = min(
            num_required_blocks + self.num_preallocate_blocks,
-            self.free_block_queue.num_free_blocks -
-            num_evictable_computed_blocks,
+            self.free_block_queue.num_free_blocks,
            # Should not exceed the maximum number of blocks per request.
            # This is especially because the block table has the shape
            # [..., max_num_blocks_per_req].