[Core][Bugfix]: fix prefix caching for blockv2 (#4764)

Co-authored-by: Lei Wen <wenlei03@qiyi.com>
2025-12-18 06:45:01 +08:00 · 2024-05-25 01:07:09 +08:00 · 2024-05-25 01:07:09 +08:00 · e64fde4b01
commit e64fde4b01
parent 919770957f
2 changed files with 141 additions and 17 deletions
--- a/tests/core/block/test_prefix_caching_block.py
+++ b/tests/core/block/test_prefix_caching_block.py
@ -410,6 +410,123 @@ class TestPrefixCachingBlockAllocator:
        assert (len(res) == zero_point_blocks)
    # Test case that assume those prompted block after first immutable would
    # be freed into hashless allocator, while first immutable block get ref
    # increased.
    @staticmethod
    @pytest.mark.parametrize("num_blocks", [3])
    @pytest.mark.parametrize("block_size", [16])
    @pytest.mark.parametrize("seed", list(range(10)))
    def test_alloc_promotion(num_blocks: int, block_size: int, seed: int):
        random.seed(seed)
        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                block_size=block_size)
        token_ids = list(range(block_size))
        block = allocator.allocate_immutable(prev_block=None,
                                             token_ids=token_ids)
        assert allocator._refcounter.get(block.block_id) == 1
        m = allocator.allocate_mutable(prev_block=None)
        block_id = m.block_id
        for i in range(block_size):
            m.append_token_ids([i])
        # After block get promoted to immutable from mutable, if there is
        # already same content hash block, then it shall be released into
        # hashless_allocator
        # And first immutable block's ref get increased by 1
        assert m.block_id == block.block_id
        assert block_id in allocator._hashless_allocator._free_block_indices
        assert allocator._refcounter.get(block.block_id) == 2
    # Test case when eviction and allocation are mixed,
    # make sure they work as expected
    @staticmethod
    @pytest.mark.parametrize("num_blocks", [3])
    @pytest.mark.parametrize("block_size", [16])
    @pytest.mark.parametrize("seed", list(range(10)))
    def test_eviction_alloc_mixed(num_blocks: int, block_size: int, seed: int):
        random.seed(seed)
        all_blocks_list = [i for i in range(num_blocks)]
        zero_ref = {i: 0 for i in range(num_blocks)}
        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                block_size=block_size)
        token_ids = list(range(num_blocks * block_size))
        # now we have num_blocks free blocks in hashless allocator
        # with internal tracking list _blocks _cached_blocks and evictor
        # empty and block's ref shall be 0
        assert list(allocator._hashless_allocator._free_block_indices
                    ) == all_blocks_list
        assert len(allocator._blocks.keys()) == 0
        assert len(allocator._cached_blocks.values()) == 0
        assert len(allocator.evictor.free_table.keys()) == 0
        assert allocator._refcounter._refcounts == zero_ref
        # Allocate immutable chains with only one block residuled in
        new_block = []
        for i in range(num_blocks):
            block = allocator.allocate_immutable(
                prev_block=None,
                token_ids=token_ids[block_size * i:block_size * (i + 1)])
            new_block.append(block)
        # Free all blocks, and now all blocks shall be in the evictor
        # there shall be no tracking data left in _blocks
        # all blocks shall be tracked in _cached_blocks
        # all blocks' ref shall be zero
        for block in new_block:
            allocator.free(block)
        assert len(allocator._blocks.keys()) == 0
        assert len(allocator._hashless_allocator._free_block_indices) == 0
        assert list(allocator._cached_blocks.values()) == all_blocks_list
        assert list(allocator.evictor.free_table.keys()) == all_blocks_list
        assert allocator._refcounter._refcounts == zero_ref
        # Allocate a mutable block, and the first block shall be evicted
        # and set its content hash into None, ref to 1
        mutable = allocator.allocate_mutable(prev_block=None)
        assert mutable.block_id == 0
        assert mutable.content_hash is None
        assert 0 in allocator._blocks
        assert allocator._refcounter.get(0) == 1
        assert 0 not in allocator._cached_blocks
        assert 0 not in allocator.evictor
        # Since this mutable block has no hash yet, it shall be released into
        # hashless allocator
        allocator.free(mutable)
        assert len(allocator._blocks.keys()) == 0
        assert allocator._refcounter._refcounts == zero_ref
        assert 0 not in allocator._cached_blocks
        assert 0 not in allocator.evictor
        assert 0 in allocator._hashless_allocator._free_block_indices
        # when allocate immutable with first block_size tokens, we
        # shall get free block from hashless allocator, thus no block left
        # in hashless
        block = allocator.allocate_immutable(prev_block=None,
                                             token_ids=token_ids[:block_size])
        assert block.block_id == 0
        assert len(allocator._hashless_allocator._free_block_indices) == 0
        assert 0 in allocator._blocks
        assert 0 in allocator._cached_blocks.values()
        assert allocator._refcounter.get(0) == 1
        assert 0 not in allocator.evictor
        # allocate mutable block again, it shall be popped from evictor
        mutable = allocator.allocate_mutable(prev_block=None)
        assert len(allocator._hashless_allocator._free_block_indices) == 0
        assert mutable.block_id not in allocator.evictor.free_table
        assert allocator._refcounter.get(mutable.block_id) == 1
    # Test case where two last accessed times are equal
    @staticmethod
    @pytest.mark.parametrize("num_blocks", [1024])
--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
@ -160,22 +160,18 @@ class PrefixCachingBlockAllocator(BlockAllocator):
        # If the evictor has blocks available for eviction, evict a block
        # and return it.
        if self.evictor.num_blocks > 0:
            # here we get an evicted block, which is only added
            # into evictor if its ref counter is 0
            # and since its content would be changed, we need
            # to remove it from _cached_blocks's tracking list
            block_id, content_hash_to_evict = self.evictor.evict()
            # Here we may have scenario that several blocks have
            # the same content hash, but due to the latter coming block
            # is coming from mutable to immutable path, their physical
            # block is added into evictor.
            # However in this case, we shall not pop the _cached_blocks,
            # as the same content is still used by others, which means
            # we need to check ref before decide to pop the list.
            _block_id = self._cached_blocks[content_hash_to_evict]
-            refcount = self._refcounter.get(_block_id)
+            assert self._refcounter.get(_block_id) == 0
            if refcount == 1:
                self._cached_blocks.pop(content_hash_to_evict)
            assert _block_id == block_id
            self._cached_blocks.pop(content_hash_to_evict)
            self._refcounter.incr(block_id)
            # the block comes from evictor already contain computed result
@ -199,7 +195,11 @@ class PrefixCachingBlockAllocator(BlockAllocator):
    def _incr_refcount_cached_block(self, block: Block,
                                    block_id: BlockId) -> None:
-        # since block is already computed, mark it
+        # now _incr_refcount_cached_block comes from two place
        # allocate_immutable/promote_to_immutable_block where hit
        # _cached_blocks hash key.
        # In both cases, it means that already exists a already
        # computed block which shared with block now
        block.computed = True
        refcount = self._refcounter.incr(block_id)
@ -228,13 +228,19 @@ class PrefixCachingBlockAllocator(BlockAllocator):
                                 block: Block) -> None:
        assert isinstance(block, PrefixCachingBlock)
-        if block.content_hash is None:
+        # if we comes from promote_to_immutable_block, it means that
        # block.content_hash is never None.
        # However we need to release the same content block, so that
        # physical block could get reused.
        if block.block_id != block_id or block.content_hash is None:
            refcount = self._refcounter.get(block_id)
            # We have fork case where block would get more than one ref,
            # so we cannot free it from tracking if ref cnt large than 1
            if refcount <= 1:
            assert block.block_id is not None
            refcount = self._refcounter.get(block.block_id)
            if refcount == 1:
                del self._blocks[block.block_id]
            return self._hashless_allocator.free(block)
        refcount = self._refcounter.decr(block_id)
@ -317,7 +323,8 @@ class PrefixCachingBlockAllocator(BlockAllocator):
        if block.content_hash not in self._cached_blocks:
            self._cached_blocks[block.content_hash] = block.block_id
        else:
-            self._free_block_id_for_block(block.block_id, block)
+            self._free_block_id_for_block(
                self._cached_blocks[block.content_hash], block)
            self._incr_refcount_cached_block(
                block, self._cached_blocks[block.content_hash])