mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-18 06:45:01 +08:00
[Core][Bugfix]: fix prefix caching for blockv2 (#4764)
Co-authored-by: Lei Wen <wenlei03@qiyi.com>
This commit is contained in:
parent
919770957f
commit
e64fde4b01
@ -410,6 +410,123 @@ class TestPrefixCachingBlockAllocator:
|
|||||||
|
|
||||||
assert (len(res) == zero_point_blocks)
|
assert (len(res) == zero_point_blocks)
|
||||||
|
|
||||||
|
# Promotion scenario: a mutable block that fills up with the same tokens as
# an already-cached immutable block is expected to give its own physical
# block back to the hashless allocator, while the first immutable block
# gets its ref count increased.
@staticmethod
@pytest.mark.parametrize("num_blocks", [3])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(10)))
def test_alloc_promotion(num_blocks: int, block_size: int, seed: int):
    """Fill a mutable block with tokens matching an existing immutable one.

    Asserts that, once full, the mutable block reports the immutable
    block's id, that the id it held before is back in the hashless
    allocator's free indices, and that the shared block's ref count is 2.
    """
    random.seed(seed)

    allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                            block_size=block_size)
    token_ids = list(range(block_size))

    cached_block = allocator.allocate_immutable(prev_block=None,
                                                token_ids=token_ids)

    assert allocator._refcounter.get(cached_block.block_id) == 1
    mutable_block = allocator.allocate_mutable(prev_block=None)

    # Remember the physical block the mutable block started with; the
    # assertions below expect it to be returned to the free pool.
    original_block_id = mutable_block.block_id
    for token in token_ids:
        mutable_block.append_token_ids([token])

    # The mutable block now carries the same content as cached_block, so
    # it is expected to point at cached_block's id ...
    assert mutable_block.block_id == cached_block.block_id
    # ... its former physical block is expected to be free again ...
    free_indices = allocator._hashless_allocator._free_block_indices
    assert original_block_id in free_indices
    # ... and the shared block is now referenced twice.
    assert allocator._refcounter.get(cached_block.block_id) == 2
|
||||||
|
|
||||||
|
# Mixed scenario: eviction and allocation are interleaved, and the
# allocator's bookkeeping is checked after every step.
@staticmethod
@pytest.mark.parametrize("num_blocks", [3])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(10)))
def test_eviction_alloc_mixed(num_blocks: int, block_size: int, seed: int):
    """Walk the allocator through allocate / free / evict / re-allocate.

    After each step the test inspects the hashless allocator's free
    indices, the _blocks and _cached_blocks tables, the evictor and the
    ref counts.
    """
    random.seed(seed)

    expected_ids = list(range(num_blocks))
    all_zero_refs = dict.fromkeys(range(num_blocks), 0)
    allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                            block_size=block_size)
    token_ids = list(range(num_blocks * block_size))

    # Fresh allocator: every block id is free in the hashless allocator,
    # the tracking tables and the evictor are empty, all ref counts are 0.
    assert list(
        allocator._hashless_allocator._free_block_indices) == expected_ids
    assert len(allocator._blocks) == 0
    assert len(allocator._cached_blocks) == 0
    assert len(allocator.evictor.free_table) == 0
    assert allocator._refcounter._refcounts == all_zero_refs

    # Allocate num_blocks immutable blocks, each with prev_block=None and
    # its own block_size-long slice of token_ids.
    immutable_blocks = [
        allocator.allocate_immutable(
            prev_block=None,
            token_ids=token_ids[start:start + block_size])
        for start in range(0, num_blocks * block_size, block_size)
    ]

    # Free all of them. Expected afterwards: nothing left in _blocks,
    # nothing free in the hashless allocator, every block id present in
    # _cached_blocks and in the evictor, and all ref counts back to 0.
    for freed in immutable_blocks:
        allocator.free(freed)

    assert len(allocator._blocks) == 0
    assert len(allocator._hashless_allocator._free_block_indices) == 0
    assert list(allocator._cached_blocks.values()) == expected_ids
    assert list(allocator.evictor.free_table) == expected_ids
    assert allocator._refcounter._refcounts == all_zero_refs

    # Allocating a mutable block is expected to evict block 0, leaving it
    # with no content hash and a ref count of 1.
    mutable = allocator.allocate_mutable(prev_block=None)

    assert mutable.block_id == 0
    assert mutable.content_hash is None
    assert 0 in allocator._blocks
    assert allocator._refcounter.get(0) == 1
    # NOTE(review): this membership test runs against the keys of
    # _cached_blocks, whereas other checks in this test compare block ids
    # against .values() -- confirm which one is intended.
    assert 0 not in allocator._cached_blocks
    assert 0 not in allocator.evictor

    # The mutable block has no hash yet, so freeing it is expected to
    # return block 0 to the hashless allocator rather than the evictor.
    allocator.free(mutable)

    assert len(allocator._blocks) == 0
    assert allocator._refcounter._refcounts == all_zero_refs
    assert 0 not in allocator._cached_blocks
    assert 0 not in allocator.evictor
    assert 0 in allocator._hashless_allocator._free_block_indices

    # An immutable allocation for the first block_size tokens is expected
    # to take that free block, leaving the hashless allocator empty again.
    first_block = allocator.allocate_immutable(
        prev_block=None, token_ids=token_ids[:block_size])

    assert first_block.block_id == 0
    assert len(allocator._hashless_allocator._free_block_indices) == 0
    assert 0 in allocator._blocks
    assert 0 in allocator._cached_blocks.values()
    assert allocator._refcounter.get(0) == 1
    assert 0 not in allocator.evictor

    # A second mutable allocation is expected to be served by the evictor.
    mutable = allocator.allocate_mutable(prev_block=None)
    assert len(allocator._hashless_allocator._free_block_indices) == 0
    assert mutable.block_id not in allocator.evictor.free_table
    assert allocator._refcounter.get(mutable.block_id) == 1
|
||||||
|
|
||||||
# Test case where two last accessed times are equal
|
# Test case where two last accessed times are equal
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@pytest.mark.parametrize("num_blocks", [1024])
|
@pytest.mark.parametrize("num_blocks", [1024])
|
||||||
|
|||||||
@ -160,22 +160,18 @@ class PrefixCachingBlockAllocator(BlockAllocator):
|
|||||||
# If the evictor has blocks available for eviction, evict a block
|
# If the evictor has blocks available for eviction, evict a block
|
||||||
# and return it.
|
# and return it.
|
||||||
if self.evictor.num_blocks > 0:
|
if self.evictor.num_blocks > 0:
|
||||||
|
# here we get an evicted block, which is only added
|
||||||
|
# into evictor if its ref counter is 0
|
||||||
|
# and since its content would be changed, we need
|
||||||
|
# to remove it from _cached_blocks's tracking list
|
||||||
block_id, content_hash_to_evict = self.evictor.evict()
|
block_id, content_hash_to_evict = self.evictor.evict()
|
||||||
|
|
||||||
# Here we may have scenario that several blocks have
|
|
||||||
# the same content hash, but due to the latter coming block
|
|
||||||
# is coming from mutable to immutable path, their physical
|
|
||||||
# block is added into evictor.
|
|
||||||
# However in this case, we shall not pop the _cached_blocks,
|
|
||||||
# as the same content is still used by others, which means
|
|
||||||
# we need to check ref before decide to pop the list.
|
|
||||||
|
|
||||||
_block_id = self._cached_blocks[content_hash_to_evict]
|
_block_id = self._cached_blocks[content_hash_to_evict]
|
||||||
refcount = self._refcounter.get(_block_id)
|
assert self._refcounter.get(_block_id) == 0
|
||||||
if refcount == 1:
|
|
||||||
self._cached_blocks.pop(content_hash_to_evict)
|
|
||||||
assert _block_id == block_id
|
assert _block_id == block_id
|
||||||
|
|
||||||
|
self._cached_blocks.pop(content_hash_to_evict)
|
||||||
|
|
||||||
self._refcounter.incr(block_id)
|
self._refcounter.incr(block_id)
|
||||||
|
|
||||||
# the block comes from evictor already contain computed result
|
# the block comes from evictor already contain computed result
|
||||||
@ -199,7 +195,11 @@ class PrefixCachingBlockAllocator(BlockAllocator):
|
|||||||
|
|
||||||
def _incr_refcount_cached_block(self, block: Block,
|
def _incr_refcount_cached_block(self, block: Block,
|
||||||
block_id: BlockId) -> None:
|
block_id: BlockId) -> None:
|
||||||
# since block is already computed, mark it
|
# _incr_refcount_cached_block is called from two places,
|
||||||
|
# allocate_immutable and promote_to_immutable_block, when the
|
||||||
|
# content hash is already present in _cached_blocks.
|
||||||
|
# In both cases an already-computed block with the same
|
||||||
|
# content exists and is now shared with this block.
|
||||||
block.computed = True
|
block.computed = True
|
||||||
|
|
||||||
refcount = self._refcounter.incr(block_id)
|
refcount = self._refcounter.incr(block_id)
|
||||||
@ -228,13 +228,19 @@ class PrefixCachingBlockAllocator(BlockAllocator):
|
|||||||
block: Block) -> None:
|
block: Block) -> None:
|
||||||
assert isinstance(block, PrefixCachingBlock)
|
assert isinstance(block, PrefixCachingBlock)
|
||||||
|
|
||||||
if block.content_hash is None:
|
# If we come from promote_to_immutable_block, it means that
|
||||||
|
# block.content_hash is never None.
|
||||||
|
# However we need to release the same content block, so that
|
||||||
|
# physical block could get reused.
|
||||||
|
if block.block_id != block_id or block.content_hash is None:
|
||||||
refcount = self._refcounter.get(block_id)
|
refcount = self._refcounter.get(block_id)
|
||||||
# We have fork case where block would get more than one ref,
|
# We have fork case where block would get more than one ref,
|
||||||
# so we cannot free it from tracking if ref cnt is larger than 1
|
# so we cannot free it from tracking if ref cnt is larger than 1
|
||||||
if refcount <= 1:
|
|
||||||
assert block.block_id is not None
|
assert block.block_id is not None
|
||||||
|
refcount = self._refcounter.get(block.block_id)
|
||||||
|
if refcount == 1:
|
||||||
del self._blocks[block.block_id]
|
del self._blocks[block.block_id]
|
||||||
|
|
||||||
return self._hashless_allocator.free(block)
|
return self._hashless_allocator.free(block)
|
||||||
|
|
||||||
refcount = self._refcounter.decr(block_id)
|
refcount = self._refcounter.decr(block_id)
|
||||||
@ -317,7 +323,8 @@ class PrefixCachingBlockAllocator(BlockAllocator):
|
|||||||
if block.content_hash not in self._cached_blocks:
|
if block.content_hash not in self._cached_blocks:
|
||||||
self._cached_blocks[block.content_hash] = block.block_id
|
self._cached_blocks[block.content_hash] = block.block_id
|
||||||
else:
|
else:
|
||||||
self._free_block_id_for_block(block.block_id, block)
|
self._free_block_id_for_block(
|
||||||
|
self._cached_blocks[block.content_hash], block)
|
||||||
self._incr_refcount_cached_block(
|
self._incr_refcount_cached_block(
|
||||||
block, self._cached_blocks[block.content_hash])
|
block, self._cached_blocks[block.content_hash])
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user