mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-11 03:45:02 +08:00
- **Add SPDX license headers to python source files**
- **Check for SPDX headers using pre-commit**
commit 9d7ef44c3cfb72ca4c32e1c677d99259d10d4745
Author: Russell Bryant <rbryant@redhat.com>
Date: Fri Jan 31 14:18:24 2025 -0500
Add SPDX license headers to python source files
This commit adds SPDX license headers to python source files as recommended to the project by the Linux Foundation. These headers provide a concise way that is both human and machine readable for communicating license information for each source file. It helps avoid any ambiguity about the license of the code and can also be easily used by tools to help manage license compliance.
The Linux Foundation runs license scans against the codebase to help ensure we are in compliance with the licenses of the code we use, including dependencies. Having these headers in place helps that tool do its job.
More information can be found on the SPDX site:
- https://spdx.dev/learn/handling-license-info/
Signed-off-by: Russell Bryant <rbryant@redhat.com>
commit 5a1cf1cb3b80759131c73f6a9dddebccac039dea
Author: Russell Bryant <rbryant@redhat.com>
Date: Fri Jan 31 14:36:32 2025 -0500
Check for SPDX headers using pre-commit
Signed-off-by: Russell Bryant <rbryant@redhat.com>
---------
Signed-off-by: Russell Bryant <rbryant@redhat.com>
662 lines
23 KiB
Python
662 lines
23 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
"""Compare KV cache manager behavior with and without prefix caching."""
|
|
import pytest
|
|
|
|
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
|
|
from vllm.sampling_params import SamplingParams
|
|
from vllm.utils import cdiv
|
|
from vllm.v1.core.kv_cache_manager import KVCacheManager, Request
|
|
from vllm.v1.core.kv_cache_utils import KVCacheBlock, hash_block_tokens
|
|
|
|
|
|
def make_request(request_id,
                 prompt_token_ids,
                 mm_positions=None,
                 mm_hashes=None):
    """Build a ``Request`` with the fixed settings shared by these tests.

    Args:
        request_id: ID forwarded to ``Request``.
        prompt_token_ids: Prompt token IDs forwarded to ``Request``.
        mm_positions: Optional multi-modal placeholder ranges. When given,
            one empty ``MultiModalKwargs`` per range is attached as the
            multi-modal inputs; when ``None`` the inputs are ``None`` too.
        mm_hashes: Optional multi-modal hashes forwarded to ``Request``.

    Returns:
        The constructed ``Request`` (no prompt text, ``max_tokens=17``,
        ``eos_token_id=100``, arrival time 0 and no LoRA request).
    """
    # The same empty kwargs object is repeated once per placeholder range.
    multi_modal_inputs = (None if mm_positions is None else
                          [MultiModalKwargs({})] * len(mm_positions))

    return Request(
        request_id=request_id,
        prompt=None,
        prompt_token_ids=prompt_token_ids,
        multi_modal_inputs=multi_modal_inputs,
        multi_modal_hashes=mm_hashes,
        multi_modal_placeholders=mm_positions,
        sampling_params=SamplingParams(max_tokens=17),
        eos_token_id=100,
        arrival_time=0,
        lora_request=None,
    )
|
|
|
|
|
|
def test_prefill():
    """Drive one manager through a miss, two prefix hits, and an eviction.

    Along the way this checks the block IDs returned by ``allocate_slots``,
    the hash / ref-count metadata of the touched blocks, the order of the
    free block queue after requests are freed, and the eviction order.
    """
    manager = KVCacheManager(
        block_size=16,
        num_gpu_blocks=10,
        max_model_len=8192,
        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=16,
    )

    # Three complete blocks (48 tokens) shared by req0, req1 and req2.
    shared_prefix = [token for token in range(3) for _ in range(16)]

    # req0: complete cache miss. The 7 extra tokens leave a partial block.
    req0_tokens = shared_prefix + [3] * 7
    req0 = make_request("0", req0_tokens)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
    assert len(req0.kv_block_hashes) == 3
    assert not computed_blocks
    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req0, 55, computed_blocks)
    assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]

    # Each full block's hash is derived from the previous block's hash
    # value plus its own 16 tokens.
    prev_hash = None
    for block_id in range(3):
        begin = block_id * 16
        expected_hash = hash_block_tokens(
            prev_hash, tuple(req0_tokens[begin:begin + 16]))
        assert manager.block_pool[block_id].block_hash == expected_hash
        assert manager.block_pool[block_id].ref_cnt == 1
        prev_hash = expected_hash.hash_value

    # The partial block and the preallocated block have no hash yet.
    for block_id in (3, 4):
        assert manager.block_pool[block_id].block_hash is None
        assert manager.block_pool[block_id].ref_cnt == 1

    # req1: hit on the shared prefix while req0 still holds those blocks.
    # 5 extra tokens leave a partial block.
    req1 = make_request("1", shared_prefix + [3] * 5)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
    assert len(req1.kv_block_hashes) == 3
    assert [b.block_id for b in computed_blocks] == [0, 1, 2]
    assert num_computed_tokens == 3 * 16
    num_new_tokens = 53 - 3 * 16
    blocks = manager.allocate_slots(req1, num_new_tokens, computed_blocks)
    assert [b.block_id for b in blocks] == [5, 6]
    # The shared blocks are now referenced by both requests.
    for shared_block in computed_blocks:
        assert shared_block.ref_cnt == 2

    # 7 of the 10 blocks are in use, so 3 remain free.
    assert manager.free_block_queue.num_free_blocks == 3

    manager.free(req0)
    manager.free(req1)

    # Everything is free again. Expected queue order:
    #   [unallocated (7, 8, 9)]
    #   [unique_req0 (4, 3)]
    #   [unique_req1 (6, 5)]
    #   [common (2, 1, 0)]
    assert manager.free_block_queue.num_free_blocks == 10
    free_ids = [
        b.block_id for b in manager.free_block_queue.get_all_free_blocks()
    ]
    assert free_ids == [7, 8, 9, 4, 3, 6, 5, 2, 1, 0]

    # req2: hit on the shared prefix after its blocks were already freed.
    # 6 extra tokens leave a partial block.
    req2 = make_request("2", shared_prefix + [3] * 6)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
    assert len(req2.kv_block_hashes) == 3
    assert [b.block_id for b in computed_blocks] == [0, 1, 2]
    assert num_computed_tokens == 3 * 16
    # NOTE(review): req2 has 54 prompt tokens but 53 is used here, exactly
    # as for req1 -- confirm this is intended.
    num_new_tokens = 53 - 3 * 16
    blocks = manager.allocate_slots(req2, num_new_tokens, computed_blocks)
    assert [b.block_id for b in blocks] == [7, 8]

    # Blocks 0-2 are in use again and 7, 8 were just handed out, leaving
    # 5 free blocks; every block still in the queue must be unreferenced.
    assert manager.free_block_queue.num_free_blocks == 5
    assert all(
        b.ref_cnt == 0
        for b in manager.free_block_queue.get_all_free_blocks())
    assert sum(
        1 for _ in manager.free_block_queue.get_all_free_blocks()) == 5

    manager.free(req2)

    # req3: nothing cached for these tokens, so all blocks get evicted.
    req3 = make_request("3", [99] * (16 * 9))
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3)
    assert not computed_blocks
    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req3, 16 * 9, computed_blocks)
    # The order of the returned IDs doubles as a check of eviction order.
    assert [b.block_id for b in blocks] == [9, 4, 3, 6, 5, 8, 7, 2, 1, 0]
    assert manager.free_block_queue.num_free_blocks == 0
    assert manager.free_block_queue.free_list_head is None
    assert manager.free_block_queue.free_list_tail is None
|
|
|
|
|
|
def test_decode():
    """Check block allocation while output tokens are appended to a request.

    After a 55-token prefill (block size 16, one preallocated block) the
    request is extended three times: staying inside the partial block,
    spilling into the preallocated block, and finally needing new blocks.
    """
    manager = KVCacheManager(
        block_size=16,
        num_gpu_blocks=10,
        max_model_len=8192,
        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=16,
    )

    # Three complete blocks (48 tokens) plus 7 tokens in a partial block.
    prompt_token_ids = [token for token in range(3)
                        for _ in range(16)] + [3] * 7

    # Prefill: complete cache miss.
    req0 = make_request("0", prompt_token_ids)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
    assert not computed_blocks
    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req0, 55, computed_blocks)
    assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]

    def extend(num_computed, token_id, num_tokens):
        """Mark tokens computed, append outputs, then allocate their slots."""
        req0.num_computed_tokens = num_computed
        for _ in range(num_tokens):
            req0.append_output_token_ids(token_id)
        return manager.allocate_slots(req0, num_tokens)

    # 55 -> 59 tokens: still inside the partial block, so nothing new is
    # allocated and that block (second to last) has no hash yet.
    new_blocks = extend(55, 8, 4)
    assert new_blocks is not None
    assert len(new_blocks) == 0
    assert manager.req_to_blocks[req0.request_id][-2].block_hash is None

    # 59 -> 74 tokens: 5 tokens complete the partial block and 10 tokens
    # land in the preallocated block, so still nothing new is allocated.
    new_blocks = extend(59, 7, 5 + 10)
    assert new_blocks is not None
    assert len(new_blocks) == 0
    assert manager.req_to_blocks[req0.request_id][-2].block_hash is not None

    # 74 -> 91 tokens: 6 tokens complete the preallocated block and the
    # remaining 11 need a new block, which comes with one preallocated
    # block on top.
    new_blocks = extend(74, 12, 6 + 11)
    assert new_blocks is not None
    assert len(new_blocks) == 2
|
|
|
|
|
|
def test_evict():
    """Check the free-queue order after freeing and which blocks get reused.

    Two requests fill all 10 blocks and are freed; a third request that
    shares the first two blocks with the first request must get those as
    computed blocks and take its new blocks from the head of the queue.
    """
    manager = KVCacheManager(
        block_size=16,
        num_gpu_blocks=10,
        max_model_len=8192,
        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=16,
    )

    req0_len = 5 * 16 + 7
    req1_len = 3 * 16

    req0 = make_request("0", list(range(req0_len)))
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
    assert not computed_blocks
    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req0, req0_len, computed_blocks)
    assert len(blocks) == 7  # 5 full + 1 partial + 1 preallocated

    # req1 continues the token sequence where req0 stopped (3 blocks).
    req1 = make_request("1", list(range(req0_len, req0_len + req1_len)))
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
    assert not computed_blocks
    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req1, req1_len, computed_blocks)
    assert len(blocks) == 3  # 3 full blocks

    # All 10 blocks are now in use.
    assert manager.free_block_queue.num_free_blocks == 0

    manager.free(req0)
    manager.free(req1)
    assert manager.free_block_queue.num_free_blocks == 10
    free_ids = [
        b.block_id for b in manager.free_block_queue.get_all_free_blocks()
    ]
    assert free_ids == [6, 5, 4, 3, 2, 1, 0, 9, 8, 7]

    # req2 shares its first 2 blocks with req0 and adds 3 more tokens.
    req2 = make_request("2", list(range(2 * 16 + 3)))
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
    assert [b.block_id for b in computed_blocks] == [0, 1]
    assert num_computed_tokens == 2 * 16
    blocks = manager.allocate_slots(req2, 3, computed_blocks)
    assert [b.block_id for b in blocks] == [6, 5]
    assert manager.free_block_queue.num_free_blocks == 6
|
|
|
|
|
|
def test_hash_block_correct_reuse():
    """
    A block that was cached and then freed must have its hash metadata
    reset when it is handed out again as a new, not-yet-full block.
    """
    block_size = 16
    manager = KVCacheManager(
        block_size=block_size,
        num_gpu_blocks=1,
        max_model_len=8192,
        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=0,
    )

    # Fill the only block completely so that it gets cached.
    num_tokens = block_size * 1
    full_req = make_request("0", list(range(num_tokens)))
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(
        full_req)
    assert not computed_blocks
    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(full_req, num_tokens, computed_blocks)
    assert len(blocks) == 1

    # Give the block back.
    manager.free(full_req)

    # Reuse it for a request that is one token short of a full block; the
    # hash recorded for the previous contents must be gone.
    partial_req = make_request("1", list(range(num_tokens - 1)))
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(
        partial_req)
    assert not computed_blocks
    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(partial_req, num_tokens - 1,
                                    computed_blocks)
    assert len(blocks) == 1

    reused_block = manager.block_pool[blocks[0].block_id]
    assert reused_block.block_hash is None
|
|
|
|
|
|
def test_computed_blocks_not_evicted():
    """
    When new blocks are needed for a request, its computed (cache-hit)
    blocks must not be evicted as long as another free block exists.
    """
    block_size = 16
    manager = KVCacheManager(
        block_size=block_size,
        num_gpu_blocks=2,
        max_model_len=8192,
        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=0,
    )
    num_tokens = block_size * 1

    # req0 fills (and thereby caches) block 0.
    req0 = make_request("0", list(range(num_tokens)))
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
    assert not computed_blocks
    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req0, num_tokens, computed_blocks)
    assert len(blocks) == 1
    assert blocks[0].block_id == 0

    # req1 uses different tokens and fills block 1.
    req1 = make_request("1", list(range(num_tokens, num_tokens * 2)))
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
    assert not computed_blocks
    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req1, num_tokens, computed_blocks)
    assert len(blocks) == 1
    assert blocks[0].block_id == 1

    # Release both blocks.
    manager.free(req0)
    manager.free(req1)

    # req2 hits the cache on block 0. Its second block must come from
    # evicting block 1, not from the block it just matched.
    req2 = make_request("2", list(range(num_tokens * 2)))
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
    assert len(computed_blocks) == 1
    assert computed_blocks[0].block_id == 0
    assert num_computed_tokens == block_size

    num_uncomputed_tokens = num_tokens * 2 - num_tokens
    blocks = manager.allocate_slots(req2, num_uncomputed_tokens,
                                    computed_blocks)
    assert len(blocks) == 1
    assert blocks[0].block_id == 1
|
|
|
|
|
|
def test_basic_prefix_caching_disabled():
    """
    With ``enable_caching=False`` no request may ever see computed blocks,
    even when its prompt shares a prefix with an earlier request.
    """
    block_size = 4
    manager = KVCacheManager(
        block_size=block_size,
        num_gpu_blocks=4,
        max_model_len=8192,
        sliding_window=None,
        enable_caching=False,
        num_preallocate_tokens=0,
    )

    # 10 tokens: two full blocks and a partial third one.
    req1 = make_request("1", list(range(10)))
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
    assert not computed_blocks
    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req1, 10, computed_blocks)
    assert len(blocks) == 3

    # Release the blocks again.
    manager.free(req1)

    # req2 starts with the same tokens as req1 but must get no cache hit.
    req2 = make_request("2", list(range(16)))
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
    assert not computed_blocks
    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req2, 16, computed_blocks)
    assert len(blocks) == 4

    # req2 holds all 4 blocks, so a further request gets no blocks at all.
    req3 = make_request("3", list(range(4)))
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3)
    assert not computed_blocks
    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req3, 4, computed_blocks)
    assert not blocks
|
|
|
|
|
|
@pytest.mark.parametrize("num_preallocate_tokens", list(range(8)))
@pytest.mark.parametrize("block_size", [4])
def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int):
    """
    Each allocation that needs a new block must also return the expected
    number of preallocated blocks, i.e.
    ``cdiv(num_preallocate_tokens, block_size)``.
    """
    manager = KVCacheManager(
        block_size=block_size,
        num_gpu_blocks=10,
        max_model_len=8192,
        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=num_preallocate_tokens,
    )
    num_preallocated_blocks = cdiv(num_preallocate_tokens, block_size)
    expected_num_blocks = 1 + num_preallocated_blocks

    req = make_request("0", list(range(block_size * 30)))
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
    assert not computed_blocks
    assert num_computed_tokens == 0

    # Request slots for exactly one block of tokens.
    blocks = manager.allocate_slots(req, block_size, computed_blocks)
    req.num_computed_tokens = block_size
    assert len(blocks) == expected_num_blocks

    # Pretend everything was computed. If blocks were preallocated, use
    # them up first so that the next allocation needs a fresh block.
    if num_preallocated_blocks > 0:
        manager.allocate_slots(req, block_size * (len(blocks) - 1))
        req.num_computed_tokens = block_size * len(blocks)

    # Ask for one more block of tokens.
    blocks = manager.allocate_slots(req, block_size)
    assert len(blocks) == expected_num_blocks
|
|
|
|
|
|
def test_cache_blocks():
    """
    Unit test for ``KVCacheManager._cache_full_blocks``: blocks passed in
    must receive a hash and be registered in ``cached_block_hash_to_block``.
    """
    block_size = 4
    manager = KVCacheManager(
        block_size=block_size,
        num_gpu_blocks=5,
        max_model_len=8192,
        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=0,
    )

    # Token layout of the request:
    #   Block 0: [0, 1, 2, 3]
    #   Block 1: [4, 5, 6, 7]
    #   Block 2: [8, 9, 10, 11]
    #   Block 3: [12, 13]
    req = make_request("0", list(range(14)))

    # Case 1: cache the first two full blocks, starting at index 0.
    leading_blocks = [KVCacheBlock(block_id=i) for i in range(2)]
    manager._cache_full_blocks(
        request=req,
        blk_start_idx=0,
        full_blocks=leading_blocks,
        prev_block=None,
    )

    assert len(manager.cached_block_hash_to_block) == 2
    for block in leading_blocks:
        assert block.block_hash is not None

    # Case 2: cache a single block that does not start at index 0.
    later_blocks = [KVCacheBlock(block_id=2)]
    manager._cache_full_blocks(
        request=req,
        blk_start_idx=2,
        full_blocks=later_blocks,
        prev_block=None,
    )
    assert len(manager.cached_block_hash_to_block) == 3
    assert later_blocks[0].block_hash is not None
|
|
|
|
|
|
def test_mm_prefix_caching():
    """
    Block hashes of multi-modal requests must carry the hashes of the
    overlapping multi-modal items as extra keys, and a second request with
    the same prefix and items must hit the cache.
    """
    manager = KVCacheManager(
        block_size=16,
        num_gpu_blocks=10,
        max_model_len=8192,
        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=16,
    )

    # Shared prompt (T = text token, P = image placeholder token):
    # [T,...,T, P0,...,P0], [P0,...,P0,T,...,T,P1,...,P1], [P1,...,P1]
    first_block = list(range(10)) + [-1] * 6
    second_block = [-1] * 4 + list(range(10, 20)) + [-1] * 2
    third_block = [-1] * 16
    common_token_ids = first_block + second_block + third_block

    common_mm_positions = [
        PlaceholderRange(offset=11, length=10),
        PlaceholderRange(offset=30, length=18),
    ]
    common_mm_hashes = ["aaa", "bbb"]

    # req0 appends a third image followed by a few text tokens.
    req0_token_ids = common_token_ids + [-1] * 7 + [100] * 4
    req0_mm_positions = [
        *common_mm_positions,
        PlaceholderRange(offset=48, length=7),
    ]
    req0_mm_hashes = [*common_mm_hashes, "ccc"]
    req0 = make_request("0",
                        req0_token_ids,
                        mm_positions=req0_mm_positions,
                        mm_hashes=req0_mm_hashes)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)

    # Nothing is cached yet, but the three complete blocks already have
    # hashes whose extra keys name the images they overlap.
    assert not computed_blocks
    assert num_computed_tokens == 0
    assert len(req0.kv_block_hashes) == 3
    assert req0.kv_block_hashes[0].extra_keys == ("aaa", )
    assert req0.kv_block_hashes[1].extra_keys == ("aaa", "bbb")
    assert req0.kv_block_hashes[2].extra_keys == ("bbb", )

    blocks = manager.allocate_slots(req0, 59, computed_blocks)
    assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
    req0.num_computed_tokens = 59

    # 5 more tokens complete the fourth block without a new allocation.
    for _ in range(5):
        req0.append_output_token_ids(8)
    new_blocks = manager.allocate_slots(req0, 5)
    assert new_blocks is not None
    assert len(new_blocks) == 0

    # The block that was just completed got a hash keyed on the 3rd image.
    assert len(req0.kv_block_hashes) == 4
    assert req0.kv_block_hashes[3].extra_keys == ("ccc", )

    # req1 has the same prefix and images, so the 3 shared blocks hit.
    req1_token_ids = common_token_ids + [-1] * 7 + [200] * 5
    req1_mm_positions = [
        *common_mm_positions,
        PlaceholderRange(offset=48, length=7),
    ]
    req1_mm_hashes = [*common_mm_hashes, "ccc"]
    req1 = make_request("1",
                        req1_token_ids,
                        mm_positions=req1_mm_positions,
                        mm_hashes=req1_mm_hashes)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
    assert len(computed_blocks) == 3
    assert num_computed_tokens == 3 * 16
|
|
|
|
|
|
def test_prefill_not_enough_free_blocks_with_computed_blocks():
    """
    Unit test for ``allocate_slots`` when free blocks run out: if a request
    has computed blocks but cannot be allocated, the ref counts of those
    computed blocks must be left untouched.
    """
    block_size = 16
    manager = KVCacheManager(
        block_size=block_size,
        num_gpu_blocks=10,
        max_model_len=8192,
        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=0,
    )

    # Three complete blocks (48 tokens).
    # | Common-0 | Common-1 | Common-2 | ... |
    common_token_ids = [token for token in range(3) for _ in range(16)]
    req0 = make_request("0", common_token_ids)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
    assert not computed_blocks
    assert num_computed_tokens == 0
    manager.allocate_slots(req0, 48, computed_blocks)
    block_part0 = manager.req_to_blocks[req0.request_id]

    # Req1 repeats the common tokens twice, so it reuses Req0's blocks.
    # | Common-0 | Common-1 | Common-2 | Req1-3 | Req1-4 | Req1-5 | ... |
    req1 = make_request("1", common_token_ids * 2)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
    assert computed_blocks == block_part0
    assert num_computed_tokens == 3 * 16
    manager.allocate_slots(req1, 48, computed_blocks)
    block_part1 = manager.req_to_blocks[req1.request_id]

    # Freeing Req1 leaves the shared blocks held by Req0 only.
    # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
    # | Req1-5(F)| ... |
    manager.free(req1)
    shared_blocks, req1_only_blocks = block_part1[:3], block_part1[3:]
    assert {block.ref_cnt for block in shared_blocks} == {1}
    assert {block.ref_cnt for block in req1_only_blocks} == {0}

    # Req2 shares nothing and takes two more blocks.
    # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
    # | Req1-5(F)| Req2-0 | Req2-1 | ... |
    req2 = make_request("2", [7] * block_size * 2)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
    assert not computed_blocks
    assert num_computed_tokens == 0
    manager.allocate_slots(req2, block_size * 2, computed_blocks)

    # Req3 is Req1 + 3 new blocks, so its first 6 blocks are computed.
    # Of the 5 free blocks, 3 are those computed blocks, leaving only 2
    # for the 3 new blocks, so the allocation must fail. The ref counts
    # of the computed blocks must not change.
    assert manager.free_block_queue.num_free_blocks == 5
    req3 = make_request("3", common_token_ids * 3)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3)
    assert computed_blocks == block_part1
    assert num_computed_tokens == 6 * 16
    # Req3 cannot be allocated.
    assert manager.allocate_slots(req3, 48, computed_blocks) is None
    # Blocks 0-2 are still referenced exactly once (Req0 was never freed).
    assert {block.ref_cnt for block in block_part1[:3]} == {1}
    # Blocks 3-5 are still free.
    assert {block.ref_cnt for block in block_part1[3:]} == {0}
|
|
|
|
|
|
def test_reset_prefix_cache():
    """
    ``reset_prefix_cache`` must fail while blocks are still in use, and
    succeed (clearing every cached hash) once all requests are freed.
    """
    manager = KVCacheManager(
        block_size=16,
        num_gpu_blocks=10,
        max_model_len=8192,
        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=0,
    )

    # Three full blocks that both requests have in common.
    full_block_token_ids = [token for token in range(3) for _ in range(16)]

    req0 = make_request("0", full_block_token_ids + [3] * 7)
    blocks = manager.allocate_slots(req0, 55)
    assert [b.block_id for b in blocks] == [0, 1, 2, 3]

    req1 = make_request("1", full_block_token_ids + [4] * 7)
    computed_blocks, _ = manager.get_computed_blocks(req1)
    assert len(req1.kv_block_hashes) == 3
    assert len(computed_blocks) == 3
    blocks = manager.allocate_slots(req1, 7, computed_blocks)
    assert [b.block_id for b in blocks] == [4]

    # Blocks are still held by req0 and req1, so the reset is refused and
    # the cache stays populated.
    assert not manager.reset_prefix_cache()
    assert manager.cached_block_hash_to_block

    # Release everything.
    manager.free(req0)
    manager.free(req1)

    # Now the reset goes through and no block keeps a hash.
    assert manager.reset_prefix_cache()
    assert not manager.cached_block_hash_to_block
    assert all(blk.block_hash is None for blk in manager.block_pool)
|
|
|
|
|
|
def test_uncache_blocks():
    """
    ``uncache_blocks`` must drop the cache entry of a block that was
    cached for tokens which are no longer counted as computed.
    """
    manager = KVCacheManager(
        block_size=16,
        num_gpu_blocks=10,
        max_model_len=8192,
        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=0,
    )

    # 30 tokens: one full block (cached) and one partial block.
    num_prompt_tokens = 30
    req0 = make_request("0", list(range(num_prompt_tokens)))
    blocks = manager.allocate_slots(req0, num_prompt_tokens)
    assert [b.block_id for b in blocks] == [0, 1]
    assert len(manager.cached_block_hash_to_block) == 1

    req0.num_computed_tokens = num_prompt_tokens

    # Simulate 5 speculative tokens; they complete the second block, which
    # therefore gets cached as well.
    num_spec_tokens = 5
    for _ in range(num_spec_tokens):
        req0.append_output_token_ids(8)
    manager.allocate_slots(req0, num_spec_tokens)
    assert len(manager.cached_block_hash_to_block) == 2

    # After sampling, assume only 1 token was accepted: the second block
    # must be removed from the cache again.
    req0.num_computed_tokens = 31
    num_uncached_blocks = manager.uncache_blocks(req0)
    assert num_uncached_blocks == 1
    assert len(manager.cached_block_hash_to_block) == 1
|