Mirror of https://git.datalinker.icu/vllm-project/vllm.git
[Misc] Simplify the prefix caching logic on draft tokens (#20701)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
parent cd587c93ef
commit 7c12a765aa
@@ -190,7 +190,6 @@ class KVCacheManager:
         num_new_tokens: int,
         num_new_computed_tokens: int = 0,
         new_computed_blocks: Optional[KVCacheBlocks] = None,
-        num_draft_tokens: int = 0,
         num_lookahead_tokens: int = 0,
         delay_cache_blocks: bool = False,
     ) -> Optional[KVCacheBlocks]:
@@ -286,12 +285,17 @@ class KVCacheManager:
         if not self.enable_caching or delay_cache_blocks:
             return KVCacheBlocks(new_blocks)

-        # Speculated tokens might be rejected in the future, so we does
-        # not cache any speculated tokens. We only cache blocks with
-        # generated (accepted) tokens.
+        # NOTE(woosuk): We want to commit (cache) up to num_computed_tokens +
+        # num_new_tokens, but must exclude "non-committable" tokens (e.g.,
+        # draft tokens that could be rejected). Therefore, we cap the number
+        # at `request.num_tokens`, ensuring only "finalized" tokens are cached.
+        num_tokens_to_cache = min(num_computed_tokens + num_new_tokens,
+                                  request.num_tokens)
         self.coordinator.cache_blocks(
-            request, self.req_to_block_hashes[request.request_id],
-            num_computed_tokens + num_new_tokens - num_draft_tokens)
+            request,
+            self.req_to_block_hashes[request.request_id],
+            num_tokens_to_cache,
+        )

         return KVCacheBlocks(new_blocks)

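The capping rule in the hunk above is small enough to state on its own. Below is a minimal, standalone sketch (plain Python, not the vLLM API; the helper name and the worked numbers are illustrative assumptions) of how the min() cap keeps draft tokens out of the cache:

# Standalone sketch of the capping rule from the hunk above; not vLLM code.
# Assumption: num_finalized_tokens plays the role of request.num_tokens,
# i.e. prompt + accepted tokens only, while num_new_tokens may also include
# speculative draft tokens scheduled in this step.

def num_tokens_to_cache(num_computed_tokens: int, num_new_tokens: int,
                        num_finalized_tokens: int) -> int:
    """Cache up to the end of the newly scheduled tokens, but never past
    the finalized ones."""
    return min(num_computed_tokens + num_new_tokens, num_finalized_tokens)

# Example: 16 tokens already computed, 8 scheduled this step, but only 20
# tokens are finalized -> the last 4 are draft tokens and are not cached.
assert num_tokens_to_cache(16, 8, 20) == 20

Previously the same quantity was obtained by subtracting num_draft_tokens, which the scheduler had to compute and pass through allocate_slots; the next hunk removes that plumbing.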
@@ -241,15 +241,10 @@ class Scheduler(SchedulerInterface):
                 req_index += 1
                 continue

-            num_draft_tokens = max(
-                num_new_tokens + request.num_computed_tokens -
-                request.num_tokens, 0)
-
             while True:
                 new_blocks = self.kv_cache_manager.allocate_slots(
                     request,
                     num_new_tokens,
-                    num_draft_tokens=num_draft_tokens,
                     num_lookahead_tokens=self.num_lookahead_tokens)
                 if new_blocks is None:
                     # The request cannot be scheduled.
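For completeness, the scheduler-side formula removed above and the new manager-side cap agree on the number of cached tokens, since min(a, b) == a - max(a - b, 0). A quick standalone check (illustrative Python, not vLLM code; the function names are made up):

# Standalone check that the removed subtraction and the new cap coincide.
# Names are illustrative; num_finalized stands in for request.num_tokens.

def cached_old(num_computed: int, num_new: int, num_finalized: int) -> int:
    # Formula removed from the scheduler, then the old cache_blocks argument.
    num_draft = max(num_new + num_computed - num_finalized, 0)
    return num_computed + num_new - num_draft

def cached_new(num_computed: int, num_new: int, num_finalized: int) -> int:
    # New num_tokens_to_cache computed inside KVCacheManager.allocate_slots.
    return min(num_computed + num_new, num_finalized)

for args in [(16, 8, 20), (0, 5, 5), (10, 3, 30)]:
    assert cached_old(*args) == cached_new(*args)

Keeping the computation inside the manager means callers of allocate_slots no longer have to thread num_draft_tokens through, which is the simplification named in the commit title.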