mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-15 18:16:46 +08:00
[BugFix] [P/D] Handle lookahead token count edge-case with Eagle Spec Decoding and P/D (#22317)
Signed-off-by: Pradyun Ramadorai <pradyunr@amazon.com>
Signed-off-by: Pradyun92 <142861237+Pradyun92@users.noreply.github.com>
Co-authored-by: Pradyun Ramadorai <pradyunr@amazon.com>
Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com>
This commit is contained in:
parent
81c57f60a2
commit
35afe1b30b
@ -437,14 +437,24 @@ class Scheduler(SchedulerInterface):
|
|||||||
# The request cannot be scheduled.
|
# The request cannot be scheduled.
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# Handles an edge case when P/D Disaggregation
|
||||||
|
# is used with Spec Decoding where an
|
||||||
|
# extra block gets allocated which
|
||||||
|
# creates a mismatch between the number
|
||||||
|
# of local and remote blocks.
|
||||||
|
effective_lookahead_tokens = (0 if request.num_computed_tokens
|
||||||
|
== 0 else
|
||||||
|
self.num_lookahead_tokens)
|
||||||
|
|
||||||
new_blocks = self.kv_cache_manager.allocate_slots(
|
new_blocks = self.kv_cache_manager.allocate_slots(
|
||||||
request,
|
request,
|
||||||
num_new_tokens + num_external_computed_tokens,
|
num_new_tokens + num_external_computed_tokens,
|
||||||
num_new_local_computed_tokens,
|
num_new_local_computed_tokens,
|
||||||
new_computed_blocks,
|
new_computed_blocks,
|
||||||
num_lookahead_tokens=self.num_lookahead_tokens,
|
num_lookahead_tokens=effective_lookahead_tokens,
|
||||||
delay_cache_blocks=load_kv_async,
|
delay_cache_blocks=load_kv_async,
|
||||||
)
|
)
|
||||||
|
|
||||||
if new_blocks is None:
|
if new_blocks is None:
|
||||||
# The request cannot be scheduled.
|
# The request cannot be scheduled.
|
||||||
break
|
break
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user