mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-10 00:06:06 +08:00
[BugFix] [P/D] Handle lookahead token count edge-case with Eagle Spec Decoding and P/D (#22317)
Signed-off-by: Pradyun Ramadorai <pradyunr@amazon.com> Signed-off-by: Pradyun92 <142861237+Pradyun92@users.noreply.github.com> Co-authored-by: Pradyun Ramadorai <pradyunr@amazon.com> Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com>
This commit is contained in:
parent
81c57f60a2
commit
35afe1b30b
@ -437,14 +437,24 @@ class Scheduler(SchedulerInterface):
|
||||
# The request cannot be scheduled.
|
||||
break
|
||||
|
||||
# Handles an edge case when P/D Disaggregation
|
||||
# is used with Spec Decoding where an
|
||||
# extra block gets allocated which
|
||||
# creates a mismatch between the number
|
||||
# of local and remote blocks.
|
||||
effective_lookahead_tokens = (0 if request.num_computed_tokens
|
||||
== 0 else
|
||||
self.num_lookahead_tokens)
|
||||
|
||||
new_blocks = self.kv_cache_manager.allocate_slots(
|
||||
request,
|
||||
num_new_tokens + num_external_computed_tokens,
|
||||
num_new_local_computed_tokens,
|
||||
new_computed_blocks,
|
||||
num_lookahead_tokens=self.num_lookahead_tokens,
|
||||
num_lookahead_tokens=effective_lookahead_tokens,
|
||||
delay_cache_blocks=load_kv_async,
|
||||
)
|
||||
|
||||
if new_blocks is None:
|
||||
# The request cannot be scheduled.
|
||||
break
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user