[crashfix] Eagle + multimodal can crash on mm cache miss (#29750)
Signed-off-by: Mickael Seznec <mickael@mistral.ai>
Co-authored-by: Roger Wang <hey@rogerw.io>
This commit is contained in:
parent 014ece97c7
commit 86e178f7c4
@@ -263,6 +263,7 @@ class Scheduler(SchedulerInterface):
                     request.num_computed_tokens,
                     num_new_tokens,
                     encoder_compute_budget,
+                    shift_computed_tokens=1 if self.use_eagle else 0,
                 )

                 if num_new_tokens == 0:
@@ -532,6 +533,7 @@ class Scheduler(SchedulerInterface):
                     num_computed_tokens,
                     num_new_tokens,
                     encoder_compute_budget,
+                    shift_computed_tokens=1 if self.use_eagle else 0,
                 )
                 if num_new_tokens == 0:
                     # The request cannot be scheduled.
@@ -829,6 +831,7 @@ class Scheduler(SchedulerInterface):
         num_computed_tokens: int,
         num_new_tokens: int,
         encoder_compute_budget: int,
+        shift_computed_tokens: int = 0,
     ) -> tuple[list[int], int, int, list[int]]:
         """
         Determine which encoder inputs need to be scheduled in the current step,
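Both call sites above select the shift the same way, and the new parameter defaults to 0, so callers that do not pass it keep the previous behavior. A trivial sketch of that selection (pick_shift is a hypothetical name, and the one-token look-ahead rationale is an assumption based on the commit title, not something stated in the diff):

    # Illustrative only; pick_shift is a made-up name, not part of vLLM.
    def pick_shift(use_eagle: bool) -> int:
        # Assumption: EAGLE's drafter consumes inputs one position ahead of the
        # target model, so the encoder-scheduling window is widened by one token.
        return 1 if use_eagle else 0

    assert pick_shift(False) == 0   # default: behavior unchanged
    assert pick_shift(True) == 1    # EAGLE: look one token further ahead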
@@ -873,7 +876,10 @@ class Scheduler(SchedulerInterface):
             # The encoder output is needed if the two ranges overlap:
             # [num_computed_tokens, num_computed_tokens + num_new_tokens) and
             # [start_pos, start_pos + num_encoder_tokens)
-            if start_pos >= num_computed_tokens + num_new_tokens:
+            if (
+                start_pos
+                >= num_computed_tokens + num_new_tokens + shift_computed_tokens
+            ):
                 # The encoder input is not needed in this step.
                 break

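The changed condition above is the early-exit test: with a shift of 1, an encoder input whose placeholder starts exactly at the end of the scheduled window is no longer skipped, which is presumably the scenario behind the multimodal-cache-miss crash named in the commit title. The standalone sketch below reproduces just that test; the function name and example numbers are made up, and it does not cover the rest of the overlap logic:

    # Sketch of the early-exit test changed above; illustrative only.
    def encoder_output_needed(start_pos: int,
                              num_computed_tokens: int,
                              num_new_tokens: int,
                              shift_computed_tokens: int = 0) -> bool:
        # Needed when [start_pos, ...) begins before the end of the (shifted)
        # window [num_computed_tokens,
        #         num_computed_tokens + num_new_tokens + shift_computed_tokens).
        return start_pos < (num_computed_tokens + num_new_tokens
                            + shift_computed_tokens)

    # An encoder input starting right at the window end is kept only with the
    # one-token shift enabled for EAGLE.
    assert encoder_output_needed(108, 100, 8, shift_computed_tokens=0) is False
    assert encoder_output_needed(108, 100, 8, shift_computed_tokens=1) is True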
@@ -929,10 +935,12 @@ class Scheduler(SchedulerInterface):
                 # NOTE(woosuk): We assume that the encoder input tokens should
                 # be processed altogether, as the encoder usually uses
                 # bidirectional attention.
-                if num_computed_tokens < start_pos:
+                if num_computed_tokens + shift_computed_tokens < start_pos:
                     # We only schedule the decoder tokens just before the
                     # encoder input.
-                    num_new_tokens = start_pos - num_computed_tokens
+                    num_new_tokens = start_pos - (
+                        num_computed_tokens + shift_computed_tokens
+                    )
                 else:
                     # Because of prefix caching, num_computed_tokens is greater
                     # than start_pos even though its encoder input is not
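The final hunk applies the same shift when clamping num_new_tokens so decoding stops just before an encoder input that is not being scheduled in this step. A simplified sketch of that clamping, with a hypothetical function name and the else branch reduced to a pass-through:

    # Sketch of the clamping above; illustrative only, else branch simplified.
    def clamp_new_tokens(num_new_tokens: int,
                         start_pos: int,
                         num_computed_tokens: int,
                         shift_computed_tokens: int = 0) -> int:
        effective_computed = num_computed_tokens + shift_computed_tokens
        if effective_computed < start_pos:
            # Schedule decoder tokens only up to the start of the encoder input.
            return start_pos - effective_computed
        # Prefix caching already covered start_pos; the real scheduler handles
        # this case separately.
        return num_new_tokens

    assert clamp_new_tokens(8, 105, 100, shift_computed_tokens=0) == 5
    assert clamp_new_tokens(8, 105, 100, shift_computed_tokens=1) == 4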