Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-10 10:18:39 +08:00)
[Bugfix] Spec decode + structured output + spec model max len edge case (#28298)
Signed-off-by: Andy Lo <andy@mistral.ai>
parent 26990d25dc
commit 47604137a2
@@ -7,6 +7,7 @@ import pytest
 from tests.utils import get_attn_backend_list_based_on_platform
 from vllm import LLM, SamplingParams
 from vllm.platforms import current_platform
+from vllm.sampling_params import StructuredOutputsParams

 _PROMPTS = [
     "1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1",
@@ -56,8 +57,34 @@ def test_eagle_max_len(
             "method": "eagle",
             "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
             "num_speculative_tokens": num_speculative_tokens,
+            "max_model_len": 80,
         },
-        max_model_len=100,
+        max_model_len=200,
     )
-    sampling_params = SamplingParams(max_tokens=100, ignore_eos=True)
-    llm.generate(_PROMPTS, sampling_params)
+    sampling_params = SamplingParams(max_tokens=200, ignore_eos=True)
+    outputs = llm.generate(_PROMPTS, sampling_params)
+    for o in outputs:
+        assert o.outputs[0].finish_reason == "length", (
+            "This test is only meaningful if the output "
+            "is truncated due to max length"
+        )
+
+    sampling_params = SamplingParams(
+        max_tokens=200,
+        structured_outputs=StructuredOutputsParams(
+            regex="^" + "a b c d e " * 15 + "$"
+        ),
+    )
+    output = llm.generate(_PROMPTS, sampling_params)
+    for o in output:
+        assert o.prompt_token_ids is not None
+        assert (
+            len(o.prompt_token_ids)
+            < 80
+            < len(o.prompt_token_ids) + len(o.outputs[0].token_ids)
+            < 200
+        ), (
+            "This test is only meaningful if the output "
+            "is longer than the eagle max length"
+        )
+        assert o.outputs[0].text == "a b c d e " * 15
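For context, the new assertions exercise the case where the draft (EAGLE) model's max_model_len (80) is smaller than the target model's max_model_len (200), so speculation must stop partway through a structured-output generation. A minimal standalone sketch of that setup is below; the target model name and the num_speculative_tokens value are assumed placeholders not taken from this diff, while the draft model, lengths, regex, and prompt mirror the test above.

# Standalone sketch (not part of the diff): draft model max len < target max len,
# combined with structured outputs.
from vllm import LLM, SamplingParams
from vllm.sampling_params import StructuredOutputsParams

llm = LLM(
    model="meta-llama/Meta-Llama-3-8B-Instruct",  # assumed target model for this EAGLE head
    speculative_config={
        "method": "eagle",
        "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
        "num_speculative_tokens": 3,  # placeholder value
        "max_model_len": 80,          # draft model stops proposing past 80 tokens
    },
    max_model_len=200,
)

params = SamplingParams(
    max_tokens=200,
    structured_outputs=StructuredOutputsParams(regex="^" + "a b c d e " * 15 + "$"),
)
out = llm.generate([" ".join(["1"] * 25)], params)
print(out[0].outputs[0].text)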
@@ -325,6 +325,9 @@ class Scheduler(SchedulerInterface):
                 scheduled_spec_decode_tokens[request.request_id] = (
                     request.spec_token_ids
                 )
+                # New spec tokens will be set in `update_draft_token_ids` before the
+                # next step when applicable.
+                request.spec_token_ids = []

             # Encoder-related.
             if encoder_inputs_to_schedule:
@@ -1149,10 +1152,7 @@ class Scheduler(SchedulerInterface):
                continue

            # Add newly generated spec token ids to the request.
-            if not spec_token_ids:
-                # NOTE(woosuk): request.spec_token_ids should be updated.
-                request.spec_token_ids.clear()
-            elif self.structured_output_manager.should_advance(request):
+            if self.structured_output_manager.should_advance(request):
                metadata = request.structured_output_request
                request.spec_token_ids = metadata.grammar.validate_tokens(  # type: ignore[union-attr]
                    spec_token_ids
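Taken together, the two Scheduler hunks above mean that spec_token_ids is reset to an empty list at schedule time, and newly proposed draft tokens re-enter the request only after grammar validation when structured output should advance. A rough, self-contained sketch of that flow follows; FakeGrammar and FakeRequest are toy stand-ins for the real vLLM classes, and the non-structured-output branch lies outside the shown hunk, so it is omitted.

# Toy sketch of the patched flow; FakeGrammar/FakeRequest are illustrative stand-ins.
from dataclasses import dataclass, field


@dataclass
class FakeGrammar:
    cutoff: int = 100  # pretend tokens >= cutoff violate the grammar

    def validate_tokens(self, tokens: list[int]) -> list[int]:
        # Keep only the longest grammar-valid prefix of the proposed draft tokens.
        valid: list[int] = []
        for t in tokens:
            if t >= self.cutoff:
                break
            valid.append(t)
        return valid


@dataclass
class FakeRequest:
    spec_token_ids: list[int] = field(default_factory=list)
    grammar: FakeGrammar = field(default_factory=FakeGrammar)


def schedule(request: FakeRequest) -> list[int]:
    # Mirrors the first Scheduler hunk: hand the current spec tokens to the
    # scheduler output, then clear them on the request so stale drafts cannot
    # leak into the next step.
    scheduled = request.spec_token_ids
    request.spec_token_ids = []
    return scheduled


def update_draft_tokens(request: FakeRequest, proposed: list[int], should_advance: bool) -> None:
    # Mirrors the second Scheduler hunk: when structured output should advance,
    # new draft tokens always pass through grammar validation; the old
    # "if not spec_token_ids: clear()" special case is unnecessary because
    # schedule() already reset the list.
    if should_advance:
        request.spec_token_ids = request.grammar.validate_tokens(proposed)


req = FakeRequest(spec_token_ids=[7, 8])
print(schedule(req))       # [7, 8]; req.spec_token_ids is now []
update_draft_tokens(req, [1, 2, 500, 3], should_advance=True)
print(req.spec_token_ids)  # [1, 2] -- only the grammar-valid prefix survives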
@@ -269,9 +269,10 @@ class StructuredOutputManager:
                 and token is not None
                 and not structured_output_request.grammar.is_terminated()
             ):
-                assert structured_output_request.grammar.accept_tokens(
+                accepted = structured_output_request.grammar.accept_tokens(
                     req_id, [token]
                 )
+                assert accepted, (token, req_id, scheduled_spec_decode_tokens)
                 state_advancements += 1
             cumulative_index += 1
         if state_advancements > 0:
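The last hunk only changes how the assertion fails: binding the result first and attaching a payload means the AssertionError now reports which token and request were rejected, plus the scheduled spec tokens. A tiny plain-Python illustration of that pattern; accept_tokens here is a made-up stand-in, not the real grammar API.

def accept_tokens(req_id: str, tokens: list[int]) -> bool:
    # Made-up stand-in: pretend only small token ids are grammar-valid.
    return all(t < 10 for t in tokens)


token, req_id = 42, "req-0"
accepted = accept_tokens(req_id, [token])
try:
    # The tuple after the comma becomes the AssertionError payload, so the
    # failure message names the offending token and request.
    assert accepted, (token, req_id)
except AssertionError as exc:
    print("rejected draft token:", exc)  # rejected draft token: (42, 'req-0')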