diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py
index b83c37a9032d..a10b42ea3a4b 100644
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -20,8 +20,6 @@ from .test_completion import zephyr_lora_files  # noqa: F401
 
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
-GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
-
 
 @pytest.fixture(scope="module")
 def monkeypatch_module():
@@ -487,20 +485,9 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
     assert last_completion_tokens == 10
 
 
-# NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
-# (i.e. using the same ordering as in the Completions API tests), the test
-# will fail on the second `guided_decoding_backend` even when I swap their order
-# (ref: https://github.com/vllm-project/vllm/pull/5526#issuecomment-2173772256)
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_choice_chat(client: openai.AsyncOpenAI,
-                                  is_v1_server: bool,
-                                  guided_decoding_backend: str,
                                   sample_guided_choice):
-
-    if is_v1_server and guided_decoding_backend != 'xgrammar':
-        pytest.skip("Only xgrammar backend is supported with V1")
-
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -515,8 +502,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
         messages=messages,
         max_completion_tokens=10,
         temperature=0.7,
-        extra_body=dict(guided_choice=sample_guided_choice,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_choice=sample_guided_choice))
 
     choice1 = chat_completion.choices[0].message.content
     assert choice1 in sample_guided_choice
@@ -530,22 +516,16 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
         messages=messages,
         max_completion_tokens=10,
         temperature=0.7,
-        extra_body=dict(guided_choice=sample_guided_choice,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_choice=sample_guided_choice))
     choice2 = chat_completion.choices[0].message.content
     assert choice2 in sample_guided_choice
     assert choice1 != choice2
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
-                                guided_decoding_backend: str,
+async def test_guided_json_chat(client: openai.AsyncOpenAI,
                                 sample_json_schema):
-
-    if is_v1_server:
-        pytest.skip("sample_json_schema has features unsupported in V1")
-
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -560,8 +540,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=1000,
-        extra_body=dict(guided_json=sample_json_schema,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_json=sample_json_schema))
     message = chat_completion.choices[0].message
     assert message.content is not None
     json1 = json.loads(message.content)
@@ -578,8 +557,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=1000,
-        extra_body=dict(guided_json=sample_json_schema,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_json=sample_json_schema))
     message = chat_completion.choices[0].message
     assert message.content is not None
     json2 = json.loads(message.content)
@@ -589,13 +567,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_guided_regex_chat(client: openai.AsyncOpenAI,
-                                 is_v1_server: bool,
-                                 guided_decoding_backend: str, sample_regex):
-
-    if is_v1_server and guided_decoding_backend != 'xgrammar':
-        pytest.skip("Only xgrammar backend is supported with V1")
+async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex):
 
     messages = [{
         "role": "system",
@@ -610,8 +582,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=20,
-        extra_body=dict(guided_regex=sample_regex,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_regex=sample_regex))
     ip1 = chat_completion.choices[0].message.content
     assert ip1 is not None
     assert re.fullmatch(sample_regex, ip1) is not None
@@ -622,8 +593,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=20,
-        extra_body=dict(guided_regex=sample_regex,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_regex=sample_regex))
     ip2 = chat_completion.choices[0].message.content
     assert ip2 is not None
     assert re.fullmatch(sample_regex, ip2) is not None
@@ -652,15 +622,9 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
-                                           is_v1_server: bool,
-                                           guided_decoding_backend: str,
                                            sample_guided_choice):
-
-    if is_v1_server and guided_decoding_backend != 'xgrammar':
-        pytest.skip("Only xgrammar backend is supported with V1")
-
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -676,8 +640,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
         max_completion_tokens=10,
         logprobs=True,
         top_logprobs=5,
-        extra_body=dict(guided_choice=sample_guided_choice,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_choice=sample_guided_choice))
 
     assert chat_completion.choices[0].logprobs is not None
     assert chat_completion.choices[0].logprobs.content is not None
@@ -689,14 +652,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
-                              guided_decoding_backend: str,
-                              sample_json_schema):
-
-    if is_v1_server:
-        pytest.skip("sample_json_schema has features unsupported on V1")
-
+async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -728,7 +684,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
                 "name": "dummy_function_name"
             }
         },
-        extra_body=dict(guided_decoding_backend=guided_decoding_backend))
+    )
     message = chat_completion.choices[0].message
     assert len(message.content) == 0
     json_string = message.tool_calls[0].function.arguments
@@ -763,7 +719,6 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
                 "name": "dummy_function_name"
             }
         },
-        extra_body=dict(guided_decoding_backend=guided_decoding_backend),
         stream=True)
 
     output = []
@@ -888,7 +843,6 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,
         model=model_name,
         tools=tools,
         tool_choice="required",
-        extra_body=dict(guided_decoding_backend="outlines"),
     )
 
     assert chat_completion.choices[0].message.tool_calls is not None
@@ -900,7 +854,6 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,
         model=model_name,
         tools=tools,
         tool_choice="required",
-        extra_body=dict(guided_decoding_backend="outlines"),
         stream=True,
     )
 
@@ -914,12 +867,7 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,
 
 @pytest.mark.asyncio
 async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
-                                                  is_v1_server: bool,
                                                   sample_json_schema):
-
-    if is_v1_server:
-        pytest.skip("sample_json_schema has features unsupported on V1")
-
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
diff --git a/vllm/config.py b/vllm/config.py
index 23541a884d91..ff9579a4bb1e 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2976,7 +2976,7 @@ class DecodingConfig:
 
     # Which guided decoding algo to use.
     # 'outlines' / 'lm-format-enforcer' / 'xgrammar'
-    guided_decoding_backend: str = 'xgrammar'
+    guided_decoding_backend: str = "auto" if envs.VLLM_USE_V1 else "xgrammar"
 
     reasoning_backend: Optional[str] = None
 
@@ -3001,7 +3001,7 @@ class DecodingConfig:
 
     def __post_init__(self):
         v0_valid_guided_backends = [
-            'outlines', 'lm-format-enforcer', 'xgrammar'
+            'outlines', 'lm-format-enforcer', 'xgrammar', 'auto'
         ]
         v1_valid_guided_backends = ['xgrammar', 'guidance', 'auto']
 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index ba71a8770d17..9cc6eca24b5c 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -182,7 +182,7 @@ class EngineArgs:
     enable_chunked_prefill: Optional[bool] = None
     disable_chunked_mm_input: bool = False
 
-    guided_decoding_backend: str = 'xgrammar'
+    guided_decoding_backend: str = DecodingConfig.guided_decoding_backend
     logits_processor_pattern: Optional[str] = None
 
     speculative_config: Optional[Dict[str, Any]] = None
@@ -407,13 +407,13 @@ class EngineArgs:
         parser.add_argument(
             '--guided-decoding-backend',
             type=str,
-            default='xgrammar',
+            default=DecodingConfig.guided_decoding_backend,
             help='Which engine will be used for guided decoding'
             ' (JSON schema / regex etc) by default. Currently support '
             'https://github.com/mlc-ai/xgrammar and '
             'https://github.com/guidance-ai/llguidance.'
             'Valid backend values are "xgrammar", "guidance", and "auto". '
-            'With "auto", we will make opinionated choices based on request'
+            'With "auto", we will make opinionated choices based on request '
             'contents and what the backend libraries currently support, so '
             'the behavior is subject to change in each release.')
         parser.add_argument(
diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py
index d4fd11fd2e30..6f0eede74b5a 100644
--- a/vllm/model_executor/guided_decoding/__init__.py
+++ b/vllm/model_executor/guided_decoding/__init__.py
@@ -33,6 +33,12 @@ def maybe_backend_fallback(
         logger.warning("%s Falling back to use %s instead.", message, fallback)
         guided_params.backend = fallback
 
+    # `auto` was added for V1 to explicitly declare a mode that has fallbacks
+    # in place. If that is specified with V0, treat it as `xgrammar`, as we have
+    # fallbacks enabled for that and it is the V0 default.
+    if guided_params.backend == "auto":
+        guided_params.backend = "xgrammar"
+
    # lm-format-enforce doesn't support grammar, fallback to xgrammar
     if guided_params.backend_name == "lm-format-enforcer":
         if guided_params.grammar is not None:
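
For context, the net effect on clients: a request no longer needs to name a `guided_decoding_backend`; it passes only the constraint (`guided_choice`, `guided_json`, `guided_regex`, ...) and the server default applies ("auto" on V1, "xgrammar" on V0 per this diff). A minimal sketch of the resulting request shape, mirroring the updated tests — the base URL, API key, and prompt are assumptions, not part of this patch:

```python
import asyncio

import openai

# Hypothetical local vLLM OpenAI-compatible server; adjust URL/key as needed.
client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                            api_key="EMPTY")


async def main() -> None:
    chat_completion = await client.chat.completions.create(
        model="HuggingFaceH4/zephyr-7b-beta",
        messages=[{"role": "user", "content": "Pick a color."}],
        max_completion_tokens=10,
        # Only the constraint is sent; no guided_decoding_backend key.
        # The server picks the backend ("auto" on V1, "xgrammar" on V0).
        extra_body=dict(guided_choice=["red", "green", "blue"]),
    )
    print(chat_completion.choices[0].message.content)


asyncio.run(main())
```

A backend can still be forced server-wide at launch (e.g. `--guided-decoding-backend xgrammar`); per this patch, the flag's default now tracks `DecodingConfig.guided_decoding_backend` instead of a hard-coded string.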