From 6909a762012ce665931ff6d482dce17cf927108a Mon Sep 17 00:00:00 2001 From: Julien Denize <40604584+juliendenize@users.noreply.github.com> Date: Sun, 30 Mar 2025 05:20:19 +0200 Subject: [PATCH] [Bugfix] Fix Mistral guided generation using xgrammar (#15704) Signed-off-by: Julien Denize --- .../llm/test_struct_output_generate.py | 33 +++++++++++++------ vllm/v1/structured_output/backend_xgrammar.py | 18 ++++++---- 2 files changed, 34 insertions(+), 17 deletions(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index a32dd8263992e..fa58c6460f840 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -15,11 +15,20 @@ from vllm.entrypoints.llm import LLM from vllm.outputs import RequestOutput from vllm.sampling_params import GuidedDecodingParams, SamplingParams -GUIDED_DECODING_BACKENDS_V1 = [ - "xgrammar:disable-any-whitespace", "guidance:disable-any-whitespace" +PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [ + ("mistralai/Ministral-8B-Instruct-2410", "xgrammar:disable-any-whitespace", + "auto"), + ("mistralai/Ministral-8B-Instruct-2410", "guidance:disable-any-whitespace", + "auto"), + ("mistralai/Ministral-8B-Instruct-2410", "xgrammar:disable-any-whitespace", + "mistral"), + ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar:disable-any-whitespace", "auto"), + ("Qwen/Qwen2.5-1.5B-Instruct", "guidance:disable-any-whitespace", "auto"), ] -MODELS_TO_TEST = [ - "Qwen/Qwen2.5-1.5B-Instruct", "mistralai/Ministral-8B-Instruct-2410" + +PARAMS_MODELS_TOKENIZER_MODE = [ + ("mistralai/Ministral-8B-Instruct-2410", "auto"), + ("Qwen/Qwen2.5-1.5B-Instruct", "auto"), ] @@ -37,9 +46,8 @@ class CarDescription(BaseModel): @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", - GUIDED_DECODING_BACKENDS_V1) -@pytest.mark.parametrize("model_name", MODELS_TO_TEST) +@pytest.mark.parametrize("model_name, guided_decoding_backend, tokenizer_mode", + PARAMS_MODELS_BACKENDS_TOKENIZER_MODE) def test_structured_output( monkeypatch: pytest.MonkeyPatch, sample_json_schema: dict[str, Any], @@ -49,6 +57,7 @@ def test_structured_output( sample_regex: str, sample_guided_choice: str, guided_decoding_backend: str, + tokenizer_mode: str, model_name: str, ): monkeypatch.setenv("VLLM_USE_V1", "1") @@ -58,7 +67,8 @@ def test_structured_output( llm = LLM(model=model_name, enforce_eager=True, max_model_len=1024, - guided_decoding_backend=guided_decoding_backend) + guided_decoding_backend=guided_decoding_backend, + tokenizer_mode=tokenizer_mode) # # Test 1: Generate JSON output based on a provided schema @@ -324,17 +334,20 @@ def test_structured_output( @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("model_name", MODELS_TO_TEST) +@pytest.mark.parametrize("model_name, tokenizer_mode", + PARAMS_MODELS_TOKENIZER_MODE) def test_structured_output_auto_mode( monkeypatch: pytest.MonkeyPatch, unsupported_json_schema: dict[str, Any], model_name: str, + tokenizer_mode: str, ): monkeypatch.setenv("VLLM_USE_V1", "1") llm = LLM(model=model_name, max_model_len=1024, - guided_decoding_backend="auto") + guided_decoding_backend="auto", + tokenizer_mode=tokenizer_mode) sampling_params = SamplingParams( temperature=1.0, diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index 9bfb644c58094..7fe62f26af597 100644 --- a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -42,12 +42,15 @@ class XgrammarBackend(StructuredOutputBackend): # NOTE: ideally, xgrammar should handle this accordingly. # refer to https://github.com/mlc-ai/xgrammar/blob/d77c0a0173ef14779c918e3be7966ba852f7910f/python/xgrammar/tokenizer_info.py#L98 try: - encoded_vocab = [ - token for token, _ in sorted( - tokenizer.get_vocab().items(), - key=lambda x: x[1], - ) - ] + if tokenizer.is_tekken: + encoded_vocab = tokenizer._vocab + else: + encoded_vocab = [ + token for token, _ in sorted( + tokenizer.get_vocab().items(), + key=lambda x: x[1], + ) + ] stop_token_ids = None if hasattr( tokenizer, @@ -62,7 +65,8 @@ class XgrammarBackend(StructuredOutputBackend): tokenizer_info = xgr.TokenizerInfo( # type: ignore encoded_vocab=encoded_vocab, # NOTE: https://github.com/mlc-ai/xgrammar/blob/5e141f6ff1ca02bc31f9e512e68b61f2a8ae88e5/tests/python/test_tokenizer_info.py#L43 # noqa: E501 - vocab_type=xgr.VocabType.BYTE_FALLBACK, + vocab_type=xgr.VocabType.RAW + if tokenizer.is_tekken else xgr.VocabType.BYTE_FALLBACK, vocab_size=self.vocab_size, stop_token_ids=stop_token_ids, add_prefix_space=True,