From 57430fc95c8a94a7c68b3d525e3b8823b0f2433f Mon Sep 17 00:00:00 2001 From: Julien Denize <40604584+juliendenize@users.noreply.github.com> Date: Fri, 21 Nov 2025 22:58:59 +0100 Subject: [PATCH] Default model load/config/tokenizer to `mistral` format if relevant files exist (#28659) Signed-off-by: Julien Denize Signed-off-by: Julien Denize <40604584+juliendenize@users.noreply.github.com> Signed-off-by: mgoin Signed-off-by: Michael Goin Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: mgoin --- docs/features/tool_calling.md | 23 +++++-- .../language/generation/test_mistral.py | 2 +- tests/models/multimodal/test_mapping.py | 14 ++++- .../models/quantization/test_bitsandbytes.py | 3 + tests/tool_use/utils.py | 6 ++ tests/transformers_utils/test_config.py | 62 +++++++++++++++++++ tests/transformers_utils/test_utils.py | 6 +- .../llm/test_struct_output_generate.py | 14 ++++- vllm/config/model.py | 9 +-- vllm/model_executor/model_loader/__init__.py | 2 + .../model_loader/default_loader.py | 20 +++++- vllm/transformers_utils/config.py | 46 +++++++++++++- vllm/transformers_utils/configs/mistral.py | 2 +- vllm/transformers_utils/tokenizer.py | 30 +++++---- vllm/v1/engine/processor.py | 25 +++++++- 15 files changed, 230 insertions(+), 34 deletions(-) create mode 100644 tests/transformers_utils/test_config.py diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index 7e6c69e717dba..dd79ba19b7247 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -142,7 +142,7 @@ Flags: `--tool-call-parser hermes` Supported models: * `mistralai/Mistral-7B-Instruct-v0.3` (confirmed) -* Additional mistral function-calling models are compatible as well. +* Additional Mistral function-calling models are compatible as well. Known issues: @@ -158,12 +158,25 @@ Known issues: Recommended flags: -1. 
To use [mistral-common](https://github.com/mistralai/mistral-common) the official Mistral tokenization backend: +1. To use Mistral AI's official format: - `--tokenizer_mode mistral --config_format mistral --load_format mistral --tool-call-parser mistral` + `--tool-call-parser mistral` -2. To use the default Transformers tokenization backend: - `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja` +2. To use the Transformers format when available: + + `--tokenizer_mode hf --config_format hf --load_format hf --tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja` + +!!! note + Models officially released by Mistral AI have two possible formats: + + 1. The official format, which is used by default with the `auto` or `mistral` arguments: + + `--tokenizer_mode mistral --config_format mistral --load_format mistral` + This format uses [mistral-common](https://github.com/mistralai/mistral-common), Mistral AI's tokenizer backend. + + 2. 
The Transformers format, when available, that is used with `hf` arguments: + + `--tokenizer_mode hf --config_format hf --load_format hf --chat-template examples/tool_chat_template_mistral_parallel.jinja` ### Llama Models (`llama3_json`) diff --git a/tests/models/language/generation/test_mistral.py b/tests/models/language/generation/test_mistral.py index 0ae83ec16020a..80e337d570a36 100644 --- a/tests/models/language/generation/test_mistral.py +++ b/tests/models/language/generation/test_mistral.py @@ -208,7 +208,7 @@ def test_mistral_format( with vllm_runner( model, dtype=dtype, - tokenizer_mode="auto", + tokenizer_mode="hf", load_format="safetensors", config_format="hf", ) as hf_format_model: diff --git a/tests/models/multimodal/test_mapping.py b/tests/models/multimodal/test_mapping.py index 2f38dc450ef96..0d2eaca95504e 100644 --- a/tests/models/multimodal/test_mapping.py +++ b/tests/models/multimodal/test_mapping.py @@ -50,12 +50,24 @@ def test_hf_model_weights_mapper(model_arch: str): model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip") + is_mistral_model = model_arch in [ + "Mistral3ForConditionalGeneration", + "PixtralForConditionalGeneration", + "VoxtralForConditionalGeneration", + ] + + if not is_mistral_model or model_info.tokenizer_mode == "mistral": + tokenizer_mode = model_info.tokenizer_mode + else: + tokenizer_mode = "hf" + model_id = model_info.default model_config = ModelConfig( model_id, tokenizer=model_info.tokenizer or model_id, - tokenizer_mode=model_info.tokenizer_mode, + tokenizer_mode=tokenizer_mode, + config_format="hf", revision=model_info.revision, trust_remote_code=model_info.trust_remote_code, hf_overrides=model_info.hf_overrides, diff --git a/tests/models/quantization/test_bitsandbytes.py b/tests/models/quantization/test_bitsandbytes.py index dc4b4546e451b..5b8aaa299fdc1 100644 --- a/tests/models/quantization/test_bitsandbytes.py +++ b/tests/models/quantization/test_bitsandbytes.py @@ 
-259,6 +259,9 @@ def validate_generated_texts( tensor_parallel_size=vllm_tp_size, enforce_eager=False, default_torch_num_threads=1, + tokenizer_mode="hf", + load_format="hf", + config_format="hf", ) as llm: vllm_outputs = llm.generate_greedy(prompts, max_tokens) vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner") diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index d188b21863812..7584b903156b7 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py @@ -128,6 +128,12 @@ CONFIGS: dict[str, ServerConfig] = { "arguments": [ "--enforce-eager", "--no-enable-prefix-caching", + "--tokenizer_mode", + "hf", + "--load_format", + "hf", + "--config_format", + "hf", "--tool-call-parser", "mistral", "--chat-template", diff --git a/tests/transformers_utils/test_config.py b/tests/transformers_utils/test_config.py new file mode 100644 index 0000000000000..de28ab5f99e8c --- /dev/null +++ b/tests/transformers_utils/test_config.py @@ -0,0 +1,62 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import tempfile +from pathlib import Path +from unittest.mock import MagicMock, call, patch + +import pytest + +from vllm.transformers_utils.config import list_filtered_repo_files + + +@pytest.mark.parametrize( + "allow_patterns,expected_relative_files", + [ + ( + ["*.json", "correct*.txt"], + ["json_file.json", "subfolder/correct.txt", "correct_2.txt"], + ), + ], +) +def test_list_filtered_repo_files( + allow_patterns: list[str], expected_relative_files: list[str] +): + with tempfile.TemporaryDirectory() as tmp_dir: + # Prep folder and files + path_tmp_dir = Path(tmp_dir) + subfolder = path_tmp_dir / "subfolder" + subfolder.mkdir() + (path_tmp_dir / "json_file.json").touch() + (path_tmp_dir / "correct_2.txt").touch() + (path_tmp_dir / "uncorrect.txt").touch() + (path_tmp_dir / "uncorrect.jpeg").touch() + (subfolder / "correct.txt").touch() + (subfolder / "uncorrect_sub.txt").touch() + + 
def _glob_path() -> list[str]: + return [ + str(file.relative_to(path_tmp_dir)) + for file in path_tmp_dir.glob("**/*") + if file.is_file() + ] + + # Patch list_repo_files called by fn + with patch( + "vllm.transformers_utils.config.list_repo_files", + MagicMock(return_value=_glob_path()), + ) as mock_list_repo_files: + out_files = sorted( + list_filtered_repo_files( + tmp_dir, allow_patterns, "revision", "model", "token" + ) + ) + assert out_files == sorted(expected_relative_files) + assert mock_list_repo_files.call_count == 1 + assert mock_list_repo_files.call_args_list[0] == call( + repo_id=tmp_dir, + revision="revision", + repo_type="model", + token="token", + ) diff --git a/tests/transformers_utils/test_utils.py b/tests/transformers_utils/test_utils.py index beaef04d766bf..bfe1cec76c138 100644 --- a/tests/transformers_utils/test_utils.py +++ b/tests/transformers_utils/test_utils.py @@ -2,7 +2,11 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from vllm.transformers_utils.utils import is_cloud_storage, is_gcs, is_s3 +from vllm.transformers_utils.utils import ( + is_cloud_storage, + is_gcs, + is_s3, +) def test_is_gcs(): diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index a00600b87eca1..d1b037b7956cf 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -46,11 +46,15 @@ EAGLE_SPEC_CONFIG = { PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [ ("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "auto", None), - ("mistralai/Ministral-8B-Instruct-2410", "guidance", "auto", None), + # FIXME: Since "auto" will use Mistral tokenizer and these backends do not support + # it, we skip these tests for now. 
+ # ("mistralai/Ministral-8B-Instruct-2410", "guidance", "auto", None), + # ("mistralai/Ministral-8B-Instruct-2410", "lm-format-enforcer", "auto", None), + ("mistralai/Ministral-8B-Instruct-2410", "guidance", "hf", None), pytest.param( "mistralai/Ministral-8B-Instruct-2410", "lm-format-enforcer", - "auto", + "hf", None, marks=pytest.mark.skip( reason=( @@ -80,7 +84,7 @@ PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [ # ("mistralai/Ministral-8B-Instruct-2410", "outlines", "mistral", None), # ("Qwen/Qwen2.5-1.5B-Instruct", "guidance", "auto"), ("mistralai/Ministral-8B-Instruct-2410", "outlines", "auto", NGRAM_SPEC_CONFIG), - ("mistralai/Ministral-8B-Instruct-2410", "guidance", "auto", NGRAM_SPEC_CONFIG), + ("mistralai/Ministral-8B-Instruct-2410", "guidance", "hf", NGRAM_SPEC_CONFIG), ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar", "auto", NGRAM_SPEC_CONFIG), ("meta-llama/Meta-Llama-3.1-8B-Instruct", "xgrammar", "auto", EAGLE_SPEC_CONFIG), ] @@ -151,6 +155,8 @@ def test_structured_output( ), seed=120, tokenizer_mode=tokenizer_mode, + load_format="auto" if not model_name.startswith("mistralai/") else "hf", + config_format="auto" if not model_name.startswith("mistralai/") else "hf", speculative_config=speculative_config, ) @@ -720,6 +726,8 @@ def test_structured_output_auto_mode( max_model_len=1024, structured_outputs_config=dict(backend="auto"), tokenizer_mode=tokenizer_mode, + load_format="auto", + config_format="auto", ) sampling_params = SamplingParams( diff --git a/vllm/config/model.py b/vllm/config/model.py index 8f59673f4e1c3..49688e17cf932 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -81,7 +81,7 @@ TaskOption = Literal[ "transcription", "draft", ] -TokenizerMode = Literal["auto", "slow", "mistral", "custom"] +TokenizerMode = Literal["auto", "hf", "slow", "mistral", "custom"] ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] LogprobsMode = Literal[ "raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs" @@ 
-130,7 +130,8 @@ class ModelConfig: name or path will be used.""" tokenizer_mode: TokenizerMode = "auto" """Tokenizer mode:\n - - "auto" will use the fast tokenizer if available.\n + - "auto" will use "hf" tokenizer if Mistral's tokenizer is not available.\n + - "hf" will use the fast tokenizer if available.\n - "slow" will always use the slow tokenizer.\n - "mistral" will always use the tokenizer from `mistral_common`.\n - "custom" will use --tokenizer to select the preregistered tokenizer.""" @@ -241,8 +242,8 @@ class ModelConfig: first one.""" config_format: str | ConfigFormat = "auto" """The format of the model config to load:\n - - "auto" will try to load the config in hf format if available else it - will try to load in mistral format.\n + - "auto" will try to load the config in hf format if available after trying + to load in mistral format.\n - "hf" will load the config in hf format.\n - "mistral" will load the config in mistral format.""" hf_token: bool | str | None = None diff --git a/vllm/model_executor/model_loader/__init__.py b/vllm/model_executor/model_loader/__init__.py index 301f2d00bf404..052d2cfc1099e 100644 --- a/vllm/model_executor/model_loader/__init__.py +++ b/vllm/model_executor/model_loader/__init__.py @@ -30,6 +30,7 @@ logger = init_logger(__name__) # if a new load format is added here LoadFormats = Literal[ "auto", + "hf", "bitsandbytes", "dummy", "fastsafetensors", @@ -45,6 +46,7 @@ LoadFormats = Literal[ ] _LOAD_FORMAT_TO_MODEL_LOADER: dict[str, type[BaseModelLoader]] = { "auto": DefaultModelLoader, + "hf": DefaultModelLoader, "bitsandbytes": BitsAndBytesModelLoader, "dummy": DummyModelLoader, "fastsafetensors": DefaultModelLoader, diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py index 67aa584c6bda2..7401a7a0e2dbb 100644 --- a/vllm/model_executor/model_loader/default_loader.py +++ b/vllm/model_executor/model_loader/default_loader.py @@ -31,6 +31,7 @@ from 
vllm.model_executor.model_loader.weight_utils import ( safetensors_weights_iterator, ) from vllm.platforms import current_platform +from vllm.transformers_utils.config import list_filtered_repo_files logger = init_logger(__name__) @@ -96,8 +97,25 @@ class DefaultModelLoader(BaseModelLoader): load_format = self.load_config.load_format use_safetensors = False index_file = SAFE_WEIGHTS_INDEX_NAME - # Some quantized models use .pt files for storing the weights. + + # First check for 'auto' format that mistral files format are present. + # This is to load mistral models with official format by default. if load_format == "auto": + load_format = ( + "mistral" + if len( + list_filtered_repo_files( + model_name_or_path=model_name_or_path, + allow_patterns=["consolidated*.safetensors"], + revision=revision, + ) + ) + > 0 + else "hf" + ) + + # Some quantized models use .pt files for storing the weights. + if load_format == "hf": allow_patterns = ["*.safetensors", "*.bin"] elif load_format == "safetensors" or load_format == "fastsafetensors": use_safetensors = True diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index df24738477e76..9eac7bb50afa6 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import fnmatch import json import os import time @@ -355,6 +356,41 @@ def list_repo_files( return with_retry(lookup_files, "Error retrieving file list") +def list_filtered_repo_files( + model_name_or_path: str, + allow_patterns: list[str], + revision: str | None = None, + repo_type: str | None = None, + token: str | bool | None = None, +) -> list[str]: + try: + all_files = list_repo_files( + repo_id=model_name_or_path, + revision=revision, + token=token, + repo_type=repo_type, + ) + except Exception: + logger.error( + "Error retrieving file list. 
Please ensure your `model_name_or_path`" + "`repo_type`, `token` and `revision` arguments are correctly set. " + "Returning an empty list." + ) + return [] + + file_list = [] + # Filter patterns on filenames + for pattern in allow_patterns: + file_list.extend( + [ + file + for file in all_files + if fnmatch.fnmatch(os.path.basename(file), pattern) + ] + ) + return file_list + + def file_exists( repo_id: str, file_name: str, @@ -619,10 +655,14 @@ def get_config( if config_format == "auto": try: - if is_gguf or file_or_path_exists(model, HF_CONFIG_NAME, revision=revision): - config_format = "hf" - elif file_or_path_exists(model, MISTRAL_CONFIG_NAME, revision=revision): + # First check for Mistral to avoid defaulting to + # Transformers implementation. + if file_or_path_exists(model, MISTRAL_CONFIG_NAME, revision=revision): config_format = "mistral" + elif is_gguf or file_or_path_exists( + model, HF_CONFIG_NAME, revision=revision + ): + config_format = "hf" else: raise ValueError( "Could not detect config format for no config file found. 
" diff --git a/vllm/transformers_utils/configs/mistral.py b/vllm/transformers_utils/configs/mistral.py index fe202b2ed1568..8da4ab35c56c3 100644 --- a/vllm/transformers_utils/configs/mistral.py +++ b/vllm/transformers_utils/configs/mistral.py @@ -118,7 +118,7 @@ def _remap_general_mistral_args(config: dict) -> dict: "model_type": ("model_type", "transformer"), "hidden_act": ("activation", "silu"), "tie_word_embeddings": ("tied_embeddings", False), - "max_seq_len": ("max_seq_len", 128_000), + "max_seq_len": ("max_seq_len", config.get("max_position_embeddings", 128_000)), "max_position_embeddings": ("max_position_embeddings", 128_000), } diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index a393568909d27..233076741503d 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -3,8 +3,8 @@ import contextlib import copy +import importlib.util import os -import warnings from functools import lru_cache from pathlib import Path from typing import TYPE_CHECKING, Any, TypeAlias @@ -15,7 +15,10 @@ from typing_extensions import assert_never from vllm import envs from vllm.logger import init_logger -from vllm.transformers_utils.config import get_sentence_transformer_tokenizer_config +from vllm.transformers_utils.config import ( + get_sentence_transformer_tokenizer_config, + list_filtered_repo_files, +) from vllm.transformers_utils.tokenizers import MistralTokenizer from vllm.transformers_utils.utils import check_gguf_file @@ -182,25 +185,29 @@ def get_tokenizer( kwargs["gguf_file"] = Path(tokenizer_name).name tokenizer_name = Path(tokenizer_name).parent - # if tokenizer is from official mistral org - is_from_mistral_org = str(tokenizer_name).split("/")[0] == "mistralai" - if is_from_mistral_org and tokenizer_mode != "mistral": - warnings.warn( - "It is strongly recommended to run mistral models with " - '`--tokenizer-mode "mistral"` to ensure correct ' - "encoding and decoding.", - FutureWarning, - 
stacklevel=2, + # if `tokenizer_mode` == "auto", check if tokenizer can be loaded via Mistral format + # first to use official Mistral tokenizer if possible. + mistral_common_installed = importlib.util.find_spec("mistral_common") is not None + if tokenizer_mode == "auto" and mistral_common_installed: + allow_patterns = ["tekken.json", "tokenizer.model.v*"] + files_list = list_filtered_repo_files( + model_name_or_path=str(tokenizer_name), + allow_patterns=allow_patterns, + revision=revision, ) + if len(files_list) > 0: + tokenizer_mode = "mistral" tokenizer: AnyTokenizer if tokenizer_mode == "mistral": + logger.debug_once(f"Loading MistralTokenizer from {tokenizer_name}") tokenizer = MistralTokenizer.from_pretrained( str(tokenizer_name), revision=revision ) elif tokenizer_mode == "custom": from vllm.transformers_utils.tokenizer_base import TokenizerRegistry + logger.debug_once(f"Loading CustomTokenizer from {tokenizer_name}") tokenizer = TokenizerRegistry.get_tokenizer( str(tokenizer_name), *args, @@ -210,6 +217,7 @@ def get_tokenizer( ) else: try: + logger.debug_once(f"Loading AutoTokenizer from {tokenizer_name}") tokenizer = AutoTokenizer.from_pretrained( tokenizer_name, *args, diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 905ad406b307e..af4f0e410e253 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -20,6 +20,7 @@ from vllm.multimodal.utils import argsort_mm_positions from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer from vllm.utils import length_from_prompt_token_ids_or_embeds from vllm.v1.engine import EngineCoreRequest from vllm.v1.metrics.stats import MultiModalCacheStats @@ -300,12 +301,24 @@ class Processor: # allows <|special_token|> and similar, see # 
https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md#special-tokens # Without tokenizer these are disallowed in grammars. + if isinstance(self.tokenizer, MistralTokenizer): + raise ValueError( + "Mistral tokenizer is not supported for the 'guidance' " + "structured output backend. Please use ['xgrammar', 'outlines'] " + "backends or tokenizer_mode='hf' instead." + ) validate_guidance_grammar(params, tokenizer=None) elif backend == "outlines": # outlines backend validate_structured_output_request_outlines(params) elif backend == "lm-format-enforcer": # lm format enforcer backend + if isinstance(self.tokenizer, MistralTokenizer): + raise ValueError( + "Mistral tokenizer is not supported for the 'lm-format-enforcer' " + "structured output backend. Please use ['xgrammar', 'outlines'] " + "backends or tokenizer_mode='hf' instead." + ) validate_structured_output_request_lm_format_enforcer(params) else: # NOTE: backend must be "auto" here, because we have @@ -320,9 +333,15 @@ class Processor: except ValueError: # The request either failed validation # or includes some jsonschema feature(s) that - # are not supported in xgrammar. Fall back to guidance. - validate_guidance_grammar(params, tokenizer=None) - params.structured_outputs._backend = "guidance" + # are not supported in xgrammar. + if isinstance(self.tokenizer, MistralTokenizer): + # Fall back to outlines if the tokenizer is Mistral + validate_structured_output_request_outlines(params) + params.structured_outputs._backend = "outlines" + else: + # Fall back to guidance by default. + validate_guidance_grammar(params, tokenizer=None) + params.structured_outputs._backend = "guidance" # Remember that this backend was set automatically params.structured_outputs._backend_was_auto = True