From 78b8015a4d0f07be1836d78dfe9bfa70ebbfd431 Mon Sep 17 00:00:00 2001 From: Bowen Bao Date: Fri, 3 Oct 2025 15:31:59 -0700 Subject: [PATCH] [Bugfix] Relax tokenizer regex for mixtral to include 'tokenizer.model' (#25964) Signed-off-by: Bowen Bao --- vllm/transformers_utils/tokenizers/mistral.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index d8a8d19391cd..ed9f28d54448 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -122,15 +122,21 @@ def list_local_repo_files(repo_id: str, revision: Optional[str]) -> list[str]: def find_tokenizer_file(files: list[str]): + # Accept both versioned (tokenizer.model.v3) and unversioned + # (tokenizer.model) forms, plus tekken.json and tokenizer.mm.model + # variants. Previous pattern only matched the versioned variants. file_pattern = re.compile( - r"^tokenizer\.model\.v.*$|^tekken\.json$|^tokenizer\.mm\.model\.v.*$") + r"^tokenizer\.model(\.v.*)?|tekken\.json|tokenizer\.mm\.model(\.v.*)?$" + ) matched_files = [file for file in files if file_pattern.match(file)] if len(matched_files) > 1: - raise OSError( - f"Found {len(matched_files)} files matching the " - f"pattern: `{file_pattern.pattern}`. Make sure only one Mistral " - f"tokenizer is present in {files}.") + logger.warning( + "Multiple files matched pattern `%s`: %s. Using %s.", + file_pattern.pattern, + matched_files, + matched_files[0], + ) elif len(matched_files) == 0: raise OSError( f"Found {len(matched_files)} files matching the "