[BugFix] Optional tokenizer argument when loading GGUF models (#29582)

Signed-off-by: Injae Ryou <injaeryou@gmail.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Injae Ryou 2025-11-28 01:53:10 +09:00 committed by GitHub
parent e1f262337b
commit 0840abdd24
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 59 additions and 8 deletions

View File

@@ -439,13 +439,6 @@ class ModelConfig:
        self.model = maybe_model_redirect(self.model)
        # The tokenizer is consistent with the model by default.
        if self.tokenizer is None:
            # Check if this is a GGUF model (either local file or remote GGUF)
            if is_gguf(self.model):
                raise ValueError(
                    "Using a tokenizer is mandatory when loading a GGUF model. "
                    "Please specify the tokenizer path or name using the "
                    "--tokenizer argument."
                )
            self.tokenizer = self.model
        if self.tokenizer_revision is None:
            self.tokenizer_revision = self.revision
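
With the check above removed, a GGUF model's tokenizer simply defaults to the model path, so --tokenizer is no longer mandatory. A minimal sketch of the resulting usage, assuming the "<repo>:<quant_type>" remote-GGUF form and a hypothetical repo name:

from vllm import LLM

# Hypothetical repo/quant; after this change the tokenizer defaults to the
# GGUF model itself instead of raising a ValueError.
llm = LLM(model="Qwen/Qwen3-0.6B-GGUF:Q4_K_M")
print(llm.generate(["Hello"])[0].outputs[0].text)
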
@@ -699,6 +692,14 @@ class ModelConfig:
        self.multimodal_config = MultiModalConfig(**mm_config_kwargs)
        # Multimodal GGUF models must use original repo for mm processing
        if is_gguf(self.tokenizer) and self.is_multimodal_model:
            raise ValueError(
                "Loading a multimodal GGUF model requires the original "
                "tokenizer. Please specify the unquantized HF model's "
                "repo name or path using the --tokenizer argument."
            )
        if self.disable_sliding_window:
            # Set after get_and_verify_max_len to ensure that max_model_len
            # can be correctly capped to sliding window size
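
The guard added above still applies to multimodal GGUF checkpoints, which must keep using the original repo's tokenizer for multimodal processing. A sketch under the same assumptions (repo names are hypothetical):

from vllm import LLM

# A multimodal GGUF model still needs tokenizer= / --tokenizer pointing at
# the unquantized HF repo (names below are hypothetical).
llm = LLM(
    model="someorg/Some-VLM-GGUF:Q4_K_M",
    tokenizer="someorg/Some-VLM",
)
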

View File

@@ -9,6 +9,7 @@ from gguf.constants import Keys, VisionProjectorType
from transformers import Gemma3Config, PretrainedConfig, SiglipVisionConfig

from vllm.logger import init_logger
from vllm.transformers_utils.config import list_filtered_repo_files

logger = init_logger(__name__)
@@ -164,3 +165,44 @@ def maybe_patch_hf_config_from_gguf(
        hf_config = new_hf_config

    return hf_config


def get_gguf_file_path_from_hf(
    repo_id: str | Path,
    quant_type: str,
    revision: str | None = None,
) -> str:
    """Get the GGUF file path from HuggingFace Hub based on repo_id and quant_type.

    Args:
        repo_id: The HuggingFace repository ID (e.g., "Qwen/Qwen3-0.6B")
        quant_type: The quantization type (e.g., "Q4_K_M", "F16")
        revision: Optional revision/branch name

    Returns:
        The path to the GGUF file on HuggingFace Hub (e.g., "filename.gguf").
    """
    repo_id = str(repo_id)
    gguf_patterns = [
        f"*-{quant_type}.gguf",
        f"*-{quant_type}-*.gguf",
        f"*/*-{quant_type}.gguf",
        f"*/*-{quant_type}-*.gguf",
    ]

    matching_files = list_filtered_repo_files(
        repo_id,
        allow_patterns=gguf_patterns,
        revision=revision,
    )
    if len(matching_files) == 0:
        raise ValueError(
            f"Could not find GGUF file for repo {repo_id} "
            f"with quantization {quant_type}."
        )

    # Sort to ensure consistent ordering (prefer non-sharded files)
    matching_files.sort(key=lambda x: (x.count("-"), x))
    gguf_filename = matching_files[0]
    return gguf_filename
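
A hedged usage sketch of the new helper, assuming a repo that ships files such as "qwen3-0.6b-Q4_K_M.gguf"; the second snippet illustrates why the sort key prefers non-sharded files (they contain fewer dashes):

from vllm.transformers_utils.gguf_utils import get_gguf_file_path_from_hf

# Hypothetical repo; returns the first matching *.gguf filename on the Hub.
filename = get_gguf_file_path_from_hf("Qwen/Qwen3-0.6B-GGUF", "Q4_K_M")

# Tie-breaking behaviour of the sort key, on made-up matches:
files = ["model-Q4_K_M-00001-of-00002.gguf", "model-Q4_K_M.gguf"]
files.sort(key=lambda x: (x.count("-"), x))
assert files[0] == "model-Q4_K_M.gguf"  # non-sharded file comes first
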

View File

@@ -19,6 +19,7 @@ from vllm.transformers_utils.config import (
    get_sentence_transformer_tokenizer_config,
    list_filtered_repo_files,
)
from vllm.transformers_utils.gguf_utils import get_gguf_file_path_from_hf
from vllm.transformers_utils.tokenizers import MistralTokenizer
from vllm.transformers_utils.utils import (
    check_gguf_file,
@@ -190,7 +191,14 @@ def get_tokenizer(
            kwargs["gguf_file"] = Path(tokenizer_name).name
            tokenizer_name = Path(tokenizer_name).parent
        elif is_remote_gguf(tokenizer_name):
            tokenizer_name, _ = split_remote_gguf(tokenizer_name)
            tokenizer_name, quant_type = split_remote_gguf(tokenizer_name)
            # Get the HuggingFace Hub path for the GGUF file
            gguf_file = get_gguf_file_path_from_hf(
                tokenizer_name,
                quant_type,
                revision=revision,
            )
            kwargs["gguf_file"] = gguf_file

    # if `tokenizer_mode` == "auto", check if tokenizer can be loaded via Mistral format
    # first to use official Mistral tokenizer if possible.
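
Downstream, the kwargs assembled here are forwarded to transformers, whose tokenizer loader accepts a gguf_file argument for GGUF checkpoints. A rough sketch of the equivalent direct call (repo and filename are hypothetical):

from transformers import AutoTokenizer

# Roughly what get_tokenizer ends up doing for a remote GGUF model once the
# filename has been resolved (names are hypothetical).
tok = AutoTokenizer.from_pretrained(
    "Qwen/Qwen3-0.6B-GGUF",
    gguf_file="qwen3-0.6b-q4_k_m.gguf",
)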