From 16be10168c6bf1f72b4a566cf49cdd10dbbdcf1b Mon Sep 17 00:00:00 2001
From: yurekami
Date: Wed, 24 Dec 2025 23:34:07 +0900
Subject: [PATCH] fix(config): validate skip_tokenizer_init is not used with multimodal models
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add early validation to detect when skip_tokenizer_init=True is used with
multimodal models like Gemma3. This combination is not supported because
multimodal processors require a tokenizer for initialization.

Previously, this would cause a confusing AttributeError:
'NoneType' object has no attribute 'image_token_id'
deep in the transformers Gemma3Processor initialization.

Now users get a clear error message explaining the incompatibility.

Fixes #31123

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5
Signed-off-by: yurekami
---
 vllm/config/model.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/vllm/config/model.py b/vllm/config/model.py
index a730aa8ad1b9c..8eff5c84ef590 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -588,6 +588,19 @@ class ModelConfig:
                 "repo name or path using the --tokenizer argument."
             )
 
+        # Multimodal models require a tokenizer for processor initialization
+        # unless embedded inputs are enabled (enable_mm_embeds=True)
+        if self.skip_tokenizer_init and self.is_multimodal_model:
+            mm_config = getattr(self, "multimodal_config", None)
+            if mm_config is None or not mm_config.enable_mm_embeds:
+                raise ValueError(
+                    "Multimodal models require a tokenizer for processing. "
+                    "Please set skip_tokenizer_init=False when using multimodal "
+                    f"models like {self.model}. Alternatively, enable embedded "
+                    "inputs with enable_mm_embeds=True if your inputs are "
+                    "pre-embedded."
+                )
+
         if self.disable_sliding_window:
             # Set after get_and_verify_max_len to ensure that max_model_len
             # can be correctly capped to sliding window size