From 4f882be4a0d319551ce2a0eadcace0e76f3432cd Mon Sep 17 00:00:00 2001
From: Yu Jiaqi <54204033+piood@users.noreply.github.com>
Date: Mon, 27 Oct 2025 21:57:37 +0800
Subject: [PATCH] [Model] Siglip2 Model Support (#27566)

Signed-off-by: piood <2477084691@qq.com>
---
 docs/models/supported_models.md                |  2 +-
 tests/models/multimodal/pooling/test_siglip.py |  2 +-
 vllm/model_executor/models/siglip.py           |  8 +++++---
 vllm/transformers_utils/config.py              | 12 +++++++++++-
 4 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 5da561c83b2cc..4d50c809d1966 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -775,7 +775,7 @@ The following table lists those that are tested in vLLM.
 | `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | |
 | `LlavaNextForConditionalGeneration`C | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ |
 | `Phi3VForCausalLM`C | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ |
-| `SiglipModel` | SigLIP | T / I | `google/siglip-base-patch16-224` | | |
+| `SiglipModel` | SigLIP, SigLIP2 | T / I | `google/siglip-base-patch16-224`, `google/siglip2-base-patch16-224` | | |
 | `*ForConditionalGeneration`C, `*ForCausalLM`C, etc. | Generative models | \* | N/A | \* | \* |
 
 C Automatically converted into an embedding model via `--convert embed`. ([details](./pooling_models.md#model-conversion))
diff --git a/tests/models/multimodal/pooling/test_siglip.py b/tests/models/multimodal/pooling/test_siglip.py
index f681b4787b697..3345b10c099ac 100644
--- a/tests/models/multimodal/pooling/test_siglip.py
+++ b/tests/models/multimodal/pooling/test_siglip.py
@@ -19,7 +19,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
     }
 )
 
-MODELS = ["google/siglip-base-patch16-224"]
+MODELS = ["google/siglip-base-patch16-224", "google/siglip2-base-patch16-224"]
 
 
 def _run_test(
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index 694e06f9fc811..e363be523dcce 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -174,9 +174,11 @@ class SiglipMultiModalProcessor(BaseMultiModalProcessor[SiglipProcessingInfo]):
     @cached_property
     def image_token_id(self) -> int:
         tokenizer = self.info.get_tokenizer()
-        dummy_token_id = 0
-
-        assert dummy_token_id not in tokenizer.all_special_ids
+        dummy_token_id = next(
+            token_id
+            for token_id in range(tokenizer.vocab_size)
+            if token_id not in tokenizer.all_special_ids
+        )
 
         return dummy_token_id
 
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 7802cece6075a..13de5939356e9 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -26,7 +26,10 @@ from huggingface_hub.utils import (
 )
 from transformers import GenerationConfig, PretrainedConfig
 from transformers.models.auto.image_processing_auto import get_image_processor_config
-from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+from transformers.models.auto.modeling_auto import (
+    MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
+    MODEL_MAPPING_NAMES,
+)
 from transformers.models.auto.tokenization_auto import get_tokenizer_config
 from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
 
@@ -616,6 +619,13 @@ def get_config(
             model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type]
             config.update({"architectures": [model_type]})
 
+    # Architecture mapping for models without explicit architectures field
+    if not config.architectures:
+        if config.model_type not in MODEL_MAPPING_NAMES:
+            raise ValueError(f"Cannot find architecture name for {config.model_type}")
+        model_type = MODEL_MAPPING_NAMES[config.model_type]
+        config.update({"architectures": [model_type]})
+
     # ModelOpt 0.31.0 and after saves the quantization config in the model
     # config file.
     quantization_config = config_dict.get("quantization_config", None)
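
Reviewer note (not part of the patch): two short sketches for exercising the
change locally. Every name and flag below is an assumption about vLLM's and
transformers' public APIs, not something this patch introduces; adjust them
to the versions you have installed.

First, why the config.py fallback matters: SigLIP2 checkpoints are expected
to ship without an explicit "architectures" field, so get_config() now
derives one from transformers' MODEL_MAPPING_NAMES. A quick way to see this:

    # Illustrative check; exact values depend on the checkpoint and the
    # installed transformers release.
    from transformers import AutoConfig
    from transformers.models.auto.modeling_auto import MODEL_MAPPING_NAMES

    cfg = AutoConfig.from_pretrained("google/siglip2-base-patch16-224")
    print(cfg.architectures)  # expected: None, triggering the new fallback
    print(MODEL_MAPPING_NAMES.get(cfg.model_type))  # e.g. "Siglip2Model"

Second, a minimal end-to-end sketch. It assumes vLLM's generic pooling entry
points (LLM(..., runner="pooling") and LLM.embed()) apply to SiglipModel as
they do to the other embedding models in the supported-models table, and it
guesses the multimodal prompt shape from vLLM's usual prompt-dict format:

    from PIL import Image

    from vllm import LLM

    llm = LLM(model="google/siglip2-base-patch16-224", runner="pooling")

    # Text tower: embed a caption.
    text_out = llm.embed(["a photo of a cat"])

    # Vision tower: the processor inserts the dummy image token located by
    # the new image_token_id logic in the siglip.py hunk above.
    image_out = llm.embed(
        [{"prompt": "", "multi_modal_data": {"image": Image.open("cat.jpg")}}]
    )

    print(len(text_out[0].outputs.embedding))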