diff --git a/tests/models/registry.py b/tests/models/registry.py
index 257ca36db3a0..1eb7f7b9d829 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -443,6 +443,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                                        hf_overrides={"architectures": ["TarsierForConditionalGeneration"]}),  # noqa: E501
     "Tarsier2ForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier2-Recap-7b",  # noqa: E501
                                                         hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}),  # noqa: E501
+    "VoxtralForConditionalGeneration": _HfExamplesInfo(
+        "mistralai/Voxtral-Mini-3B-2507",
+        min_transformers_version="4.54",
+        # disable this temporarily until we support HF format
+        is_available_online=False,
+    ),
     # [Encoder-decoder]
     # Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
     # Therefore, we borrow the BartTokenizer from the original Bart model
@@ -450,13 +456,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                          tokenizer="Isotr0py/Florence-2-tokenizer",  # noqa: E501
                                          trust_remote_code=True),  # noqa: E501
     "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"),  # noqa: E501
-    "VoxtralForConditionalGeneration": _HfExamplesInfo(
-        "mistralai/Voxtral-Mini-3B-2507",
-        tokenizer_mode="mistral",
-        min_transformers_version="4.54"
-    ),
     "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"),  # noqa: E501
-
     # [Cross-encoder]
     "JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"),  # noqa: E501
 }
diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py
index 25f026e9bef8..979d789b330c 100644
--- a/vllm/model_executor/models/tarsier.py
+++ b/vllm/model_executor/models/tarsier.py
@@ -13,8 +13,7 @@ from transformers import LlavaConfig as HfLlavaConfig
 from transformers import PretrainedConfig, SiglipVisionConfig
 from transformers.image_utils import ImageInput, get_image_size, to_numpy_array
 from transformers.models.llava import LlavaProcessor
-from transformers.processing_utils import (ProcessingKwargs, Unpack,
-                                           _validate_images_text_input_order)
+from transformers.processing_utils import ProcessingKwargs, Unpack
 from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
 
 from vllm.config import VllmConfig
@@ -94,9 +93,6 @@ class TarsierProcessor(LlavaProcessor):
             raise ValueError(
                 "You have to specify at least one of `images` or `text`.")
 
-        # check if images and text inputs are reversed for BC
-        images, text = _validate_images_text_input_order(images, text)
-
         output_kwargs = self._merge_kwargs(
             TarsierProcessorKwargs,
             tokenizer_init_kwargs=self.tokenizer.init_kwargs,
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 2e66dc16b47a..8d1f59e6eadf 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -37,6 +37,7 @@ from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config,
                                              MiniMaxText01Config,
                                              MiniMaxVL01Config, MllamaConfig,
                                              MLPSpeculatorConfig, MPTConfig,
+                                             Nemotron_Nano_VL_Config,
                                              NemotronConfig, NVLM_D_Config,
                                              OvisConfig, RWConfig,
                                              SkyworkR1VChatConfig, SolarConfig,
@@ -80,6 +81,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = {
     "dbrx": DbrxConfig,
     "deepseek_vl_v2": DeepseekVLV2Config,
     "kimi_vl": KimiVLConfig,
+    "Llama_Nemotron_Nano_VL": Nemotron_Nano_VL_Config,
     "mpt": MPTConfig,
     "RefinedWeb": RWConfig,  # For tiiuae/falcon-40b(-instruct)
     "RefinedWebModel": RWConfig,  # For tiiuae/falcon-7b(-instruct)
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index 5d84d648f1c5..89303213a27e 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -23,6 +23,7 @@ from vllm.transformers_utils.configs.moonvit import MoonViTConfig
 from vllm.transformers_utils.configs.mpt import MPTConfig
 from vllm.transformers_utils.configs.nemotron import NemotronConfig
 from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig
+from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config
 from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config
 from vllm.transformers_utils.configs.ovis import OvisConfig
 from vllm.transformers_utils.configs.skyworkr1v import SkyworkR1VChatConfig
@@ -50,6 +51,7 @@ __all__ = [
     "KimiVLConfig",
     "NemotronConfig",
     "NemotronHConfig",
+    "Nemotron_Nano_VL_Config",
     "NVLM_D_Config",
     "OvisConfig",
     "SkyworkR1VChatConfig",
diff --git a/vllm/transformers_utils/configs/nemotron_vl.py b/vllm/transformers_utils/configs/nemotron_vl.py
new file mode 100644
index 000000000000..6a642f26b82a
--- /dev/null
+++ b/vllm/transformers_utils/configs/nemotron_vl.py
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# yapf: disable
+# ruff: noqa: E501
+# Adapted from
+# https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1/blob/main/configuration.py
+# --------------------------------------------------------
+# Adapted from https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B under MIT License
+# LICENSE is in incl_licenses directory.
+# --------------------------------------------------------
+
+from transformers import LlamaConfig
+from transformers.configuration_utils import PretrainedConfig
+from transformers.dynamic_module_utils import get_class_from_dynamic_module
+
+
+class Nemotron_Nano_VL_Config(PretrainedConfig):
+    model_type = 'Llama_Nemotron_Nano_VL'
+    is_composition = True
+
+    def __init__(
+        self,
+        vision_config=None,
+        llm_config=None,
+        force_image_size=None,
+        downsample_ratio=0.5,
+        template=None,
+        ps_version='v1',
+        image_tag_type="internvl",
+        projector_hidden_size=4096,
+        vit_hidden_size=1280,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+
+        if vision_config is not None:
+            assert "auto_map" in vision_config and "AutoConfig" in vision_config["auto_map"]
+            vision_auto_config = get_class_from_dynamic_module(*vision_config["auto_map"]["AutoConfig"].split("--")[::-1])
+            self.vision_config = vision_auto_config(**vision_config)
+        else:
+            self.vision_config = PretrainedConfig()
+
+        if llm_config is None:
+            self.text_config = LlamaConfig()
+        else:
+            self.text_config = LlamaConfig(**llm_config)
+
+        # Assign configuration values
+        self.force_image_size = force_image_size
+        self.downsample_ratio = downsample_ratio
+        self.template = template  # TODO move out of here and into the tokenizer
+        self.ps_version = ps_version  # Pixel shuffle version
+        self.image_tag_type = image_tag_type  # TODO: into the tokenizer too?
+        self.projector_hidden_size = projector_hidden_size
+        self.vit_hidden_size = vit_hidden_size
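
Usage note: a minimal sketch, assuming the patch above is applied, of how the newly registered Nemotron_Nano_VL_Config can be constructed directly (vLLM resolves the "Llama_Nemotron_Nano_VL" model type to this class through _CONFIG_REGISTRY when reading a checkpoint's config.json). The field values below are illustrative, not taken from any real checkpoint.

# Minimal sketch; llm_config is forwarded to LlamaConfig, vision_config=None
# falls back to a bare PretrainedConfig as in the class above.
from vllm.transformers_utils.configs import Nemotron_Nano_VL_Config

cfg = Nemotron_Nano_VL_Config(
    llm_config={"hidden_size": 4096, "num_hidden_layers": 32},  # illustrative
    force_image_size=512,        # illustrative
    downsample_ratio=0.5,
    ps_version="v2",             # pixel shuffle version
    projector_hidden_size=4096,
    vit_hidden_size=1280,
)
assert cfg.model_type == "Llama_Nemotron_Nano_VL"
assert cfg.text_config.num_hidden_layers == 32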