diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 3ca7c0a2c8991..313ab2fa8038b 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Set as AbstractSet from functools import partial import numpy as np @@ -22,14 +23,17 @@ from vllm.multimodal.cache import MultiModalProcessorOnlyCache from vllm.multimodal.inputs import MultiModalInputs from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext from vllm.transformers_utils.tokenizer import ( - AnyTokenizer, MistralTokenizer, cached_tokenizer_from_config, encode_tokens, ) from ....multimodal.utils import random_audio, random_image, random_video -from ...registry import HF_EXAMPLE_MODELS +from ...registry import ( + _MULTIMODAL_EXAMPLE_MODELS, + _TRANSFORMERS_BACKEND_MODELS, + HF_EXAMPLE_MODELS, +) def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict: @@ -83,6 +87,119 @@ def qwen3_vl_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict: return mm_data +# For some multimodal models, tokenizer will always add bos_token +# at the beginning of prompt by default, causing hf_processor outputs +# incorrect token ids. So we need use `add_special_tokens=False` here +# to leave bos_token to be added by the processor. +_ADD_SPECIAL_TOKENS_OVERRIDES = { + "ovis": False, + "ovis2_5": False, + "paligemma": False, + "ultravox": False, + "whisper": False, +} + +_IGNORE_MM_KEYS = { + # In Ultravox, the audio_features can be different depending on padding + # The slight difference should not be a problem though, since + # attention_mask lets us ignore the difference. + "ultravox": {"audio_features"}, +} + +MM_DATA_PATCHES = { + # GLM4.1V and Qwen3-VL requires video metadata to be included in the input + "glm4v": glm4_1v_patch_mm_data, + "glm4v_moe": glm4_1v_patch_mm_data, + "qwen3_vl": qwen3_vl_patch_mm_data, + "qwen3_vl_moe": qwen3_vl_patch_mm_data, +} + + +def _iter_model_ids_to_test(model_arch_list: AbstractSet[str]): + for model_arch in model_arch_list: + model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) + yield model_info.default + + for extra_type, extra_model_id in model_info.extras.items(): + if "fp" in extra_type: + continue # Redundant to test quantized models + + yield extra_model_id + + +def _get_model_ids_to_test(model_arch_list: AbstractSet[str]): + return list(_iter_model_ids_to_test(model_arch_list)) + + +def get_model_ids_to_test(): + transformers_arch_ids = { + model_id + for info in _TRANSFORMERS_BACKEND_MODELS.values() + for model_id in (info.default, *info.extras.values()) + } + vllm_only_archs = { + arch + for arch, info in _MULTIMODAL_EXAMPLE_MODELS.items() + if not any( + model_id in transformers_arch_ids + for model_id in (info.default, *info.extras.values()) + ) + } + + return _get_model_ids_to_test(vllm_only_archs) + + +def get_text_token_prompts( + processor: BaseMultiModalProcessor, + mm_data: MultiModalDataDict, +): + dummy_inputs = processor.dummy_inputs + tokenizer = processor.info.get_tokenizer() + model_config = processor.info.ctx.model_config + + model_type = model_config.hf_config.model_type + if model_type in MM_DATA_PATCHES: + mm_data = MM_DATA_PATCHES[model_type](mm_data) + + parsed_data = processor.data_parser.parse_mm_data(mm_data) + mm_counts = {k: len(vs) for k, vs in parsed_data.items()} + + text_prompt: str | None + token_prompt: list[int] + if isinstance(tokenizer, MistralTokenizer): + images = parsed_data.get("image", []) + request = ChatCompletionRequest( + messages=[ + UserMessage( + content=[ + TextChunk(text=""), + *(ImageChunk(image=image) for image in images), + ] + ), + ] + ) + res = tokenizer.mistral.encode_chat_completion(request) + + # Mistral does not support decode_tokens with skip_special_tokens=False + text_prompt = None + token_prompt = res.tokens + else: + inputs = dummy_inputs.get_dummy_processor_inputs( + model_config.max_model_len, + mm_counts, + ) + assert isinstance(inputs.prompt, str) + + text_prompt = inputs.prompt + token_prompt = encode_tokens( + tokenizer, + text_prompt, + add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type), + ) + + return text_prompt, token_prompt + + def _test_processing_correctness( model_id_or_arch: str, hit_rate: float, @@ -148,8 +265,6 @@ def _test_processing_correctness( baseline_processor = factories.build_processor(ctx, cache=None) cached_processor = factories.build_processor(ctx, cache=cache) - dummy_inputs = baseline_processor.dummy_inputs - tokenizer = baseline_processor.info.get_tokenizer() rng = np.random.RandomState(0) @@ -175,29 +290,6 @@ def _test_processing_correctness( for k, limit in limit_mm_per_prompt_ints.items() } - mm_counts = {k: len(vs) for k, vs in mm_data.items()} - - # Mistral chat outputs tokens directly, rather than text prompts - if isinstance(tokenizer, MistralTokenizer): - images = mm_data.get("image", []) - request = ChatCompletionRequest( - messages=[ - UserMessage( - content=[ - TextChunk(text=""), - *(ImageChunk(image=image) for image in images), - ] - ), - ] - ) - res = tokenizer.mistral.encode_chat_completion(request) - prompt = res.tokens - else: - prompt = dummy_inputs.get_dummy_processor_inputs( - model_config.max_model_len, - mm_counts, - ).prompt - # Drop unnecessary keys and test single -> multi conversion if rng.rand() < simplify_rate: for k in list(mm_data.keys()): @@ -208,8 +300,6 @@ def _test_processing_correctness( _test_processing_correctness_one( model_config, - tokenizer, - prompt, mm_data, baseline_processor, cached_processor, @@ -217,59 +307,17 @@ def _test_processing_correctness( ) -# For some multimodal models, tokenizer will always add bos_token -# at the beginning of prompt by default, causing hf_processor outputs -# incorrect token ids. So we need use `add_special_tokens=False` here -# to leave bos_token to be added by the processor. -_ADD_SPECIAL_TOKENS_OVERRIDES = { - "ovis": False, - "ovis2_5": False, - "paligemma": False, - "ultravox": False, - "whisper": False, -} - -_IGNORE_MM_KEYS = { - # In Ultravox, the audio_features can be different depending on padding - # The slight difference should not be a problem though, since - # attention_mask lets us ignore the difference. - "ultravox": {"audio_features"}, -} - -MM_DATA_PATCHES = { - # GLM4.1V and Qwen3-VL requires video metadata to be included in the input - "glm4v": glm4_1v_patch_mm_data, - "glm4v_moe": glm4_1v_patch_mm_data, - "qwen3_vl": qwen3_vl_patch_mm_data, - "qwen3_vl_moe": qwen3_vl_patch_mm_data, -} - - def _test_processing_correctness_one( model_config: ModelConfig, - tokenizer: AnyTokenizer, - prompt: str | list[int], mm_data: MultiModalDataDict, baseline_processor: BaseMultiModalProcessor, cached_processor: BaseMultiModalProcessor, batch_idx: int, ): model_type = model_config.hf_config.model_type - ignore_mm_keys = _IGNORE_MM_KEYS.get(model_type, set[str]()) - if model_type in MM_DATA_PATCHES: - mm_data = MM_DATA_PATCHES[model_type](mm_data) - if isinstance(prompt, str): - text_prompt = prompt - token_prompt = encode_tokens( - tokenizer, - prompt, - add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type), - ) - else: - # Mistral does not support decode_tokens with skip_special_tokens=False - text_prompt = None - token_prompt = prompt + text_prompt, token_prompt = get_text_token_prompts(baseline_processor, mm_data) + ignore_mm_keys = _IGNORE_MM_KEYS.get(model_type, set[str]()) baseline_tokenized_result = baseline_processor.apply( token_prompt, @@ -324,81 +372,7 @@ def _test_processing_correctness_one( ) -@pytest.mark.parametrize( - "model_id", - [ - "rhymes-ai/Aria", - "CohereForAI/aya-vision-8b", - "Open-Bee/Bee-8B-RL", - "Salesforce/blip2-opt-2.7b", - "facebook/chameleon-7b", - "CohereLabs/command-a-vision-07-2025", - "deepseek-ai/deepseek-vl2-tiny", - "deepseek-ai/DeepSeek-OCR", - "baidu/ERNIE-4.5-VL-28B-A3B-PT", - "adept/fuyu-8b", - "google/gemma-3-4b-it", - "google/gemma-3n-E2B-it", - "zai-org/glm-4v-9b", - "zai-org/GLM-4.1V-9B-Thinking", - "zai-org/GLM-4.5V", - "ibm-granite/granite-speech-3.3-2b", - "h2oai/h2ovl-mississippi-800m", - "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", - "HuggingFaceM4/Idefics3-8B-Llama3", - "internlm/Intern-S1", - "OpenGVLab/InternVL2-1B", - "OpenGVLab/InternVL3-1B", - "OpenGVLab/InternVL3_5-1B", - "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview", - "OpenGVLab/InternVL3_5-30B-A3B", - "Kwai-Keye/Keye-VL-8B-Preview", - "Kwai-Keye/Keye-VL-1_5-8B", - "moonshotai/Kimi-VL-A3B-Instruct", - "meta-llama/Llama-4-Scout-17B-16E-Instruct", - "llava-hf/llava-1.5-7b-hf", - "llava-hf/llava-v1.6-mistral-7b-hf", - "llava-hf/LLaVA-NeXT-Video-7B-hf", - "llava-hf/llava-onevision-qwen2-0.5b-ov-hf", - "TIGER-Lab/Mantis-8B-siglip-llama3", - "mispeech/midashenglm-7b", - "openbmb/MiniCPM-Llama3-V-2_5", - "openbmb/MiniCPM-o-2_6", - "openbmb/MiniCPM-V-2_6", - "MiniMaxAI/MiniMax-VL-01", - "allenai/Molmo-7B-D-0924", - "allenai/Molmo-7B-O-0924", - "nvidia/NVLM-D-72B", - "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1", - "AIDC-AI/Ovis1.6-Gemma2-9B", - "AIDC-AI/Ovis1.6-Llama3.2-3B", - "AIDC-AI/Ovis2-1B", - "AIDC-AI/Ovis2.5-2B", - "google/paligemma-3b-mix-224", - "google/paligemma2-3b-ft-docci-448", - "microsoft/Phi-3.5-vision-instruct", - "microsoft/Phi-4-multimodal-instruct", - "mistralai/Pixtral-12B-2409", - "mistral-community/pixtral-12b", - "Qwen/Qwen-VL-Chat", - "Qwen/Qwen2-VL-2B-Instruct", - "Qwen/Qwen2.5-VL-3B-Instruct", - "Qwen/Qwen2-Audio-7B-Instruct", - "Qwen/Qwen2.5-Omni-3B", - "Qwen/Qwen3-VL-4B-Instruct", - "Qwen/Qwen3-VL-30B-A3B-Instruct", - "Qwen/Qwen3-Omni-30B-A3B-Instruct", - "YannQi/R-4B", - "Skywork/Skywork-R1V-38B", - "HuggingFaceTB/SmolVLM2-2.2B-Instruct", - "stepfun-ai/step3", - "fixie-ai/ultravox-v0_5-llama-3_2-1b", - "openai/whisper-large-v3", - "omni-research/Tarsier-7b", - "omni-research/Tarsier2-Recap-7b", - "mistralai/Voxtral-Mini-3B-2507", - ], -) +@pytest.mark.parametrize("model_id", get_model_ids_to_test()) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) @pytest.mark.parametrize("simplify_rate", [1.0]) @@ -409,7 +383,12 @@ def test_processing_correctness( simplify_rate: float, ): if model_id == "google/gemma-3n-E2B-it": - pytest.skip("Skipping gemma-3n-E2B-it due to transformers #39911 bug.") + pytest.skip("Fix later") + if model_id == "OpenGVLab/InternVL2-2B": + pytest.skip("Fix later") + if model_id == "jinaai/jina-reranker-m0": + pytest.skip("Fix later") + _test_processing_correctness( model_id, hit_rate=hit_rate, diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py index 6f4ddb426cd55..687d1ef349f84 100644 --- a/tests/models/multimodal/processing/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -9,9 +9,6 @@ from typing import Any, TypeAlias import numpy as np import pytest import torch.nn as nn -from mistral_common.protocol.instruct.chunk import ImageChunk, TextChunk -from mistral_common.protocol.instruct.messages import UserMessage -from mistral_common.protocol.instruct.request import ChatCompletionRequest from PIL import Image from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config @@ -37,22 +34,9 @@ from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from vllm.utils.collection_utils import is_list_of from vllm.utils.torch_utils import set_default_torch_dtype -from ...registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS +from ...registry import HF_EXAMPLE_MODELS from ...utils import dummy_hf_overrides - -ARCH_TO_SKIP = { - "MolmoForCausalLM": "incompatible requirements", -} -ARCH_NEEDS_EXTRAS = [ - "InternVLChatModel", - "Idefics3ForConditionalGeneration", - "LlavaForConditionalGeneration", - "MiniCPMV", - "PaliGemmaForConditionalGeneration", -] -REPO_ID_TO_SKIP = { - "nm-testing/pixtral-12b-FP8-dynamic": "duplicated test", -} +from .test_common import get_model_ids_to_test, get_text_token_prompts ImageInput = list[Image.Image] VideoInput: TypeAlias = ( @@ -61,6 +45,18 @@ VideoInput: TypeAlias = ( AudioInput = list[tuple[np.ndarray, int]] +MM_OPTIONS_OVERRIDES = { + # Qwen3-VL's default profiling video size (64x64) can cause trouble + # after resizing, so we override it here for testing. + "qwen3_vl": dict( + video=VideoDummyOptions(num_frames=128, width=256, height=256), + ), + "qwen3_vl_moe": dict( + video=VideoDummyOptions(num_frames=128, width=256, height=256), + ), +} + + def _resize_data( _data: Image.Image | np.ndarray, size_factor: float ) -> Image.Image | np.ndarray: @@ -94,7 +90,7 @@ def resize_mm_data( if is_list_of(data, (Image.Image, np.ndarray, list)): return [_resize_data(d, s) for d, s in zip(data, size_factors)] elif is_list_of(data, tuple): - return [(_resize_data(d, s), meta) for (d, meta), s in zip(data, size_factors)] + return [_resize_data(d, s) for (d, _), s in zip(data, size_factors)] raise ValueError("Unsupported multimodal data type.") @@ -104,6 +100,8 @@ def create_batched_mm_kwargs( processor: BaseMultiModalProcessor, size_factors: tuple[float, ...] = (1.0, 0.5, 0.25), ) -> Iterable[tuple[str, int, BatchedTensorInputs]]: + model_type = model_config.hf_config.model_type + processing_info = processor.info dummy_inputs = processor.dummy_inputs supported_mm_limits = processing_info.get_supported_mm_limits() @@ -114,32 +112,19 @@ def create_batched_mm_kwargs( processor_inputs = dummy_inputs.get_dummy_processor_inputs( seq_len=model_config.max_model_len, mm_counts=mm_counts, + mm_options=MM_OPTIONS_OVERRIDES.get(model_type), ) mm_data = processor_inputs.mm_data resized_mm_data = { modality: resize_mm_data(data, size_factors) for modality, data in mm_data.items() } - # Mistral chat outputs tokens directly, rather than text prompts - if model_config.tokenizer_mode == "mistral": - images = resized_mm_data.get("image", []) - request = ChatCompletionRequest( - messages=[ - UserMessage( - content=[ - TextChunk(text=""), - *(ImageChunk(image=image) for image in images), - ] - ), - ] - ) - tokenizer = processing_info.get_tokenizer() - res = tokenizer.mistral.encode_chat_completion(request) - prompt = res.tokens - else: - prompt = processor_inputs.prompt + + # video metadata will be added back to the resized video data here. + text_prompt, token_prompt = get_text_token_prompts(processor, resized_mm_data) + mm_kwargs = processor.apply( - prompt=prompt, + prompt=token_prompt if text_prompt is None else text_prompt, mm_data=resized_mm_data, hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs, tokenization_kwargs=processor_inputs.tokenization_kwargs, @@ -175,35 +160,15 @@ def initialize_dummy_model( cleanup_dist_env_and_memory() -def get_model_id_to_test(model_arch_list: Iterable[str]) -> list[tuple[str, str]]: - filtered_results = [] - for model_arch in model_arch_list: - model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) - if model_info.extras and model_arch in ARCH_NEEDS_EXTRAS: - available_repos = list( - map( - lambda model_id: (model_arch, model_id), - [model_info.default, *model_info.extras.values()], - ) - ) - filtered_results.extend(available_repos) - else: - filtered_results.append((model_arch, model_info.default)) - return filtered_results - - -@pytest.mark.parametrize( - "model_arch, model_id", get_model_id_to_test(_MULTIMODAL_EXAMPLE_MODELS.keys()) -) -def test_model_tensor_schema(model_arch: str, model_id: str): - if model_arch in ARCH_TO_SKIP: - pytest.skip(f"Skipping {model_arch} due to {ARCH_TO_SKIP[model_arch]}") - if model_id in REPO_ID_TO_SKIP: - pytest.skip(f"Skipping {model_id} due to {REPO_ID_TO_SKIP[model_id]}") - - model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) +@pytest.mark.parametrize("model_id", get_model_ids_to_test()) +def test_model_tensor_schema(model_id: str): + model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id) model_info.check_available_online(on_fail="skip") - model_info.check_transformers_version(on_fail="skip", check_max_version=False) + model_info.check_transformers_version(on_fail="skip") + + model_arch = next( + arch for arch, info in HF_EXAMPLE_MODELS.hf_models.items() if info == model_info + ) hf_overrides_fn = partial( dummy_hf_overrides, diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index 3485adea6ac8a..89ce0068fb1ab 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -733,17 +733,21 @@ class Qwen3OmniMoeThinkerMultiModalProcessor( else (pad_to_hop_length(audio[0], hop_length), audio[1]) for audio in audios ] - mm_kwargs = dict( - **mm_kwargs, - ) + # TODO(Isotr0py): Remove this patch after upstream fix PR # released and Transformers version update: # https://github.com/huggingface/transformers/pull/41473 - if ( - Version(TRANSFORMERS_VERSION) < Version("4.58.0") - and "truncation" not in mm_kwargs - ): - mm_kwargs["truncation"] = False + mm_kwargs = dict(mm_kwargs) + tok_kwargs = dict(tok_kwargs) + if Version(TRANSFORMERS_VERSION) < Version("4.58.0"): + # move truncation to audio_kwargs level to avoid conflict + # with tok_kwargs + mm_kwargs["audio_kwargs"] = { + "truncation": mm_kwargs.pop("truncation", False) + } + mm_kwargs["text_kwargs"] = { + "truncation": tok_kwargs.pop("truncation", False) + } hf_inputs = super()._call_hf_processor( prompt=prompt, diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index fb2af187ebf17..940fa50ff8035 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -898,16 +898,12 @@ class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo]) processor = self.info.get_hf_processor(**mm_kwargs) # Separate video processing from image processing. Because the videos - # are processed into serval image patches - if ( - "videos" in mm_data - and isinstance(mm_data["videos"], list) - and len(mm_data["videos"]) > 0 - ): + # are processed into several image patches + if videos := mm_data.pop("videos", []): video_grid_thw_lst = [] pixel_values_videos_lst = [] - for item_idx, item in enumerate(mm_data.pop("videos", [])): + for item in videos: video_array, metadata = item # NOTE: @JJJYmmm new attr metadata.frames_indices indicates