# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Common tests for testing .generate() functionality for single / multiple
image, embedding, and video support for different VLMs in vLLM.
"""

import math
import os
from collections import defaultdict
from pathlib import PosixPath

import pytest
from transformers import (
    AutoModel,
    AutoModelForImageTextToText,
    AutoModelForTextToWaveform,
)

from vllm.platforms import current_platform
from vllm.utils import identity

from ....conftest import (
    IMAGE_ASSETS,
    AudioTestAssets,
    HfRunner,
    ImageTestAssets,
    VideoTestAssets,
    VllmRunner,
)
from ....utils import create_new_process_for_each_test, large_gpu_mark, multi_gpu_marks
from ...utils import check_outputs_equal
from .vlm_utils import custom_inputs, model_utils, runners
from .vlm_utils.case_filtering import get_parametrized_options
from .vlm_utils.types import (
    CustomTestOptions,
    ExpandableVLMTestArgs,
    VLMTestInfo,
    VLMTestType,
)

# This hack is needed for phi3v & paligemma models
# ROCm Triton FA can run into shared memory issues with these models,
# use other backends in the meantime
# FIXME (mattwong, gshtrasb, hongxiayan)
if current_platform.is_rocm():
    os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"

COMMON_BROADCAST_SETTINGS = {
    "test_type": VLMTestType.IMAGE,
    "dtype": "half",
    "max_tokens": 5,
    "tensor_parallel_size": 2,
    "hf_model_kwargs": {"device_map": "auto"},
    "image_size_factors": [(0.25, 0.5, 1.0)],
    "distributed_executor_backend": (
        "ray",
        "mp",
    ),
}

### Test configuration for specific models
# NOTE: The convention of the test settings below is to lead each test key
# with the name of the model arch used in the test, using underscores in place
# of hyphens; this makes it more convenient to filter tests for a specific kind
# of model. For example....
#
# To run all test types for a specific key:
#     use the k flag to substring match with a leading square bracket; if the
#     model arch happens to be a substring of another one, you can add a
#     trailing hyphen. E.g.,
#         - pytest $TEST_FILE -k "[llava-"
#     prevents matching on "[llava_next-" & will match just the enabled cases
#     for llava, i.e., single image, image embedding, and custom input tests.
#
# To run a test for a Test Info for just one of multiple models:
#     use the k flag to substring match the model name, e.g.,
#         - pytest $TEST_FILE -k OpenGVLab/InternVL2-1B
#     prevents matching on OpenGVLab/InternVL2-2B.
#
# You can also combine substrings to match more granularly.
#     ex 1:
#         pytest $TEST_FILE -k "test_single_image and OpenGVLab/InternVL2-1B"
#     will run only test_single_image* for OpenGVLab/InternVL2-1B; this would
#     match both wrappers for single image tests, since it also matches
#     test_single_image_heavy (which forks if we have a distributed backend)
#     ex 2:
#         pytest $TEST_FILE -k "[llava- or [intern_vl-"
#     will run all of the tests for only llava & internvl.
#
# NOTE you can add --collect-only to any of the above commands to see
# which cases would be selected and deselected by pytest. In general,
# this is a good idea for checking your command first, since tests are slow.
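#
# As an illustration only (a hypothetical key and repo, not a real entry), a
# minimal VLMTestInfo sketch would look like the following; the fields mirror
# the real entries defined below:
#
#     "my_model": VLMTestInfo(
#         models=["my-org/my-model"],
#         test_type=VLMTestType.IMAGE,
#         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
#         max_model_len=4096,
#         auto_cls=AutoModelForImageTextToText,
#     ),
#
# The dict key (here "my_model") is what the -k substring filters above match
# against, so it should follow the arch naming convention described in the NOTE.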
VLM_TEST_SETTINGS = {
    #### Core tests to always run in the CI
    "llava": VLMTestInfo(
        models=["llava-hf/llava-1.5-7b-hf"],
        test_type=(VLMTestType.EMBEDDING, VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        convert_assets_to_embeddings=model_utils.get_llava_embeddings,
        max_model_len=4096,
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
        custom_test_opts=[
            CustomTestOptions(
                inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
                    formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
                ),
                limit_mm_per_prompt={"image": 4},
            )
        ],
        # TODO: Revert to "auto" when CPU backend can use torch > 2.6
        dtype="bfloat16" if current_platform.is_cpu() else "auto",
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
    "paligemma": VLMTestInfo(
        models=["google/paligemma-3b-mix-224"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=identity,
        img_idx_to_prompt=lambda idx: "",
        # Paligemma uses its own sample prompts because the default one fails
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
                "stop_sign": "caption es",
                "cherry_blossom": "What is in the picture?",
            }
        ),
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
        dtype="bfloat16",
        marks=[
            pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")
        ],  # noqa: E501
    ),
    "qwen2_5_vl": VLMTestInfo(
        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",  # noqa: E501
        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",  # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
    "qwen2_5_omni": VLMTestInfo(
        models=["Qwen/Qwen2.5-Omni-3B"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>",  # noqa: E501
        video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>",  # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        num_logprobs=6 if current_platform.is_cpu() else 5,
        auto_cls=AutoModelForTextToWaveform,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
    "ultravox": VLMTestInfo(
        models=["fixie-ai/ultravox-v0_5-llama-3_2-1b"],
        test_type=VLMTestType.AUDIO,
        prompt_formatter=lambda audio_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{audio_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        audio_idx_to_prompt=lambda idx: "<|audio|>",
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModel,
        hf_output_post_proc=model_utils.ultravox_trunc_hf_output,
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
    #### Transformers fallback to test
    ## To reduce test burden, we only test batching arbitrary image size
    # Dynamic image length and number of patches
    "llava-onevision-transformers": VLMTestInfo(
        models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
        max_model_len=16384,
        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
            "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
        ),  # noqa: E501
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
        image_size_factors=[(0.25, 0.5, 1.0)],
        vllm_runner_kwargs={
            "model_impl": "transformers",
            "default_torch_num_threads": 1,
        },
        # FIXME: Investigate why the test hangs
        # when processing the 3rd prompt in vLLM
        marks=[pytest.mark.core_model, pytest.mark.skip(reason="Test hangs")],
    ),
    "idefics3-transformers": VLMTestInfo(
        models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|>User:{img_prompt}\nAssistant:",  # noqa: E501
        img_idx_to_prompt=lambda idx: "",
        max_model_len=8192,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
        image_size_factors=[(0.25, 0.5, 1.0)],
        vllm_runner_kwargs={
            "model_impl": "transformers",
        },
        marks=[pytest.mark.core_model],
    ),
    # Pixel values from processor are not 4D or 5D arrays
    "qwen2_5_vl-transformers": VLMTestInfo(
        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",  # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        image_size_factors=[(0.25, 0.2, 0.15)],
        vllm_runner_kwargs={
            "model_impl": "transformers",
        },
        # FIXME: Investigate mrope issue
        marks=[large_gpu_mark(min_gb=32), pytest.mark.skip(reason="Mrope issue")],
    ),
    #### Extended model tests
    "aria": VLMTestInfo(
        models=["rhymes-ai/Aria"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<|img|>\n",
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
                "stop_sign": "Please describe the image shortly.",
                "cherry_blossom": "Please infer the season with reason.",  # noqa: E501
            }
        ),
        multi_image_prompt="Describe the two images shortly.",  # noqa: E501
        stop_str=["<|im_end|>"],
        image_size_factors=[(0.10, 0.15)],
        max_tokens=64,
        marks=[large_gpu_mark(min_gb=64)],
    ),
    "aya_vision": VLMTestInfo(
        models=["CohereForAI/aya-vision-8b"],
        test_type=(VLMTestType.IMAGE),
        prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
                "stop_sign": "<image>What's the content in the center of the image?",  # noqa: E501
                "cherry_blossom": "<image>What is the season?",  # noqa: E501
            }
        ),
        multi_image_prompt="<image><image>Describe the two images in detail.",  # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}},
    ),
    "aya_vision-multi_image": VLMTestInfo(
        models=["CohereForAI/aya-vision-8b"],
        test_type=(VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501 single_image_prompts=IMAGE_ASSETS.prompts( { "stop_sign": "What's the content in the center of the image?", # noqa: E501 "cherry_blossom": "What is the season?", # noqa: E501 } ), multi_image_prompt="Describe the two images in detail.", # noqa: E501 max_model_len=4096, max_num_seqs=2, auto_cls=AutoModelForImageTextToText, vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}}, marks=[large_gpu_mark(min_gb=32)], ), "blip2": VLMTestInfo( models=["Salesforce/blip2-opt-2.7b"], test_type=VLMTestType.IMAGE, prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:", img_idx_to_prompt=lambda idx: "", auto_cls=AutoModelForImageTextToText, vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output, # FIXME: https://github.com/huggingface/transformers/pull/38510 marks=[pytest.mark.skip("Model is broken")], ), "chameleon": VLMTestInfo( models=["facebook/chameleon-7b"], test_type=VLMTestType.IMAGE, prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", max_model_len=4096, max_num_seqs=2, auto_cls=AutoModelForImageTextToText, # For chameleon, we only compare the sequences vllm_output_post_proc=lambda vllm_output, model: vllm_output[:2], hf_output_post_proc=lambda hf_output, model: hf_output[:2], comparator=check_outputs_equal, max_tokens=8, dtype="bfloat16", ), "deepseek_vl_v2": VLMTestInfo( models=["Isotr0py/deepseek-vl2-tiny"], # model repo using dynamic module test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501 max_model_len=4096, max_num_seqs=2, single_image_prompts=IMAGE_ASSETS.prompts( { "stop_sign": "\nWhat's the content in the center of the image?", # noqa: E501 "cherry_blossom": "\nPlease infer the season with reason in details.", # noqa: E501 } ), multi_image_prompt="image_1:\nimage_2:\nWhich image can we see the car and the tower?", # noqa: E501 patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner, hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output, stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"], # noqa: E501 image_size_factors=[(), (1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)], ), "fuyu": VLMTestInfo( models=["adept/fuyu-8b"], test_type=VLMTestType.IMAGE, prompt_formatter=lambda img_prompt: f"{img_prompt}\n", img_idx_to_prompt=lambda idx: "", max_model_len=2048, max_num_seqs=2, auto_cls=AutoModelForImageTextToText, use_tokenizer_eos=True, vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output, num_logprobs=10, image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], marks=[large_gpu_mark(min_gb=32)], ), "gemma3": VLMTestInfo( models=["google/gemma-3-4b-it"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), prompt_formatter=lambda img_prompt: f"user\n{img_prompt}\nmodel\n", # noqa: E501 single_image_prompts=IMAGE_ASSETS.prompts( { "stop_sign": "What's the content in the center of the image?", # noqa: E501 "cherry_blossom": "What is the season?", # noqa: E501 } ), multi_image_prompt="Describe the two images in detail.", # noqa: E501 max_model_len=4096, max_num_seqs=2, auto_cls=AutoModelForImageTextToText, vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}}, patch_hf_runner=model_utils.gemma3_patch_hf_runner, num_logprobs=10, ), "glm4v": VLMTestInfo( models=["zai-org/glm-4v-9b"], test_type=VLMTestType.IMAGE, prompt_formatter=lambda 
img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501 single_image_prompts=IMAGE_ASSETS.prompts( { "stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?", # noqa: E501 "cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?", # noqa: E501 } ), max_model_len=2048, max_num_seqs=2, get_stop_token_ids=lambda tok: [151329, 151336, 151338], patch_hf_runner=model_utils.glm4v_patch_hf_runner, # The image embeddings match with HF but the outputs of the language # decoder are only consistent up to 2 decimal places. # So, we need to reduce the number of tokens for the test to pass. max_tokens=8, num_logprobs=10, marks=[large_gpu_mark(min_gb=32)], ), "glm4_1v": VLMTestInfo( models=["zai-org/GLM-4.1V-9B-Thinking"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501 img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>", # noqa: E501 video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>", # noqa: E501 max_model_len=2048, max_num_seqs=2, get_stop_token_ids=lambda tok: [151329, 151336, 151338], num_logprobs=10, image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], auto_cls=AutoModelForImageTextToText, marks=[large_gpu_mark(min_gb=32)], ), "glm4_1v-video": VLMTestInfo( models=["zai-org/GLM-4.1V-9B-Thinking"], # GLM4.1V require include video metadata for input test_type=VLMTestType.CUSTOM_INPUTS, max_model_len=4096, max_num_seqs=2, auto_cls=AutoModelForImageTextToText, patch_hf_runner=model_utils.glm4_1v_patch_hf_runner, custom_test_opts=[ CustomTestOptions( inputs=custom_inputs.video_with_metadata_glm4_1v(), limit_mm_per_prompt={"video": 1}, ) ], marks=[large_gpu_mark(min_gb=32)], ), "h2ovl": VLMTestInfo( models=[ "h2oai/h2ovl-mississippi-800m", "h2oai/h2ovl-mississippi-2b", ], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501 single_image_prompts=IMAGE_ASSETS.prompts( { "stop_sign": "\nWhat's the content in the center of the image?", # noqa: E501 "cherry_blossom": "\nWhat is the season?", } ), multi_image_prompt="Image-1: \nImage-2: \nDescribe the two images in short.", # noqa: E501 max_model_len=8192, use_tokenizer_eos=True, num_logprobs=10, patch_hf_runner=model_utils.h2ovl_patch_hf_runner, ), "idefics3": VLMTestInfo( models=["HuggingFaceTB/SmolVLM-256M-Instruct"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), prompt_formatter=lambda img_prompt: f"<|begin_of_text|>User:{img_prompt}\nAssistant:", # noqa: E501 img_idx_to_prompt=lambda idx: "", max_model_len=8192, max_num_seqs=2, auto_cls=AutoModelForImageTextToText, hf_output_post_proc=model_utils.idefics3_trunc_hf_output, ), "intern_vl": VLMTestInfo( models=[ "OpenGVLab/InternVL2-1B", "OpenGVLab/InternVL2-2B", # FIXME: Config cannot be loaded in transformers 4.52 # "OpenGVLab/Mono-InternVL-2B", ], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 single_image_prompts=IMAGE_ASSETS.prompts( { "stop_sign": "\nWhat's the content in the center of the image?", # noqa: E501 "cherry_blossom": "\nWhat is the season?", } ), multi_image_prompt="Image-1: \nImage-2: \nDescribe the two images in short.", # noqa: E501 max_model_len=4096, use_tokenizer_eos=True, 
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
    ),
    "intern_vl-video": VLMTestInfo(
        models=[
            "OpenGVLab/InternVL3-1B",
        ],
        test_type=VLMTestType.VIDEO,
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
        video_idx_to_prompt=lambda idx: "