diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 92fb2404d8a2..2f903a33c6b8 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -9,7 +9,7 @@ from pathlib import PosixPath
 
 import pytest
 from packaging.version import Version
-from transformers import AutoModelForPreTraining, AutoModelForVision2Seq
+from transformers import AutoModelForImageTextToText, AutoModelForVision2Seq
 from transformers import __version__ as TRANSFORMERS_VERSION
 
 from vllm.platforms import current_platform
@@ -101,7 +101,7 @@ VLM_TEST_SETTINGS = {
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         convert_assets_to_embeddings=model_utils.get_llava_embeddings,
         max_model_len=4096,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
         custom_test_opts=[CustomTestOptions(
             inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
@@ -121,7 +121,7 @@ VLM_TEST_SETTINGS = {
             "stop_sign": "caption es",
             "cherry_blossom": "What is in the picture?",
         }),
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
         ),
@@ -190,7 +190,7 @@ VLM_TEST_SETTINGS = {
         test_type=VLMTestType.IMAGE,
         prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
         img_idx_to_prompt=lambda idx: "",
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
     ),
     "chameleon": VLMTestInfo(
@@ -199,7 +199,7 @@ VLM_TEST_SETTINGS = {
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
         max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
         ),
@@ -240,6 +240,7 @@ VLM_TEST_SETTINGS = {
         img_idx_to_prompt=lambda idx: "",
         max_model_len=2048,
         max_num_seqs=2,
+        auto_cls=AutoModelForImageTextToText,
         use_tokenizer_eos=True,
         vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
         num_logprobs=10,
@@ -256,8 +257,7 @@ VLM_TEST_SETTINGS = {
         multi_image_prompt="Describe the two images in detail.",  # noqa: E501
         max_model_len=4096,
         max_num_seqs=2,
-        # TODO: Use AutoModelForVision2Seq once transformers supports this
-        auto_cls=AutoModelForPreTraining,
+        auto_cls=AutoModelForImageTextToText,
         dtype="bfloat16",
         vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
         patch_hf_runner=model_utils.gemma3_patch_hf_runner,
@@ -307,7 +307,7 @@ VLM_TEST_SETTINGS = {
         img_idx_to_prompt=lambda idx: "",
         max_model_len=8192,
         max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
     ),
     "intern_vl": VLMTestInfo(
@@ -336,7 +336,7 @@ VLM_TEST_SETTINGS = {
         test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
         prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
         max_model_len=10240,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
         custom_test_opts=[CustomTestOptions(
             inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
@@ -382,7 +382,7 @@ VLM_TEST_SETTINGS = {
             "pixel_values"
         ),
         get_stop_token_ids=lambda tok: [128009],
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
         patch_hf_runner=model_utils.mantis_patch_hf_runner,
         marks=[
@@ -463,7 +463,7 @@ VLM_TEST_SETTINGS = {
         img_idx_to_prompt=lambda idx: "[IMG]",
         max_model_len=8192,
         max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         marks=[large_gpu_mark(min_gb=48)],
     ),
     "qwen_vl": VLMTestInfo(
@@ -481,7 +481,7 @@ VLM_TEST_SETTINGS = {
         models=["facebook/chameleon-7b"],
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
         ),
@@ -495,7 +495,7 @@ VLM_TEST_SETTINGS = {
         models=["llava-hf/llava-1.5-7b-hf"],
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
         marks=multi_gpu_marks(num_gpus=2),
         **COMMON_BROADCAST_SETTINGS  # type: ignore
@@ -504,7 +504,7 @@ VLM_TEST_SETTINGS = {
         models=["llava-hf/llava-v1.6-mistral-7b-hf"],
         prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
         max_model_len=10240,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
         marks=multi_gpu_marks(num_gpus=2),
         **COMMON_BROADCAST_SETTINGS  # type: ignore
diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py
index 8b9a856d005e..d5d410f17ddf 100644
--- a/tests/models/embedding/vision_language/test_llava_next.py
+++ b/tests/models/embedding/vision_language/test_llava_next.py
@@ -2,7 +2,7 @@
 import pytest
 import torch.nn.functional as F
-from transformers import AutoModelForVision2Seq
+from transformers import AutoModelForImageTextToText
 
 from vllm.platforms import current_platform
@@ -70,7 +70,7 @@ def _run_test(
         vllm_outputs = vllm_model.encode(input_texts, images=input_images)
 
     with hf_runner(model, dtype=dtype,
-                   auto_cls=AutoModelForVision2Seq) as hf_model:
+                   auto_cls=AutoModelForImageTextToText) as hf_model:
         # Patch the issue where generation_config.json is missing
         hf_model.processor.patch_size = \
             hf_model.model.config.vision_config.patch_size
diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py
index 08e4b1b2f309..d2cdcfe4a56a 100644
--- a/tests/models/encoder_decoder/vision_language/test_mllama.py
+++ b/tests/models/encoder_decoder/vision_language/test_mllama.py
@@ -4,8 +4,8 @@ from typing import Optional, overload
 
 import pytest
 import torch
-from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
-                          BatchEncoding)
+from transformers import (AutoConfig, AutoModelForImageTextToText,
+                          AutoTokenizer, BatchEncoding)
 
 from vllm import LLM, SamplingParams
 from vllm.attention.backends.flash_attn import FlashAttentionMetadata
@@ -234,7 +234,7 @@ def _run_test(
                    dtype=dtype,
                    model_kwargs={"device_map": "auto"},
                    postprocess_inputs=process,
-                   auto_cls=AutoModelForVision2Seq) as hf_model:
+                   auto_cls=AutoModelForImageTextToText) as hf_model:
         hf_outputs_per_image = [
             hf_model.generate_greedy_logprobs_limit(prompts, max_tokens,
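For context, every hunk above rewires the `auto_cls` hook of the HF test runners from `AutoModelForVision2Seq` to `AutoModelForImageTextToText`. The following is a minimal standalone sketch of what that auto class does, assuming a transformers release that exports `AutoModelForImageTextToText` (4.46 or newer); the checkpoint, image URL, and prompt are illustrative assumptions, not values taken from this patch.

# Sketch only: loading and running an image-text-to-text model through the
# auto class the tests above now pass as `auto_cls`. Checkpoint, URL, and
# prompt are illustrative, not taken from the patch.
import requests
import torch
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor

model_id = "llava-hf/llava-1.5-7b-hf"  # any image-text-to-text checkpoint
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForImageTextToText.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="auto")

image = Image.open(
    requests.get("https://llava-vl.github.io/static/images/view.jpg",
                 stream=True).raw)
prompt = "USER: <image>\nWhat is shown in this image?\nASSISTANT:"

inputs = processor(images=image, text=prompt,
                   return_tensors="pt").to(model.device, torch.float16)
output = model.generate(**inputs, max_new_tokens=64, do_sample=False)
print(processor.decode(output[0], skip_special_tokens=True))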