[CI/Build] Use AutoModelForImageTextToText to load VLMs in tests (#14945)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Author: Cyrus Leung
Date:   2025-03-18 02:35:17 +08:00 (committed by GitHub)
Parent: 5340b0e221
Commit: b89fb2a4a1
3 changed files with 19 additions and 19 deletions
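
AutoModelForImageTextToText is the transformers auto class for models that take image plus text input and generate text, which is what these vision-language tests load as HuggingFace reference models. As a point of reference (this snippet is illustrative and not part of the diff; the checkpoint name is simply one of the models exercised by the tests), loading through the new auto class looks like:

    from transformers import AutoModelForImageTextToText, AutoProcessor

    # Any checkpoint registered for the image-text-to-text task works here;
    # "llava-hf/llava-1.5-7b-hf" is one of the models used in the test settings.
    model_id = "llava-hf/llava-1.5-7b-hf"
    model = AutoModelForImageTextToText.from_pretrained(model_id, torch_dtype="auto")
    processor = AutoProcessor.from_pretrained(model_id)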

View File

@@ -9,7 +9,7 @@ from pathlib import PosixPath
 import pytest
 from packaging.version import Version
-from transformers import AutoModelForPreTraining, AutoModelForVision2Seq
+from transformers import AutoModelForImageTextToText, AutoModelForVision2Seq
 from transformers import __version__ as TRANSFORMERS_VERSION
 from vllm.platforms import current_platform
@@ -101,7 +101,7 @@ VLM_TEST_SETTINGS = {
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         convert_assets_to_embeddings=model_utils.get_llava_embeddings,
         max_model_len=4096,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
         custom_test_opts=[CustomTestOptions(
             inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
@@ -121,7 +121,7 @@ VLM_TEST_SETTINGS = {
             "stop_sign": "caption es",
             "cherry_blossom": "What is in the picture?",
         }),
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
         ),
@@ -190,7 +190,7 @@ VLM_TEST_SETTINGS = {
         test_type=VLMTestType.IMAGE,
         prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
         img_idx_to_prompt=lambda idx: "",
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
     ),
     "chameleon": VLMTestInfo(
@@ -199,7 +199,7 @@ VLM_TEST_SETTINGS = {
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
         max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
         ),
@@ -240,6 +240,7 @@ VLM_TEST_SETTINGS = {
         img_idx_to_prompt=lambda idx: "",
         max_model_len=2048,
         max_num_seqs=2,
+        auto_cls=AutoModelForImageTextToText,
         use_tokenizer_eos=True,
         vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
         num_logprobs=10,
@@ -256,8 +257,7 @@ VLM_TEST_SETTINGS = {
         multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.",  # noqa: E501
         max_model_len=4096,
         max_num_seqs=2,
-        # TODO: Use AutoModelForVision2Seq once transformers supports this
-        auto_cls=AutoModelForPreTraining,
+        auto_cls=AutoModelForImageTextToText,
         dtype="bfloat16",
         vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
         patch_hf_runner=model_utils.gemma3_patch_hf_runner,
@@ -307,7 +307,7 @@ VLM_TEST_SETTINGS = {
         img_idx_to_prompt=lambda idx: "<image>",
         max_model_len=8192,
         max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
     ),
     "intern_vl": VLMTestInfo(
@@ -336,7 +336,7 @@ VLM_TEST_SETTINGS = {
         test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
         prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
         max_model_len=10240,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
         custom_test_opts=[CustomTestOptions(
             inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
@@ -382,7 +382,7 @@ VLM_TEST_SETTINGS = {
             "pixel_values"
         ),
         get_stop_token_ids=lambda tok: [128009],
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
         patch_hf_runner=model_utils.mantis_patch_hf_runner,
         marks=[
@@ -463,7 +463,7 @@ VLM_TEST_SETTINGS = {
         img_idx_to_prompt=lambda idx: "[IMG]",
         max_model_len=8192,
         max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         marks=[large_gpu_mark(min_gb=48)],
     ),
     "qwen_vl": VLMTestInfo(
@@ -481,7 +481,7 @@ VLM_TEST_SETTINGS = {
         models=["facebook/chameleon-7b"],
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
         ),
@@ -495,7 +495,7 @@ VLM_TEST_SETTINGS = {
         models=["llava-hf/llava-1.5-7b-hf"],
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
         marks=multi_gpu_marks(num_gpus=2),
         **COMMON_BROADCAST_SETTINGS  # type: ignore
@@ -504,7 +504,7 @@ VLM_TEST_SETTINGS = {
         models=["llava-hf/llava-v1.6-mistral-7b-hf"],
         prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
         max_model_len=10240,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
         marks=multi_gpu_marks(num_gpus=2),
         **COMMON_BROADCAST_SETTINGS  # type: ignore
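
The auto_cls entries above are consumed by the shared HF runner fixture; judging from the hf_runner(..., auto_cls=...) calls in the files below, the given class is simply used to load the HuggingFace reference model. A rough sketch of that pattern, simplified rather than vLLM's actual HfRunner implementation:

    from transformers import AutoModelForImageTextToText

    class ToyRunner:
        """Illustrative stand-in for the test harness's HF runner."""

        def __init__(self, model_name, auto_cls=AutoModelForImageTextToText,
                     dtype="auto"):
            # The per-model test settings choose which Auto* class loads the
            # reference model, e.g. AutoModelForImageTextToText for VLMs.
            self.model = auto_cls.from_pretrained(model_name, torch_dtype=dtype)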

View File

@@ -2,7 +2,7 @@
 import pytest
 import torch.nn.functional as F
-from transformers import AutoModelForVision2Seq
+from transformers import AutoModelForImageTextToText
 from vllm.platforms import current_platform
@@ -70,7 +70,7 @@ def _run_test(
         vllm_outputs = vllm_model.encode(input_texts, images=input_images)
     with hf_runner(model, dtype=dtype,
-                   auto_cls=AutoModelForVision2Seq) as hf_model:
+                   auto_cls=AutoModelForImageTextToText) as hf_model:
         # Patch the issue where generation_config.json is missing
         hf_model.processor.patch_size = \
             hf_model.model.config.vision_config.patch_size

View File

@@ -4,8 +4,8 @@ from typing import Optional, overload
 import pytest
 import torch
-from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
-                          BatchEncoding)
+from transformers import (AutoConfig, AutoModelForImageTextToText,
+                          AutoTokenizer, BatchEncoding)
 from vllm import LLM, SamplingParams
 from vllm.attention.backends.flash_attn import FlashAttentionMetadata
@@ -234,7 +234,7 @@ def _run_test(
                    dtype=dtype,
                    model_kwargs={"device_map": "auto"},
                    postprocess_inputs=process,
-                   auto_cls=AutoModelForVision2Seq) as hf_model:
+                   auto_cls=AutoModelForImageTextToText) as hf_model:
         hf_outputs_per_image = [
             hf_model.generate_greedy_logprobs_limit(prompts,
                                                     max_tokens,