mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-15 17:16:24 +08:00)
[CI/Build] Use AutoModelForImageTextToText to load VLMs in tests (#14945)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 5340b0e221
commit b89fb2a4a1
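
The change below swaps the auto class used to load the Hugging Face reference models in the VLM tests from AutoModelForVision2Seq (and AutoModelForPreTraining in one case) to AutoModelForImageTextToText. As a minimal sketch of the loading pattern being adopted, assuming a transformers release that ships AutoModelForImageTextToText and using llava-hf/llava-1.5-7b-hf (one of the checkpoints exercised by these tests) purely for illustration:

# Illustrative sketch, not part of this diff: load a VLM via the newer
# AutoModelForImageTextToText auto class instead of AutoModelForVision2Seq.
from transformers import AutoModelForImageTextToText, AutoProcessor

model_id = "llava-hf/llava-1.5-7b-hf"  # example checkpoint from the test settings
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForImageTextToText.from_pretrained(model_id)

Note that the first file still imports AutoModelForVision2Seq alongside the new class, so entries not touched by this diff keep the old loader.
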
@@ -9,7 +9,7 @@ from pathlib import PosixPath
 
 import pytest
 from packaging.version import Version
-from transformers import AutoModelForPreTraining, AutoModelForVision2Seq
+from transformers import AutoModelForImageTextToText, AutoModelForVision2Seq
 from transformers import __version__ as TRANSFORMERS_VERSION
 
 from vllm.platforms import current_platform
@@ -101,7 +101,7 @@ VLM_TEST_SETTINGS = {
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         convert_assets_to_embeddings=model_utils.get_llava_embeddings,
         max_model_len=4096,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
         custom_test_opts=[CustomTestOptions(
             inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
@@ -121,7 +121,7 @@ VLM_TEST_SETTINGS = {
             "stop_sign": "caption es",
             "cherry_blossom": "What is in the picture?",
         }),
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
         ),
@@ -190,7 +190,7 @@ VLM_TEST_SETTINGS = {
         test_type=VLMTestType.IMAGE,
         prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
         img_idx_to_prompt=lambda idx: "",
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
     ),
     "chameleon": VLMTestInfo(
@@ -199,7 +199,7 @@ VLM_TEST_SETTINGS = {
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
         max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
         ),
@@ -240,6 +240,7 @@ VLM_TEST_SETTINGS = {
         img_idx_to_prompt=lambda idx: "",
         max_model_len=2048,
         max_num_seqs=2,
+        auto_cls=AutoModelForImageTextToText,
         use_tokenizer_eos=True,
         vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
         num_logprobs=10,
@@ -256,8 +257,7 @@ VLM_TEST_SETTINGS = {
         multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
         max_model_len=4096,
         max_num_seqs=2,
-        # TODO: Use AutoModelForVision2Seq once transformers supports this
-        auto_cls=AutoModelForPreTraining,
+        auto_cls=AutoModelForImageTextToText,
         dtype="bfloat16",
         vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
         patch_hf_runner=model_utils.gemma3_patch_hf_runner,
@@ -307,7 +307,7 @@ VLM_TEST_SETTINGS = {
         img_idx_to_prompt=lambda idx: "<image>",
         max_model_len=8192,
         max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
     ),
     "intern_vl": VLMTestInfo(
@@ -336,7 +336,7 @@ VLM_TEST_SETTINGS = {
         test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
         prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
         max_model_len=10240,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
         custom_test_opts=[CustomTestOptions(
             inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
@@ -382,7 +382,7 @@ VLM_TEST_SETTINGS = {
             "pixel_values"
         ),
         get_stop_token_ids=lambda tok: [128009],
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
         patch_hf_runner=model_utils.mantis_patch_hf_runner,
         marks=[
@@ -463,7 +463,7 @@ VLM_TEST_SETTINGS = {
         img_idx_to_prompt=lambda idx: "[IMG]",
         max_model_len=8192,
         max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         marks=[large_gpu_mark(min_gb=48)],
     ),
     "qwen_vl": VLMTestInfo(
@@ -481,7 +481,7 @@ VLM_TEST_SETTINGS = {
         models=["facebook/chameleon-7b"],
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
         ),
@@ -495,7 +495,7 @@ VLM_TEST_SETTINGS = {
         models=["llava-hf/llava-1.5-7b-hf"],
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
         marks=multi_gpu_marks(num_gpus=2),
         **COMMON_BROADCAST_SETTINGS # type: ignore
@@ -504,7 +504,7 @@ VLM_TEST_SETTINGS = {
         models=["llava-hf/llava-v1.6-mistral-7b-hf"],
         prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
         max_model_len=10240,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
         marks=multi_gpu_marks(num_gpus=2),
         **COMMON_BROADCAST_SETTINGS # type: ignore
@@ -2,7 +2,7 @@
 
 import pytest
 import torch.nn.functional as F
-from transformers import AutoModelForVision2Seq
+from transformers import AutoModelForImageTextToText
 
 from vllm.platforms import current_platform
 
@@ -70,7 +70,7 @@ def _run_test(
         vllm_outputs = vllm_model.encode(input_texts, images=input_images)
 
     with hf_runner(model, dtype=dtype,
-                   auto_cls=AutoModelForVision2Seq) as hf_model:
+                   auto_cls=AutoModelForImageTextToText) as hf_model:
         # Patch the issue where generation_config.json is missing
         hf_model.processor.patch_size = \
             hf_model.model.config.vision_config.patch_size
@@ -4,8 +4,8 @@ from typing import Optional, overload
 
 import pytest
 import torch
-from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
-                          BatchEncoding)
+from transformers import (AutoConfig, AutoModelForImageTextToText,
+                          AutoTokenizer, BatchEncoding)
 
 from vllm import LLM, SamplingParams
 from vllm.attention.backends.flash_attn import FlashAttentionMetadata
@@ -234,7 +234,7 @@ def _run_test(
                    dtype=dtype,
                    model_kwargs={"device_map": "auto"},
                    postprocess_inputs=process,
-                   auto_cls=AutoModelForVision2Seq) as hf_model:
+                   auto_cls=AutoModelForImageTextToText) as hf_model:
         hf_outputs_per_image = [
             hf_model.generate_greedy_logprobs_limit(prompts,
                                                     max_tokens,