mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-08 09:03:36 +08:00
[CI/Build] VLM Test Consolidation (#9372)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
This commit is contained in:
parent
211fe91aa8
commit
cc98f1e079
@ -338,7 +338,10 @@ steps:
|
|||||||
- tests/models/decoder_only/vision_language
|
- tests/models/decoder_only/vision_language
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s models/decoder_only/audio_language
|
- pytest -v -s models/decoder_only/audio_language
|
||||||
- pytest -v -s models/decoder_only/vision_language
|
# HACK - run phi3v tests separately to sidestep this transformers bug
|
||||||
|
# https://github.com/huggingface/transformers/issues/34307
|
||||||
|
- pytest -v -s models/decoder_only/vision_language/test_phi3v.py
|
||||||
|
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language
|
||||||
|
|
||||||
- label: Other Models Test # 6min
|
- label: Other Models Test # 6min
|
||||||
#mirror_hardwares: [amd]
|
#mirror_hardwares: [amd]
|
||||||
@ -413,7 +416,7 @@ steps:
|
|||||||
# Avoid importing model tests that cause CUDA reinitialization error
|
# Avoid importing model tests that cause CUDA reinitialization error
|
||||||
- pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
|
- pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
|
||||||
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
|
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
|
||||||
- pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
|
- pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed_2_gpus
|
||||||
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
|
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
|
||||||
- pip install -e ./plugins/vllm_add_dummy_model
|
- pip install -e ./plugins/vllm_add_dummy_model
|
||||||
- pytest -v -s distributed/test_distributed_oot.py
|
- pytest -v -s distributed/test_distributed_oot.py
|
||||||
|
|||||||
@ -259,8 +259,7 @@ class HfRunner:
|
|||||||
is_sentence_transformer: bool = False,
|
is_sentence_transformer: bool = False,
|
||||||
skip_tokenizer_init: bool = False,
|
skip_tokenizer_init: bool = False,
|
||||||
auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM,
|
auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM,
|
||||||
postprocess_inputs: Callable[[BatchEncoding],
|
postprocess_inputs: Callable[..., BatchEncoding] = identity,
|
||||||
BatchEncoding] = identity,
|
|
||||||
) -> None:
|
) -> None:
|
||||||
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
|
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
|
||||||
|
|
||||||
@ -303,6 +302,7 @@ class HfRunner:
|
|||||||
if skip_tokenizer_init:
|
if skip_tokenizer_init:
|
||||||
self.tokenizer = self.processor.tokenizer
|
self.tokenizer = self.processor.tokenizer
|
||||||
|
|
||||||
|
self.dtype = dtype
|
||||||
self.postprocess_inputs = postprocess_inputs
|
self.postprocess_inputs = postprocess_inputs
|
||||||
|
|
||||||
def get_inputs(
|
def get_inputs(
|
||||||
@ -337,7 +337,7 @@ class HfRunner:
|
|||||||
processor_kwargs["sampling_rate"] = sr
|
processor_kwargs["sampling_rate"] = sr
|
||||||
|
|
||||||
inputs = self.processor(**processor_kwargs)
|
inputs = self.processor(**processor_kwargs)
|
||||||
inputs = self.postprocess_inputs(inputs)
|
inputs = self.postprocess_inputs(inputs, dtype=self.dtype)
|
||||||
|
|
||||||
all_inputs.append(inputs)
|
all_inputs.append(inputs)
|
||||||
|
|
||||||
|
|||||||
29
tests/engine/test_short_mm_context.py
Normal file
29
tests/engine/test_short_mm_context.py
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
import pytest
|
||||||
|
|
||||||
|
from ..conftest import IMAGE_ASSETS
|
||||||
|
|
||||||
|
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||||
|
"stop_sign":
|
||||||
|
"USER: <image>\nWhat's the content of the image?\nASSISTANT:",
|
||||||
|
"cherry_blossom":
|
||||||
|
"USER: <image>\nWhat is the season?\nASSISTANT:",
|
||||||
|
})
|
||||||
|
|
||||||
|
models = ["llava-hf/llava-1.5-7b-hf"]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model", models)
|
||||||
|
def test_context_length_too_short(vllm_runner, image_assets, model):
|
||||||
|
images = [asset.pil_image for asset in image_assets]
|
||||||
|
|
||||||
|
with pytest.raises(ValueError, match="too long to fit into the model"):
|
||||||
|
vllm_model = vllm_runner(
|
||||||
|
model,
|
||||||
|
max_model_len=128, # LLaVA has a feature size of 576
|
||||||
|
enforce_eager=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
with vllm_model:
|
||||||
|
vllm_model.generate_greedy([HF_IMAGE_PROMPTS[0]],
|
||||||
|
max_tokens=1,
|
||||||
|
images=[images[0]])
|
||||||
@ -92,7 +92,7 @@ def run_test(
|
|||||||
for vllm_prompt, _, audio in prompts_and_audios
|
for vllm_prompt, _, audio in prompts_and_audios
|
||||||
]
|
]
|
||||||
|
|
||||||
def process(hf_inputs: BatchEncoding):
|
def process(hf_inputs: BatchEncoding, **kwargs):
|
||||||
hf_inputs["audio_values"] = hf_inputs["audio_values"] \
|
hf_inputs["audio_values"] = hf_inputs["audio_values"] \
|
||||||
.to(torch_dtype) # type: ignore
|
.to(torch_dtype) # type: ignore
|
||||||
return hf_inputs
|
return hf_inputs
|
||||||
|
|||||||
34
tests/models/decoder_only/language/test_qwen.py
Normal file
34
tests/models/decoder_only/language/test_qwen.py
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
"""Ensure that a text-only Qwen model can be run without throwing an error.
|
||||||
|
We explicitly test this because Qwen is implemented as a multimodal and
|
||||||
|
supports a visual encoder for models like Qwen-VL.
|
||||||
|
"""
|
||||||
|
from typing import List, Type
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from ....conftest import VllmRunner
|
||||||
|
|
||||||
|
models = [
|
||||||
|
"Qwen/Qwen-7B-Chat" # Has no visual encoder
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model", models)
|
||||||
|
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||||
|
@pytest.mark.parametrize("max_tokens", [32])
|
||||||
|
@pytest.mark.parametrize("num_logprobs", [5])
|
||||||
|
def test_text_only_qwen_model_can_be_loaded_and_run(
|
||||||
|
vllm_runner: Type[VllmRunner],
|
||||||
|
example_prompts: List[str],
|
||||||
|
model: str,
|
||||||
|
*,
|
||||||
|
dtype: str,
|
||||||
|
max_tokens: int,
|
||||||
|
num_logprobs: int,
|
||||||
|
):
|
||||||
|
with vllm_runner(model, dtype=dtype) as vllm_model:
|
||||||
|
vllm_model.generate_greedy_logprobs(
|
||||||
|
example_prompts,
|
||||||
|
max_tokens,
|
||||||
|
num_logprobs=num_logprobs,
|
||||||
|
)
|
||||||
@ -0,0 +1,68 @@
|
|||||||
|
import pytest
|
||||||
|
|
||||||
|
from vllm.inputs import InputContext
|
||||||
|
|
||||||
|
from ....utils import build_model_context
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def get_max_llava_next_image_tokens():
|
||||||
|
from vllm.model_executor.models.llava_next import (
|
||||||
|
get_max_llava_next_image_tokens)
|
||||||
|
return get_max_llava_next_image_tokens
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def dummy_data_for_llava_next():
|
||||||
|
from vllm.model_executor.models.llava_next import dummy_data_for_llava_next
|
||||||
|
return dummy_data_for_llava_next
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("gridpoints,expected_max_tokens", [
|
||||||
|
([[336, 336]], 1176),
|
||||||
|
([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], 2928),
|
||||||
|
])
|
||||||
|
def test_get_max_llava_next_image_tokens(gridpoints, expected_max_tokens,
|
||||||
|
get_max_llava_next_image_tokens):
|
||||||
|
ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf")
|
||||||
|
|
||||||
|
# Update the config image_grid_pinpoints
|
||||||
|
# and calculate the resulting max tokens
|
||||||
|
ctx.model_config.hf_config.image_grid_pinpoints = gridpoints
|
||||||
|
|
||||||
|
actual_max_tokens = get_max_llava_next_image_tokens(
|
||||||
|
InputContext(ctx.model_config))
|
||||||
|
|
||||||
|
assert expected_max_tokens == actual_max_tokens
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"gridpoints,expected_size",
|
||||||
|
[
|
||||||
|
# One point; it has to be the largest
|
||||||
|
([[336, 336]], (336, 336)),
|
||||||
|
# Default for most llava next models; the 2x2 tile is the largest
|
||||||
|
([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]],
|
||||||
|
(672, 672)),
|
||||||
|
# If two rectangular gridpoints are the same, the more vertical
|
||||||
|
# one has the higher feature count due to newline features
|
||||||
|
([[336, 672], [672, 336]], (672, 336))
|
||||||
|
])
|
||||||
|
def test_dummy_data_for_llava_next_feature_size(dummy_data_for_llava_next,
|
||||||
|
gridpoints, expected_size):
|
||||||
|
ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf")
|
||||||
|
|
||||||
|
# Update the config image_grid_pinpoints
|
||||||
|
ctx.model_config.hf_config.image_grid_pinpoints = gridpoints
|
||||||
|
seq_len = 5000 # bigger than the max feature size for any image
|
||||||
|
|
||||||
|
seq_data, mm_data = dummy_data_for_llava_next(
|
||||||
|
ctx,
|
||||||
|
seq_len=seq_len,
|
||||||
|
mm_counts={"image": 1},
|
||||||
|
)
|
||||||
|
|
||||||
|
# The dummy data dims should match the gridpoint with the biggest feat size
|
||||||
|
assert mm_data["image"].height == expected_size[0]
|
||||||
|
assert mm_data["image"].width == expected_size[1]
|
||||||
|
assert len(seq_data.get_token_ids()) >= seq_len
|
||||||
@ -0,0 +1,181 @@
|
|||||||
|
"""Tests for phi3v's multimodal preprocessing kwargs."""
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import torch
|
||||||
|
from transformers import AutoImageProcessor, AutoTokenizer
|
||||||
|
|
||||||
|
from vllm.inputs import InputContext, token_inputs
|
||||||
|
from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
|
||||||
|
from vllm.multimodal import MultiModalRegistry
|
||||||
|
|
||||||
|
from .....conftest import _ImageAssets
|
||||||
|
from ....utils import build_model_context
|
||||||
|
|
||||||
|
models = ["microsoft/Phi-3.5-vision-instruct"]
|
||||||
|
|
||||||
|
|
||||||
|
# Wrap lazy imports to avoid initializing CUDA during test collection
|
||||||
|
@pytest.fixture()
|
||||||
|
def input_processor_for_phi3v():
|
||||||
|
from vllm.model_executor.models.phi3v import input_processor_for_phi3v
|
||||||
|
return input_processor_for_phi3v
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def dummy_data_for_phi3v():
|
||||||
|
from vllm.model_executor.models.phi3v import dummy_data_for_phi3v
|
||||||
|
return dummy_data_for_phi3v
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def get_max_phi3v_image_tokens():
|
||||||
|
from vllm.model_executor.models.phi3v import get_max_phi3v_image_tokens
|
||||||
|
return get_max_phi3v_image_tokens
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model", models)
|
||||||
|
@pytest.mark.parametrize("num_crops", [4, 16, None])
|
||||||
|
def test_input_mapper_override(model: str, image_assets: _ImageAssets,
|
||||||
|
num_crops: Optional[int]):
|
||||||
|
"""Ensure that the [default] input mapper handles num_crops properly."""
|
||||||
|
# We pass the processor kwargs here since for this model, we fall back to
|
||||||
|
# the default mapper; this will fall back to the HF mapper and forward
|
||||||
|
# mm_processor_kwargs to it.
|
||||||
|
mm_processor_kwargs = {
|
||||||
|
"num_crops": num_crops
|
||||||
|
} if num_crops is not None else {}
|
||||||
|
ctx = build_model_context(
|
||||||
|
model_name=model,
|
||||||
|
tokenizer_name=model,
|
||||||
|
trust_remote_code=True,
|
||||||
|
mm_processor_kwargs=mm_processor_kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
|
hf_processor = AutoImageProcessor.from_pretrained(model,
|
||||||
|
trust_remote_code=True,
|
||||||
|
**mm_processor_kwargs)
|
||||||
|
|
||||||
|
mm_registry = MultiModalRegistry()
|
||||||
|
mm_registry.init_mm_limits_per_prompt(ctx.model_config)
|
||||||
|
|
||||||
|
image = image_assets[0].pil_image
|
||||||
|
hf_result = hf_processor.preprocess(
|
||||||
|
image,
|
||||||
|
return_tensors="pt",
|
||||||
|
)
|
||||||
|
|
||||||
|
vllm_result = mm_registry.map_input(
|
||||||
|
ctx.model_config,
|
||||||
|
{"image": image},
|
||||||
|
)
|
||||||
|
|
||||||
|
assert torch.all(hf_result["image_sizes"] == vllm_result["image_sizes"])
|
||||||
|
assert torch.all(
|
||||||
|
hf_result["num_img_tokens"] == vllm_result["num_img_tokens"])
|
||||||
|
|
||||||
|
# For pixel values, the second axis should be the num_crops + 1
|
||||||
|
# for the rescaled original image. The default value in VLLM falls
|
||||||
|
# back to the HF config, which is why we compare to the processor num_crops
|
||||||
|
assert torch.all(hf_result["pixel_values"] == vllm_result["pixel_values"])
|
||||||
|
assert vllm_result["pixel_values"].shape[1] == hf_processor.num_crops + 1
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model", models)
|
||||||
|
@pytest.mark.parametrize("num_crops,expected_max_tokens", [
|
||||||
|
(4, 781),
|
||||||
|
(16, 2653),
|
||||||
|
])
|
||||||
|
def test_max_tokens_override(get_max_phi3v_image_tokens, model: str,
|
||||||
|
num_crops: int, expected_max_tokens: int):
|
||||||
|
"""Ensure get_max_phi3v_image_tokens handles num_crops properly."""
|
||||||
|
# NOTE: mm_processor_kwargs on the context in this test is unused, since
|
||||||
|
# this is testing the mapper directly. In practice, the processor kwargs
|
||||||
|
# are wrapped in a closure when calling the max tokens func. We explicitly
|
||||||
|
# do NOT use the mm_processor_kwargs in the model context here to ensure
|
||||||
|
# that the max image tokens implementation is referencing a mix of the
|
||||||
|
# kwargs to the function and the original mm_processor_kwargs in case
|
||||||
|
# values are somehow updated and end up in a bad state.
|
||||||
|
ctx = build_model_context(
|
||||||
|
model_name=model,
|
||||||
|
tokenizer_name=model,
|
||||||
|
trust_remote_code=True,
|
||||||
|
mm_processor_kwargs=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
actual_max_tokens = get_max_phi3v_image_tokens(
|
||||||
|
InputContext(ctx.model_config),
|
||||||
|
num_crops=num_crops,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert expected_max_tokens == actual_max_tokens
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model", models)
|
||||||
|
@pytest.mark.parametrize("num_crops,toks_per_img,num_imgs", [
|
||||||
|
(4, 781, 1),
|
||||||
|
(4, 781, 2),
|
||||||
|
(16, 2653, 1),
|
||||||
|
(16, 2653, 2),
|
||||||
|
])
|
||||||
|
def test_dummy_data_override(dummy_data_for_phi3v, model: str, num_crops: int,
|
||||||
|
toks_per_img: int, num_imgs: int):
|
||||||
|
"""Ensure dummy_data_for_phi3v handles num_crops properly."""
|
||||||
|
# Same as the previous test - don't initialize mm_processor_kwargs
|
||||||
|
# in this test and assume that the kwargs will be correctly expanded by
|
||||||
|
# the partial when calling the dummy data func.
|
||||||
|
ctx = build_model_context(
|
||||||
|
model_name=model,
|
||||||
|
tokenizer_name=model,
|
||||||
|
trust_remote_code=True,
|
||||||
|
mm_processor_kwargs=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
sequence_data, _, = dummy_data_for_phi3v(
|
||||||
|
ctx=ctx,
|
||||||
|
seq_len=8192, # Should be bigger than num_imgs * toks_per_img
|
||||||
|
mm_counts={"image": num_imgs},
|
||||||
|
num_crops=num_crops,
|
||||||
|
)
|
||||||
|
# Ensure we have the right number of placeholders per num_crops size
|
||||||
|
img_tok_count = sequence_data.get_token_ids().count(_IMAGE_TOKEN_ID)
|
||||||
|
assert img_tok_count == toks_per_img * num_imgs
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model", models)
|
||||||
|
@pytest.mark.parametrize("num_crops,expected_toks_per_img,num_imgs", [
|
||||||
|
(4, 757, 1),
|
||||||
|
(4, 757, 2),
|
||||||
|
(16, 1921, 1),
|
||||||
|
(16, 1921, 2),
|
||||||
|
])
|
||||||
|
def test_input_processor_override(input_processor_for_phi3v,
|
||||||
|
image_assets: _ImageAssets, model: str,
|
||||||
|
num_crops: int, expected_toks_per_img: int,
|
||||||
|
num_imgs: int):
|
||||||
|
"""Ensure input_processor_for_phi3v handles num_crops properly."""
|
||||||
|
# Same as the previous test - don't initialize mm_processor_kwargs
|
||||||
|
# in this test and assume that the kwargs will be correctly expanded by
|
||||||
|
# the partial when calling the custom input processor.
|
||||||
|
ctx = build_model_context(
|
||||||
|
model_name=model,
|
||||||
|
tokenizer_name=model,
|
||||||
|
trust_remote_code=True,
|
||||||
|
)
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(model)
|
||||||
|
# Build the image str / prompt based on the number of images we pass
|
||||||
|
img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
|
||||||
|
prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
|
||||||
|
images = [image_assets[0].pil_image] * num_imgs
|
||||||
|
|
||||||
|
inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt),
|
||||||
|
prompt=prompt,
|
||||||
|
multi_modal_data={"image": images})
|
||||||
|
|
||||||
|
processed_inputs = input_processor_for_phi3v(ctx,
|
||||||
|
inputs,
|
||||||
|
num_crops=num_crops)
|
||||||
|
|
||||||
|
# Ensure we have the right number of placeholders per num_crops size
|
||||||
|
img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID)
|
||||||
|
assert img_tok_count == expected_toks_per_img * num_imgs
|
||||||
@ -0,0 +1,144 @@
|
|||||||
|
"""Tests for Qwen's multimodal preprocessing kwargs."""
|
||||||
|
from typing import Dict, List, Union
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import torch
|
||||||
|
from PIL.Image import Image
|
||||||
|
|
||||||
|
from vllm.inputs import InputContext, token_inputs
|
||||||
|
from vllm.multimodal.base import MultiModalInputs
|
||||||
|
from vllm.multimodal.utils import cached_get_tokenizer
|
||||||
|
|
||||||
|
from .....conftest import IMAGE_ASSETS
|
||||||
|
from ....utils import build_model_context
|
||||||
|
|
||||||
|
### Multimodal preprocessing tests
|
||||||
|
SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image
|
||||||
|
# These values are specific to Qwen-VL/Chat; we can get these from the model
|
||||||
|
# config also, but they are hardcoded here to keep the parameterize/fixtures
|
||||||
|
# easy to read.
|
||||||
|
IMG_START_ID = 151857
|
||||||
|
IMG_END_ID = 151858
|
||||||
|
IMG_PAD_ID = 151859
|
||||||
|
TOKS_PER_IMG = 256
|
||||||
|
VIS_ENC_DIM = 4096
|
||||||
|
IMG_SIZE = 448
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def input_mapper_for_qwen():
|
||||||
|
# Lazy import to avoid initializing CUDA during test collection
|
||||||
|
from vllm.model_executor.models.qwen import input_mapper_for_qwen
|
||||||
|
return input_mapper_for_qwen
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def input_processor_for_qwen():
|
||||||
|
# Lazy import to avoid initializing CUDA during test collection
|
||||||
|
from vllm.model_executor.models.qwen import input_processor_for_qwen
|
||||||
|
return input_processor_for_qwen
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def qwen_vl_context() -> InputContext:
|
||||||
|
"""Get an InputContext for Qwen-VL."""
|
||||||
|
return build_model_context(model_name="Qwen/Qwen-VL",
|
||||||
|
trust_remote_code=True)
|
||||||
|
|
||||||
|
|
||||||
|
# Happy path tests for single/multi-image scenarios for the multimodal
|
||||||
|
# input processor and mapper, respectively
|
||||||
|
@pytest.mark.parametrize("num_images", [1, 2])
|
||||||
|
def test_input_processor_valid_mm_data(input_processor_for_qwen,
|
||||||
|
qwen_vl_context: InputContext,
|
||||||
|
num_images: int):
|
||||||
|
"""Happy cases for image inputs to Qwen's multimodal input processor."""
|
||||||
|
prompt = "".join(
|
||||||
|
[f"Picture {num}: <img></img>\n" for num in range(1, num_images + 1)])
|
||||||
|
inputs = token_inputs(
|
||||||
|
prompt=prompt,
|
||||||
|
# When processing multimodal data for a multimodal model, the qwen
|
||||||
|
# input processor will overwrite the provided prompt_token_ids with
|
||||||
|
# the image prompts
|
||||||
|
prompt_token_ids=[],
|
||||||
|
multi_modal_data={"image": torch.rand(num_images, TOKS_PER_IMG, 4096)},
|
||||||
|
)
|
||||||
|
proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs)
|
||||||
|
assert isinstance(proc_inputs, dict)
|
||||||
|
|
||||||
|
# Each image should have one start / stop and a fixed context of 256
|
||||||
|
proc_tokens = proc_inputs["prompt_token_ids"]
|
||||||
|
assert proc_tokens.count(IMG_START_ID) == num_images
|
||||||
|
assert proc_tokens.count(IMG_END_ID) == num_images
|
||||||
|
assert proc_tokens.count(IMG_PAD_ID) == num_images * TOKS_PER_IMG
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"img_data,expected_shape",
|
||||||
|
[
|
||||||
|
# single / multi-image
|
||||||
|
(SAMPLE_IMAGE, (1, 3, IMG_SIZE, IMG_SIZE)),
|
||||||
|
(2 * [SAMPLE_IMAGE], (2, 3, IMG_SIZE, IMG_SIZE)),
|
||||||
|
# single / multi-image embeddings
|
||||||
|
(torch.rand(
|
||||||
|
(TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
|
||||||
|
(torch.rand(
|
||||||
|
(1, TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
|
||||||
|
(torch.rand(
|
||||||
|
(2, TOKS_PER_IMG, VIS_ENC_DIM)), (2, TOKS_PER_IMG, VIS_ENC_DIM)),
|
||||||
|
])
|
||||||
|
def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
|
||||||
|
qwen_vl_context: InputContext,
|
||||||
|
img_data: Union[torch.Tensor, List[Image],
|
||||||
|
Image],
|
||||||
|
expected_shape: List[int]):
|
||||||
|
"""Happy cases for image inputs to Qwen's multimodal input mapper."""
|
||||||
|
mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data)
|
||||||
|
# Ensure that we get the appropriately shaped pixel_values
|
||||||
|
# for images and image embeddings, respectively.
|
||||||
|
assert isinstance(mapped_img_data, MultiModalInputs)
|
||||||
|
assert "pixel_values" in mapped_img_data
|
||||||
|
assert mapped_img_data["pixel_values"].shape == expected_shape
|
||||||
|
|
||||||
|
|
||||||
|
# Sad path tests for the multimodal input processor and mapper, respectively
|
||||||
|
@pytest.mark.parametrize("mm_data", [
|
||||||
|
{
|
||||||
|
"image": torch.rand((5))
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"image": torch.rand((5, 5, 5, 5, 5))
|
||||||
|
},
|
||||||
|
])
|
||||||
|
def test_input_processor_invalid_mm_data(input_processor_for_qwen,
|
||||||
|
qwen_vl_context: InputContext,
|
||||||
|
mm_data: Dict[str, torch.Tensor]):
|
||||||
|
"""Test sad cases validated in Qwen's multimodal input processor."""
|
||||||
|
tokenizer = cached_get_tokenizer(qwen_vl_context.model_config.tokenizer,
|
||||||
|
trust_remote_code=True)
|
||||||
|
prompt = "Picture 1: <img></img>\n"
|
||||||
|
prompt_token_ids = tokenizer.encode(prompt)
|
||||||
|
inputs = token_inputs(prompt=prompt,
|
||||||
|
prompt_token_ids=prompt_token_ids,
|
||||||
|
multi_modal_data=mm_data)
|
||||||
|
# Should fail since we have too many or too few dimensions for embeddings
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
input_processor_for_qwen(qwen_vl_context, inputs)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"img_data",
|
||||||
|
[
|
||||||
|
# Wrong context length
|
||||||
|
torch.rand((1, TOKS_PER_IMG + 10, VIS_ENC_DIM)),
|
||||||
|
# Wrong visual encoder output size
|
||||||
|
torch.rand((1, TOKS_PER_IMG, VIS_ENC_DIM + 10)),
|
||||||
|
])
|
||||||
|
def test_input_mapper_invalid_mm_data(
|
||||||
|
input_mapper_for_qwen,
|
||||||
|
qwen_vl_context: InputContext,
|
||||||
|
img_data: Union[torch.Tensor, List[Image], Image],
|
||||||
|
):
|
||||||
|
"""Sad cases validated in Qwen VL's multimodal input mapper."""
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
input_mapper_for_qwen(qwen_vl_context, img_data)
|
||||||
@ -8,8 +8,8 @@ from transformers import AutoTokenizer
|
|||||||
from vllm.inputs import InputContext, token_inputs
|
from vllm.inputs import InputContext, token_inputs
|
||||||
from vllm.multimodal import MultiModalRegistry
|
from vllm.multimodal import MultiModalRegistry
|
||||||
|
|
||||||
from ....conftest import _ImageAssets
|
from .....conftest import _ImageAssets
|
||||||
from ...utils import build_model_context
|
from ....utils import build_model_context
|
||||||
|
|
||||||
MODEL = "Qwen/Qwen2-VL-2B-Instruct"
|
MODEL = "Qwen/Qwen2-VL-2B-Instruct"
|
||||||
MIN_PIXELS = "min_pixels"
|
MIN_PIXELS = "min_pixels"
|
||||||
@ -1,101 +0,0 @@
|
|||||||
from typing import List, Optional, Tuple
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
from transformers import AutoModelForVision2Seq, AutoTokenizer
|
|
||||||
|
|
||||||
from vllm.multimodal.utils import rescale_image_size
|
|
||||||
from vllm.sequence import SampleLogprobs
|
|
||||||
|
|
||||||
from ....conftest import IMAGE_ASSETS
|
|
||||||
from ...utils import check_logprobs_close
|
|
||||||
|
|
||||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
|
||||||
"stop_sign":
|
|
||||||
"Question: What's the content of the image? Answer:",
|
|
||||||
"cherry_blossom":
|
|
||||||
"Question: What is the season? Answer:",
|
|
||||||
})
|
|
||||||
|
|
||||||
|
|
||||||
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
|
|
||||||
Optional[SampleLogprobs]],
|
|
||||||
model: str):
|
|
||||||
"""Sanitize vllm output to be comparable with hf output."""
|
|
||||||
_, output_str, out_logprobs = vllm_output
|
|
||||||
|
|
||||||
hf_output_str = output_str + "\n"
|
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
|
||||||
hf_output_ids = tokenizer.encode(hf_output_str)
|
|
||||||
assert hf_output_ids[0] == tokenizer.bos_token_id
|
|
||||||
hf_output_ids = hf_output_ids[1:]
|
|
||||||
|
|
||||||
return hf_output_ids, hf_output_str, out_logprobs
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", ["Salesforce/blip2-opt-2.7b"])
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"size_factors",
|
|
||||||
[
|
|
||||||
# No image
|
|
||||||
[],
|
|
||||||
# Single-scale
|
|
||||||
[1.0],
|
|
||||||
# Single-scale, batched
|
|
||||||
[1.0, 1.0, 1.0],
|
|
||||||
# Multi-scale
|
|
||||||
[0.25, 0.5, 1.0],
|
|
||||||
],
|
|
||||||
)
|
|
||||||
@pytest.mark.parametrize("dtype", ["half"])
|
|
||||||
@pytest.mark.parametrize("max_tokens", [128])
|
|
||||||
@pytest.mark.parametrize("num_logprobs", [5])
|
|
||||||
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
|
||||||
dtype: str, max_tokens: int, num_logprobs: int) -> None:
|
|
||||||
"""Inference result should be the same between hf and vllm.
|
|
||||||
|
|
||||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
|
||||||
For huggingface runner, we provide the PIL images as input.
|
|
||||||
For vllm runner, we provide MultiModalData objects and corresponding
|
|
||||||
MultiModalConfig as input.
|
|
||||||
Note, the text input is also adjusted to abide by vllm contract.
|
|
||||||
The text output is sanitized to be able to compare with hf.
|
|
||||||
"""
|
|
||||||
images = [asset.pil_image for asset in image_assets]
|
|
||||||
|
|
||||||
inputs_per_image = [(
|
|
||||||
[prompt for _ in size_factors],
|
|
||||||
[rescale_image_size(image, factor) for factor in size_factors],
|
|
||||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
|
||||||
|
|
||||||
# max_model_len should be greater than image_feature_size
|
|
||||||
with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
|
|
||||||
vllm_outputs_per_image = [
|
|
||||||
vllm_model.generate_greedy_logprobs(prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
images=images)
|
|
||||||
for prompts, images in inputs_per_image
|
|
||||||
]
|
|
||||||
|
|
||||||
with hf_runner(model, dtype=dtype,
|
|
||||||
auto_cls=AutoModelForVision2Seq) as hf_model:
|
|
||||||
hf_outputs_per_image = [
|
|
||||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
images=images)
|
|
||||||
for prompts, images in inputs_per_image
|
|
||||||
]
|
|
||||||
|
|
||||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
|
|
||||||
vllm_outputs_per_image):
|
|
||||||
check_logprobs_close(
|
|
||||||
outputs_0_lst=hf_outputs,
|
|
||||||
outputs_1_lst=[
|
|
||||||
vllm_to_hf_output(vllm_output, model)
|
|
||||||
for vllm_output in vllm_outputs
|
|
||||||
],
|
|
||||||
name_0="hf",
|
|
||||||
name_1="vllm",
|
|
||||||
)
|
|
||||||
@ -1,46 +0,0 @@
|
|||||||
import pytest
|
|
||||||
import transformers
|
|
||||||
|
|
||||||
from ....utils import multi_gpu_test
|
|
||||||
|
|
||||||
|
|
||||||
@multi_gpu_test(num_gpus=2)
|
|
||||||
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
|
|
||||||
@pytest.mark.parametrize("model", [
|
|
||||||
"llava-hf/llava-1.5-7b-hf",
|
|
||||||
"llava-hf/llava-v1.6-mistral-7b-hf",
|
|
||||||
"facebook/chameleon-7b",
|
|
||||||
])
|
|
||||||
def test_models(hf_runner, vllm_runner, image_assets,
|
|
||||||
distributed_executor_backend, model) -> None:
|
|
||||||
|
|
||||||
dtype = "half"
|
|
||||||
max_tokens = 5
|
|
||||||
num_logprobs = 5
|
|
||||||
tensor_parallel_size = 2
|
|
||||||
|
|
||||||
if model.startswith("llava-hf/llava-1.5"):
|
|
||||||
from .test_llava import models, run_test
|
|
||||||
elif model.startswith("llava-hf/llava-v1.6"):
|
|
||||||
from .test_llava_next import models, run_test # type: ignore[no-redef]
|
|
||||||
elif model.startswith("facebook/chameleon"):
|
|
||||||
if transformers.__version__.startswith("4.46"):
|
|
||||||
pytest.skip("Model broken in HF, "
|
|
||||||
"see huggingface/transformers#34379")
|
|
||||||
from .test_chameleon import models, run_test # type: ignore[no-redef]
|
|
||||||
else:
|
|
||||||
raise NotImplementedError(f"Unsupported model: {model}")
|
|
||||||
|
|
||||||
run_test(
|
|
||||||
hf_runner,
|
|
||||||
vllm_runner,
|
|
||||||
image_assets,
|
|
||||||
model=models[0],
|
|
||||||
# So that LLaVA-NeXT processor may return nested list
|
|
||||||
size_factors=[0.25, 0.5, 1.0],
|
|
||||||
dtype=dtype,
|
|
||||||
max_tokens=max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
|
||||||
distributed_executor_backend=distributed_executor_backend,
|
|
||||||
)
|
|
||||||
@ -1,130 +0,0 @@
|
|||||||
from typing import List, Optional, Type
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
import transformers
|
|
||||||
from transformers import AutoModelForVision2Seq, BatchEncoding
|
|
||||||
|
|
||||||
from vllm.multimodal.utils import rescale_image_size
|
|
||||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
|
||||||
|
|
||||||
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
|
||||||
from ...utils import check_outputs_equal
|
|
||||||
|
|
||||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
|
||||||
"stop_sign":
|
|
||||||
"USER: <image>\nWhat's the content of the image?\nASSISTANT:",
|
|
||||||
"cherry_blossom":
|
|
||||||
"USER: <image>\nWhat is the season?\nASSISTANT:",
|
|
||||||
})
|
|
||||||
|
|
||||||
models = ["facebook/chameleon-7b"]
|
|
||||||
|
|
||||||
|
|
||||||
def run_test(
|
|
||||||
hf_runner: Type[HfRunner],
|
|
||||||
vllm_runner: Type[VllmRunner],
|
|
||||||
image_assets: _ImageAssets,
|
|
||||||
model: str,
|
|
||||||
*,
|
|
||||||
size_factors: List[float],
|
|
||||||
dtype: str,
|
|
||||||
max_tokens: int,
|
|
||||||
num_logprobs: int,
|
|
||||||
tensor_parallel_size: int,
|
|
||||||
distributed_executor_backend: Optional[str] = None,
|
|
||||||
):
|
|
||||||
"""Inference result should be the same between hf and vllm.
|
|
||||||
|
|
||||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
|
||||||
For huggingface runner, we provide the PIL images as input.
|
|
||||||
For vllm runner, we provide MultiModalDataDict objects
|
|
||||||
and corresponding vision language config as input.
|
|
||||||
Note, the text input is also adjusted to abide by vllm contract.
|
|
||||||
The text output is sanitized to be able to compare with hf.
|
|
||||||
"""
|
|
||||||
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
|
|
||||||
images = [asset.pil_image for asset in image_assets]
|
|
||||||
|
|
||||||
inputs_per_image = [(
|
|
||||||
[prompt for _ in size_factors],
|
|
||||||
[rescale_image_size(image, factor) for factor in size_factors],
|
|
||||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
|
||||||
|
|
||||||
with vllm_runner(model,
|
|
||||||
max_model_len=4096,
|
|
||||||
dtype=dtype,
|
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
|
||||||
distributed_executor_backend=distributed_executor_backend,
|
|
||||||
enforce_eager=True) as vllm_model:
|
|
||||||
|
|
||||||
vllm_outputs_per_image = [
|
|
||||||
vllm_model.generate_greedy_logprobs(prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
images=images)
|
|
||||||
for prompts, images in inputs_per_image
|
|
||||||
]
|
|
||||||
|
|
||||||
def process(hf_inputs: BatchEncoding):
|
|
||||||
hf_inputs["pixel_values"] = hf_inputs["pixel_values"] \
|
|
||||||
.to(torch_dtype) # type: ignore
|
|
||||||
return hf_inputs
|
|
||||||
|
|
||||||
with hf_runner(model,
|
|
||||||
dtype=dtype,
|
|
||||||
postprocess_inputs=process,
|
|
||||||
auto_cls=AutoModelForVision2Seq) as hf_model:
|
|
||||||
hf_outputs_per_image = [
|
|
||||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
images=images)
|
|
||||||
for prompts, images in inputs_per_image
|
|
||||||
]
|
|
||||||
|
|
||||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
|
|
||||||
vllm_outputs_per_image):
|
|
||||||
# HF Logprobs include image tokens, unlike vLLM, so we don't directly
|
|
||||||
# compare them
|
|
||||||
check_outputs_equal(
|
|
||||||
outputs_0_lst=[outputs[:2] for outputs in hf_outputs],
|
|
||||||
outputs_1_lst=[outputs[:2] for outputs in vllm_outputs],
|
|
||||||
name_0="hf",
|
|
||||||
name_1="vllm",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
|
||||||
transformers.__version__.startswith("4.46.0"),
|
|
||||||
reason="Model broken in HF, see huggingface/transformers#34379",
|
|
||||||
)
|
|
||||||
@pytest.mark.parametrize("model", models)
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"size_factors",
|
|
||||||
[
|
|
||||||
# No image
|
|
||||||
[],
|
|
||||||
# Single-scale
|
|
||||||
[1.0],
|
|
||||||
# Single-scale, batched
|
|
||||||
[1.0, 1.0, 1.0],
|
|
||||||
# Multi-scale
|
|
||||||
[0.25, 0.5, 1.0],
|
|
||||||
],
|
|
||||||
)
|
|
||||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
|
||||||
@pytest.mark.parametrize("max_tokens", [8])
|
|
||||||
@pytest.mark.parametrize("num_logprobs", [5])
|
|
||||||
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
|
||||||
dtype, max_tokens, num_logprobs) -> None:
|
|
||||||
run_test(
|
|
||||||
hf_runner,
|
|
||||||
vllm_runner,
|
|
||||||
image_assets,
|
|
||||||
model,
|
|
||||||
size_factors=size_factors,
|
|
||||||
dtype=dtype,
|
|
||||||
max_tokens=max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
tensor_parallel_size=1,
|
|
||||||
)
|
|
||||||
@ -1,139 +0,0 @@
|
|||||||
from typing import List, Optional, Tuple, Type
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from vllm.multimodal.utils import rescale_image_size
|
|
||||||
from vllm.platforms import current_platform
|
|
||||||
from vllm.sequence import SampleLogprobs
|
|
||||||
|
|
||||||
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
|
||||||
from ...utils import check_logprobs_close
|
|
||||||
|
|
||||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
|
||||||
"stop_sign":
|
|
||||||
"What's the content of the image?\n",
|
|
||||||
"cherry_blossom":
|
|
||||||
"What is the season?\n",
|
|
||||||
})
|
|
||||||
|
|
||||||
models = ["adept/fuyu-8b"]
|
|
||||||
|
|
||||||
|
|
||||||
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
|
|
||||||
Optional[SampleLogprobs]]):
|
|
||||||
"""Sanitize vllm output to be comparable with hf output."""
|
|
||||||
output_ids, output_str, out_logprobs = vllm_output
|
|
||||||
|
|
||||||
hf_output_str = output_str.lstrip() + "|ENDOFTEXT|"
|
|
||||||
|
|
||||||
return output_ids, hf_output_str, out_logprobs
|
|
||||||
|
|
||||||
|
|
||||||
def run_test(
|
|
||||||
hf_runner: Type[HfRunner],
|
|
||||||
vllm_runner: Type[VllmRunner],
|
|
||||||
image_assets: _ImageAssets,
|
|
||||||
model: str,
|
|
||||||
*,
|
|
||||||
size_factors: List[float],
|
|
||||||
dtype: str,
|
|
||||||
max_tokens: int,
|
|
||||||
num_logprobs: int,
|
|
||||||
tensor_parallel_size: int,
|
|
||||||
distributed_executor_backend: Optional[str] = None,
|
|
||||||
):
|
|
||||||
"""Inference result should be the same between hf and vllm.
|
|
||||||
|
|
||||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
|
||||||
For huggingface runner, we provide the PIL images as input.
|
|
||||||
For vllm runner, we provide MultiModalDataDict objects
|
|
||||||
and corresponding MultiModalConfig as input.
|
|
||||||
Note, the text input is also adjusted to abide by vllm contract.
|
|
||||||
The text output is sanitized to be able to compare with hf.
|
|
||||||
"""
|
|
||||||
images = [asset.pil_image for asset in image_assets]
|
|
||||||
|
|
||||||
inputs_per_image = [(
|
|
||||||
[prompt for _ in size_factors],
|
|
||||||
[rescale_image_size(image, factor) for factor in size_factors],
|
|
||||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
|
||||||
|
|
||||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
|
||||||
# vLLM needs a fresh new process without cuda initialization.
|
|
||||||
# if we run HF first, the cuda initialization will be done and it
|
|
||||||
# will hurt multiprocessing backend with fork method (the default method).
|
|
||||||
|
|
||||||
# max_model_len should be greater than image_feature_size
|
|
||||||
with vllm_runner(model,
|
|
||||||
max_model_len=2048,
|
|
||||||
max_num_seqs=2,
|
|
||||||
dtype=dtype,
|
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
|
||||||
distributed_executor_backend=distributed_executor_backend,
|
|
||||||
enforce_eager=True) as vllm_model:
|
|
||||||
vllm_outputs_per_image = [
|
|
||||||
vllm_model.generate_greedy_logprobs(prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
images=images)
|
|
||||||
for prompts, images in inputs_per_image
|
|
||||||
]
|
|
||||||
|
|
||||||
with hf_runner(model, dtype=dtype) as hf_model:
|
|
||||||
eos_token_id = hf_model.processor.tokenizer.eos_token_id
|
|
||||||
hf_outputs_per_image = [
|
|
||||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
images=images,
|
|
||||||
eos_token_id=eos_token_id)
|
|
||||||
for prompts, images in inputs_per_image
|
|
||||||
]
|
|
||||||
|
|
||||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
|
|
||||||
vllm_outputs_per_image):
|
|
||||||
check_logprobs_close(
|
|
||||||
outputs_0_lst=hf_outputs,
|
|
||||||
outputs_1_lst=[
|
|
||||||
vllm_to_hf_output(vllm_output) for vllm_output in vllm_outputs
|
|
||||||
],
|
|
||||||
name_0="hf",
|
|
||||||
name_1="vllm",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
target_dtype = "half"
|
|
||||||
if current_platform.is_cpu():
|
|
||||||
target_dtype = "bfloat16"
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", models)
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"size_factors",
|
|
||||||
[
|
|
||||||
# No image
|
|
||||||
[],
|
|
||||||
# Single-scale
|
|
||||||
[0.25],
|
|
||||||
# Single-scale, batched
|
|
||||||
[0.25, 0.25, 0.25],
|
|
||||||
# Multi-scale
|
|
||||||
[0.25, 0.2, 0.15],
|
|
||||||
],
|
|
||||||
)
|
|
||||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
|
||||||
@pytest.mark.parametrize("max_tokens", [128])
|
|
||||||
@pytest.mark.parametrize("num_logprobs", [10])
|
|
||||||
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
|
||||||
dtype: str, max_tokens: int, num_logprobs: int) -> None:
|
|
||||||
run_test(
|
|
||||||
hf_runner,
|
|
||||||
vllm_runner,
|
|
||||||
image_assets,
|
|
||||||
model,
|
|
||||||
size_factors=size_factors,
|
|
||||||
dtype=dtype,
|
|
||||||
max_tokens=max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
tensor_parallel_size=1,
|
|
||||||
)
|
|
||||||
@ -1,133 +0,0 @@
|
|||||||
from typing import List, Optional, Tuple, Type
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from vllm.multimodal.utils import rescale_image_size
|
|
||||||
from vllm.transformers_utils.tokenizer import patch_padding_side
|
|
||||||
|
|
||||||
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
|
||||||
from ....utils import large_gpu_test
|
|
||||||
from ...utils import check_logprobs_close
|
|
||||||
|
|
||||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
|
||||||
"stop_sign":
|
|
||||||
"What's the content of the image?",
|
|
||||||
"cherry_blossom":
|
|
||||||
"What is the season?",
|
|
||||||
})
|
|
||||||
|
|
||||||
models = ["THUDM/glm-4v-9b"]
|
|
||||||
target_dtype = "bfloat16"
|
|
||||||
|
|
||||||
|
|
||||||
def run_test(
|
|
||||||
hf_runner: Type[HfRunner],
|
|
||||||
vllm_runner: Type[VllmRunner],
|
|
||||||
inputs: List[Tuple[List[str], PromptImageInput]],
|
|
||||||
model: str,
|
|
||||||
*,
|
|
||||||
dtype: str,
|
|
||||||
max_tokens: int,
|
|
||||||
num_logprobs: int,
|
|
||||||
mm_limit: int,
|
|
||||||
tensor_parallel_size: int,
|
|
||||||
distributed_executor_backend: Optional[str] = None,
|
|
||||||
):
|
|
||||||
# max_model_len should be greater than image_feature_size
|
|
||||||
with vllm_runner(model,
|
|
||||||
max_model_len=2048,
|
|
||||||
max_num_seqs=2,
|
|
||||||
dtype=dtype,
|
|
||||||
limit_mm_per_prompt={"image": mm_limit},
|
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
|
||||||
distributed_executor_backend=distributed_executor_backend,
|
|
||||||
enforce_eager=True) as vllm_model:
|
|
||||||
stop_token_ids = [151329, 151336, 151338]
|
|
||||||
vllm_outputs_per_image = [
|
|
||||||
vllm_model.generate_greedy_logprobs(prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
images=images,
|
|
||||||
stop_token_ids=stop_token_ids)
|
|
||||||
for prompts, images in inputs
|
|
||||||
]
|
|
||||||
|
|
||||||
with hf_runner(model, dtype=dtype) as hf_model:
|
|
||||||
hf_processor = hf_model.processor
|
|
||||||
patch_padding_side(hf_processor)
|
|
||||||
|
|
||||||
def processor(*args, text="", images=None, **kwargs):
|
|
||||||
if images is None:
|
|
||||||
return hf_processor(*args, **kwargs)
|
|
||||||
|
|
||||||
return hf_processor.apply_chat_template(
|
|
||||||
[{
|
|
||||||
"role": "user",
|
|
||||||
"image": images,
|
|
||||||
"content": text
|
|
||||||
}],
|
|
||||||
add_generation_prompt=True,
|
|
||||||
tokenize=True,
|
|
||||||
return_dict=True,
|
|
||||||
**kwargs,
|
|
||||||
)
|
|
||||||
|
|
||||||
hf_model.processor = processor
|
|
||||||
hf_model.model.get_output_embeddings = lambda: \
|
|
||||||
hf_model.model.transformer.output_layer
|
|
||||||
hf_outputs_per_image = [
|
|
||||||
hf_model.generate_greedy_logprobs_limit(
|
|
||||||
prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
images=images,
|
|
||||||
) for prompts, images in inputs
|
|
||||||
]
|
|
||||||
|
|
||||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
|
|
||||||
vllm_outputs_per_image):
|
|
||||||
check_logprobs_close(
|
|
||||||
outputs_0_lst=hf_outputs,
|
|
||||||
outputs_1_lst=vllm_outputs,
|
|
||||||
name_0="hf",
|
|
||||||
name_1="vllm",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@large_gpu_test(min_gb=48)
|
|
||||||
@pytest.mark.parametrize("model", models)
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"size_factors",
|
|
||||||
[
|
|
||||||
# No image
|
|
||||||
[],
|
|
||||||
# Single-scale
|
|
||||||
[1.0],
|
|
||||||
# Single-scale, batched
|
|
||||||
[1.0, 1.0, 1.0],
|
|
||||||
# Multi-scale
|
|
||||||
[0.25, 0.5, 1.0],
|
|
||||||
],
|
|
||||||
)
|
|
||||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
|
||||||
@pytest.mark.parametrize("max_tokens", [128])
|
|
||||||
@pytest.mark.parametrize("num_logprobs", [5])
|
|
||||||
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
|
||||||
dtype: str, max_tokens: int, num_logprobs: int) -> None:
|
|
||||||
images = [asset.pil_image for asset in image_assets]
|
|
||||||
|
|
||||||
inputs_per_image = [(
|
|
||||||
[prompt for _ in size_factors],
|
|
||||||
[rescale_image_size(image, factor) for factor in size_factors],
|
|
||||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
|
||||||
run_test(
|
|
||||||
hf_runner,
|
|
||||||
vllm_runner,
|
|
||||||
inputs_per_image,
|
|
||||||
model,
|
|
||||||
dtype=dtype,
|
|
||||||
max_tokens=max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
mm_limit=1,
|
|
||||||
tensor_parallel_size=1,
|
|
||||||
)
|
|
||||||
@ -1,15 +1,11 @@
|
|||||||
import types
|
from typing import List, Optional, Tuple, Type
|
||||||
from typing import List, Optional, Tuple, Type, Union
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
from PIL.Image import Image
|
|
||||||
from transformers import AutoConfig
|
|
||||||
|
|
||||||
from vllm.multimodal.utils import rescale_image_size
|
from vllm.multimodal.utils import rescale_image_size
|
||||||
|
|
||||||
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
from ....conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets
|
||||||
_ImageAssets)
|
|
||||||
from ...utils import check_logprobs_close
|
from ...utils import check_logprobs_close
|
||||||
|
|
||||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||||
@ -18,171 +14,6 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
|||||||
"cherry_blossom":
|
"cherry_blossom":
|
||||||
"<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
"<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||||
})
|
})
|
||||||
HF_MULTIIMAGE_IMAGE_PROMPT = "<|im_start|>User\nImage-1: <image>\nImage-2: <image>\nDescribe the two images in short.<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501
|
|
||||||
|
|
||||||
models = [
|
|
||||||
"OpenGVLab/InternVL2-1B",
|
|
||||||
"OpenGVLab/InternVL2-2B",
|
|
||||||
# NOTE: Mono-InternVL-2B doesn't work with fp16,
|
|
||||||
# it will result NaN during inference.
|
|
||||||
# See: https://huggingface.co/OpenGVLab/Mono-InternVL-2B/discussions/9
|
|
||||||
"OpenGVLab/Mono-InternVL-2B",
|
|
||||||
# Broken due to outdated implementation of Phi-3
|
|
||||||
# See: https://huggingface.co/OpenGVLab/InternVL2-4B/discussions/3
|
|
||||||
# "OpenGVLab/InternVL2-4B",
|
|
||||||
]
|
|
||||||
target_dtype = "bfloat16"
|
|
||||||
|
|
||||||
|
|
||||||
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py
|
|
||||||
def generate(
|
|
||||||
self,
|
|
||||||
pixel_values: torch.FloatTensor,
|
|
||||||
input_ids: torch.FloatTensor,
|
|
||||||
attention_mask: Optional[torch.LongTensor] = None,
|
|
||||||
**generate_kwargs,
|
|
||||||
) -> torch.LongTensor:
|
|
||||||
"""Generate method for InternVL2 model without fixed use_cache."""
|
|
||||||
assert self.img_context_token_id is not None
|
|
||||||
vit_embeds = self.extract_feature(pixel_values)
|
|
||||||
input_embeds = self.language_model.get_input_embeddings()(input_ids)
|
|
||||||
B, N, C = input_embeds.shape
|
|
||||||
input_embeds = input_embeds.reshape(B * N, C)
|
|
||||||
|
|
||||||
input_ids = input_ids.reshape(B * N)
|
|
||||||
selected = (input_ids == self.img_context_token_id)
|
|
||||||
assert selected.sum() != 0
|
|
||||||
input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
|
|
||||||
|
|
||||||
input_embeds = input_embeds.reshape(B, N, C)
|
|
||||||
|
|
||||||
forward_kwargs = dict(
|
|
||||||
inputs_embeds=input_embeds,
|
|
||||||
attention_mask=attention_mask,
|
|
||||||
)
|
|
||||||
if getattr(self, "use_visual_token_mask", False):
|
|
||||||
visual_token_mask = selected.reshape(B, N, 1).to(input_embeds.dtype)
|
|
||||||
forward_kwargs["visual_token_mask"] = visual_token_mask
|
|
||||||
outputs = self.language_model.generate(
|
|
||||||
**forward_kwargs,
|
|
||||||
**generate_kwargs,
|
|
||||||
)
|
|
||||||
|
|
||||||
return outputs
|
|
||||||
|
|
||||||
|
|
||||||
def run_test(
|
|
||||||
hf_runner: Type[HfRunner],
|
|
||||||
vllm_runner: Type[VllmRunner],
|
|
||||||
inputs: List[Tuple[List[str], PromptImageInput]],
|
|
||||||
model: str,
|
|
||||||
*,
|
|
||||||
dtype: str,
|
|
||||||
max_tokens: int,
|
|
||||||
num_logprobs: int,
|
|
||||||
mm_limit: int,
|
|
||||||
tensor_parallel_size: int,
|
|
||||||
distributed_executor_backend: Optional[str] = None,
|
|
||||||
):
|
|
||||||
"""Inference result should be the same between hf and vllm.
|
|
||||||
|
|
||||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
|
||||||
For huggingface runner, we provide the PIL images as input.
|
|
||||||
For vllm runner, we provide MultiModalDataDict objects
|
|
||||||
and corresponding MultiModalConfig as input.
|
|
||||||
Note, the text input is also adjusted to abide by vllm contract.
|
|
||||||
The text output is sanitized to be able to compare with hf.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
|
||||||
# vLLM needs a fresh new process without cuda initialization.
|
|
||||||
# if we run HF first, the cuda initialization will be done and it
|
|
||||||
# will hurt multiprocessing backend with fork method (the default method).
|
|
||||||
|
|
||||||
class InternVLProcessor:
|
|
||||||
"""A simple processor for InternVL2 which misses a processor."""
|
|
||||||
|
|
||||||
def __init__(self, hf_runner: HfRunner):
|
|
||||||
self.num_image_token = hf_runner.model.num_image_token
|
|
||||||
self.tokenizer = hf_runner.tokenizer
|
|
||||||
self.dtype = hf_runner.model.dtype
|
|
||||||
|
|
||||||
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
|
|
||||||
trust_remote_code=True)
|
|
||||||
self.vision_config = self.config.vision_config
|
|
||||||
self.use_thumbnail = self.config.use_thumbnail
|
|
||||||
self.min_num = self.config.min_dynamic_patch
|
|
||||||
self.max_num = self.config.max_dynamic_patch
|
|
||||||
self.image_size = self.vision_config.image_size
|
|
||||||
|
|
||||||
def __call__(self, text: str, images: Union[Image, List[Image]],
|
|
||||||
**kwargs):
|
|
||||||
from vllm.model_executor.models.internvl import (
|
|
||||||
IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
|
|
||||||
images = [images] if isinstance(images, Image) else images
|
|
||||||
pixel_values = [
|
|
||||||
image_to_pixel_values(image, self.image_size, self.min_num,
|
|
||||||
self.max_num,
|
|
||||||
self.use_thumbnail).to(self.dtype)
|
|
||||||
for image in images
|
|
||||||
]
|
|
||||||
num_patches_list = [
|
|
||||||
pixel_value.shape[0] for pixel_value in pixel_values
|
|
||||||
]
|
|
||||||
pixel_values = torch.cat(pixel_values, dim=0)
|
|
||||||
for num_patches in num_patches_list:
|
|
||||||
context_tokens = IMG_CONTEXT * self.num_image_token \
|
|
||||||
* num_patches
|
|
||||||
image_tokens = IMG_START + context_tokens + IMG_END
|
|
||||||
text = text.replace('<image>', image_tokens, 1)
|
|
||||||
prompt = self.tokenizer(text, return_tensors="pt")
|
|
||||||
prompt.update({"pixel_values": pixel_values})
|
|
||||||
return prompt
|
|
||||||
|
|
||||||
# max_model_len should be greater than image_feature_size
|
|
||||||
with vllm_runner(model,
|
|
||||||
max_model_len=4096,
|
|
||||||
dtype=dtype,
|
|
||||||
limit_mm_per_prompt={"image": mm_limit},
|
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
|
||||||
distributed_executor_backend=distributed_executor_backend,
|
|
||||||
enforce_eager=True) as vllm_model:
|
|
||||||
vllm_outputs_per_image = [
|
|
||||||
vllm_model.generate_greedy_logprobs(prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
images=images)
|
|
||||||
for prompts, images in inputs
|
|
||||||
]
|
|
||||||
|
|
||||||
with hf_runner(model, dtype=dtype) as hf_model:
|
|
||||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
|
|
||||||
"<IMG_CONTEXT>")
|
|
||||||
hf_model.model.img_context_token_id = img_context_token_id
|
|
||||||
hf_model.processor = InternVLProcessor(hf_model)
|
|
||||||
hf_model.model.get_output_embeddings = lambda: \
|
|
||||||
hf_model.model.language_model.get_output_embeddings()
|
|
||||||
hf_model.model.generate = types.MethodType(generate, hf_model.model)
|
|
||||||
eos_token_id = hf_model.tokenizer.eos_token_id
|
|
||||||
hf_outputs_per_image = [
|
|
||||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
images=hf_images,
|
|
||||||
eos_token_id=eos_token_id)
|
|
||||||
for prompts, hf_images in inputs
|
|
||||||
]
|
|
||||||
|
|
||||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
|
|
||||||
vllm_outputs_per_image):
|
|
||||||
# TODO: Check whether using original CLIPVisionModel can improve
|
|
||||||
# consistency against HF
|
|
||||||
check_logprobs_close(
|
|
||||||
outputs_0_lst=hf_outputs,
|
|
||||||
outputs_1_lst=vllm_outputs,
|
|
||||||
name_0="hf",
|
|
||||||
name_1="vllm",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def run_awq_test(
|
def run_awq_test(
|
||||||
@ -253,123 +84,6 @@ def run_awq_test(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", models)
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"size_factors",
|
|
||||||
[
|
|
||||||
# No image
|
|
||||||
[],
|
|
||||||
# Single-scale
|
|
||||||
[1.0],
|
|
||||||
# Single-scale, batched
|
|
||||||
[1.0, 1.0, 1.0],
|
|
||||||
# Multi-scale
|
|
||||||
[0.25, 0.5, 1.0],
|
|
||||||
],
|
|
||||||
)
|
|
||||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
|
||||||
@pytest.mark.parametrize("max_tokens", [128])
|
|
||||||
@pytest.mark.parametrize("num_logprobs", [5])
|
|
||||||
@torch.inference_mode()
|
|
||||||
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
|
||||||
dtype: str, max_tokens: int, num_logprobs: int) -> None:
|
|
||||||
images = [asset.pil_image for asset in image_assets]
|
|
||||||
|
|
||||||
inputs_per_image = [(
|
|
||||||
[prompt for _ in size_factors],
|
|
||||||
[rescale_image_size(image, factor) for factor in size_factors],
|
|
||||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
|
||||||
|
|
||||||
run_test(
|
|
||||||
hf_runner,
|
|
||||||
vllm_runner,
|
|
||||||
inputs_per_image,
|
|
||||||
model,
|
|
||||||
dtype=dtype,
|
|
||||||
max_tokens=max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
mm_limit=1,
|
|
||||||
tensor_parallel_size=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", models)
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"size_factors",
|
|
||||||
[
|
|
||||||
# No image
|
|
||||||
[],
|
|
||||||
# Single-scale
|
|
||||||
[1.0],
|
|
||||||
# Single-scale, batched
|
|
||||||
[1.0, 1.0, 1.0],
|
|
||||||
# Multi-scale
|
|
||||||
[0.5, 0.75, 1.0],
|
|
||||||
],
|
|
||||||
)
|
|
||||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
|
||||||
@pytest.mark.parametrize("max_tokens", [128])
|
|
||||||
@pytest.mark.parametrize("num_logprobs", [5])
|
|
||||||
@torch.inference_mode()
|
|
||||||
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
|
|
||||||
size_factors, dtype: str, max_tokens: int,
|
|
||||||
num_logprobs: int) -> None:
|
|
||||||
images = [asset.pil_image for asset in image_assets]
|
|
||||||
|
|
||||||
inputs_per_case = [
|
|
||||||
([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
|
|
||||||
[[rescale_image_size(image, factor) for image in images]
|
|
||||||
for factor in size_factors])
|
|
||||||
]
|
|
||||||
|
|
||||||
run_test(
|
|
||||||
hf_runner,
|
|
||||||
vllm_runner,
|
|
||||||
inputs_per_case,
|
|
||||||
model,
|
|
||||||
dtype=dtype,
|
|
||||||
max_tokens=max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
mm_limit=2,
|
|
||||||
tensor_parallel_size=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", ["OpenGVLab/InternVL2-2B"])
|
|
||||||
@pytest.mark.parametrize("size_factors", [[0.5, 1.0]])
|
|
||||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
|
||||||
@pytest.mark.parametrize("max_tokens", [128])
|
|
||||||
@pytest.mark.parametrize("num_logprobs", [5])
|
|
||||||
@torch.inference_mode()
|
|
||||||
def test_different_num_patches(hf_runner, vllm_runner, image_assets, model,
|
|
||||||
size_factors, dtype: str, max_tokens: int,
|
|
||||||
num_logprobs: int) -> None:
|
|
||||||
images = [asset.pil_image.resize((896, 896)) for asset in image_assets]
|
|
||||||
|
|
||||||
inputs_batching = [(
|
|
||||||
[prompt for _ in size_factors],
|
|
||||||
[rescale_image_size(image, factor) for factor in size_factors],
|
|
||||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
|
||||||
|
|
||||||
inputs_multi_images = [
|
|
||||||
([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
|
|
||||||
[[rescale_image_size(image, factor) for image in images]
|
|
||||||
for factor in size_factors])
|
|
||||||
]
|
|
||||||
for inputs in [inputs_batching, inputs_multi_images]:
|
|
||||||
run_test(
|
|
||||||
hf_runner,
|
|
||||||
vllm_runner,
|
|
||||||
inputs,
|
|
||||||
model,
|
|
||||||
dtype=dtype,
|
|
||||||
max_tokens=max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
mm_limit=2,
|
|
||||||
tensor_parallel_size=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"models", [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")])
|
"models", [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")])
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
|
|||||||
@ -1,313 +0,0 @@
|
|||||||
from typing import List, Optional, Tuple, Type, overload
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
|
|
||||||
BatchEncoding)
|
|
||||||
|
|
||||||
from vllm.multimodal.utils import rescale_image_size
|
|
||||||
from vllm.sequence import SampleLogprobs
|
|
||||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
|
||||||
|
|
||||||
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
|
||||||
_ImageAssets)
|
|
||||||
from ...utils import check_logprobs_close
|
|
||||||
|
|
||||||
_LIMIT_IMAGE_PER_PROMPT = 4
|
|
||||||
|
|
||||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
|
||||||
"stop_sign":
|
|
||||||
"USER: <image>\nWhat's the content of the image?\nASSISTANT:",
|
|
||||||
"cherry_blossom":
|
|
||||||
"USER: <image>\nWhat is the season?\nASSISTANT:",
|
|
||||||
})
|
|
||||||
|
|
||||||
models = [
|
|
||||||
"llava-hf/llava-1.5-7b-hf",
|
|
||||||
# TODO: Get this model to produce meaningful output in vLLM
|
|
||||||
# "TIGER-Lab/Mantis-8B-siglip-llama3",
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
|
|
||||||
Optional[SampleLogprobs]],
|
|
||||||
model: str):
|
|
||||||
"""Sanitize vllm output to be comparable with hf output."""
|
|
||||||
output_ids, output_str, out_logprobs = vllm_output
|
|
||||||
|
|
||||||
config = AutoConfig.from_pretrained(model)
|
|
||||||
image_token_id = config.image_token_index
|
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
|
||||||
eos_token_id = tokenizer.eos_token_id
|
|
||||||
|
|
||||||
hf_output_ids = [
|
|
||||||
token_id for idx, token_id in enumerate(output_ids)
|
|
||||||
if token_id != image_token_id or output_ids[idx - 1] != image_token_id
|
|
||||||
]
|
|
||||||
|
|
||||||
assert output_str[0] == " "
|
|
||||||
hf_output_str = output_str[1:]
|
|
||||||
if hf_output_ids[-1] == eos_token_id:
|
|
||||||
hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
|
|
||||||
|
|
||||||
return hf_output_ids, hf_output_str, out_logprobs
|
|
||||||
|
|
||||||
|
|
||||||
@overload
|
|
||||||
def run_test(
|
|
||||||
hf_runner: Type[HfRunner],
|
|
||||||
vllm_runner: Type[VllmRunner],
|
|
||||||
image_assets: _ImageAssets,
|
|
||||||
model: str,
|
|
||||||
*,
|
|
||||||
size_factors: List[float],
|
|
||||||
dtype: str,
|
|
||||||
max_tokens: int,
|
|
||||||
num_logprobs: int,
|
|
||||||
tensor_parallel_size: int,
|
|
||||||
distributed_executor_backend: Optional[str] = None,
|
|
||||||
):
|
|
||||||
...
|
|
||||||
|
|
||||||
|
|
||||||
@overload
|
|
||||||
def run_test(
|
|
||||||
hf_runner: Type[HfRunner],
|
|
||||||
vllm_runner: Type[VllmRunner],
|
|
||||||
image_assets: _ImageAssets,
|
|
||||||
model: str,
|
|
||||||
*,
|
|
||||||
sizes: List[Tuple[int, int]],
|
|
||||||
dtype: str,
|
|
||||||
max_tokens: int,
|
|
||||||
num_logprobs: int,
|
|
||||||
tensor_parallel_size: int,
|
|
||||||
distributed_executor_backend: Optional[str] = None,
|
|
||||||
):
|
|
||||||
...
|
|
||||||
|
|
||||||
|
|
||||||
def run_test(
|
|
||||||
hf_runner: Type[HfRunner],
|
|
||||||
vllm_runner: Type[VllmRunner],
|
|
||||||
image_assets: _ImageAssets,
|
|
||||||
model: str,
|
|
||||||
*,
|
|
||||||
size_factors: Optional[List[float]] = None,
|
|
||||||
sizes: Optional[List[Tuple[int, int]]] = None,
|
|
||||||
dtype: str,
|
|
||||||
max_tokens: int,
|
|
||||||
num_logprobs: int,
|
|
||||||
tensor_parallel_size: int,
|
|
||||||
distributed_executor_backend: Optional[str] = None,
|
|
||||||
):
|
|
||||||
images = [asset.pil_image for asset in image_assets]
|
|
||||||
|
|
||||||
if size_factors is not None:
|
|
||||||
inputs_per_image = [(
|
|
||||||
[prompt for _ in size_factors],
|
|
||||||
[rescale_image_size(image, factor) for factor in size_factors],
|
|
||||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
|
||||||
elif sizes is not None:
|
|
||||||
inputs_per_image = [(
|
|
||||||
[prompt for _ in sizes],
|
|
||||||
[image.resize(size) for size in sizes],
|
|
||||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
|
||||||
else:
|
|
||||||
raise ValueError("You must provide either `size_factors` or `sizes`")
|
|
||||||
|
|
||||||
_run_test(hf_runner,
|
|
||||||
vllm_runner,
|
|
||||||
inputs_per_image,
|
|
||||||
model,
|
|
||||||
dtype=dtype,
|
|
||||||
max_tokens=max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
|
||||||
distributed_executor_backend=distributed_executor_backend)
|
|
||||||
|
|
||||||
|
|
||||||
def _run_test(
|
|
||||||
hf_runner: Type[HfRunner],
|
|
||||||
vllm_runner: Type[VllmRunner],
|
|
||||||
inputs: List[Tuple[List[str], PromptImageInput]],
|
|
||||||
model: str,
|
|
||||||
*,
|
|
||||||
dtype: str,
|
|
||||||
max_tokens: int,
|
|
||||||
num_logprobs: int,
|
|
||||||
tensor_parallel_size: int,
|
|
||||||
distributed_executor_backend: Optional[str] = None,
|
|
||||||
):
|
|
||||||
"""Inference result should be the same between hf and vllm.
|
|
||||||
|
|
||||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
|
||||||
For huggingface runner, we provide the PIL images as input.
|
|
||||||
For vllm runner, we provide MultiModalDataDict objects
|
|
||||||
and corresponding MultiModalConfig as input.
|
|
||||||
Note, the text input is also adjusted to abide by vllm contract.
|
|
||||||
The text output is sanitized to be able to compare with hf.
|
|
||||||
"""
|
|
||||||
# NOTE: For local use; this isn't tested in CI yet (see TODO above)
|
|
||||||
if model.startswith("TIGER-Lab/Mantis"):
|
|
||||||
from mantis.models.mllava import MLlavaProcessor
|
|
||||||
|
|
||||||
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
|
|
||||||
mantis_processor = MLlavaProcessor.from_pretrained(
|
|
||||||
model, torch_dtype=torch_dtype)
|
|
||||||
assert isinstance(mantis_processor, MLlavaProcessor)
|
|
||||||
else:
|
|
||||||
mantis_processor = None
|
|
||||||
|
|
||||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
|
||||||
# vLLM needs a fresh new process without cuda initialization.
|
|
||||||
# if we run HF first, the cuda initialization will be done and it
|
|
||||||
# will hurt multiprocessing backend with fork method (the default method).
|
|
||||||
|
|
||||||
# max_model_len should be greater than image_feature_size
|
|
||||||
with vllm_runner(model,
|
|
||||||
dtype=dtype,
|
|
||||||
max_model_len=4096,
|
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
|
||||||
distributed_executor_backend=distributed_executor_backend,
|
|
||||||
enforce_eager=True,
|
|
||||||
limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT
|
|
||||||
}) as vllm_model:
|
|
||||||
vllm_outputs_per_image = [
|
|
||||||
vllm_model.generate_greedy_logprobs(prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
images=images)
|
|
||||||
for prompts, images in inputs
|
|
||||||
]
|
|
||||||
|
|
||||||
if mantis_processor is not None:
|
|
||||||
|
|
||||||
def process(hf_inputs: BatchEncoding):
|
|
||||||
hf_inputs["pixel_values"] = hf_inputs["pixel_values"] \
|
|
||||||
.to(torch_dtype) # type: ignore
|
|
||||||
return hf_inputs
|
|
||||||
else:
|
|
||||||
|
|
||||||
def process(hf_inputs: BatchEncoding):
|
|
||||||
return hf_inputs
|
|
||||||
|
|
||||||
with hf_runner(model,
|
|
||||||
dtype=dtype,
|
|
||||||
postprocess_inputs=process,
|
|
||||||
auto_cls=AutoModelForVision2Seq) as hf_model:
|
|
||||||
hf_outputs_per_image = [
|
|
||||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
images=images)
|
|
||||||
for prompts, images in inputs
|
|
||||||
]
|
|
||||||
|
|
||||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
|
|
||||||
vllm_outputs_per_image):
|
|
||||||
# TODO: Check whether using original CLIPVisionModel can improve
|
|
||||||
# consistency against HF
|
|
||||||
check_logprobs_close(
|
|
||||||
outputs_0_lst=hf_outputs,
|
|
||||||
outputs_1_lst=[
|
|
||||||
vllm_to_hf_output(vllm_output, model)
|
|
||||||
for vllm_output in vllm_outputs
|
|
||||||
],
|
|
||||||
name_0="hf",
|
|
||||||
name_1="vllm",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", models)
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"size_factors",
|
|
||||||
[
|
|
||||||
# No image
|
|
||||||
[],
|
|
||||||
# Single-scale
|
|
||||||
[1.0],
|
|
||||||
# Single-scale, batched
|
|
||||||
[1.0, 1.0, 1.0],
|
|
||||||
# Multi-scale
|
|
||||||
[0.25, 0.5, 1.0],
|
|
||||||
],
|
|
||||||
)
|
|
||||||
@pytest.mark.parametrize("dtype", ["half"])
|
|
||||||
@pytest.mark.parametrize("max_tokens", [128])
|
|
||||||
@pytest.mark.parametrize("num_logprobs", [5])
|
|
||||||
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
|
||||||
dtype, max_tokens, num_logprobs) -> None:
|
|
||||||
run_test(
|
|
||||||
hf_runner,
|
|
||||||
vllm_runner,
|
|
||||||
image_assets,
|
|
||||||
model,
|
|
||||||
size_factors=size_factors,
|
|
||||||
dtype=dtype,
|
|
||||||
max_tokens=max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
tensor_parallel_size=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", models)
|
|
||||||
@pytest.mark.parametrize("dtype", ["half"])
|
|
||||||
@pytest.mark.parametrize("max_tokens", [128])
|
|
||||||
@pytest.mark.parametrize("num_logprobs", [5])
|
|
||||||
def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets,
|
|
||||||
model, dtype, max_tokens,
|
|
||||||
num_logprobs) -> None:
|
|
||||||
stop_sign = image_assets[0].pil_image
|
|
||||||
cherry_blossom = image_assets[1].pil_image
|
|
||||||
|
|
||||||
inputs = [(
|
|
||||||
[
|
|
||||||
"USER: <image><image>\nDescribe 2 images.\nASSISTANT:",
|
|
||||||
"USER: <image><image>\nDescribe 2 images.\nASSISTANT:",
|
|
||||||
"USER: <image><image><image><image>\nDescribe 4 images.\nASSISTANT:", # noqa: E501
|
|
||||||
"USER: <image>\nWhat is the season?\nASSISTANT:",
|
|
||||||
],
|
|
||||||
[
|
|
||||||
[stop_sign, cherry_blossom],
|
|
||||||
# Images with different sizes and aspect-ratios
|
|
||||||
[
|
|
||||||
rescale_image_size(stop_sign, 0.1),
|
|
||||||
stop_sign,
|
|
||||||
],
|
|
||||||
[
|
|
||||||
stop_sign,
|
|
||||||
rescale_image_size(stop_sign, 0.25),
|
|
||||||
cherry_blossom.resize((183, 488)),
|
|
||||||
cherry_blossom.resize((488, 183))
|
|
||||||
],
|
|
||||||
cherry_blossom,
|
|
||||||
])]
|
|
||||||
|
|
||||||
_run_test(
|
|
||||||
hf_runner,
|
|
||||||
vllm_runner,
|
|
||||||
inputs,
|
|
||||||
model,
|
|
||||||
dtype=dtype,
|
|
||||||
max_tokens=max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
tensor_parallel_size=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", models)
|
|
||||||
def test_context_length_too_short(vllm_runner, image_assets, model):
|
|
||||||
images = [asset.pil_image for asset in image_assets]
|
|
||||||
|
|
||||||
with pytest.raises(ValueError, match="too long to fit into the model"):
|
|
||||||
vllm_model = vllm_runner(
|
|
||||||
model,
|
|
||||||
max_model_len=128, # LLaVA has a feature size of 576
|
|
||||||
enforce_eager=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
with vllm_model:
|
|
||||||
vllm_model.generate_greedy([HF_IMAGE_PROMPTS[0]],
|
|
||||||
max_tokens=1,
|
|
||||||
images=[images[0]])
|
|
||||||
@ -1,158 +0,0 @@
|
|||||||
from typing import List, Optional, Tuple, Type
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
|
|
||||||
|
|
||||||
from vllm.sequence import SampleLogprobs
|
|
||||||
|
|
||||||
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
|
||||||
from ...utils import check_logprobs_close
|
|
||||||
|
|
||||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
|
||||||
"stop_sign":
|
|
||||||
"USER: <image>\nWhat's the content of the image?\nASSISTANT:",
|
|
||||||
"cherry_blossom":
|
|
||||||
"USER: <image>\nWhat is the season?\nASSISTANT:",
|
|
||||||
})
|
|
||||||
|
|
||||||
models = [
|
|
||||||
"llava-hf/llava-1.5-7b-hf",
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
|
|
||||||
Optional[SampleLogprobs]],
|
|
||||||
model: str):
|
|
||||||
"""Sanitize vllm output to be comparable with hf output."""
|
|
||||||
output_ids, output_str, out_logprobs = vllm_output
|
|
||||||
|
|
||||||
config = AutoConfig.from_pretrained(model)
|
|
||||||
image_token_id = config.image_token_index
|
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
|
||||||
eos_token_id = tokenizer.eos_token_id
|
|
||||||
|
|
||||||
hf_output_ids = [
|
|
||||||
token_id for idx, token_id in enumerate(output_ids)
|
|
||||||
if token_id != image_token_id or output_ids[idx - 1] != image_token_id
|
|
||||||
]
|
|
||||||
|
|
||||||
assert output_str[0] == " "
|
|
||||||
hf_output_str = output_str[1:]
|
|
||||||
if hf_output_ids[-1] == eos_token_id:
|
|
||||||
hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
|
|
||||||
|
|
||||||
return hf_output_ids, hf_output_str, out_logprobs
|
|
||||||
|
|
||||||
|
|
||||||
def run_test(
|
|
||||||
hf_runner: Type[HfRunner],
|
|
||||||
vllm_runner: Type[VllmRunner],
|
|
||||||
image_assets: _ImageAssets,
|
|
||||||
model: str,
|
|
||||||
*,
|
|
||||||
size_factors: List[float],
|
|
||||||
dtype: str,
|
|
||||||
max_tokens: int,
|
|
||||||
num_logprobs: int,
|
|
||||||
tensor_parallel_size: int,
|
|
||||||
distributed_executor_backend: Optional[str] = None,
|
|
||||||
):
|
|
||||||
"""Inference result should be the same between hf and vllm.
|
|
||||||
|
|
||||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
|
||||||
For huggingface runner, we provide the PIL images as input.
|
|
||||||
For vllm runner, we provide MultiModalDataDict objects
|
|
||||||
and corresponding vision language config as input.
|
|
||||||
Note, the text input is also adjusted to abide by vllm contract.
|
|
||||||
The text output is sanitized to be able to compare with hf.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# vLLM to load from image embeddings
|
|
||||||
vllm_images = [asset.image_embeds for asset in image_assets]
|
|
||||||
|
|
||||||
# transformers to load from PIL images
|
|
||||||
hf_images = [asset.pil_image for asset in image_assets]
|
|
||||||
|
|
||||||
vllm_inputs_per_image = [(
|
|
||||||
[prompt for _ in size_factors],
|
|
||||||
[image for _ in size_factors],
|
|
||||||
) for image, prompt in zip(vllm_images, HF_IMAGE_PROMPTS)]
|
|
||||||
|
|
||||||
hf_inputs_per_image = [(
|
|
||||||
[prompt for _ in size_factors],
|
|
||||||
[image for _ in size_factors],
|
|
||||||
) for image, prompt in zip(hf_images, HF_IMAGE_PROMPTS)]
|
|
||||||
|
|
||||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
|
||||||
# vLLM needs a fresh new process without cuda initialization.
|
|
||||||
# if we run HF first, the cuda initialization will be done and it
|
|
||||||
# will hurt multiprocessing backend with fork method (the default method).
|
|
||||||
|
|
||||||
# max_model_len should be greater than image_feature_size
|
|
||||||
with vllm_runner(model,
|
|
||||||
dtype=dtype,
|
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
|
||||||
distributed_executor_backend=distributed_executor_backend,
|
|
||||||
enforce_eager=True) as vllm_model:
|
|
||||||
vllm_outputs_per_image = [
|
|
||||||
vllm_model.generate_greedy_logprobs(prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
images=images)
|
|
||||||
for prompts, images in vllm_inputs_per_image
|
|
||||||
]
|
|
||||||
|
|
||||||
with hf_runner(model, dtype=dtype,
|
|
||||||
auto_cls=AutoModelForVision2Seq) as hf_model:
|
|
||||||
hf_outputs_per_image = [
|
|
||||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
images=images)
|
|
||||||
for prompts, images in hf_inputs_per_image
|
|
||||||
]
|
|
||||||
|
|
||||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
|
|
||||||
vllm_outputs_per_image):
|
|
||||||
# TODO: Check whether using original CLIPVisionModel can improve
|
|
||||||
# consistency against HF
|
|
||||||
check_logprobs_close(
|
|
||||||
outputs_0_lst=hf_outputs,
|
|
||||||
outputs_1_lst=[
|
|
||||||
vllm_to_hf_output(vllm_output, model)
|
|
||||||
for vllm_output in vllm_outputs
|
|
||||||
],
|
|
||||||
name_0="hf",
|
|
||||||
name_1="vllm",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", models)
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"size_factors",
|
|
||||||
[
|
|
||||||
# No image
|
|
||||||
[],
|
|
||||||
# Single-scale
|
|
||||||
[1.0],
|
|
||||||
# Single-scale, batched
|
|
||||||
[1.0, 1.0, 1.0],
|
|
||||||
],
|
|
||||||
)
|
|
||||||
@pytest.mark.parametrize("dtype", ["half"])
|
|
||||||
@pytest.mark.parametrize("max_tokens", [128])
|
|
||||||
@pytest.mark.parametrize("num_logprobs", [5])
|
|
||||||
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
|
||||||
dtype: str, max_tokens: int, num_logprobs: int) -> None:
|
|
||||||
run_test(
|
|
||||||
hf_runner,
|
|
||||||
vllm_runner,
|
|
||||||
image_assets,
|
|
||||||
model,
|
|
||||||
size_factors=size_factors,
|
|
||||||
dtype=dtype,
|
|
||||||
max_tokens=max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
tensor_parallel_size=1,
|
|
||||||
)
|
|
||||||
@ -1,347 +0,0 @@
|
|||||||
from typing import List, Optional, Tuple, Type, overload
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
|
|
||||||
|
|
||||||
from vllm.inputs import InputContext
|
|
||||||
from vllm.multimodal.utils import rescale_image_size
|
|
||||||
from vllm.sequence import SampleLogprobs
|
|
||||||
|
|
||||||
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
|
||||||
_ImageAssets)
|
|
||||||
from ...utils import build_model_context, check_logprobs_close
|
|
||||||
|
|
||||||
_LIMIT_IMAGE_PER_PROMPT = 4
|
|
||||||
|
|
||||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
|
||||||
"stop_sign":
|
|
||||||
"[INST] <image>\nWhat's the content of the image? [/INST]",
|
|
||||||
"cherry_blossom":
|
|
||||||
"[INST] <image>\nWhat is the season? [/INST]",
|
|
||||||
})
|
|
||||||
|
|
||||||
models = ["llava-hf/llava-v1.6-mistral-7b-hf"]
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture()
|
|
||||||
def get_max_llava_next_image_tokens():
|
|
||||||
from vllm.model_executor.models.llava_next import (
|
|
||||||
get_max_llava_next_image_tokens)
|
|
||||||
return get_max_llava_next_image_tokens
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture()
|
|
||||||
def dummy_data_for_llava_next():
|
|
||||||
from vllm.model_executor.models.llava_next import dummy_data_for_llava_next
|
|
||||||
return dummy_data_for_llava_next
|
|
||||||
|
|
||||||
|
|
||||||
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
|
|
||||||
Optional[SampleLogprobs]],
|
|
||||||
model: str):
|
|
||||||
"""Sanitize vllm output to be comparable with hf output."""
|
|
||||||
output_ids, output_str, out_logprobs = vllm_output
|
|
||||||
|
|
||||||
config = AutoConfig.from_pretrained(model)
|
|
||||||
image_token_id = config.image_token_index
|
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
|
||||||
eos_token_id = tokenizer.eos_token_id
|
|
||||||
|
|
||||||
hf_output_ids = [
|
|
||||||
token_id for idx, token_id in enumerate(output_ids)
|
|
||||||
if token_id != image_token_id or output_ids[idx - 1] != image_token_id
|
|
||||||
]
|
|
||||||
|
|
||||||
assert output_str[0] == " "
|
|
||||||
hf_output_str = output_str[1:]
|
|
||||||
if hf_output_ids[-1] == eos_token_id:
|
|
||||||
hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
|
|
||||||
|
|
||||||
return hf_output_ids, hf_output_str, out_logprobs
|
|
||||||
|
|
||||||
|
|
||||||
@overload
|
|
||||||
def run_test(
|
|
||||||
hf_runner: Type[HfRunner],
|
|
||||||
vllm_runner: Type[VllmRunner],
|
|
||||||
image_assets: _ImageAssets,
|
|
||||||
model: str,
|
|
||||||
*,
|
|
||||||
size_factors: List[float],
|
|
||||||
dtype: str,
|
|
||||||
max_tokens: int,
|
|
||||||
num_logprobs: int,
|
|
||||||
tensor_parallel_size: int,
|
|
||||||
distributed_executor_backend: Optional[str] = None,
|
|
||||||
):
|
|
||||||
...
|
|
||||||
|
|
||||||
|
|
||||||
@overload
|
|
||||||
def run_test(
|
|
||||||
hf_runner: Type[HfRunner],
|
|
||||||
vllm_runner: Type[VllmRunner],
|
|
||||||
image_assets: _ImageAssets,
|
|
||||||
model: str,
|
|
||||||
*,
|
|
||||||
sizes: List[Tuple[int, int]],
|
|
||||||
dtype: str,
|
|
||||||
max_tokens: int,
|
|
||||||
num_logprobs: int,
|
|
||||||
tensor_parallel_size: int,
|
|
||||||
distributed_executor_backend: Optional[str] = None,
|
|
||||||
):
|
|
||||||
...
|
|
||||||
|
|
||||||
|
|
||||||
def run_test(
|
|
||||||
hf_runner: Type[HfRunner],
|
|
||||||
vllm_runner: Type[VllmRunner],
|
|
||||||
image_assets: _ImageAssets,
|
|
||||||
model: str,
|
|
||||||
*,
|
|
||||||
size_factors: Optional[List[float]] = None,
|
|
||||||
sizes: Optional[List[Tuple[int, int]]] = None,
|
|
||||||
dtype: str,
|
|
||||||
max_tokens: int,
|
|
||||||
num_logprobs: int,
|
|
||||||
tensor_parallel_size: int,
|
|
||||||
distributed_executor_backend: Optional[str] = None,
|
|
||||||
):
|
|
||||||
images = [asset.pil_image for asset in image_assets]
|
|
||||||
|
|
||||||
if size_factors is not None:
|
|
||||||
inputs_per_image = [(
|
|
||||||
[prompt for _ in size_factors],
|
|
||||||
[rescale_image_size(image, factor) for factor in size_factors],
|
|
||||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
|
||||||
elif sizes is not None:
|
|
||||||
inputs_per_image = [(
|
|
||||||
[prompt for _ in sizes],
|
|
||||||
[image.resize(size) for size in sizes],
|
|
||||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
|
||||||
else:
|
|
||||||
raise ValueError("You must provide either `size_factors` or `sizes`")
|
|
||||||
|
|
||||||
_run_test(hf_runner,
|
|
||||||
vllm_runner,
|
|
||||||
inputs_per_image,
|
|
||||||
model,
|
|
||||||
dtype=dtype,
|
|
||||||
max_tokens=max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
|
||||||
distributed_executor_backend=distributed_executor_backend)
|
|
||||||
|
|
||||||
|
|
||||||
def _run_test(
|
|
||||||
hf_runner: Type[HfRunner],
|
|
||||||
vllm_runner: Type[VllmRunner],
|
|
||||||
inputs: List[Tuple[List[str], PromptImageInput]],
|
|
||||||
model: str,
|
|
||||||
dtype: str,
|
|
||||||
max_tokens: int,
|
|
||||||
num_logprobs: int,
|
|
||||||
tensor_parallel_size: int,
|
|
||||||
distributed_executor_backend: Optional[str] = None,
|
|
||||||
):
|
|
||||||
# max_model_len should be greater than image_feature_size
|
|
||||||
with vllm_runner(model,
|
|
||||||
dtype=dtype,
|
|
||||||
max_model_len=10240,
|
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
|
||||||
distributed_executor_backend=distributed_executor_backend,
|
|
||||||
enforce_eager=True,
|
|
||||||
limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT
|
|
||||||
}) as vllm_model:
|
|
||||||
vllm_outputs_per_image = [
|
|
||||||
vllm_model.generate_greedy_logprobs(prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
images=images)
|
|
||||||
for prompts, images in inputs
|
|
||||||
]
|
|
||||||
|
|
||||||
with hf_runner(model, dtype=dtype,
|
|
||||||
auto_cls=AutoModelForVision2Seq) as hf_model:
|
|
||||||
hf_outputs_per_image = [
|
|
||||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
images=images)
|
|
||||||
for prompts, images in inputs
|
|
||||||
]
|
|
||||||
|
|
||||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
|
|
||||||
vllm_outputs_per_image):
|
|
||||||
# TODO: Check whether using original CLIPVisionModel can improve
|
|
||||||
# consistency against HF
|
|
||||||
check_logprobs_close(
|
|
||||||
outputs_0_lst=hf_outputs,
|
|
||||||
outputs_1_lst=[
|
|
||||||
vllm_to_hf_output(vllm_output, model)
|
|
||||||
for vllm_output in vllm_outputs
|
|
||||||
],
|
|
||||||
name_0="hf",
|
|
||||||
name_1="vllm",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", models)
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"size_factors",
|
|
||||||
[
|
|
||||||
# No image
|
|
||||||
[],
|
|
||||||
# Single-scale
|
|
||||||
[1.0],
|
|
||||||
# Single-scale, batched
|
|
||||||
[1.0, 1.0, 1.0],
|
|
||||||
# Multi-scale
|
|
||||||
[0.25, 0.5, 1.0],
|
|
||||||
],
|
|
||||||
)
|
|
||||||
@pytest.mark.parametrize("dtype", ["half"])
|
|
||||||
@pytest.mark.parametrize("max_tokens", [128])
|
|
||||||
@pytest.mark.parametrize("num_logprobs", [5])
|
|
||||||
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
|
||||||
dtype, max_tokens, num_logprobs) -> None:
|
|
||||||
"""Inference result should be the same between hf and vllm.
|
|
||||||
|
|
||||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
|
||||||
For huggingface runner, we provide the PIL images as input.
|
|
||||||
For vllm runner, we provide MultiModalDataDict objects
|
|
||||||
and corresponding MultiModalConfig as input.
|
|
||||||
Note, the text input is also adjusted to abide by vllm contract.
|
|
||||||
The text output is sanitized to be able to compare with hf.
|
|
||||||
"""
|
|
||||||
run_test(
|
|
||||||
hf_runner,
|
|
||||||
vllm_runner,
|
|
||||||
image_assets,
|
|
||||||
model,
|
|
||||||
size_factors=size_factors,
|
|
||||||
dtype=dtype,
|
|
||||||
max_tokens=max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
tensor_parallel_size=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", models)
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"sizes",
|
|
||||||
[[(1669, 2560), (2560, 1669), (183, 488), (488, 183)]],
|
|
||||||
)
|
|
||||||
@pytest.mark.parametrize("dtype", ["half"])
|
|
||||||
@pytest.mark.parametrize("max_tokens", [128])
|
|
||||||
@pytest.mark.parametrize("num_logprobs", [5])
|
|
||||||
def test_models_fixed_sizes(hf_runner, vllm_runner, image_assets, model, sizes,
|
|
||||||
dtype, max_tokens, num_logprobs) -> None:
|
|
||||||
run_test(
|
|
||||||
hf_runner,
|
|
||||||
vllm_runner,
|
|
||||||
image_assets,
|
|
||||||
model,
|
|
||||||
sizes=sizes,
|
|
||||||
dtype=dtype,
|
|
||||||
max_tokens=max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
tensor_parallel_size=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", models)
|
|
||||||
@pytest.mark.parametrize("dtype", ["half"])
|
|
||||||
@pytest.mark.parametrize("max_tokens", [128])
|
|
||||||
@pytest.mark.parametrize("num_logprobs", [5])
|
|
||||||
def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets,
|
|
||||||
model, dtype, max_tokens,
|
|
||||||
num_logprobs) -> None:
|
|
||||||
stop_sign = image_assets[0].pil_image
|
|
||||||
cherry_blossom = image_assets[1].pil_image
|
|
||||||
|
|
||||||
inputs = [(
|
|
||||||
[
|
|
||||||
"[INST] <image><image>\nDescribe 2 images. [/INST]",
|
|
||||||
"[INST] <image><image>\nDescribe 2 images. [/INST]",
|
|
||||||
"[INST] <image><image><image><image>\nDescribe 4 images. [/INST]",
|
|
||||||
"[INST] <image>\nWhat is the season? [/INST]"
|
|
||||||
],
|
|
||||||
[
|
|
||||||
[stop_sign, cherry_blossom],
|
|
||||||
# Images with different sizes and aspect-ratios
|
|
||||||
[
|
|
||||||
rescale_image_size(stop_sign, 0.1),
|
|
||||||
stop_sign,
|
|
||||||
],
|
|
||||||
[
|
|
||||||
stop_sign,
|
|
||||||
rescale_image_size(stop_sign, 0.25),
|
|
||||||
cherry_blossom.resize((183, 488)),
|
|
||||||
cherry_blossom.resize((488, 183))
|
|
||||||
],
|
|
||||||
cherry_blossom,
|
|
||||||
])]
|
|
||||||
|
|
||||||
_run_test(
|
|
||||||
hf_runner,
|
|
||||||
vllm_runner,
|
|
||||||
inputs,
|
|
||||||
model,
|
|
||||||
dtype=dtype,
|
|
||||||
max_tokens=max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
tensor_parallel_size=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("gridpoints,expected_max_tokens", [
|
|
||||||
([[336, 336]], 1176),
|
|
||||||
([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], 2928),
|
|
||||||
])
|
|
||||||
def test_get_max_llava_next_image_tokens(gridpoints, expected_max_tokens,
|
|
||||||
get_max_llava_next_image_tokens):
|
|
||||||
ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf")
|
|
||||||
|
|
||||||
# Update the config image_grid_pinpoints
|
|
||||||
# and calculate the resulting max tokens
|
|
||||||
ctx.model_config.hf_config.image_grid_pinpoints = gridpoints
|
|
||||||
|
|
||||||
actual_max_tokens = get_max_llava_next_image_tokens(
|
|
||||||
InputContext(ctx.model_config))
|
|
||||||
|
|
||||||
assert expected_max_tokens == actual_max_tokens
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"gridpoints,expected_size",
|
|
||||||
[
|
|
||||||
# One point; it has to be the largest
|
|
||||||
([[336, 336]], (336, 336)),
|
|
||||||
# Default for most llava next models; the 2x2 tile is the largest
|
|
||||||
([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]],
|
|
||||||
(672, 672)),
|
|
||||||
# If two rectangular gridpoints are the same, the more vertical
|
|
||||||
# one has the higher feature count due to newline features
|
|
||||||
([[336, 672], [672, 336]], (672, 336))
|
|
||||||
])
|
|
||||||
def test_dummy_data_for_llava_next_feature_size(dummy_data_for_llava_next,
|
|
||||||
gridpoints, expected_size):
|
|
||||||
ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf")
|
|
||||||
|
|
||||||
# Update the config image_grid_pinpoints
|
|
||||||
ctx.model_config.hf_config.image_grid_pinpoints = gridpoints
|
|
||||||
seq_len = 5000 # bigger than the max feature size for any image
|
|
||||||
|
|
||||||
seq_data, mm_data = dummy_data_for_llava_next(
|
|
||||||
ctx,
|
|
||||||
seq_len=seq_len,
|
|
||||||
mm_counts={"image": 1},
|
|
||||||
)
|
|
||||||
|
|
||||||
# The dummy data dims should match the gridpoint with the biggest feat size
|
|
||||||
assert mm_data["image"].height == expected_size[0]
|
|
||||||
assert mm_data["image"].width == expected_size[1]
|
|
||||||
assert len(seq_data.get_token_ids()) >= seq_len
|
|
||||||
@ -1,226 +0,0 @@
|
|||||||
from typing import List, Optional, Tuple, Type, overload
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
|
|
||||||
|
|
||||||
from vllm.multimodal.utils import (rescale_video_size, resize_video,
|
|
||||||
sample_frames_from_video)
|
|
||||||
from vllm.sequence import SampleLogprobs
|
|
||||||
|
|
||||||
from ....conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets
|
|
||||||
from ...utils import check_logprobs_close
|
|
||||||
|
|
||||||
_PREFACE = (
|
|
||||||
"A chat between a curious human and an artificial intelligence assistant. "
|
|
||||||
"The assistant gives helpful, detailed, and polite answers to the human's "
|
|
||||||
"questions.")
|
|
||||||
|
|
||||||
HF_VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
|
|
||||||
"sample_demo_1":
|
|
||||||
f"{_PREFACE}USER: <video>\nWhy is this video funny? ASSISTANT:"
|
|
||||||
})
|
|
||||||
|
|
||||||
models = ["llava-hf/LLaVA-NeXT-Video-7B-hf"]
|
|
||||||
|
|
||||||
|
|
||||||
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
|
|
||||||
Optional[SampleLogprobs]],
|
|
||||||
model: str):
|
|
||||||
"""Sanitize vllm output to be comparable with hf output."""
|
|
||||||
output_ids, output_str, out_logprobs = vllm_output
|
|
||||||
|
|
||||||
config = AutoConfig.from_pretrained(model)
|
|
||||||
video_token_id = config.video_token_index
|
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
|
||||||
eos_token_id = tokenizer.eos_token_id
|
|
||||||
|
|
||||||
hf_output_ids = [
|
|
||||||
token_id for idx, token_id in enumerate(output_ids)
|
|
||||||
if token_id != video_token_id or output_ids[idx - 1] != video_token_id
|
|
||||||
]
|
|
||||||
|
|
||||||
assert output_str[0] == " "
|
|
||||||
hf_output_str = output_str[1:]
|
|
||||||
if hf_output_ids[-1] == eos_token_id:
|
|
||||||
hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
|
|
||||||
|
|
||||||
return hf_output_ids, hf_output_str, out_logprobs
|
|
||||||
|
|
||||||
|
|
||||||
@overload
|
|
||||||
def run_test(
|
|
||||||
hf_runner: Type[HfRunner],
|
|
||||||
vllm_runner: Type[VllmRunner],
|
|
||||||
video_assets: _VideoAssets,
|
|
||||||
model: str,
|
|
||||||
*,
|
|
||||||
size_factors: List[float],
|
|
||||||
dtype: str,
|
|
||||||
max_tokens: int,
|
|
||||||
num_logprobs: int,
|
|
||||||
num_frames: int,
|
|
||||||
tensor_parallel_size: int,
|
|
||||||
distributed_executor_backend: Optional[str] = None,
|
|
||||||
):
|
|
||||||
...
|
|
||||||
|
|
||||||
|
|
||||||
@overload
|
|
||||||
def run_test(
|
|
||||||
hf_runner: Type[HfRunner],
|
|
||||||
vllm_runner: Type[VllmRunner],
|
|
||||||
video_assets: _VideoAssets,
|
|
||||||
model: str,
|
|
||||||
*,
|
|
||||||
sizes: List[Tuple[int, int]],
|
|
||||||
dtype: str,
|
|
||||||
max_tokens: int,
|
|
||||||
num_logprobs: int,
|
|
||||||
num_frames: int,
|
|
||||||
tensor_parallel_size: int,
|
|
||||||
distributed_executor_backend: Optional[str] = None,
|
|
||||||
):
|
|
||||||
...
|
|
||||||
|
|
||||||
|
|
||||||
def run_test(
|
|
||||||
hf_runner: Type[HfRunner],
|
|
||||||
vllm_runner: Type[VllmRunner],
|
|
||||||
video_assets: _VideoAssets,
|
|
||||||
model: str,
|
|
||||||
*,
|
|
||||||
size_factors: Optional[List[float]] = None,
|
|
||||||
sizes: Optional[List[Tuple[int, int]]] = None,
|
|
||||||
dtype: str,
|
|
||||||
max_tokens: int,
|
|
||||||
num_logprobs: int,
|
|
||||||
num_frames: int,
|
|
||||||
tensor_parallel_size: int,
|
|
||||||
distributed_executor_backend: Optional[str] = None,
|
|
||||||
):
|
|
||||||
videos = [
|
|
||||||
sample_frames_from_video(asset.np_ndarrays, num_frames)
|
|
||||||
for asset in video_assets
|
|
||||||
]
|
|
||||||
|
|
||||||
if size_factors is not None:
|
|
||||||
inputs_per_video = [(
|
|
||||||
[prompt for _ in size_factors],
|
|
||||||
[rescale_video_size(video, factor) for factor in size_factors],
|
|
||||||
) for video, prompt in zip(videos, HF_VIDEO_PROMPTS)]
|
|
||||||
elif sizes is not None:
|
|
||||||
inputs_per_video = [(
|
|
||||||
[prompt for _ in sizes],
|
|
||||||
[resize_video(video, size) for size in sizes],
|
|
||||||
) for video, prompt in zip(videos, HF_VIDEO_PROMPTS)]
|
|
||||||
else:
|
|
||||||
raise ValueError("You must provide either `size_factors` or `sizes`")
|
|
||||||
|
|
||||||
# max_model_len should be greater than image_feature_size
|
|
||||||
with vllm_runner(model,
|
|
||||||
dtype=dtype,
|
|
||||||
max_model_len=4096,
|
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
|
||||||
distributed_executor_backend=distributed_executor_backend,
|
|
||||||
enforce_eager=True) as vllm_model:
|
|
||||||
vllm_outputs_per_video = [
|
|
||||||
vllm_model.generate_greedy_logprobs(prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
videos=videos)
|
|
||||||
for prompts, videos in inputs_per_video
|
|
||||||
]
|
|
||||||
|
|
||||||
with hf_runner(model, dtype=dtype,
|
|
||||||
auto_cls=AutoModelForVision2Seq) as hf_model:
|
|
||||||
hf_outputs_per_video = [
|
|
||||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
videos=videos)
|
|
||||||
for prompts, videos in inputs_per_video
|
|
||||||
]
|
|
||||||
|
|
||||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_video,
|
|
||||||
vllm_outputs_per_video):
|
|
||||||
# TODO: Check whether using original CLIPVisionModel can improve
|
|
||||||
# consistency against HF
|
|
||||||
check_logprobs_close(
|
|
||||||
outputs_0_lst=hf_outputs,
|
|
||||||
outputs_1_lst=[
|
|
||||||
vllm_to_hf_output(vllm_output, model)
|
|
||||||
for vllm_output in vllm_outputs
|
|
||||||
],
|
|
||||||
name_0="hf",
|
|
||||||
name_1="vllm",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", models)
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"size_factors",
|
|
||||||
[
|
|
||||||
# No video
|
|
||||||
[],
|
|
||||||
# Single-scale
|
|
||||||
[1.0],
|
|
||||||
# Single-scale, batched
|
|
||||||
[1.0, 1.0, 1.0],
|
|
||||||
# Multi-scale
|
|
||||||
[0.25, 0.5, 1.0],
|
|
||||||
],
|
|
||||||
)
|
|
||||||
@pytest.mark.parametrize("dtype", ["half"])
|
|
||||||
@pytest.mark.parametrize("max_tokens", [128])
|
|
||||||
@pytest.mark.parametrize("num_logprobs", [5])
|
|
||||||
@pytest.mark.parametrize("num_frames", [16])
|
|
||||||
def test_models(hf_runner, vllm_runner, video_assets, model, size_factors,
|
|
||||||
dtype, max_tokens, num_logprobs, num_frames) -> None:
|
|
||||||
"""Inference result should be the same between hf and vllm.
|
|
||||||
|
|
||||||
All the image fixtures for the test is under tests/videos.
|
|
||||||
For huggingface runner, we provide the np.ndarray as input.
|
|
||||||
For vllm runner, we provide MultiModalDataDict objects
|
|
||||||
and corresponding MultiModalConfig as input.
|
|
||||||
Note, the text input is also adjusted to abide by vllm contract.
|
|
||||||
The text output is sanitized to be able to compare with hf.
|
|
||||||
"""
|
|
||||||
run_test(
|
|
||||||
hf_runner,
|
|
||||||
vllm_runner,
|
|
||||||
video_assets,
|
|
||||||
model,
|
|
||||||
size_factors=size_factors,
|
|
||||||
dtype=dtype,
|
|
||||||
max_tokens=max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
num_frames=num_frames,
|
|
||||||
tensor_parallel_size=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", models)
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"sizes",
|
|
||||||
[[(1669, 2560), (2560, 1669), (183, 488), (488, 183)]],
|
|
||||||
)
|
|
||||||
@pytest.mark.parametrize("dtype", ["half"])
|
|
||||||
@pytest.mark.parametrize("max_tokens", [128])
|
|
||||||
@pytest.mark.parametrize("num_logprobs", [5])
|
|
||||||
@pytest.mark.parametrize("num_frames", [16])
|
|
||||||
def test_models_fixed_sizes(hf_runner, vllm_runner, video_assets, model, sizes,
|
|
||||||
dtype, max_tokens, num_logprobs,
|
|
||||||
num_frames) -> None:
|
|
||||||
run_test(
|
|
||||||
hf_runner,
|
|
||||||
vllm_runner,
|
|
||||||
video_assets,
|
|
||||||
model,
|
|
||||||
sizes=sizes,
|
|
||||||
dtype=dtype,
|
|
||||||
max_tokens=max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
num_frames=num_frames,
|
|
||||||
tensor_parallel_size=1,
|
|
||||||
)
|
|
||||||
@ -1,272 +0,0 @@
|
|||||||
from typing import List, Optional, Tuple, Type
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
|
|
||||||
BatchEncoding)
|
|
||||||
|
|
||||||
from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
|
|
||||||
resize_video, sample_frames_from_video)
|
|
||||||
from vllm.sequence import SampleLogprobs
|
|
||||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
|
||||||
|
|
||||||
from ....conftest import (VIDEO_ASSETS, HfRunner, PromptImageInput,
|
|
||||||
PromptVideoInput, VllmRunner)
|
|
||||||
from ...utils import check_logprobs_close
|
|
||||||
|
|
||||||
# Video test
|
|
||||||
HF_VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
|
|
||||||
"sample_demo_1":
|
|
||||||
"<|im_start|>user\n<video>\nwhy is this video funny?<|im_end|>\n<|im_start|>assistant\n" # noqa: E501
|
|
||||||
})
|
|
||||||
|
|
||||||
models = ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]
|
|
||||||
|
|
||||||
|
|
||||||
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
|
|
||||||
Optional[SampleLogprobs]],
|
|
||||||
model: str):
|
|
||||||
"""Sanitize vllm output to be comparable with hf output."""
|
|
||||||
output_ids, output_str, out_logprobs = vllm_output
|
|
||||||
|
|
||||||
config = AutoConfig.from_pretrained(model)
|
|
||||||
video_token_id = config.video_token_index
|
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
|
||||||
eos_token_id = tokenizer.eos_token_id
|
|
||||||
|
|
||||||
hf_output_ids = [
|
|
||||||
token_id for idx, token_id in enumerate(output_ids)
|
|
||||||
if token_id != video_token_id or output_ids[idx - 1] != video_token_id
|
|
||||||
]
|
|
||||||
|
|
||||||
hf_output_str = output_str
|
|
||||||
if hf_output_ids[-1] == eos_token_id:
|
|
||||||
hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
|
|
||||||
|
|
||||||
return hf_output_ids, hf_output_str, out_logprobs
|
|
||||||
|
|
||||||
|
|
||||||
# Video test
|
|
||||||
_LIMIT_VIDEO_PER_PROMPT = 4
|
|
||||||
|
|
||||||
|
|
||||||
def run_video_test(
|
|
||||||
hf_runner: Type[HfRunner],
|
|
||||||
vllm_runner: Type[VllmRunner],
|
|
||||||
inputs: List[Tuple[List[str], PromptVideoInput]],
|
|
||||||
model: str,
|
|
||||||
*,
|
|
||||||
dtype: str,
|
|
||||||
max_tokens: int,
|
|
||||||
num_logprobs: int,
|
|
||||||
num_frames: int,
|
|
||||||
tensor_parallel_size: int,
|
|
||||||
distributed_executor_backend: Optional[str] = None,
|
|
||||||
):
|
|
||||||
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
|
|
||||||
with vllm_runner(model,
|
|
||||||
dtype=dtype,
|
|
||||||
max_model_len=16384,
|
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
|
||||||
distributed_executor_backend=distributed_executor_backend,
|
|
||||||
enforce_eager=True,
|
|
||||||
limit_mm_per_prompt={"video": _LIMIT_VIDEO_PER_PROMPT
|
|
||||||
}) as vllm_model:
|
|
||||||
vllm_outputs_per_input = [
|
|
||||||
vllm_model.generate_greedy_logprobs(prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
videos=videos)
|
|
||||||
for prompts, videos in inputs
|
|
||||||
]
|
|
||||||
|
|
||||||
def process(hf_inputs: BatchEncoding):
|
|
||||||
hf_inputs["pixel_values_videos"] = hf_inputs["pixel_values_videos"] \
|
|
||||||
.to(torch_dtype) # type: ignore
|
|
||||||
return hf_inputs
|
|
||||||
|
|
||||||
with hf_runner(model,
|
|
||||||
dtype=dtype,
|
|
||||||
postprocess_inputs=process,
|
|
||||||
auto_cls=AutoModelForVision2Seq) as hf_model:
|
|
||||||
hf_outputs_per_input = [
|
|
||||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
videos=videos)
|
|
||||||
for prompts, videos in inputs
|
|
||||||
]
|
|
||||||
|
|
||||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_input,
|
|
||||||
vllm_outputs_per_input):
|
|
||||||
# TODO: Check whether using original CLIPVisionModel can improve
|
|
||||||
# consistency against HF
|
|
||||||
check_logprobs_close(
|
|
||||||
outputs_0_lst=hf_outputs,
|
|
||||||
outputs_1_lst=[
|
|
||||||
vllm_to_hf_output(vllm_output, model)
|
|
||||||
for vllm_output in vllm_outputs
|
|
||||||
],
|
|
||||||
name_0="hf",
|
|
||||||
name_1="vllm",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", models)
|
|
||||||
@pytest.mark.parametrize("dtype", ["half"])
|
|
||||||
@pytest.mark.parametrize("max_tokens", [128])
|
|
||||||
@pytest.mark.parametrize("num_logprobs", [5])
|
|
||||||
@pytest.mark.parametrize("num_frames", [16])
|
|
||||||
def test_models_multiple_video_inputs(hf_runner, vllm_runner, video_assets,
|
|
||||||
model, dtype, max_tokens, num_logprobs,
|
|
||||||
num_frames) -> None:
|
|
||||||
video = sample_frames_from_video(video_assets[0].np_ndarrays, num_frames)
|
|
||||||
inputs = [(
|
|
||||||
[
|
|
||||||
"<|im_start|>user <video><video>\nDescribe 2 videos. \
|
|
||||||
<|im_end|><|im_start|>assistant\n",
|
|
||||||
"<|im_start|>user <video><video>\nDescribe 2 videos. \
|
|
||||||
<|im_end|><|im_start|>assistant\n",
|
|
||||||
"<|im_start|>user <video><video><video><video>\nDescribe 4 videos. \
|
|
||||||
<|im_end|><|im_start|>assistant\n",
|
|
||||||
"<|im_start|>user <video>\nwhy is this video funny? \
|
|
||||||
<|im_end|><|im_start|>assistant\n",
|
|
||||||
],
|
|
||||||
[
|
|
||||||
[video, video],
|
|
||||||
# Images with different sizes and aspect-ratios
|
|
||||||
[
|
|
||||||
rescale_video_size(video, 0.1),
|
|
||||||
video,
|
|
||||||
],
|
|
||||||
[
|
|
||||||
video,
|
|
||||||
rescale_video_size(video, 0.25),
|
|
||||||
resize_video(video, (183, 488)),
|
|
||||||
resize_video(video, (488, 183))
|
|
||||||
],
|
|
||||||
video,
|
|
||||||
])]
|
|
||||||
run_video_test(
|
|
||||||
hf_runner,
|
|
||||||
vllm_runner,
|
|
||||||
inputs,
|
|
||||||
model,
|
|
||||||
dtype=dtype,
|
|
||||||
max_tokens=max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
tensor_parallel_size=1,
|
|
||||||
num_frames=num_frames,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# Image test
|
|
||||||
_LIMIT_IMAGE_PER_PROMPT = 4
|
|
||||||
|
|
||||||
|
|
||||||
def run_image_test(
|
|
||||||
hf_runner: Type[HfRunner],
|
|
||||||
vllm_runner: Type[VllmRunner],
|
|
||||||
inputs: List[Tuple[List[str], PromptImageInput]],
|
|
||||||
model: str,
|
|
||||||
dtype: str,
|
|
||||||
max_tokens: int,
|
|
||||||
num_logprobs: int,
|
|
||||||
tensor_parallel_size: int,
|
|
||||||
distributed_executor_backend: Optional[str] = None,
|
|
||||||
):
|
|
||||||
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
|
|
||||||
|
|
||||||
# max_model_len should be greater than image_feature_size
|
|
||||||
with vllm_runner(model,
|
|
||||||
dtype=dtype,
|
|
||||||
max_model_len=16384,
|
|
||||||
max_num_seqs=2,
|
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
|
||||||
distributed_executor_backend=distributed_executor_backend,
|
|
||||||
enforce_eager=True,
|
|
||||||
limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT
|
|
||||||
}) as vllm_model:
|
|
||||||
vllm_outputs_per_image = [
|
|
||||||
vllm_model.generate_greedy_logprobs(prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
images=images)
|
|
||||||
for prompts, images in inputs
|
|
||||||
]
|
|
||||||
|
|
||||||
def process(hf_inputs: BatchEncoding):
|
|
||||||
hf_inputs["pixel_values"] = hf_inputs["pixel_values"] \
|
|
||||||
.to(torch_dtype) # type: ignore
|
|
||||||
return hf_inputs
|
|
||||||
|
|
||||||
with hf_runner(model,
|
|
||||||
dtype=dtype,
|
|
||||||
postprocess_inputs=process,
|
|
||||||
auto_cls=AutoModelForVision2Seq) as hf_model:
|
|
||||||
hf_outputs_per_image = [
|
|
||||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
images=images)
|
|
||||||
for prompts, images in inputs
|
|
||||||
]
|
|
||||||
|
|
||||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
|
|
||||||
vllm_outputs_per_image):
|
|
||||||
# TODO: Check whether using original CLIPVisionModel can improve
|
|
||||||
# consistency against HF
|
|
||||||
check_logprobs_close(
|
|
||||||
outputs_0_lst=hf_outputs,
|
|
||||||
outputs_1_lst=[
|
|
||||||
vllm_to_hf_output(vllm_output, model)
|
|
||||||
for vllm_output in vllm_outputs
|
|
||||||
],
|
|
||||||
name_0="hf",
|
|
||||||
name_1="vllm",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", models)
|
|
||||||
@pytest.mark.parametrize("dtype", ["half"])
|
|
||||||
@pytest.mark.parametrize("max_tokens", [128])
|
|
||||||
@pytest.mark.parametrize("num_logprobs", [5])
|
|
||||||
def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets,
|
|
||||||
model, dtype, max_tokens,
|
|
||||||
num_logprobs) -> None:
|
|
||||||
stop_sign = image_assets[0].pil_image
|
|
||||||
cherry_blossom = image_assets[1].pil_image
|
|
||||||
|
|
||||||
inputs = [(
|
|
||||||
[
|
|
||||||
"<|im_start|>user\n<image><image>\nDescribe 2 images.<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
|
||||||
"<|im_start|>user\n<image><image>\nDescribe 2 images.<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
|
||||||
"<|im_start|>user\n<image><image><image><image>\nDescribe 4 images.<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
|
||||||
"<|im_start|>user\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
|
||||||
],
|
|
||||||
[
|
|
||||||
[stop_sign, cherry_blossom],
|
|
||||||
# Images with different sizes and aspect-ratios
|
|
||||||
[
|
|
||||||
rescale_image_size(stop_sign, 0.1),
|
|
||||||
stop_sign,
|
|
||||||
],
|
|
||||||
[
|
|
||||||
stop_sign,
|
|
||||||
rescale_image_size(stop_sign, 0.25),
|
|
||||||
cherry_blossom.resize((183, 488)),
|
|
||||||
cherry_blossom.resize((488, 183))
|
|
||||||
],
|
|
||||||
cherry_blossom,
|
|
||||||
])]
|
|
||||||
|
|
||||||
run_image_test(
|
|
||||||
hf_runner,
|
|
||||||
vllm_runner,
|
|
||||||
inputs,
|
|
||||||
model,
|
|
||||||
dtype=dtype,
|
|
||||||
max_tokens=max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
tensor_parallel_size=1,
|
|
||||||
)
|
|
||||||
@ -1,199 +0,0 @@
|
|||||||
from typing import List, Optional, Tuple, Type, Union
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
import torch
|
|
||||||
import torch.types
|
|
||||||
from PIL import Image
|
|
||||||
from transformers import BatchEncoding
|
|
||||||
|
|
||||||
from vllm.multimodal.utils import rescale_image_size
|
|
||||||
from vllm.sequence import SampleLogprobs
|
|
||||||
|
|
||||||
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner
|
|
||||||
from ...utils import check_logprobs_close
|
|
||||||
|
|
||||||
# The image token is placed before "user" on purpose so that the test can pass
|
|
||||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
|
||||||
"stop_sign":
|
|
||||||
"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" \
|
|
||||||
"(<image>./</image>)\nWhat's the content of the image?<|eot_id|>" \
|
|
||||||
"<|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
|
|
||||||
"cherry_blossom":
|
|
||||||
"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" \
|
|
||||||
"(<image>./</image>)\nWhat is the season?<|eot_id|>" \
|
|
||||||
"<|start_header_id|>assistant<|end_header_id|>\n\n",
|
|
||||||
})
|
|
||||||
HF_MULTIIMAGE_IMAGE_PROMPT = \
|
|
||||||
"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" \
|
|
||||||
"(<image>./</image>)\n(<image>./</image>)\n" \
|
|
||||||
"Describe these images.<|eot_id|>" \
|
|
||||||
"<|start_header_id|>assistant<|end_header_id|>\n\n"
|
|
||||||
|
|
||||||
models = ["openbmb/MiniCPM-Llama3-V-2_5"]
|
|
||||||
|
|
||||||
|
|
||||||
def _wrap_inputs(hf_inputs: BatchEncoding):
|
|
||||||
return {"model_inputs": hf_inputs}
|
|
||||||
|
|
||||||
|
|
||||||
def trunc_hf_output(hf_output: Tuple[List[int], str,
|
|
||||||
Optional[SampleLogprobs]]):
|
|
||||||
output_ids, output_str, out_logprobs = hf_output
|
|
||||||
if output_str.endswith("<|eot_id|>"):
|
|
||||||
output_str = output_str.split("<|eot_id|>")[0]
|
|
||||||
return output_ids, output_str, out_logprobs
|
|
||||||
|
|
||||||
|
|
||||||
target_dtype = "half"
|
|
||||||
|
|
||||||
|
|
||||||
def run_test(
|
|
||||||
hf_runner: Type[HfRunner],
|
|
||||||
vllm_runner: Type[VllmRunner],
|
|
||||||
inputs: List[Tuple[List[str], Union[List[Image.Image],
|
|
||||||
List[List[Image.Image]]]]],
|
|
||||||
model: str,
|
|
||||||
*,
|
|
||||||
dtype: str,
|
|
||||||
max_tokens: int,
|
|
||||||
num_logprobs: int,
|
|
||||||
mm_limit: int,
|
|
||||||
tensor_parallel_size: int,
|
|
||||||
distributed_executor_backend: Optional[str] = None,
|
|
||||||
):
|
|
||||||
"""Inference result should be the same between hf and vllm.
|
|
||||||
|
|
||||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
|
||||||
For huggingface runner, we provide the PIL images as input.
|
|
||||||
For vllm runner, we provide MultiModalDataDict objects
|
|
||||||
and corresponding MultiModalConfig as input.
|
|
||||||
Note, the text input is also adjusted to abide by vllm contract.
|
|
||||||
The text output is sanitized to be able to compare with hf.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
|
||||||
# vLLM needs a fresh new process without cuda initialization.
|
|
||||||
# if we run HF first, the cuda initialization will be done and it
|
|
||||||
# will hurt multiprocessing backend with fork method (the default method).
|
|
||||||
|
|
||||||
# max_model_len should be greater than image_feature_size
|
|
||||||
with vllm_runner(model,
|
|
||||||
max_model_len=4096,
|
|
||||||
max_num_seqs=2,
|
|
||||||
dtype=dtype,
|
|
||||||
limit_mm_per_prompt={"image": mm_limit},
|
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
|
||||||
distributed_executor_backend=distributed_executor_backend,
|
|
||||||
enforce_eager=True) as vllm_model:
|
|
||||||
tokenizer = vllm_model.model.get_tokenizer()
|
|
||||||
stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
|
|
||||||
vllm_outputs_per_image = [
|
|
||||||
vllm_model.generate_greedy_logprobs(prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
images=images,
|
|
||||||
stop_token_ids=stop_token_ids)
|
|
||||||
for prompts, images in inputs
|
|
||||||
]
|
|
||||||
|
|
||||||
hf_model = hf_runner(model, dtype=dtype, postprocess_inputs=_wrap_inputs)
|
|
||||||
with hf_model, torch.no_grad():
|
|
||||||
hf_outputs_per_image = [
|
|
||||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
images=images,
|
|
||||||
tokenizer=tokenizer)
|
|
||||||
for prompts, images in inputs
|
|
||||||
]
|
|
||||||
|
|
||||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
|
|
||||||
vllm_outputs_per_image):
|
|
||||||
check_logprobs_close(
|
|
||||||
outputs_0_lst=[
|
|
||||||
trunc_hf_output(hf_output) for hf_output in hf_outputs
|
|
||||||
],
|
|
||||||
outputs_1_lst=vllm_outputs,
|
|
||||||
name_0="hf",
|
|
||||||
name_1="vllm",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", models)
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"size_factors",
|
|
||||||
[
|
|
||||||
# No image
|
|
||||||
[],
|
|
||||||
# Single-scale
|
|
||||||
[1.0],
|
|
||||||
# Single-scale, batched
|
|
||||||
[1.0, 1.0, 1.0],
|
|
||||||
# Multi-scale
|
|
||||||
[0.25, 0.5, 1.0],
|
|
||||||
],
|
|
||||||
)
|
|
||||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
|
||||||
@pytest.mark.parametrize("max_tokens", [128])
|
|
||||||
@pytest.mark.parametrize("num_logprobs", [5])
|
|
||||||
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
|
||||||
dtype: str, max_tokens: int, num_logprobs: int) -> None:
|
|
||||||
images = [asset.pil_image for asset in image_assets]
|
|
||||||
|
|
||||||
inputs_per_image = [(
|
|
||||||
[prompt for _ in size_factors],
|
|
||||||
[rescale_image_size(image, factor) for factor in size_factors],
|
|
||||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
|
||||||
|
|
||||||
run_test(
|
|
||||||
hf_runner,
|
|
||||||
vllm_runner,
|
|
||||||
inputs_per_image,
|
|
||||||
model,
|
|
||||||
dtype=dtype,
|
|
||||||
max_tokens=max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
mm_limit=1,
|
|
||||||
tensor_parallel_size=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", models)
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"size_factors",
|
|
||||||
[
|
|
||||||
# No image
|
|
||||||
[],
|
|
||||||
# Single-scale
|
|
||||||
[1.0],
|
|
||||||
# Single-scale, batched
|
|
||||||
[1.0, 1.0, 1.0],
|
|
||||||
# Multi-scale
|
|
||||||
[0.25, 0.5, 1.0],
|
|
||||||
],
|
|
||||||
)
|
|
||||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
|
||||||
@pytest.mark.parametrize("max_tokens", [128])
|
|
||||||
@pytest.mark.parametrize("num_logprobs", [5])
|
|
||||||
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
|
|
||||||
size_factors, dtype: str, max_tokens: int,
|
|
||||||
num_logprobs: int) -> None:
|
|
||||||
images = [asset.pil_image for asset in image_assets]
|
|
||||||
|
|
||||||
inputs_per_case = [
|
|
||||||
([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
|
|
||||||
[[rescale_image_size(image, factor) for image in images]
|
|
||||||
for factor in size_factors])
|
|
||||||
]
|
|
||||||
|
|
||||||
run_test(
|
|
||||||
hf_runner,
|
|
||||||
vllm_runner,
|
|
||||||
inputs_per_case,
|
|
||||||
model,
|
|
||||||
dtype=dtype,
|
|
||||||
max_tokens=max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
mm_limit=2,
|
|
||||||
tensor_parallel_size=1,
|
|
||||||
)
|
|
||||||
594
tests/models/decoder_only/vision_language/test_models.py
Normal file
594
tests/models/decoder_only/vision_language/test_models.py
Normal file
@ -0,0 +1,594 @@
|
|||||||
|
"""Common tests for testing .generate() functionality for single / multiple
|
||||||
|
image, embedding, and video support for different VLMs in vLLM.
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
from pathlib import PosixPath
|
||||||
|
from typing import Type
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import transformers
|
||||||
|
from transformers import AutoModelForVision2Seq
|
||||||
|
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
from vllm.utils import cuda_device_count_stateless, identity
|
||||||
|
|
||||||
|
from ....conftest import (IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets,
|
||||||
|
_VideoAssets)
|
||||||
|
from ....utils import fork_new_process_for_each_test, large_gpu_mark
|
||||||
|
from ...utils import check_outputs_equal
|
||||||
|
from .vlm_utils import custom_inputs, model_utils, runners
|
||||||
|
from .vlm_utils.case_filtering import get_parametrized_options
|
||||||
|
from .vlm_utils.types import (CustomTestOptions, ExpandableVLMTestArgs,
|
||||||
|
VLMTestInfo, VLMTestType)
|
||||||
|
|
||||||
|
# This hack is needed for phi3v & paligemma models
|
||||||
|
# ROCm Triton FA can run into shared memory issues with these models,
|
||||||
|
# use other backends in the meantime
|
||||||
|
# FIXME (mattwong, gshtrasb, hongxiayan)
|
||||||
|
if current_platform.is_rocm():
|
||||||
|
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
|
||||||
|
|
||||||
|
# yapf: disable
|
||||||
|
COMMON_BROADCAST_SETTINGS = {
|
||||||
|
"test_type": VLMTestType.IMAGE,
|
||||||
|
"dtype": "half",
|
||||||
|
"max_tokens": 5,
|
||||||
|
"tensor_parallel_size": 2,
|
||||||
|
"image_size_factors": [(.25, 0.5, 1.0)],
|
||||||
|
"distributed_executor_backend": (
|
||||||
|
"ray",
|
||||||
|
"mp",
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
### Test configuration for specific models
|
||||||
|
# NOTE: The convention of the test settings below is to lead each test key
|
||||||
|
# with the name of the model arch used in the test, using underscores in place
|
||||||
|
# of hyphens; this makes it more convenient to filter tests for a specific kind
|
||||||
|
# of model. For example....
|
||||||
|
#
|
||||||
|
# To run all test types for a specific key:
|
||||||
|
# use the k flag to substring match with a leading square bracket; if the
|
||||||
|
# model arch happens to be a substring of another one, you can add a
|
||||||
|
# trailing hyphen. E.g.,
|
||||||
|
# - pytest $TEST_FILE -k "[llava-"
|
||||||
|
# prevents matching on "[llava_next-" & will match just the enabled cases
|
||||||
|
# for llava, i.e., single image, image embedding, and custom input tests.
|
||||||
|
#
|
||||||
|
# To run a test for a Test Info for just one of multiple models:
|
||||||
|
# use the k flag to substring match the model name, e.g.,
|
||||||
|
# - pytest $TEST_FILE -k OpenGVLab/InternVL2-1B
|
||||||
|
# prevents matching on nGVLab/InternVL2-2B.
|
||||||
|
#
|
||||||
|
# You can also combine substrings to match more granularly.
|
||||||
|
# ex 1:
|
||||||
|
# pytest $TEST_FILE -k "test_single_image and OpenGVLab/InternVL2-1B"
|
||||||
|
# will run only test_single_image* for OpenGVLab/InternVL2-1B; this would
|
||||||
|
# match both wrappers for single image tests, since it also matches
|
||||||
|
# test_single_image_heavy (which forks if we have a distributed backend)
|
||||||
|
# ex 2:
|
||||||
|
# pytest $TEST_FILE -k "[llava- or [intern_vl-"
|
||||||
|
# will run all of the tests for only llava & internvl.
|
||||||
|
#
|
||||||
|
# NOTE you can add --collect-only to any of the above commands to see
|
||||||
|
# which cases would be selected and deselected by pytest. In general,
|
||||||
|
# this is a good idea for checking your command first, since tests are slow.
|
||||||
|
|
||||||
|
VLM_TEST_SETTINGS = {
|
||||||
|
"blip2": VLMTestInfo(
|
||||||
|
models=["Salesforce/blip2-opt-2.7b"],
|
||||||
|
test_type=VLMTestType.IMAGE,
|
||||||
|
prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
|
||||||
|
img_idx_to_prompt=lambda idx: "",
|
||||||
|
auto_cls=AutoModelForVision2Seq,
|
||||||
|
vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
|
||||||
|
),
|
||||||
|
"chameleon": VLMTestInfo(
|
||||||
|
models=["facebook/chameleon-7b"],
|
||||||
|
test_type=VLMTestType.IMAGE,
|
||||||
|
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
||||||
|
max_model_len=4096,
|
||||||
|
auto_cls=AutoModelForVision2Seq,
|
||||||
|
postprocess_inputs=model_utils.get_key_type_post_processor(
|
||||||
|
"pixel_values"
|
||||||
|
),
|
||||||
|
# For chameleon, we only compare the sequences
|
||||||
|
vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
|
||||||
|
hf_output_post_proc = lambda hf_output, model: hf_output[:2],
|
||||||
|
comparator=check_outputs_equal,
|
||||||
|
max_tokens=8,
|
||||||
|
dtype="bfloat16",
|
||||||
|
marks=[
|
||||||
|
pytest.mark.skipif(
|
||||||
|
transformers.__version__.startswith("4.46"),
|
||||||
|
reason="Model broken in HF, see huggingface/transformers#34379"
|
||||||
|
)
|
||||||
|
]
|
||||||
|
),
|
||||||
|
"fuyu": VLMTestInfo(
|
||||||
|
models=["adept/fuyu-8b"],
|
||||||
|
test_type=VLMTestType.IMAGE,
|
||||||
|
prompt_formatter=lambda img_prompt: f"{img_prompt}\n",
|
||||||
|
img_idx_to_prompt=lambda idx: "",
|
||||||
|
max_model_len=2048,
|
||||||
|
max_num_seqs=2,
|
||||||
|
use_tokenizer_eos=True,
|
||||||
|
vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
|
||||||
|
num_logprobs=10,
|
||||||
|
dtype="bfloat16" if current_platform.is_cpu() else "half",
|
||||||
|
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
|
||||||
|
),
|
||||||
|
"glm4": VLMTestInfo(
|
||||||
|
models=["THUDM/glm-4v-9b"],
|
||||||
|
test_type=VLMTestType.IMAGE,
|
||||||
|
prompt_formatter=identity,
|
||||||
|
img_idx_to_prompt=lambda idx: "",
|
||||||
|
max_model_len=2048,
|
||||||
|
max_num_seqs=2,
|
||||||
|
dtype="bfloat16",
|
||||||
|
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
|
||||||
|
marks=[large_gpu_mark(min_gb=48)],
|
||||||
|
patch_hf_runner=model_utils.glm_patch_hf_runner,
|
||||||
|
),
|
||||||
|
"intern_vl": VLMTestInfo(
|
||||||
|
models=[
|
||||||
|
"OpenGVLab/InternVL2-1B",
|
||||||
|
"OpenGVLab/InternVL2-2B",
|
||||||
|
"OpenGVLab/Mono-InternVL-2B",
|
||||||
|
],
|
||||||
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||||
|
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||||
|
single_image_prompts=IMAGE_ASSETS.prompts({
|
||||||
|
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501
|
||||||
|
"cherry_blossom": "<image>\nWhat is the season?",
|
||||||
|
}),
|
||||||
|
multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501
|
||||||
|
max_model_len=4096,
|
||||||
|
# NOTE: Mono-InternVL-2B doesn't work with fp16,
|
||||||
|
# it will result NaN during inference.
|
||||||
|
# See: https://huggingface.co/OpenGVLab/Mono-InternVL-2B/discussions/9
|
||||||
|
dtype="bfloat16",
|
||||||
|
use_tokenizer_eos=True,
|
||||||
|
patch_hf_runner=model_utils.internvl_patch_hf_runner,
|
||||||
|
),
|
||||||
|
"llava": VLMTestInfo(
|
||||||
|
models=["llava-hf/llava-1.5-7b-hf"],
|
||||||
|
test_type=(
|
||||||
|
VLMTestType.EMBEDDING,
|
||||||
|
VLMTestType.IMAGE,
|
||||||
|
VLMTestType.CUSTOM_INPUTS
|
||||||
|
),
|
||||||
|
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
||||||
|
convert_assets_to_embeddings=model_utils.get_llava_embeddings,
|
||||||
|
max_model_len=4096,
|
||||||
|
auto_cls=AutoModelForVision2Seq,
|
||||||
|
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
||||||
|
custom_test_opts=[CustomTestOptions(
|
||||||
|
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
|
||||||
|
formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
|
||||||
|
),
|
||||||
|
limit_mm_per_prompt={"image": 4},
|
||||||
|
)],
|
||||||
|
),
|
||||||
|
"llava_next": VLMTestInfo(
|
||||||
|
models=["llava-hf/llava-v1.6-mistral-7b-hf"],
|
||||||
|
test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
|
||||||
|
prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
|
||||||
|
max_model_len=10240,
|
||||||
|
auto_cls=AutoModelForVision2Seq,
|
||||||
|
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
||||||
|
custom_test_opts=[CustomTestOptions(
|
||||||
|
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
|
||||||
|
formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]"
|
||||||
|
),
|
||||||
|
limit_mm_per_prompt={"image": 4},
|
||||||
|
)],
|
||||||
|
# Llava-next tests fixed sizes & the default size factors
|
||||||
|
image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
|
||||||
|
),
|
||||||
|
"llava_one_vision": VLMTestInfo(
|
||||||
|
models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
|
||||||
|
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||||
|
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||||
|
dtype="half",
|
||||||
|
num_video_frames=16,
|
||||||
|
max_model_len=16384,
|
||||||
|
postprocess_inputs=model_utils.get_key_type_post_processor(
|
||||||
|
"pixel_values_videos"
|
||||||
|
),
|
||||||
|
auto_cls=AutoModelForVision2Seq,
|
||||||
|
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
|
||||||
|
# Llava-one-vision tests fixed sizes & the default size factors
|
||||||
|
image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
|
||||||
|
runner_mm_key="videos",
|
||||||
|
custom_test_opts=[CustomTestOptions(
|
||||||
|
inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs(
|
||||||
|
formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||||
|
),
|
||||||
|
limit_mm_per_prompt={"video": 4},
|
||||||
|
)],
|
||||||
|
),
|
||||||
|
# FIXME
|
||||||
|
"llava_next_video": VLMTestInfo(
|
||||||
|
models=["llava-hf/LLaVA-NeXT-Video-7B-hf"],
|
||||||
|
test_type=VLMTestType.VIDEO,
|
||||||
|
prompt_formatter=lambda vid_prompt: f"USER: {vid_prompt} ASSISTANT:",
|
||||||
|
num_video_frames=16,
|
||||||
|
max_model_len=4096,
|
||||||
|
auto_cls=AutoModelForVision2Seq,
|
||||||
|
vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output,
|
||||||
|
image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
|
||||||
|
runner_mm_key="videos",
|
||||||
|
marks=[
|
||||||
|
pytest.mark.skip(reason="LLava next video tests currently fail.")
|
||||||
|
],
|
||||||
|
),
|
||||||
|
"minicpmv": VLMTestInfo(
|
||||||
|
models=["openbmb/MiniCPM-Llama3-V-2_5"],
|
||||||
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||||
|
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
|
||||||
|
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
|
||||||
|
max_model_len=4096,
|
||||||
|
max_num_seqs=2,
|
||||||
|
get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id],
|
||||||
|
postprocess_inputs=model_utils.wrap_inputs_post_processor,
|
||||||
|
hf_output_post_proc=model_utils.minicmpv_trunc_hf_output,
|
||||||
|
),
|
||||||
|
"paligemma": VLMTestInfo(
|
||||||
|
models=["google/paligemma-3b-mix-224"],
|
||||||
|
test_type=VLMTestType.IMAGE,
|
||||||
|
prompt_formatter=identity,
|
||||||
|
img_idx_to_prompt = lambda idx: "",
|
||||||
|
# Paligemma uses its own sample prompts because the default one fails
|
||||||
|
single_image_prompts=IMAGE_ASSETS.prompts({
|
||||||
|
"stop_sign": "caption es",
|
||||||
|
"cherry_blossom": "What is in the picture?",
|
||||||
|
}),
|
||||||
|
auto_cls=AutoModelForVision2Seq,
|
||||||
|
postprocess_inputs=model_utils.get_key_type_post_processor(
|
||||||
|
"pixel_values"
|
||||||
|
),
|
||||||
|
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
|
||||||
|
dtype="half" if current_platform.is_rocm() else ("half", "float"),
|
||||||
|
),
|
||||||
|
# Tests for phi3v currently live in another file because of a bug in
|
||||||
|
# transformers. Once this issue is fixed, we can enable them here instead.
|
||||||
|
# https://github.com/huggingface/transformers/issues/34307
|
||||||
|
# "phi3v": VLMTestInfo(
|
||||||
|
# models=["microsoft/Phi-3.5-vision-instruct"],
|
||||||
|
# test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||||
|
# prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n", # noqa: E501
|
||||||
|
# img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
|
||||||
|
# max_model_len=4096,
|
||||||
|
# max_num_seqs=2,
|
||||||
|
# task="generate",
|
||||||
|
# # use eager mode for hf runner since phi3v didn't work with flash_attn
|
||||||
|
# model_kwargs={"_attn_implementation": "eager"},
|
||||||
|
# use_tokenizer_eos=True,
|
||||||
|
# vllm_output_post_proc=model_utils.phi3v_vllm_to_hf_output,
|
||||||
|
# num_logprobs=10,
|
||||||
|
# ),
|
||||||
|
"qwen": VLMTestInfo(
|
||||||
|
models=["Qwen/Qwen-VL"],
|
||||||
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||||
|
prompt_formatter=identity,
|
||||||
|
img_idx_to_prompt=lambda idx: f"Picture {idx}: <img></img>\n",
|
||||||
|
max_model_len=1024,
|
||||||
|
max_num_seqs=2,
|
||||||
|
vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output,
|
||||||
|
prompt_path_encoder=model_utils.qwen_prompt_path_encoder,
|
||||||
|
),
|
||||||
|
### Tensor parallel / multi-gpu broadcast tests
|
||||||
|
"broadcast-chameleon": VLMTestInfo(
|
||||||
|
models=["facebook/chameleon-7b"],
|
||||||
|
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
||||||
|
max_model_len=4096,
|
||||||
|
auto_cls=AutoModelForVision2Seq,
|
||||||
|
postprocess_inputs=model_utils.get_key_type_post_processor(
|
||||||
|
"pixel_values"
|
||||||
|
),
|
||||||
|
vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
|
||||||
|
hf_output_post_proc = lambda hf_output, model: hf_output[:2],
|
||||||
|
comparator=check_outputs_equal,
|
||||||
|
marks=[
|
||||||
|
pytest.mark.distributed_2_gpus,
|
||||||
|
pytest.mark.skipif(
|
||||||
|
cuda_device_count_stateless() < 2,
|
||||||
|
reason="Need at least 2 GPUs to run the test.",
|
||||||
|
),
|
||||||
|
pytest.mark.skipif(
|
||||||
|
transformers.__version__.startswith("4.46"),
|
||||||
|
reason="Model broken in HF, see huggingface/transformers#34379"
|
||||||
|
)
|
||||||
|
],
|
||||||
|
**COMMON_BROADCAST_SETTINGS # type: ignore
|
||||||
|
),
|
||||||
|
"broadcast-llava": VLMTestInfo(
|
||||||
|
models=["llava-hf/llava-1.5-7b-hf"],
|
||||||
|
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
||||||
|
max_model_len=4096,
|
||||||
|
auto_cls=AutoModelForVision2Seq,
|
||||||
|
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
||||||
|
marks=[
|
||||||
|
pytest.mark.distributed_2_gpus,
|
||||||
|
pytest.mark.skipif(
|
||||||
|
cuda_device_count_stateless() < 2,
|
||||||
|
reason="Need at least 2 GPUs to run the test.",
|
||||||
|
)
|
||||||
|
],
|
||||||
|
**COMMON_BROADCAST_SETTINGS # type: ignore
|
||||||
|
),
|
||||||
|
"broadcast-llava_next": VLMTestInfo(
|
||||||
|
models=["llava-hf/llava-v1.6-mistral-7b-hf"],
|
||||||
|
prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
|
||||||
|
max_model_len=10240,
|
||||||
|
auto_cls=AutoModelForVision2Seq,
|
||||||
|
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
||||||
|
marks=[
|
||||||
|
pytest.mark.distributed_2_gpus,
|
||||||
|
pytest.mark.skipif(
|
||||||
|
cuda_device_count_stateless() < 2,
|
||||||
|
reason="Need at least 2 GPUs to run the test.",
|
||||||
|
)
|
||||||
|
],
|
||||||
|
**COMMON_BROADCAST_SETTINGS # type: ignore
|
||||||
|
),
|
||||||
|
### Custom input edge-cases for specific models
|
||||||
|
"intern_vl-diff-patches": VLMTestInfo(
|
||||||
|
models=["OpenGVLab/InternVL2-2B"],
|
||||||
|
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||||
|
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||||
|
max_model_len=4096,
|
||||||
|
dtype="bfloat16" if current_platform.is_cpu() else "half",
|
||||||
|
use_tokenizer_eos=True,
|
||||||
|
patch_hf_runner=model_utils.internvl_patch_hf_runner,
|
||||||
|
custom_test_opts=[
|
||||||
|
CustomTestOptions(
|
||||||
|
inputs=inp,
|
||||||
|
limit_mm_per_prompt={"image": 2},
|
||||||
|
) for inp in custom_inputs.different_patch_input_cases_internvl()
|
||||||
|
],
|
||||||
|
),
|
||||||
|
"llava_one_vision-multiple-images": VLMTestInfo(
|
||||||
|
models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
|
||||||
|
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||||
|
max_model_len=16384,
|
||||||
|
max_num_seqs=2,
|
||||||
|
dtype="half",
|
||||||
|
postprocess_inputs=model_utils.get_key_type_post_processor(
|
||||||
|
"pixel_values"
|
||||||
|
),
|
||||||
|
auto_cls=AutoModelForVision2Seq,
|
||||||
|
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
|
||||||
|
custom_test_opts=[CustomTestOptions(
|
||||||
|
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
|
||||||
|
formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||||
|
),
|
||||||
|
limit_mm_per_prompt={"image": 4},
|
||||||
|
)],
|
||||||
|
),
|
||||||
|
}
|
||||||
|
# yapf: enable
|
||||||
|
|
||||||
|
|
||||||
|
### Test wrappers
|
||||||
|
# Wrappers around the core test running func for:
|
||||||
|
# - single image
|
||||||
|
# - multi-image
|
||||||
|
# - image embeddings
|
||||||
|
# - video
|
||||||
|
# - custom inputs
|
||||||
|
@pytest.mark.parametrize("model_type,test_case",
|
||||||
|
get_parametrized_options(
|
||||||
|
VLM_TEST_SETTINGS,
|
||||||
|
test_type=VLMTestType.IMAGE,
|
||||||
|
fork_new_process_for_each_test=False,
|
||||||
|
))
|
||||||
|
def test_single_image_models(tmp_path: PosixPath, model_type: str,
|
||||||
|
test_case: ExpandableVLMTestArgs,
|
||||||
|
hf_runner: Type[HfRunner],
|
||||||
|
vllm_runner: Type[VllmRunner],
|
||||||
|
image_assets: _ImageAssets):
|
||||||
|
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||||
|
runners.run_single_image_test(
|
||||||
|
tmp_path=tmp_path,
|
||||||
|
model_test_info=model_test_info,
|
||||||
|
test_case=test_case,
|
||||||
|
hf_runner=hf_runner,
|
||||||
|
vllm_runner=vllm_runner,
|
||||||
|
image_assets=image_assets,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model_type,test_case",
|
||||||
|
get_parametrized_options(
|
||||||
|
VLM_TEST_SETTINGS,
|
||||||
|
test_type=VLMTestType.MULTI_IMAGE,
|
||||||
|
fork_new_process_for_each_test=False,
|
||||||
|
))
|
||||||
|
def test_multi_image_models(tmp_path: PosixPath, model_type: str,
|
||||||
|
test_case: ExpandableVLMTestArgs,
|
||||||
|
hf_runner: Type[HfRunner],
|
||||||
|
vllm_runner: Type[VllmRunner],
|
||||||
|
image_assets: _ImageAssets):
|
||||||
|
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||||
|
runners.run_multi_image_test(
|
||||||
|
tmp_path=tmp_path,
|
||||||
|
model_test_info=model_test_info,
|
||||||
|
test_case=test_case,
|
||||||
|
hf_runner=hf_runner,
|
||||||
|
vllm_runner=vllm_runner,
|
||||||
|
image_assets=image_assets,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model_type,test_case",
|
||||||
|
get_parametrized_options(
|
||||||
|
VLM_TEST_SETTINGS,
|
||||||
|
test_type=VLMTestType.EMBEDDING,
|
||||||
|
fork_new_process_for_each_test=False,
|
||||||
|
))
|
||||||
|
def test_image_embedding_models(model_type: str,
|
||||||
|
test_case: ExpandableVLMTestArgs,
|
||||||
|
hf_runner: Type[HfRunner],
|
||||||
|
vllm_runner: Type[VllmRunner],
|
||||||
|
image_assets: _ImageAssets):
|
||||||
|
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||||
|
runners.run_embedding_test(
|
||||||
|
model_test_info=model_test_info,
|
||||||
|
test_case=test_case,
|
||||||
|
hf_runner=hf_runner,
|
||||||
|
vllm_runner=vllm_runner,
|
||||||
|
image_assets=image_assets,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model_type,test_case",
|
||||||
|
get_parametrized_options(
|
||||||
|
VLM_TEST_SETTINGS,
|
||||||
|
test_type=VLMTestType.VIDEO,
|
||||||
|
fork_new_process_for_each_test=False,
|
||||||
|
))
|
||||||
|
def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
|
||||||
|
hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner],
|
||||||
|
video_assets: _VideoAssets):
|
||||||
|
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||||
|
runners.run_video_test(
|
||||||
|
model_test_info=model_test_info,
|
||||||
|
test_case=test_case,
|
||||||
|
hf_runner=hf_runner,
|
||||||
|
vllm_runner=vllm_runner,
|
||||||
|
video_assets=video_assets,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model_type,test_case",
|
||||||
|
get_parametrized_options(
|
||||||
|
VLM_TEST_SETTINGS,
|
||||||
|
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||||
|
fork_new_process_for_each_test=False,
|
||||||
|
))
|
||||||
|
def test_custom_inputs_models(
|
||||||
|
model_type: str,
|
||||||
|
test_case: ExpandableVLMTestArgs,
|
||||||
|
hf_runner: Type[HfRunner],
|
||||||
|
vllm_runner: Type[VllmRunner],
|
||||||
|
):
|
||||||
|
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||||
|
runners.run_custom_inputs_test(
|
||||||
|
model_test_info=model_test_info,
|
||||||
|
test_case=test_case,
|
||||||
|
hf_runner=hf_runner,
|
||||||
|
vllm_runner=vllm_runner,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
#### Tests filtering for things running each test as a new process
|
||||||
|
@pytest.mark.parametrize("model_type,test_case",
|
||||||
|
get_parametrized_options(
|
||||||
|
VLM_TEST_SETTINGS,
|
||||||
|
test_type=VLMTestType.IMAGE,
|
||||||
|
fork_new_process_for_each_test=True,
|
||||||
|
))
|
||||||
|
@fork_new_process_for_each_test
|
||||||
|
def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
|
||||||
|
test_case: ExpandableVLMTestArgs,
|
||||||
|
hf_runner: Type[HfRunner],
|
||||||
|
vllm_runner: Type[VllmRunner],
|
||||||
|
image_assets: _ImageAssets):
|
||||||
|
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||||
|
runners.run_single_image_test(
|
||||||
|
tmp_path=tmp_path,
|
||||||
|
model_test_info=model_test_info,
|
||||||
|
test_case=test_case,
|
||||||
|
hf_runner=hf_runner,
|
||||||
|
vllm_runner=vllm_runner,
|
||||||
|
image_assets=image_assets,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model_type,test_case",
|
||||||
|
get_parametrized_options(
|
||||||
|
VLM_TEST_SETTINGS,
|
||||||
|
test_type=VLMTestType.MULTI_IMAGE,
|
||||||
|
fork_new_process_for_each_test=True,
|
||||||
|
))
|
||||||
|
@fork_new_process_for_each_test
|
||||||
|
def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
|
||||||
|
test_case: ExpandableVLMTestArgs,
|
||||||
|
hf_runner: Type[HfRunner],
|
||||||
|
vllm_runner: Type[VllmRunner],
|
||||||
|
image_assets: _ImageAssets):
|
||||||
|
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||||
|
runners.run_multi_image_test(
|
||||||
|
tmp_path=tmp_path,
|
||||||
|
model_test_info=model_test_info,
|
||||||
|
test_case=test_case,
|
||||||
|
hf_runner=hf_runner,
|
||||||
|
vllm_runner=vllm_runner,
|
||||||
|
image_assets=image_assets,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model_type,test_case",
|
||||||
|
get_parametrized_options(
|
||||||
|
VLM_TEST_SETTINGS,
|
||||||
|
test_type=VLMTestType.EMBEDDING,
|
||||||
|
fork_new_process_for_each_test=True,
|
||||||
|
))
|
||||||
|
@fork_new_process_for_each_test
|
||||||
|
def test_image_embedding_models_heavy(model_type: str,
|
||||||
|
test_case: ExpandableVLMTestArgs,
|
||||||
|
hf_runner: Type[HfRunner],
|
||||||
|
vllm_runner: Type[VllmRunner],
|
||||||
|
image_assets: _ImageAssets):
|
||||||
|
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||||
|
runners.run_embedding_test(
|
||||||
|
model_test_info=model_test_info,
|
||||||
|
test_case=test_case,
|
||||||
|
hf_runner=hf_runner,
|
||||||
|
vllm_runner=vllm_runner,
|
||||||
|
image_assets=image_assets,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model_type,test_case",
|
||||||
|
get_parametrized_options(
|
||||||
|
VLM_TEST_SETTINGS,
|
||||||
|
test_type=VLMTestType.VIDEO,
|
||||||
|
fork_new_process_for_each_test=True,
|
||||||
|
))
|
||||||
|
def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
|
||||||
|
hf_runner: Type[HfRunner],
|
||||||
|
vllm_runner: Type[VllmRunner],
|
||||||
|
video_assets: _VideoAssets):
|
||||||
|
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||||
|
runners.run_video_test(
|
||||||
|
model_test_info=model_test_info,
|
||||||
|
test_case=test_case,
|
||||||
|
hf_runner=hf_runner,
|
||||||
|
vllm_runner=vllm_runner,
|
||||||
|
video_assets=video_assets,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model_type,test_case",
|
||||||
|
get_parametrized_options(
|
||||||
|
VLM_TEST_SETTINGS,
|
||||||
|
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||||
|
fork_new_process_for_each_test=True,
|
||||||
|
))
|
||||||
|
@fork_new_process_for_each_test
|
||||||
|
def test_custom_inputs_models_heavy(
|
||||||
|
model_type: str,
|
||||||
|
test_case: ExpandableVLMTestArgs,
|
||||||
|
hf_runner: Type[HfRunner],
|
||||||
|
vllm_runner: Type[VllmRunner],
|
||||||
|
):
|
||||||
|
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||||
|
runners.run_custom_inputs_test(
|
||||||
|
model_test_info=model_test_info,
|
||||||
|
test_case=test_case,
|
||||||
|
hf_runner=hf_runner,
|
||||||
|
vllm_runner=vllm_runner,
|
||||||
|
)
|
||||||
@ -1,174 +0,0 @@
|
|||||||
import os
|
|
||||||
from typing import List, Optional, Tuple, Type
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
|
|
||||||
BatchEncoding)
|
|
||||||
|
|
||||||
from vllm.multimodal.utils import rescale_image_size
|
|
||||||
from vllm.platforms import current_platform
|
|
||||||
from vllm.sequence import SampleLogprobs
|
|
||||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
|
||||||
|
|
||||||
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
|
||||||
from ...utils import check_logprobs_close
|
|
||||||
|
|
||||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
|
||||||
"stop_sign":
|
|
||||||
"caption es",
|
|
||||||
"cherry_blossom":
|
|
||||||
"What is in the picture?",
|
|
||||||
})
|
|
||||||
|
|
||||||
models = ["google/paligemma-3b-mix-224"]
|
|
||||||
|
|
||||||
# ROCm Triton FA can run into compilation issues with these models due to,
|
|
||||||
# excessive use of shared memory. Use other backends in the meantime.
|
|
||||||
# FIXME (mattwong, gshtrasb, hongxiayan)
|
|
||||||
if current_platform.is_rocm():
|
|
||||||
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
|
|
||||||
|
|
||||||
|
|
||||||
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
|
|
||||||
Optional[SampleLogprobs]],
|
|
||||||
model: str):
|
|
||||||
"""Sanitize vllm output to be comparable with hf output."""
|
|
||||||
output_ids, output_str, out_logprobs = vllm_output
|
|
||||||
|
|
||||||
config = AutoConfig.from_pretrained(model)
|
|
||||||
image_token_id = config.image_token_index
|
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
|
||||||
eos_token_id = tokenizer.eos_token_id
|
|
||||||
|
|
||||||
hf_output_ids = [
|
|
||||||
token_id for idx, token_id in enumerate(output_ids)
|
|
||||||
if token_id != image_token_id or output_ids[idx - 1] != image_token_id
|
|
||||||
]
|
|
||||||
|
|
||||||
hf_output_str = output_str
|
|
||||||
|
|
||||||
if hf_output_ids[-1] == eos_token_id:
|
|
||||||
hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
|
|
||||||
|
|
||||||
return hf_output_ids, hf_output_str, out_logprobs
|
|
||||||
|
|
||||||
|
|
||||||
def run_test(
|
|
||||||
hf_runner: Type[HfRunner],
|
|
||||||
vllm_runner: Type[VllmRunner],
|
|
||||||
image_assets: _ImageAssets,
|
|
||||||
model: str,
|
|
||||||
*,
|
|
||||||
size_factors: List[float],
|
|
||||||
dtype: str,
|
|
||||||
max_tokens: int,
|
|
||||||
num_logprobs: int,
|
|
||||||
tensor_parallel_size: int,
|
|
||||||
distributed_executor_backend: Optional[str] = None,
|
|
||||||
):
|
|
||||||
"""Inference result should be the same between hf and vllm.
|
|
||||||
|
|
||||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
|
||||||
For huggingface runner, we provide the PIL images as input.
|
|
||||||
For vllm runner, we provide MultiModalDataDict objects
|
|
||||||
and corresponding MultiModalConfig as input.
|
|
||||||
Note, the text input is also adjusted to abide by vllm contract.
|
|
||||||
The text output is sanitized to be able to compare with hf.
|
|
||||||
"""
|
|
||||||
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
|
|
||||||
images = [asset.pil_image for asset in image_assets]
|
|
||||||
|
|
||||||
inputs_per_image = [(
|
|
||||||
[prompt for _ in size_factors],
|
|
||||||
[rescale_image_size(image, factor) for factor in size_factors],
|
|
||||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
|
||||||
|
|
||||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
|
||||||
# vLLM needs a fresh new process without cuda initialization.
|
|
||||||
# if we run HF first, the cuda initialization will be done and it
|
|
||||||
# will hurt multiprocessing backend with fork method (the default method).
|
|
||||||
|
|
||||||
# max_model_len should be greater than image_feature_size
|
|
||||||
with vllm_runner(model,
|
|
||||||
dtype=dtype,
|
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
|
||||||
distributed_executor_backend=distributed_executor_backend,
|
|
||||||
enforce_eager=True) as vllm_model:
|
|
||||||
vllm_outputs_per_image = [
|
|
||||||
vllm_model.generate_greedy_logprobs(prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
images=images)
|
|
||||||
for prompts, images in inputs_per_image
|
|
||||||
]
|
|
||||||
|
|
||||||
def process(hf_inputs: BatchEncoding):
|
|
||||||
hf_inputs["pixel_values"] = hf_inputs["pixel_values"] \
|
|
||||||
.to(torch_dtype) # type: ignore
|
|
||||||
return hf_inputs
|
|
||||||
|
|
||||||
with hf_runner(model,
|
|
||||||
dtype=dtype,
|
|
||||||
postprocess_inputs=process,
|
|
||||||
auto_cls=AutoModelForVision2Seq) as hf_model:
|
|
||||||
hf_outputs_per_image = [
|
|
||||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
|
||||||
max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
images=images)
|
|
||||||
for prompts, images in inputs_per_image
|
|
||||||
]
|
|
||||||
|
|
||||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
|
|
||||||
vllm_outputs_per_image):
|
|
||||||
|
|
||||||
check_logprobs_close(
|
|
||||||
outputs_0_lst=hf_outputs,
|
|
||||||
outputs_1_lst=[
|
|
||||||
vllm_to_hf_output(vllm_output, model)
|
|
||||||
for vllm_output in vllm_outputs
|
|
||||||
],
|
|
||||||
name_0="hf",
|
|
||||||
name_1="vllm",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", models)
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"size_factors",
|
|
||||||
[
|
|
||||||
# No image
|
|
||||||
[],
|
|
||||||
# Single-scale
|
|
||||||
[1.0],
|
|
||||||
# Single-scale, batched
|
|
||||||
[1.0, 1.0, 1.0],
|
|
||||||
# Multi-scale
|
|
||||||
[0.25, 0.5, 1.0],
|
|
||||||
],
|
|
||||||
)
|
|
||||||
@pytest.mark.parametrize("dtype", [
|
|
||||||
pytest.param(
|
|
||||||
"float",
|
|
||||||
marks=pytest.mark.skipif(
|
|
||||||
current_platform.is_rocm(),
|
|
||||||
reason=
|
|
||||||
"ROCm FA does not yet fully support 32-bit precision on PaliGemma")
|
|
||||||
), "half"
|
|
||||||
])
|
|
||||||
@pytest.mark.parametrize("max_tokens", [128])
|
|
||||||
@pytest.mark.parametrize("num_logprobs", [5])
|
|
||||||
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
|
||||||
dtype: str, max_tokens: int, num_logprobs: int) -> None:
|
|
||||||
run_test(
|
|
||||||
hf_runner,
|
|
||||||
vllm_runner,
|
|
||||||
image_assets,
|
|
||||||
model,
|
|
||||||
size_factors=size_factors,
|
|
||||||
dtype=dtype,
|
|
||||||
max_tokens=max_tokens,
|
|
||||||
num_logprobs=num_logprobs,
|
|
||||||
tensor_parallel_size=1,
|
|
||||||
)
|
|
||||||
@ -3,19 +3,14 @@ import re
|
|||||||
from typing import List, Optional, Tuple, Type
|
from typing import List, Optional, Tuple, Type
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
from transformers import AutoTokenizer
|
||||||
from transformers import AutoImageProcessor, AutoTokenizer
|
|
||||||
|
|
||||||
from vllm.inputs import InputContext, token_inputs
|
|
||||||
from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
|
|
||||||
from vllm.multimodal import MultiModalRegistry
|
|
||||||
from vllm.multimodal.utils import rescale_image_size
|
from vllm.multimodal.utils import rescale_image_size
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.sequence import SampleLogprobs
|
from vllm.sequence import SampleLogprobs
|
||||||
|
|
||||||
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||||
_ImageAssets)
|
from ...utils import check_logprobs_close
|
||||||
from ...utils import build_model_context, check_logprobs_close
|
|
||||||
|
|
||||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||||
"stop_sign":
|
"stop_sign":
|
||||||
@ -81,12 +76,15 @@ def run_test(
|
|||||||
Note, the text input is also adjusted to abide by vllm contract.
|
Note, the text input is also adjusted to abide by vllm contract.
|
||||||
The text output is sanitized to be able to compare with hf.
|
The text output is sanitized to be able to compare with hf.
|
||||||
"""
|
"""
|
||||||
|
# HACK - this is an attempted workaround for the following bug
|
||||||
|
# https://github.com/huggingface/transformers/issues/34307
|
||||||
|
from transformers import AutoImageProcessor # noqa: F401
|
||||||
|
from transformers import AutoProcessor # noqa: F401
|
||||||
|
|
||||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||||
# vLLM needs a fresh new process without cuda initialization.
|
# vLLM needs a fresh new process without cuda initialization.
|
||||||
# if we run HF first, the cuda initialization will be done and it
|
# if we run HF first, the cuda initialization will be done and it
|
||||||
# will hurt multiprocessing backend with fork method (the default method).
|
# will hurt multiprocessing backend with fork method (the default method).
|
||||||
|
|
||||||
# max_model_len should be greater than image_feature_size
|
# max_model_len should be greater than image_feature_size
|
||||||
with vllm_runner(model,
|
with vllm_runner(model,
|
||||||
task="generate",
|
task="generate",
|
||||||
@ -236,172 +234,3 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
|
|||||||
mm_limit=2,
|
mm_limit=2,
|
||||||
tensor_parallel_size=1,
|
tensor_parallel_size=1,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
### Fast tests for correctness in processor_kwarg override handling
|
|
||||||
|
|
||||||
|
|
||||||
# Wrap lazy imports to avoid initializing CUDA during test collection
|
|
||||||
@pytest.fixture()
|
|
||||||
def input_processor_for_phi3v():
|
|
||||||
from vllm.model_executor.models.phi3v import input_processor_for_phi3v
|
|
||||||
return input_processor_for_phi3v
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture()
|
|
||||||
def dummy_data_for_phi3v():
|
|
||||||
from vllm.model_executor.models.phi3v import dummy_data_for_phi3v
|
|
||||||
return dummy_data_for_phi3v
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture()
|
|
||||||
def get_max_phi3v_image_tokens():
|
|
||||||
from vllm.model_executor.models.phi3v import get_max_phi3v_image_tokens
|
|
||||||
return get_max_phi3v_image_tokens
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", models)
|
|
||||||
@pytest.mark.parametrize("num_crops", [4, 16, None])
|
|
||||||
def test_input_mapper_override(model: str, image_assets: _ImageAssets,
|
|
||||||
num_crops: Optional[int]):
|
|
||||||
"""Ensure that the [default] input mapper handles num_crops properly."""
|
|
||||||
# We pass the processor kwargs here since for this model, we fall back to
|
|
||||||
# the default mapper; this will fall back to the HF mapper and forward
|
|
||||||
# mm_processor_kwargs to it.
|
|
||||||
mm_processor_kwargs = {
|
|
||||||
"num_crops": num_crops
|
|
||||||
} if num_crops is not None else {}
|
|
||||||
ctx = build_model_context(
|
|
||||||
model_name=model,
|
|
||||||
tokenizer_name=model,
|
|
||||||
trust_remote_code=True,
|
|
||||||
mm_processor_kwargs=mm_processor_kwargs,
|
|
||||||
)
|
|
||||||
|
|
||||||
hf_processor = AutoImageProcessor.from_pretrained(model,
|
|
||||||
trust_remote_code=True,
|
|
||||||
**mm_processor_kwargs)
|
|
||||||
|
|
||||||
mm_registry = MultiModalRegistry()
|
|
||||||
mm_registry.init_mm_limits_per_prompt(ctx.model_config)
|
|
||||||
|
|
||||||
image = image_assets[0].pil_image
|
|
||||||
hf_result = hf_processor.preprocess(
|
|
||||||
image,
|
|
||||||
return_tensors="pt",
|
|
||||||
)
|
|
||||||
|
|
||||||
vllm_result = mm_registry.map_input(
|
|
||||||
ctx.model_config,
|
|
||||||
{"image": image},
|
|
||||||
)
|
|
||||||
|
|
||||||
assert torch.all(hf_result["image_sizes"] == vllm_result["image_sizes"])
|
|
||||||
assert torch.all(
|
|
||||||
hf_result["num_img_tokens"] == vllm_result["num_img_tokens"])
|
|
||||||
|
|
||||||
# For pixel values, the second axis should be the num_crops + 1
|
|
||||||
# for the rescaled original image. The default value in VLLM falls
|
|
||||||
# back to the HF config, which is why we compare to the processor num_crops
|
|
||||||
assert torch.all(hf_result["pixel_values"] == vllm_result["pixel_values"])
|
|
||||||
assert vllm_result["pixel_values"].shape[1] == hf_processor.num_crops + 1
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", models)
|
|
||||||
@pytest.mark.parametrize("num_crops,expected_max_tokens", [
|
|
||||||
(4, 781),
|
|
||||||
(16, 2653),
|
|
||||||
])
|
|
||||||
def test_max_tokens_override(get_max_phi3v_image_tokens, model: str,
|
|
||||||
num_crops: int, expected_max_tokens: int):
|
|
||||||
"""Ensure get_max_phi3v_image_tokens handles num_crops properly."""
|
|
||||||
# NOTE: mm_processor_kwargs on the context in this test is unused, since
|
|
||||||
# this is testing the mapper directly. In practice, the processor kwargs
|
|
||||||
# are wrapped in a closure when calling the max tokens func. We explicitly
|
|
||||||
# do NOT use the mm_processor_kwargs in the model context here to ensure
|
|
||||||
# that the max image tokens implementation is referencing a mix of the
|
|
||||||
# kwargs to the function and the original mm_processor_kwargs in case
|
|
||||||
# values are somehow updated and end up in a bad state.
|
|
||||||
ctx = build_model_context(
|
|
||||||
model_name=model,
|
|
||||||
tokenizer_name=model,
|
|
||||||
trust_remote_code=True,
|
|
||||||
mm_processor_kwargs=None,
|
|
||||||
)
|
|
||||||
|
|
||||||
actual_max_tokens = get_max_phi3v_image_tokens(
|
|
||||||
InputContext(ctx.model_config),
|
|
||||||
num_crops=num_crops,
|
|
||||||
)
|
|
||||||
|
|
||||||
assert expected_max_tokens == actual_max_tokens
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", models)
|
|
||||||
@pytest.mark.parametrize("num_crops,toks_per_img,num_imgs", [
|
|
||||||
(4, 781, 1),
|
|
||||||
(4, 781, 2),
|
|
||||||
(16, 2653, 1),
|
|
||||||
(16, 2653, 2),
|
|
||||||
])
|
|
||||||
def test_dummy_data_override(dummy_data_for_phi3v, model: str, num_crops: int,
|
|
||||||
toks_per_img: int, num_imgs: int):
|
|
||||||
"""Ensure dummy_data_for_phi3v handles num_crops properly."""
|
|
||||||
# Same as the previous test - don't initialize mm_processor_kwargs
|
|
||||||
# in this test and assume that the kwargs will be correctly expanded by
|
|
||||||
# the partial when calling the dummy data func.
|
|
||||||
ctx = build_model_context(
|
|
||||||
model_name=model,
|
|
||||||
tokenizer_name=model,
|
|
||||||
trust_remote_code=True,
|
|
||||||
mm_processor_kwargs=None,
|
|
||||||
)
|
|
||||||
|
|
||||||
sequence_data, _, = dummy_data_for_phi3v(
|
|
||||||
ctx=ctx,
|
|
||||||
seq_len=8192, # Should be bigger than num_imgs * toks_per_img
|
|
||||||
mm_counts={"image": num_imgs},
|
|
||||||
num_crops=num_crops,
|
|
||||||
)
|
|
||||||
# Ensure we have the right number of placeholders per num_crops size
|
|
||||||
img_tok_count = sequence_data.get_token_ids().count(_IMAGE_TOKEN_ID)
|
|
||||||
assert img_tok_count == toks_per_img * num_imgs
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", models)
|
|
||||||
@pytest.mark.parametrize("num_crops,expected_toks_per_img,num_imgs", [
|
|
||||||
(4, 757, 1),
|
|
||||||
(4, 757, 2),
|
|
||||||
(16, 1921, 1),
|
|
||||||
(16, 1921, 2),
|
|
||||||
])
|
|
||||||
def test_input_processor_override(input_processor_for_phi3v,
|
|
||||||
image_assets: _ImageAssets, model: str,
|
|
||||||
num_crops: int, expected_toks_per_img: int,
|
|
||||||
num_imgs: int):
|
|
||||||
"""Ensure input_processor_for_phi3v handles num_crops properly."""
|
|
||||||
# Same as the previous test - don't initialize mm_processor_kwargs
|
|
||||||
# in this test and assume that the kwargs will be correctly expanded by
|
|
||||||
# the partial when calling the custom input processor.
|
|
||||||
ctx = build_model_context(
|
|
||||||
model_name=model,
|
|
||||||
tokenizer_name=model,
|
|
||||||
trust_remote_code=True,
|
|
||||||
)
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
|
||||||
# Build the image str / prompt based on the number of images we pass
|
|
||||||
img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
|
|
||||||
prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
|
|
||||||
images = [image_assets[0].pil_image] * num_imgs
|
|
||||||
|
|
||||||
inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt),
|
|
||||||
prompt=prompt,
|
|
||||||
multi_modal_data={"image": images})
|
|
||||||
|
|
||||||
processed_inputs = input_processor_for_phi3v(ctx,
|
|
||||||
inputs,
|
|
||||||
num_crops=num_crops)
|
|
||||||
|
|
||||||
# Ensure we have the right number of placeholders per num_crops size
|
|
||||||
img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID)
|
|
||||||
assert img_tok_count == expected_toks_per_img * num_imgs
|
|
||||||
|
|||||||
@ -1,374 +0,0 @@
|
|||||||
import pathlib
|
|
||||||
from typing import Dict, List, Optional, Tuple, Type, Union
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
import torch
|
|
||||||
from PIL.Image import Image
|
|
||||||
|
|
||||||
from vllm.inputs import InputContext, token_inputs
|
|
||||||
from vllm.multimodal.base import MultiModalInputs
|
|
||||||
from vllm.multimodal.utils import cached_get_tokenizer, rescale_image_size
|
|
||||||
|
|
||||||
from ....conftest import (IMAGE_ASSETS, HfRunner, ImageAsset, PromptImageInput,
|
|
||||||
VllmRunner, _ImageAssets)
|
|
||||||
from ...utils import build_model_context, check_logprobs_close
|
|
||||||
|
|
||||||
text_only_models = [
|
|
||||||
"Qwen/Qwen-7B-Chat" # Has no visual component
|
|
||||||
]
|
|
||||||
|
|
||||||
multimodal_models = ["Qwen/Qwen-VL"]
|
|
||||||
|
|
||||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
|
||||||
"stop_sign":
|
|
||||||
"Picture 1: <img></img>\nWhat's the content of the image?: ",
|
|
||||||
"cherry_blossom":
|
|
||||||
"Picture 1: <img></img>\nWhat is the season?: ",
|
|
||||||
})
|
|
||||||
|
|
||||||
HF_MULTIIMAGE_IMAGE_PROMPT = "Picture 1: <img></img>\nPicture 2: <img></img>\nCan you compare these images?\n" # noqa: E501
|
|
||||||
HF_MULTIIMAGE_IMAGE_PROMPT = "Picture 1: <img></img>\nPicture 2: <img></img>\nDescribe the two images in detail.\n" # noqa: E501
|
|
||||||
### Multimodal preprocessing tests
|
|
||||||
SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image
|
|
||||||
# These values are specific to Qwen-VL/Chat; we can get these from the model
|
|
||||||
# config also, but they are hardcoded here to keep the parameterize/fixtures
|
|
||||||
# easy to read.
|
|
||||||
IMG_START_ID = 151857
|
|
||||||
IMG_END_ID = 151858
|
|
||||||
IMG_PAD_ID = 151859
|
|
||||||
TOKS_PER_IMG = 256
|
|
||||||
VIS_ENC_DIM = 4096
|
|
||||||
IMG_SIZE = 448
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture()
|
|
||||||
def input_mapper_for_qwen():
|
|
||||||
# Lazy import to avoid initializing CUDA during test collection
|
|
||||||
from vllm.model_executor.models.qwen import input_mapper_for_qwen
|
|
||||||
return input_mapper_for_qwen
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture()
|
|
||||||
def input_processor_for_qwen():
|
|
||||||
# Lazy import to avoid initializing CUDA during test collection
|
|
||||||
from vllm.model_executor.models.qwen import input_processor_for_qwen
|
|
||||||
return input_processor_for_qwen
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture()
def qwen_vl_context() -> InputContext:
    """Build an InputContext for the multimodal Qwen-VL model."""
    return build_model_context(
        model_name="Qwen/Qwen-VL",
        trust_remote_code=True,
    )
|
|
||||||
|
|
||||||
|
|
||||||
# Happy path tests for single/multi-image scenarios for the multimodal
|
|
||||||
# input processor and mapper, respectively
|
|
||||||
@pytest.mark.parametrize("num_images", [1, 2])
def test_input_processor_valid_mm_data(input_processor_for_qwen,
                                       qwen_vl_context: InputContext,
                                       num_images: int):
    """Happy cases for image inputs to Qwen's multimodal input processor.

    Builds a prompt with one ``<img></img>`` placeholder per image and a
    dummy embedding tensor per image, then checks that the processor emits
    the expected image start/end/pad token counts.
    """
    prompt = "".join(
        [f"Picture {num}: <img></img>\n" for num in range(1, num_images + 1)])
    inputs = token_inputs(
        prompt=prompt,
        # When processing multimodal data for a multimodal model, the qwen
        # input processor will overwrite the provided prompt_token_ids with
        # the image prompts
        prompt_token_ids=[],
        multi_modal_data={"image": torch.rand(num_images, TOKS_PER_IMG, 4096)},
    )
    proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs)
    assert isinstance(proc_inputs, dict)

    # Each image should have one start / stop and a fixed context of 256
    # (TOKS_PER_IMG) pad tokens between them.
    proc_tokens = proc_inputs["prompt_token_ids"]
    assert proc_tokens.count(IMG_START_ID) == num_images
    assert proc_tokens.count(IMG_END_ID) == num_images
    assert proc_tokens.count(IMG_PAD_ID) == num_images * TOKS_PER_IMG
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
    "img_data,expected_shape",
    [
        # single / multi-image
        (SAMPLE_IMAGE, (1, 3, IMG_SIZE, IMG_SIZE)),
        (2 * [SAMPLE_IMAGE], (2, 3, IMG_SIZE, IMG_SIZE)),
        # single / multi-image embeddings
        (torch.rand(
            (TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
        (torch.rand(
            (1, TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
        (torch.rand(
            (2, TOKS_PER_IMG, VIS_ENC_DIM)), (2, TOKS_PER_IMG, VIS_ENC_DIM)),
    ])
def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
                                    qwen_vl_context: InputContext,
                                    img_data: Union[torch.Tensor, List[Image],
                                                    Image],
                                    expected_shape: List[int]):
    """Happy cases for image inputs to Qwen's multimodal input mapper.

    Covers PIL images (single and batched) and precomputed embedding
    tensors with/without a leading batch dim; a 2D embedding is expected
    to be unsqueezed to a batch of one.
    """
    mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data)
    # Ensure that we get the appropriately shaped pixel_values
    # for images and image embeddings, respectively.
    assert isinstance(mapped_img_data, MultiModalInputs)
    assert "pixel_values" in mapped_img_data
    assert mapped_img_data["pixel_values"].shape == expected_shape
|
|
||||||
|
|
||||||
|
|
||||||
# Sad path tests for the multimodal input processor and mapper, respectively
|
|
||||||
# Sad path tests for the multimodal input processor and mapper, respectively
@pytest.mark.parametrize("mm_data", [
    {
        "image": torch.rand((5))
    },
    {
        "image": torch.rand((5, 5, 5, 5, 5))
    },
])
def test_input_processor_invalid_mm_data(input_processor_for_qwen,
                                         qwen_vl_context: InputContext,
                                         mm_data: Dict[str, torch.Tensor]):
    """Test sad cases validated in Qwen's multimodal input processor.

    A 1D and a 5D tensor are both invalid image-embedding shapes and are
    expected to be rejected with a ValueError.
    """
    tokenizer = cached_get_tokenizer(qwen_vl_context.model_config.tokenizer,
                                     trust_remote_code=True)
    prompt = "Picture 1: <img></img>\n"
    prompt_token_ids = tokenizer.encode(prompt)
    inputs = token_inputs(prompt=prompt,
                          prompt_token_ids=prompt_token_ids,
                          multi_modal_data=mm_data)
    # Should fail since we have too many or too few dimensions for embeddings
    with pytest.raises(ValueError):
        input_processor_for_qwen(qwen_vl_context, inputs)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
    "img_data",
    [
        # Wrong context length
        torch.rand((1, TOKS_PER_IMG + 10, VIS_ENC_DIM)),
        # Wrong visual encoder output size
        torch.rand((1, TOKS_PER_IMG, VIS_ENC_DIM + 10)),
    ])
def test_input_mapper_invalid_mm_data(
    input_mapper_for_qwen,
    qwen_vl_context: InputContext,
    img_data: Union[torch.Tensor, List[Image], Image],
):
    """Sad cases validated in Qwen VL's multimodal input mapper.

    Embedding tensors whose token count or hidden dim does not match the
    model's fixed values must be rejected with a ValueError.
    """
    with pytest.raises(ValueError):
        input_mapper_for_qwen(qwen_vl_context, img_data)
|
|
||||||
|
|
||||||
|
|
||||||
### End-to-end generation tests
|
|
||||||
def get_prompt_with_path(tmp_path: pathlib.PosixPath, prompt: str,
                         assets: Union[_ImageAssets, List[ImageAsset]]) -> str:
    """Export image assets into a tempdir and substitute their local paths
    into the prompt's ``<img></img>`` placeholders so that the HF version
    of Qwen-VL can resolve each path and load the image in its forward()
    call.

    Args:
        tmp_path: Tempdir for test under consideration.
        prompt: Prompt with image placeholders.
        assets: List of image assets whose len equals the num placeholders.
    """
    # A mismatch here almost certainly means the test itself is written
    # incorrectly.
    assert prompt.count("<img></img>") == len(assets)

    # Substitute each placeholder, left to right, with the path of the
    # corresponding exported asset.
    for asset in assets:
        exported_path = tmp_path / f"{asset.name}.jpg"
        asset.pil_image.save(exported_path)
        prompt = prompt.replace("<img></img>", f"<img>{exported_path}</img>",
                                1)
    return prompt
|
|
||||||
|
|
||||||
|
|
||||||
def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    inputs: List[Tuple[List[str], PromptImageInput]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Inference result should be the same between hf and vllm.

    All the image fixtures for the test is under tests/images.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.

    Args:
        hf_runner: HF model runner class (context manager).
        vllm_runner: vLLM model runner class (context manager).
        inputs: List of (prompts, images) pairs; the runners are invoked
            once per pair.
        model: Model name to load in both runners.
        dtype: Torch dtype string for both runners.
        max_tokens: Max tokens to generate greedily.
        num_logprobs: Number of logprobs compared per generated token.
        mm_limit: Max images allowed per prompt in vLLM.
        tensor_parallel_size: vLLM tensor parallel degree.
        distributed_executor_backend: Optional vLLM executor backend.
    """

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).

    # max_model_len should be greater than image_feature_size
    # Qwen encodes each image into a fixed content size of 256
    with vllm_runner(model,
                     max_model_len=1024,
                     max_num_seqs=2,
                     dtype=dtype,
                     limit_mm_per_prompt={"image": mm_limit},
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
                     enforce_eager=True) as vllm_model:
        vllm_outputs_per_image = [
            vllm_model.generate_greedy_logprobs(prompts,
                                                max_tokens,
                                                num_logprobs=num_logprobs,
                                                images=images)
            for prompts, images in inputs
        ]

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs_per_image = [
            hf_model.generate_greedy_logprobs_limit(prompts,
                                                    max_tokens,
                                                    num_logprobs=num_logprobs,
                                                    images=images)
            for prompts, images in inputs
        ]

    # Compare the per-pair outputs; HF is the reference implementation.
    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
                                        vllm_outputs_per_image):
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", multimodal_models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [8])
@pytest.mark.parametrize("num_logprobs", [5])
def test_multimodal_models_single_image(tmp_path: pathlib.PosixPath,
                                        hf_runner: Type[HfRunner],
                                        vllm_runner: Type[VllmRunner],
                                        image_assets: _ImageAssets, model: str,
                                        size_factors: List[float], dtype: str,
                                        max_tokens: int,
                                        num_logprobs: int) -> None:
    """Tests multimodal models with single image prompts."""
    images = [asset.pil_image for asset in image_assets]

    # Export each asset into tmp_path and embed its local path into the
    # prompt, since HF Qwen-VL resolves image paths from the prompt text.
    prompts = [
        get_prompt_with_path(tmp_path, prompt, [asset])
        for prompt, asset in zip(HF_IMAGE_PROMPTS, image_assets)
    ]

    # One (prompts, images) pair per asset: the prompt is repeated once per
    # size factor, paired with the image rescaled by that factor.
    inputs = [(
        [prompt for _ in size_factors],
        [rescale_image_size(image, factor) for factor in size_factors],
    ) for image, prompt in zip(images, prompts)]

    run_test(
        hf_runner,
        vllm_runner,
        inputs,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", multimodal_models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_multimodal_models_multi_image(tmp_path: pathlib.PosixPath,
                                       hf_runner: Type[HfRunner],
                                       vllm_runner: Type[VllmRunner],
                                       image_assets: _ImageAssets, model: str,
                                       size_factors: List[float], dtype: str,
                                       max_tokens: int,
                                       num_logprobs: int) -> None:
    """Tests multimodal models with multi-image prompts."""
    images = [asset.pil_image for asset in image_assets]
    # Put all of the images into one prompt.
    prompt = get_prompt_with_path(tmp_path, HF_MULTIIMAGE_IMAGE_PROMPT,
                                  image_assets)
    # A single (prompts, image-lists) pair: the prompt is repeated once per
    # size factor, paired with the full image list rescaled by that factor.
    inputs = [([prompt for _ in size_factors],
               [[rescale_image_size(image, factor) for image in images]
                for factor in size_factors])]

    run_test(
        hf_runner,
        vllm_runner,
        inputs,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=2,
        tensor_parallel_size=1,
    )
|
|
||||||
|
|
||||||
|
|
||||||
# Ensure that a text-only Qwen model can still be loaded and
# used for inference in VLLM without throwing.
@pytest.mark.parametrize("model", text_only_models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_text_only_qwen_model_can_be_loaded_and_run(
    vllm_runner: Type[VllmRunner],
    example_prompts: List[str],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
):
    """Smoke test: text-only Qwen generation must not raise.

    No output comparison is performed; completing generation successfully
    is the assertion.
    """
    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_model.generate_greedy_logprobs(
            example_prompts,
            max_tokens,
            num_logprobs=num_logprobs,
        )
|
|
||||||
235
tests/models/decoder_only/vision_language/vlm_utils/builders.py
Normal file
235
tests/models/decoder_only/vision_language/vlm_utils/builders.py
Normal file
@ -0,0 +1,235 @@
|
|||||||
|
"""Helpers for building inputs that can be leveraged for different test types.
|
||||||
|
"""
|
||||||
|
from pathlib import PosixPath
|
||||||
|
from typing import Callable, Iterable, List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
|
||||||
|
resize_video, sample_frames_from_video)
|
||||||
|
|
||||||
|
from .....conftest import _ImageAssets, _VideoAssets
|
||||||
|
from .types import (SINGLE_IMAGE_BASE_PROMPTS, TEST_IMG_PLACEHOLDER,
|
||||||
|
TEST_VIDEO_PLACEHOLDER, VIDEO_BASE_PROMPT,
|
||||||
|
ImageSizeWrapper, SizeType, VLMTestInfo)
|
||||||
|
|
||||||
|
|
||||||
|
def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int],
                                                                      str],
                             test_placeholder: str) -> str:
    """Substitute each occurrence of the generic test placeholder with the
    model-specific tag for its (1-based) position in the prompt.
    """
    segments = prompt.split(test_placeholder)
    pieces = [segments[0]]
    for idx, segment in enumerate(segments[1:], start=1):
        pieces.append(img_idx_to_prompt(idx))
        pieces.append(segment)
    return "".join(pieces)
|
||||||
|
|
||||||
|
|
||||||
|
def get_model_prompts(base_prompts: Iterable[str],
                      img_idx_to_prompt: Optional[Callable[[int], str]],
                      video_idx_to_prompt: Optional[Callable[[int], str]],
                      prompt_formatter: Callable[[str], str]) -> List[str]:
    """Expand model-agnostic base prompts into model-specific test prompts.

    Each base prompt has its generic image/video placeholders swapped for
    the model's own tags (when the respective mapper is provided), and the
    result is then wrapped by the model's prompt formatter.

    Example for phi3v, given the base_prompt: "<image>What is the season?"
    1. Replace img placeholder(s)
       -> "<|image_1|>\nWhat is the season?"
    2. Apply prompt formatter:
       -> <|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n
    """
    assert isinstance(base_prompts, (list, tuple))
    model_prompts = []
    for base_prompt in base_prompts:
        # Swap the generic multimodal placeholders for this model's tags.
        if img_idx_to_prompt:
            base_prompt = replace_test_placeholder(base_prompt,
                                                   img_idx_to_prompt,
                                                   TEST_IMG_PLACEHOLDER)
        if video_idx_to_prompt:
            base_prompt = replace_test_placeholder(base_prompt,
                                                   video_idx_to_prompt,
                                                   TEST_VIDEO_PLACEHOLDER)
        # Wrap with the model's chat/prompt template.
        model_prompts.append(prompt_formatter(base_prompt))
    return model_prompts
|
||||||
|
|
||||||
|
|
||||||
|
def build_single_image_inputs_from_test_info(
        test_info: VLMTestInfo,
        image_assets: _ImageAssets,
        size_wrapper: ImageSizeWrapper,
        tmp_path: Optional[PosixPath] = None):
    """Build single-image (prompts, images) pairs from a test's config.

    Raises:
        ValueError: If the test info has no prompt formatter, or if it
            needs a path encoder but no tmp_path was given.
    """
    if test_info.prompt_formatter is None:
        raise ValueError(
            "Prompt formatter must be set to build single image inputs")

    model_prompts = get_model_prompts(test_info.single_image_prompts,
                                      test_info.img_idx_to_prompt,
                                      test_info.video_idx_to_prompt,
                                      test_info.prompt_formatter)

    # For models that require a local path / URL encoded in the image; export
    # assets and encode into tmp_path for this test. This should be avoided
    # where possible (currently needed for Qwen-VL).
    if test_info.prompt_path_encoder is not None:
        if tmp_path is None:
            raise ValueError("Prompt path encoder requires setting local path")
        model_prompts = [
            test_info.prompt_path_encoder(tmp_path, prompt, [asset])
            for prompt, asset in zip(model_prompts, image_assets)
        ]

    images = [asset.pil_image for asset in image_assets]
    assert len(images) == len(model_prompts)
    return build_single_image_inputs(images, model_prompts, size_wrapper)
|
||||||
|
|
||||||
|
|
||||||
|
def build_single_image_inputs(images, model_prompts,
                              size_wrapper: ImageSizeWrapper):
    """Pair each (image, prompt) with its size-expanded variants.

    For every image / prompt pair, return a tuple of two lists of equal
    length: repeated copies of the model prompt, and the image scaled by
    each entry of the size wrapper. Rescaling preserves the image aspect
    ratio.
    """
    inputs = []
    for image, prompt in zip(images, model_prompts):
        repeated_prompts = [prompt for _ in size_wrapper.data]
        scaled_images = [
            apply_image_size_scaling(image, size, size_wrapper.type)
            for size in size_wrapper.data
        ]
        inputs.append((repeated_prompts, scaled_images))
    return inputs
|
||||||
|
|
||||||
|
|
||||||
|
def build_multi_image_inputs_from_test_info(
        test_info: VLMTestInfo,
        image_assets: _ImageAssets,
        size_wrapper: ImageSizeWrapper,
        tmp_path: Optional[PosixPath] = None):
    """Build multi-image (prompts, image-lists) pairs from a test's config.

    Raises:
        ValueError: If the test info has no prompt formatter, or if it
            needs a path encoder but no tmp_path was given.
    """
    if test_info.prompt_formatter is None:
        raise ValueError(
            "Prompt formatter must be set to build multi image inputs")

    model_prompts = get_model_prompts([test_info.multi_image_prompt],
                                      test_info.img_idx_to_prompt,
                                      test_info.video_idx_to_prompt,
                                      test_info.prompt_formatter)

    # Encode local asset paths into the prompt for models that need them
    # (e.g. Qwen-VL); requires a tempdir to export the assets into.
    if test_info.prompt_path_encoder is not None:
        if tmp_path is None:
            raise ValueError("Prompt path encoder requires setting local path")
        model_prompts = [
            test_info.prompt_path_encoder(tmp_path, model_prompt, image_assets)
            for model_prompt in model_prompts
        ]

    images = [asset.pil_image for asset in image_assets]

    # Currently, we only have one multi-image list & one multi-image prompt
    return build_multi_image_inputs(
        image_lists=[images],
        model_prompts=model_prompts,
        size_wrapper=size_wrapper,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def build_multi_image_inputs(image_lists, model_prompts,
                             size_wrapper: ImageSizeWrapper):
    """Pair each (image list, prompt) with its size-expanded variants.

    For every image-list / prompt pair, return a tuple of two lists of
    equal length: repeated copies of the prompt, and a copy of the whole
    image list scaled by each entry of the size wrapper.
    """
    inputs = []
    for images, prompt in zip(image_lists, model_prompts):
        repeated_prompts = [prompt for _ in size_wrapper.data]
        scaled_lists = [[
            apply_image_size_scaling(image, size, size_wrapper.type)
            for image in images
        ] for size in size_wrapper.data]
        inputs.append((repeated_prompts, scaled_lists))
    return inputs
|
||||||
|
|
||||||
|
|
||||||
|
def build_embedding_inputs_from_test_info(
    test_info: VLMTestInfo,
    image_assets: _ImageAssets,
    size_wrapper: ImageSizeWrapper,
):
    """Build parallel PIL-image inputs and embedding inputs for one test.

    Returns:
        A tuple (inputs, vllm_embeddings) with identical prompts; the
        first uses PIL images, the second the precomputed embeddings.
    """
    # These conditions will always be true if invoked through filtering,
    # but we still check them in case this is ever called directly
    if test_info.prompt_formatter is None:
        raise ValueError(
            "Prompt formatter must be set to build image embedding inputs")
    if size_wrapper.type != SizeType.SIZE_FACTOR or not \
            all(factor == 1.0 for factor in size_wrapper.data):
        raise ValueError("Embedding tests require constant (1.0) size factors")
    if test_info.convert_assets_to_embeddings is None:
        raise ValueError("No conversion func for getting embeddings found")

    model_prompts = get_model_prompts(
        SINGLE_IMAGE_BASE_PROMPTS,
        test_info.img_idx_to_prompt,
        test_info.video_idx_to_prompt,
        test_info.prompt_formatter,
    )

    images = [asset.pil_image for asset in image_assets]
    embeds = test_info.convert_assets_to_embeddings(image_assets)
    assert len(images) == len(model_prompts)

    # Same prompts, two parallel input sets: PIL images vs. embeddings.
    inputs = build_single_image_inputs(images, model_prompts, size_wrapper)
    vllm_embeddings = build_single_image_inputs(embeds, model_prompts,
                                                size_wrapper)
    return inputs, vllm_embeddings
|
||||||
|
|
||||||
|
|
||||||
|
def build_video_inputs_from_test_info(
    test_info: VLMTestInfo,
    video_assets: _VideoAssets,
    size_wrapper: ImageSizeWrapper,
    num_frames: int,
):
    """Build video (prompts, videos) pairs from a test's config.

    Samples num_frames from each video asset, then scales each sampled
    video by every entry of the size wrapper.

    Raises:
        ValueError: If the test info has no prompt formatter.
    """
    if test_info.prompt_formatter is None:
        raise ValueError("Prompt formatter must be set to build video inputs")
    model_prompts = get_model_prompts(
        [VIDEO_BASE_PROMPT],
        test_info.img_idx_to_prompt,
        test_info.video_idx_to_prompt,
        test_info.prompt_formatter,
    )

    sampled_vids = [
        sample_frames_from_video(asset.np_ndarrays, num_frames)
        for asset in video_assets
    ]

    # Fixed sizes resize to exact dims; size factors rescale proportionally.
    video_scaler = (resize_video if size_wrapper.type == SizeType.FIXED_SIZE
                    else rescale_video_size)

    return [(
        [prompt for _ in size_wrapper.data],
        [video_scaler(video, size) for size in size_wrapper.data],
    ) for video, prompt in zip(sampled_vids, model_prompts)]
|
||||||
|
|
||||||
|
|
||||||
|
def apply_image_size_scaling(image, size: Union[float, Tuple[int, int]],
                             size_type: SizeType):
    """Apply one size entry to one image; the entry is either a size
    factor, which scales the image while maintaining the aspect ratio,
    or a fixed target size.
    """
    # Special case for embeddings: a tensor is only valid when using size
    # factors at constant scale 1.0, in which case it passes through as-is.
    if isinstance(image, torch.Tensor):
        assert size_type == SizeType.SIZE_FACTOR and size == 1
        return image
    if size_type == SizeType.SIZE_FACTOR:
        # We have a list of image size factors
        return rescale_image_size(image, size)
    if size_type == SizeType.FIXED_SIZE:
        # We have a list of fixed sizes
        return image.resize(size)
    raise ValueError("ImageSizeWrapper type must be FIXED_SIZE or SIZE_FACTOR")
|
||||||
@ -0,0 +1,157 @@
|
|||||||
|
"""Utils for determining which subset of model tests belong to a specific
|
||||||
|
modality, getting all combinations (similar to pytest's parametrization),
|
||||||
|
handling multimodal placeholder substitution, and so on.
|
||||||
|
"""
|
||||||
|
import itertools
|
||||||
|
from collections import OrderedDict
|
||||||
|
from typing import Dict, Iterable, Tuple
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from .types import (EMBEDDING_SIZE_FACTORS, ExpandableVLMTestArgs,
|
||||||
|
ImageSizeWrapper, SizeType, VLMTestInfo, VLMTestType)
|
||||||
|
|
||||||
|
|
||||||
|
def get_filtered_test_settings(test_settings: Dict[str, VLMTestInfo],
                               test_type: VLMTestType,
                               fork_per_test: bool) -> Dict[str, VLMTestInfo]:
    """Given the dict of potential test settings to run, return a subdict
    of tests who have the current test type enabled with the matching val for
    fork_per_test.
    """

    # A test matches if its type equals test_type, or test_type is one of
    # several types the test declares.
    def matches_test_type(test_info: VLMTestInfo, test_type: VLMTestType):
        return test_info.test_type == test_type or (
            isinstance(test_info.test_type, Iterable)
            and test_type in test_info.test_type)

    matching_tests = {}
    for test_name, test_info in test_settings.items():
        # Check if the test has the right type & keep if it does
        if matches_test_type(test_info, test_type):
            # Embedding tests need to have a conversion func in their test info
            if matches_test_type(test_info, VLMTestType.EMBEDDING):
                assert test_info.convert_assets_to_embeddings is not None
            # Custom test inputs need to explicitly define the mm limit/inputs
            if matches_test_type(test_info, VLMTestType.CUSTOM_INPUTS):
                assert (test_info.custom_test_opts is not None
                        and isinstance(test_info.custom_test_opts, Iterable))
            # For all types besides custom inputs, we need a prompt formatter
            else:
                assert test_info.prompt_formatter is not None

            # Everything looks okay; keep if it has correct proc handling,
            # i.e. tests with a distributed executor backend are exactly the
            # ones that run with fork-per-test.
            if (test_info.distributed_executor_backend
                    is not None) == fork_per_test:
                matching_tests[test_name] = test_info

    return matching_tests
|
||||||
|
|
||||||
|
|
||||||
|
def get_parametrized_options(test_settings: Dict[str, VLMTestInfo],
                             test_type: VLMTestType,
                             fork_new_process_for_each_test: bool):
    """Converts all of our VLMTestInfo into an expanded list of parameters.
    This is similar to nesting pytest parametrize calls, but done directly
    through an itertools product so that each test can set things like
    size factors etc, while still running in isolated test cases.
    """
    matching_tests = get_filtered_test_settings(
        test_settings, test_type, fork_new_process_for_each_test)

    # Ensure that something is wrapped as an iterable if it's not already
    ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e, )

    def get_model_type_cases(model_type: str, test_info: VLMTestInfo):
        # This is essentially the same as nesting a bunch of mark.parametrize
        # decorators, but we do it programmatically to allow overrides on
        # a per-model basis, while still being able to execute each of these
        # as individual test cases in pytest.
        # NOTE: the insertion order of iter_kwargs must match the keyword
        # order expected by ExpandableVLMTestArgs below.
        iter_kwargs = OrderedDict([
            ("model", ensure_wrapped(test_info.models)),
            ("max_tokens", ensure_wrapped(test_info.max_tokens)),
            ("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
            ("dtype", ensure_wrapped(test_info.dtype)),
            ("distributed_executor_backend",
             ensure_wrapped(test_info.distributed_executor_backend)),
        ])

        # num_frames is video only
        if test_type == VLMTestType.VIDEO:
            iter_kwargs["num_video_frames"] = ensure_wrapped(
                test_info.num_video_frames)

        # No sizes passed for custom inputs, since inputs are directly provided
        if test_type != VLMTestType.CUSTOM_INPUTS:
            wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
            if wrapped_sizes is None:
                raise ValueError(
                    f"Sizes must be set for test type {test_type}")
            iter_kwargs["size_wrapper"] = wrapped_sizes

        # Otherwise expand the custom test options instead
        else:
            if test_info.custom_test_opts is None:
                raise ValueError("Test has type CUSTOM_INPUTS, but none given")
            iter_kwargs["custom_test_opts"] = test_info.custom_test_opts

        # yapf: disable
        # Wrap all model cases in a pytest parameter & pass marks through
        return [
            pytest.param(
                model_type,
                ExpandableVLMTestArgs(
                    **{k: v for k, v in zip(iter_kwargs.keys(), case)}
                ),
                marks=test_info.marks if test_info.marks is not None else []
            ) for case in list(itertools.product(*iter_kwargs.values()))
        ]
        # yapf: enable

    # Get a list per model type, where each entry contains a tuple of all of
    # that model type's cases, then flatten them into the top level so that
    # we can consume them in one mark.parametrize call.
    cases_by_model_type = [
        get_model_type_cases(model_type, test_info)
        for model_type, test_info in matching_tests.items()
    ]
    return list(itertools.chain(*cases_by_model_type))
|
||||||
|
|
||||||
|
|
||||||
|
def get_wrapped_test_sizes(
        test_info: VLMTestInfo,
        test_type: VLMTestType) -> Tuple[ImageSizeWrapper, ...]:
    """Given a test info which may have size factors or fixed sizes, wrap them
    and combine them into an iterable, each of which will be used in parameter
    expansion.

    Args:
        test_info: Test configuration to be expanded.
        test_type: The type of test being filtered for.
    """
    # Embedding tests always use the constant EMBEDDING_SIZE_FACTORS.
    if test_type == VLMTestType.EMBEDDING:
        return tuple(
            ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
            for factor in EMBEDDING_SIZE_FACTORS)
    # Custom inputs have preprocessed inputs, so no sizes to expand.
    if test_type == VLMTestType.CUSTOM_INPUTS:
        return ()

    size_factors = test_info.image_size_factors or []
    fixed_sizes = test_info.image_sizes or []

    wrapped = [
        ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
        for factor in size_factors
    ]
    wrapped.extend(
        ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size)
        for size in fixed_sizes)
    return tuple(wrapped)
|
||||||
141
tests/models/decoder_only/vision_language/vlm_utils/core.py
Normal file
141
tests/models/decoder_only/vision_language/vlm_utils/core.py
Normal file
@ -0,0 +1,141 @@
|
|||||||
|
"""Core test implementation to be shared across modalities."""
|
||||||
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from PIL.Image import Image
|
||||||
|
from transformers import AutoTokenizer, BatchEncoding
|
||||||
|
from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
||||||
|
|
||||||
|
from .....conftest import HfRunner, VllmRunner
|
||||||
|
from .types import RunnerOutput
|
||||||
|
|
||||||
|
|
||||||
|
def run_test(
    *,
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    inputs: List[Tuple[List[str], List[Union[List[Image], Image]]]],
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    enforce_eager: bool,
    max_model_len: int,
    max_num_seqs: int,
    hf_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
    vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
    auto_cls: Type[_BaseAutoModelClass],
    use_tokenizer_eos: bool,
    postprocess_inputs: Callable[[BatchEncoding], BatchEncoding],
    comparator: Callable[..., None],
    get_stop_token_ids: Optional[Callable[[AutoTokenizer], List[int]]],
    limit_mm_per_prompt: Dict[str, int],
    model_kwargs: Optional[Dict[str, Any]],
    patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]],
    task: str = "auto",
    runner_mm_key: str = "images",
    distributed_executor_backend: Optional[str] = None,
    tensor_parallel_size: int = 1,
    vllm_embeddings: Optional[torch.Tensor] = None,
):
    """Modality agnostic test executor for comparing HF/vLLM outputs.

    Runs the same prompts + media through the vLLM runner and then the HF
    runner (greedy decoding with logprobs in both), optionally applies a
    model-specific post-processor to either side's raw outputs, and finally
    compares each pair of results with ``comparator`` (usually
    ``check_logprobs_close``).

    Args:
        inputs: Batches of (prompts, media) pairs fed to both runners.
        hf_output_post_proc / vllm_output_post_proc: Optional per-result
            sanitizers applied before comparison.
        runner_mm_key: Keyword under which media is passed to the runners
            (e.g., "images" or "videos").
        vllm_embeddings: If given, used in place of ``inputs`` for vLLM only
            (embedding tests feed vLLM precomputed tensors).
    """
    # In the case of embeddings, vLLM takes separate input tensors
    vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs
    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)

    vllm_outputs_per_mm = []
    hf_outputs_per_mm = []

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    vllm_kwargs = {}
    if get_stop_token_ids is not None:
        vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer)

    with vllm_runner(model,
                     max_model_len=max_model_len,
                     max_num_seqs=max_num_seqs,
                     dtype=dtype,
                     limit_mm_per_prompt=limit_mm_per_prompt,
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
                     enforce_eager=enforce_eager,
                     task=task) as vllm_model:
        for prompts, media in vllm_inputs:
            # Media is passed under the runner-specific keyword argument
            vllm_kwargs[runner_mm_key] = media
            vllm_output = vllm_model.generate_greedy_logprobs(
                prompts, max_tokens, num_logprobs=num_logprobs, **vllm_kwargs)
            vllm_outputs_per_mm.append(vllm_output)

    hf_model = hf_runner(model,
                         dtype=dtype,
                         auto_cls=auto_cls,
                         postprocess_inputs=postprocess_inputs,
                         model_kwargs=model_kwargs)

    # Some models need to patch things like the model processor, e.g., internvl
    if patch_hf_runner is not None:
        hf_model = patch_hf_runner(hf_model)

    # Some models need to explicitly pass the eos_token_id off the tokenizer or
    # processor for a good comparison; currently assume processor/tokenizer
    # agree on the EOS, and pull it off the tokenizer if requested.
    hf_kwargs = {}
    if use_tokenizer_eos:
        hf_kwargs["eos_token_id"] = tokenizer.eos_token_id

    with hf_model, torch.no_grad():
        for prompts, media in inputs:
            hf_kwargs[runner_mm_key] = media
            hf_output = hf_model.generate_greedy_logprobs_limit(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                tokenizer=tokenizer,
                **hf_kwargs)
            hf_outputs_per_mm.append(hf_output)

    # Apply output processing / sanitation to the vLLM and HF runner results
    hf_outputs_per_mm, vllm_outputs_per_mm = process_runner_outputs(
        model,
        first_runner_outputs=hf_outputs_per_mm,
        second_runner_outputs=vllm_outputs_per_mm,
        first_runner_processor=hf_output_post_proc,
        second_runner_processor=vllm_output_post_proc,
    )

    for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm,
                                        vllm_outputs_per_mm):
        # This is usually check_logprobs_close, but it's passed through to
        # allow things like check_outputs_equal where needed
        comparator(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
|
||||||
|
|
||||||
|
|
||||||
|
def process_runner_outputs(
    model,
    first_runner_outputs,
    second_runner_outputs,
    first_runner_processor=None,
    second_runner_processor=None,
):
    """Post-process each runner's outputs with its processor, if one is set.

    Outputs whose corresponding processor is None are passed through
    unchanged; the two (possibly processed) output collections are returned
    in the same order they were given.
    """
    processed = []
    for outputs, processor in (
        (first_runner_outputs, first_runner_processor),
        (second_runner_outputs, second_runner_processor),
    ):
        if processor is None:
            processed.append(outputs)
        else:
            processed.append(process_outputs(processor, model, outputs))
    return processed[0], processed[1]
|
||||||
|
|
||||||
|
|
||||||
|
def process_outputs(output_processor, model, outputs_per_image):
    """Apply a model-specific post-processor to every result in a nested
    list of runner outputs, preserving the per-image nesting."""
    processed = []
    for image_outputs in outputs_per_image:
        processed.append(
            [output_processor(result, model) for result in image_outputs])
    return processed
|
||||||
@ -0,0 +1,102 @@
|
|||||||
|
"""Custom input builders for edge-cases in different models."""
|
||||||
|
from typing import Callable
|
||||||
|
|
||||||
|
from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
|
||||||
|
resize_video, sample_frames_from_video)
|
||||||
|
|
||||||
|
from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
|
||||||
|
from .builders import build_multi_image_inputs, build_single_image_inputs
|
||||||
|
from .types import ImageSizeWrapper, SizeType
|
||||||
|
|
||||||
|
|
||||||
|
def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
    """Builds inputs for multi-image (varied sizes/aspect ratio) testing.

    Args:
        formatter: model-specific prompt formatter.

    Returns:
        A single-element list containing a (prompts, image lists) tuple;
        each prompt is paired positionally with its image(s).
    """
    stop_sign = IMAGE_ASSETS[0].pil_image
    cherry_blossom = IMAGE_ASSETS[1].pil_image

    # Apply the selected formatter to the base prompts
    img_prompts = [
        "<image><image>\nDescribe 2 images.",
        "<image><image>\nDescribe 2 images.",
        "<image><image><image><image>\nDescribe 4 images.",
        "<image>\nWhat is the season?",
    ]
    formatted_prompts = [formatter(prompt) for prompt in img_prompts]

    return [(
        formatted_prompts,
        [
            [stop_sign, cherry_blossom],
            # Images with different sizes and aspect-ratios
            [
                rescale_image_size(stop_sign, 0.1),
                stop_sign,
            ],
            [
                stop_sign,
                rescale_image_size(stop_sign, 0.25),
                cherry_blossom.resize((183, 488)),
                cherry_blossom.resize((488, 183))
            ],
            # Last prompt takes a single (non-list) image
            cherry_blossom,
        ])]
|
||||||
|
|
||||||
|
|
||||||
|
def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
                                          num_frames: int = 16):
    """Builds inputs for multi-video (varied sizes/aspect ratio) testing.

    Args:
        formatter: model-specific prompt formatter.
        num_frames: number of frames sampled from the source video asset.
    """
    video = sample_frames_from_video(VIDEO_ASSETS[0].np_ndarrays, num_frames)
    # Apply the selected formatter to the base prompts
    video_prompts = [
        "<video><video>\nDescribe 2 videos.",
        "<video><video>\nDescribe 2 videos.",
        "<video><video><video><video>\nDescribe 4 videos.",
        "<video>\nWhy is this video funny?",
    ]
    formatted_prompts = [formatter(prompt) for prompt in video_prompts]

    return [(
        formatted_prompts,
        [
            [video, video],
            # Videos with different sizes and aspect-ratios
            [
                rescale_video_size(video, 0.1),
                video,
            ],
            [
                video,
                rescale_video_size(video, 0.25),
                resize_video(video, (183, 488)),
                resize_video(video, (488, 183))
            ],
            # Last prompt takes a single (non-list) video
            video,
        ])]
|
||||||
|
|
||||||
|
|
||||||
|
def different_patch_input_cases_internvl():
    """Builds single- and multi-image custom inputs for InternVL, with all
    assets resized to 896x896 so different dynamic-patch counts are covered
    at size factors 0.5 and 1.0."""
    images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS]
    # InternVL chat-style prompt wrapper
    formatter = lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n"  # noqa: E501
    single_img_prompts = [
        "<image>\nWhat's the content in the center of the image?",
        "<image>\nWhat is the season?",
    ]
    multi_img_prompts = [
        "Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.\n",  # noqa: E501
    ]
    formatted_sprompts = [formatter(prompt) for prompt in single_img_prompts]
    formatted_mprompts = [formatter(prompt) for prompt in multi_img_prompts]

    wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5, 1.0])
    # One entry of single-image inputs and one of multi-image inputs
    return [
        build_single_image_inputs(images, formatted_sprompts, wrapped_sf),
        build_multi_image_inputs([images], formatted_mprompts, wrapped_sf),
    ]
|
||||||
@ -0,0 +1,338 @@
|
|||||||
|
"""Common utility functions relating to different models that are useful
|
||||||
|
for manipulating the input / output of HF & vLLM test runners, which are
|
||||||
|
typically specific to a small subset of models.
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
import types
|
||||||
|
from pathlib import PosixPath
|
||||||
|
from typing import Callable, List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from PIL.Image import Image
|
||||||
|
from transformers import AutoConfig, AutoTokenizer, BatchEncoding
|
||||||
|
|
||||||
|
from vllm.sequence import SampleLogprobs
|
||||||
|
from vllm.transformers_utils.tokenizer import patch_padding_side
|
||||||
|
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||||
|
|
||||||
|
from .....conftest import HfRunner, ImageAsset, _ImageAssets
|
||||||
|
from .types import RunnerOutput
|
||||||
|
|
||||||
|
|
||||||
|
####### vLLM output processors functions
|
||||||
|
def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
                            model: str) -> RunnerOutput:
    """Sanitize vllm output [blip2 models] to be comparable with hf output."""
    _, output_str, out_logprobs = vllm_output

    # HF's generated text ends with a trailing newline for this model family
    hf_output_str = output_str + "\n"

    # Re-tokenize the adjusted string; drop the leading BOS so the id list
    # matches what the HF runner produces for generated tokens only
    tokenizer = AutoTokenizer.from_pretrained(model)
    hf_output_ids = tokenizer.encode(hf_output_str)
    assert hf_output_ids[0] == tokenizer.bos_token_id
    hf_output_ids = hf_output_ids[1:]

    return hf_output_ids, hf_output_str, out_logprobs
|
||||||
|
|
||||||
|
|
||||||
|
def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
                           model: str) -> RunnerOutput:
    """Rewrite a vLLM result [fuyu models] so it is comparable with the HF
    runner's output: strip leading whitespace from the generated text and
    append fuyu's textual end-of-text marker. Ids/logprobs pass through."""
    output_ids, output_str, out_logprobs = vllm_output
    sanitized_str = "{}|ENDOFTEXT|".format(output_str.lstrip())
    return output_ids, sanitized_str, out_logprobs
|
||||||
|
|
||||||
|
|
||||||
|
def qwen_vllm_to_hf_output(
        vllm_output: RunnerOutput,
        model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]:
    """Rewrite a vLLM result [qwen models] so it is comparable with the HF
    runner's output by appending qwen's EOS marker to the generated text.
    Ids and logprobs pass through unchanged."""
    output_ids, output_str, out_logprobs = vllm_output
    sanitized_str = "".join((output_str, "<|endoftext|>"))
    return output_ids, sanitized_str, out_logprobs
|
||||||
|
|
||||||
|
|
||||||
|
def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput,
                                  model: str) -> RunnerOutput:
    """Sanitize vllm output [llava image models]; delegates to the shared
    llava sanitizer using the model config's image token id."""
    config = AutoConfig.from_pretrained(model)
    mm_token_id = config.image_token_index
    return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
|
||||||
|
|
||||||
|
|
||||||
|
def llava_video_vllm_to_hf_output(
        vllm_output: RunnerOutput,
        model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]:
    """Sanitize vllm output [llava video models]; delegates to the shared
    llava sanitizer using the model config's video token id."""
    config = AutoConfig.from_pretrained(model)
    mm_token_id = config.video_token_index
    return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
|
||||||
|
|
||||||
|
|
||||||
|
def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
                             mm_token_id: int) -> RunnerOutput:
    """Sanitize vllm output [Llava models] to be comparable with hf output.

    Collapses runs of the multimodal placeholder token to a single token,
    strips the leading space from the text, and re-appends the decoded EOS
    token to the string when the id sequence ends in EOS.
    """
    output_ids, output_str, out_logprobs = vllm_output

    tokenizer = AutoTokenizer.from_pretrained(model)
    eos_token_id = tokenizer.eos_token_id

    # Keep a token unless it repeats the mm placeholder immediately before it
    hf_output_ids = [
        token_id for idx, token_id in enumerate(output_ids)
        if token_id != mm_token_id or output_ids[idx - 1] != mm_token_id
    ]

    # vLLM's decoded text starts with a space for these models
    assert output_str[0] == " "
    hf_output_str = output_str[1:]
    if hf_output_ids[-1] == eos_token_id:
        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)

    return hf_output_ids, hf_output_str, out_logprobs
|
||||||
|
|
||||||
|
|
||||||
|
def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
                                      model: str) -> RunnerOutput:
    """Sanitize vllm output [llava-onevision] to compare with hf output.

    Same token-collapsing scheme as the other llava sanitizers, but keyed
    on the video token and without stripping a leading space.
    """
    output_ids, output_str, out_logprobs = vllm_output

    config = AutoConfig.from_pretrained(model)
    video_token_id = config.video_token_index

    tokenizer = AutoTokenizer.from_pretrained(model)
    eos_token_id = tokenizer.eos_token_id

    # Collapse consecutive video placeholder tokens into one
    hf_output_ids = [
        token_id for idx, token_id in enumerate(output_ids)
        if token_id != video_token_id or output_ids[idx - 1] != video_token_id
    ]

    hf_output_str = output_str
    if hf_output_ids[-1] == eos_token_id:
        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)

    return hf_output_ids, hf_output_str, out_logprobs
|
||||||
|
|
||||||
|
|
||||||
|
def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
                            model: str) -> RunnerOutput:
    """Sanitize vllm output [phi3v] to be comparable with hf output."""
    _, output_str, out_logprobs = vllm_output

    # Drop phi3v's numbered image placeholders and the leading space
    output_str_without_image = re.sub(r"(<\|image_\d+\|>)+", "", output_str)
    assert output_str_without_image[0] == " "
    output_str_without_image = output_str_without_image[1:]

    # HF text ends with the explicit end + endoftext markers
    hf_output_str = output_str_without_image + "<|end|><|endoftext|>"

    # Re-tokenize and drop the leading BOS (id 1 for this tokenizer)
    tokenizer = AutoTokenizer.from_pretrained(model)
    hf_output_ids = tokenizer.encode(output_str_without_image)
    assert hf_output_ids[0] == 1
    hf_output_ids = hf_output_ids[1:]

    return hf_output_ids, hf_output_str, out_logprobs
|
||||||
|
|
||||||
|
|
||||||
|
def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
                                model: str) -> RunnerOutput:
    """Sanitize vllm output [paligemma] to be comparable with hf output.

    Collapses repeated image placeholder tokens and re-appends the decoded
    EOS to the text when the id sequence ends in EOS.
    """
    output_ids, output_str, out_logprobs = vllm_output

    config = AutoConfig.from_pretrained(model)
    image_token_id = config.image_token_index

    tokenizer = AutoTokenizer.from_pretrained(model)
    eos_token_id = tokenizer.eos_token_id

    # Collapse consecutive image placeholder tokens into one
    hf_output_ids = [
        token_id for idx, token_id in enumerate(output_ids)
        if token_id != image_token_id or output_ids[idx - 1] != image_token_id
    ]

    hf_output_str = output_str

    if hf_output_ids[-1] == eos_token_id:
        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)

    return hf_output_ids, hf_output_str, out_logprobs
|
||||||
|
|
||||||
|
|
||||||
|
####### Post-processors for HF outputs
|
||||||
|
def minicmpv_trunc_hf_output(hf_output: RunnerOutput,
                             model: str) -> RunnerOutput:
    """Truncate the HF runner's generated text at the first <|eot_id|>
    marker when the text ends with it; ids/logprobs pass through."""
    output_ids, output_str, out_logprobs = hf_output
    marker = "<|eot_id|>"
    if output_str.endswith(marker):
        output_str, _, _ = output_str.partition(marker)
    return output_ids, output_str, out_logprobs
|
||||||
|
|
||||||
|
|
||||||
|
####### Functions for converting image assets to embeddings
|
||||||
|
def get_llava_embeddings(image_assets: _ImageAssets):
    """Collect the precomputed image-embedding tensor of every asset."""
    embeddings = []
    for asset in image_assets:
        embeddings.append(asset.image_embeds)
    return embeddings
|
||||||
|
|
||||||
|
|
||||||
|
####### postprocessors to run on HF BatchEncoding
|
||||||
|
def get_key_type_post_processor(
        hf_inp_key: str) -> Callable[[BatchEncoding, str], BatchEncoding]:
    """Build a post-processor that casts the tensor stored under
    ``hf_inp_key`` in the HF inputs to the torch dtype named by the
    ``dtype`` string, returning the mutated inputs."""

    def _cast_entry(hf_inputs: BatchEncoding, dtype: str):
        # Resolve the dtype string to its torch equivalent, then cast
        target_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
        hf_inputs[hf_inp_key] = hf_inputs[hf_inp_key].to(target_dtype)
        return hf_inputs

    return _cast_entry
|
||||||
|
|
||||||
|
|
||||||
|
def wrap_inputs_post_processor(hf_inputs: BatchEncoding, dtype: str):
    """Nest the processed HF inputs under a "model_inputs" key; ``dtype``
    is accepted for post-processor interface compatibility but unused."""
    return dict(model_inputs=hf_inputs)
|
||||||
|
|
||||||
|
|
||||||
|
####### Prompt path encoders for models that need models on disk
|
||||||
|
def qwen_prompt_path_encoder(
        tmp_path: PosixPath, prompt: str, assets: Union[List[ImageAsset],
                                                        _ImageAssets]) -> str:
    """Given a temporary dir path, export one or more image assets into the
    tempdir & replace its contents with the local path to the string so that
    the HF version of Qwen-VL can resolve the path and load the image in its
    forward() call.

    Args:
        tmp_path: Tempdir for test under consideration.
        prompt: Prompt with image placeholders.
        assets: List of image assets whose len equals the num placeholders.

    Returns:
        The prompt with each ``<img></img>`` placeholder filled with the
        exported image's on-disk path.
    """
    # Ensure that the number of placeholders matches the number of assets;
    # If this is not true, the test is probably written incorrectly.
    assert prompt.count("<img></img>") == len(assets)

    # Replace the placeholders with local paths to the exported assets
    for asset in assets:
        image_tmp_path = tmp_path / f"{asset.name}.jpg"
        asset.pil_image.save(image_tmp_path)
        # Only the first remaining placeholder is replaced each iteration,
        # keeping prompt order aligned with asset order
        prompt = prompt.replace(
            "<img></img>",
            f"<img>{image_tmp_path}</img>",
            1,
        )
    return prompt
|
||||||
|
|
||||||
|
|
||||||
|
####### Model-specific HuggingFace runner patchers
|
||||||
|
def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner to use for GLM4.

    Wraps the original processor so image-bearing calls go through the chat
    template, and points ``get_output_embeddings`` at the transformer's
    output layer.
    """
    hf_processor = hf_model.processor
    patch_padding_side(hf_processor)

    def processor(*args, text="", images=None, **kwargs):
        # Text-only calls fall through to the original processor unchanged
        if images is None:
            return hf_processor(*args, **kwargs)

        # Image calls are routed through the chat template, which tokenizes
        # and returns a dict of model inputs
        return hf_processor.apply_chat_template(
            [{
                "role": "user",
                "image": images,
                "content": text
            }],
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            **kwargs,
        )

    hf_model.processor = processor
    hf_model.model.get_output_embeddings = lambda: \
        hf_model.model.transformer.output_layer
    return hf_model
|
||||||
|
|
||||||
|
|
||||||
|
def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner to use for InternVL."""

    class InternVLProcessor:
        """A simple processor for InternVL2 which misses a processor."""

        def __init__(self, hf_runner: HfRunner):
            self.num_image_token = hf_runner.model.num_image_token
            self.tokenizer = hf_runner.tokenizer
            self.dtype = hf_runner.model.dtype

            # Pull the dynamic-patching parameters off the model config
            self.config = AutoConfig.from_pretrained(hf_runner.model_name,
                                                     trust_remote_code=True)
            self.vision_config = self.config.vision_config
            self.use_thumbnail = self.config.use_thumbnail
            self.min_num = self.config.min_dynamic_patch
            self.max_num = self.config.max_dynamic_patch
            self.image_size = self.vision_config.image_size

        def __call__(self, text: str, images: Union[Image, List[Image]],
                     **kwargs):
            # Import lazily to avoid loading vLLM model code at module import
            from vllm.model_executor.models.internvl import (
                IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
            images = [images] if isinstance(images, Image) else images
            # Convert each image into its (patches, C, H, W) pixel tensor
            pixel_values = [
                image_to_pixel_values(image, self.image_size, self.min_num,
                                      self.max_num,
                                      self.use_thumbnail).to(self.dtype)
                for image in images
            ]
            num_patches_list = [
                pixel_value.shape[0] for pixel_value in pixel_values
            ]
            pixel_values = torch.cat(pixel_values, dim=0)
            # Expand each <image> placeholder to the per-patch context tokens
            # wrapped in the image start/end markers, one image at a time
            for num_patches in num_patches_list:
                context_tokens = IMG_CONTEXT * self.num_image_token \
                    * num_patches
                image_tokens = IMG_START + context_tokens + IMG_END
                text = text.replace('<image>', image_tokens, 1)
            prompt = self.tokenizer(text, return_tensors="pt")
            prompt.update({"pixel_values": pixel_values})
            return prompt

    img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
        "<IMG_CONTEXT>")
    hf_model.model.img_context_token_id = img_context_token_id
    hf_model.processor = InternVLProcessor(hf_model)
    hf_model.model.get_output_embeddings = lambda: \
        hf_model.model.language_model.get_output_embeddings()
    # Replace generate with a variant that does not hard-code use_cache
    hf_model.model.generate = types.MethodType(_internvl_generate,
                                               hf_model.model)
    return hf_model
|
||||||
|
|
||||||
|
|
||||||
|
def _internvl_generate(
    self,
    pixel_values: torch.FloatTensor,
    input_ids: torch.FloatTensor,
    attention_mask: Optional[torch.LongTensor] = None,
    **generate_kwargs,
) -> torch.LongTensor:
    """Generate method for InternVL2 model without fixed use_cache.

    Splices the vision-tower features into the text embeddings at every
    IMG_CONTEXT position, then delegates to the language model's generate.
    """
    assert self.img_context_token_id is not None
    vit_embeds = self.extract_feature(pixel_values)
    input_embeds = self.language_model.get_input_embeddings()(input_ids)
    B, N, C = input_embeds.shape
    # Flatten to (B*N, C) so image positions can be overwritten by mask
    input_embeds = input_embeds.reshape(B * N, C)

    input_ids = input_ids.reshape(B * N)
    selected = (input_ids == self.img_context_token_id)
    # Every prompt is expected to contain at least one image context token
    assert selected.sum() != 0
    input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)

    input_embeds = input_embeds.reshape(B, N, C)

    forward_kwargs = dict(
        inputs_embeds=input_embeds,
        attention_mask=attention_mask,
    )
    # Some InternVL variants expose a visual token mask; forward it if so
    if getattr(self, "use_visual_token_mask", False):
        visual_token_mask = selected.reshape(B, N, 1).to(input_embeds.dtype)
        forward_kwargs["visual_token_mask"] = visual_token_mask
    outputs = self.language_model.generate(
        **forward_kwargs,
        **generate_kwargs,
    )

    return outputs
|
||||||
130
tests/models/decoder_only/vision_language/vlm_utils/runners.py
Normal file
130
tests/models/decoder_only/vision_language/vlm_utils/runners.py
Normal file
@ -0,0 +1,130 @@
|
|||||||
|
"""Entrypoints for wrapping the core run_test implementation for specific test
|
||||||
|
types / modalities.
|
||||||
|
"""
|
||||||
|
from pathlib import PosixPath
|
||||||
|
from typing import Type
|
||||||
|
|
||||||
|
from .....conftest import HfRunner, VllmRunner, _ImageAssets, _VideoAssets
|
||||||
|
from . import builders, core
|
||||||
|
from .types import ExpandableVLMTestArgs, VLMTestInfo
|
||||||
|
|
||||||
|
|
||||||
|
####### Entrypoints for running different test types
|
||||||
|
def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
                          test_case: ExpandableVLMTestArgs,
                          hf_runner: Type[HfRunner],
                          vllm_runner: Type[VllmRunner],
                          image_assets: _ImageAssets):
    """Builds single-image inputs for one expanded test case and compares
    HF/vLLM outputs through the shared core.run_test executor."""
    assert test_case.size_wrapper is not None
    inputs = builders.build_single_image_inputs_from_test_info(
        model_test_info, image_assets, test_case.size_wrapper, tmp_path)

    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        # Single image per prompt
        limit_mm_per_prompt={"image": 1},
        distributed_executor_backend=test_case.distributed_executor_backend,
        **model_test_info.get_non_parametrized_runner_kwargs())
|
||||||
|
|
||||||
|
|
||||||
|
def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
                         test_case: ExpandableVLMTestArgs,
                         hf_runner: Type[HfRunner],
                         vllm_runner: Type[VllmRunner],
                         image_assets: _ImageAssets):
    """Builds multi-image inputs for one expanded test case and compares
    HF/vLLM outputs through the shared core.run_test executor."""
    assert test_case.size_wrapper is not None
    inputs = builders.build_multi_image_inputs_from_test_info(
        model_test_info, image_assets, test_case.size_wrapper, tmp_path)

    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        # Allow as many images per prompt as there are assets
        limit_mm_per_prompt={"image": len(image_assets)},
        distributed_executor_backend=test_case.distributed_executor_backend,
        **model_test_info.get_non_parametrized_runner_kwargs())
|
||||||
|
|
||||||
|
|
||||||
|
def run_embedding_test(*, model_test_info: VLMTestInfo,
                       test_case: ExpandableVLMTestArgs,
                       hf_runner: Type[HfRunner],
                       vllm_runner: Type[VllmRunner],
                       image_assets: _ImageAssets):
    """Builds image-embedding inputs for one expanded test case; HF gets the
    raw inputs while vLLM gets the precomputed embeddings."""
    assert test_case.size_wrapper is not None
    inputs, vllm_embeddings = builders.build_embedding_inputs_from_test_info(
        model_test_info, image_assets, test_case.size_wrapper)

    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt={"image": 1},
        # Embeddings replace image inputs on the vLLM side only
        vllm_embeddings=vllm_embeddings,
        distributed_executor_backend=test_case.distributed_executor_backend,
        **model_test_info.get_non_parametrized_runner_kwargs())
|
||||||
|
|
||||||
|
|
||||||
|
def run_video_test(
    *,
    model_test_info: VLMTestInfo,
    test_case: ExpandableVLMTestArgs,
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    video_assets: _VideoAssets,
):
    """Builds video inputs (with the requested frame count) for one expanded
    test case and compares HF/vLLM outputs through core.run_test."""
    assert test_case.size_wrapper is not None
    assert test_case.num_video_frames is not None
    inputs = builders.build_video_inputs_from_test_info(
        model_test_info, video_assets, test_case.size_wrapper,
        test_case.num_video_frames)

    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        # Allow as many videos per prompt as there are assets
        limit_mm_per_prompt={"video": len(video_assets)},
        distributed_executor_backend=test_case.distributed_executor_backend,
        **model_test_info.get_non_parametrized_runner_kwargs())
|
||||||
|
|
||||||
|
|
||||||
|
def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
                           test_case: ExpandableVLMTestArgs,
                           hf_runner: Type[HfRunner],
                           vllm_runner: Type[VllmRunner]):
    """Runs a test whose inputs are supplied directly by the test case's
    custom-input options rather than built from assets."""
    # Custom test cases can provide inputs directly, but they need to
    # explicitly provided a CustomTestConfig, which wraps the inputs and
    # the limit_mm_per_prompt
    assert test_case.custom_test_opts is not None

    inputs = test_case.custom_test_opts.inputs
    limit_mm_per_prompt = test_case.custom_test_opts.limit_mm_per_prompt
    assert inputs is not None and limit_mm_per_prompt is not None

    core.run_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        inputs=inputs,
        model=test_case.model,
        dtype=test_case.dtype,
        max_tokens=test_case.max_tokens,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt=limit_mm_per_prompt,
        distributed_executor_backend=test_case.distributed_executor_backend,
        **model_test_info.get_non_parametrized_runner_kwargs())
|
||||||
187
tests/models/decoder_only/vision_language/vlm_utils/types.py
Normal file
187
tests/models/decoder_only/vision_language/vlm_utils/types.py
Normal file
@ -0,0 +1,187 @@
|
|||||||
|
"""Types for writing multimodal model tests."""
|
||||||
|
from enum import Enum
|
||||||
|
from pathlib import PosixPath
|
||||||
|
from typing import (Any, Callable, Dict, Iterable, List, NamedTuple, Optional,
|
||||||
|
Tuple, Type, Union)
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from PIL.Image import Image
|
||||||
|
from pytest import MarkDecorator
|
||||||
|
from transformers import AutoModelForCausalLM, AutoTokenizer, BatchEncoding
|
||||||
|
from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
||||||
|
|
||||||
|
from vllm.sequence import SampleLogprobs
|
||||||
|
from vllm.utils import identity
|
||||||
|
|
||||||
|
from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, _ImageAssets
|
||||||
|
from ....utils import check_logprobs_close
|
||||||
|
|
||||||
|
# meta image tag; will be replaced by the appropriate tag for the model
|
||||||
|
TEST_IMG_PLACEHOLDER = "<vlm_image>"
|
||||||
|
TEST_VIDEO_PLACEHOLDER = "<vlm_video>"
|
||||||
|
|
||||||
|
# yapf: disable
|
||||||
|
SINGLE_IMAGE_BASE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||||
|
"stop_sign": f"{TEST_IMG_PLACEHOLDER}What's the content of the image?",
|
||||||
|
"cherry_blossom": f"{TEST_IMG_PLACEHOLDER}What is the season?",
|
||||||
|
})
|
||||||
|
|
||||||
|
MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PLACEHOLDER}Describe the two images in detail.\n" # noqa: E501
|
||||||
|
VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?"
|
||||||
|
|
||||||
|
|
||||||
|
IMAGE_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
|
||||||
|
EMBEDDING_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0)]
|
||||||
|
RunnerOutput = Tuple[List[int], str, Optional[SampleLogprobs]]
|
||||||
|
# yapf: enable
|
||||||
|
|
||||||
|
|
||||||
|
class VLMTestType(Enum):
    """Kinds of multimodal tests that can be generated for a model."""
    IMAGE = 1
    MULTI_IMAGE = 2
    EMBEDDING = 3
    VIDEO = 4
    CUSTOM_INPUTS = 5
|
class SizeType(Enum):
    """How image sizes are expressed: relative factors or fixed pixel sizes."""
    SIZE_FACTOR = 1
    FIXED_SIZE = 2
class CustomTestOptions(NamedTuple):
    """Inputs and multimodal limits for a CUSTOM_INPUTS test."""
    # Each entry pairs a list of prompts with their corresponding images
    # (either one image or a list of images per prompt).
    inputs: List[Tuple[List[str], List[Union[List[Image], Image]]]]
    # Per-modality cap passed to the vLLM runner, e.g. {"image": 2}.
    limit_mm_per_prompt: Dict[str, int]
class ImageSizeWrapper(NamedTuple):
    """Tags size data with whether it holds scale factors or fixed sizes."""
    type: SizeType
    # A size factor is a wrapper of 0+ floats,
    # while a fixed size contains an iterable of integer pairs
    data: Union[Iterable[float], Iterable[Tuple[int, int]]]
class VLMTestInfo(NamedTuple):
    """Holds the configuration for 1+ tests for one model architecture."""

    # Redundant single-arg Union[List[str]] collapsed to List[str];
    # at runtime Union[X] is just X, so this is purely an annotation fix.
    models: List[str]
    # One test type, or an iterable of types to generate for each model.
    test_type: Union[VLMTestType, Iterable[VLMTestType]]

    # Should be None only if this is a CUSTOM_INPUTS test
    prompt_formatter: Optional[Callable[[str], str]] = None
    # Maps an image/video index to the placeholder tag for this model.
    img_idx_to_prompt: Callable[[int], str] = lambda idx: "<image>\n"
    video_idx_to_prompt: Callable[[int], str] = lambda idx: "<video>\n"

    # Most models work on the single / multi-image prompts above, but in some
    # cases the log prob check fails, e.g., for paligemma. We allow passing
    # an override for the single image prompts / multi-image prompt for this
    # reason.
    single_image_prompts: Iterable[str] = SINGLE_IMAGE_BASE_PROMPTS
    multi_image_prompt: str = MULTI_IMAGE_BASE_PROMPT

    # Function for converting ImageAssets to image embeddings;
    # We need to define this explicitly for embedding tests
    convert_assets_to_embeddings: Optional[Callable[[_ImageAssets],
                                                    torch.Tensor]] = None

    # Exposed options for vLLM runner; we change these in a several tests,
    # but the defaults are derived from VllmRunner & the engine defaults
    # These settings are chosen to avoid OOMs when running in the CI
    enforce_eager: bool = True
    max_model_len: int = 1024
    max_num_seqs: int = 256
    task: str = "auto"
    tensor_parallel_size: int = 1

    # Optional callable which gets a list of token IDs from the model tokenizer
    get_stop_token_ids: Optional[Callable[[AutoTokenizer], List[int]]] = None

    # Exposed options for HF runner
    model_kwargs: Optional[Dict[str, Any]] = None
    # Indicates we should explicitly pass the EOS from the tokenizer
    use_tokenizer_eos: bool = False
    auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM
    # Callable to pass to the HF runner to run on inputs; for now, we also pass
    # the data type to input post processing, because almost all of the uses of
    # postprocess_inputs are to fix the data types of BatchEncoding values.
    postprocess_inputs: Callable[[BatchEncoding, str],
                                 BatchEncoding] = identity
    patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]] = None

    # Post processors that if defined, will run on the outputs of the
    # vLLM and HF runner, respectively (useful for sanitization, etc).
    vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]] = None
    hf_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]] = None

    # Consumes the output of the callables above and checks if they're equal
    comparator: Callable[..., None] = check_logprobs_close

    # Default expandable params per test; these defaults can be overridden in
    # instances of this object; the complete set of test cases for the model
    # is all combinations of .models + all fields below
    max_tokens: Union[int, Tuple[int]] = 128
    num_logprobs: Union[int, Tuple[int]] = 5
    dtype: Union[str, Iterable[str]] = "half"
    distributed_executor_backend: Optional[Union[str, Iterable[str]]] = None
    # Only expanded in video tests
    num_video_frames: Union[int, Tuple[int]] = 16

    # Fixed image sizes / image size factors; most tests use image_size_factors
    # The values provided for these two fields will be stacked and expanded
    # such that each model will consider each image size factor / image size
    # once per tests (much like concatenating and wrapping in one parametrize
    # call)
    image_size_factors: Iterable[Iterable[float]] = IMAGE_SIZE_FACTORS
    image_sizes: Optional[Iterable[Iterable[Tuple[int, int]]]] = None

    # Hack for updating a prompt to take into a local path; currently only used
    # for Qwen-VL, which requires encoding the image path / url into the prompt
    # for HF runner
    prompt_path_encoder: Optional[
        Callable[[PosixPath, str, Union[List[ImageAsset], _ImageAssets]],
                 str]] = None  # noqa: E501

    # kwarg to pass multimodal data in as to vllm/hf runner instances
    runner_mm_key: str = "images"

    # Allows configuring a test to run with custom inputs
    custom_test_opts: Optional[List[CustomTestOptions]] = None

    # Extra pytest marks (e.g. skipif) applied to every generated test.
    marks: Optional[List[MarkDecorator]] = None

    def get_non_parametrized_runner_kwargs(self):
        """Returns a dictionary of expandable kwargs for items that are used
        in all test types, which are NOT used when creating the parametrized
        test cases.
        """
        return {
            "enforce_eager": self.enforce_eager,
            "max_model_len": self.max_model_len,
            "max_num_seqs": self.max_num_seqs,
            "task": self.task,
            "hf_output_post_proc": self.hf_output_post_proc,
            "vllm_output_post_proc": self.vllm_output_post_proc,
            "auto_cls": self.auto_cls,
            "use_tokenizer_eos": self.use_tokenizer_eos,
            "postprocess_inputs": self.postprocess_inputs,
            "comparator": self.comparator,
            "get_stop_token_ids": self.get_stop_token_ids,
            "model_kwargs": self.model_kwargs,
            "patch_hf_runner": self.patch_hf_runner,
            "runner_mm_key": self.runner_mm_key,
        }
class ExpandableVLMTestArgs(NamedTuple):
    """The expanded kwargs which correspond to a single test case."""
    model: str
    max_tokens: int
    num_logprobs: int
    dtype: str
    distributed_executor_backend: Optional[str]
    # Sizes are used for everything except for custom input tests
    size_wrapper: Optional[ImageSizeWrapper] = None
    # Video only
    num_video_frames: Optional[int] = None
    # Custom inputs only
    custom_test_opts: Optional[CustomTestOptions] = None
@ -85,6 +85,8 @@ def _run_test(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# FIXME
|
||||||
|
@pytest.mark.skip(reason="LLava next embedding tests currently fail")
|
||||||
@pytest.mark.parametrize("model", MODELS)
|
@pytest.mark.parametrize("model", MODELS)
|
||||||
@pytest.mark.parametrize("dtype", ["half"])
|
@pytest.mark.parametrize("dtype", ["half"])
|
||||||
def test_models_text(
|
def test_models_text(
|
||||||
|
|||||||
@ -192,7 +192,7 @@ def _run_test(
|
|||||||
for prompts, images in inputs
|
for prompts, images in inputs
|
||||||
]
|
]
|
||||||
|
|
||||||
def process(hf_inputs: BatchEncoding):
|
def process(hf_inputs: BatchEncoding, **kwargs):
|
||||||
return hf_inputs
|
return hf_inputs
|
||||||
|
|
||||||
with hf_runner(model,
|
with hf_runner(model,
|
||||||
|
|||||||
@ -561,12 +561,11 @@ def fork_new_process_for_each_test(
|
|||||||
return wrapper
|
return wrapper
|
||||||
|
|
||||||
|
|
||||||
def large_gpu_test(*, min_gb: int):
|
def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator:
|
||||||
"""
|
"""Gets a pytest skipif mark, which triggers ig the the device doesn't have
|
||||||
Decorate a test to be skipped if no GPU is available or it does not have
|
meet a minimum memory requirement in gb; can be leveraged via
|
||||||
sufficient memory.
|
@large_gpu_test to skip tests in environments without enough resources, or
|
||||||
|
called when filtering tests to run directly.
|
||||||
Currently, the CI machine uses L4 GPU which has 24 GB VRAM.
|
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
if current_platform.is_cpu():
|
if current_platform.is_cpu():
|
||||||
@ -578,14 +577,23 @@ def large_gpu_test(*, min_gb: int):
|
|||||||
f"An error occurred when finding the available memory: {e}",
|
f"An error occurred when finding the available memory: {e}",
|
||||||
stacklevel=2,
|
stacklevel=2,
|
||||||
)
|
)
|
||||||
|
|
||||||
memory_gb = 0
|
memory_gb = 0
|
||||||
|
|
||||||
test_skipif = pytest.mark.skipif(
|
return pytest.mark.skipif(
|
||||||
memory_gb < min_gb,
|
memory_gb < min_gb,
|
||||||
reason=f"Need at least {memory_gb}GB GPU memory to run the test.",
|
reason=f"Need at least {memory_gb}GB GPU memory to run the test.",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def large_gpu_test(*, min_gb: int):
|
||||||
|
"""
|
||||||
|
Decorate a test to be skipped if no GPU is available or it does not have
|
||||||
|
sufficient memory.
|
||||||
|
|
||||||
|
Currently, the CI machine uses L4 GPU which has 24 GB VRAM.
|
||||||
|
"""
|
||||||
|
test_skipif = large_gpu_mark(min_gb)
|
||||||
|
|
||||||
def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
|
def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
|
||||||
return test_skipif(f)
|
return test_skipif(f)
|
||||||
|
|
||||||
|
|||||||
@ -977,7 +977,8 @@ def enable_trace_function_call_for_thread() -> None:
|
|||||||
|
|
||||||
|
|
||||||
# `functools` helpers
|
# `functools` helpers
|
||||||
def identity(value: T) -> T:
|
def identity(value: T, **kwargs) -> T:
|
||||||
|
"""Returns the first provided value."""
|
||||||
return value
|
return value
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user