[Misc] Rename assets for testing (#17575)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Cyrus Leung authored on 2025-05-02 18:29:25 +08:00; committed by GitHub
parent c777df79f7
commit d7543862bd
GPG Key ID: B5690EEEBB952194
28 changed files with 145 additions and 131 deletions
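In short: the test-only video asset "sample_demo_1" is renamed to "baby_reading" (the file stored in the public assets bucket keeps its original name), and the private conftest helper classes become public: _ImageAssets -> ImageTestAssets, _VideoAssets -> VideoTestAssets, _AudioAssets -> AudioTestAssets. A minimal before/after sketch of the rename as it appears to callers (illustrative only, not part of the diff below):

    # Before this commit
    video = VideoAsset(name="sample_demo_1", num_frames=16).np_ndarrays

    # After this commit
    video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays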

View File

@@ -47,7 +47,7 @@ def get_mixed_modalities_query() -> QueryResult:
             "image":
             ImageAsset("cherry_blossom").pil_image.convert("RGB"),
             "video":
-            VideoAsset(name="sample_demo_1", num_frames=16).np_ndarrays,
+            VideoAsset(name="baby_reading", num_frames=16).np_ndarrays,
         },
     },
     limit_mm_per_prompt={
@@ -65,7 +65,7 @@ def get_use_audio_in_video_query() -> QueryResult:
              "<|im_start|>user\n<|vision_bos|><|VIDEO|><|vision_eos|>"
              f"{question}<|im_end|>\n"
              f"<|im_start|>assistant\n")
-    asset = VideoAsset(name="sample_demo_1", num_frames=16)
+    asset = VideoAsset(name="baby_reading", num_frames=16)
     audio = asset.get_audio(sampling_rate=16000)
     assert not envs.VLLM_USE_V1, ("V1 does not support use_audio_in_video. "
                                   "Please launch this example with "

View File

@@ -1109,7 +1109,7 @@ def get_multi_modal_input(args):
     if args.modality == "video":
         # Input video and question
-        video = VideoAsset(name="sample_demo_1",
+        video = VideoAsset(name="baby_reading",
                            num_frames=args.num_frames).np_ndarrays
         vid_questions = ["Why is this video funny?"]

View File

@@ -1,9 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 import json
 import os
 import tempfile
-from collections import UserList
 from enum import Enum
 from typing import Any, Callable, Optional, TypedDict, TypeVar, Union
@@ -58,16 +56,12 @@ def _read_prompts(filename: str) -> list[str]:
     return prompts
-class _ImageAssetPrompts(TypedDict):
+class ImageAssetPrompts(TypedDict):
     stop_sign: str
     cherry_blossom: str
-class _ImageAssetsBase(UserList[ImageAsset]):
-    pass
-class _ImageAssets(_ImageAssetsBase):
+class ImageTestAssets(list[ImageAsset]):
     def __init__(self) -> None:
         super().__init__([
@@ -75,7 +69,7 @@ class _ImageAssets(_ImageAssetsBase):
             ImageAsset("cherry_blossom"),
         ])
-    def prompts(self, prompts: _ImageAssetPrompts) -> list[str]:
+    def prompts(self, prompts: ImageAssetPrompts) -> list[str]:
         """
         Convenience method to define the prompt for each test image.
@@ -85,35 +79,27 @@ class _ImageAssets(_ImageAssetsBase):
         return [prompts["stop_sign"], prompts["cherry_blossom"]]
-class _VideoAssetPrompts(TypedDict):
-    sample_demo_1: str
+class VideoAssetPrompts(TypedDict):
+    baby_reading: str
-class _VideoAssetsBase(UserList[VideoAsset]):
-    pass
-class _VideoAssets(_VideoAssetsBase):
+class VideoTestAssets(list[VideoAsset]):
     def __init__(self) -> None:
         super().__init__([
-            VideoAsset("sample_demo_1"),
+            VideoAsset("baby_reading"),
         ])
-    def prompts(self, prompts: _VideoAssetPrompts) -> list[str]:
-        return [prompts["sample_demo_1"]]
+    def prompts(self, prompts: VideoAssetPrompts) -> list[str]:
+        return [prompts["baby_reading"]]
-class _AudioAssetPrompts(TypedDict):
+class AudioAssetPrompts(TypedDict):
     mary_had_lamb: str
     winning_call: str
-class _AudioAssetsBase(UserList[AudioAsset]):
-    pass
-class _AudioAssets(_AudioAssetsBase):
+class AudioTestAssets(list[AudioAsset]):
     def __init__(self) -> None:
         super().__init__([
@@ -121,16 +107,16 @@ class _AudioAssets(_AudioAssetsBase):
             AudioAsset("winning_call"),
         ])
-    def prompts(self, prompts: _AudioAssetPrompts) -> list[str]:
+    def prompts(self, prompts: AudioAssetPrompts) -> list[str]:
         return [prompts["mary_had_lamb"], prompts["winning_call"]]
-IMAGE_ASSETS = _ImageAssets()
-"""Singleton instance of :class:`_ImageAssets`."""
-VIDEO_ASSETS = _VideoAssets()
-"""Singleton instance of :class:`_VideoAssets`."""
-AUDIO_ASSETS = _AudioAssets()
-"""Singleton instance of :class:`_AudioAssets`."""
+IMAGE_ASSETS = ImageTestAssets()
+"""Singleton instance of :class:`ImageTestAssets`."""
+VIDEO_ASSETS = VideoTestAssets()
+"""Singleton instance of :class:`VideoTestAssets`."""
+AUDIO_ASSETS = AudioTestAssets()
+"""Singleton instance of :class:`AudioTestAssets`."""
 @pytest.fixture(scope="function", autouse=True)
@@ -278,17 +264,17 @@ def example_long_prompts() -> list[str]:
 @pytest.fixture(scope="session")
-def image_assets() -> _ImageAssets:
+def image_assets() -> ImageTestAssets:
     return IMAGE_ASSETS
 @pytest.fixture(scope="session")
-def video_assets() -> _VideoAssets:
+def video_assets() -> VideoTestAssets:
     return VIDEO_ASSETS
 @pytest.fixture(scope="session")
-def audio_assets() -> _AudioAssets:
+def audio_assets() -> AudioTestAssets:
     return AUDIO_ASSETS
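For context, a minimal sketch of how a test consumes the renamed fixtures above. The prompt strings are placeholders, not taken from this commit:

    def test_example(image_assets: ImageTestAssets) -> None:
        # The session-scoped fixture returns the IMAGE_ASSETS singleton.
        prompts = image_assets.prompts({
            "stop_sign": "What does this sign say?",  # placeholder prompt
            "cherry_blossom": "What season is this?",  # placeholder prompt
        })
        images = [asset.pil_image for asset in image_assets]
        assert len(prompts) == len(images) == 2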

View File

@@ -13,8 +13,8 @@ from transformers import AutoModelForImageTextToText, AutoModelForVision2Seq
 from vllm.platforms import current_platform
 from vllm.utils import identity
-from ....conftest import (IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets,
-                          _VideoAssets)
+from ....conftest import (IMAGE_ASSETS, HfRunner, ImageTestAssets,
+                          VideoTestAssets, VllmRunner)
 from ....utils import (create_new_process_for_each_test, large_gpu_mark,
                        multi_gpu_marks)
 from ...utils import check_outputs_equal
@@ -691,7 +691,7 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str,
                             test_case: ExpandableVLMTestArgs,
                             hf_runner: type[HfRunner],
                             vllm_runner: type[VllmRunner],
-                            image_assets: _ImageAssets, monkeypatch):
+                            image_assets: ImageTestAssets, monkeypatch):
     if model_type in REQUIRES_V0_MODELS:
         monkeypatch.setenv("VLLM_USE_V1", "0")
     model_test_info = VLM_TEST_SETTINGS[model_type]
@@ -716,7 +716,7 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str,
                            test_case: ExpandableVLMTestArgs,
                            hf_runner: type[HfRunner],
                            vllm_runner: type[VllmRunner],
-                           image_assets: _ImageAssets, monkeypatch):
+                           image_assets: ImageTestAssets, monkeypatch):
     if model_type in REQUIRES_V0_MODELS:
         monkeypatch.setenv("VLLM_USE_V1", "0")
     model_test_info = VLM_TEST_SETTINGS[model_type]
@@ -741,7 +741,7 @@ def test_image_embedding_models(model_type: str,
                                test_case: ExpandableVLMTestArgs,
                                hf_runner: type[HfRunner],
                                vllm_runner: type[VllmRunner],
-                               image_assets: _ImageAssets, monkeypatch):
+                               image_assets: ImageTestAssets, monkeypatch):
     if model_type in REQUIRES_V0_MODELS:
         monkeypatch.setenv("VLLM_USE_V1", "0")
     model_test_info = VLM_TEST_SETTINGS[model_type]
@@ -763,7 +763,7 @@ def test_image_embedding_models(model_type: str,
 ))
 def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
                       hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
-                      video_assets: _VideoAssets, monkeypatch):
+                      video_assets: VideoTestAssets, monkeypatch):
     if model_type in REQUIRES_V0_MODELS:
         monkeypatch.setenv("VLLM_USE_V1", "0")
     model_test_info = VLM_TEST_SETTINGS[model_type]
@@ -814,7 +814,7 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
                                    test_case: ExpandableVLMTestArgs,
                                    hf_runner: type[HfRunner],
                                    vllm_runner: type[VllmRunner],
-                                   image_assets: _ImageAssets, monkeypatch):
+                                   image_assets: ImageTestAssets, monkeypatch):
     if model_type in REQUIRES_V0_MODELS:
         monkeypatch.setenv("VLLM_USE_V1", "0")
     model_test_info = VLM_TEST_SETTINGS[model_type]
@@ -840,7 +840,7 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
                                   test_case: ExpandableVLMTestArgs,
                                   hf_runner: type[HfRunner],
                                   vllm_runner: type[VllmRunner],
-                                  image_assets: _ImageAssets, monkeypatch):
+                                  image_assets: ImageTestAssets, monkeypatch):
     if model_type in REQUIRES_V0_MODELS:
         monkeypatch.setenv("VLLM_USE_V1", "0")
     model_test_info = VLM_TEST_SETTINGS[model_type]
@@ -866,7 +866,8 @@ def test_image_embedding_models_heavy(model_type: str,
                                       test_case: ExpandableVLMTestArgs,
                                       hf_runner: type[HfRunner],
                                       vllm_runner: type[VllmRunner],
-                                      image_assets: _ImageAssets, monkeypatch):
+                                      image_assets: ImageTestAssets,
+                                      monkeypatch):
     if model_type in REQUIRES_V0_MODELS:
         monkeypatch.setenv("VLLM_USE_V1", "0")
     model_test_info = VLM_TEST_SETTINGS[model_type]
@@ -889,7 +890,7 @@ def test_image_embedding_models_heavy(model_type: str,
 def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
                             hf_runner: type[HfRunner],
                             vllm_runner: type[VllmRunner],
-                            video_assets: _VideoAssets, monkeypatch):
+                            video_assets: VideoTestAssets, monkeypatch):
     if model_type in REQUIRES_V0_MODELS:
         monkeypatch.setenv("VLLM_USE_V1", "0")
     model_test_info = VLM_TEST_SETTINGS[model_type]

View File

@@ -9,7 +9,7 @@ from vllm.inputs.data import ExplicitEncoderDecoderPrompt, TextPrompt
 from vllm.multimodal.image import rescale_image_size
 from vllm.sequence import SampleLogprobs
-from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from ....conftest import IMAGE_ASSETS, HfRunner, ImageTestAssets, VllmRunner
 from ...utils import check_logprobs_close
 MODELS = ["microsoft/Florence-2-base"]
@@ -118,7 +118,7 @@ def run_test(
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])
 def test_models(hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
-                image_assets: _ImageAssets, model: str,
+                image_assets: ImageTestAssets, model: str,
                 size_factors: list[int], dtype: str, max_tokens: int,
                 num_logprobs: int) -> None:
     images = [asset.pil_image for asset in image_assets]

View File

@@ -9,7 +9,8 @@ from transformers import AutoModelForSpeechSeq2Seq
 from vllm.lora.request import LoRARequest
 from vllm.sequence import SampleLogprobs
-from ....conftest import HfRunner, PromptAudioInput, VllmRunner, _AudioAssets
+from ....conftest import (AudioTestAssets, HfRunner, PromptAudioInput,
+                          VllmRunner)
 from ...registry import HF_EXAMPLE_MODELS
 from ...utils import check_logprobs_close
@@ -116,9 +117,9 @@ def run_test(
 @pytest.mark.parametrize("max_model_len", [2048])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [10])
-def test_models(hf_runner, vllm_runner, model: str, audio_assets: _AudioAssets,
-                dtype: str, max_model_len: int, max_tokens: int,
-                num_logprobs: int) -> None:
+def test_models(hf_runner, vllm_runner, model: str,
+                audio_assets: AudioTestAssets, dtype: str, max_model_len: int,
+                max_tokens: int, num_logprobs: int) -> None:
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
     model_info.check_available_online(on_fail="skip")
     model_info.check_transformers_version(on_fail="skip")

View File

@@ -29,7 +29,7 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
     image_cherry = ImageAsset("cherry_blossom").pil_image.convert("RGB")
     image_stop = ImageAsset("stop_sign").pil_image.convert("RGB")
     images = [image_cherry, image_stop]
-    video = VideoAsset(name="sample_demo_1", num_frames=16).np_ndarrays
+    video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays
     inputs = [
         (

View File

@@ -14,8 +14,8 @@ from vllm.model_executor.models.mllama import MllamaForConditionalGeneration
 from vllm.multimodal.image import rescale_image_size
 from vllm.sequence import SampleLogprobs
-from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
-                          _ImageAssets)
+from ....conftest import (IMAGE_ASSETS, HfRunner, ImageTestAssets,
+                          PromptImageInput, VllmRunner)
 from ....quantization.utils import is_quant_method_supported
 from ....utils import (create_new_process_for_each_test, large_gpu_test,
                        multi_gpu_test)
@@ -90,7 +90,7 @@ def vllm_to_hf_output(vllm_output: tuple[list[int], str,
 def _get_inputs(
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
     *,
     size_factors: Optional[list[float]] = None,
     sizes: Optional[list[tuple[int, int]]] = None,
@@ -126,7 +126,7 @@ def _get_inputs(
 def run_test(
     hf_runner: type[HfRunner],
     vllm_runner: type[VllmRunner],
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
     model: str,
     *,
     size_factors: list[float],
@@ -143,7 +143,7 @@ def run_test(
 def run_test(
     hf_runner: type[HfRunner],
     vllm_runner: type[VllmRunner],
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
     model: str,
     *,
     sizes: list[tuple[int, int]],
@@ -159,7 +159,7 @@ def run_test(
 def run_test(
     hf_runner: type[HfRunner],
     vllm_runner: type[VllmRunner],
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
     model: str,
     *,
     size_factors: Optional[list[float]] = None,
@@ -433,7 +433,7 @@ def test_models_distributed(
 @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                     reason='bitsandbytes is not supported on this GPU type.')
 def test_bnb_regression(
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
     model: str,
     dtype: str,
     max_tokens: int,
@@ -473,7 +473,7 @@ def test_bnb_regression(
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [32])
 def test_explicit_implicit_prompt(
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
     model: str,
     dtype: str,
     max_tokens: int,

View File

@@ -50,7 +50,7 @@ IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
 })
 VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
-    "sample_demo_1":
+    "baby_reading":
     qwen2_vl_chat_template(
         VIDEO_PLACEHOLDER,
         "Describe this video with a short sentence ",

View File

@@ -11,13 +11,22 @@ from transformers import AutoModel, AutoTokenizer
 from vllm.multimodal.audio import resample_audio_librosa
 from vllm.sequence import SampleLogprobs
-from ....conftest import HfRunner, VllmRunner, _AudioAssets
+from ....conftest import AUDIO_ASSETS, AudioTestAssets, HfRunner, VllmRunner
 from ....utils import RemoteOpenAIServer
 from ...registry import HF_EXAMPLE_MODELS
 from ...utils import check_logprobs_close
 MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
+AUDIO_PROMPTS = AUDIO_ASSETS.prompts({
+    "mary_had_lamb":
+    "Transcribe this into English.",
+    "winning_call":
+    "What is happening in this audio clip?",
+})
+MULTI_AUDIO_PROMPT = "Describe each of the audios above."
 AudioTuple = tuple[np.ndarray, int]
 VLLM_PLACEHOLDER = "<|audio|>"
@@ -31,12 +40,6 @@ CHUNKED_PREFILL_KWARGS = {
 }
-@pytest.fixture(scope="module", params=("mary_had_lamb", "winning_call"))
-def audio(request):
-    from vllm.assets.audio import AudioAsset
-    return AudioAsset(request.param)
 def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]:
     """Convert kwargs to CLI args."""
     args = []
@@ -53,7 +56,7 @@ def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]:
     pytest.param({}, marks=pytest.mark.cpu_model),
     pytest.param(CHUNKED_PREFILL_KWARGS),
 ])
-def server(request, audio_assets: _AudioAssets):
+def server(request, audio_assets: AudioTestAssets):
     args = [
         "--dtype", "bfloat16", "--max-model-len", "4096", "--enforce-eager",
         "--limit-mm-per-prompt",
@@ -199,15 +202,19 @@ def run_multi_audio_test(
     pytest.param({}, marks=pytest.mark.cpu_model),
     pytest.param(CHUNKED_PREFILL_KWARGS),
 ])
-def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
-                num_logprobs: int, vllm_kwargs: dict) -> None:
+def test_models(hf_runner, vllm_runner, audio_assets: AudioTestAssets,
+                dtype: str, max_tokens: int, num_logprobs: int,
+                vllm_kwargs: dict) -> None:
-    vllm_prompt = _get_prompt(1, "Describe the audio above.", VLLM_PLACEHOLDER)
-    hf_prompt = _get_prompt(1, "Describe the audio above.", HF_PLACEHOLDER)
+    audio_inputs = [(
+        _get_prompt(1, audio, VLLM_PLACEHOLDER),
+        _get_prompt(1, audio, HF_PLACEHOLDER),
+        audio.audio_and_sample_rate,
+    ) for audio in audio_assets]
     run_test(
         hf_runner,
         vllm_runner,
-        [(vllm_prompt, hf_prompt, audio.audio_and_sample_rate)],
+        audio_inputs,
         MODEL_NAME,
         dtype=dtype,
         max_tokens=max_tokens,
@@ -224,13 +231,12 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
     pytest.param({}, marks=pytest.mark.cpu_model),
     pytest.param(CHUNKED_PREFILL_KWARGS),
 ])
-def test_models_with_multiple_audios(vllm_runner, audio_assets: _AudioAssets,
-                                     dtype: str, max_tokens: int,
-                                     num_logprobs: int,
-                                     vllm_kwargs: dict) -> None:
+def test_models_with_multiple_audios(vllm_runner,
+                                     audio_assets: AudioTestAssets, dtype: str,
+                                     max_tokens: int, num_logprobs: int,
+                                     vllm_kwargs: dict) -> None:
-    vllm_prompt = _get_prompt(len(audio_assets),
-                              "Describe each of the audios above.",
-                              VLLM_PLACEHOLDER)
+    vllm_prompt = _get_prompt(len(audio_assets), MULTI_AUDIO_PROMPT,
+                              VLLM_PLACEHOLDER)
     run_multi_audio_test(
         vllm_runner,
@@ -245,7 +251,7 @@ def test_models_with_multiple_audios(vllm_runner, audio_assets: _AudioAssets,
 @pytest.mark.asyncio
-async def test_online_serving(client, audio_assets: _AudioAssets):
+async def test_online_serving(client, audio_assets: AudioTestAssets):
     """Exercises online serving with/without chunked prefill enabled."""
     messages = [{

View File

@@ -11,7 +11,7 @@ from vllm.multimodal.image import rescale_image_size
 from vllm.multimodal.video import (rescale_video_size, resize_video,
                                    sample_frames_from_video)
-from .....conftest import _ImageAssets, _VideoAssets
+from .....conftest import ImageTestAssets, VideoTestAssets
 from .types import (SINGLE_IMAGE_BASE_PROMPTS, TEST_IMG_PLACEHOLDER,
                     TEST_VIDEO_PLACEHOLDER, VIDEO_BASE_PROMPT,
                     ImageSizeWrapper, SizeType, VLMTestInfo)
@@ -69,7 +69,7 @@ def get_model_prompts(base_prompts: Iterable[str],
 def build_single_image_inputs_from_test_info(
     test_info: VLMTestInfo,
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
     size_wrapper: ImageSizeWrapper,
     tmp_path: Optional[PosixPath] = None):
     if test_info.prompt_formatter is None:
@@ -116,7 +116,7 @@ def build_single_image_inputs(images, model_prompts,
 def build_multi_image_inputs_from_test_info(
     test_info: VLMTestInfo,
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
     size_wrapper: ImageSizeWrapper,
     tmp_path: Optional[PosixPath] = None):
     if test_info.prompt_formatter is None:
@@ -159,7 +159,7 @@ def build_multi_image_inputs(image_lists, model_prompts,
 def build_embedding_inputs_from_test_info(
     test_info: VLMTestInfo,
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
     size_wrapper: ImageSizeWrapper,
 ):
     # These conditions will always be true if invoked through filtering,
@@ -192,7 +192,7 @@ def build_embedding_inputs_from_test_info(
 def build_video_inputs_from_test_info(
     test_info: VLMTestInfo,
-    video_assets: _VideoAssets,
+    video_assets: VideoTestAssets,
     size_wrapper: ImageSizeWrapper,
     num_frames: int,
 ):

View File

@@ -16,7 +16,7 @@ from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
 from vllm.sequence import SampleLogprobs
 from vllm.transformers_utils.tokenizer import patch_padding_side
-from .....conftest import HfRunner, ImageAsset, _ImageAssets
+from .....conftest import HfRunner, ImageAsset, ImageTestAssets
 from .types import RunnerOutput
@@ -238,14 +238,14 @@ def minimax_vl_01_hf_output(hf_output: RunnerOutput,
 ####### Functions for converting image assets to embeddings
-def get_llava_embeddings(image_assets: _ImageAssets):
+def get_llava_embeddings(image_assets: ImageTestAssets):
     return [asset.image_embeds for asset in image_assets]
 ####### Prompt path encoders for models that need models on disk
 def qwen_prompt_path_encoder(
-    tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset],
-                                                    _ImageAssets]) -> str:
+    tmp_path: PosixPath, prompt: str,
+    assets: Union[list[ImageAsset], ImageTestAssets]) -> str:
     """Given a temporary dir path, export one or more image assets into the
     tempdir & replace its contents with the local path to the string so that
     the HF version of Qwen-VL can resolve the path and load the image in its

View File

@@ -4,7 +4,8 @@ types / modalities.
 """
 from pathlib import PosixPath
-from .....conftest import HfRunner, VllmRunner, _ImageAssets, _VideoAssets
+from .....conftest import (HfRunner, ImageTestAssets, VideoTestAssets,
+                           VllmRunner)
 from . import builders, core
 from .types import ExpandableVLMTestArgs, VLMTestInfo
@@ -14,7 +15,7 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
                           test_case: ExpandableVLMTestArgs,
                           hf_runner: type[HfRunner],
                           vllm_runner: type[VllmRunner],
-                          image_assets: _ImageAssets):
+                          image_assets: ImageTestAssets):
     assert test_case.size_wrapper is not None
     inputs = builders.build_single_image_inputs_from_test_info(
         model_test_info, image_assets, test_case.size_wrapper, tmp_path)
@@ -37,7 +38,7 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
                          test_case: ExpandableVLMTestArgs,
                          hf_runner: type[HfRunner],
                          vllm_runner: type[VllmRunner],
-                         image_assets: _ImageAssets):
+                         image_assets: ImageTestAssets):
     assert test_case.size_wrapper is not None
     inputs = builders.build_multi_image_inputs_from_test_info(
         model_test_info, image_assets, test_case.size_wrapper, tmp_path)
@@ -60,7 +61,7 @@ def run_embedding_test(*, model_test_info: VLMTestInfo,
                        test_case: ExpandableVLMTestArgs,
                        hf_runner: type[HfRunner],
                        vllm_runner: type[VllmRunner],
-                       image_assets: _ImageAssets):
+                       image_assets: ImageTestAssets):
     assert test_case.size_wrapper is not None
     inputs, vllm_embeddings = builders.build_embedding_inputs_from_test_info(
         model_test_info, image_assets, test_case.size_wrapper)
@@ -86,7 +87,7 @@ def run_video_test(
     test_case: ExpandableVLMTestArgs,
     hf_runner: type[HfRunner],
     vllm_runner: type[VllmRunner],
-    video_assets: _VideoAssets,
+    video_assets: VideoTestAssets,
 ):
     assert test_case.size_wrapper is not None
     assert test_case.num_video_frames is not None

View File

@@ -15,7 +15,7 @@ from vllm.config import TaskOption
 from vllm.sequence import SampleLogprobs
 from vllm.transformers_utils.tokenizer import AnyTokenizer
-from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, _ImageAssets
+from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, ImageTestAssets
 from ....utils import check_logprobs_close
 # meta image tag; will be replaced by the appropriate tag for the model
@@ -85,7 +85,7 @@ class VLMTestInfo(NamedTuple):
     # Function for converting ImageAssets to image embeddings;
     # We need to define this explicitly for embedding tests
-    convert_assets_to_embeddings: Optional[Callable[[_ImageAssets],
+    convert_assets_to_embeddings: Optional[Callable[[ImageTestAssets],
                                                     torch.Tensor]] = None
     # Exposed options for vLLM runner; we change these in a several tests,
@@ -141,7 +141,7 @@ class VLMTestInfo(NamedTuple):
     # for Qwen-VL, which requires encoding the image path / url into the prompt
     # for HF runner
     prompt_path_encoder: Optional[
-        Callable[[PosixPath, str, Union[list[ImageAsset], _ImageAssets]],
+        Callable[[PosixPath, str, Union[list[ImageAsset], ImageTestAssets]],
                  str]] = None  # noqa: E501
     # Allows configuring a test to run with custom inputs

View File

@@ -7,7 +7,7 @@ from transformers import AutoConfig, AutoModel, CLIPImageProcessor
 from vllm.distributed import cleanup_dist_env_and_memory
-from ....conftest import _ImageAssets
+from ....conftest import ImageTestAssets
 # we use snapshot_download to prevent conflicts between
 # dynamic_module and trust_remote_code for hf_runner
@@ -15,7 +15,7 @@ DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"]
 def run_intern_vit_test(
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
     model_id: str,
     *,
     dtype: str,

View File

@@ -11,7 +11,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import rescale_image_size
 from vllm.multimodal.processing import BaseMultiModalProcessor
-from ....conftest import _ImageAssets
+from ....conftest import ImageTestAssets
 from ...utils import build_model_context
@@ -137,7 +137,7 @@ def _run_check(
 @pytest.mark.parametrize("kwargs_on_init", [True, False])
 def test_processor_override(
     model_id: str,
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
     size_factors: list[int],
     min_dynamic_patch: int,
     max_dynamic_patch: int,

View File

@@ -5,7 +5,7 @@ from transformers import Idefics3Config
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from ....conftest import _ImageAssets
+from ....conftest import ImageTestAssets
 from ...utils import build_model_context
@@ -21,7 +21,7 @@ from ...utils import build_model_context
 @pytest.mark.parametrize("num_imgs", [1, 2])
 @pytest.mark.parametrize("kwargs_on_init", [True, False])
 def test_processor_override(
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
     model_id: str,
     mm_processor_kwargs: dict[str, object],
     expected_toks_per_img: int,

View File

@@ -11,7 +11,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import rescale_image_size
 from vllm.multimodal.processing import BaseMultiModalProcessor
-from ....conftest import _ImageAssets
+from ....conftest import ImageTestAssets
 from ...utils import build_model_context
@@ -94,7 +94,7 @@ def _run_check(
 @pytest.mark.parametrize("kwargs_on_init", [True, False])
 def test_processor_override(
     model_id: str,
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
     size_factors: list[int],
     min_dynamic_patch: int,
     max_dynamic_patch: int,

View File

@@ -6,7 +6,7 @@ import pytest
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.transformers_utils.tokenizer import encode_tokens
-from ....conftest import _ImageAssets
+from ....conftest import ImageTestAssets
 from ...utils import build_model_context
@@ -17,7 +17,7 @@ from ...utils import build_model_context
 @pytest.mark.parametrize("disable_mm_preprocessor_cache", [True, False])
 @pytest.mark.parametrize("tokenized_prompt", [True, False])
 def test_processor_override(
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
     model_id: str,
     mm_processor_kwargs: dict,
     num_imgs: int,

View File

@@ -7,14 +7,14 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.parse import ImageSize
 from vllm.multimodal.processing import BaseMultiModalProcessor
-from ....conftest import _ImageAssets
+from ....conftest import ImageTestAssets
 from ...utils import build_model_context
 @pytest.mark.parametrize("model_id", ["MiniMaxAI/MiniMax-VL-01"])
 @pytest.mark.parametrize("num_imgs", [1, 2])
 def test_processor_override(
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
     model_id: str,
     num_imgs: int,
 ):

View File

@@ -4,7 +4,7 @@ import pytest
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from ....conftest import _ImageAssets
+from ....conftest import ImageTestAssets
 from ...utils import build_model_context
@@ -22,7 +22,7 @@ from ...utils import build_model_context
 @pytest.mark.parametrize("num_imgs", [1, 2])
 @pytest.mark.parametrize("kwargs_on_init", [True, False])
 def test_processor_override(
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
     model_id: str,
     mm_processor_kwargs: dict[str, int],
     expected_toks_per_img: int,

View File

@@ -4,7 +4,7 @@ import pytest
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from ....conftest import _ImageAssets
+from ....conftest import ImageTestAssets
 from ...utils import build_model_context
@@ -22,7 +22,7 @@ from ...utils import build_model_context
 @pytest.mark.parametrize("num_imgs", [1, 2])
 @pytest.mark.parametrize("kwargs_on_init", [True, False])
 def test_processor_override(
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
     model_id: str,
     mm_processor_kwargs: dict[str, int],
     expected_toks_per_img: int,

View File

@@ -4,7 +4,7 @@ import pytest
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from ....conftest import _ImageAssets
+from ....conftest import ImageTestAssets
 from ...utils import build_model_context
@@ -19,7 +19,7 @@ from ...utils import build_model_context
 @pytest.mark.parametrize("num_imgs", [1, 2])
 @pytest.mark.parametrize("kwargs_on_init", [True, False])
 def test_processor_override(
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
     model_id: str,
     mm_processor_kwargs: dict[str, object],
     expected_toks_per_img: int,

View File

@@ -5,7 +5,7 @@ from transformers import SmolVLMConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from ....conftest import _ImageAssets
+from ....conftest import ImageTestAssets
 from ...utils import build_model_context
@@ -21,7 +21,7 @@ from ...utils import build_model_context
 @pytest.mark.parametrize("num_imgs", [1, 2])
 @pytest.mark.parametrize("kwargs_on_init", [True, False])
 def test_processor_override(
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
     model_id: str,
     mm_processor_kwargs: dict[str, object],
     expected_toks_per_img: int,

View File

@@ -7,7 +7,7 @@ import torch
 from vllm.multimodal.image import rescale_image_size
-from ...conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets
+from ...conftest import IMAGE_ASSETS, ImageTestAssets, VllmRunner
 from ..utils import check_logprobs_close
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
@@ -20,7 +20,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
 def run_awq_test(
     vllm_runner: type[VllmRunner],
-    image_assets: _ImageAssets,
+    image_assets: ImageTestAssets,
     source_model: str,
     quant_model: str,
     *,

View File

@@ -18,19 +18,25 @@ except ImportError:
 ASSET_DIR = "multimodal_asset"
+AudioAssetName = Literal["winning_call", "mary_had_lamb"]
 @dataclass(frozen=True)
 class AudioAsset:
-    name: Literal["winning_call", "mary_had_lamb"]
+    name: AudioAssetName
+
+    @property
+    def filename(self) -> str:
+        return f"{self.name}.ogg"
     @property
     def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]:
-        audio_path = get_vllm_public_assets(filename=f"{self.name}.ogg",
+        audio_path = get_vllm_public_assets(filename=self.filename,
                                             s3_prefix=ASSET_DIR)
         return librosa.load(audio_path, sr=None)
     def get_local_path(self) -> Path:
-        return get_vllm_public_assets(filename=f"{self.name}.ogg",
+        return get_vllm_public_assets(filename=self.filename,
                                       s3_prefix=ASSET_DIR)
     @property
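To illustrate the new property (a sketch, not part of the diff; the audio asset names are unchanged, so the filename is simply the name plus the .ogg extension):

    asset = AudioAsset("winning_call")
    assert asset.filename == "winning_call.ogg"
    audio, sr = asset.audio_and_sample_rate  # loads the downloaded .ogg via librosa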

View File

@@ -10,10 +10,12 @@ from .base import get_vllm_public_assets
 VLM_IMAGES_DIR = "vision_model_images"
+ImageAssetName = Literal["stop_sign", "cherry_blossom"]
 @dataclass(frozen=True)
 class ImageAsset:
-    name: Literal["stop_sign", "cherry_blossom"]
+    name: ImageAssetName
     @property
     def pil_image(self) -> Image.Image:

View File

@@ -2,7 +2,7 @@
 from dataclasses import dataclass
 from functools import lru_cache
-from typing import Literal, Optional
+from typing import ClassVar, Literal, Optional
 import cv2
 import numpy as np
@@ -76,20 +76,31 @@ def video_to_pil_images_list(path: str,
 ]
+VideoAssetName = Literal["baby_reading"]
 @dataclass(frozen=True)
 class VideoAsset:
-    name: Literal["sample_demo_1"]
+    name: VideoAssetName
     num_frames: int = -1
+    _NAME_TO_FILE: ClassVar[dict[VideoAssetName, str]] = {
+        "baby_reading": "sample_demo_1.mp4",
+    }
+    @property
+    def filename(self) -> str:
+        return self._NAME_TO_FILE[self.name]
     @property
     def pil_images(self) -> list[Image.Image]:
-        video_path = download_video_asset(self.name + ".mp4")
+        video_path = download_video_asset(self.filename)
         ret = video_to_pil_images_list(video_path, self.num_frames)
         return ret
     @property
     def np_ndarrays(self) -> npt.NDArray:
-        video_path = download_video_asset(self.name + ".mp4")
+        video_path = download_video_asset(self.filename)
         ret = video_to_ndarrays(video_path, self.num_frames)
         return ret
@@ -99,5 +110,5 @@ class VideoAsset:
         See also: examples/offline_inference/qwen2_5_omni/only_thinker.py
         """
-        video_path = download_video_asset(self.name + ".mp4")
+        video_path = download_video_asset(self.filename)
         return librosa.load(video_path, sr=sampling_rate)[0]
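A short sketch of the new name-to-file indirection introduced above, assuming VideoAsset is imported from vllm.assets.video (the audio counterpart is imported from vllm.assets.audio elsewhere in this diff):

    from vllm.assets.video import VideoAsset

    asset = VideoAsset(name="baby_reading", num_frames=16)
    # The logical asset name is decoupled from the object stored in the
    # public assets bucket, which is still named sample_demo_1.mp4.
    assert asset.filename == "sample_demo_1.mp4"
    frames = asset.np_ndarrays  # downloads the video and samples 16 frames

This keeps the hosted file name stable while giving tests a more descriptive asset name.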