diff --git a/examples/offline_inference/qwen2_5_omni/only_thinker.py b/examples/offline_inference/qwen2_5_omni/only_thinker.py
index c75a990120e0..031e924d34cf 100644
--- a/examples/offline_inference/qwen2_5_omni/only_thinker.py
+++ b/examples/offline_inference/qwen2_5_omni/only_thinker.py
@@ -47,8 +47,7 @@ def get_mixed_modalities_query() -> QueryResult:
                 "image":
                 ImageAsset("cherry_blossom").pil_image.convert("RGB"),
                 "video":
-                VideoAsset(name="sample_demo_1.mp4",
-                           num_frames=16).np_ndarrays,
+                VideoAsset(name="sample_demo_1", num_frames=16).np_ndarrays,
             },
         },
         limit_mm_per_prompt={
@@ -66,7 +65,7 @@ def get_use_audio_in_video_query() -> QueryResult:
               "<|im_start|>user\n<|vision_bos|><|VIDEO|><|vision_eos|>"
               f"{question}<|im_end|>\n"
               f"<|im_start|>assistant\n")
-    asset = VideoAsset(name="sample_demo_1.mp4", num_frames=16)
+    asset = VideoAsset(name="sample_demo_1", num_frames=16)
     audio = asset.get_audio(sampling_rate=16000)
     assert not envs.VLLM_USE_V1, ("V1 does not support use_audio_in_video. "
                                   "Please launch this example with "
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 755e19bb2699..6cd2a774a03d 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -1109,7 +1109,7 @@ def get_multi_modal_input(args):
 
     if args.modality == "video":
         # Input video and question
-        video = VideoAsset(name="sample_demo_1.mp4",
+        video = VideoAsset(name="sample_demo_1",
                            num_frames=args.num_frames).np_ndarrays
         vid_questions = ["Why is this video funny?"]
 
diff --git a/tests/conftest.py b/tests/conftest.py
index f02b5a8c0520..14a88ca47505 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -97,13 +97,18 @@ class _VideoAssets(_VideoAssetsBase):
 
     def __init__(self) -> None:
         super().__init__([
-            VideoAsset("sample_demo_1.mp4"),
+            VideoAsset("sample_demo_1"),
         ])
 
     def prompts(self, prompts: _VideoAssetPrompts) -> list[str]:
         return [prompts["sample_demo_1"]]
 
 
+class _AudioAssetPrompts(TypedDict):
+    mary_had_lamb: str
+    winning_call: str
+
+
 class _AudioAssetsBase(UserList[AudioAsset]):
     pass
 
@@ -116,6 +121,9 @@ class _AudioAssets(_AudioAssetsBase):
             AudioAsset("winning_call"),
         ])
 
+    def prompts(self, prompts: _AudioAssetPrompts) -> list[str]:
+        return [prompts["mary_had_lamb"], prompts["winning_call"]]
+
 
 IMAGE_ASSETS = _ImageAssets()
 """Singleton instance of :class:`_ImageAssets`."""
diff --git a/tests/models/multimodal/generation/test_interleaved.py b/tests/models/multimodal/generation/test_interleaved.py
index 92c8155fe1e2..10052da9b0bd 100644
--- a/tests/models/multimodal/generation/test_interleaved.py
+++ b/tests/models/multimodal/generation/test_interleaved.py
@@ -29,7 +29,7 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
     image_cherry = ImageAsset("cherry_blossom").pil_image.convert("RGB")
     image_stop = ImageAsset("stop_sign").pil_image.convert("RGB")
     images = [image_cherry, image_stop]
-    video = VideoAsset(name="sample_demo_1.mp4", num_frames=16).np_ndarrays
+    video = VideoAsset(name="sample_demo_1", num_frames=16).np_ndarrays
 
     inputs = [
         (
diff --git a/tests/models/multimodal/generation/test_intern_vit.py b/tests/models/multimodal/pooling/test_intern_vit.py
similarity index 91%
rename from tests/models/multimodal/generation/test_intern_vit.py
rename to tests/models/multimodal/pooling/test_intern_vit.py
index a842d14fee2e..c15913b4225b 100644
--- a/tests/models/multimodal/generation/test_intern_vit.py
+++ b/tests/models/multimodal/pooling/test_intern_vit.py
@@ -1,13 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
-
-from typing import Optional
-
 import pytest
 import torch
 import torch.nn as nn
 from huggingface_hub import snapshot_download
 from transformers import AutoConfig, AutoModel, CLIPImageProcessor
 
+from vllm.distributed import cleanup_dist_env_and_memory
+
 from ....conftest import _ImageAssets
 
 # we use snapshot_download to prevent conflicts between
@@ -20,7 +19,6 @@ def run_intern_vit_test(
     model_id: str,
     *,
     dtype: str,
-    distributed_executor_backend: Optional[str] = None,
 ):
     model = snapshot_download(model_id, allow_patterns=DOWNLOAD_PATTERN)
 
@@ -43,7 +41,6 @@ def run_intern_vit_test(
         for pixel_value in pixel_values
     ]
 
-    from vllm.distributed import cleanup_dist_env_and_memory
     from vllm.model_executor.models.intern_vit import InternVisionModel
     vllm_model = InternVisionModel(config)
     vllm_model.load_weights(hf_model.state_dict().items())
@@ -71,7 +68,7 @@ def run_intern_vit_test(
 ])
 @pytest.mark.parametrize("dtype", [torch.half])
 @torch.inference_mode()
-def test_models(dist_init, image_assets, model_id, dtype: str) -> None:
+def test_models(image_assets, model_id, dtype: str) -> None:
     run_intern_vit_test(
         image_assets,
         model_id,
diff --git a/vllm/assets/video.py b/vllm/assets/video.py
index 133e18b68e25..fc3d47341b30 100644
--- a/vllm/assets/video.py
+++ b/vllm/assets/video.py
@@ -78,18 +78,18 @@ def video_to_pil_images_list(path: str,
 
 @dataclass(frozen=True)
 class VideoAsset:
-    name: Literal["sample_demo_1.mp4"]
+    name: Literal["sample_demo_1"]
     num_frames: int = -1
 
     @property
     def pil_images(self) -> list[Image.Image]:
-        video_path = download_video_asset(self.name)
+        video_path = download_video_asset(self.name + ".mp4")
         ret = video_to_pil_images_list(video_path, self.num_frames)
         return ret
 
     @property
     def np_ndarrays(self) -> npt.NDArray:
-        video_path = download_video_asset(self.name)
+        video_path = download_video_asset(self.name + ".mp4")
         ret = video_to_ndarrays(video_path, self.num_frames)
         return ret
 
@@ -99,5 +99,5 @@ class VideoAsset:
 
         See also: examples/offline_inference/qwen2_5_omni/only_thinker.py
         """
-        video_path = download_video_asset(self.name)
+        video_path = download_video_asset(self.name + ".mp4")
         return librosa.load(video_path, sr=sampling_rate)[0]
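
For reference, a minimal usage sketch of how `VideoAsset` reads after this change, based only on the attributes touched in the diff (assuming the `vllm.assets.video` import path; fetching the asset requires network access):

```python
# Minimal sketch of the renamed VideoAsset API after this change.
# The asset name no longer carries the ".mp4" suffix; the class now
# appends it internally before downloading the file.
from vllm.assets.video import VideoAsset

asset = VideoAsset(name="sample_demo_1", num_frames=16)

frames = asset.np_ndarrays                    # numpy frames, e.g. for multi_modal_data["video"]
images = asset.pil_images                     # the same frames as PIL images
audio = asset.get_audio(sampling_rate=16000)  # audio track used by the Qwen2.5-Omni example
```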