diff --git a/examples/offline_inference/qwen2_5_omni/only_thinker.py b/examples/offline_inference/qwen2_5_omni/only_thinker.py
index c75a990120e0..031e924d34cf 100644
--- a/examples/offline_inference/qwen2_5_omni/only_thinker.py
+++ b/examples/offline_inference/qwen2_5_omni/only_thinker.py
@@ -47,8 +47,7 @@ def get_mixed_modalities_query() -> QueryResult:
                 "image":
                 ImageAsset("cherry_blossom").pil_image.convert("RGB"),
                 "video":
-                VideoAsset(name="sample_demo_1.mp4",
-                           num_frames=16).np_ndarrays,
+                VideoAsset(name="sample_demo_1", num_frames=16).np_ndarrays,
             },
         },
         limit_mm_per_prompt={
@@ -66,7 +65,7 @@ def get_use_audio_in_video_query() -> QueryResult:
               "<|im_start|>user\n<|vision_bos|><|VIDEO|><|vision_eos|>"
               f"{question}<|im_end|>\n"
               f"<|im_start|>assistant\n")
-    asset = VideoAsset(name="sample_demo_1.mp4", num_frames=16)
+    asset = VideoAsset(name="sample_demo_1", num_frames=16)
     audio = asset.get_audio(sampling_rate=16000)
     assert not envs.VLLM_USE_V1, ("V1 does not support use_audio_in_video. "
                                   "Please launch this example with "
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 755e19bb2699..6cd2a774a03d 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -1109,7 +1109,7 @@ def get_multi_modal_input(args):
 
     if args.modality == "video":
         # Input video and question
-        video = VideoAsset(name="sample_demo_1.mp4",
+        video = VideoAsset(name="sample_demo_1",
                            num_frames=args.num_frames).np_ndarrays
         vid_questions = ["Why is this video funny?"]
 
diff --git a/tests/conftest.py b/tests/conftest.py
index f02b5a8c0520..14a88ca47505 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -97,13 +97,18 @@ class _VideoAssets(_VideoAssetsBase):
 
     def __init__(self) -> None:
         super().__init__([
-            VideoAsset("sample_demo_1.mp4"),
+            VideoAsset("sample_demo_1"),
         ])
 
     def prompts(self, prompts: _VideoAssetPrompts) -> list[str]:
         return [prompts["sample_demo_1"]]
 
 
+class _AudioAssetPrompts(TypedDict):
+    mary_had_lamb: str
+    winning_call: str
+
+
 class _AudioAssetsBase(UserList[AudioAsset]):
     pass
 
@@ -116,6 +121,9 @@ class _AudioAssets(_AudioAssetsBase):
             AudioAsset("winning_call"),
         ])
 
+    def prompts(self, prompts: _AudioAssetPrompts) -> list[str]:
+        return [prompts["mary_had_lamb"], prompts["winning_call"]]
+
 
 IMAGE_ASSETS = _ImageAssets()
 """Singleton instance of :class:`_ImageAssets`."""
diff --git a/tests/models/multimodal/generation/test_interleaved.py b/tests/models/multimodal/generation/test_interleaved.py
index 92c8155fe1e2..10052da9b0bd 100644
--- a/tests/models/multimodal/generation/test_interleaved.py
+++ b/tests/models/multimodal/generation/test_interleaved.py
@@ -29,7 +29,7 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
     image_cherry = ImageAsset("cherry_blossom").pil_image.convert("RGB")
     image_stop = ImageAsset("stop_sign").pil_image.convert("RGB")
     images = [image_cherry, image_stop]
-    video = VideoAsset(name="sample_demo_1.mp4", num_frames=16).np_ndarrays
+    video = VideoAsset(name="sample_demo_1", num_frames=16).np_ndarrays
 
     inputs = [
         (
diff --git a/tests/models/multimodal/generation/test_intern_vit.py b/tests/models/multimodal/pooling/test_intern_vit.py
similarity index 91%
rename from tests/models/multimodal/generation/test_intern_vit.py
rename to tests/models/multimodal/pooling/test_intern_vit.py
index a842d14fee2e..c15913b4225b 100644
--- a/tests/models/multimodal/generation/test_intern_vit.py
+++ b/tests/models/multimodal/pooling/test_intern_vit.py
@@ -1,13 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
-
-from typing import Optional
-
 import pytest
 import torch
 import torch.nn as nn
 from huggingface_hub import snapshot_download
 from transformers import AutoConfig, AutoModel, CLIPImageProcessor
 
+from vllm.distributed import cleanup_dist_env_and_memory
+
 from ....conftest import _ImageAssets
 
 # we use snapshot_download to prevent conflicts between
@@ -20,7 +19,6 @@ def run_intern_vit_test(
     model_id: str,
     *,
     dtype: str,
-    distributed_executor_backend: Optional[str] = None,
 ):
     model = snapshot_download(model_id, allow_patterns=DOWNLOAD_PATTERN)
 
@@ -43,7 +41,6 @@ def run_intern_vit_test(
         for pixel_value in pixel_values
     ]
 
-    from vllm.distributed import cleanup_dist_env_and_memory
     from vllm.model_executor.models.intern_vit import InternVisionModel
     vllm_model = InternVisionModel(config)
     vllm_model.load_weights(hf_model.state_dict().items())
@@ -71,7 +68,7 @@ def run_intern_vit_test(
 ])
 @pytest.mark.parametrize("dtype", [torch.half])
 @torch.inference_mode()
-def test_models(dist_init, image_assets, model_id, dtype: str) -> None:
+def test_models(image_assets, model_id, dtype: str) -> None:
     run_intern_vit_test(
         image_assets,
         model_id,
diff --git a/vllm/assets/video.py b/vllm/assets/video.py
index 133e18b68e25..fc3d47341b30 100644
--- a/vllm/assets/video.py
+++ b/vllm/assets/video.py
@@ -78,18 +78,18 @@ def video_to_pil_images_list(path: str,
 
 @dataclass(frozen=True)
 class VideoAsset:
-    name: Literal["sample_demo_1.mp4"]
+    name: Literal["sample_demo_1"]
     num_frames: int = -1
 
     @property
     def pil_images(self) -> list[Image.Image]:
-        video_path = download_video_asset(self.name)
+        video_path = download_video_asset(self.name + ".mp4")
         ret = video_to_pil_images_list(video_path, self.num_frames)
         return ret
 
     @property
     def np_ndarrays(self) -> npt.NDArray:
-        video_path = download_video_asset(self.name)
+        video_path = download_video_asset(self.name + ".mp4")
         ret = video_to_ndarrays(video_path, self.num_frames)
         return ret
 
@@ -99,5 +99,5 @@ class VideoAsset:
 
         See also: examples/offline_inference/qwen2_5_omni/only_thinker.py
         """
-        video_path = download_video_asset(self.name)
+        video_path = download_video_asset(self.name + ".mp4")
         return librosa.load(video_path, sr=sampling_rate)[0]
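
For reference, a minimal usage sketch of how `VideoAsset` reads after this change, based only on the attributes touched in the diff (assuming the `vllm.assets.video` import path; fetching the asset requires network access):

```python
# Minimal sketch of the renamed VideoAsset API after this change.
# The asset name no longer carries the ".mp4" suffix; the class now
# appends it internally before downloading the file.
from vllm.assets.video import VideoAsset

asset = VideoAsset(name="sample_demo_1", num_frames=16)

frames = asset.np_ndarrays                    # numpy frames, e.g. for multi_modal_data["video"]
images = asset.pil_images                     # the same frames as PIL images
audio = asset.get_audio(sampling_rate=16000)  # audio track used by the Qwen2.5-Omni example
```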