From b616f6a53dc0caaf1eeb4be785df76415dde8633 Mon Sep 17 00:00:00 2001
From: Chenheli Hua <huachenheli@outlook.com>
Date: Wed, 2 Jul 2025 20:10:39 -0700
Subject: [PATCH] [Misc] Small: Fix video loader return type annotations.
 (#20389)

Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
---
 tests/multimodal/test_utils.py |  7 ++++---
 vllm/multimodal/utils.py       |  4 ++--
 vllm/multimodal/video.py       | 14 ++++++++------
 3 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py
index d927ae5cd0b27..b642e5c0ad47e 100644
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -172,9 +172,10 @@ async def test_fetch_video_http(video_url: str, num_frames: int):
             "num_frames": num_frames,
         }})
 
-    video_sync = connector.fetch_video(video_url)
-    video_async = await connector.fetch_video_async(video_url)
-    assert np.array_equal(video_sync[0], video_async[0])
+    video_sync, metadata_sync = connector.fetch_video(video_url)
+    video_async, metadata_async = await connector.fetch_video_async(video_url)
+    assert np.array_equal(video_sync, video_async)
+    assert metadata_sync == metadata_async
 
 
 # Used for the next two tests related to `merge_and_sort_multimodal_metadata`.
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 2f2be59a1f42d..22e696141b84b 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -228,7 +228,7 @@ class MediaConnector:
         video_url: str,
         *,
         image_mode: str = "RGB",
-    ) -> npt.NDArray:
+    ) -> tuple[npt.NDArray, dict[str, Any]]:
         """
         Load video from a HTTP or base64 data URL.
         """
@@ -248,7 +248,7 @@ class MediaConnector:
         video_url: str,
         *,
         image_mode: str = "RGB",
-    ) -> npt.NDArray:
+    ) -> tuple[npt.NDArray, dict[str, Any]]:
         """
         Asynchronously load video from a HTTP or base64 data URL.
 
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index d9589068a203b..ef1380bdb614c 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -6,6 +6,7 @@ from abc import abstractmethod
 from functools import partial
 from io import BytesIO
 from pathlib import Path
+from typing import Any
 
 import numpy as np
 import numpy.typing as npt
@@ -57,7 +58,7 @@ class VideoLoader:
     def load_bytes(cls,
                    data: bytes,
                    num_frames: int = -1,
-                   **kwargs) -> npt.NDArray:
+                   **kwargs) -> tuple[npt.NDArray, dict[str, Any]]:
         raise NotImplementedError
 
 
@@ -106,7 +107,7 @@ class OpenCVVideoBackend(VideoLoader):
     def load_bytes(cls,
                    data: bytes,
                    num_frames: int = -1,
-                   **kwargs) -> npt.NDArray:
+                   **kwargs) -> tuple[npt.NDArray, dict[str, Any]]:
         import cv2
 
         backend = cls().get_cv2_video_api()
@@ -179,12 +180,13 @@ class VideoMediaIO(MediaIO[npt.NDArray]):
         video_loader_backend = envs.VLLM_VIDEO_LOADER_BACKEND
         self.video_loader = VIDEO_LOADER_REGISTRY.load(video_loader_backend)
 
-    def load_bytes(self, data: bytes) -> npt.NDArray:
+    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, dict[str, Any]]:
         return self.video_loader.load_bytes(data,
                                             num_frames=self.num_frames,
                                             **self.kwargs)
 
-    def load_base64(self, media_type: str, data: str) -> npt.NDArray:
+    def load_base64(self, media_type: str,
+                    data: str) -> tuple[npt.NDArray, dict[str, Any]]:
         if media_type.lower() == "video/jpeg":
             load_frame = partial(
                 self.image_io.load_base64,
@@ -194,11 +196,11 @@ class VideoMediaIO(MediaIO[npt.NDArray]):
             return np.stack([
                 np.asarray(load_frame(frame_data))
                 for frame_data in data.split(",")
-            ])
+            ]), {}
 
         return self.load_bytes(base64.b64decode(data))
 
-    def load_file(self, filepath: Path) -> npt.NDArray:
+    def load_file(self, filepath: Path) -> tuple[npt.NDArray, dict[str, Any]]:
         with filepath.open("rb") as f:
             data = f.read()