From bcbe2a4d9e792dea8b76ac1a006aa8596af02c41 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Fri, 12 Sep 2025 00:44:34 +0800
Subject: [PATCH] [VLM] Optimize GLM4.5-V-style video processing to only decode
 necessary frames (#24161)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 .../multimodal/processing/test_glm4_1v.py     |  47 ++++++++
 tests/multimodal/test_utils.py                |  26 +++++
 vllm/assets/video.py                          |  16 +--
 vllm/model_executor/models/glm4_1v.py         | 100 +++++++++++-------
 vllm/multimodal/video.py                      |  99 +++++++++++++++--
 5 files changed, 233 insertions(+), 55 deletions(-)

diff --git a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py
index a49842e1099c2..dfb8d9b2a038d 100644
--- a/tests/models/multimodal/processing/test_glm4_1v.py
+++ b/tests/models/multimodal/processing/test_glm4_1v.py
@@ -5,6 +5,7 @@ import pytest
 
 from vllm.assets.video import VideoAsset
 from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.video import OpenCVDynamicVideoBackend, OpenCVVideoBackend
 
 from ...utils import build_model_context
 
@@ -50,3 +51,49 @@ def test_processor_override(
 
     assert grid_t == expected_grid_t
     assert video_tok_count == expected_toks_per_frame * grid_t
+
+
+@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
+@pytest.mark.parametrize("fps", [2])
+def test_video_loader_consistency(
+    model_id: str,
+    fps: int,
+):
+    """
+    Ensure dynamic video loader (pre-sampled by loader) and normal video 
+    loader (post-sampled by processor) produce same video processing outputs.
+    """
+    ctx = build_model_context(
+        model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"video": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    hf_processor_mm_kwargs = {"fps": fps}
+
+    # Build the image str / prompt based on the number of images we pass
+    prompt = "<|begin_of_video|><|video|><|end_of_video|>"
+
+    video_path = VideoAsset(name="baby_reading", num_frames=-1).video_path
+    with open(video_path, "rb") as f:
+        video_bytes = f.read()
+
+    static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes)
+    dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes(
+        video_bytes, requested_fps=fps)
+
+    # pre-sampled loader shouldn't read all frames
+    assert len(dynamic_video) < len(static_video)
+
+    static_mm_data = {"video": [(static_video, static_metadata)]}
+    dynamic_mm_data = {"video": [(dynamic_video, dynamic_metadata)]}
+
+    static_outputs = processor.apply(prompt, static_mm_data,
+                                     hf_processor_mm_kwargs)
+    dynamic_outputs = processor.apply(prompt, dynamic_mm_data,
+                                      hf_processor_mm_kwargs)
+
+    assert static_outputs["prompt_token_ids"] == dynamic_outputs[
+        "prompt_token_ids"]
+    assert static_outputs["mm_kwargs"].get_data(
+    ) == dynamic_outputs["mm_kwargs"].get_data()
diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py
index 886582a516409..e1e8282dd66d4 100644
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -204,6 +204,32 @@ async def test_fetch_video_http(video_url: str, num_frames: int):
     assert metadata_sync == metadata_async
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
+@pytest.mark.parametrize("max_duration", [1, 60, 1800])
+@pytest.mark.parametrize("requested_fps", [2, 24])
+async def test_fetch_video_http_with_dynamic_loader(
+        video_url: str, max_duration: int, requested_fps: int,
+        monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")
+        connector = MediaConnector(
+            media_io_kwargs={
+                "video": {
+                    "max_duration": max_duration,
+                    "requested_fps": requested_fps,
+                }
+            })
+
+        video_sync, metadata_sync = connector.fetch_video(video_url)
+        video_async, metadata_async = await connector.fetch_video_async(
+            video_url)
+
+        assert np.array_equal(video_sync, video_async)
+        assert metadata_sync == metadata_async
+        assert metadata_sync["video_backend"] == "opencv_dynamic"
+
+
 # Used for `test_argsort_mm_positions`.
 class TestCase(NamedTuple):
     mm_positions: "MultiModalPlaceholderDict"
diff --git a/vllm/assets/video.py b/vllm/assets/video.py
index 8ab0e9760be87..983e9114cccfb 100644
--- a/vllm/assets/video.py
+++ b/vllm/assets/video.py
@@ -110,22 +110,23 @@ class VideoAsset:
     def filename(self) -> str:
         return self._NAME_TO_FILE[self.name]
 
+    @property
+    def video_path(self) -> str:
+        return download_video_asset(self.filename)
+
     @property
     def pil_images(self) -> list[Image.Image]:
-        video_path = download_video_asset(self.filename)
-        ret = video_to_pil_images_list(video_path, self.num_frames)
+        ret = video_to_pil_images_list(self.video_path, self.num_frames)
         return ret
 
     @property
     def np_ndarrays(self) -> npt.NDArray:
-        video_path = download_video_asset(self.filename)
-        ret = video_to_ndarrays(video_path, self.num_frames)
+        ret = video_to_ndarrays(self.video_path, self.num_frames)
         return ret
 
     @property
     def metadata(self) -> dict[str, Any]:
-        video_path = download_video_asset(self.filename)
-        ret = video_get_metadata(video_path)
+        ret = video_get_metadata(self.video_path)
         return ret
 
     def get_audio(self, sampling_rate: Optional[float] = None) -> npt.NDArray:
@@ -134,5 +135,4 @@ class VideoAsset:
         
         See also: examples/offline_inference/qwen2_5_omni/only_thinker.py
         """
-        video_path = download_video_asset(self.filename)
-        return librosa.load(video_path, sr=sampling_rate)[0]
+        return librosa.load(self.video_path, sr=sampling_rate)[0]
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index 846b4b800dd59..539381b618000 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -1023,6 +1023,43 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
             selected_timestamps.append(timestamps_list[idx])
         return selected_timestamps
 
+    def _construct_video_placeholder(
+        self,
+        video_array: np.ndarray,
+        metadata: dict[str, Any],
+        grid_thw: torch.Tensor,
+    ) -> str:
+        hf_processor = self.get_hf_processor()
+        tokenizer = self.get_tokenizer()
+        image_processor = hf_processor.image_processor
+
+        hf_config = self.get_hf_config()
+        boi_token_id = hf_config.image_start_token_id
+        eoi_token_id = hf_config.image_end_token_id
+        bov_token_id = hf_config.video_start_token_id
+        eov_token_id = hf_config.video_end_token_id
+        merge_length = image_processor.merge_size**2
+
+        assert isinstance(grid_thw, torch.Tensor)
+        timestamps = self._get_video_second_idx(metadata, len(video_array))
+        frames_idx_token = [
+            tokenizer.encode(str(i), add_special_tokens=False)
+            for i in timestamps
+        ]
+        T, H, W = grid_thw
+        num_tokens_per_frame = int(H * W) // merge_length
+        placeholder = []
+        placeholder.append(bov_token_id)
+        for frame_idx in frames_idx_token:
+            placeholder.append(boi_token_id)
+            placeholder.extend([hf_processor.video_token_id] *
+                               num_tokens_per_frame)
+            placeholder.append(eoi_token_id)
+            placeholder.extend(frame_idx)
+        placeholder.append(eov_token_id)
+
+        return placeholder
+
 
 class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]):
 
@@ -1118,17 +1155,10 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
             for item in mm_data.pop("videos", []):
                 video_array, metadata = item
 
-                # FIXME(Isotr0py): Activate the below logic after we can disable
-                # resampling from video loader backend.
-                # assert metadata["total_num_frames"] == len(video_array), (
-                #     f"Total frames {metadata['total_num_frames']} does not "
-                #     f"match the length of video array {len(video_array)}.")
+                if metadata["video_backend"] == "opencv_dynamic":
+                    mm_kwargs["do_sample_frames"] = False
 
-                # NOTE: Temporary workaround for resampled videos.
-                # this can cause a divergence with HF implementation if
-                # the input video is resampled in advance.
-
-                if metadata["total_num_frames"] != len(video_array):
+                elif metadata["total_num_frames"] != len(video_array):
                     logger.warning(
                         "Total frames in metadata "
                         "(%s) does not match the length of "
@@ -1140,11 +1170,10 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
                         len(video_array),
                     )
                     metadata["total_num_frames"] = len(video_array)
-                metadata = VideoMetadata(**metadata)
 
                 video_mm_data = dict()
                 video_mm_data["videos"] = [[video_array]]
-                video_mm_data["video_metadata"] = [[metadata]]
+                video_mm_data["video_metadata"] = [[VideoMetadata(**metadata)]]
 
                 video_outputs = super()._call_hf_processor(
                     prompt="<|begin_of_video|><|video|><|end_of_video|>",
@@ -1152,11 +1181,23 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
                     mm_kwargs=mm_kwargs,
                     tok_kwargs=tok_kwargs,
                 )
-                input_ids = video_outputs.pop("input_ids")
-                input_ids[input_ids == processor.image_token_id] = (
-                    processor.video_token_id)
-                video_placeholder = processor.tokenizer.batch_decode(
-                    input_ids)[0]
+                if "do_sample_frames" in mm_kwargs and not mm_kwargs[
+                        "do_sample_frames"]:
+                    # Transformers v4.55 has incorrect timestamps issue for
+                    # skip sampling. We construct the placeholder manually to
+                    # get placeholders with correct timestamps.
+                    placeholder = self.info._construct_video_placeholder(
+                        video_array,
+                        metadata,
+                        video_outputs["video_grid_thw"].squeeze(0),
+                    )
+                    video_placeholder = processor.tokenizer.decode(placeholder)
+                else:
+                    input_ids = video_outputs.pop("input_ids")
+                    input_ids[input_ids == processor.image_token_id] = (
+                        processor.video_token_id)
+                    video_placeholder = processor.tokenizer.batch_decode(
+                        input_ids)[0]
                 prompt = prompt.replace(
                     "<|begin_of_video|><|video|><|end_of_video|>",
                     video_placeholder,
@@ -1202,14 +1243,6 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         image_processor = self.info.get_image_processor(
             **hf_processor_mm_kwargs)
-        tokenizer = self.info.get_tokenizer()
-        hf_config = self.info.get_hf_config()
-
-        boi_token_id = hf_config.image_start_token_id
-        eoi_token_id = hf_config.image_end_token_id
-
-        bov_token_id = hf_config.video_start_token_id
-        eov_token_id = hf_config.video_end_token_id
 
         merge_length = image_processor.merge_size**2
 
@@ -1227,21 +1260,8 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
             assert isinstance(grid_thw, torch.Tensor)
 
             video, metadata = mm_items["video"][item_idx]
-            timestamps = self.info._get_video_second_idx(metadata, len(video))
-            frames_idx_token = [
-                tokenizer.encode(str(i), add_special_tokens=False)
-                for i in timestamps
-            ]
-            num_tokens_per_frame = int(grid_thw[1:].prod()) // merge_length
-            placeholder = []
-            placeholder.append(bov_token_id)
-            for frame_idx in frames_idx_token:
-                placeholder.append(boi_token_id)
-                placeholder.extend([hf_processor.video_token_id] *
-                                   num_tokens_per_frame)
-                placeholder.append(eoi_token_id)
-                placeholder.extend(frame_idx)
-            placeholder.append(eov_token_id)
+            placeholder = self.info._construct_video_placeholder(
+                video, metadata, grid_thw)
             return PromptUpdateDetails.select_token_id(
                 placeholder,
                 embed_token_id=hf_processor.video_token_id,
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index ef1380bdb614c..df6e19da82ca2 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -1,12 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
 import base64
+import math
 from abc import abstractmethod
 from functools import partial
 from io import BytesIO
 from pathlib import Path
-from typing import Any
+from typing import Any, Union
 
 import numpy as np
 import numpy.typing as npt
@@ -104,10 +104,12 @@ class OpenCVVideoBackend(VideoLoader):
         return api_pref
 
     @classmethod
-    def load_bytes(cls,
-                   data: bytes,
-                   num_frames: int = -1,
-                   **kwargs) -> tuple[npt.NDArray, dict[str, Any]]:
+    def load_bytes(
+        cls,
+        data: bytes,
+        num_frames: int = -1,
+        **kwargs,
+    ) -> tuple[npt.NDArray, dict[str, Any]]:
         import cv2
 
         backend = cls().get_cv2_video_api()
@@ -119,6 +121,15 @@ class OpenCVVideoBackend(VideoLoader):
         original_fps = cap.get(cv2.CAP_PROP_FPS)
         duration = total_frames_num / original_fps if original_fps > 0 else 0
 
+        # Use transformers transformers.video_utils.VideoMetadata format
+        metadata = {
+            "total_num_frames": total_frames_num,
+            "fps": original_fps,
+            "duration": duration,
+            "video_backend": "opencv"
+        }
+
+        # resample video to target num_frames
         full_read = num_frames == -1 or total_frames_num < num_frames
         if full_read:
             num_frames = total_frames_num
@@ -148,14 +159,88 @@ class OpenCVVideoBackend(VideoLoader):
         assert i == num_frames, (f"Expected reading {num_frames} frames, "
                                  f"but only loaded {i} frames from video.")
 
+        return frames, metadata
+
+
+@VIDEO_LOADER_REGISTRY.register("opencv_dynamic")
+class OpenCVDynamicVideoBackend(OpenCVVideoBackend):
+
+    @classmethod
+    def load_bytes(
+        cls,
+        data: bytes,
+        num_frames: int = -1,
+        requested_fps: int = 2,
+        max_duration: int = 300,
+        **kwargs,
+    ) -> tuple[npt.NDArray, dict[str, Any]]:
+        import cv2
+
+        backend = cls().get_cv2_video_api()
+        cap = cv2.VideoCapture(BytesIO(data), backend, [])
+        if not cap.isOpened():
+            raise ValueError("Could not open video stream")
+
+        total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        original_fps = cap.get(cv2.CAP_PROP_FPS)
+        duration = total_frames_num / original_fps if original_fps > 0 else 0
+
         # Use transformers transformers.video_utils.VideoMetadata format
         metadata = {
             "total_num_frames": total_frames_num,
             "fps": original_fps,
             "duration": duration,
-            "video_backend": "opencv"
+            "video_backend": "opencv_dynamic"
         }
 
+        # resample video to target num_frames
+        max_frame_idx = total_frames_num - 1
+        duration = duration or round(max_frame_idx / original_fps) + 1
+
+        # Refer to:
+        # https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140
+        frame_indices: Union[range, list[int]]
+        if duration <= max_duration:
+            n = int(math.floor(duration * requested_fps))
+            frame_indices = sorted({
+                min(max_frame_idx,
+                    int(math.ceil(i * original_fps / requested_fps)))
+                for i in range(n)
+            })
+        else:
+            num_samples = int(max_duration * requested_fps)
+            if num_samples >= total_frames_num:
+                frame_indices = range(total_frames_num)
+            else:
+                target_seconds = np.linspace(0,
+                                             duration,
+                                             num_samples,
+                                             endpoint=True)
+                frame_indices = sorted({
+                    min(max_frame_idx, int(math.ceil(t * original_fps)))
+                    for t in target_seconds
+                })
+
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        frames = np.empty((len(frame_indices), height, width, 3),
+                          dtype=np.uint8)
+
+        i = 0
+        for idx in range(total_frames_num):
+            ok = cap.grab()
+            if not ok:
+                break
+            if idx in frame_indices:
+                ret, frame = cap.retrieve()
+                if ret:
+                    frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                    i += 1
+
+        assert i == len(frame_indices), (
+            f"Expected reading {len(frame_indices)} frames, "
+            f"but only loaded {i} frames from video.")
+
         return frames, metadata