From bcbe2a4d9e792dea8b76ac1a006aa8596af02c41 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Fri, 12 Sep 2025 00:44:34 +0800 Subject: [PATCH] [VLM] Optimize GLM4.5-V-style video processing to only decode necessary frames (#24161) Signed-off-by: Isotr0py --- .../multimodal/processing/test_glm4_1v.py | 47 ++++++++ tests/multimodal/test_utils.py | 26 +++++ vllm/assets/video.py | 16 +-- vllm/model_executor/models/glm4_1v.py | 100 +++++++++++------- vllm/multimodal/video.py | 99 +++++++++++++++-- 5 files changed, 233 insertions(+), 55 deletions(-) diff --git a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py index a49842e1099c2..dfb8d9b2a038d 100644 --- a/tests/models/multimodal/processing/test_glm4_1v.py +++ b/tests/models/multimodal/processing/test_glm4_1v.py @@ -5,6 +5,7 @@ import pytest from vllm.assets.video import VideoAsset from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.video import OpenCVDynamicVideoBackend, OpenCVVideoBackend from ...utils import build_model_context @@ -50,3 +51,49 @@ def test_processor_override( assert grid_t == expected_grid_t assert video_tok_count == expected_toks_per_frame * grid_t + + +@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"]) +@pytest.mark.parametrize("fps", [2]) +def test_video_loader_consistency( + model_id: str, + fps: int, +): + """ + Ensure dynamic video loader (pre-sampled by loader) and normal video + loader (post-sampled by processor) produce same video processing outputs. 
+ """ + ctx = build_model_context( + model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"video": 1}, + ) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + hf_processor_mm_kwargs = {"fps": fps} + + # Build the prompt containing a single video placeholder + prompt = "<|begin_of_video|><|video|><|end_of_video|>" + + video_path = VideoAsset(name="baby_reading", num_frames=-1).video_path + with open(video_path, "rb") as f: + video_bytes = f.read() + + static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes) + dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes( + video_bytes, requested_fps=fps) + + # pre-sampled loader shouldn't read all frames + assert len(dynamic_video) < len(static_video) + + static_mm_data = {"video": [(static_video, static_metadata)]} + dynamic_mm_data = {"video": [(dynamic_video, dynamic_metadata)]} + + static_outputs = processor.apply(prompt, static_mm_data, + hf_processor_mm_kwargs) + dynamic_outputs = processor.apply(prompt, dynamic_mm_data, + hf_processor_mm_kwargs) + + assert static_outputs["prompt_token_ids"] == dynamic_outputs[ + "prompt_token_ids"] + assert static_outputs["mm_kwargs"].get_data( + ) == dynamic_outputs["mm_kwargs"].get_data() diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 886582a516409..e1e8282dd66d4 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -204,6 +204,32 @@ async def test_fetch_video_http(video_url: str, num_frames: int): assert metadata_sync == metadata_async +@pytest.mark.asyncio +@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) +@pytest.mark.parametrize("max_duration", [1, 60, 1800]) +@pytest.mark.parametrize("requested_fps", [2, 24]) +async def test_fetch_video_http_with_dynamic_loader( + video_url: str, max_duration: int, requested_fps: int, + monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_VIDEO_LOADER_BACKEND", 
"opencv_dynamic") + connector = MediaConnector( + media_io_kwargs={ + "video": { + "max_duration": max_duration, + "requested_fps": requested_fps, + } + }) + + video_sync, metadata_sync = connector.fetch_video(video_url) + video_async, metadata_async = await connector.fetch_video_async( + video_url) + + assert np.array_equal(video_sync, video_async) + assert metadata_sync == metadata_async + assert metadata_sync["video_backend"] == "opencv_dynamic" + + # Used for `test_argsort_mm_positions`. class TestCase(NamedTuple): mm_positions: "MultiModalPlaceholderDict" diff --git a/vllm/assets/video.py b/vllm/assets/video.py index 8ab0e9760be87..983e9114cccfb 100644 --- a/vllm/assets/video.py +++ b/vllm/assets/video.py @@ -110,22 +110,23 @@ class VideoAsset: def filename(self) -> str: return self._NAME_TO_FILE[self.name] + @property + def video_path(self) -> str: + return download_video_asset(self.filename) + @property def pil_images(self) -> list[Image.Image]: - video_path = download_video_asset(self.filename) - ret = video_to_pil_images_list(video_path, self.num_frames) + ret = video_to_pil_images_list(self.video_path, self.num_frames) return ret @property def np_ndarrays(self) -> npt.NDArray: - video_path = download_video_asset(self.filename) - ret = video_to_ndarrays(video_path, self.num_frames) + ret = video_to_ndarrays(self.video_path, self.num_frames) return ret @property def metadata(self) -> dict[str, Any]: - video_path = download_video_asset(self.filename) - ret = video_get_metadata(video_path) + ret = video_get_metadata(self.video_path) return ret def get_audio(self, sampling_rate: Optional[float] = None) -> npt.NDArray: @@ -134,5 +135,4 @@ class VideoAsset: See also: examples/offline_inference/qwen2_5_omni/only_thinker.py """ - video_path = download_video_asset(self.filename) - return librosa.load(video_path, sr=sampling_rate)[0] + return librosa.load(self.video_path, sr=sampling_rate)[0] diff --git a/vllm/model_executor/models/glm4_1v.py 
b/vllm/model_executor/models/glm4_1v.py index 846b4b800dd59..539381b618000 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -1023,6 +1023,43 @@ class Glm4vProcessingInfo(BaseProcessingInfo): selected_timestamps.append(timestamps_list[idx]) return selected_timestamps + def _construct_video_placeholder( + self, + video_array: np.ndarray, + metadata: dict[str, Any], + grid_thw: torch.Tensor, + ) -> str: + hf_processor = self.get_hf_processor() + tokenizer = self.get_tokenizer() + image_processor = hf_processor.image_processor + + hf_config = self.get_hf_config() + boi_token_id = hf_config.image_start_token_id + eoi_token_id = hf_config.image_end_token_id + bov_token_id = hf_config.video_start_token_id + eov_token_id = hf_config.video_end_token_id + merge_length = image_processor.merge_size**2 + + assert isinstance(grid_thw, torch.Tensor) + timestamps = self._get_video_second_idx(metadata, len(video_array)) + frames_idx_token = [ + tokenizer.encode(str(i), add_special_tokens=False) + for i in timestamps + ] + T, H, W = grid_thw + num_tokens_per_frame = int(H * W) // merge_length + placeholder = [] + placeholder.append(bov_token_id) + for frame_idx in frames_idx_token: + placeholder.append(boi_token_id) + placeholder.extend([hf_processor.video_token_id] * + num_tokens_per_frame) + placeholder.append(eoi_token_id) + placeholder.extend(frame_idx) + placeholder.append(eov_token_id) + + return placeholder + class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]): @@ -1118,17 +1155,10 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): for item in mm_data.pop("videos", []): video_array, metadata = item - # FIXME(Isotr0py): Activate the below logic after we can disable - # resampling from video loader backend. 
- # assert metadata["total_num_frames"] == len(video_array), ( - # f"Total frames {metadata['total_num_frames']} does not " - # f"match the length of video array {len(video_array)}.") + if metadata["video_backend"] == "opencv_dynamic": + mm_kwargs["do_sample_frames"] = False - # NOTE: Temporary workaround for resampled videos. - # this can cause a divergence with HF implementation if - # the input video is resampled in advance. - - if metadata["total_num_frames"] != len(video_array): + elif metadata["total_num_frames"] != len(video_array): logger.warning( "Total frames in metadata " "(%s) does not match the length of " @@ -1140,11 +1170,10 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): len(video_array), ) metadata["total_num_frames"] = len(video_array) - metadata = VideoMetadata(**metadata) video_mm_data = dict() video_mm_data["videos"] = [[video_array]] - video_mm_data["video_metadata"] = [[metadata]] + video_mm_data["video_metadata"] = [[VideoMetadata(**metadata)]] video_outputs = super()._call_hf_processor( prompt="<|begin_of_video|><|video|><|end_of_video|>", @@ -1152,11 +1181,23 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): mm_kwargs=mm_kwargs, tok_kwargs=tok_kwargs, ) - input_ids = video_outputs.pop("input_ids") - input_ids[input_ids == processor.image_token_id] = ( - processor.video_token_id) - video_placeholder = processor.tokenizer.batch_decode( - input_ids)[0] + if "do_sample_frames" in mm_kwargs and not mm_kwargs[ + "do_sample_frames"]: + # Transformers v4.55 has incorrect timestamps issue for + # skip sampling. We construct the placeholder manually to + # get placeholders with correct timestamps. 
+ placeholder = self.info._construct_video_placeholder( + video_array, + metadata, + video_outputs["video_grid_thw"].squeeze(0), + ) + video_placeholder = processor.tokenizer.decode(placeholder) + else: + input_ids = video_outputs.pop("input_ids") + input_ids[input_ids == processor.image_token_id] = ( + processor.video_token_id) + video_placeholder = processor.tokenizer.batch_decode( + input_ids)[0] prompt = prompt.replace( "<|begin_of_video|><|video|><|end_of_video|>", video_placeholder, @@ -1202,14 +1243,6 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_processor = self.info.get_image_processor( **hf_processor_mm_kwargs) - tokenizer = self.info.get_tokenizer() - hf_config = self.info.get_hf_config() - - boi_token_id = hf_config.image_start_token_id - eoi_token_id = hf_config.image_end_token_id - - bov_token_id = hf_config.video_start_token_id - eov_token_id = hf_config.video_end_token_id merge_length = image_processor.merge_size**2 @@ -1227,21 +1260,8 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): assert isinstance(grid_thw, torch.Tensor) video, metadata = mm_items["video"][item_idx] - timestamps = self.info._get_video_second_idx(metadata, len(video)) - frames_idx_token = [ - tokenizer.encode(str(i), add_special_tokens=False) - for i in timestamps - ] - num_tokens_per_frame = int(grid_thw[1:].prod()) // merge_length - placeholder = [] - placeholder.append(bov_token_id) - for frame_idx in frames_idx_token: - placeholder.append(boi_token_id) - placeholder.extend([hf_processor.video_token_id] * - num_tokens_per_frame) - placeholder.append(eoi_token_id) - placeholder.extend(frame_idx) - placeholder.append(eov_token_id) + placeholder = self.info._construct_video_placeholder( + video, metadata, grid_thw) return PromptUpdateDetails.select_token_id( placeholder, embed_token_id=hf_processor.video_token_id, diff --git 
a/vllm/multimodal/video.py b/vllm/multimodal/video.py index ef1380bdb614c..df6e19da82ca2 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -1,12 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - import base64 +import math from abc import abstractmethod from functools import partial from io import BytesIO from pathlib import Path -from typing import Any +from typing import Any, Union import numpy as np import numpy.typing as npt @@ -104,10 +104,12 @@ class OpenCVVideoBackend(VideoLoader): return api_pref @classmethod - def load_bytes(cls, - data: bytes, - num_frames: int = -1, - **kwargs) -> tuple[npt.NDArray, dict[str, Any]]: + def load_bytes( + cls, + data: bytes, + num_frames: int = -1, + **kwargs, + ) -> tuple[npt.NDArray, dict[str, Any]]: import cv2 backend = cls().get_cv2_video_api() @@ -119,6 +121,15 @@ class OpenCVVideoBackend(VideoLoader): original_fps = cap.get(cv2.CAP_PROP_FPS) duration = total_frames_num / original_fps if original_fps > 0 else 0 + # Use transformers transformers.video_utils.VideoMetadata format + metadata = { + "total_num_frames": total_frames_num, + "fps": original_fps, + "duration": duration, + "video_backend": "opencv" + } + + # resample video to target num_frames full_read = num_frames == -1 or total_frames_num < num_frames if full_read: num_frames = total_frames_num @@ -148,14 +159,88 @@ class OpenCVVideoBackend(VideoLoader): assert i == num_frames, (f"Expected reading {num_frames} frames, " f"but only loaded {i} frames from video.") + return frames, metadata + + +@VIDEO_LOADER_REGISTRY.register("opencv_dynamic") +class OpenCVDynamicVideoBackend(OpenCVVideoBackend): + + @classmethod + def load_bytes( + cls, + data: bytes, + num_frames: int = -1, + requested_fps: int = 2, + max_duration: int = 300, + **kwargs, + ) -> tuple[npt.NDArray, dict[str, Any]]: + import cv2 + + backend = cls().get_cv2_video_api() + cap = cv2.VideoCapture(BytesIO(data), 
backend, []) + if not cap.isOpened(): + raise ValueError("Could not open video stream") + + total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + original_fps = cap.get(cv2.CAP_PROP_FPS) + duration = total_frames_num / original_fps if original_fps > 0 else 0 + # Use transformers transformers.video_utils.VideoMetadata format metadata = { "total_num_frames": total_frames_num, "fps": original_fps, "duration": duration, - "video_backend": "opencv" + "video_backend": "opencv_dynamic" } + # resample video to target num_frames + max_frame_idx = total_frames_num - 1 + duration = duration or round(max_frame_idx / original_fps) + 1 + + # Refer to: + # https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140 + frame_indices: Union[range, list[int]] + if duration <= max_duration: + n = int(math.floor(duration * requested_fps)) + frame_indices = sorted({ + min(max_frame_idx, + int(math.ceil(i * original_fps / requested_fps))) + for i in range(n) + }) + else: + num_samples = int(max_duration * requested_fps) + if num_samples >= total_frames_num: + frame_indices = range(total_frames_num) + else: + target_seconds = np.linspace(0, + duration, + num_samples, + endpoint=True) + frame_indices = sorted({ + min(max_frame_idx, int(math.ceil(t * original_fps))) + for t in target_seconds + }) + + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + frames = np.empty((len(frame_indices), height, width, 3), + dtype=np.uint8) + + i = 0 + for idx in range(total_frames_num): + ok = cap.grab() + if not ok: + break + if idx in frame_indices: + ret, frame = cap.retrieve() + if ret: + frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + i += 1 + + assert i == len(frame_indices), ( + f"Expected reading {len(frame_indices)} frames, " + f"but only loaded {i} frames from video.") + return frames, metadata