[VLM] Optimize GLM4.5-V-style video processing to only decode necessary frames (#24161)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
Isotr0py 2025-09-12 00:44:34 +08:00 committed by GitHub
parent 51d41265ad
commit bcbe2a4d9e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 233 additions and 55 deletions

View File

@ -5,6 +5,7 @@ import pytest
from vllm.assets.video import VideoAsset
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.video import OpenCVDynamicVideoBackend, OpenCVVideoBackend
from ...utils import build_model_context
@ -50,3 +51,49 @@ def test_processor_override(
assert grid_t == expected_grid_t
assert video_tok_count == expected_toks_per_frame * grid_t
@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
@pytest.mark.parametrize("fps", [2])
def test_video_loader_consistency(
    model_id: str,
    fps: int,
):
    """
    Ensure dynamic video loader (pre-sampled by loader) and normal video
    loader (post-sampled by processor) produce same video processing outputs.
    """
    ctx = build_model_context(
        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"video": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
    hf_processor_mm_kwargs = {"fps": fps}

    # Single-video prompt; the processor expands the placeholder tokens.
    prompt = "<|begin_of_video|><|video|><|end_of_video|>"

    video_path = VideoAsset(name="baby_reading", num_frames=-1).video_path
    with open(video_path, "rb") as fh:
        raw_bytes = fh.read()

    # Decode the same bytes through both backends.
    static_video, static_metadata = OpenCVVideoBackend.load_bytes(raw_bytes)
    dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes(
        raw_bytes, requested_fps=fps)

    # The pre-sampling loader must decode strictly fewer frames.
    assert len(dynamic_video) < len(static_video)

    static_outputs, dynamic_outputs = (
        processor.apply(prompt, {"video": [(vid, meta)]},
                        hf_processor_mm_kwargs)
        for vid, meta in ((static_video, static_metadata),
                          (dynamic_video, dynamic_metadata)))

    # Both paths must yield identical token ids and multimodal kwargs.
    assert (static_outputs["prompt_token_ids"] ==
            dynamic_outputs["prompt_token_ids"])
    assert (static_outputs["mm_kwargs"].get_data() ==
            dynamic_outputs["mm_kwargs"].get_data())

View File

@ -204,6 +204,32 @@ async def test_fetch_video_http(video_url: str, num_frames: int):
assert metadata_sync == metadata_async
@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("max_duration", [1, 60, 1800])
@pytest.mark.parametrize("requested_fps", [2, 24])
async def test_fetch_video_http_with_dynamic_loader(
        video_url: str, max_duration: int, requested_fps: int,
        monkeypatch: pytest.MonkeyPatch):
    """Sync and async fetch must agree when the dynamic loader is active."""
    with monkeypatch.context() as m:
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")

        video_io_kwargs = {
            "max_duration": max_duration,
            "requested_fps": requested_fps,
        }
        connector = MediaConnector(media_io_kwargs={"video": video_io_kwargs})

        video_sync, metadata_sync = connector.fetch_video(video_url)
        video_async, metadata_async = await connector.fetch_video_async(
            video_url)

        # Both code paths must decode identical frames and metadata,
        # and the metadata must record the dynamic backend.
        assert np.array_equal(video_sync, video_async)
        assert metadata_sync == metadata_async
        assert metadata_sync["video_backend"] == "opencv_dynamic"
# Used for `test_argsort_mm_positions`.
class TestCase(NamedTuple):
mm_positions: "MultiModalPlaceholderDict"

View File

@ -110,22 +110,23 @@ class VideoAsset:
def filename(self) -> str:
    """File name of this asset's video, looked up from ``self.name``."""
    return self._NAME_TO_FILE[self.name]
@property
def video_path(self) -> str:
    """Local path to the video file (delegates to ``download_video_asset``,
    which presumably fetches the asset on first access — confirm caching
    behavior in that helper)."""
    return download_video_asset(self.filename)
@property
def pil_images(self) -> list[Image.Image]:
    """Decoded frames of this asset as PIL images.

    Frame count is bounded by ``self.num_frames`` (passed through to
    ``video_to_pil_images_list``). The stale duplicate computation that
    re-downloaded the asset into a local path has been removed; the
    shared ``video_path`` property is used instead.
    """
    return video_to_pil_images_list(self.video_path, self.num_frames)
@property
def np_ndarrays(self) -> npt.NDArray:
    """Decoded frames of this asset as a numpy array.

    Frame count is bounded by ``self.num_frames`` (passed through to
    ``video_to_ndarrays``). The stale duplicate computation that
    re-downloaded the asset into a local path has been removed; the
    shared ``video_path`` property is used instead.
    """
    return video_to_ndarrays(self.video_path, self.num_frames)
@property
def metadata(self) -> dict[str, Any]:
    """Metadata dict for this asset's video, as returned by
    ``video_get_metadata``.

    The stale duplicate computation that re-downloaded the asset into a
    local path has been removed; the shared ``video_path`` property is
    used instead.
    """
    return video_get_metadata(self.video_path)
def get_audio(self, sampling_rate: Optional[float] = None) -> npt.NDArray:
    """Extract the audio track of this video asset as a numpy array.

    Args:
        sampling_rate: Target sample rate passed to ``librosa.load``'s
            ``sr`` argument; ``None`` keeps the file's native rate.

    See also: examples/offline_inference/qwen2_5_omni/only_thinker.py
    """
    # Stale duplicate path computation removed; use the shared property.
    return librosa.load(self.video_path, sr=sampling_rate)[0]

View File

@ -1023,6 +1023,43 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
selected_timestamps.append(timestamps_list[idx])
return selected_timestamps
def _construct_video_placeholder(
    self,
    video_array: np.ndarray,
    metadata: dict[str, Any],
    grid_thw: torch.Tensor,
) -> list[int]:
    """Build the placeholder token-id sequence for one video.

    Layout:
        <bov> (<boi> <video_token>*N <eoi> <timestamp tokens>)*T <eov>
    where N is the number of merged patch tokens per frame and the
    timestamp tokens encode each sampled frame's second index.

    Args:
        video_array: Decoded frames; only its length (frame count) is
            used, to derive the timestamps.
        metadata: Video metadata consumed by ``_get_video_second_idx``.
        grid_thw: 1-D (T, H, W) patch grid tensor for this video.

    Returns:
        Flat list of placeholder token ids (callers decode it back to a
        string with the tokenizer). Note: the previous ``-> str``
        annotation was wrong — a list is returned.
    """
    hf_processor = self.get_hf_processor()
    tokenizer = self.get_tokenizer()
    image_processor = hf_processor.image_processor
    hf_config = self.get_hf_config()

    boi_token_id = hf_config.image_start_token_id
    eoi_token_id = hf_config.image_end_token_id
    bov_token_id = hf_config.video_start_token_id
    eov_token_id = hf_config.video_end_token_id
    merge_length = image_processor.merge_size**2

    assert isinstance(grid_thw, torch.Tensor)

    timestamps = self._get_video_second_idx(metadata, len(video_array))
    # Each timestamp is rendered as its decimal-string token ids.
    frames_idx_token = [
        tokenizer.encode(str(i), add_special_tokens=False)
        for i in timestamps
    ]
    # T is implied by len(timestamps); only H and W are needed here.
    _, H, W = grid_thw
    num_tokens_per_frame = int(H * W) // merge_length
    placeholder = []
    placeholder.append(bov_token_id)
    for frame_idx in frames_idx_token:
        placeholder.append(boi_token_id)
        placeholder.extend([hf_processor.video_token_id] *
                           num_tokens_per_frame)
        placeholder.append(eoi_token_id)
        placeholder.extend(frame_idx)
    placeholder.append(eov_token_id)
    return placeholder
class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]):
@ -1118,17 +1155,10 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
for item in mm_data.pop("videos", []):
video_array, metadata = item
# FIXME(Isotr0py): Activate the below logic after we can disable
# resampling from video loader backend.
# assert metadata["total_num_frames"] == len(video_array), (
# f"Total frames {metadata['total_num_frames']} does not "
# f"match the length of video array {len(video_array)}.")
if metadata["video_backend"] == "opencv_dynamic":
mm_kwargs["do_sample_frames"] = False
# NOTE: Temporary workaround for resampled videos.
# this can cause a divergence with HF implementation if
# the input video is resampled in advance.
if metadata["total_num_frames"] != len(video_array):
elif metadata["total_num_frames"] != len(video_array):
logger.warning(
"Total frames in metadata "
"(%s) does not match the length of "
@ -1140,11 +1170,10 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
len(video_array),
)
metadata["total_num_frames"] = len(video_array)
metadata = VideoMetadata(**metadata)
video_mm_data = dict()
video_mm_data["videos"] = [[video_array]]
video_mm_data["video_metadata"] = [[metadata]]
video_mm_data["video_metadata"] = [[VideoMetadata(**metadata)]]
video_outputs = super()._call_hf_processor(
prompt="<|begin_of_video|><|video|><|end_of_video|>",
@ -1152,11 +1181,23 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
mm_kwargs=mm_kwargs,
tok_kwargs=tok_kwargs,
)
input_ids = video_outputs.pop("input_ids")
input_ids[input_ids == processor.image_token_id] = (
processor.video_token_id)
video_placeholder = processor.tokenizer.batch_decode(
input_ids)[0]
if "do_sample_frames" in mm_kwargs and not mm_kwargs[
"do_sample_frames"]:
# Transformers v4.55 has incorrect timestamps issue for
# skip sampling. We construct the placeholder manually to
# get placeholders with correct timestamps.
placeholder = self.info._construct_video_placeholder(
video_array,
metadata,
video_outputs["video_grid_thw"].squeeze(0),
)
video_placeholder = processor.tokenizer.decode(placeholder)
else:
input_ids = video_outputs.pop("input_ids")
input_ids[input_ids == processor.image_token_id] = (
processor.video_token_id)
video_placeholder = processor.tokenizer.batch_decode(
input_ids)[0]
prompt = prompt.replace(
"<|begin_of_video|><|video|><|end_of_video|>",
video_placeholder,
@ -1202,14 +1243,6 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
image_processor = self.info.get_image_processor(
**hf_processor_mm_kwargs)
tokenizer = self.info.get_tokenizer()
hf_config = self.info.get_hf_config()
boi_token_id = hf_config.image_start_token_id
eoi_token_id = hf_config.image_end_token_id
bov_token_id = hf_config.video_start_token_id
eov_token_id = hf_config.video_end_token_id
merge_length = image_processor.merge_size**2
@ -1227,21 +1260,8 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
assert isinstance(grid_thw, torch.Tensor)
video, metadata = mm_items["video"][item_idx]
timestamps = self.info._get_video_second_idx(metadata, len(video))
frames_idx_token = [
tokenizer.encode(str(i), add_special_tokens=False)
for i in timestamps
]
num_tokens_per_frame = int(grid_thw[1:].prod()) // merge_length
placeholder = []
placeholder.append(bov_token_id)
for frame_idx in frames_idx_token:
placeholder.append(boi_token_id)
placeholder.extend([hf_processor.video_token_id] *
num_tokens_per_frame)
placeholder.append(eoi_token_id)
placeholder.extend(frame_idx)
placeholder.append(eov_token_id)
placeholder = self.info._construct_video_placeholder(
video, metadata, grid_thw)
return PromptUpdateDetails.select_token_id(
placeholder,
embed_token_id=hf_processor.video_token_id,

View File

@ -1,12 +1,12 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import math
from abc import abstractmethod
from functools import partial
from io import BytesIO
from pathlib import Path
from typing import Any
from typing import Any, Union
import numpy as np
import numpy.typing as npt
@ -104,10 +104,12 @@ class OpenCVVideoBackend(VideoLoader):
return api_pref
@classmethod
def load_bytes(cls,
data: bytes,
num_frames: int = -1,
**kwargs) -> tuple[npt.NDArray, dict[str, Any]]:
def load_bytes(
cls,
data: bytes,
num_frames: int = -1,
**kwargs,
) -> tuple[npt.NDArray, dict[str, Any]]:
import cv2
backend = cls().get_cv2_video_api()
@ -119,6 +121,15 @@ class OpenCVVideoBackend(VideoLoader):
original_fps = cap.get(cv2.CAP_PROP_FPS)
duration = total_frames_num / original_fps if original_fps > 0 else 0
# Use transformers.video_utils.VideoMetadata format
metadata = {
"total_num_frames": total_frames_num,
"fps": original_fps,
"duration": duration,
"video_backend": "opencv"
}
# resample video to target num_frames
full_read = num_frames == -1 or total_frames_num < num_frames
if full_read:
num_frames = total_frames_num
@ -148,14 +159,88 @@ class OpenCVVideoBackend(VideoLoader):
assert i == num_frames, (f"Expected reading {num_frames} frames, "
f"but only loaded {i} frames from video.")
return frames, metadata
@VIDEO_LOADER_REGISTRY.register("opencv_dynamic")
class OpenCVDynamicVideoBackend(OpenCVVideoBackend):
    """OpenCV loader that pre-samples frames at decode time.

    Instead of decoding every frame and letting the HF processor
    resample, this backend computes the frame indices GLM-4V's video
    processor would select and decodes only those, mirroring the
    sampling logic in transformers' ``video_processing_glm4v``.
    """

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = -1,
        requested_fps: int = 2,
        max_duration: int = 300,
        **kwargs,
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """Decode only the frames needed for ``requested_fps`` sampling.

        Args:
            data: Raw encoded video bytes.
            num_frames: Unused; kept for signature compatibility with the
                base class.
            requested_fps: Target sampling rate, frames per second.
            max_duration: Videos longer than this (seconds) are sampled
                with a fixed budget of ``max_duration * requested_fps``
                frames spread evenly over the duration.

        Returns:
            ``(frames, metadata)`` — RGB frames shaped (n, H, W, 3) and a
            transformers.video_utils.VideoMetadata-style dict whose
            ``video_backend`` is ``"opencv_dynamic"`` (this tells the
            processor to skip its own resampling).

        Raises:
            ValueError: If OpenCV cannot open the byte stream.
        """
        import cv2

        backend = cls().get_cv2_video_api()
        cap = cv2.VideoCapture(BytesIO(data), backend, [])
        if not cap.isOpened():
            raise ValueError("Could not open video stream")

        total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        original_fps = cap.get(cv2.CAP_PROP_FPS)
        duration = total_frames_num / original_fps if original_fps > 0 else 0

        # Use transformers.video_utils.VideoMetadata format. (Removed a
        # stale duplicate "video_backend": "opencv" entry left over from
        # the base implementation.)
        metadata = {
            "total_num_frames": total_frames_num,
            "fps": original_fps,
            "duration": duration,
            "video_backend": "opencv_dynamic"
        }

        # Pick which frame indices to decode.
        max_frame_idx = total_frames_num - 1
        # NOTE(review): if original_fps <= 0, duration is 0 (falsy) and
        # this fallback divides by zero — confirm inputs always carry a
        # positive fps (the HF reference has the same shape).
        duration = duration or round(max_frame_idx / original_fps) + 1
        # Refer to:
        # https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140
        frame_indices: Union[range, list[int]]
        if duration <= max_duration:
            # Short video: one frame per 1/requested_fps seconds,
            # deduplicated and clamped to the last frame.
            n = int(math.floor(duration * requested_fps))
            frame_indices = sorted({
                min(max_frame_idx,
                    int(math.ceil(i * original_fps / requested_fps)))
                for i in range(n)
            })
        else:
            # Long video: spread a fixed frame budget evenly over time.
            num_samples = int(max_duration * requested_fps)
            if num_samples >= total_frames_num:
                frame_indices = range(total_frames_num)
            else:
                target_seconds = np.linspace(0,
                                             duration,
                                             num_samples,
                                             endpoint=True)
                frame_indices = sorted({
                    min(max_frame_idx, int(math.ceil(t * original_fps)))
                    for t in target_seconds
                })

        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        frames = np.empty((len(frame_indices), height, width, 3),
                          dtype=np.uint8)

        # O(1) membership test for the per-frame loop (frame_indices may
        # be a list, whose `in` is O(k)).
        wanted = frozenset(frame_indices)

        # grab() every frame sequentially (cheap header advance) but
        # retrieve()/decode only the selected ones.
        i = 0
        for idx in range(total_frames_num):
            ok = cap.grab()
            if not ok:
                break
            if idx in wanted:
                ret, frame = cap.retrieve()
                if ret:
                    frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    i += 1

        assert i == len(frame_indices), (
            f"Expected reading {len(frame_indices)} frames, "
            f"but only loaded {i} frames from video.")

        return frames, metadata