mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-08 02:22:18 +08:00
[VLM] Optimize GLM4.5-V-style video processing to only decode necessary frames (#24161)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
parent
51d41265ad
commit
bcbe2a4d9e
@ -5,6 +5,7 @@ import pytest
|
|||||||
|
|
||||||
from vllm.assets.video import VideoAsset
|
from vllm.assets.video import VideoAsset
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
|
from vllm.multimodal.video import OpenCVDynamicVideoBackend, OpenCVVideoBackend
|
||||||
|
|
||||||
from ...utils import build_model_context
|
from ...utils import build_model_context
|
||||||
|
|
||||||
@ -50,3 +51,49 @@ def test_processor_override(
|
|||||||
|
|
||||||
assert grid_t == expected_grid_t
|
assert grid_t == expected_grid_t
|
||||||
assert video_tok_count == expected_toks_per_frame * grid_t
|
assert video_tok_count == expected_toks_per_frame * grid_t
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
@pytest.mark.parametrize("fps", [2])
def test_video_loader_consistency(
    model_id: str,
    fps: int,
):
    """
    The dynamic loader samples frames while loading, whereas the regular
    loader hands over frames that the processor samples afterwards. Both
    routes must produce the same video processing outputs.
    """
    hf_processor_mm_kwargs = {"fps": fps}
    # A prompt holding a single video placeholder.
    prompt = "<|begin_of_video|><|video|><|end_of_video|>"

    ctx = build_model_context(
        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"video": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)

    asset = VideoAsset(name="baby_reading", num_frames=-1)
    with open(asset.video_path, "rb") as f:
        video_bytes = f.read()

    static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes)
    dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes(
        video_bytes, requested_fps=fps)

    # The pre-sampling loader must return fewer frames than a full read.
    assert len(dynamic_video) < len(static_video)

    def run_processor(video, metadata):
        # Same prompt and kwargs for both loaders; only the video differs.
        mm_data = {"video": [(video, metadata)]}
        return processor.apply(prompt, mm_data, hf_processor_mm_kwargs)

    static_outputs = run_processor(static_video, static_metadata)
    dynamic_outputs = run_processor(dynamic_video, dynamic_metadata)

    static_ids = static_outputs["prompt_token_ids"]
    dynamic_ids = dynamic_outputs["prompt_token_ids"]
    assert static_ids == dynamic_ids

    static_mm = static_outputs["mm_kwargs"].get_data()
    dynamic_mm = dynamic_outputs["mm_kwargs"].get_data()
    assert static_mm == dynamic_mm
|
||||||
|
|||||||
@ -204,6 +204,32 @@ async def test_fetch_video_http(video_url: str, num_frames: int):
|
|||||||
assert metadata_sync == metadata_async
|
assert metadata_sync == metadata_async
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("max_duration", [1, 60, 1800])
@pytest.mark.parametrize("requested_fps", [2, 24])
async def test_fetch_video_http_with_dynamic_loader(
        video_url: str, max_duration: int, requested_fps: int,
        monkeypatch: pytest.MonkeyPatch):
    """
    With the ``opencv_dynamic`` backend selected through the environment,
    the sync and async fetch paths must return the same frames and the
    same metadata, and the metadata must name that backend.
    """
    video_io_kwargs = {
        "max_duration": max_duration,
        "requested_fps": requested_fps,
    }

    with monkeypatch.context() as m:
        # Select the backend before the connector is built and used.
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")
        connector = MediaConnector(
            media_io_kwargs={"video": video_io_kwargs})

        sync_result = connector.fetch_video(video_url)
        async_result = await connector.fetch_video_async(video_url)

        video_sync, metadata_sync = sync_result
        video_async, metadata_async = async_result

        assert np.array_equal(video_sync, video_async)
        assert metadata_sync == metadata_async
        assert metadata_sync["video_backend"] == "opencv_dynamic"
|
||||||
|
|
||||||
|
|
||||||
# Used for `test_argsort_mm_positions`.
|
# Used for `test_argsort_mm_positions`.
|
||||||
class TestCase(NamedTuple):
|
class TestCase(NamedTuple):
|
||||||
mm_positions: "MultiModalPlaceholderDict"
|
mm_positions: "MultiModalPlaceholderDict"
|
||||||
|
|||||||
@ -110,22 +110,23 @@ class VideoAsset:
|
|||||||
def filename(self) -> str:
|
def filename(self) -> str:
|
||||||
return self._NAME_TO_FILE[self.name]
|
return self._NAME_TO_FILE[self.name]
|
||||||
|
|
||||||
|
@property
def video_path(self) -> str:
    """Value returned by ``download_video_asset`` for this asset's file."""
    asset_file = self.filename
    return download_video_asset(asset_file)
|
||||||
|
|
||||||
@property
def pil_images(self) -> list[Image.Image]:
    """Result of ``video_to_pil_images_list`` for this asset's video."""
    return video_to_pil_images_list(self.video_path, self.num_frames)
|
||||||
|
|
||||||
@property
def np_ndarrays(self) -> npt.NDArray:
    """Result of ``video_to_ndarrays`` for this asset's video."""
    return video_to_ndarrays(self.video_path, self.num_frames)
|
||||||
|
|
||||||
@property
def metadata(self) -> dict[str, Any]:
    """Result of ``video_get_metadata`` for this asset's video."""
    return video_get_metadata(self.video_path)
|
||||||
|
|
||||||
def get_audio(self, sampling_rate: Optional[float] = None) -> npt.NDArray:
|
def get_audio(self, sampling_rate: Optional[float] = None) -> npt.NDArray:
|
||||||
@ -134,5 +135,4 @@ class VideoAsset:
|
|||||||
|
|
||||||
See also: examples/offline_inference/qwen2_5_omni/only_thinker.py
|
See also: examples/offline_inference/qwen2_5_omni/only_thinker.py
|
||||||
"""
|
"""
|
||||||
video_path = download_video_asset(self.filename)
|
return librosa.load(self.video_path, sr=sampling_rate)[0]
|
||||||
return librosa.load(video_path, sr=sampling_rate)[0]
|
|
||||||
|
|||||||
@ -1023,6 +1023,43 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
|
|||||||
selected_timestamps.append(timestamps_list[idx])
|
selected_timestamps.append(timestamps_list[idx])
|
||||||
return selected_timestamps
|
return selected_timestamps
|
||||||
|
|
||||||
|
def _construct_video_placeholder(
|
||||||
|
self,
|
||||||
|
video_array: np.ndarray,
|
||||||
|
metadata: dict[str, Any],
|
||||||
|
grid_thw: torch.Tensor,
|
||||||
|
) -> str:
|
||||||
|
hf_processor = self.get_hf_processor()
|
||||||
|
tokenizer = self.get_tokenizer()
|
||||||
|
image_processor = hf_processor.image_processor
|
||||||
|
|
||||||
|
hf_config = self.get_hf_config()
|
||||||
|
boi_token_id = hf_config.image_start_token_id
|
||||||
|
eoi_token_id = hf_config.image_end_token_id
|
||||||
|
bov_token_id = hf_config.video_start_token_id
|
||||||
|
eov_token_id = hf_config.video_end_token_id
|
||||||
|
merge_length = image_processor.merge_size**2
|
||||||
|
|
||||||
|
assert isinstance(grid_thw, torch.Tensor)
|
||||||
|
timestamps = self._get_video_second_idx(metadata, len(video_array))
|
||||||
|
frames_idx_token = [
|
||||||
|
tokenizer.encode(str(i), add_special_tokens=False)
|
||||||
|
for i in timestamps
|
||||||
|
]
|
||||||
|
T, H, W = grid_thw
|
||||||
|
num_tokens_per_frame = int(H * W) // merge_length
|
||||||
|
placeholder = []
|
||||||
|
placeholder.append(bov_token_id)
|
||||||
|
for frame_idx in frames_idx_token:
|
||||||
|
placeholder.append(boi_token_id)
|
||||||
|
placeholder.extend([hf_processor.video_token_id] *
|
||||||
|
num_tokens_per_frame)
|
||||||
|
placeholder.append(eoi_token_id)
|
||||||
|
placeholder.extend(frame_idx)
|
||||||
|
placeholder.append(eov_token_id)
|
||||||
|
|
||||||
|
return placeholder
|
||||||
|
|
||||||
|
|
||||||
class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]):
|
class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]):
|
||||||
|
|
||||||
@ -1118,17 +1155,10 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
|
|||||||
for item in mm_data.pop("videos", []):
|
for item in mm_data.pop("videos", []):
|
||||||
video_array, metadata = item
|
video_array, metadata = item
|
||||||
|
|
||||||
# FIXME(Isotr0py): Activate the below logic after we can disable
|
if metadata["video_backend"] == "opencv_dynamic":
|
||||||
# resampling from video loader backend.
|
mm_kwargs["do_sample_frames"] = False
|
||||||
# assert metadata["total_num_frames"] == len(video_array), (
|
|
||||||
# f"Total frames {metadata['total_num_frames']} does not "
|
|
||||||
# f"match the length of video array {len(video_array)}.")
|
|
||||||
|
|
||||||
# NOTE: Temporary workaround for resampled videos.
|
elif metadata["total_num_frames"] != len(video_array):
|
||||||
# this can cause a divergence with HF implementation if
|
|
||||||
# the input video is resampled in advance.
|
|
||||||
|
|
||||||
if metadata["total_num_frames"] != len(video_array):
|
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Total frames in metadata "
|
"Total frames in metadata "
|
||||||
"(%s) does not match the length of "
|
"(%s) does not match the length of "
|
||||||
@ -1140,11 +1170,10 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
|
|||||||
len(video_array),
|
len(video_array),
|
||||||
)
|
)
|
||||||
metadata["total_num_frames"] = len(video_array)
|
metadata["total_num_frames"] = len(video_array)
|
||||||
metadata = VideoMetadata(**metadata)
|
|
||||||
|
|
||||||
video_mm_data = dict()
|
video_mm_data = dict()
|
||||||
video_mm_data["videos"] = [[video_array]]
|
video_mm_data["videos"] = [[video_array]]
|
||||||
video_mm_data["video_metadata"] = [[metadata]]
|
video_mm_data["video_metadata"] = [[VideoMetadata(**metadata)]]
|
||||||
|
|
||||||
video_outputs = super()._call_hf_processor(
|
video_outputs = super()._call_hf_processor(
|
||||||
prompt="<|begin_of_video|><|video|><|end_of_video|>",
|
prompt="<|begin_of_video|><|video|><|end_of_video|>",
|
||||||
@ -1152,11 +1181,23 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
|
|||||||
mm_kwargs=mm_kwargs,
|
mm_kwargs=mm_kwargs,
|
||||||
tok_kwargs=tok_kwargs,
|
tok_kwargs=tok_kwargs,
|
||||||
)
|
)
|
||||||
input_ids = video_outputs.pop("input_ids")
|
if "do_sample_frames" in mm_kwargs and not mm_kwargs[
|
||||||
input_ids[input_ids == processor.image_token_id] = (
|
"do_sample_frames"]:
|
||||||
processor.video_token_id)
|
# Transformers v4.55 has incorrect timestamps issue for
|
||||||
video_placeholder = processor.tokenizer.batch_decode(
|
# skip sampling. We construct the placeholder manually to
|
||||||
input_ids)[0]
|
# get placeholders with correct timestamps.
|
||||||
|
placeholder = self.info._construct_video_placeholder(
|
||||||
|
video_array,
|
||||||
|
metadata,
|
||||||
|
video_outputs["video_grid_thw"].squeeze(0),
|
||||||
|
)
|
||||||
|
video_placeholder = processor.tokenizer.decode(placeholder)
|
||||||
|
else:
|
||||||
|
input_ids = video_outputs.pop("input_ids")
|
||||||
|
input_ids[input_ids == processor.image_token_id] = (
|
||||||
|
processor.video_token_id)
|
||||||
|
video_placeholder = processor.tokenizer.batch_decode(
|
||||||
|
input_ids)[0]
|
||||||
prompt = prompt.replace(
|
prompt = prompt.replace(
|
||||||
"<|begin_of_video|><|video|><|end_of_video|>",
|
"<|begin_of_video|><|video|><|end_of_video|>",
|
||||||
video_placeholder,
|
video_placeholder,
|
||||||
@ -1202,14 +1243,6 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
|
|||||||
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
|
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
|
||||||
image_processor = self.info.get_image_processor(
|
image_processor = self.info.get_image_processor(
|
||||||
**hf_processor_mm_kwargs)
|
**hf_processor_mm_kwargs)
|
||||||
tokenizer = self.info.get_tokenizer()
|
|
||||||
hf_config = self.info.get_hf_config()
|
|
||||||
|
|
||||||
boi_token_id = hf_config.image_start_token_id
|
|
||||||
eoi_token_id = hf_config.image_end_token_id
|
|
||||||
|
|
||||||
bov_token_id = hf_config.video_start_token_id
|
|
||||||
eov_token_id = hf_config.video_end_token_id
|
|
||||||
|
|
||||||
merge_length = image_processor.merge_size**2
|
merge_length = image_processor.merge_size**2
|
||||||
|
|
||||||
@ -1227,21 +1260,8 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
|
|||||||
assert isinstance(grid_thw, torch.Tensor)
|
assert isinstance(grid_thw, torch.Tensor)
|
||||||
|
|
||||||
video, metadata = mm_items["video"][item_idx]
|
video, metadata = mm_items["video"][item_idx]
|
||||||
timestamps = self.info._get_video_second_idx(metadata, len(video))
|
placeholder = self.info._construct_video_placeholder(
|
||||||
frames_idx_token = [
|
video, metadata, grid_thw)
|
||||||
tokenizer.encode(str(i), add_special_tokens=False)
|
|
||||||
for i in timestamps
|
|
||||||
]
|
|
||||||
num_tokens_per_frame = int(grid_thw[1:].prod()) // merge_length
|
|
||||||
placeholder = []
|
|
||||||
placeholder.append(bov_token_id)
|
|
||||||
for frame_idx in frames_idx_token:
|
|
||||||
placeholder.append(boi_token_id)
|
|
||||||
placeholder.extend([hf_processor.video_token_id] *
|
|
||||||
num_tokens_per_frame)
|
|
||||||
placeholder.append(eoi_token_id)
|
|
||||||
placeholder.extend(frame_idx)
|
|
||||||
placeholder.append(eov_token_id)
|
|
||||||
return PromptUpdateDetails.select_token_id(
|
return PromptUpdateDetails.select_token_id(
|
||||||
placeholder,
|
placeholder,
|
||||||
embed_token_id=hf_processor.video_token_id,
|
embed_token_id=hf_processor.video_token_id,
|
||||||
|
|||||||
@ -1,12 +1,12 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import base64
|
import base64
|
||||||
|
import math
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import numpy.typing as npt
|
import numpy.typing as npt
|
||||||
@ -104,10 +104,12 @@ class OpenCVVideoBackend(VideoLoader):
|
|||||||
return api_pref
|
return api_pref
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def load_bytes(cls,
|
def load_bytes(
|
||||||
data: bytes,
|
cls,
|
||||||
num_frames: int = -1,
|
data: bytes,
|
||||||
**kwargs) -> tuple[npt.NDArray, dict[str, Any]]:
|
num_frames: int = -1,
|
||||||
|
**kwargs,
|
||||||
|
) -> tuple[npt.NDArray, dict[str, Any]]:
|
||||||
import cv2
|
import cv2
|
||||||
|
|
||||||
backend = cls().get_cv2_video_api()
|
backend = cls().get_cv2_video_api()
|
||||||
@ -119,6 +121,15 @@ class OpenCVVideoBackend(VideoLoader):
|
|||||||
original_fps = cap.get(cv2.CAP_PROP_FPS)
|
original_fps = cap.get(cv2.CAP_PROP_FPS)
|
||||||
duration = total_frames_num / original_fps if original_fps > 0 else 0
|
duration = total_frames_num / original_fps if original_fps > 0 else 0
|
||||||
|
|
||||||
|
# Use the transformers.video_utils.VideoMetadata format
|
||||||
|
metadata = {
|
||||||
|
"total_num_frames": total_frames_num,
|
||||||
|
"fps": original_fps,
|
||||||
|
"duration": duration,
|
||||||
|
"video_backend": "opencv"
|
||||||
|
}
|
||||||
|
|
||||||
|
# resample video to target num_frames
|
||||||
full_read = num_frames == -1 or total_frames_num < num_frames
|
full_read = num_frames == -1 or total_frames_num < num_frames
|
||||||
if full_read:
|
if full_read:
|
||||||
num_frames = total_frames_num
|
num_frames = total_frames_num
|
||||||
@ -148,14 +159,88 @@ class OpenCVVideoBackend(VideoLoader):
|
|||||||
assert i == num_frames, (f"Expected reading {num_frames} frames, "
|
assert i == num_frames, (f"Expected reading {num_frames} frames, "
|
||||||
f"but only loaded {i} frames from video.")
|
f"but only loaded {i} frames from video.")
|
||||||
|
|
||||||
|
return frames, metadata
|
||||||
|
|
||||||
|
|
||||||
|
@VIDEO_LOADER_REGISTRY.register("opencv_dynamic")
class OpenCVDynamicVideoBackend(OpenCVVideoBackend):
    """OpenCV loader that retrieves only the frames chosen by FPS sampling.

    Frame indices are picked up front from the stream's frame count and
    frame rate, so only those frames are retrieved and colour-converted.
    The returned metadata is tagged with ``"video_backend":
    "opencv_dynamic"``.
    """

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = -1,
        requested_fps: int = 2,
        max_duration: int = 300,
        **kwargs,
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """Decode a video from bytes, keeping only the sampled frames.

        Args:
            data: Encoded video bytes.
            num_frames: Not used by this backend; kept so the signature
                matches the parent loader.
            requested_fps: Target sampling rate for videos no longer than
                ``max_duration``.
            max_duration: Duration threshold. Longer videos are sampled
                at ``max_duration * requested_fps`` evenly spaced
                timestamps instead.
            **kwargs: Ignored.

        Returns:
            A ``(frames, metadata)`` tuple. ``frames`` is a uint8 array of
            shape ``(num_selected, height, width, 3)`` in RGB order.
            ``metadata`` describes the whole stream, so its
            ``total_num_frames`` is the stream's frame count rather than
            ``len(frames)``.

        Raises:
            ValueError: If the stream cannot be opened or reports a
                non-positive frame rate.
        """
        import cv2

        backend = cls().get_cv2_video_api()
        cap = cv2.VideoCapture(BytesIO(data), backend, [])
        try:
            if not cap.isOpened():
                raise ValueError("Could not open video stream")

            total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            original_fps = cap.get(cv2.CAP_PROP_FPS)
            if original_fps <= 0:
                # Every sampling formula below divides by or scales with
                # the frame rate, so fail clearly instead of dividing by 0.
                raise ValueError(
                    "Could not determine the frame rate of the video "
                    "stream, which is required for FPS-based sampling")
            duration = total_frames_num / original_fps

            # Use the transformers.video_utils.VideoMetadata format
            metadata = {
                "total_num_frames": total_frames_num,
                "fps": original_fps,
                "duration": duration,
                "video_backend": "opencv_dynamic"
            }

            max_frame_idx = total_frames_num - 1
            duration = duration or round(max_frame_idx / original_fps) + 1

            # Refer to:
            # https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140
            selected: set[int]
            if duration <= max_duration:
                n = int(math.floor(duration * requested_fps))
                selected = {
                    min(max_frame_idx,
                        int(math.ceil(i * original_fps / requested_fps)))
                    for i in range(n)
                }
            else:
                num_samples = int(max_duration * requested_fps)
                if num_samples >= total_frames_num:
                    selected = set(range(total_frames_num))
                else:
                    target_seconds = np.linspace(0,
                                                 duration,
                                                 num_samples,
                                                 endpoint=True)
                    selected = {
                        min(max_frame_idx, int(math.ceil(t * original_fps)))
                        for t in target_seconds
                    }

            num_selected = len(selected)
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            frames = np.empty((num_selected, height, width, 3),
                              dtype=np.uint8)

            i = 0
            for idx in range(total_frames_num):
                if i == num_selected:
                    # Every wanted frame is stored; skip the remainder.
                    break
                ok = cap.grab()
                if not ok:
                    break
                # Frames are visited in ascending order, so filling
                # ``frames`` sequentially keeps them in temporal order.
                if idx in selected:
                    ret, frame = cap.retrieve()
                    if ret:
                        frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                        i += 1
        finally:
            cap.release()

        assert i == num_selected, (
            f"Expected reading {num_selected} frames, "
            f"but only loaded {i} frames from video.")

        return frames, metadata
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user