[Bugfix] Handle broken frames in video loading (#29001)
Signed-off-by: gcanlin <canlinguosdu@gmail.com>
Signed-off-by: 凌葭 <lvjiang.lj@alibaba-inc.com>
Co-authored-by: 凌葭 <lvjiang.lj@alibaba-inc.com>
parent 0cca9b4d13
commit fe25772aa9
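This fix makes the OpenCV-based video loaders skip frames that cannot be grabbed or retrieved, instead of aborting the whole load, and keeps the returned metadata consistent with the frames that were actually decoded. A minimal sketch of the resulting behaviour, mirroring the regression test added below (the import path and the video path are assumptions for illustration):

    from pathlib import Path

    # Assumed import location of the registry touched by this commit.
    from vllm.multimodal.video import VIDEO_LOADER_REGISTRY

    # Any MP4 that contains broken/unreadable frames; the path is illustrative.
    video_bytes = Path("tests/multimodal/assets/corrupted.mp4").read_bytes()

    loader = VIDEO_LOADER_REGISTRY.load("opencv")
    frames, metadata = loader.load_bytes(video_bytes, num_frames=-1)

    # Broken frames are skipped rather than raising an error, and the
    # metadata reports only the frames that were actually decoded.
    assert frames.shape[0] == len(metadata["frames_indices"])
    assert frames.shape[0] <= metadata["total_num_frames"]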
BIN  tests/multimodal/assets/corrupted.mp4  (new binary file; content not shown)
@@ -18,6 +18,7 @@ from .utils import cosine_similarity, create_video_from_image, normalize_image
 
 pytestmark = pytest.mark.cpu_test
 
+ASSETS_DIR = Path(__file__).parent / "assets"
 NUM_FRAMES = 10
 FAKE_OUTPUT_1 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
 FAKE_OUTPUT_2 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
@@ -140,3 +141,39 @@ def test_opencv_video_io_colorspace(is_color: bool, fourcc: str, ext: str):
     )
     assert np.sum(np.isnan(sim)) / sim.size < 0.001
     assert np.nanmean(sim) > 0.99
+
+
+def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
+    """
+    Regression test for handling videos with broken frames.
+
+    This test uses a pre-corrupted video file (assets/corrupted.mp4) that
+    contains broken/unreadable frames to verify the video loader handles
+    them gracefully without crashing and returns accurate metadata.
+    """
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv")
+
+        # Load the pre-corrupted video file that contains broken frames
+        corrupted_video_path = ASSETS_DIR / "corrupted.mp4"
+
+        with open(corrupted_video_path, "rb") as f:
+            video_data = f.read()
+
+        loader = VIDEO_LOADER_REGISTRY.load("opencv")
+        frames, metadata = loader.load_bytes(video_data, num_frames=-1)
+
+        # Verify metadata consistency:
+        # frames_indices must match actual loaded frames
+        assert frames.shape[0] == len(metadata["frames_indices"]), (
+            f"Frames array size must equal frames_indices length. "
+            f"Got {frames.shape[0]} frames but "
+            f"{len(metadata['frames_indices'])} indices"
+        )
+
+        # Verify that broken frames were skipped:
+        # loaded frames should be less than total
+        assert frames.shape[0] < metadata["total_num_frames"], (
+            f"Should load fewer frames than total due to broken frames. "
+            f"Expected fewer than {metadata['total_num_frames']} frames, "
+            f"but loaded {frames.shape[0]} frames"
+        )
@@ -63,6 +63,63 @@ class VideoLoader:
     ) -> tuple[npt.NDArray, dict[str, Any]]:
         raise NotImplementedError
 
+    @staticmethod
+    def _read_frames(
+        cap,
+        frame_indices: set[int],
+        num_expected_frames: int,
+        max_frame_idx: int,
+    ) -> tuple[npt.NDArray, int, list[int]]:
+        import cv2
+
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        frames = np.empty((num_expected_frames, height, width, 3), dtype=np.uint8)
+
+        i = 0
+        valid_frame_indices = []
+        for idx in range(max_frame_idx + 1):
+            ok = cap.grab()
+            if not ok:
+                # Frame is broken/unreadable, log warning
+                if idx in frame_indices:
+                    logger.warning(
+                        "Failed to grab frame %d during video loading. "
+                        "This frame will be skipped.",
+                        idx,
+                    )
+                continue
+            if idx in frame_indices:
+                ret, frame = cap.retrieve()
+                if ret:
+                    frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                    valid_frame_indices.append(idx)
+                    i += 1
+                else:
+                    # retrieve() failed even though grab() succeeded
+                    logger.warning(
+                        "Failed to retrieve frame %d during video loading. "
+                        "This frame will be skipped.",
+                        idx,
+                    )
+
+        valid_num_frames = len(valid_frame_indices)
+        if valid_num_frames < num_expected_frames:
+            logger.warning(
+                "Video loading completed with %d broken/unreadable frames. "
+                "Expected %d frames but only loaded %d frames.",
+                num_expected_frames - valid_num_frames,
+                num_expected_frames,
+                valid_num_frames,
+            )
+
+        assert i == valid_num_frames, (
+            f"Expected reading {valid_num_frames} frames, "
+            f"but only loaded {i} frames from video."
+        )
+
+        return frames[:valid_num_frames], valid_num_frames, valid_frame_indices
+
 
 VIDEO_LOADER_REGISTRY = ExtensionManager()
 
@@ -120,24 +177,10 @@ class OpenCVVideoBackend(VideoLoader):
         )
         frame_idx = uniform_sampled_frames.tolist()
 
-        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-        frames = np.empty((len(frame_idx), height, width, 3), dtype=np.uint8)
-
-        i = 0
-        for idx in range(max(frame_idx) + 1):
-            ok = cap.grab()
-            if not ok:
-                break
-            if idx in frame_idx:
-                ret, frame = cap.retrieve()
-                if ret:
-                    frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                    i += 1
-
-        assert i == num_frames_to_sample, (
-            f"Expected reading {num_frames_to_sample} frames, "
-            f"but only loaded {i} frames from video."
-        )
+        # Convert to set for O(1) lookup performance
+        frame_idx_set = set(frame_idx)
+        frames, valid_num_frames, valid_frame_indices = cls._read_frames(
+            cap, frame_idx_set, num_frames_to_sample, max(frame_idx)
+        )
 
         # Use transformers transformers.video_utils.VideoMetadata format
@@ -148,10 +191,10 @@
             "fps": original_fps,
             "duration": duration,
             "video_backend": "opencv",
-            "frames_indices": list(frame_idx),
+            "frames_indices": valid_frame_indices,
             # extra field used to control hf processor's video
             # sampling behavior
-            "do_sample_frames": num_frames_to_sample == total_frames_num,
+            "do_sample_frames": valid_num_frames == total_frames_num,
         }
 
         return frames, metadata
@@ -185,10 +228,10 @@ class OpenCVDynamicVideoBackend(OpenCVVideoBackend):
 
         # Refer to:
        # https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140
-        frame_indices: range | list[int]
+        frame_indices_list: list[int]
         if duration <= max_duration:
             n = int(math.floor(duration * fps))
-            frame_indices = sorted(
+            frame_indices_list = sorted(
                 {
                     min(max_frame_idx, int(math.ceil(i * original_fps / fps)))
                     for i in range(n)
@@ -197,34 +240,23 @@ class OpenCVDynamicVideoBackend(OpenCVVideoBackend):
         else:
             num_samples = int(max_duration * fps)
             if num_samples >= total_frames_num:
-                frame_indices = range(total_frames_num)
+                frame_indices_list = list(range(total_frames_num))
             else:
                 target_seconds = np.linspace(0, duration, num_samples, endpoint=True)
-                frame_indices = sorted(
+                frame_indices_list = sorted(
                     {
                         min(max_frame_idx, int(math.ceil(t * original_fps)))
                         for t in target_seconds
                     }
                 )
 
-        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-        frames = np.empty((len(frame_indices), height, width, 3), dtype=np.uint8)
-
-        i = 0
-        for idx in range(total_frames_num):
-            ok = cap.grab()
-            if not ok:
-                break
-            if idx in frame_indices:
-                ret, frame = cap.retrieve()
-                if ret:
-                    frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                    i += 1
-
-        assert i == len(frame_indices), (
-            f"Expected reading {len(frame_indices)} frames, "
-            f"but only loaded {i} frames from video."
-        )
+        # Convert to set for O(1) lookup performance
+        frame_indices_set = set(frame_indices_list)
+        frames, valid_num_frames, valid_frame_indices = cls._read_frames(
+            cap,
+            frame_indices_set,
+            len(frame_indices_list),
+            total_frames_num - 1,
+        )
 
         # Use transformers transformers.video_utils.VideoMetadata format
@@ -233,7 +265,7 @@ class OpenCVDynamicVideoBackend(OpenCVVideoBackend):
             "fps": original_fps,
             "duration": duration,
             "video_backend": "opencv_dynamic",
-            "frames_indices": list(frame_indices),
+            "frames_indices": valid_frame_indices,
             "do_sample_frames": False,
         }
 
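For reference, a standalone sketch of the OpenCV grab()/retrieve() skipping pattern that the new VideoLoader._read_frames() helper applies. "broken.mp4" is a placeholder path, and this is an illustration of the pattern rather than the vLLM code itself:

    import cv2

    cap = cv2.VideoCapture("broken.mp4")
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    decoded, kept_indices = [], []
    for idx in range(total):
        if not cap.grab():       # demuxing/decoding failed for this frame
            continue             # skip it instead of breaking out of the loop
        ok, frame = cap.retrieve()
        if ok:
            decoded.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            kept_indices.append(idx)
    cap.release()

    # kept_indices plays the role of the loader's "frames_indices" metadata:
    # it lists only the frames that were actually decoded.
    assert len(decoded) == len(kept_indices) <= total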