vllm/tests/models/multimodal/generation/test_qwen2_5_vl.py
Laith Sakka 2e0ad629b0
Avoid bytecode hook and simplify TorchCompileWrapperWithCustomDipatch (#25110)
Signed-off-by: Laith Sakka <lsakka@meta.com>
2025-11-14 14:11:10 -08:00

149 lines
4.6 KiB
Python

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.multimodal.video import sample_frames_from_video
from ....conftest import VIDEO_ASSETS
models = ["Qwen/Qwen2.5-VL-3B-Instruct"]
target_dtype = "bfloat16"
VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"
def qwen2_5_vl_chat_template(*query):
return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n" # noqa: E501
VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
{
"baby_reading": qwen2_5_vl_chat_template(
VIDEO_PLACEHOLDER,
"Describe this video with a short sentence ",
"(no more than 20 words)",
),
}
)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("video_pruning_rate", [0.0, 0.75])
@pytest.mark.parametrize("num_frames", [16])
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
def test_qwen2_5_vl_evs_functionality(
vllm_runner,
video_assets,
model,
video_pruning_rate: float,
num_frames: int,
dtype: str,
max_tokens: int,
use_bytecode_hook: bool,
monkeypatch,
) -> None:
"""Test EVS (Efficient Video Sampling) functionality with different
pruning rates.
"""
# Set the environment variable for this test
monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
# Sample frames from video assets
sampled_vids = [
sample_frames_from_video(asset.np_ndarrays, num_frames)
for asset in video_assets
]
prompts = [VIDEO_PROMPTS[0]]
videos = [sampled_vids[0]]
# Initialize model with EVS configuration
with vllm_runner(
model,
runner="generate",
max_model_len=4000,
dtype=dtype,
limit_mm_per_prompt={"video": 1},
video_pruning_rate=video_pruning_rate,
) as vllm_model:
# Generate output - this should not crash
outputs = vllm_model.generate_greedy(prompts, max_tokens, videos=videos)
# Basic validation that we got a response
assert len(outputs) == 1
output_ids, output_text = outputs[0]
# Ensure we got some output
assert len(output_ids) > 0
assert len(output_text) > 0
# Ensure the output is a string
assert isinstance(output_text, str)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("video_pruning_rate", [0.0, 0.75])
@pytest.mark.parametrize("num_frames", [16])
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
def test_qwen2_5_vl_evs_batched_videos(
vllm_runner,
video_assets,
model,
video_pruning_rate: float,
num_frames: int,
dtype: str,
max_tokens: int,
use_bytecode_hook: bool,
monkeypatch,
) -> None:
"""Test EVS functionality with batched videos.
This test validates that:
1. The model handles batched video inputs correctly with EVS
2. Both pruning configurations work with multiple videos
3. The model doesn't crash when processing multiple videos simultaneously
"""
# Set the environment variable for this test
monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
# Sample frames from video assets
sampled_vids = [
sample_frames_from_video(asset.np_ndarrays, num_frames)
for asset in video_assets
]
# Test batched videos
prompts = [VIDEO_PROMPTS[0], VIDEO_PROMPTS[0]]
videos = [sampled_vids[0], sampled_vids[0]] # Use same video twice for testing
# Initialize model with EVS configuration
with vllm_runner(
model,
runner="generate",
max_model_len=4000,
max_num_seqs=2,
dtype=dtype,
limit_mm_per_prompt={"video": 2},
tensor_parallel_size=1,
video_pruning_rate=video_pruning_rate,
) as vllm_model:
# Generate output - this should not crash
outputs = vllm_model.generate_greedy(prompts, max_tokens, videos=videos)
# Basic validation that we got responses for both videos
assert len(outputs) == 2
for output_ids, output_text in outputs:
# Ensure we got some output for each video
assert len(output_ids) > 0
assert len(output_text) > 0
# Ensure the output is a string
assert isinstance(output_text, str)