# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest

from vllm.multimodal.video import sample_frames_from_video

from ....conftest import VIDEO_ASSETS

models = ["Qwen/Qwen2.5-VL-3B-Instruct"]
target_dtype = "bfloat16"
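
# Qwen2.5-VL delimits visual inputs with <|vision_start|>/<|vision_end|>, and
# <|video_pad|> is the placeholder that gets expanded into video embeddings
# (our reading of the chat format; kept as a single constant for the prompts).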
VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"


def qwen2_5_vl_chat_template(*query):
    return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n"  # noqa: E501
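

# For reference, qwen2_5_vl_chat_template(VIDEO_PLACEHOLDER, "Hi") renders to
# a single line of the form:
#   <|im_start|>system\nYou are a helpful assistant.<|im_end|>\n
#   <|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>Hi<|im_end|>
#   <|im_start|>assistant\n
# (wrapped here for readability only).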


VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
    {
        "baby_reading": qwen2_5_vl_chat_template(
            VIDEO_PLACEHOLDER,
            "Describe this video with a short sentence ",
            "(no more than 20 words)",
        ),
    }
)


@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("video_pruning_rate", [0.0, 0.75])
@pytest.mark.parametrize("num_frames", [16])
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
def test_qwen2_5_vl_evs_functionality(
    vllm_runner,
    video_assets,
    model,
    video_pruning_rate: float,
    num_frames: int,
    dtype: str,
    max_tokens: int,
    use_bytecode_hook: bool,
    monkeypatch,
) -> None:
    """Test EVS (Efficient Video Sampling) functionality with different
    pruning rates.
    """
    # Set the environment variable for this test
    monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
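
    # VLLM_USE_BYTECODE_HOOK appears to toggle how vLLM hooks into the model
    # for compilation; both settings are exercised to make sure EVS works
    # under either path (our understanding of the flag).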

    # Sample frames from video assets
    sampled_vids = [
        sample_frames_from_video(asset.np_ndarrays, num_frames)
        for asset in video_assets
    ]
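
    # We assume sample_frames_from_video subsamples each clip uniformly down
    # to num_frames frames, roughly (a sketch, not the actual implementation):
    #   idx = np.linspace(0, len(frames) - 1, num_frames).astype(int)
    #   sampled = frames[idx]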

    prompts = [VIDEO_PROMPTS[0]]
    videos = [sampled_vids[0]]

    # Initialize model with EVS configuration
    with vllm_runner(
        model,
        runner="generate",
        max_model_len=4000,
        dtype=dtype,
        limit_mm_per_prompt={"video": 1},
        video_pruning_rate=video_pruning_rate,
    ) as vllm_model:
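        # max_model_len=4000 is presumably sized to fit the prompt plus the
        # unpruned tokens of a 16-frame video; pruning only shrinks that count.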
        # Generate output - this should not crash
        outputs = vllm_model.generate_greedy(prompts, max_tokens, videos=videos)

        # Basic validation that we got a response
        assert len(outputs) == 1
        output_ids, output_text = outputs[0]

        # Ensure we got some output
        assert len(output_ids) > 0
        assert len(output_text) > 0

        # Ensure the output is a string
        assert isinstance(output_text, str)


@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("video_pruning_rate", [0.0, 0.75])
@pytest.mark.parametrize("num_frames", [16])
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
def test_qwen2_5_vl_evs_batched_videos(
    vllm_runner,
    video_assets,
    model,
    video_pruning_rate: float,
    num_frames: int,
    dtype: str,
    max_tokens: int,
    use_bytecode_hook: bool,
    monkeypatch,
) -> None:
    """Test EVS functionality with batched videos.

    This test validates that:
    1. The model handles batched video inputs correctly with EVS
    2. Both pruning configurations work with multiple videos
    3. The model doesn't crash when processing multiple videos simultaneously
    """
    # Set the environment variable for this test
    monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")

    # Sample frames from video assets
    sampled_vids = [
        sample_frames_from_video(asset.np_ndarrays, num_frames)
        for asset in video_assets
    ]

    # Test batched videos
    prompts = [VIDEO_PROMPTS[0], VIDEO_PROMPTS[0]]
    videos = [sampled_vids[0], sampled_vids[0]]  # Use same video twice for testing

    # Initialize model with EVS configuration
    with vllm_runner(
        model,
        runner="generate",
        max_model_len=4000,
        max_num_seqs=2,
        dtype=dtype,
        limit_mm_per_prompt={"video": 2},
        tensor_parallel_size=1,
        video_pruning_rate=video_pruning_rate,
    ) as vllm_model:
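        # max_num_seqs=2 lets the scheduler run both requests concurrently,
        # so the batched-video path is actually exercised.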
        # Generate output - this should not crash
        outputs = vllm_model.generate_greedy(prompts, max_tokens, videos=videos)

        # Basic validation that we got responses for both videos
        assert len(outputs) == 2

        for output_ids, output_text in outputs:
            # Ensure we got some output for each video
            assert len(output_ids) > 0
            assert len(output_text) > 0

            # Ensure the output is a string
            assert isinstance(output_text, str)