vllm/tests/models/multimodal/generation/test_vit_backend_functionality.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Consolidated test for ViT attention backend functionality across multiple models.

This test validates that each multimodal model can successfully generate outputs
using different ViT attention backends. Tests are parametrized by model and backend.
"""

from dataclasses import asdict
from typing import Any

import pytest
from transformers import AutoProcessor

from vllm import LLM, EngineArgs, SamplingParams
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.multimodal.utils import encode_image_base64
from vllm.multimodal.video import sample_frames_from_video
from vllm.platforms import current_platform

from ....utils import create_new_process_for_each_test
from ...utils import dummy_hf_overrides

# Dots.OCR prompt from official repository
# https://github.com/rednote-hilab/dots.ocr/blob/d72d1d8c5bdd0362eb264f714cdbd1e5daa7cdff/dots_ocr/utils/prompts.py#L3
# ruff: noqa: E501
DOTS_OCR_PROMPT = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.

1. Bbox format: [x1, y1, x2, y2]

2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].

3. Text Extraction & Formatting Rules:
    - Picture: For the 'Picture' category, the text field should be omitted.
    - Formula: Format its text as LaTeX.
    - Table: Format its text as HTML.
    - All Others (Text, Title, etc.): Format their text as Markdown.

4. Constraints:
    - The output text must be the original text from the image, with no translation.
    - All layout elements must be sorted according to human reading order.

5. Final Output: The entire output must be a single JSON object.
"""

VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"


# Model configurations
MODEL_CONFIGS: dict[str, dict[str, Any]] = {
    "dots_ocr": {
        "model_name": "rednote-hilab/dots.ocr",
        "interface": "llm_chat",
        "max_model_len": 32768,
        "max_num_seqs": 1,
        "limit_mm_per_prompt": {"image": 1},
        "sampling_params": {
            "temperature": 0.1,
            "max_tokens": 16384,
            "top_p": 0.9,
            "stop_token_ids": None,
        },
        "use_specific_image": "stop_sign",
        "prompt_builder": "build_dots_ocr_prompt",
        "output_validator": lambda x: len(x) > 10 and "stop" in x.lower(),
    },
    "ernie45_vl": {
        "model_name": "baidu/ERNIE-4.5-VL-28B-A3B-PT",
        "interface": "llm_generate",
        "max_model_len": 16384,
        "max_num_seqs": 2,
        "sampling_params": {
            "temperature": 0.0,
            "max_tokens": 256,
            "stop_token_ids": None,
        },
        "use_processor": True,
        "question": "What is the content of each image?",
    },
    "glm4_1v": {
        "model_name": "zai-org/GLM-4.1V-9B-Thinking",
        "interface": "llm_generate",
        "max_model_len": 32768,
        "max_num_seqs": 2,
        "sampling_params": {
            "temperature": 0.0,
            "max_tokens": 256,
            "stop_token_ids": None,
        },
        "use_processor": True,
        "question": "What is the content of each image?",
    },
    "keye_vl": {
        "model_name": "Kwai-Keye/Keye-VL-8B-Preview",
        "interface": "llm_generate",
        "max_model_len": 8192,
        "max_num_seqs": 5,
        "sampling_params": {
            "temperature": 0.0,
            "max_tokens": 256,
            "stop_token_ids": None,
        },
        "supported_backends": {
            AttentionBackendEnum.FLASH_ATTN,
            AttentionBackendEnum.ROCM_AITER_FA,
        },
        "use_processor": True,
        "question": "What is the content of each image?",
    },
    "ovis2_5": {
        "model_name": "AIDC-AI/Ovis2.5-2B",
        "interface": "llm_generate",
        "max_model_len": 8192,
        "max_num_seqs": 2,
        "sampling_params": {
            "temperature": 0.0,
            "max_tokens": 256,
            "stop_token_ids": None,
        },
        "prompt_builder": "build_ovis_prompt",
        "question": "What is the content of each image?",
    },
    "qwen2_5_vl": {
        "model_name": "Qwen/Qwen2.5-VL-3B-Instruct",
        "interface": "vllm_runner",
        "media_type": "video",
        "max_model_len": 4000,
        "max_num_seqs": 1,
        "limit_mm_per_prompt": {"video": 1},
        "sampling_params": {
            "max_tokens": 128,
        },
        "runner_kwargs": {
            "runner": "generate",
            "dtype": "bfloat16",
        },
        "video_params": {
            "num_frames": 16,
            "pruning_rates": [0.0, 0.75],
        },
    },
    "qwen2_5_omni": {
        "model_name": "Qwen/Qwen2.5-Omni-3B",
        "interface": "llm_generate",
        "max_model_len": 32768,
        "max_num_seqs": 2,
        "limit_mm_per_prompt": {"image": 3, "video": 3, "audio": 3},
        "sampling_params": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "max_tokens": 16384,
        },
        "use_processor": True,
        "question": "What is the content of each image?",
    },
    "qwen3_omni": {
        "model_name": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
        "interface": "llm_generate",
        "max_model_len": 32768,
        "max_num_seqs": 2,
        "limit_mm_per_prompt": {"image": 3, "video": 3, "audio": 3},
        "sampling_params": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "max_tokens": 16384,
        },
        "use_processor": True,
        "question": "What is the content of each image?",
    },
}


# Prompt builder functions
def build_dots_ocr_prompt(images, config):
    """Build Dots.OCR specific prompt with OCR instructions."""
    # Use only stop_sign image for Dots.OCR
    image = images[0]  # Already filtered to stop_sign

    image_url = f"data:image/jpeg;base64,{encode_image_base64(image)}"

    placeholders = [{"type": "image_url", "image_url": {"url": image_url}}]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {
                    "type": "text",
                    "text": f"<|img|><|imgpad|><|endofimg|>{DOTS_OCR_PROMPT}",
                },
            ],
        },
    ]

    return messages


def build_processor_prompt(images, config):
    """Build prompt using AutoProcessor.apply_chat_template()."""
    processor = AutoProcessor.from_pretrained(
        config["model_name"], trust_remote_code=True
    )

    image_urls = [
        f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
    ]
    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": config["question"]},
            ],
        },
    ]

    return processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )


def build_ovis_prompt(images, config):
    """Build Ovis2.5 specific prompt with custom format."""
    image_urls = [
        f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
    ]

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )

    return (
        f"<|im_start|>user\n\n{placeholders}\n{config['question']}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )


def build_qwen2_5_video_prompt():
    """Build Qwen2.5-VL video prompt with EVS placeholder."""
    return (
        f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
        f"<|im_start|>user\n{VIDEO_PLACEHOLDER}"
        "Describe this video with a short sentence (no more than 20 words)"
        "<|im_end|><|im_start|>assistant\n"
    )


# Handler functions
def run_llm_generate_test(config, mm_encoder_attn_backend, image_assets):
    """Standard LLM.generate() interface handler."""
    images = [asset.pil_image for asset in image_assets]

    # Build prompt
    if config.get("use_processor"):
        prompt = build_processor_prompt(images, config)
    else:
        prompt_builder_name = config.get("prompt_builder", "build_ovis_prompt")
        prompt_builder = globals()[prompt_builder_name]
        prompt = prompt_builder(images, config)

    # Determine limit_mm_per_prompt
    limit_mm_per_prompt = config.get("limit_mm_per_prompt", {"image": len(images)})

    # Create engine
    engine_args = EngineArgs(
        model=config["model_name"],
        trust_remote_code=True,
        max_model_len=config["max_model_len"],
        max_num_seqs=config["max_num_seqs"],
        limit_mm_per_prompt=limit_mm_per_prompt,
        mm_encoder_attn_backend=mm_encoder_attn_backend,
        hf_overrides=dummy_hf_overrides,
        load_format="dummy",
    )

    engine_dict = asdict(engine_args) | {"seed": 42}
    llm = LLM(**engine_dict)

    # Generate
    sampling_params = SamplingParams(**config["sampling_params"])
    outputs = llm.generate(
        {
            "prompt": prompt,
            "multi_modal_data": {"image": images},
        },
        sampling_params=sampling_params,
    )

    # Validate
    for o in outputs:
        generated_text = o.outputs[0].text
        validator = config.get("output_validator", lambda x: len(x) > 10)
        assert validator(generated_text), (
            f"Validation failed for {config['model_name']}: {generated_text}"
        )


def run_llm_chat_test(config, mm_encoder_attn_backend, image_assets):
    """LLM.chat() interface handler for Dots.OCR."""
    # Filter to stop_sign image only
    stop_sign_image = [
        asset.pil_image for asset in image_assets if asset.name == "stop_sign"
    ][0]

    # Build messages
    messages = build_dots_ocr_prompt([stop_sign_image], config)

    # Create engine
    engine_args = EngineArgs(
        model=config["model_name"],
        trust_remote_code=True,
        max_model_len=config["max_model_len"],
        max_num_seqs=config["max_num_seqs"],
        limit_mm_per_prompt=config["limit_mm_per_prompt"],
        mm_encoder_attn_backend=mm_encoder_attn_backend,
        hf_overrides=dummy_hf_overrides,
        load_format="dummy",
    )

    engine_dict = asdict(engine_args) | {"seed": 42}
    llm = LLM(**engine_dict)

    # Generate using chat
    sampling_params = SamplingParams(**config["sampling_params"])
    outputs = llm.chat(messages=messages, sampling_params=sampling_params)

    # Validate
    for o in outputs:
        generated_text = o.outputs[0].text
        validator = config.get("output_validator", lambda x: len(x) > 10)
        assert validator(generated_text), (
            f"Validation failed for {config['model_name']}: {generated_text}"
        )


def run_video_test(config, mm_encoder_attn_backend, video_assets, vllm_runner):
    """Video test with EVS (Efficient Video Sampling) handler."""
    for pruning_rate in config["video_params"]["pruning_rates"]:
        num_frames = config["video_params"]["num_frames"]

        # Sample frames from video
        sampled_vids = [
            sample_frames_from_video(asset.np_ndarrays, num_frames)
            for asset in video_assets
        ]

        # Build prompt and prepare video
        prompt = build_qwen2_5_video_prompt()
        prompts = [prompt]
        videos = [sampled_vids[0]]

        # Run with vllm_runner context manager
        with vllm_runner(
            config["model_name"],
            max_model_len=config["max_model_len"],
            max_num_seqs=config["max_num_seqs"],
            limit_mm_per_prompt=config["limit_mm_per_prompt"],
            tensor_parallel_size=1,
            video_pruning_rate=pruning_rate,
            mm_encoder_attn_backend=mm_encoder_attn_backend,
            hf_overrides=dummy_hf_overrides,
            load_format="dummy",
            **config["runner_kwargs"],
        ) as vllm_model:
            outputs = vllm_model.generate_greedy(
                prompts,
                config["sampling_params"]["max_tokens"],
                videos=videos,
            )

            # Validate output
            assert len(outputs) == 1, f"Expected 1 output, got {len(outputs)}"
            output_ids, output_text = outputs[0]
            assert len(output_ids) > 0, "Generated no output IDs"
            assert len(output_text) > 0, "Generated empty text"
            assert isinstance(output_text, str), (
                f"Output is not string: {type(output_text)}"
            )


# Main test function
@pytest.mark.parametrize("model_key", list(MODEL_CONFIGS.keys()))
@pytest.mark.parametrize(
    "mm_encoder_attn_backend",
    [None] + current_platform.get_supported_vit_attn_backends(),
)
@create_new_process_for_each_test()
def test_vit_backend_functionality(
    model_key: str,
    mm_encoder_attn_backend: AttentionBackendEnum | None,
    image_assets,
    video_assets,
    vllm_runner,
    request,
):
    """Test ViT attention backend functionality for multimodal models.

    This test validates that each model can successfully generate outputs
    using different ViT attention backends. The test:
    1. Filters unsupported backends per model
    2. Applies appropriate GPU marks
    3. Routes to the correct test handler based on interface
    4. Validates output meets minimum requirements
    """
    config = MODEL_CONFIGS[model_key]

    # Step 1: Backend filtering
    if (
        "supported_backends" in config
        and mm_encoder_attn_backend is not None
        and mm_encoder_attn_backend not in config["supported_backends"]
    ):
        pytest.skip(
            f"{model_key} does not support {mm_encoder_attn_backend} backend now."
        )

    # Step 2: Apply GPU marks dynamically
    if "gpu_marks" in config:
        for mark in config["gpu_marks"]:
            request.applymarker(mark)

    # Step 3: Route to appropriate handler
    if config.get("media_type") == "video":
        run_video_test(config, mm_encoder_attn_backend, video_assets, vllm_runner)
    elif config["interface"] == "llm_chat":
        run_llm_chat_test(config, mm_encoder_attn_backend, image_assets)
    elif config["interface"] == "llm_generate":
        run_llm_generate_test(config, mm_encoder_attn_backend, image_assets)
    else:
        raise ValueError(f"Unknown interface: {config['interface']}")