# SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Consolidated test for ViT attention backend functionality across multiple models. This test validates that each multimodal model can successfully generate outputs using different ViT attention backends. Tests are parametrized by model and backend. """ from dataclasses import asdict from typing import Any import pytest from transformers import AutoProcessor from vllm import LLM, EngineArgs, SamplingParams from vllm.attention.backends.registry import AttentionBackendEnum from vllm.multimodal.utils import encode_image_base64 from vllm.multimodal.video import sample_frames_from_video from vllm.platforms import current_platform from ....utils import create_new_process_for_each_test from ...utils import dummy_hf_overrides # Dots.OCR prompt from official repository # https://github.com/rednote-hilab/dots.ocr/blob/d72d1d8c5bdd0362eb264f714cdbd1e5daa7cdff/dots_ocr/utils/prompts.py#L3 # ruff: noqa: E501 DOTS_OCR_PROMPT = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox. 1. Bbox format: [x1, y1, x2, y2] 2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title']. 3. Text Extraction & Formatting Rules: - Picture: For the 'Picture' category, the text field should be omitted. - Formula: Format its text as LaTeX. - Table: Format its text as HTML. - All Others (Text, Title, etc.): Format their text as Markdown. 4. Constraints: - The output text must be the original text from the image, with no translation. - All layout elements must be sorted according to human reading order. 5. Final Output: The entire output must be a single JSON object. """ VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>" # Model configurations MODEL_CONFIGS: dict[str, dict[str, Any]] = { "dots_ocr": { "model_name": "rednote-hilab/dots.ocr", "interface": "llm_chat", "max_model_len": 32768, "max_num_seqs": 1, "limit_mm_per_prompt": {"image": 1}, "sampling_params": { "temperature": 0.1, "max_tokens": 16384, "top_p": 0.9, "stop_token_ids": None, }, "use_specific_image": "stop_sign", "prompt_builder": "build_dots_ocr_prompt", "output_validator": lambda x: len(x) > 10 and "stop" in x.lower(), }, "ernie45_vl": { "model_name": "baidu/ERNIE-4.5-VL-28B-A3B-PT", "interface": "llm_generate", "max_model_len": 16384, "max_num_seqs": 2, "sampling_params": { "temperature": 0.0, "max_tokens": 256, "stop_token_ids": None, }, "use_processor": True, "question": "What is the content of each image?", }, "glm4_1v": { "model_name": "zai-org/GLM-4.1V-9B-Thinking", "interface": "llm_generate", "max_model_len": 32768, "max_num_seqs": 2, "sampling_params": { "temperature": 0.0, "max_tokens": 256, "stop_token_ids": None, }, "use_processor": True, "question": "What is the content of each image?", }, "keye_vl": { "model_name": "Kwai-Keye/Keye-VL-8B-Preview", "interface": "llm_generate", "max_model_len": 8192, "max_num_seqs": 5, "sampling_params": { "temperature": 0.0, "max_tokens": 256, "stop_token_ids": None, }, "supported_backends": { AttentionBackendEnum.FLASH_ATTN, AttentionBackendEnum.ROCM_AITER_FA, }, "use_processor": True, "question": "What is the content of each image?", }, "ovis2_5": { "model_name": "AIDC-AI/Ovis2.5-2B", "interface": "llm_generate", "max_model_len": 8192, "max_num_seqs": 2, "sampling_params": { "temperature": 0.0, "max_tokens": 256, "stop_token_ids": None, }, "prompt_builder": "build_ovis_prompt", "question": "What is the content of each image?", }, "qwen2_5_vl": { "model_name": "Qwen/Qwen2.5-VL-3B-Instruct", "interface": "vllm_runner", "media_type": "video", "max_model_len": 4000, "max_num_seqs": 1, "limit_mm_per_prompt": {"video": 1}, "sampling_params": { "max_tokens": 128, }, "runner_kwargs": { "runner": "generate", "dtype": "bfloat16", }, "video_params": { "num_frames": 16, "pruning_rates": [0.0, 0.75], }, }, "qwen2_5_omni": { "model_name": "Qwen/Qwen2.5-Omni-3B", "interface": "llm_generate", "max_model_len": 32768, "max_num_seqs": 2, "limit_mm_per_prompt": {"image": 3, "video": 3, "audio": 3}, "sampling_params": { "temperature": 0.6, "top_p": 0.95, "top_k": 20, "max_tokens": 16384, }, "use_processor": True, "question": "What is the content of each image?", }, "qwen3_omni": { "model_name": "Qwen/Qwen3-Omni-30B-A3B-Instruct", "interface": "llm_generate", "max_model_len": 32768, "max_num_seqs": 2, "limit_mm_per_prompt": {"image": 3, "video": 3, "audio": 3}, "sampling_params": { "temperature": 0.6, "top_p": 0.95, "top_k": 20, "max_tokens": 16384, }, "use_processor": True, "question": "What is the content of each image?", }, } # Prompt builder functions def build_dots_ocr_prompt(images, config): """Build Dots.OCR specific prompt with OCR instructions.""" # Use only stop_sign image for Dots.OCR image = images[0] # Already filtered to stop_sign image_url = f"data:image/jpeg;base64,{encode_image_base64(image)}" placeholders = [{"type": "image_url", "image_url": {"url": image_url}}] messages = [ { "role": "user", "content": [ *placeholders, { "type": "text", "text": f"<|img|><|imgpad|><|endofimg|>{DOTS_OCR_PROMPT}", }, ], }, ] return messages def build_processor_prompt(images, config): """Build prompt using AutoProcessor.apply_chat_template().""" processor = AutoProcessor.from_pretrained( config["model_name"], trust_remote_code=True ) image_urls = [ f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images ] placeholders = [{"type": "image", "image": url} for url in image_urls] messages = [ { "role": "user", "content": [ *placeholders, {"type": "text", "text": config["question"]}, ], }, ] return processor.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) def build_ovis_prompt(images, config): """Build Ovis2.5 specific prompt with custom format.""" image_urls = [ f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images ] placeholders = "\n".join( f"Image-{i}: \n" for i, _ in enumerate(image_urls, start=1) ) return ( f"<|im_start|>user\n\n{placeholders}\n{config['question']}<|im_end|>\n" "<|im_start|>assistant\n" ) def build_qwen2_5_video_prompt(): """Build Qwen2.5-VL video prompt with EVS placeholder.""" return ( f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" f"<|im_start|>user\n{VIDEO_PLACEHOLDER}" "Describe this video with a short sentence (no more than 20 words)" "<|im_end|><|im_start|>assistant\n" ) # Handler functions def run_llm_generate_test(config, mm_encoder_attn_backend, image_assets): """Standard LLM.generate() interface handler.""" images = [asset.pil_image for asset in image_assets] # Build prompt if config.get("use_processor"): prompt = build_processor_prompt(images, config) else: prompt_builder_name = config.get("prompt_builder", "build_ovis_prompt") prompt_builder = globals()[prompt_builder_name] prompt = prompt_builder(images, config) # Determine limit_mm_per_prompt limit_mm_per_prompt = config.get("limit_mm_per_prompt", {"image": len(images)}) # Create engine engine_args = EngineArgs( model=config["model_name"], trust_remote_code=True, max_model_len=config["max_model_len"], max_num_seqs=config["max_num_seqs"], limit_mm_per_prompt=limit_mm_per_prompt, mm_encoder_attn_backend=mm_encoder_attn_backend, hf_overrides=dummy_hf_overrides, load_format="dummy", ) engine_dict = asdict(engine_args) | {"seed": 42} llm = LLM(**engine_dict) # Generate sampling_params = SamplingParams(**config["sampling_params"]) outputs = llm.generate( { "prompt": prompt, "multi_modal_data": {"image": images}, }, sampling_params=sampling_params, ) # Validate for o in outputs: generated_text = o.outputs[0].text validator = config.get("output_validator", lambda x: len(x) > 10) assert validator(generated_text), ( f"Validation failed for {config['model_name']}: {generated_text}" ) def run_llm_chat_test(config, mm_encoder_attn_backend, image_assets): """LLM.chat() interface handler for Dots.OCR.""" # Filter to stop_sign image only stop_sign_image = [ asset.pil_image for asset in image_assets if asset.name == "stop_sign" ][0] # Build messages messages = build_dots_ocr_prompt([stop_sign_image], config) # Create engine engine_args = EngineArgs( model=config["model_name"], trust_remote_code=True, max_model_len=config["max_model_len"], max_num_seqs=config["max_num_seqs"], limit_mm_per_prompt=config["limit_mm_per_prompt"], mm_encoder_attn_backend=mm_encoder_attn_backend, hf_overrides=dummy_hf_overrides, load_format="dummy", ) engine_dict = asdict(engine_args) | {"seed": 42} llm = LLM(**engine_dict) # Generate using chat sampling_params = SamplingParams(**config["sampling_params"]) outputs = llm.chat(messages=messages, sampling_params=sampling_params) # Validate for o in outputs: generated_text = o.outputs[0].text validator = config.get("output_validator", lambda x: len(x) > 10) assert validator(generated_text), ( f"Validation failed for {config['model_name']}: {generated_text}" ) def run_video_test(config, mm_encoder_attn_backend, video_assets, vllm_runner): """Video test with EVS (Efficient Video Sampling) handler.""" for pruning_rate in config["video_params"]["pruning_rates"]: num_frames = config["video_params"]["num_frames"] # Sample frames from video sampled_vids = [ sample_frames_from_video(asset.np_ndarrays, num_frames) for asset in video_assets ] # Build prompt and prepare video prompt = build_qwen2_5_video_prompt() prompts = [prompt] videos = [sampled_vids[0]] # Run with vllm_runner context manager with vllm_runner( config["model_name"], max_model_len=config["max_model_len"], max_num_seqs=config["max_num_seqs"], limit_mm_per_prompt=config["limit_mm_per_prompt"], tensor_parallel_size=1, video_pruning_rate=pruning_rate, mm_encoder_attn_backend=mm_encoder_attn_backend, hf_overrides=dummy_hf_overrides, load_format="dummy", **config["runner_kwargs"], ) as vllm_model: outputs = vllm_model.generate_greedy( prompts, config["sampling_params"]["max_tokens"], videos=videos, ) # Validate output assert len(outputs) == 1, f"Expected 1 output, got {len(outputs)}" output_ids, output_text = outputs[0] assert len(output_ids) > 0, "Generated no output IDs" assert len(output_text) > 0, "Generated empty text" assert isinstance(output_text, str), ( f"Output is not string: {type(output_text)}" ) # Main test function @pytest.mark.parametrize("model_key", list(MODEL_CONFIGS.keys())) @pytest.mark.parametrize( "mm_encoder_attn_backend", [None] + current_platform.get_supported_vit_attn_backends(), ) @pytest.mark.skip(reason="Broken test due to memory segmentation fault") @create_new_process_for_each_test() def test_vit_backend_functionality( model_key: str, mm_encoder_attn_backend: AttentionBackendEnum | None, image_assets, video_assets, vllm_runner, request, ): """Test ViT attention backend functionality for multimodal models. This test validates that each model can successfully generate outputs using different ViT attention backends. The test: 1. Filters unsupported backends per model 2. Applies appropriate GPU marks 3. Routes to the correct test handler based on interface 4. Validates output meets minimum requirements """ config = MODEL_CONFIGS[model_key] # Step 1: Backend filtering if ( "supported_backends" in config and mm_encoder_attn_backend is not None and mm_encoder_attn_backend not in config["supported_backends"] ): pytest.skip( f"{model_key} does not support {mm_encoder_attn_backend} backend now." ) # Step 2: Apply GPU marks dynamically if "gpu_marks" in config: for mark in config["gpu_marks"]: request.applymarker(mark) # Step 3: Route to appropriate handler if config.get("media_type") == "video": run_video_test(config, mm_encoder_attn_backend, video_assets, vllm_runner) elif config["interface"] == "llm_chat": run_llm_chat_test(config, mm_encoder_attn_backend, image_assets) elif config["interface"] == "llm_generate": run_llm_generate_test(config, mm_encoder_attn_backend, image_assets) else: raise ValueError(f"Unknown interface: {config['interface']}")