# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for text generation.

For most models, the prompt format should follow the corresponding examples
on the HuggingFace model repository.
"""

import os
import random
from contextlib import contextmanager
from dataclasses import asdict
from typing import NamedTuple

from huggingface_hub import snapshot_download
from transformers import AutoTokenizer

from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.lora.request import LoRARequest
from vllm.multimodal.image import convert_image_mode
from vllm.utils.argparse_utils import FlexibleArgumentParser


class ModelRequestData(NamedTuple):
    engine_args: EngineArgs
    prompts: list[str]
    stop_token_ids: list[int] | None = None
    lora_requests: list[LoRARequest] | None = None
    sampling_params: list[SamplingParams] | None = None


# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.
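
# A minimal sketch (not part of the original runners below) of how a
# ModelRequestData is typically consumed: build the LLM from the engine args,
# attach the optional stop tokens, and pass an image together with each prompt
# as multi-modal data. The helper name and the default sampling values are
# illustrative assumptions, and runners that set `lora_requests` would
# additionally forward them via the `lora_request` argument of generate().
def _run_request_sketch(req: ModelRequestData, image) -> list[str]:
    # e.g. image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
    llm = LLM(**asdict(req.engine_args))
    sampling_params = req.sampling_params or SamplingParams(
        temperature=0.2, max_tokens=64, stop_token_ids=req.stop_token_ids
    )
    outputs = llm.generate(
        [
            {"prompt": prompt, "multi_modal_data": {"image": image}}
            for prompt in req.prompts
        ],
        sampling_params=sampling_params,
    )
    return [output.outputs[0].text for output in outputs]
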
# Aria
def run_aria(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "rhymes-ai/Aria"

    # NOTE: Need L40 (or equivalent) to avoid OOM
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        dtype="bfloat16",
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        (
            f"<|im_start|>user\n<|img|>{question}"
            "<|im_end|>\n<|im_start|>assistant\n"
        )
        for question in questions
    ]

    stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


# Aya Vision
def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "CohereForAI/aya-vision-8b"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"crop_to_patches": True},
        limit_mm_per_prompt={modality: 1},
    )
    prompts = [
        f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Bee-8B
def run_bee(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "Open-Bee/Bee-8B-RL"

    prompts = [
        (
            f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            f"<|im_start|>user\n\n{question}<|im_end|>"
            f"<|im_start|>assistant\n\n"
        )
        for question in questions
    ]
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=16384,
        limit_mm_per_prompt={modality: 1},
        trust_remote_code=True,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# BLIP-2
def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 # noqa
    prompts = [f"Question: {question} Answer:" for question in questions]
    engine_args = EngineArgs(
        model="Salesforce/blip2-opt-2.7b",
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Chameleon
def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [f"{question}<image>" for question in questions]
    engine_args = EngineArgs(
        model="facebook/chameleon-7b",
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Command A Vision
def run_command_a_vision(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "CohereLabs/command-a-vision-07-2025"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=32768,
        tensor_parallel_size=4,
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><|IMG_PATCH|>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Deepseek-VL2
def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "deepseek-ai/deepseek-vl2-tiny"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        f"<|User|>: <image>\n{question}\n\n<|Assistant|>:" for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# DeepSeek-OCR
def run_deepseek_ocr(questions: list[str], modality: str) -> ModelRequestData:
    from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor

    assert modality == "image"

    model_name = "deepseek-ai/DeepSeek-OCR"

    engine_args = EngineArgs(
        model=model_name,
        limit_mm_per_prompt={modality: 1},
        logits_processors=[NGramPerReqLogitsProcessor],
    )

    # DeepSeek-OCR uses a plain prompt template.
    prompts = [f"<image>\n{question}" for question in questions]

    # The following sampling params config is taken from
    # the official DeepSeek-OCR inference example.
    # (IMPORTANT) Use the custom logits processor and avoid skipping
    # special tokens for this model for optimal OCR performance.
    sampling_params = [
        SamplingParams(
            temperature=0.0,
            max_tokens=8192,
            # ngram logit processor args
            extra_args=dict(
                ngram_size=30,
                window_size=90,
                # whitelist: <td>, </td>
                whitelist_token_ids={128821, 128822},
            ),
            skip_special_tokens=False,
        )
        for _ in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        sampling_params=sampling_params,
    )
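
# Only run_deepseek_ocr above returns per-prompt sampling params. When that
# field is set, it is passed to generate() as a list paired one-to-one with the
# prompts instead of a single shared SamplingParams object, roughly
# (a hedged sketch; `inputs` stands for the prompt/image dicts built elsewhere):
#
#     req = run_deepseek_ocr(questions, "image")
#     llm = LLM(**asdict(req.engine_args))
#     outputs = llm.generate(inputs, sampling_params=req.sampling_params)
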
# Dots-OCR
def run_dots_ocr(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [f"<|img|><|imgpad|><|endofimg|>{question}" for question in questions]
    engine_args = EngineArgs(
        model="rednote-hilab/dots.ocr",
        limit_mm_per_prompt={modality: 1},
        trust_remote_code=True,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Ernie4.5-VL
def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "baidu/ERNIE-4.5-VL-28B-A3B-PT"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        limit_mm_per_prompt={modality: 1},
        trust_remote_code=True,
    )

    if modality == "image":
        placeholder = "Picture 1:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>"
    elif modality == "video":
        placeholder = "Video 1:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>"

    prompts = [
        (
            f"<|begin_of_sentence|>User: {question}{placeholder}\n"
            "Assistant: "
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
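
# Runners that branch on modality (Ernie4.5-VL above, the GLM-4.1V family and
# HyperCLOVAX below) also accept video input. A minimal sketch of producing the
# frames, assuming the "baby_reading" sample asset bundled with vLLM; the helper
# name is illustrative and not part of the original example:
def _load_video_frames_sketch(num_frames: int = 16):
    # VideoAsset decodes the sample clip into a NumPy array of frames, which can
    # be passed as multi_modal_data={"video": frames} alongside the prompt.
    return VideoAsset(name="baby_reading", num_frames=num_frames).np_ndarrays
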
# Fuyu
def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [f"{question}\n" for question in questions]
    engine_args = EngineArgs(
        model="adept/fuyu-8b",
        max_model_len=2048,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Gemma 3
def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "google/gemma-3-4b-it"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"do_pan_and_scan": True},
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        (
            "<start_of_turn>user\n"
            f"<start_of_image>{question}<end_of_turn>\n"
            "<start_of_turn>model\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Gemma3N
def run_gemma3n(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "google/gemma-3n-E2B-it"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
    )

    prompts = [
        (
            "<start_of_turn>user\n"
            f"<image_soft_token>{question}<end_of_turn>\n"
            "<start_of_turn>model\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# GLM-4v
def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "zai-org/glm-4v-9b"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
        trust_remote_code=True,
        enforce_eager=True,
        hf_overrides={"architectures": ["GLM4VForCausalLM"]},
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        (
            "<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>"
            f"{question}<|assistant|>"
        )
        for question in questions
    ]

    stop_token_ids = [151329, 151336, 151338]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


# GLM-4.1V
def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "zai-org/GLM-4.1V-9B-Thinking"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        mm_processor_kwargs={
            "size": {"shortest_edge": 12544, "longest_edge": 47040000},
            "fps": 1,
        },
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
    )

    if modality == "image":
        placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
    elif modality == "video":
        placeholder = "<|begin_of_video|><|video|><|end_of_video|>"

    prompts = [
        (
            "[gMASK]<sop><|system|>\nYou are a helpful assistant.<|user|>\n"
            f"{placeholder}"
            f"{question}<|assistant|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# GLM-4.5V
def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "zai-org/GLM-4.5V"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        mm_processor_kwargs={
            "size": {"shortest_edge": 12544, "longest_edge": 47040000},
            "fps": 1,
        },
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    if modality == "image":
        placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
    elif modality == "video":
        placeholder = "<|begin_of_video|><|video|><|end_of_video|>"

    prompts = [
        (
            "[gMASK]<sop><|system|>\nYou are a helpful assistant.<|user|>\n"
            f"{placeholder}"
            f"{question}<|assistant|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# GLM-4.5V-FP8
def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "zai-org/GLM-4.5V-FP8"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        mm_processor_kwargs={
            "size": {"shortest_edge": 12544, "longest_edge": 47040000},
            "fps": 1,
        },
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    if modality == "image":
        placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
    elif modality == "video":
        placeholder = "<|begin_of_video|><|video|><|end_of_video|>"

    prompts = [
        (
            "[gMASK]<sop><|system|>\nYou are a helpful assistant.<|user|>\n"
            f"{placeholder}"
            f"{question}<|assistant|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# H2OVL-Mississippi
def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "h2oai/h2ovl-mississippi-800m"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={modality: 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    messages = [
        [{"role": "user", "content": f"<image>\n{question}"}] for question in questions
    ]
    prompts = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    stop_token_ids = [tokenizer.eos_token_id]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


# HunyuanOCR
def run_hunyuan_vl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "tencent/HunyuanOCR"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        limit_mm_per_prompt={modality: 1},
    )

    placeholder = "<|hy_place▁holder▁no▁100|><|hy_place▁holder▁no▁102|><|hy_place▁holder▁no▁101|>"  # noqa: E501
    prompts = [
        f"<|hy_begin▁of▁sentence|>{placeholder}{question}<|hy_User|>"
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=None,
    )


# naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B
def run_hyperclovax_seed_vision(
    questions: list[str], modality: str
) -> ModelRequestData:
    model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192 if modality == "image" else 16384,
        limit_mm_per_prompt={modality: 1},
    )

    messages = list()
    for question in questions:
        if modality == "image":
            """
            ocr: List the words in the image in raster order. Even if the word
                order feels unnatural for reading, the model will handle it as
                long as it follows raster order.
                e.g. "Naver, CLOVA, bigshane"
            lens_keywords: List the entity names in the image.
                e.g. "iPhone"
            lens_local_keywords: List the entity names with quads in the image.
                e.g. "[0.07, 0.21, 0.92, 0.90] iPhone"
            """
            messages.append(
                [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image",
                                "ocr": "",
                                "lens_keywords": "",
                                "lens_local_keywords": "",
                            },
                            {
                                "type": "text",
                                "text": question,
                            },
                        ],
                    }
                ]
            )
        elif modality == "video":
            messages.append(
                [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "video",
                            },
                            {
                                "type": "text",
                                "text": question,
                            },
                        ],
                    }
                ]
            )
        else:
            raise ValueError(f"Unsupported modality: {modality}")

    prompts = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=None,
    )


# Idefics3-8B-Llama3
def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "HuggingFaceM4/Idefics3-8B-Llama3"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        # if you are running out of memory, you can reduce the "longest_edge".
        # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
        mm_processor_kwargs={
            "size": {"longest_edge": 3 * 364},
        },
        limit_mm_per_prompt={modality: 1},
    )
    prompts = [
        (f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:")
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Intern-S1
def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "internlm/Intern-S1-mini"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
    )

    if modality == "image":
        placeholder = ""
    elif modality == "video":
        placeholder = "