# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on Qwen3-Omni (thinker only).
"""

from typing import NamedTuple

from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.multimodal.image import convert_image_mode
from vllm.utils.argparse_utils import FlexibleArgumentParser


class QueryResult(NamedTuple):
    inputs: dict
    limit_mm_per_prompt: dict[str, int]


# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.

default_system = (
    "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
    "Group, capable of perceiving auditory and visual inputs, as well as "
    "generating text and speech."
)


def get_mixed_modalities_query() -> QueryResult:
    question = (
        "What is recited in the audio? "
        "What is the content of this image? Why is this video funny?"
    )
    prompt = (
        f"<|im_start|>system\n{default_system}<|im_end|>\n"
        "<|im_start|>user\n<|audio_start|><|audio_pad|><|audio_end|>"
        "<|vision_start|><|image_pad|><|vision_end|>"
        "<|vision_start|><|video_pad|><|vision_end|>"
        f"{question}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
    return QueryResult(
        inputs={
            "prompt": prompt,
            "multi_modal_data": {
                "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
                "image": convert_image_mode(
                    ImageAsset("cherry_blossom").pil_image, "RGB"
                ),
                "video": VideoAsset(name="baby_reading", num_frames=16).np_ndarrays,
            },
        },
        limit_mm_per_prompt={"audio": 1, "image": 1, "video": 1},
    )


def get_use_audio_in_video_query() -> QueryResult:
    question = (
        "Describe the content of the video in detail, then convert what the "
        "baby says into text."
    )
    prompt = (
        f"<|im_start|>system\n{default_system}<|im_end|>\n"
        "<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>"
        f"{question}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
    asset = VideoAsset(name="baby_reading", num_frames=16)
    audio = asset.get_audio(sampling_rate=16000)
    return QueryResult(
        inputs={
            "prompt": prompt,
            "multi_modal_data": {
                "video": asset.np_ndarrays,
                "audio": audio,
            },
            "mm_processor_kwargs": {
                "use_audio_in_video": True,
            },
        },
        limit_mm_per_prompt={"audio": 1, "video": 1},
    )


def get_multi_audios_query() -> QueryResult:
    question = "Are these two audio clips the same?"
    prompt = (
        f"<|im_start|>system\n{default_system}<|im_end|>\n"
        "<|im_start|>user\n<|audio_start|><|audio_pad|><|audio_end|>"
        "<|audio_start|><|audio_pad|><|audio_end|>"
        f"{question}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
    return QueryResult(
        inputs={
            "prompt": prompt,
            "multi_modal_data": {
                "audio": [
                    AudioAsset("winning_call").audio_and_sample_rate,
                    AudioAsset("mary_had_lamb").audio_and_sample_rate,
                ],
            },
        },
        limit_mm_per_prompt={
            "audio": 2,
        },
    )


query_map = {
    "mixed_modalities": get_mixed_modalities_query,
    "use_audio_in_video": get_use_audio_in_video_query,
    "multi_audios": get_multi_audios_query,
}


def main(args):
    model_name = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
    query_result = query_map[args.query_type]()

    llm = LLM(
        model=model_name,
        max_model_len=12800,
        max_num_seqs=5,
        limit_mm_per_prompt=query_result.limit_mm_per_prompt,
        seed=args.seed,
    )

    # Set temperature to 0.2 so that outputs can differ even when all
    # prompts are identical when running batch inference.
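    # `llm.generate` also accepts a sequence of such input dicts, in which
    # case all of them are processed together as a single batch.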
    sampling_params = SamplingParams(temperature=0.2, max_tokens=256)

    outputs = llm.generate(query_result.inputs, sampling_params=sampling_params)

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)


def parse_args():
    parser = FlexibleArgumentParser(
        description="Demo on using vLLM for offline inference with "
        "multimodal language models"
    )
    parser.add_argument(
        "--query-type",
        "-q",
        type=str,
        default="mixed_modalities",
        choices=query_map.keys(),
        help="Query type.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="Set the seed when initializing `vllm.LLM`.",
    )
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    main(args)
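
# Example invocations (the filename `only_thinker.py` is illustrative; use
# whatever name this script is saved under):
#   python only_thinker.py --query-type mixed_modalities
#   python only_thinker.py -q use_audio_in_video --seed 42
#   python only_thinker.py -q multi_audios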