From e601efcb10ed982e15e522f7c29c8531677678ca Mon Sep 17 00:00:00 2001
From: Anton
Date: Mon, 7 Jul 2025 22:43:08 +0300
Subject: [PATCH] [Misc] Add fully interleaved support for multimodal 'string' content format (#14047)

Signed-off-by: drobyshev.anton
Co-authored-by: drobyshev.anton
---
 tests/entrypoints/test_chat_utils.py | 352 ++++++++++++++++++++++++++-
 vllm/config.py                       |  14 +-
 vllm/engine/arg_utils.py             |   5 +
 vllm/entrypoints/chat_utils.py       | 150 +++++++++---
 4 files changed, 478 insertions(+), 43 deletions(-)

diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index e41ea686e992..e321ca70001d 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -2,11 +2,14 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import warnings
-from typing import Optional
+from collections.abc import Mapping
+from typing import Literal, Optional
 
 import pytest
 
+from vllm.assets.audio import AudioAsset
 from vllm.assets.image import ImageAsset
+from vllm.assets.video import VideoAsset
 from vllm.config import ModelConfig
 from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template,
                                          parse_chat_messages,
@@ -15,7 +18,8 @@ from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template,
                                          resolve_hf_chat_template)
 from vllm.entrypoints.llm import apply_hf_chat_template
 from vllm.multimodal import MultiModalDataDict
-from vllm.multimodal.utils import encode_image_base64
+from vllm.multimodal.utils import (encode_audio_base64, encode_image_base64,
+                                   encode_video_base64)
 from vllm.transformers_utils.tokenizer_group import TokenizerGroup
 
 from ..models.registry import HF_EXAMPLE_MODELS
@@ -28,6 +32,7 @@ ULTRAVOX_MODEL_ID = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
 QWEN2AUDIO_MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
 QWEN2VL_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
 QWEN25VL_MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
+QWEN25OMNI_MODEL_ID = "Qwen/Qwen2.5-Omni-7B"
 MLLAMA_MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct"
 LLAMA_GUARD_MODEL_ID = "meta-llama/Llama-Guard-3-1B"
 HERMES_MODEL_ID = "NousResearch/Hermes-3-Llama-3.1-8B"
@@ -48,6 +53,21 @@ def phi3v_model_config():
                        })
 
 
+@pytest.fixture(scope="function")
+def phi3v_model_config_mm_interleaved():
+    return ModelConfig(PHI3V_MODEL_ID,
+                       task="generate",
+                       tokenizer=PHI3V_MODEL_ID,
+                       tokenizer_mode="auto",
+                       trust_remote_code=True,
+                       dtype="auto",
+                       seed=0,
+                       interleave_mm_strings=True,
+                       limit_mm_per_prompt={
+                           "image": 2,
+                       })
+
+
 @pytest.fixture(scope="module")
 def phi3v_tokenizer():
     return TokenizerGroup(
@@ -58,6 +78,32 @@ def phi3v_tokenizer():
     )
 
 
+@pytest.fixture(scope="function")
+def qwen25omni_model_config_mm_interleaved():
+    return ModelConfig(QWEN25OMNI_MODEL_ID,
+                       task="generate",
+                       tokenizer=QWEN25OMNI_MODEL_ID,
+                       tokenizer_mode="auto",
+                       dtype="auto",
+                       seed=0,
+                       interleave_mm_strings=True,
+                       limit_mm_per_prompt={
+                           "image": 2,
+                           "audio": 1,
+                           "video": 1,
+                       })
+
+
+@pytest.fixture(scope="module")
+def qwen25omni_tokenizer():
+    return TokenizerGroup(
+        tokenizer_id=QWEN25OMNI_MODEL_ID,
+        enable_lora=False,
+        max_num_seqs=5,
+        max_input_length=None,
+    )
+
+
 @pytest.fixture(scope="module")
 def mllama_model_config():
     return ModelConfig(MLLAMA_MODEL_ID,
@@ -113,6 +159,20 @@ def image_url():
     return f"data:image/jpeg;base64,{base64}"
 
 
+@pytest.fixture(scope="module")
+def video_url():
+    video = VideoAsset('baby_reading', 1)
+    base64 = encode_video_base64(video.np_ndarrays)
+    return f"data:video/jpeg;base64,{base64}"
+
+
+@pytest.fixture(scope="module")
+def audio_url():
+    audio = AudioAsset('mary_had_lamb')
+    base64 = encode_audio_base64(*audio.audio_and_sample_rate)
+    return f"data:audio/ogg;base64,{base64}"
+
+
 def _assert_mm_data_is_image_input(
     mm_data: Optional[MultiModalDataDict],
     image_count: int,
@@ -126,6 +186,23 @@ def _assert_mm_data_is_image_input(
     assert isinstance(image_data, list) and len(image_data) == image_count
 
 
+ModalityType = Literal["image", "video", "audio"]
+MultiModalDataCounts = Mapping[ModalityType, int]
+
+
+def _assert_mm_data_inputs(
+    mm_data: Optional[MultiModalDataDict],
+    data_count: MultiModalDataCounts,
+) -> None:
+    assert mm_data is not None
+    assert set(data_count.keys()) == (set(mm_data.keys()))
+
+    for modality, n in data_count.items():
+        modality_data = mm_data.get(modality)
+        assert modality_data is not None
+        assert isinstance(modality_data, list) and len(modality_data) == n
+
+
 def test_parse_chat_messages_single_image(
     phi3v_model_config,
     phi3v_tokenizer,
@@ -637,6 +714,277 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
     _assert_mm_data_is_image_input(mm_data, 2)
 
 
+def test_parse_chat_messages_multiple_images_interleave(
+    phi3v_model_config_mm_interleaved,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_data = parse_chat_messages(
+        [{
+            "role":
+            "user",
+            "content": [{
+                "type": "text",
+                "text": "I need you to compare this image"
+            }, {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                }
+            }, {
+                "type": "text",
+                "text": "and this one"
+            }, {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                }
+            }, {
+                "type": "text",
+                "text": "Do they have differences?"
+            }]
+        }],
+        phi3v_model_config_mm_interleaved,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role":
+        "user",
+        "content":
+        "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n"  # noqa: E501
+        "Do they have differences?"
+    }]
+    _assert_mm_data_is_image_input(mm_data, 2)
+
+
+@pytest.mark.asyncio
+async def test_parse_chat_messages_multiple_images_interleave_async(
+    phi3v_model_config_mm_interleaved,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_data = parse_chat_messages_futures(
+        [{
+            "role":
+            "user",
+            "content": [{
+                "type": "text",
+                "text": "I need you to compare this image"
+            }, {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                }
+            }, {
+                "type": "text",
+                "text": "and this one"
+            }, {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                }
+            }, {
+                "type": "text",
+                "text": "Do they have differences?"
+            }]
+        }],
+        phi3v_model_config_mm_interleaved,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role":
+        "user",
+        "content":
+        "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n"  # noqa: E501
+        "Do they have differences?"
+    }]
+    _assert_mm_data_is_image_input(await mm_data, 2)
+
+
+def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
+    phi3v_model_config_mm_interleaved,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_data = parse_chat_messages(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's on this image?"
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                },
+                {
+                    "type": "text",
+                    "text": "Be accurate."
+                },
+            ]
+        }, {
+            "role": "assistant",
+            "content": "Some stuff."
+        }, {
+            "role":
+            "user",
+            "content": [{
+                "type": "text",
+                "text": "What's on this image?"
+            }, {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                }
+            }]
+        }],
+        phi3v_model_config_mm_interleaved,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role":
+        "user",
+        "content":
+        "What's on this image?\n<|image_1|>\nBe accurate."
+    }, {
+        "role": "assistant",
+        "content": "Some stuff."
+    }, {
+        "role": "user",
+        "content": "What's on this image?\n<|image_2|>"
+    }]
+    _assert_mm_data_is_image_input(mm_data, 2)
+
+
+def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
+        qwen25omni_model_config_mm_interleaved, qwen25omni_tokenizer,
+        image_url, video_url, audio_url):
+    conversation, mm_data = parse_chat_messages(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's on this image?"
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                },
+                {
+                    "type": "text",
+                    "text": "Now listen to this audio"
+                },
+                {
+                    "type": "audio_url",
+                    "audio_url": {
+                        "url": audio_url
+                    }
+                },
+            ]
+        }, {
+            "role": "assistant",
+            "content": "Some stuff."
+        }, {
+            "role":
+            "user",
+            "content": [{
+                "type": "text",
+                "text": "What's on this image?"
+            }, {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                }
+            }, {
+                "type": "text",
+                "text": "And what's in the video?"
+            }, {
+                "type": "video_url",
+                "video_url": {
+                    "url": video_url
+                }
+            }]
+        }],
+        qwen25omni_model_config_mm_interleaved,
+        qwen25omni_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role":
+        "user",
+        "content":
+        "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
+        "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>"
+    }, {
+        "role": "assistant",
+        "content": "Some stuff."
+    }, {
+        "role":
+        "user",
+        "content":
+        "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
+        "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>"
+    }]
+
+    _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1})
+
+
+def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
+    phi3v_model_config_mm_interleaved,
+    phi3v_tokenizer,
+    image_url,
+):
+    with pytest.raises(
+            ValueError,
+            match=r"Found more '<|image_1|>' placeholders in input prompt "
+            "than actual multimodal data items."):
+        parse_chat_messages(
+            [{
+                "role":
+                "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": image_url
+                        }
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": image_url
+                        }
+                    },
+                    {
+                        "type":
+                        "text",
+                        "text":
+                        "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n"  # noqa: E501
+                        "Do they have differences?"
+                    },
+                ]
+            }],
+            phi3v_model_config_mm_interleaved,
+            phi3v_tokenizer,
+            content_format="string",
+        )
+
+
 ### Mllama currently wraps images / texts as interleaved dictionaries
 def test_mllama_single_image(
     mllama_model_config,
diff --git a/vllm/config.py b/vllm/config.py
index b7ba434db917..bac18e8175d3 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -346,6 +346,9 @@ class ModelConfig:
     limit_mm_per_prompt: dict[str, int] = field(default_factory=dict)
     """Maximum number of data items per modality per prompt. Only applicable
    for multimodal models."""
+    interleave_mm_strings: bool = False
+    """Enable fully interleaved support for multimodal prompts, while using
+    --chat-template-content-format=string. Defaults to False."""
     media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
     """Additional args passed to process media inputs, keyed by modalities.
     For example, to set num_frames for video, set
@@ -702,7 +705,8 @@ class ModelConfig:
                 media_io_kwargs=self.media_io_kwargs,
                 mm_processor_kwargs=self.mm_processor_kwargs,
                 disable_mm_preprocessor_cache=self.
-                disable_mm_preprocessor_cache)
+                disable_mm_preprocessor_cache,
+                interleave_mm_strings=self.interleave_mm_strings)
 
         if self.limit_mm_per_prompt:
             raise ValueError("`limit_mm_per_prompt` is only supported for "
@@ -713,6 +717,9 @@ class ModelConfig:
         if self.disable_mm_preprocessor_cache:
             raise ValueError("`disable_mm_preprocessor_cache` is only "
                              "supported for multimodal models.")
+        if self.interleave_mm_strings:
+            raise ValueError("`interleave_mm_strings` is only "
+                             "supported for multimodal models.")
 
         return None
 
@@ -3126,6 +3133,11 @@ class MultiModalConfig:
     If `True`, disable caching of the processed multi-modal inputs.
     """
 
+    interleave_mm_strings: bool = False
+    """
+    Enable fully interleaved support for multimodal prompts.
+    """
+
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index cf94b6a64281..a497e3c8eeac 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -370,6 +370,7 @@ class EngineArgs:
         get_field(TokenizerPoolConfig, "extra_config")
     limit_mm_per_prompt: dict[str, int] = \
         get_field(MultiModalConfig, "limit_per_prompt")
+    interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings
     media_io_kwargs: dict[str, dict[str,
                                     Any]] = get_field(MultiModalConfig,
                                                       "media_io_kwargs")
@@ -763,6 +764,9 @@ class EngineArgs:
         multimodal_group.add_argument(
             "--disable-mm-preprocessor-cache",
             **multimodal_kwargs["disable_mm_preprocessor_cache"])
+        multimodal_group.add_argument(
+            "--interleave-mm-strings",
+            **multimodal_kwargs["interleave_mm_strings"])
 
         # LoRA related configs
         lora_kwargs = get_kwargs(LoRAConfig)
@@ -981,6 +985,7 @@ class EngineArgs:
             enable_prompt_embeds=self.enable_prompt_embeds,
             served_model_name=self.served_model_name,
             limit_mm_per_prompt=self.limit_mm_per_prompt,
+            interleave_mm_strings=self.interleave_mm_strings,
             media_io_kwargs=self.media_io_kwargs,
             use_async_output_proc=not self.disable_async_output_proc,
             config_format=self.config_format,
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 012ea1d75f44..08e94ec0fa1e 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -4,7 +4,7 @@
 import asyncio
 import json
 from abc import ABC, abstractmethod
-from collections import defaultdict, deque
+from collections import Counter, defaultdict, deque
 from collections.abc import Awaitable, Iterable
 from functools import cached_property, lru_cache, partial
 from pathlib import Path
@@ -52,6 +52,12 @@ from vllm.utils import deprecate_kwargs, random_uuid
 
 logger = init_logger(__name__)
 
+MODALITY_PLACEHOLDERS_MAP = {
+    "image": "<##IMAGE##>",
+    "audio": "<##AUDIO##>",
+    "video": "<##VIDEO##>",
+}
+
 
 class AudioURL(TypedDict, total=False):
     url: Required[str]
@@ -354,6 +360,7 @@ def resolve_mistral_chat_template(
             "so it will be ignored.")
         return None
 
+
 @deprecate_kwargs(
     "trust_remote_code",
     additional_message="Please use `model_config.trust_remote_code` instead.",
@@ -633,15 +640,22 @@ class BaseMultiModalContentParser(ABC):
     def __init__(self) -> None:
         super().__init__()
 
-        # multimodal placeholder_string : count
-        self._placeholder_counts: dict[str, int] = defaultdict(lambda: 0)
+        # stores model placeholders list with corresponding
+        # general MM placeholder:
+        # {
+        #   "<##IMAGE##>": ["", "", ""],
+        #   "<##AUDIO##>": ["