From 2e7cbf2d7da1030d141dd32e0271697b49b84e54 Mon Sep 17 00:00:00 2001
From: Chenheli Hua
Date: Tue, 1 Jul 2025 23:34:03 -0700
Subject: [PATCH] [Frontend] Support configurable mm placeholder strings &
 flexible video sampling policies via CLI flags. (#20105)

Signed-off-by: Chenheli Hua
---
 tests/async_engine/test_async_llm_engine.py   |  6 ++-
 tests/engine/test_arg_utils.py                | 52 +++++++++++++++++++
 tests/entrypoints/openai/test_serving_chat.py |  6 ++-
 tests/multimodal/test_utils.py                | 12 ++---
 tests/multimodal/test_video.py                | 48 ++++++++++++++++-
 vllm/config.py                                | 16 ++++++
 vllm/engine/arg_utils.py                      | 12 +++++
 vllm/entrypoints/chat_utils.py                | 11 ++--
 vllm/multimodal/audio.py                      | 10 ++++
 vllm/multimodal/image.py                      |  8 ++-
 vllm/multimodal/utils.py                      | 27 ++++++----
 vllm/multimodal/video.py                      | 20 +++++--
 12 files changed, 199 insertions(+), 29 deletions(-)

diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py
index 043b75cc5d385..3c030aea20661 100644
--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
@@ -6,8 +6,8 @@ import os
 import uuid
 from asyncio import CancelledError
 from copy import copy
-from dataclasses import dataclass
-from typing import Optional
+from dataclasses import dataclass, field
+from typing import Any, Optional
 
 import pytest
 import pytest_asyncio
@@ -32,6 +32,8 @@ class RequestOutput:
 @dataclass
 class MockModelConfig:
     use_async_output_proc = True
+    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
+    mm_placeholder_str_override: dict[str, str] = field(default_factory=dict)
 
 
 class MockEngine:

diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py
index 847f150bd6443..8e0579b7cfb84 100644
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@@ -231,6 +231,58 @@ def test_limit_mm_per_prompt_parser(arg, expected):
     assert args.limit_mm_per_prompt == expected
 
 
+@pytest.mark.parametrize(
+    ("arg", "expected"),
+    [
+        (None, dict()),
+        ('{"video": {"num_frames": 123} }', {
+            "video": {
+                "num_frames": 123
+            }
+        }),
+        (
+            '{"video": {"num_frames": 123, "fps": 1.0, "foo": "bar"}, "image": {"foo": "bar"} }',  # noqa
+            {
+                "video": {
+                    "num_frames": 123,
+                    "fps": 1.0,
+                    "foo": "bar"
+                },
+                "image": {
+                    "foo": "bar"
+                }
+            }),
+    ])
+def test_media_io_kwargs_parser(arg, expected):
+    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
+    if arg is None:
+        args = parser.parse_args([])
+    else:
+        args = parser.parse_args(["--media-io-kwargs", arg])
+
+    assert args.media_io_kwargs == expected
+
+
+@pytest.mark.parametrize(("arg", "expected"), [
+    (None, dict()),
+    ('{"video":"<|video_placeholder|>"}', {
+        "video": "<|video_placeholder|>"
+    }),
+    ('{"video":"<|video_placeholder|>", "image": "<|image_placeholder|>"}', {
+        "video": "<|video_placeholder|>",
+        "image": "<|image_placeholder|>"
+    }),
+])
+def test_mm_placeholder_str_override_parser(arg, expected):
+    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
+    if arg is None:
+        args = parser.parse_args([])
+    else:
+        args = parser.parse_args(["--mm-placeholder-str-override", arg])
+
+    assert args.mm_placeholder_str_override == expected
+
+
 def test_compilation_config():
     parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
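Both new flags take a JSON mapping from modality to settings, exactly as the parser tests above exercise. A minimal sketch of driving the parser from Python, mirroring those tests (the flag values are illustrative):

    from vllm.engine.arg_utils import EngineArgs
    from vllm.utils import FlexibleArgumentParser

    # Build the CLI parser the same way the tests above do.
    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
    args = parser.parse_args([
        "--media-io-kwargs", '{"video": {"num_frames": 40, "fps": 2.0}}',
        "--mm-placeholder-str-override", '{"video": "<|video_placeholder|>"}',
    ])
    assert args.media_io_kwargs == {"video": {"num_frames": 40, "fps": 2.0}}
    assert args.mm_placeholder_str_override == {"video": "<|video_placeholder|>"}

Unknown keys inside a modality's dict are accepted at parse time; whether they have any effect depends on the media loader that eventually receives them.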
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index 94740fefc870e..e339351189218 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -3,8 +3,8 @@
 import asyncio
 from contextlib import suppress
-from dataclasses import dataclass
-from typing import Optional
+from dataclasses import dataclass, field
+from typing import Any, Optional
 from unittest.mock import MagicMock
 
 from vllm.config import MultiModalConfig
@@ -40,6 +40,8 @@ class MockModelConfig:
     allowed_local_media_path: str = ""
     encoder_config = None
     generation_config: str = "auto"
+    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
+    mm_placeholder_str_override: dict[str, str] = field(default_factory=dict)
 
     def get_diff_sampling_param(self):
         return self.diff_sampling_param or {}

diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py
index a48542cec3f87..d927ae5cd0b27 100644
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -167,14 +167,14 @@ async def test_fetch_image_error_conversion():
 @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
 @pytest.mark.parametrize("num_frames", [-1, 32, 1800])
 async def test_fetch_video_http(video_url: str, num_frames: int):
-    connector = MediaConnector()
+    connector = MediaConnector(
+        media_io_kwargs={"video": {
+            "num_frames": num_frames,
+        }})
 
-    video_sync = connector.fetch_video(video_url, num_frames=num_frames)
-    video_async = await connector.fetch_video_async(video_url,
-                                                    num_frames=num_frames)
-    # Check that the video frames are equal and metadata are same
+    video_sync = connector.fetch_video(video_url)
+    video_async = await connector.fetch_video_async(video_url)
     assert np.array_equal(video_sync[0], video_async[0])
-    assert video_sync[1] == video_async[1]
 
 
 # Used for the next two tests related to `merge_and_sort_multimodal_metadata`.
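With this change, frame sampling options move out of fetch_video()'s signature and into the connector itself: MediaConnector is constructed once with media_io_kwargs and every fetch uses them. A minimal sketch of the new call shape (the URL is a placeholder):

    from vllm.multimodal.utils import MediaConnector

    # Video options are now fixed at connector construction time rather
    # than passed per fetch_video() call.
    connector = MediaConnector(media_io_kwargs={"video": {"num_frames": 16}})
    video = connector.fetch_video("https://example.com/clip.mp4")  # placeholder URL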
diff --git a/tests/multimodal/test_video.py b/tests/multimodal/test_video.py
index 9a700808d9d8a..897c9c33461ac 100644
--- a/tests/multimodal/test_video.py
+++ b/tests/multimodal/test_video.py
@@ -4,7 +4,10 @@ import numpy as np
 import numpy.typing as npt
 import pytest
 
-from vllm.multimodal.video import VIDEO_LOADER_REGISTRY, VideoLoader
+from vllm import envs
+from vllm.multimodal.image import ImageMediaIO
+from vllm.multimodal.video import (VIDEO_LOADER_REGISTRY, VideoLoader,
+                                   VideoMediaIO)
 
 NUM_FRAMES = 10
 FAKE_OUTPUT_1 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
@@ -40,3 +40,46 @@ def test_video_loader_registry():
 def test_video_loader_type_doesnt_exist():
     with pytest.raises(AssertionError):
         VIDEO_LOADER_REGISTRY.load("non_existing_video_loader")
+
+
+@VIDEO_LOADER_REGISTRY.register("assert_10_frames_1_fps")
+class Assert10Frames1FPSVideoLoader(VideoLoader):
+
+    @classmethod
+    def load_bytes(cls,
+                   data: bytes,
+                   num_frames: int = -1,
+                   fps: float = -1.0,
+                   **kwargs) -> npt.NDArray:
+        assert num_frames == 10, "bad num_frames"
+        assert fps == 1.0, "bad fps"
+        return FAKE_OUTPUT_2
+
+
+def test_video_media_io_kwargs():
+    envs.VLLM_VIDEO_LOADER_BACKEND = "assert_10_frames_1_fps"
+    imageio = ImageMediaIO()
+
+    # Verify that different args pass/fail assertions as expected.
+    videoio = VideoMediaIO(imageio, **{"num_frames": 10, "fps": 1.0})
+    _ = videoio.load_bytes(b"test")
+
+    videoio = VideoMediaIO(
+        imageio, **{
+            "num_frames": 10,
+            "fps": 1.0,
+            "not_used": "not_used"
+        })
+    _ = videoio.load_bytes(b"test")
+
+    with pytest.raises(AssertionError, match="bad num_frames"):
+        videoio = VideoMediaIO(imageio, **{})
+        _ = videoio.load_bytes(b"test")
+
+    with pytest.raises(AssertionError, match="bad num_frames"):
+        videoio = VideoMediaIO(imageio, **{"num_frames": 9, "fps": 1.0})
+        _ = videoio.load_bytes(b"test")
+
+    with pytest.raises(AssertionError, match="bad fps"):
+        videoio = VideoMediaIO(imageio, **{"num_frames": 10, "fps": 2.0})
+        _ = videoio.load_bytes(b"test")
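The test above doubles as a recipe for the "flexible video sampling policies" in the patch title: register a loader under a new name and have it read whatever keys the deployment passes through --media-io-kwargs. A sketch of that pattern ("uniform_fps" and its handling of fps are illustrative, not part of this patch):

    import numpy as np
    import numpy.typing as npt

    from vllm.multimodal.video import VIDEO_LOADER_REGISTRY, VideoLoader


    @VIDEO_LOADER_REGISTRY.register("uniform_fps")
    class UniformFPSVideoLoader(VideoLoader):

        @classmethod
        def load_bytes(cls,
                       data: bytes,
                       num_frames: int = -1,
                       fps: float = -1.0,
                       **kwargs) -> npt.NDArray:
            # A real implementation would decode `data` and sample frames
            # at `fps`; this stub just returns a correctly shaped array.
            return np.zeros((max(num_frames, 1), 224, 224, 3))

The backend is then selected at runtime via the VLLM_VIDEO_LOADER_BACKEND environment variable, as the test does.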
diff --git a/vllm/config.py b/vllm/config.py
index 6412e6e293b45..74e7ed2d48747 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -346,6 +346,12 @@ class ModelConfig:
     limit_mm_per_prompt: dict[str, int] = field(default_factory=dict)
     """Maximum number of data items per modality per prompt. Only applicable
     for multimodal models."""
+    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
+    """Additional args passed to process media inputs, keyed by modality.
+    For example, to set the number of frames sampled per video, use
+    `--media-io-kwargs '{"video": {"num_frames": 40}}'`."""
+    mm_placeholder_str_override: dict[str, str] = field(default_factory=dict)
+    """Optionally override the placeholder string for the given modalities."""
     use_async_output_proc: bool = True
     """Whether to use async output processor."""
     config_format: Union[str, ConfigFormat] = ConfigFormat.AUTO.value
@@ -694,6 +700,8 @@ class ModelConfig:
         if self.registry.is_multimodal_model(self.architectures):
             return MultiModalConfig(
                 limit_per_prompt=self.limit_mm_per_prompt,
+                media_io_kwargs=self.media_io_kwargs,
+                mm_placeholder_str_override=self.mm_placeholder_str_override,
                 mm_processor_kwargs=self.mm_processor_kwargs,
                 disable_mm_preprocessor_cache=self.
                 disable_mm_preprocessor_cache)
@@ -3063,6 +3071,14 @@ class MultiModalConfig:
     `{"images": 16, "videos": 2}`
     """
 
+    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
+    """Additional args passed to process media inputs, keyed by modality.
+    For example, to set the number of frames sampled per video, use
+    `--media-io-kwargs '{"video": {"num_frames": 40}}'`."""
+
+    mm_placeholder_str_override: dict[str, str] = field(default_factory=dict)
+    """Optionally override the placeholder string for the given modalities."""
+
     mm_processor_kwargs: Optional[dict[str, object]] = None
     """
     Overrides for the multi-modal processor obtained from

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 2d3783363c00b..38f82e64de537 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -369,6 +369,11 @@ class EngineArgs:
         get_field(TokenizerPoolConfig, "extra_config")
     limit_mm_per_prompt: dict[str, int] = \
         get_field(MultiModalConfig, "limit_per_prompt")
+    media_io_kwargs: dict[str, dict[str,
+                                    Any]] = get_field(MultiModalConfig,
+                                                      "media_io_kwargs")
+    mm_placeholder_str_override: dict[str, str] = \
+        get_field(MultiModalConfig, "mm_placeholder_str_override")
     mm_processor_kwargs: Optional[Dict[str, Any]] = \
         MultiModalConfig.mm_processor_kwargs
     disable_mm_preprocessor_cache: bool = \
@@ -745,6 +750,11 @@ class EngineArgs:
         )
         multimodal_group.add_argument("--limit-mm-per-prompt",
                                       **multimodal_kwargs["limit_per_prompt"])
+        multimodal_group.add_argument("--media-io-kwargs",
+                                      **multimodal_kwargs["media_io_kwargs"])
+        multimodal_group.add_argument(
+            "--mm-placeholder-str-override",
+            **multimodal_kwargs["mm_placeholder_str_override"])
         multimodal_group.add_argument(
             "--mm-processor-kwargs",
             **multimodal_kwargs["mm_processor_kwargs"])
@@ -969,6 +979,8 @@ class EngineArgs:
             enable_prompt_embeds=self.enable_prompt_embeds,
             served_model_name=self.served_model_name,
             limit_mm_per_prompt=self.limit_mm_per_prompt,
+            media_io_kwargs=self.media_io_kwargs,
+            mm_placeholder_str_override=self.mm_placeholder_str_override,
             use_async_output_proc=not self.disable_async_output_proc,
             config_format=self.config_format,
             mm_processor_kwargs=self.mm_processor_kwargs,

diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 45f1894d022b3..093675f830e01 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -507,6 +507,9 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
 
     def _placeholder_str(self, modality: ModalityStr,
                          current_count: int) -> Optional[str]:
+        if modality in self._model_config.mm_placeholder_str_override:
+            return self._model_config.mm_placeholder_str_override[modality]
+
         # TODO: Let user specify how to insert image tokens into prompt
         # (similar to chat template)
         hf_config = self._model_config.hf_config
@@ -725,6 +728,7 @@ class MultiModalContentParser(BaseMultiModalContentParser):
 
         self._tracker = tracker
         self._connector = MediaConnector(
+            media_io_kwargs=self._tracker._model_config.media_io_kwargs,
             allowed_local_media_path=tracker.allowed_local_media_path,
         )
@@ -763,7 +767,7 @@ class MultiModalContentParser(BaseMultiModalContentParser):
         return self.parse_audio(audio_url)
 
     def parse_video(self, video_url: str) -> None:
-        video = self._connector.fetch_video(video_url)
+        video = self._connector.fetch_video(video_url=video_url)
 
         placeholder = self._tracker.add("video", video)
         self._add_placeholder(placeholder)
@@ -776,7 +780,8 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
 
         self._tracker = tracker
         self._connector = MediaConnector(
-            allowed_local_media_path=tracker.allowed_local_media_path,
+            media_io_kwargs=self._tracker._model_config.media_io_kwargs,
+            allowed_local_media_path=tracker.allowed_local_media_path
         )
 
     def parse_image(self, image_url: str) -> None:
@@ -818,7 +823,7 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
         return self.parse_audio(audio_url)
 
     def parse_video(self, video_url: str) -> None:
-        video = self._connector.fetch_video_async(video_url)
+        video = self._connector.fetch_video_async(video_url=video_url)
 
         placeholder = self._tracker.add("video", video)
         self._add_placeholder(placeholder)
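The override is checked before the per-architecture placeholder lookup, so it wins for any modality it names. A minimal sketch of the new lookup order (MockModelConfig below is a stand-in, not the real ModelConfig):

    from dataclasses import dataclass, field


    @dataclass
    class MockModelConfig:
        mm_placeholder_str_override: dict[str, str] = field(default_factory=dict)


    config = MockModelConfig(
        mm_placeholder_str_override={"video": "<|video_placeholder|>"})

    modality = "video"
    if modality in config.mm_placeholder_str_override:
        placeholder = config.mm_placeholder_str_override[modality]
    else:
        placeholder = None  # fall through to the hf_config-based lookup
    assert placeholder == "<|video_placeholder|>"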
diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py
index fbb29276f6bdf..f3b273eb41e8f 100644
--- a/vllm/multimodal/audio.py
+++ b/vllm/multimodal/audio.py
@@ -83,6 +83,16 @@ class AudioResampler:
 
 class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
 
+    def __init__(self, **kwargs) -> None:
+        super().__init__()
+
+        # `kwargs` contains custom arguments from
+        # --media-io-kwargs for this modality.
+        # They can be passed to the underlying
+        # media loaders (e.g. custom implementations)
+        # for flexible control.
+        self.kwargs = kwargs
+
     def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
         return librosa.load(BytesIO(data), sr=None)

diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index dce4c4c1cadba..a0448a80ac7c2 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -44,10 +44,16 @@ def convert_image_mode(image: Image.Image, to_mode: str):
 
 class ImageMediaIO(MediaIO[Image.Image]):
 
-    def __init__(self, *, image_mode: str = "RGB") -> None:
+    def __init__(self, image_mode: str = "RGB", **kwargs) -> None:
         super().__init__()
 
         self.image_mode = image_mode
+        # `kwargs` contains custom arguments from
+        # --media-io-kwargs for this modality.
+        # They can be passed to the underlying
+        # media loaders (e.g. custom implementations)
+        # for flexible control.
+        self.kwargs = kwargs
 
     def load_bytes(self, data: bytes) -> Image.Image:
         image = Image.open(BytesIO(data))

diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 5e61d460fa428..2f2be59a1f42d 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -38,12 +38,15 @@ class MediaConnector:
 
     def __init__(
        self,
+        media_io_kwargs: Optional[dict[str, dict[str, Any]]] = None,
        connection: HTTPConnection = global_http_connection,
        *,
        allowed_local_media_path: str = "",
    ) -> None:
        super().__init__()
 
+        self.media_io_kwargs: dict[str, dict[
+            str, Any]] = media_io_kwargs if media_io_kwargs else {}
        self.connection = connection
 
        if allowed_local_media_path:
@@ -149,7 +152,7 @@ class MediaConnector:
        """
        Load audio from a URL.
        """
-        audio_io = AudioMediaIO()
+        audio_io = AudioMediaIO(**self.media_io_kwargs.get("audio", {}))
 
        return self.load_from_url(
            audio_url,
@@ -164,7 +167,7 @@ class MediaConnector:
        """
        Asynchronously fetch audio from a URL.
        """
-        audio_io = AudioMediaIO()
+        audio_io = AudioMediaIO(**self.media_io_kwargs.get("audio", {}))
 
        return await self.load_from_url_async(
            audio_url,
@@ -183,7 +186,8 @@ class MediaConnector:
 
        By default, the image is converted into RGB format.
        """
-        image_io = ImageMediaIO(image_mode=image_mode)
+        image_io = ImageMediaIO(image_mode=image_mode,
+                                **self.media_io_kwargs.get("image", {}))
 
        try:
            return self.load_from_url(
@@ -206,7 +210,8 @@ class MediaConnector:
 
        By default, the image is converted into RGB format.
        """
-        image_io = ImageMediaIO(image_mode=image_mode)
+        image_io = ImageMediaIO(image_mode=image_mode,
+                                **self.media_io_kwargs.get("image", {}))
 
        try:
            return await self.load_from_url_async(
@@ -223,13 +228,14 @@ class MediaConnector:
        video_url: str,
        *,
        image_mode: str = "RGB",
-        num_frames: int = 32,
    ) -> npt.NDArray:
        """
        Load video from a HTTP or base64 data URL.
        """
-        image_io = ImageMediaIO(image_mode=image_mode)
-        video_io = VideoMediaIO(image_io, num_frames=num_frames)
+        image_io = ImageMediaIO(image_mode=image_mode,
+                                **self.media_io_kwargs.get("image", {}))
+        video_io = VideoMediaIO(image_io,
+                                **self.media_io_kwargs.get("video", {}))
 
        return self.load_from_url(
            video_url,
@@ -242,15 +248,16 @@ class MediaConnector:
        video_url: str,
        *,
        image_mode: str = "RGB",
-        num_frames: int = 32,
    ) -> npt.NDArray:
        """
        Asynchronously load video from a HTTP or base64 data URL.
 
        By default, the image is converted into RGB format.
        """
-        image_io = ImageMediaIO(image_mode=image_mode)
-        video_io = VideoMediaIO(image_io, num_frames=num_frames)
+        image_io = ImageMediaIO(image_mode=image_mode,
+                                **self.media_io_kwargs.get("image", {}))
+        video_io = VideoMediaIO(image_io,
+                                **self.media_io_kwargs.get("video", {}))
 
        return await self.load_from_url_async(
            video_url,
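For audio and images, the core loaders stash the extra kwargs on self.kwargs without consuming them; as the inline comments note, they exist so custom MediaIO implementations can read them. A hypothetical subclass (the "target_sr" key is illustrative and not defined by this patch):

    from io import BytesIO

    import librosa
    import numpy.typing as npt

    from vllm.multimodal.audio import AudioMediaIO


    class ResamplingAudioMediaIO(AudioMediaIO):
        """Hypothetical example honoring a custom `target_sr` media-io kwarg."""

        def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
            # self.kwargs holds this modality's --media-io-kwargs entries.
            target_sr = self.kwargs.get("target_sr")
            return librosa.load(BytesIO(data), sr=target_sr)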
""" - image_io = ImageMediaIO(image_mode=image_mode) - video_io = VideoMediaIO(image_io, num_frames=num_frames) + image_io = ImageMediaIO(image_mode=image_mode, + **self.media_io_kwargs.get("image", {})) + video_io = VideoMediaIO(image_io, + **self.media_io_kwargs.get("video", {})) return self.load_from_url( video_url, @@ -242,15 +248,16 @@ class MediaConnector: video_url: str, *, image_mode: str = "RGB", - num_frames: int = 32, ) -> npt.NDArray: """ Asynchronously load video from a HTTP or base64 data URL. By default, the image is converted into RGB format. """ - image_io = ImageMediaIO(image_mode=image_mode) - video_io = VideoMediaIO(image_io, num_frames=num_frames) + image_io = ImageMediaIO(image_mode=image_mode, + **self.media_io_kwargs.get("image", {})) + video_io = VideoMediaIO(image_io, + **self.media_io_kwargs.get("video", {})) return await self.load_from_url_async( video_url, diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index f15264cbde0b2..d9589068a203b 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -54,7 +54,10 @@ class VideoLoader: @classmethod @abstractmethod - def load_bytes(cls, data: bytes, num_frames: int = -1) -> npt.NDArray: + def load_bytes(cls, + data: bytes, + num_frames: int = -1, + **kwargs) -> npt.NDArray: raise NotImplementedError @@ -102,7 +105,8 @@ class OpenCVVideoBackend(VideoLoader): @classmethod def load_bytes(cls, data: bytes, - num_frames: int = -1) -> tuple[npt.NDArray, dict]: + num_frames: int = -1, + **kwargs) -> npt.NDArray: import cv2 backend = cls().get_cv2_video_api() @@ -159,18 +163,26 @@ class VideoMediaIO(MediaIO[npt.NDArray]): def __init__( self, image_io: ImageMediaIO, - *, num_frames: int = 32, + **kwargs, ) -> None: super().__init__() self.image_io = image_io self.num_frames = num_frames + # `kwargs` contains custom arguments from + # --media-io-kwargs for this modality. + # They can be passed to the underlying + # media loaders (e.g. custom implementations) + # for flexible control. + self.kwargs = kwargs video_loader_backend = envs.VLLM_VIDEO_LOADER_BACKEND self.video_loader = VIDEO_LOADER_REGISTRY.load(video_loader_backend) def load_bytes(self, data: bytes) -> npt.NDArray: - return self.video_loader.load_bytes(data, self.num_frames) + return self.video_loader.load_bytes(data, + num_frames=self.num_frames, + **self.kwargs) def load_base64(self, media_type: str, data: str) -> npt.NDArray: if media_type.lower() == "video/jpeg":