diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md
index 670d747b9ee7..ed1cd46dd858 100644
--- a/docs/contributing/model/multimodal.md
+++ b/docs/contributing/model/multimodal.md
@@ -10,6 +10,22 @@ This document walks you through the steps to extend a basic model so that it acc
 It is assumed that you have already implemented the model in vLLM according to [these steps][new-model-basic].
 Further update the model as follows:
 
+- Implement [get_placeholder_str][vllm.model_executor.models.interfaces.SupportsMultiModal.get_placeholder_str] to define the placeholder string which is used to represent the multi-modal item in the text prompt. This should be consistent with the chat template of the model.
+
+    ??? Code
+
+        ```python
+        class YourModelForImage2Seq(nn.Module):
+            ...
+
+            @classmethod
+            def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+                if modality.startswith("image"):
+                    return "<image>"
+
+                raise ValueError("Only image modality is supported")
+        ```
+
 - Reserve a keyword parameter in [forward][torch.nn.Module.forward] for each input tensor that corresponds to a multi-modal input, as shown in the following example:
 
     ```diff
diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py
index 3c030aea2066..0eb7a6eb52aa 100644
--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
@@ -33,7 +33,6 @@ class RequestOutput:
 class MockModelConfig:
     use_async_output_proc = True
     media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
-    mm_placeholder_str_override: dict[str, str] = field(default_factory=dict)
 
 
 class MockEngine:
diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py
index 8e0579b7cfb8..86e28c687847 100644
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@@ -263,26 +263,6 @@ def test_media_io_kwargs_parser(arg, expected):
     assert args.media_io_kwargs == expected
 
 
-@pytest.mark.parametrize(("arg", "expected"), [
-    (None, dict()),
-    ('{"video":"<|video_placeholder|>"}', {
-        "video": "<|video_placeholder|>"
-    }),
-    ('{"video":"<|video_placeholder|>", "image": "<|image_placeholder|>"}', {
-        "video": "<|video_placeholder|>",
-        "image": "<|image_placeholder|>"
-    }),
-])
-def test_mm_placeholder_str_override_parser(arg, expected):
-    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
-    if arg is None:
-        args = parser.parse_args([])
-    else:
-        args = parser.parse_args(["--mm-placeholder-str-override", arg])
-
-    assert args.mm_placeholder_str_override == expected
-
-
 def test_compilation_config():
     parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index e33935118921..ad80946b5671 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -41,7 +41,6 @@ class MockModelConfig:
     encoder_config = None
     generation_config: str = "auto"
     media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
-    mm_placeholder_str_override: dict[str, str] = field(default_factory=dict)
 
     def get_diff_sampling_param(self):
         return self.diff_sampling_param or {}
diff --git a/vllm/config.py b/vllm/config.py
index 0ec4281ae3c1..5c19061a0d51 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -350,8 +350,6 @@ class ModelConfig:
     """Additional args passed to process media inputs, keyed by modalities.
     For example, to set num_frames for video, set
     `--media-io-kwargs '{"video": {"num_frames": 40} }'` """
-    mm_placeholder_str_override: dict[str, str] = field(default_factory=dict)
-    """Optionally override placeholder string for given modalities."""
     use_async_output_proc: bool = True
     """Whether to use async output processor."""
     config_format: Union[str, ConfigFormat] = ConfigFormat.AUTO.value
@@ -661,7 +659,7 @@ class ModelConfig:
         return self._architecture
 
     @property
-    def model_info(self) -> dict[str, Any]:
+    def model_info(self):
         return self._model_info
 
     def maybe_pull_model_tokenizer_for_s3(self, model: str,
@@ -701,7 +699,6 @@ class ModelConfig:
         return MultiModalConfig(
             limit_per_prompt=self.limit_mm_per_prompt,
             media_io_kwargs=self.media_io_kwargs,
-            mm_placeholder_str_override=self.mm_placeholder_str_override,
             mm_processor_kwargs=self.mm_processor_kwargs,
             disable_mm_preprocessor_cache=self.
             disable_mm_preprocessor_cache)
@@ -3096,9 +3093,6 @@ class MultiModalConfig:
     For example, to set num_frames for video, set
     `--media-io-kwargs '{"video": {"num_frames": 40} }'`
     """
-    mm_placeholder_str_override: dict[str, str] = field(default_factory=dict)
-    """Optionally override placeholder string for given modalities."""
-
     mm_processor_kwargs: Optional[dict[str, object]] = None
     """
     Overrides for the multi-modal processor obtained from
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 11dbc23a743b..284f09236131 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -373,8 +373,6 @@ class EngineArgs:
     media_io_kwargs: dict[str, dict[str, Any]] = get_field(MultiModalConfig,
                                                            "media_io_kwargs")
-    mm_placeholder_str_override: dict[str, str] = \
-        get_field(MultiModalConfig, "mm_placeholder_str_override")
     mm_processor_kwargs: Optional[Dict[str, Any]] = \
         MultiModalConfig.mm_processor_kwargs
     disable_mm_preprocessor_cache: bool = \
@@ -759,9 +757,6 @@ class EngineArgs:
                                       **multimodal_kwargs["limit_per_prompt"])
         multimodal_group.add_argument("--media-io-kwargs",
                                       **multimodal_kwargs["media_io_kwargs"])
-        multimodal_group.add_argument(
-            "--mm-placeholder-str-override",
-            **multimodal_kwargs["mm_placeholder_str_override"])
         multimodal_group.add_argument(
             "--mm-processor-kwargs",
             **multimodal_kwargs["mm_processor_kwargs"])
@@ -987,7 +982,6 @@ class EngineArgs:
             served_model_name=self.served_model_name,
             limit_mm_per_prompt=self.limit_mm_per_prompt,
             media_io_kwargs=self.media_io_kwargs,
-            mm_placeholder_str_override=self.mm_placeholder_str_override,
             use_async_output_proc=not self.disable_async_output_proc,
             config_format=self.config_format,
             mm_processor_kwargs=self.mm_processor_kwargs,
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index bbf651891339..1054b969cd3b 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -6,7 +6,7 @@ import json
 from abc import ABC, abstractmethod
 from collections import defaultdict, deque
 from collections.abc import Awaitable, Iterable
-from functools import cache, lru_cache, partial
+from functools import cached_property, lru_cache, partial
 from pathlib import Path
 from typing import (Any, Callable, Generic, Literal, Optional, TypeVar, Union,
                     cast)
@@ -37,6 +37,8 @@ from typing_extensions import Required, TypeAlias, TypedDict
 
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
+from vllm.model_executor.model_loader import get_model_cls
+from vllm.model_executor.models import SupportsMultiModal
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
 from vllm.multimodal.utils import MediaConnector
 
 # yapf: disable
@@ -492,6 +494,10 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
     def model_config(self) -> ModelConfig:
         return self._model_config
 
+    @cached_property
+    def model_cls(self):
+        return get_model_cls(self.model_config)
+
     @property
     def allowed_local_media_path(self):
         return self._model_config.allowed_local_media_path
@@ -500,89 +506,6 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
     def mm_registry(self):
         return MULTIMODAL_REGISTRY
 
-    @staticmethod
-    @cache
-    def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str:
-        return tokenizer.decode(token_index)
-
-    def _placeholder_str(self, modality: ModalityStr,
-                         current_count: int) -> Optional[str]:
-        if modality in self._model_config.mm_placeholder_str_override:
-            return self._model_config.mm_placeholder_str_override[modality]
-
-        # TODO: Let user specify how to insert image tokens into prompt
-        # (similar to chat template)
-        hf_config = self._model_config.hf_config
-        model_type = hf_config.model_type
-
-        if modality in ("image", "image_embeds"):
-            if model_type == "chatglm":
-                return "<|begin_of_image|><|endoftext|><|end_of_image|>"
-            if model_type == "glm4v":
-                return "<|begin_of_image|><|image|><|end_of_image|>"
-            if model_type in ("phi3_v", "phi4mm"):
-                return f"<|image_{current_count}|>"
-            if model_type in ("minicpmo", "minicpmv"):
-                return "(<image>./</image>)"
-            if model_type in ("blip-2", "florence2", "fuyu", "paligemma",
-                              "pixtral", "mistral3"):
-                # These models do not use image tokens in the prompt
-                return None
-            if model_type == "qwen":
-                return f"Picture {current_count}: <img></img>"
-            if model_type.startswith("llava"):
-                return self._cached_token_str(self._tokenizer,
-                                              hf_config.image_token_index)
-
-            if model_type in ("aya_vision", "chameleon", "deepseek_vl_v2",
-                              "internvl_chat", "ovis", "skywork_chat",
-                              "NVLM_D", "h2ovl_chat", "idefics3", "smolvlm"):
-                return "<image>"
-            if model_type in ("mllama", "llama4"):
-                return "<|image|>"
-            if model_type in ("qwen2_vl", "qwen2_5_vl", "keye", "Keye"):
-                return "<|vision_start|><|image_pad|><|vision_end|>"
-            if model_type == "qwen2_5_omni":
-                return "<|vision_start|><|IMAGE|><|vision_end|>"
-            if model_type == "molmo":
-                return ""
-            if model_type == "aria":
-                return "<|fim_prefix|><|img|><|fim_suffix|>"
-            if model_type == "gemma3":
-                return "<start_of_image>"
-            if model_type == "kimi_vl":
-                return "<|media_start|>image<|media_content|><|media_pad|><|media_end|>"  # noqa: E501
-
-            raise TypeError(f"Unknown {modality} model type: {model_type}")
-        elif modality == "audio":
-            if model_type in ("ultravox", "granite_speech"):
-                return "<|audio|>"
-            if model_type == "phi4mm":
-                return f"<|audio_{current_count}|>"
-            if model_type in ("qwen2_audio", "qwen2_5_omni"):
-                return (f"Audio {current_count}: "
-                        f"<|audio_bos|><|AUDIO|><|audio_eos|>")
-            if model_type == "minicpmo":
-                return "(<audio>./</audio>)"
-            raise TypeError(f"Unknown model type: {model_type}")
-        elif modality == "video":
-            if model_type == "internvl_chat":
-                return "