[Core] Move multimodal placeholder from chat utils to model definition (#20355)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Cyrus Leung 2025-07-03 16:18:30 +08:00 committed by GitHub
parent cb97f2bfc5
commit b024a42e93
54 changed files with 396 additions and 155 deletions

View File

@ -10,6 +10,22 @@ This document walks you through the steps to extend a basic model so that it acc
It is assumed that you have already implemented the model in vLLM according to [these steps][new-model-basic].
Further update the model as follows:
- Implement [get_placeholder_str][vllm.model_executor.models.interfaces.SupportsMultiModal.get_placeholder_str] to define the placeholder string which is used to represent the multi-modal item in the text prompt. This should be consistent with the chat template of the model.
??? Code
```python
class YourModelForImage2Seq(nn.Module):
...
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<image>"
raise ValueError("Only image modality is supported")
```
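If your model accepts more than one modality, the same hook can branch on the modality prefix. The sketch below is modelled on the in-tree implementations added in this commit; the class name and placeholder strings are illustrative and should match your model's chat template:
```python
class YourModelForVideo2Seq(nn.Module):
    ...

    @classmethod
    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
        if modality.startswith("image"):
            return "<image>"
        if modality.startswith("video"):
            return "<video>"
        raise ValueError("Only image or video modality is supported")
```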
- Reserve a keyword parameter in [forward][torch.nn.Module.forward] for each input tensor that corresponds to a multi-modal input, as shown in the following example:
```diff

View File

@ -33,7 +33,6 @@ class RequestOutput:
class MockModelConfig:
use_async_output_proc = True
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
mm_placeholder_str_override: dict[str, str] = field(default_factory=dict)
class MockEngine:

View File

@ -263,26 +263,6 @@ def test_media_io_kwargs_parser(arg, expected):
assert args.media_io_kwargs == expected
@pytest.mark.parametrize(("arg", "expected"), [
(None, dict()),
('{"video":"<|video_placeholder|>"}', {
"video": "<|video_placeholder|>"
}),
('{"video":"<|video_placeholder|>", "image": "<|image_placeholder|>"}', {
"video": "<|video_placeholder|>",
"image": "<|image_placeholder|>"
}),
])
def test_mm_placeholder_str_override_parser(arg, expected):
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
if arg is None:
args = parser.parse_args([])
else:
args = parser.parse_args(["--mm-placeholder-str-override", arg])
assert args.mm_placeholder_str_override == expected
def test_compilation_config():
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())

View File

@ -41,7 +41,6 @@ class MockModelConfig:
encoder_config = None
generation_config: str = "auto"
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
mm_placeholder_str_override: dict[str, str] = field(default_factory=dict)
def get_diff_sampling_param(self):
return self.diff_sampling_param or {}

View File

@ -350,8 +350,6 @@ class ModelConfig:
"""Additional args passed to process media inputs, keyed by modalities.
For example, to set num_frames for video, set
`--media-io-kwargs '{"video": {"num_frames": 40} }'` """
mm_placeholder_str_override: dict[str, str] = field(default_factory=dict)
"""Optionally override placeholder string for given modalities."""
use_async_output_proc: bool = True
"""Whether to use async output processor."""
config_format: Union[str, ConfigFormat] = ConfigFormat.AUTO.value
@ -661,7 +659,7 @@ class ModelConfig:
return self._architecture
@property
def model_info(self) -> dict[str, Any]: def model_info(self):
return self._model_info
def maybe_pull_model_tokenizer_for_s3(self, model: str,
@ -701,7 +699,6 @@ class ModelConfig:
return MultiModalConfig(
limit_per_prompt=self.limit_mm_per_prompt,
media_io_kwargs=self.media_io_kwargs,
mm_placeholder_str_override=self.mm_placeholder_str_override,
mm_processor_kwargs=self.mm_processor_kwargs,
disable_mm_preprocessor_cache=self.
disable_mm_preprocessor_cache)
@ -3096,9 +3093,6 @@ class MultiModalConfig:
For example, to set num_frames for video, set
`--media-io-kwargs '{"video": {"num_frames": 40} }'` """
mm_placeholder_str_override: dict[str, str] = field(default_factory=dict)
"""Optionally override placeholder string for given modalities."""
mm_processor_kwargs: Optional[dict[str, object]] = None
"""
Overrides for the multi-modal processor obtained from

View File

@ -373,8 +373,6 @@ class EngineArgs:
media_io_kwargs: dict[str, dict[str,
Any]] = get_field(MultiModalConfig,
"media_io_kwargs")
mm_placeholder_str_override: dict[str, str] = \
get_field(MultiModalConfig, "mm_placeholder_str_override")
mm_processor_kwargs: Optional[Dict[str, Any]] = \
MultiModalConfig.mm_processor_kwargs
disable_mm_preprocessor_cache: bool = \
@ -759,9 +757,6 @@ class EngineArgs:
**multimodal_kwargs["limit_per_prompt"])
multimodal_group.add_argument("--media-io-kwargs",
**multimodal_kwargs["media_io_kwargs"])
multimodal_group.add_argument(
"--mm-placeholder-str-override",
**multimodal_kwargs["mm_placeholder_str_override"])
multimodal_group.add_argument(
"--mm-processor-kwargs",
**multimodal_kwargs["mm_processor_kwargs"])
@ -987,7 +982,6 @@ class EngineArgs:
served_model_name=self.served_model_name,
limit_mm_per_prompt=self.limit_mm_per_prompt,
media_io_kwargs=self.media_io_kwargs,
mm_placeholder_str_override=self.mm_placeholder_str_override,
use_async_output_proc=not self.disable_async_output_proc,
config_format=self.config_format,
mm_processor_kwargs=self.mm_processor_kwargs,

View File

@ -6,7 +6,7 @@ import json
from abc import ABC, abstractmethod
from collections import defaultdict, deque
from collections.abc import Awaitable, Iterable
from functools import cache, lru_cache, partial from functools import cached_property, lru_cache, partial
from pathlib import Path
from typing import (Any, Callable, Generic, Literal, Optional, TypeVar, Union,
cast)
@ -37,6 +37,8 @@ from typing_extensions import Required, TypeAlias, TypedDict
from vllm.config import ModelConfig
from vllm.logger import init_logger
from vllm.model_executor.model_loader import get_model_cls
from vllm.model_executor.models import SupportsMultiModal
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
from vllm.multimodal.utils import MediaConnector
# yapf: disable
@ -492,6 +494,10 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
def model_config(self) -> ModelConfig:
return self._model_config
@cached_property
def model_cls(self):
return get_model_cls(self.model_config)
@property
def allowed_local_media_path(self):
return self._model_config.allowed_local_media_path
@ -500,89 +506,6 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
def mm_registry(self):
return MULTIMODAL_REGISTRY
@staticmethod
@cache
def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str:
return tokenizer.decode(token_index)
def _placeholder_str(self, modality: ModalityStr,
current_count: int) -> Optional[str]:
if modality in self._model_config.mm_placeholder_str_override:
return self._model_config.mm_placeholder_str_override[modality]
# TODO: Let user specify how to insert image tokens into prompt
# (similar to chat template)
hf_config = self._model_config.hf_config
model_type = hf_config.model_type
if modality in ("image", "image_embeds"):
if model_type == "chatglm":
return "<|begin_of_image|><|endoftext|><|end_of_image|>"
if model_type == "glm4v":
return "<|begin_of_image|><|image|><|end_of_image|>"
if model_type in ("phi3_v", "phi4mm"):
return f"<|image_{current_count}|>"
if model_type in ("minicpmo", "minicpmv"):
return "(<image>./</image>)"
if model_type in ("blip-2", "florence2", "fuyu", "paligemma",
"pixtral", "mistral3"):
# These models do not use image tokens in the prompt
return None
if model_type == "qwen":
return f"Picture {current_count}: <img></img>"
if model_type.startswith("llava"):
return self._cached_token_str(self._tokenizer,
hf_config.image_token_index)
if model_type in ("aya_vision", "chameleon", "deepseek_vl_v2",
"internvl_chat", "ovis", "skywork_chat",
"NVLM_D", "h2ovl_chat", "idefics3", "smolvlm"):
return "<image>"
if model_type in ("mllama", "llama4"):
return "<|image|>"
if model_type in ("qwen2_vl", "qwen2_5_vl", "keye", "Keye"):
return "<|vision_start|><|image_pad|><|vision_end|>"
if model_type == "qwen2_5_omni":
return "<|vision_start|><|IMAGE|><|vision_end|>"
if model_type == "molmo":
return ""
if model_type == "aria":
return "<|fim_prefix|><|img|><|fim_suffix|>"
if model_type == "gemma3":
return "<start_of_image>"
if model_type == "kimi_vl":
return "<|media_start|>image<|media_content|><|media_pad|><|media_end|>" # noqa: E501
raise TypeError(f"Unknown {modality} model type: {model_type}")
elif modality == "audio":
if model_type in ("ultravox", "granite_speech"):
return "<|audio|>"
if model_type == "phi4mm":
return f"<|audio_{current_count}|>"
if model_type in ("qwen2_audio", "qwen2_5_omni"):
return (f"Audio {current_count}: "
f"<|audio_bos|><|AUDIO|><|audio_eos|>")
if model_type == "minicpmo":
return "(<audio>./</audio>)"
raise TypeError(f"Unknown model type: {model_type}")
elif modality == "video":
if model_type == "internvl_chat":
return "<video>"
if model_type == "glm4v":
return "<|begin_of_video|><|video|><|end_of_video|>"
if model_type in ("qwen2_vl", "qwen2_5_vl", "keye", "Keye"):
return "<|vision_start|><|video_pad|><|vision_end|>"
if model_type == "qwen2_5_omni":
return "<|vision_start|><|VIDEO|><|vision_end|>"
if model_type in ("minicpmo", "minicpmv"):
return "(<video>./</video>)"
if model_type.startswith("llava"):
return self._cached_token_str(self._tokenizer,
hf_config.video_token_index)
raise TypeError(f"Unknown {modality} model type: {model_type}")
else:
raise TypeError(f"Unknown modality: {modality}")
def add(self, modality: ModalityStr, item: _T) -> Optional[str]:
"""
Add a multi-modal item to the current prompt and returns the
@ -590,6 +513,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
"""
mm_registry = self.mm_registry
model_config = self.model_config
model_cls = cast(SupportsMultiModal, self.model_cls)
input_modality = modality.replace("_embeds", "")
@ -614,7 +538,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
self._items_by_modality[modality].append(item)
return self._placeholder_str(modality, current_count) return model_cls.get_placeholder_str(modality, current_count)
@abstractmethod
def create_parser(self) -> "BaseMultiModalContentParser":

View File

@ -5,6 +5,7 @@ import io
import math
import time
from collections.abc import AsyncGenerator
from functools import cached_property
from math import ceil
from typing import Callable, Literal, Optional, TypeVar, Union, cast
@ -24,7 +25,8 @@ from vllm.entrypoints.openai.serving_engine import (OpenAIServing,
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.inputs.data import PromptType
from vllm.logger import init_logger
from vllm.model_executor.model_loader.utils import get_model_architecture from vllm.model_executor.model_loader import get_model_cls
from vllm.model_executor.models import SupportsTranscription
from vllm.outputs import RequestOutput
from vllm.transformers_utils.processor import cached_get_processor
from vllm.utils import PlaceholderModule
@ -76,24 +78,29 @@ class OpenAISpeechToText(OpenAIServing):
self.model_sr = processor.feature_extractor.sampling_rate
self.hop_length = processor.feature_extractor.hop_length
self.task_type = task_type
self.model_cls, _ = get_model_architecture(model_config)
if self.default_sampling_params:
logger.info(
"Overwriting default completion sampling param with: %s",
self.default_sampling_params)
@cached_property
def model_cls(self):
return get_model_cls(self.model_config)
async def _preprocess_speech_to_text(
self,
request: SpeechToTextRequest,
audio_data: bytes,
) -> tuple[list[PromptType], float]:
model_cls = cast(SupportsTranscription, self.model_cls)
# Validate request
# TODO language should be optional and can be guessed.
# For now we default to en. See
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/generation_whisper.py#L1520
lang = request.language or "en"
self.model_cls.validate_language(lang) # type: ignore[attr-defined] model_cls.validate_language(lang)
if len(audio_data) / 1024**2 > MAX_AUDIO_CLIP_FILESIZE_MB:
raise ValueError("Maximum file size exceeded.")
@ -117,9 +124,8 @@ class OpenAISpeechToText(OpenAIServing):
},
},
"decoder_prompt":
self.model_cls. model_cls.get_decoder_prompt(lang, self.task_type,
get_decoder_prompt( # type: ignore[attr-defined] request.prompt)
lang, self.task_type, request.prompt)
}
prompts.append(cast(PromptType, prompt))
return prompts, duration

View File

@ -18,7 +18,7 @@ from vllm.model_executor.model_loader.sharded_state_loader import (
ShardedStateLoader)
from vllm.model_executor.model_loader.tensorizer_loader import TensorizerLoader
from vllm.model_executor.model_loader.utils import (
get_architecture_class_name, get_model_architecture) get_architecture_class_name, get_model_architecture, get_model_cls)
def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
@ -65,6 +65,7 @@ __all__ = [
"get_model_loader",
"get_architecture_class_name",
"get_model_architecture",
"get_model_cls",
"BaseModelLoader",
"BitsAndBytesModelLoader",
"GGUFModelLoader",

View File

@ -13,7 +13,7 @@ import time
from collections.abc import Generator
from dataclasses import dataclass
from functools import partial
from typing import Any, BinaryIO, Optional, Union from typing import TYPE_CHECKING, Any, BinaryIO, Optional, Union
import regex as re
import torch
@ -24,12 +24,14 @@ from transformers import PretrainedConfig
import vllm.envs as envs
from vllm.config import (ModelConfig, ParallelConfig, VllmConfig,
set_current_vllm_config)
from vllm.engine.arg_utils import EngineArgs
from vllm.logger import init_logger
from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding)
from vllm.utils import FlexibleArgumentParser, PlaceholderModule
if TYPE_CHECKING:
from vllm.engine.arg_utils import EngineArgs
try:
from tensorizer import (DecryptionParams, EncryptionParams,
TensorDeserializer, TensorSerializer)
@ -503,7 +505,7 @@ def serialize_vllm_model(
return model
def tensorize_vllm_model(engine_args: EngineArgs, def tensorize_vllm_model(engine_args: "EngineArgs",
tensorizer_config: TensorizerConfig,
generate_keyfile: bool = True):
"""Utility to load a model and then serialize it with Tensorizer

View File

@ -253,6 +253,10 @@ def get_model_architecture(
return model_cls, arch
def get_model_cls(model_config: ModelConfig) -> type[nn.Module]:
return get_model_architecture(model_config)[0]
def get_architecture_class_name(model_config: ModelConfig) -> str:
return get_model_architecture(model_config)[1]
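With this helper, callers can resolve the model class once and invoke the new class-level hooks without instantiating any weights. A minimal usage sketch, mirroring the chat-utils and speech-to-text changes in this commit (`model_config` stands in for an existing `ModelConfig`):
```python
from typing import cast

from vllm.model_executor.model_loader import get_model_cls
from vllm.model_executor.models import SupportsMultiModal

# Resolve the model class from the registry; no weights are loaded here.
model_cls = cast(SupportsMultiModal, get_model_cls(model_config))

# Ask the model definition itself for the placeholder of the first image item.
placeholder = model_cls.get_placeholder_str("image", 1)
```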

View File

@ -2,9 +2,9 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from .interfaces import (HasInnerState, SupportsLoRA, SupportsMultiModal,
SupportsPP, SupportsV0Only, has_inner_state, SupportsPP, SupportsTranscription, SupportsV0Only,
supports_lora, supports_multimodal, supports_pp, has_inner_state, supports_lora, supports_multimodal,
supports_v0_only) supports_pp, supports_transcription, supports_v0_only)
from .interfaces_base import (VllmModelForPooling, VllmModelForTextGeneration,
is_pooling_model, is_text_generation_model)
from .registry import ModelRegistry
@ -23,6 +23,8 @@ __all__ = [
"supports_multimodal",
"SupportsPP",
"supports_pp",
"SupportsTranscription",
"supports_transcription",
"SupportsV0Only",
"supports_v0_only",
]

View File

@ -499,6 +499,13 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
},
)
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<|fim_prefix|><|img|><|fim_suffix|>"
raise ValueError("Only image modality is supported")
def __init__(
self,
vllm_config: VllmConfig,

View File

@ -304,6 +304,13 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal,
"lm_head.": "language_model.lm_head.",
})
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<image>"
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config: AyaVisionConfig = vllm_config.model_config.hf_config

View File

@ -507,6 +507,13 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):
class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
SupportsQuant):
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return None
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()

View File

@ -933,6 +933,13 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
"gate_up_proj": ["gate_proj", "up_proj"]
}
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<image>"
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config

View File

@ -315,6 +315,13 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
"language.": "language_model.",
})
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<image>"
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config: DeepseekVLV2Config = vllm_config.model_config.hf_config

View File

@ -877,6 +877,13 @@ class Florence2MultiModalProcessor(
class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal,
SupportsV0Only):
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return None
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config

View File

@ -254,6 +254,13 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
"lm_head.": "language_model.lm_head.",
})
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return None
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config

View File

@ -483,6 +483,13 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
"lm_head.": "language_model.lm_head.",
})
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<start_of_image>"
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config

View File

@ -1257,6 +1257,15 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal,
"model.visual.": "visual.",
})
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<|begin_of_image|><|image|><|end_of_image|>"
if modality.startswith("video"):
return "<|begin_of_video|><|video|><|end_of_video|>"
raise ValueError("Only image or video modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config: Glm4vConfig = vllm_config.model_config.hf_config

View File

@ -540,6 +540,13 @@ class GLM4VForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
connector="transformer.vision.linear_proj",
tower_model="transformer.vision.transformer")
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<|begin_of_image|><|endoftext|><|end_of_image|>"
raise ValueError("Only image modality is supported")
def __init__(
self,
*,

View File

@ -533,6 +533,13 @@ class GraniteSpeechForConditionalGeneration(
],
}
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("audio"):
return "<|audio|>"
raise ValueError("Only audio modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str):
super().__init__()
config = vllm_config.model_config.hf_config

View File

@ -591,6 +591,13 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
],
}
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<image>"
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()

View File

@ -46,6 +46,13 @@ class SupportsMultiModal(Protocol):
MRO of your model class.
"""
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
"""
Get the placeholder text for the `i`th `modality` item in the prompt.
"""
...
def get_multimodal_embeddings(self,
**kwargs: object) -> MultiModalEmbeddings:
"""

View File

@ -1023,6 +1023,15 @@ class InternVLMultiModalProcessor(
class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
SupportsLoRA):
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<image>"
if modality.startswith("video"):
return "<video>"
raise ValueError("Only image or video modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
super().__init__()

View File

@ -1343,6 +1343,15 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA,
"model.": "language_model.model.",
})
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<|vision_start|><|image_pad|><|vision_end|>"
if modality.startswith("video"):
return "<|vision_start|><|video_pad|><|vision_end|>"
raise ValueError("Only image or video modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config: PretrainedConfig = vllm_config.model_config.hf_config

View File

@ -264,6 +264,13 @@ class KimiVLMultiModalProcessor(BaseMultiModalProcessor[KimiVLProcessingInfo]):
dummy_inputs=KimiVLDummyInputsBuilder)
class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal):
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<|media_start|>image<|media_content|><|media_pad|><|media_end|>"
raise ValueError("Only image modality is supported")
def __init__(
self,
vllm_config: VllmConfig,

View File

@ -511,6 +511,13 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
"lm_head.": "language_model.lm_head.",
})
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<image>"
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
super().__init__()

View File

@ -215,6 +215,13 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
"lm_head.": "language_model.lm_head.",
})
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<image>"
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
super().__init__()
config = vllm_config.model_config.hf_config

View File

@ -281,6 +281,15 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
"lm_head.": "language_model.lm_head.",
})
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<image>"
if modality.startswith("video"):
return "<video>"
raise ValueError("Only image or video modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
super().__init__()
config = vllm_config.model_config.hf_config

View File

@ -446,6 +446,15 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
"lm_head.": "language_model.lm_head.",
})
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<image>"
if modality.startswith("video"):
return "<video>"
raise ValueError("Only image or video modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
super().__init__()
config = vllm_config.model_config.hf_config

View File

@ -511,6 +511,17 @@ class MiniCPMO(MiniCPMV2_6):
],
}
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "(<image>./</image>)"
if modality.startswith("video"):
return "(<video>./</video>)"
if modality.startswith("audio"):
return "(<audio>./</audio>)"
raise ValueError("Only image, video or audio modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__(vllm_config=vllm_config, prefix=prefix)
self.apm = self.init_audio_module(vllm_config=vllm_config,

View File

@ -735,6 +735,15 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
instantiated.
"""
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "(<image>./</image>)"
if modality.startswith("video"):
return "(<video>./</video>)"
raise ValueError("Only image or video modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
config = vllm_config.model_config.hf_config
multimodal_config = vllm_config.model_config.multimodal_config

View File

@ -158,6 +158,13 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal,
"gate_up_proj": ["gate_proj", "up_proj"]
}
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<image>"
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
super().__init__()

View File

@ -401,6 +401,13 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
"lm_head.": "language_model.lm_head.",
})
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return None
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
super().__init__()

View File

@ -1276,6 +1276,13 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal,
},
)
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<|image|>"
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config: MllamaConfig = vllm_config.model_config.hf_config

View File

@ -719,6 +719,13 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
"qkv_proj": ["q_proj", "k_proj", "v_proj"],
}
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<|image|>"
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config

View File

@ -1366,6 +1366,13 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
"merged_linear": ["gate_proj", "up_proj"] # image_projector
}
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return None
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config

View File

@ -405,6 +405,13 @@ class OvisMultiModalProcessor(BaseMultiModalProcessor[OvisProcessingInfo]):
dummy_inputs=OvisDummyInputsBuilder)
class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<image>"
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config

View File

@ -240,6 +240,13 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
"lm_head.": "language_model.lm_head.",
})
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return None
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config

View File

@ -520,6 +520,13 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
"model.": "language_model.model.",
})
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return f"<|image_{i}|>"
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config

View File

@ -902,6 +902,15 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
},
)
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return f"<|image_{i}|>"
if modality.startswith("audio"):
return f"<|audio_{i}|>"
raise ValueError("Only image or audio modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config

View File

@ -327,6 +327,13 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]
class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
SupportsPP):
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return None
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config

View File

@ -118,6 +118,13 @@ class PrithviGeoSpatialMAE(nn.Module, IsAttentionFree, SupportsMultiModal,
SupportsV0Only):
""" Prithvi Masked Autoencoder"""
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return None
raise ValueError("Only image modality is supported")
def _instantiate_model(self, config: dict) -> Optional[nn.Module]:
# We might be able/need to support different tasks with this same model

View File

@ -717,6 +717,17 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
"thinker.": "",
})
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<|vision_start|><|IMAGE|><|vision_end|>"
if modality.startswith("video"):
return "<|vision_start|><|VIDEO|><|vision_end|>"
if modality.startswith("audio"):
return f"Audio {i}: <|audio_bos|><|AUDIO|><|audio_eos|>"
raise ValueError("Only image, video or audio modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
thinker_config: Qwen2_5OmniThinkerConfig = (

View File

@ -835,6 +835,15 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
"model.": "language_model.model.",
})
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<|vision_start|><|image_pad|><|vision_end|>"
if modality.startswith("video"):
return "<|vision_start|><|video_pad|><|vision_end|>"
raise ValueError("Only image or video modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config

View File

@ -251,6 +251,13 @@ class Qwen2AudioMultiModalProcessor(
class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
SupportsPP):
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("audio"):
return f"Audio {i}: <|audio_bos|><|AUDIO|><|audio_eos|>"
raise ValueError("Only audio modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config

View File

@ -1096,6 +1096,15 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
"model.": "language_model.model.",
})
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<|vision_start|><|image_pad|><|vision_end|>"
if modality.startswith("video"):
return "<|vision_start|><|video_pad|><|vision_end|>"
raise ValueError("Only image or video modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config: Qwen2VLConfig = vllm_config.model_config.hf_config

View File

@ -675,6 +675,13 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
connector="transformer.visual.attn_pool",
tower_model="transformer.visual.transformer")
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return f"Picture {i}: <img></img>"
raise ValueError("Only image modality is supported")
def __init__(
self,
*,

View File

@ -648,6 +648,13 @@ class SkyworkR1VProcessingInfo(BaseSkyworkR1VProcessingInfo):
dummy_inputs=SkyworkR1VDummyInputsBuilder)
class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<image>"
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
super().__init__()

View File

@ -393,6 +393,13 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal,
"gate_up_proj": ["gate_proj", "up_proj"]
}
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<image>"
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
super().__init__()
config: TarsierHfConfig = vllm_config.model_config.hf_config

View File

@ -407,6 +407,13 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={"audio_tower.model.encoder.": "audio_tower."})
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("audio"):
return "<|audio|>"
raise ValueError("Only audio modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config

View File

@ -761,6 +761,35 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription,
".fc2.": ".mlp.fc2."
})
@classmethod
def validate_language(cls, language: str) -> bool:
if language in ISO639_1_SUPPORTED_LANGS:
return True
elif language in ISO639_1_OTHER_LANGS:
logger.warning(
"The selected language %s has limited accuracy with"
" reported WER>=0.5. Results may be less accurate "
"for this choice.", language)
return True
else:
raise ValueError(f"Unsupported language: {language}."
"Language should be one of:" +
f" {list(ISO639_1_SUPPORTED_LANGS.values())}" +
f"or {list(ISO639_1_OTHER_LANGS.values())}")
@classmethod
def get_decoder_prompt(cls, language: str, task_type: str,
prompt: str) -> str:
return (f"<|startoftranscript|><|{language}|><|{task_type}|>"
f"<|notimestamps|>{prompt}")
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("audio"):
return None
raise ValueError("Only audio modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
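For reference, a small sketch of the decoder prompt the relocated helper builds, assuming English and the transcription task with an empty user prompt:
```python
prompt = WhisperForConditionalGeneration.get_decoder_prompt("en", "transcribe", "")
# -> "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
```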
@ -840,28 +869,6 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription,
weights = _create_fake_bias_for_k_proj(weights)
return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
@classmethod
def validate_language(cls, language: str) -> bool:
if language in ISO639_1_SUPPORTED_LANGS:
return True
elif language in ISO639_1_OTHER_LANGS:
logger.warning(
"The selected language %s has limited accuracy with"
" reported WER>=0.5. Results may be less accurate "
"for this choice.", language)
return True
else:
raise ValueError(f"Unsupported language: {language}."
"Language should be one of:" +
f" {list(ISO639_1_SUPPORTED_LANGS.values())}" +
f"or {list(ISO639_1_OTHER_LANGS.values())}")
@classmethod
def get_decoder_prompt(cls, language: str, task_type: str,
prompt: str) -> str:
return (f"<|startoftranscript|><|{language}|><|{task_type}|>"
f"<|notimestamps|>{prompt}")
def _create_fake_bias_for_k_proj(
weights: Iterable[tuple[str, torch.Tensor]]