[Refactor] Define MultiModalKwargsItems separate from MultiModalKwargs (#23053)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Cyrus Leung 2025-08-18 17:52:00 +08:00 committed by GitHub
parent 5c79b0d648
commit 27e8d1ea3e
77 changed files with 431 additions and 383 deletions
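In short, `MultiModalKwargsItem` and the new `MultiModalKwargsItems` container carry per-item, per-modality processor outputs, while the flat keyword-argument view that `MultiModalKwargs` used to provide directly is now obtained via `get_data()`. The sketch below is inferred from the call sites changed in this commit; the `MultiModalFieldElem` arguments and tensor shapes are illustrative rather than copied from the repository.

```python
import torch

from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargsItem,
                                    MultiModalKwargsItems,
                                    MultiModalSharedField)

# One field of one multimodal item (illustrative shape and field type).
elem = MultiModalFieldElem(
    modality="image",
    key="pixel_values",
    data=torch.zeros(1, 3, 336, 336),
    field=MultiModalSharedField(1),
)

item = MultiModalKwargsItem.from_elems([elem])   # one processed input item
items = MultiModalKwargsItems.from_seq([item])   # items grouped by modality

per_item = items["image"][0]                     # per-item, per-modality access
pixels = per_item["pixel_values"].data           # tensor for a single field
batched = items.get_data()                       # flat kwargs dict, the old
                                                 # MultiModalKwargs-style view
```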

View File

@@ -77,6 +77,7 @@ Internal data structures.
 - [vllm.multimodal.inputs.MultiModalFieldElem][]
 - [vllm.multimodal.inputs.MultiModalFieldConfig][]
 - [vllm.multimodal.inputs.MultiModalKwargsItem][]
+- [vllm.multimodal.inputs.MultiModalKwargsItems][]
 - [vllm.multimodal.inputs.MultiModalKwargs][]
 - [vllm.multimodal.inputs.MultiModalInputs][]

View File

@@ -629,7 +629,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index
@@ -778,7 +778,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_config = self.info.get_hf_config()
         bos_token_id = hf_config.bos_token_id

View File

@@ -370,10 +370,16 @@ def _assert_inputs_equal(
     if ignore_mm_keys is None:
         ignore_mm_keys = set()

-    assert "mm_kwargs" in a and "mm_kwargs" in b, msg
+    a_rest = {k: v for k, v in a.items() if k != "mm_kwargs"}
+    b_rest = {k: v for k, v in b.items() if k != "mm_kwargs"}
+    assert a_rest == b_rest, msg
+
+    a_data = a["mm_kwargs"].get_data()
+    b_data = b["mm_kwargs"].get_data()

     for key in ignore_mm_keys:
-        a["mm_kwargs"].pop(key, None)
-        b["mm_kwargs"].pop(key, None)
+        a_data.pop(key, None)
+        b_data.pop(key, None)

-    assert a == b, msg
+    assert a_data == b_data, msg

View File

@@ -45,7 +45,8 @@ def test_processor_override(
     video_token_id = tokenizer.convert_tokens_to_ids(hf_processor.video_token)
     video_tok_count = processed_inputs["prompt_token_ids"].count(
         video_token_id)
-    grid_t, _, _ = processed_inputs["mm_kwargs"]["video_grid_thw"][0]
+    grid_t, _, _ = processed_inputs["mm_kwargs"].get_data(
+    )["video_grid_thw"][0]

     assert grid_t == expected_grid_t
     assert video_tok_count == expected_toks_per_frame * grid_t

View File

@@ -108,7 +108,8 @@ def _run_check(
     # Ensure we have the right number of placeholders per num_crops size
     image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
     img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
+    pixel_shape = processed_inputs["mm_kwargs"].get_data(
+    )["pixel_values_flat"].shape

     assert img_tok_count == 256 * total_expected_num_patches
     assert pixel_shape[0] == total_expected_num_patches

View File

@@ -68,7 +68,8 @@ def _run_check(
     # Ensure we have the right number of placeholders per num_crops size
    image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
     img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
+    pixel_shape = processed_inputs["mm_kwargs"].get_data(
+    )["pixel_values_flat"].shape

     assert img_tok_count == 256 * total_expected_num_patches
     assert pixel_shape[0] == total_expected_num_patches

View File

@@ -51,14 +51,14 @@ def test_processor_override(
     prompt = encode_tokens(tokenizer, prompt)
     processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
-    mm_kwargs = processed_inputs["mm_kwargs"]
+    mm_data = processed_inputs["mm_kwargs"].get_data()

     # place holder replacements
     prompt_token_ids = processed_inputs["prompt_token_ids"]
     assert prompt_token_ids.count(config.boi_token_index) == num_imgs
     assert prompt_token_ids.count(config.eoi_token_index) == num_imgs
     assert prompt_token_ids.count(vocab[hf_processor.image_token]) == num_imgs

-    aspect_ratios = mm_kwargs["aspect_ratios"]
+    aspect_ratios = mm_data["aspect_ratios"]
     num_x_separators = num_y_separators = 0
     for tiles_y, tiles_x in aspect_ratios:
         if tiles_x * tiles_y > 1:
@@ -80,6 +80,6 @@ def test_processor_override(
     num_patches_per_chunk = processor.info.get_patch_per_chunk(
         config.vision_config)
     assert prompt_token_ids.count(config.image_token_index) \
-        == mm_kwargs["patches_per_image"].sum() * num_patches_per_chunk
-    assert mm_kwargs["pixel_values"].shape[0] \
-        == mm_kwargs["patches_per_image"].sum()
+        == sum(mm_data["patches_per_image"]) * num_patches_per_chunk
+    assert len(mm_data["pixel_values"]) \
+        == sum(mm_data["patches_per_image"])

View File

@@ -49,18 +49,18 @@ def test_profiling(
     encoder_seq_lens = [len(dummy_encoder_data.prompt_token_ids)
                         ] * max_num_seqs
-    mm_kwargs = processor.apply(
+    mm_data = processor.apply(
         prompt=dummy_mm_data.prompt,
         mm_data=dummy_mm_data.mm_data,
         hf_processor_mm_kwargs=dict(),
-    )["mm_kwargs"]
+    )["mm_kwargs"].get_data()

     # Get the actual number of encoder tokens for each sample.
     # Because attn_metadata.encoder_seq_lens only counts the last
     # group of images for each sample, which is used to cheat the
     # block manager to allocate blocks for those images only.
     # See MllamaMultiModalProcessor for more details.
-    num_tiles = [[t] for t in mm_kwargs.pop("num_tiles")]
+    num_tiles = [[t] for t in mm_data.pop("num_tiles")]
     num_tokens_per_tile = calc_token_per_chunk(image_size)
     actual_encoder_seq_lens = [
         sum(num_tile) * num_tokens_per_tile for num_tile in num_tiles

View File

@@ -38,21 +38,21 @@ def test_profiling(model_id: str, max_model_len: int):
     hf_config = ctx.get_hf_config(Llama4Config)

-    mm_kwargs = processor.apply(
+    mm_data = processor.apply(
         prompt=dummy_mm_data.prompt,
         mm_data=dummy_mm_data.mm_data,
         hf_processor_mm_kwargs=dict(),
-    )["mm_kwargs"]
+    )["mm_kwargs"].get_data()

     image_size = hf_config.vision_config.image_size
     patch_size = hf_config.vision_config.patch_size
     downsample_ratio = int(
         round(1.0 / (hf_config.vision_config.pixel_shuffle_ratio**2)))
     tokens_per_patch = ((image_size // patch_size)**2) // downsample_ratio
-    chunks_per_image = prod(mm_kwargs["patches_per_image"])
+    chunks_per_image = prod(mm_data["patches_per_image"])
     total_num_patches = chunks_per_image * tokens_per_patch
-    num_tiles = mm_kwargs["aspect_ratios"][0][0] * mm_kwargs["aspect_ratios"][
-        0][1]  # x-y seperator tokens
+    num_tiles = mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][
+        1]  # x-y seperator tokens
     total_tokens = total_num_patches.item() + num_tiles.item(
     ) + 3  # image start, image, image end

View File

@@ -70,7 +70,8 @@ def _run_check(
     # Ensure we have the right number of placeholders per num_crops size
     image_token_id = tokenizer.convert_tokens_to_ids("<image>")
     img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
+    pixel_shape = processed_inputs["mm_kwargs"].get_data(
+    )["pixel_values_flat"].shape
     print("Image token count:", img_tok_count, "Pixel shape:", pixel_shape)

     assert img_tok_count == 256 * total_expected_num_patches
     assert pixel_shape[0] == total_expected_num_patches

View File

@@ -48,7 +48,8 @@ def test_processor_override(
     hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
     image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token)
     img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape
+    pixel_shape = processed_inputs["mm_kwargs"].get_data(
+    )["pixel_values"].shape

     assert img_tok_count == expected_toks_per_img * num_imgs
     assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs

View File

@@ -128,7 +128,7 @@ def create_batched_mm_kwargs(
     )["mm_kwargs"]
     items = [
         item for modality in supported_mm_limits
-        for item in mm_kwargs.get_items(modality)
+        for item in mm_kwargs[modality]
     ]

     return group_mm_kwargs_by_modality(items)

View File

@@ -4,8 +4,8 @@ import pytest
 import torch

 from vllm.multimodal.cache import MultiModalCache, MultiModalCacheItemMetadata
-from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs,
-                                    MultiModalKwargsItem,
+from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargsItem,
+                                    MultiModalKwargsItems,
                                     MultiModalSharedField)
@@ -24,8 +24,8 @@ def _dummy_item(modality: str, size_by_key: dict[str, int]):
     ])


-def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]):
-    return MultiModalKwargs([
+def _dummy_items(size_by_key_modality: dict[str, dict[str, int]]):
+    return MultiModalKwargsItems.from_seq([
         _dummy_item(modality, size_by_key)
         for modality, size_by_key in size_by_key_modality.items()
     ])
@@ -37,7 +37,8 @@ def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]):
     [
         (_dummy_item("a", {"a1": 100}), 100),
         (_dummy_item("a", {"a1": 100, "a2": 110}), 210),
-        (_dummy_kw({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460),  # noqa: E501
+        (_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460),  # noqa: E501
+        (_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}).get_data(), 460),  # noqa: E501
     ],
 )
 # yapf: enable

View File

@@ -11,7 +11,8 @@ import torch
 from vllm.multimodal.inputs import (MultiModalBatchedField,
                                     MultiModalFieldElem, MultiModalFlatField,
-                                    MultiModalKwargs, MultiModalKwargsItem,
+                                    MultiModalKwargsItem,
+                                    MultiModalKwargsItems,
                                     MultiModalSharedField, NestedTensors)
 from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
@@ -96,7 +97,7 @@ def test_encode_decode(monkeypatch: pytest.MonkeyPatch):

 class MyRequest(msgspec.Struct):
-    mm: Optional[list[MultiModalKwargs]]
+    mm: Optional[list[MultiModalKwargsItems]]


 def test_multimodal_kwargs():
@@ -119,7 +120,7 @@ def test_multimodal_kwargs():
     audio = MultiModalKwargsItem.from_elems([e1])
     video = MultiModalKwargsItem.from_elems([e2])
     image = MultiModalKwargsItem.from_elems([e3, e4])
-    mm = MultiModalKwargs([audio, video, image])
+    mm = MultiModalKwargsItems.from_seq([audio, video, image])

     # pack mm kwargs into a mock request so that it can be decoded properly
     req = MyRequest([mm])
@@ -133,19 +134,22 @@ def test_multimodal_kwargs():
     total_len = sum(memoryview(x).cast("B").nbytes for x in encoded)

-    # expected total encoding length, should be 14255, +-20 for minor changes
-    assert 14250 <= total_len <= 14300
+    # expected total encoding length, should be 14306, +-20 for minor changes
+    assert 14275 <= total_len <= 14325

-    decoded: MultiModalKwargs = decoder.decode(encoded).mm[0]
+    decoded = decoder.decode(encoded).mm[0]
+    assert isinstance(decoded, MultiModalKwargsItems)

     # check all modalities were recovered and do some basic sanity checks
-    assert len(decoded.modalities) == 3
-    images = decoded.get_items("image")
+    assert len(decoded) == 3
+    images = decoded["image"]
     assert len(images) == 1
     assert len(images[0].items()) == 2
     assert list(images[0].keys()) == ["i0", "i1"]

     # check the tensor contents and layout in the main dict
-    assert all(nested_equal(mm[k], decoded[k]) for k in mm)
+    mm_data = mm.get_data()
+    decoded_data = decoded.get_data()
+    assert all(nested_equal(mm_data[k], decoded_data[k]) for k in mm_data)


 def nested_equal(a: NestedTensors, b: NestedTensors):

View File

@@ -4,11 +4,12 @@
 from array import array
 from typing import Any, Type

+from vllm.multimodal.inputs import MultiModalKwargs
 from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE


 def encode_hook(obj: Any) -> Any:
-    """Custom msgspec enc hook that supports array types.
+    """Custom msgspec enc hook that supports array types and MultiModalKwargs.

     See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder
     """
@@ -17,10 +18,12 @@ def encode_hook(obj: Any) -> Any:
                 f"vLLM array type should use '{VLLM_TOKEN_ID_ARRAY_TYPE}' type. "
                 f"Given array has a type code of {obj.typecode}.")
         return obj.tobytes()
+    if isinstance(obj, MultiModalKwargs):
+        return dict(obj)


 def decode_hook(type: Type, obj: Any) -> Any:
-    """Custom msgspec dec hook that supports array types.
+    """Custom msgspec dec hook that supports array types and MultiModalKwargs.

     See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder
     """
@@ -28,3 +31,5 @@ def decode_hook(type: Type, obj: Any) -> Any:
         deserialized = array(VLLM_TOKEN_ID_ARRAY_TYPE)
         deserialized.frombytes(obj)
         return deserialized
+    if type is MultiModalKwargs:
+        return MultiModalKwargs(obj)

View File

@@ -22,7 +22,7 @@ from vllm.model_executor.model_loader.weight_utils import (
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo, PromptReplacement,
@@ -470,7 +470,7 @@ class AriaMultiModalProcessor(BaseMultiModalProcessor[AriaProcessingInfo]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index

View File

@@ -18,7 +18,7 @@ from transformers.models.got_ocr2.image_processing_got_ocr2 import (
 from vllm.config import VllmConfig
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargs
+from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems
 from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
                                    MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -242,7 +242,7 @@ class AyaVisionMultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         image_token = hf_processor.image_token

View File

@@ -15,7 +15,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo, PromptIndexTargets,
@@ -492,7 +492,7 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         tokenizer = self.info.get_tokenizer()
         vocab = tokenizer.get_vocab()

View File

@@ -31,7 +31,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo, PromptReplacement,
@@ -151,7 +151,7 @@ class ChameleonMultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         tokenizer = self.info.get_tokenizer()

View File

@@ -21,7 +21,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.quantization.awq import AWQConfig
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargs
+from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems
 from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
                                    MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -241,7 +241,7 @@ class Cohere2VisionMultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         image_token = hf_processor.image_token

View File

@@ -21,7 +21,7 @@ from vllm.model_executor.model_loader.utils import set_default_torch_dtype
 from vllm.model_executor.models.transformers import replace_linear_class
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs, NestedTensors)
+                                    MultiModalKwargsItems, NestedTensors)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize, MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -252,7 +252,7 @@ class DeepseekVL2MultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
@@ -291,7 +291,8 @@ class DeepseekVL2MultiModalProcessor(
         tokenization_kwargs: Mapping[str, object],
         *,
         return_mm_hashes: bool,
-    ) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]:
+    ) -> tuple[list[int], MultiModalKwargsItems, Optional[MultiModalHashes],
+               bool]:
         # The processor logic is different for len(images) <= 2 vs > 2
         # Since the processing cache assumes that the processor output is
         # invariant of how many images are passed per prompt, we only

View File

@@ -21,7 +21,7 @@ from vllm.model_executor.models.bart import (BartDecoder, BartEncoder,
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseProcessingInfo,
                                         EncDecMultiModalProcessor,
@@ -860,7 +860,7 @@ class Florence2MultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_config = self.info.get_hf_config()
         pad_token_id = hf_config.pad_token_id

View File

@@ -32,7 +32,7 @@ from vllm.model_executor.models.persimmon import PersimmonForCausalLM
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
                                    MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -226,7 +226,7 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_config = self.info.get_hf_config()
         bos_token_id = hf_config.bos_token_id

View File

@@ -17,7 +17,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
                                    MultiModalDataItems)
 # yapf: disable
@@ -311,7 +311,7 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, Any],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         image_token = hf_processor.boi_token

View File

@@ -24,7 +24,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import (ImageProcessorItems, MultiModalDataItems,
                                    MultiModalDataParser)
 # yapf: disable
@@ -209,7 +209,7 @@ class Gemma3nMultiModalProcessor(BaseMultiModalProcessor[Gemma3nProcessingInfo]
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, Any],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)

View File

@@ -59,7 +59,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs, VideoItem)
+                                    MultiModalKwargsItems, VideoItem)
 from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
                                    MultiModalDataParser)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -1158,7 +1158,7 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, Any],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         image_processor = self.info.get_image_processor(
@@ -1175,14 +1175,16 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
         merge_length = image_processor.merge_size**2

         def get_image_replacement_glm4v(item_idx: int):
-            grid_thw = out_mm_kwargs["image_grid_thw"][item_idx]
+            out_item = out_mm_kwargs["image"][item_idx]
+            grid_thw = out_item["image_grid_thw"].data
             assert isinstance(grid_thw, torch.Tensor)

             num_tokens = int(grid_thw.prod()) // merge_length
             return [hf_processor.image_token_id] * num_tokens

         def get_video_replacement_glm4v(item_idx: int):
-            grid_thw = out_mm_kwargs["video_grid_thw"][item_idx]
+            out_item = out_mm_kwargs["video"][item_idx]
+            grid_thw = out_item["video_grid_thw"].data
             assert isinstance(grid_thw, torch.Tensor)

             video, metadata = mm_items["video"][item_idx]

View File

@@ -30,7 +30,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo, PromptReplacement,
@@ -503,7 +503,7 @@ class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_config = self.info.get_hf_config()

View File

@@ -40,7 +40,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems,
                                    MultiModalDataParser)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -118,7 +118,7 @@ class GraniteSpeechMultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> list[PromptUpdate]:
         processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         tokenizer = self.info.get_tokenizer()

View File

@@ -17,7 +17,7 @@ from transformers import PretrainedConfig
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalKwargs
+from vllm.multimodal.inputs import MultiModalKwargsItems
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    MultiModalDataItems)
 from vllm.multimodal.processing import (MultiModalHashes, PromptReplacement,
@@ -425,18 +425,19 @@ class H2OVLMultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)

-        if "image_num_patches" in out_mm_kwargs:
-            image_num_patches = out_mm_kwargs["image_num_patches"]
+        out_mm_data = out_mm_kwargs.get_data()
+        if "image_num_patches" in out_mm_data:
+            image_num_patches = out_mm_data["image_num_patches"]
             assert isinstance(image_num_patches, torch.Tensor)
             image_num_patches = image_num_patches.tolist()
-        elif "image_embeds" in out_mm_kwargs:
+        elif "image_embeds" in out_mm_data:
             # TODO: Use image size information in dictionary embedding inputs
             # to compute num_patches (similar to Qwen2-VL)
-            image_num_patches = [None] * len(out_mm_kwargs["image_embeds"])
+            image_num_patches = [None] * len(out_mm_data["image_embeds"])
         else:
             image_num_patches = []
@@ -479,7 +480,8 @@ class H2OVLMultiModalProcessor(
         tokenization_kwargs: Mapping[str, object],
         *,
         return_mm_hashes: bool,
-    ) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]:
+    ) -> tuple[list[int], MultiModalKwargsItems, Optional[MultiModalHashes],
+               bool]:
         # The processor logic is different for len(images) <= 1 vs > 1
         # Since the processing cache assumes that the processor output is
         # invariant of how many images are passed per prompt, we only

View File

@@ -34,7 +34,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import ImageSize, MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo, ProcessingCache,
@@ -295,7 +295,7 @@ class HCXVisionMultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_config = self.info.get_hf_config()
         placeholder = {
@@ -306,21 +306,22 @@ class HCXVisionMultiModalProcessor(
         def get_replacement_hyperclovax(
             item_idx: int,
             modality: str,
-            out_mm_kwargs: MultiModalKwargs,
+            out_mm_kwargs: MultiModalKwargsItems,
         ):
-            num_tokens = None
+            out_item = out_mm_kwargs[modality][item_idx]
             if modality == "image":
+                lens = out_item["vision_query_lengths_images"].data
                 num_tokens = self.info.get_num_image_tokens(
-                    vision_query_length=out_mm_kwargs[
-                        "vision_query_lengths_images"][item_idx], )
-            if modality == "video":
+                    vision_query_length=lens)
+            elif modality == "video":
+                lens = out_item["vision_query_lengths_videos"].data
                 num_tokens = self.info.get_num_video_tokens(
-                    vision_query_length=out_mm_kwargs[
-                        "vision_query_lengths_videos"][item_idx], )
-            assert isinstance(num_tokens, int)
-            return [
-                placeholder[modality],
-            ] * num_tokens
+                    vision_query_length=lens)
+            else:
+                raise NotImplementedError(modality)
+
+            return [placeholder[modality]] * num_tokens

         return [
             PromptReplacement(

View File

@@ -34,7 +34,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import ImageProcessorItems, ImageSize
 # yapf conflicts with isort for this block
 # yapf: disable
@@ -374,7 +374,7 @@ class Idefics3MultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         image_token, _, _ = self.info._get_image_token(hf_processor)

View File

@@ -24,7 +24,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs, NestedTensors)
+                                    MultiModalKwargsItems, NestedTensors)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize, MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -399,7 +399,7 @@ class InternS1MultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         img_context_token = hf_processor.image_token
@@ -407,15 +407,16 @@ class InternS1MultiModalProcessor(
         end_image_token = hf_processor.end_image_token
         video_token = hf_processor.video_token

-        if "video_num_patches" in out_mm_kwargs:
-            video_num_patches = out_mm_kwargs["video_num_patches"]
+        out_mm_data = out_mm_kwargs.get_data()
+        if "video_num_patches" in out_mm_data:
+            video_num_patches = out_mm_data["video_num_patches"]
             assert isinstance(video_num_patches, torch.Tensor)
             video_num_patches = video_num_patches.tolist()
         else:
             video_num_patches = []

-        if "image_num_patches" in out_mm_kwargs:
-            image_num_patches = out_mm_kwargs["image_num_patches"]
+        if "image_num_patches" in out_mm_data:
+            image_num_patches = out_mm_data["image_num_patches"]
             assert isinstance(image_num_patches, torch.Tensor)
             image_num_patches = image_num_patches.tolist()
         else:

View File

@@ -28,7 +28,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import convert_image_mode
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs, NestedTensors)
+                                    MultiModalKwargsItems, NestedTensors)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize, MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -797,18 +797,19 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)

-        if "image_num_patches" in out_mm_kwargs:
-            image_num_patches = out_mm_kwargs["image_num_patches"]
+        out_mm_data = out_mm_kwargs.get_data()
+        if "image_num_patches" in out_mm_data:
+            image_num_patches = out_mm_data["image_num_patches"]
             assert isinstance(image_num_patches, torch.Tensor)
             image_num_patches = image_num_patches.tolist()
-        elif "image_embeds" in out_mm_kwargs:
+        elif "image_embeds" in out_mm_data:
             # TODO: Use image size information in dictionary embedding inputs
             # to compute num_patches (similar to Qwen2-VL)
-            image_num_patches = [None] * len(out_mm_kwargs["image_embeds"])
+            image_num_patches = [None] * len(out_mm_data["image_embeds"])
         else:
             image_num_patches = []
@@ -966,15 +967,19 @@ class InternVLMultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
-        prompt_repl: list[PromptUpdate] = super()._get_prompt_updates(
-            mm_items, hf_processor_mm_kwargs, out_mm_kwargs)
+        prompt_repl = super()._get_prompt_updates(
+            mm_items=mm_items,
+            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+            out_mm_kwargs=out_mm_kwargs,
+        )

         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)

-        if "video_num_patches" in out_mm_kwargs:
-            video_num_patches = out_mm_kwargs["video_num_patches"]
+        out_mm_data = out_mm_kwargs.get_data()
+        if "video_num_patches" in out_mm_data:
+            video_num_patches = out_mm_data["video_num_patches"]
             assert isinstance(video_num_patches, torch.Tensor)
             video_num_patches = video_num_patches.tolist()
         else:
@@ -992,12 +997,15 @@ class InternVLMultiModalProcessor(
                 video_context_token=hf_processor.video_token)

         if self.info.supports_video:
-            prompt_repl.append(
+            prompt_repl = [
+                *prompt_repl,
                 PromptReplacement(
                     modality="video",
                     target="<video>",
                     replacement=get_video_replacement_internvl,
-                ))
+                )
+            ]

         return prompt_repl

View File

@@ -33,7 +33,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors
 from vllm.multimodal.inputs import (ImageItem, ModalityData,
                                     MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs, VideoItem)
+                                    MultiModalKwargsItems, VideoItem)
 from vllm.multimodal.parse import (DictEmbeddingItems, ImageSize,
                                    ModalityDataItems, MultiModalDataItems,
                                    MultiModalDataParser)
@@ -1192,7 +1192,7 @@ class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, Any],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         image_processor = self.info.get_image_processor(
@@ -1208,7 +1208,8 @@ class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]):
         merge_length = image_processor.merge_size**2

         def get_replacement_keye(item_idx: int, modality: str):
-            grid_thw = out_mm_kwargs[f"{modality}_grid_thw"][item_idx]
+            out_item = out_mm_kwargs[modality][item_idx]
+            grid_thw = out_item[f"{modality}_grid_thw"].data
             assert isinstance(grid_thw, torch.Tensor)

             num_tokens = int(grid_thw.prod()) // merge_length

View File

@@ -69,7 +69,7 @@ from vllm.model_executor.models.utils import merge_multimodal_embeddings
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs, NestedTensors)
+                                    MultiModalKwargsItems, NestedTensors)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -239,7 +239,7 @@ class KimiVLMultiModalProcessor(BaseMultiModalProcessor[KimiVLProcessingInfo]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, Any],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         image_token_id = self.info.image_token_id

View File

@@ -23,7 +23,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputs, MultiModalKwargs)
+                                    MultiModalInputs, MultiModalKwargsItems)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize, MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -250,7 +250,7 @@ class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor[_I]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index
@@ -343,7 +343,7 @@ class PixtralHFMultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         hf_config = self.info.get_hf_config()

View File

@@ -16,7 +16,7 @@ from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
                                    VideoEmbeddingItems, VideoProcessorItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -185,7 +185,7 @@ class LlavaNextVideoMultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_config = self.info.get_hf_config()
         video_token_id = hf_config.video_token_index

View File

@ -18,7 +18,7 @@ from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs) MultiModalKwargsItems)
from vllm.multimodal.parse import (ImageSize, MultiModalDataItems, from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
VideoEmbeddingItems, VideoProcessorItems) VideoEmbeddingItems, VideoProcessorItems)
from vllm.multimodal.processing import PromptReplacement, PromptUpdate from vllm.multimodal.processing import PromptReplacement, PromptUpdate
@ -372,7 +372,7 @@ class LlavaOnevisionMultiModalProcessor(
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
image_repls = super()._get_prompt_updates( image_repls = super()._get_prompt_updates(
mm_items=mm_items, mm_items=mm_items,

View File

@ -40,7 +40,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.quantization.gptq import GPTQConfig from vllm.model_executor.layers.quantization.gptq import GPTQConfig
from vllm.model_executor.layers.quantization.gptq_marlin import ( from vllm.model_executor.layers.quantization.gptq_marlin import (
GPTQMarlinConfig) GPTQMarlinConfig)
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
NestedTensors) NestedTensors)
from vllm.multimodal.parse import (AudioItem, AudioProcessorItems, from vllm.multimodal.parse import (AudioItem, AudioProcessorItems,
@ -316,7 +316,7 @@ class MiniCPMOMultiModalProcessor(
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
base_updates = super()._get_prompt_updates( base_updates = super()._get_prompt_updates(
mm_items=mm_items, mm_items=mm_items,

View File

@ -48,7 +48,7 @@ from vllm.model_executor.models.minicpm import MiniCPMForCausalLM
from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
NestedTensors) NestedTensors)
from vllm.multimodal.parse import (DictEmbeddingItems, ImageItem, from vllm.multimodal.parse import (DictEmbeddingItems, ImageItem,
@ -694,7 +694,7 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
placeholders = [("image", self.info.image_pattern), placeholders = [("image", self.info.image_pattern),
("video", self.info.video_pattern)] ("video", self.info.video_pattern)]

View File

@ -23,7 +23,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs) MultiModalKwargsItems)
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
MultiModalDataItems) MultiModalDataItems)
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
@ -265,7 +265,7 @@ class Mistral3MultiModalProcessor(
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
hf_config = self.info.get_hf_config() hf_config = self.info.get_hf_config()

View File

@ -56,7 +56,8 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs,
MultiModalFieldConfig, MultiModalKwargs) MultiModalFieldConfig,
MultiModalKwargsItems)
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
MultiModalDataItems) MultiModalDataItems)
from vllm.multimodal.processing import (BaseProcessingInfo, from vllm.multimodal.processing import (BaseProcessingInfo,
@ -217,7 +218,7 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]
# Set encoder prompt length based on the number of tiles. # Set encoder prompt length based on the number of tiles.
# This tells the block manager to allocate correct number # This tells the block manager to allocate correct number
# of slots for encoder tokens. # of slots for encoder tokens.
num_tiles = mm_inputs["mm_kwargs"]["num_tiles"] num_tiles = mm_inputs["mm_kwargs"].get_data()["num_tiles"]
decode_tiles = num_tiles[num_encode_images:num_images].sum().item() decode_tiles = num_tiles[num_encode_images:num_images].sum().item()
num_tokens = decode_tiles * token_per_chunk num_tokens = decode_tiles * token_per_chunk
mm_inputs["encoder_prompt_token_ids"] = [image_token_id mm_inputs["encoder_prompt_token_ids"] = [image_token_id
@ -302,7 +303,7 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
token_per_chunk = self.info.get_token_per_chunk_from_config() token_per_chunk = self.info.get_token_per_chunk_from_config()
image_token_id = self.info.get_hf_config().image_token_index image_token_id = self.info.get_hf_config().image_token_index
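A minimal sketch (not part of the commit) of the access pattern used in the Mllama hunk above: mm_kwargs is now a MultiModalKwargsItems, so the batched tensor view has to be materialized explicitly with get_data(). The helper name is invented here; the num_tiles key and the slicing mirror the hunk.

from vllm.multimodal.inputs import MultiModalKwargsItems

def total_decode_tiles(mm_kwargs: MultiModalKwargsItems,
                       num_encode_images: int,
                       num_images: int) -> int:
    # get_data() reduces the per-item field elements into the batched
    # keyword arguments that are eventually fed to the model's forward().
    num_tiles = mm_kwargs.get_data()["num_tiles"]
    return int(num_tiles[num_encode_images:num_images].sum().item())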

View File

@ -44,7 +44,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs, NestedTensors) MultiModalKwargsItems, NestedTensors)
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
MultiModalDataItems) MultiModalDataItems)
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
@ -646,13 +646,8 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo]
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> list[PromptUpdate]: ) -> list[PromptUpdate]:
assert (
mm_items.get_count("image", strict=False) == 0
or "aspect_ratios" in out_mm_kwargs
), "Transformers expect to include aspect_ratios in out_mm_kwargs"
config = self.info.get_hf_config() config = self.info.get_hf_config()
vision_config = config.vision_config vision_config = config.vision_config
@ -662,7 +657,8 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo]
img_patch_token = hf_processor.img_patch_token img_patch_token = hf_processor.img_patch_token
def get_replacement(item_idx: int): def get_replacement(item_idx: int):
aspect_ratio = out_mm_kwargs["aspect_ratios"][item_idx] out_item = out_mm_kwargs["image"][item_idx]
aspect_ratio = out_item["aspect_ratios"].data
repl = hf_processor._prompt_split_image( repl = hf_processor._prompt_split_image(
aspect_ratio=aspect_ratio, aspect_ratio=aspect_ratio,
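A short sketch, assuming out_mm_kwargs is the MultiModalKwargsItems passed to _get_prompt_updates, of the per-item indexing this hunk switches to. The aspect_ratios field name comes from the Mllama4 processor above; the helper function is hypothetical.

from vllm.multimodal.inputs import MultiModalKwargsItems

def aspect_ratio_of(out_mm_kwargs: MultiModalKwargsItems, item_idx: int):
    # Index by modality, then by item, then by field name; each field is a
    # MultiModalFieldElem whose .data attribute holds the underlying value.
    out_item = out_mm_kwargs["image"][item_idx]
    return out_item["aspect_ratios"].data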

View File

@ -42,7 +42,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs) MultiModalKwargsItems)
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
MultiModalDataItems) MultiModalDataItems)
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
@ -1282,7 +1282,7 @@ class MolmoMultiModalProcessor(BaseMultiModalProcessor[MolmoProcessingInfo]):
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)

View File

@ -16,7 +16,7 @@ from transformers import PretrainedConfig
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargs from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
MultiModalDataItems) MultiModalDataItems)
from vllm.multimodal.processing import (PromptReplacement, PromptUpdate, from vllm.multimodal.processing import (PromptReplacement, PromptUpdate,
@ -106,18 +106,19 @@ class NVLMMultiModalProcessor(
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
if "image_num_patches" in out_mm_kwargs: out_mm_data = out_mm_kwargs.get_data()
image_num_patches = out_mm_kwargs["image_num_patches"] if "image_num_patches" in out_mm_data:
image_num_patches = out_mm_data["image_num_patches"]
assert isinstance(image_num_patches, torch.Tensor) assert isinstance(image_num_patches, torch.Tensor)
image_num_patches = image_num_patches.tolist() image_num_patches = image_num_patches.tolist()
elif "image_embeds" in out_mm_kwargs: elif "image_embeds" in out_mm_data:
# TODO: Use image size information in dictionary embedding inputs # TODO: Use image size information in dictionary embedding inputs
# to compute num_patches (similar to Qwen2-VL) # to compute num_patches (similar to Qwen2-VL)
image_num_patches = [None] * len(out_mm_kwargs["image_embeds"]) image_num_patches = [None] * len(out_mm_data["image_embeds"])
else: else:
image_num_patches = [] image_num_patches = []

View File

@ -42,7 +42,7 @@ from vllm.model_executor.models.utils import (AutoWeightsLoader, flatten_bn,
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs) MultiModalKwargsItems)
from vllm.multimodal.parse import ImageSize, MultiModalDataItems from vllm.multimodal.parse import ImageSize, MultiModalDataItems
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptReplacement) BaseProcessingInfo, PromptReplacement)
@ -375,11 +375,12 @@ class OvisMultiModalProcessor(BaseMultiModalProcessor[OvisProcessingInfo]):
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> list[PromptReplacement]: ) -> list[PromptReplacement]:
def get_replacement_ovis(item_idx): def get_replacement_ovis(item_idx: int):
grid = out_mm_kwargs["grids"][item_idx] out_item = out_mm_kwargs["image"][item_idx]
grid = out_item["grids"].data
hf_processor = self.info.get_hf_processor() hf_processor = self.info.get_hf_processor()
return hf_processor.construct_image_placeholders(grid) return hf_processor.construct_image_placeholders(grid)

View File

@ -12,7 +12,7 @@ from vllm.logger import init_logger
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalInputs, MultiModalKwargs) MultiModalInputs, MultiModalKwargsItems)
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
MultiModalDataItems) MultiModalDataItems)
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
@ -146,7 +146,7 @@ class PaliGemmaMultiModalProcessor(
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
hf_config = self.info.get_hf_config() hf_config = self.info.get_hf_config()
image_token_id = hf_config.image_token_index image_token_id = hf_config.image_token_index

View File

@ -32,7 +32,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs) MultiModalKwargsItems)
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
ImageSize, MultiModalDataItems) ImageSize, MultiModalDataItems)
# yapf conflicts with isort for this block # yapf conflicts with isort for this block
@ -410,7 +410,7 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, Any], hf_processor_mm_kwargs: Mapping[str, Any],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
image_tokens: list[str] = hf_processor.img_tokens # type: ignore image_tokens: list[str] = hf_processor.img_tokens # type: ignore

View File

@ -30,7 +30,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs, NestedTensors) MultiModalKwargsItems, NestedTensors)
from vllm.multimodal.parse import (AudioProcessorItems, ImageEmbeddingItems, from vllm.multimodal.parse import (AudioProcessorItems, ImageEmbeddingItems,
ImageProcessorItems, ImageSize, ImageProcessorItems, ImageSize,
MultiModalDataItems, MultiModalDataParser) MultiModalDataItems, MultiModalDataParser)
@ -1029,7 +1029,7 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, Any], hf_processor_mm_kwargs: Mapping[str, Any],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
tokenizer = self.info.get_tokenizer() tokenizer = self.info.get_tokenizer()
image_token_id = tokenizer.vocab[tokenizer.image_token] image_token_id = tokenizer.vocab[tokenizer.image_token]

View File

@ -21,7 +21,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs, NestedTensors) MultiModalKwargsItems, NestedTensors)
from vllm.multimodal.parse import (AudioProcessorItems, ImageEmbeddingItems, from vllm.multimodal.parse import (AudioProcessorItems, ImageEmbeddingItems,
ImageProcessorItems, ImageSize, ImageProcessorItems, ImageSize,
MultiModalDataItems, MultiModalDataParser) MultiModalDataItems, MultiModalDataParser)
@ -802,7 +802,7 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, Any], hf_processor_mm_kwargs: Mapping[str, Any],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
image_tokens: list[str] = self.info.image_tokens # type: ignore image_tokens: list[str] = self.info.image_tokens # type: ignore
audio_tokens: list[str] = self.info.audio_tokens # type: ignore audio_tokens: list[str] = self.info.audio_tokens # type: ignore

View File

@ -33,7 +33,7 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
NestedTensors) NestedTensors)
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
@ -273,7 +273,7 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
@ -309,7 +309,8 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]
tokenization_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object],
*, *,
return_mm_hashes: bool, return_mm_hashes: bool,
) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]: ) -> tuple[list[int], MultiModalKwargsItems, Optional[MultiModalHashes],
bool]:
( (
prompt_ids, prompt_ids,
mm_kwargs, mm_kwargs,

View File

@ -34,7 +34,8 @@ from vllm.model_executor.models.utils import AutoWeightsLoader
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalFieldElem, MultiModalInputs, MultiModalFieldElem, MultiModalInputs,
MultiModalKwargs, MultiModalKwargsItem, MultiModalKwargsItem,
MultiModalKwargsItems,
MultiModalSharedField, PlaceholderRange) MultiModalSharedField, PlaceholderRange)
from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.parse import MultiModalDataItems
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
@ -88,7 +89,7 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor):
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
return [] return []
@ -136,7 +137,7 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor):
type="multimodal", type="multimodal",
prompt=prompt, prompt=prompt,
prompt_token_ids=[1], prompt_token_ids=[1],
mm_kwargs=MultiModalKwargs(multimodal_kwargs_items), mm_kwargs=MultiModalKwargsItems.from_seq(multimodal_kwargs_items),
mm_hashes=None, mm_hashes=None,
mm_placeholders=mm_placeholders, mm_placeholders=mm_placeholders,
) )
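A condensed sketch of the construction path shown above: the flat sequence of per-item kwargs is grouped by modality with MultiModalKwargsItems.from_seq before being stored in MultiModalInputs. The prompt_token_ids, mm_hashes and mm_placeholders values are placeholders copied from or simplified relative to the hunk.

from collections.abc import Sequence

from vllm.multimodal.inputs import (MultiModalInputs, MultiModalKwargsItem,
                                    MultiModalKwargsItems)

def build_inputs(prompt: str,
                 items: Sequence[MultiModalKwargsItem]) -> MultiModalInputs:
    # from_seq groups the per-item kwargs by their modality.
    return MultiModalInputs(
        type="multimodal",
        prompt=prompt,
        prompt_token_ids=[1],
        mm_kwargs=MultiModalKwargsItems.from_seq(items),
        mm_hashes=None,
        mm_placeholders={},
    )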

View File

@ -54,7 +54,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (ImageItem, ModalityData, from vllm.multimodal.inputs import (ImageItem, ModalityData,
MultiModalDataDict, MultiModalFieldConfig, MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs, NestedTensors) MultiModalKwargsItems, NestedTensors)
from vllm.multimodal.parse import (AudioProcessorItems, DictEmbeddingItems, from vllm.multimodal.parse import (AudioProcessorItems, DictEmbeddingItems,
ModalityDataItems, MultiModalDataItems, ModalityDataItems, MultiModalDataItems,
MultiModalDataParser) MultiModalDataParser)
@ -265,7 +265,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
prompt_ids: list[int], prompt_ids: list[int],
mm_kwargs: MultiModalKwargs, mm_kwargs: MultiModalKwargsItems,
is_update_applied: bool, is_update_applied: bool,
) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]: ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]:
""" """
@ -325,7 +325,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, Any], hf_processor_mm_kwargs: Mapping[str, Any],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
tokenizer = self.info.get_tokenizer() tokenizer = self.info.get_tokenizer()
@ -340,8 +340,9 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
image_token_id = vocab[image_token] image_token_id = vocab[image_token]
video_token_id = vocab[video_token] video_token_id = vocab[video_token]
audio_feature_lengths = out_mm_kwargs.get("audio_feature_lengths") out_mm_data = out_mm_kwargs.get_data()
feature_attention_mask = out_mm_kwargs.get("feature_attention_mask") audio_feature_lengths = out_mm_data.get("audio_feature_lengths")
feature_attention_mask = out_mm_data.get("feature_attention_mask")
if audio_feature_lengths is None and feature_attention_mask is None: if audio_feature_lengths is None and feature_attention_mask is None:
audio_output_lengths = [] audio_output_lengths = []
elif audio_feature_lengths is not None: elif audio_feature_lengths is not None:
@ -371,7 +372,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
return [audio_token_id] * num_features return [audio_token_id] * num_features
def get_replacement_qwen2_vision(item_idx: int, modality: str): def get_replacement_qwen2_vision(item_idx: int, modality: str):
grid_thw = out_mm_kwargs[f"{modality}_grid_thw"][item_idx] grid_thw = out_mm_data[f"{modality}_grid_thw"][item_idx]
assert isinstance(grid_thw, torch.Tensor) assert isinstance(grid_thw, torch.Tensor)
merge_length = image_processor.merge_size**2 merge_length = image_processor.merge_size**2
@ -387,7 +388,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
audio_num_features = audio_output_lengths[audio_in_video_item_idx + audio_num_features = audio_output_lengths[audio_in_video_item_idx +
item_idx] item_idx]
video_grid_thw = out_mm_kwargs["video_grid_thw"][item_idx] video_grid_thw = out_mm_data["video_grid_thw"][item_idx]
audio_in_video_item_idx += 1 audio_in_video_item_idx += 1

View File

@ -37,7 +37,7 @@ from vllm.config import VllmConfig
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs) MultiModalKwargsItems)
from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems, from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems,
MultiModalDataParser) MultiModalDataParser)
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
@ -182,7 +182,7 @@ class Qwen2AudioMultiModalProcessor(
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
tokenizer = self.info.get_tokenizer() tokenizer = self.info.get_tokenizer()
@ -199,7 +199,8 @@ class Qwen2AudioMultiModalProcessor(
audio_bos_id = vocab[audio_bos_token] audio_bos_id = vocab[audio_bos_token]
audio_eos_id = vocab[audio_eos_token] audio_eos_id = vocab[audio_eos_token]
feature_attention_mask = out_mm_kwargs.get("feature_attention_mask") out_mm_data = out_mm_kwargs.get_data()
feature_attention_mask = out_mm_data.get("feature_attention_mask")
if feature_attention_mask is None: if feature_attention_mask is None:
audio_output_lengths = [] audio_output_lengths = []
else: else:

View File

@ -58,7 +58,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (ImageItem, ModalityData, from vllm.multimodal.inputs import (ImageItem, ModalityData,
MultiModalDataDict, MultiModalFieldConfig, MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs, VideoItem) MultiModalKwargsItems, VideoItem)
from vllm.multimodal.parse import (DictEmbeddingItems, ImageSize, from vllm.multimodal.parse import (DictEmbeddingItems, ImageSize,
ModalityDataItems, MultiModalDataItems, ModalityDataItems, MultiModalDataItems,
MultiModalDataParser) MultiModalDataParser)
@ -975,7 +975,7 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, Any], hf_processor_mm_kwargs: Mapping[str, Any],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
image_processor = self.info.get_image_processor( image_processor = self.info.get_image_processor(
@ -991,7 +991,8 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
merge_length = image_processor.merge_size**2 merge_length = image_processor.merge_size**2
def get_replacement_qwen2vl(item_idx: int, modality: str): def get_replacement_qwen2vl(item_idx: int, modality: str):
grid_thw = out_mm_kwargs[f"{modality}_grid_thw"][item_idx] out_item = out_mm_kwargs[modality][item_idx]
grid_thw = out_item[f"{modality}_grid_thw"].data
assert isinstance(grid_thw, torch.Tensor) assert isinstance(grid_thw, torch.Tensor)
num_tokens = int(grid_thw.prod()) // merge_length num_tokens = int(grid_thw.prod()) // merge_length
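A worked sketch of the placeholder-count arithmetic in this hunk, assuming out_mm_kwargs is a MultiModalKwargsItems and that merge_size defaults to 2 as in the Qwen2-VL image processor; the function name is invented.

import torch

from vllm.multimodal.inputs import MultiModalKwargsItems

def num_placeholder_tokens(out_mm_kwargs: MultiModalKwargsItems,
                           modality: str,
                           item_idx: int,
                           merge_size: int = 2) -> int:
    # grid_thw is a 1-D tensor [t, h, w]; every merge_size x merge_size block
    # of patches collapses into a single placeholder token.
    out_item = out_mm_kwargs[modality][item_idx]
    grid_thw = out_item[f"{modality}_grid_thw"].data
    assert isinstance(grid_thw, torch.Tensor)
    return int(grid_thw.prod()) // merge_size**2

For example, an image item with grid_thw (1, 28, 28) and merge_size 2 yields 784 // 4 = 196 placeholder tokens.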

View File

@ -33,7 +33,7 @@ from vllm.model_executor.layers.resampler import Resampler2, get_abs_pos
from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs) MultiModalKwargsItems)
from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.parse import MultiModalDataItems
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptReplacement, BaseProcessingInfo, PromptReplacement,
@ -627,7 +627,7 @@ class QwenVLMultiModalProcessor(BaseMultiModalProcessor[QwenVLProcessingInfo]):
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
tokenizer = self.info.get_tokenizer() tokenizer = self.info.get_tokenizer()
special_tokens: dict[str, special_tokens: dict[str,

View File

@ -26,7 +26,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import convert_image_mode from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs, NestedTensors) MultiModalKwargsItems, NestedTensors)
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
ImageSize, MultiModalDataItems) ImageSize, MultiModalDataItems)
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
@ -552,18 +552,19 @@ class SkyworkR1VMultiModalProcessor(
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
if "image_num_patches" in out_mm_kwargs: out_mm_data = out_mm_kwargs.get_data()
image_num_patches = out_mm_kwargs["image_num_patches"] if "image_num_patches" in out_mm_data:
image_num_patches = out_mm_data["image_num_patches"]
assert isinstance(image_num_patches, torch.Tensor) assert isinstance(image_num_patches, torch.Tensor)
image_num_patches = image_num_patches.tolist() image_num_patches = image_num_patches.tolist()
elif "image_embeds" in out_mm_kwargs: elif "image_embeds" in out_mm_data:
# TODO: Use image size information in dictionary embedding inputs # TODO: Use image size information in dictionary embedding inputs
# to compute num_patches (similar to Qwen2-VL) # to compute num_patches (similar to Qwen2-VL)
image_num_patches = [None] * len(out_mm_kwargs["image_embeds"]) image_num_patches = [None] * len(out_mm_data["image_embeds"])
else: else:
image_num_patches = [] image_num_patches = []

View File

@ -28,7 +28,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs, NestedTensors) MultiModalKwargsItems, NestedTensors)
from vllm.multimodal.parse import ImageSize, MultiModalDataItems from vllm.multimodal.parse import ImageSize, MultiModalDataItems
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptReplacement, BaseProcessingInfo, PromptReplacement,
@ -520,20 +520,18 @@ class Step3VLMultiModalProcessor(BaseMultiModalProcessor[Step3VLProcessingInfo]
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, Any], hf_processor_mm_kwargs: Mapping[str, Any],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
image_placeholder_token_id = hf_processor.image_token_id image_placeholder_token_id = hf_processor.image_token_id
batch_num_patches = out_mm_kwargs["num_patches"].tolist()
def get_replacement_step1o(item_idx: int): def get_replacement_step1o(item_idx: int):
img_out = out_mm_kwargs.get_item("image", item_idx) out_item = out_mm_kwargs["image"][item_idx]
num_patches = batch_num_patches[item_idx] num_patches = int(out_item["num_patches"].data)
if num_patches > 0: if num_patches > 0:
patch_newline_mask = img_out["patch_newline_mask"].data.tolist( patch_newline_mask = out_item["patch_newline_mask"].data
)
image_repl_ids = hf_processor._get_image_repl_features( image_repl_ids = hf_processor._get_image_repl_features(
1, num_patches, patch_newline_mask)[1] 1, num_patches, patch_newline_mask.tolist())[1]
else: else:
image_repl_ids = hf_processor._get_image_repl_features( image_repl_ids = hf_processor._get_image_repl_features(
1, 0, None)[1] 1, 0, None)[1]

View File

@ -25,7 +25,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.models.llava import LlavaDummyInputsBuilder from vllm.model_executor.models.llava import LlavaDummyInputsBuilder
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargsItems
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
ImageSize, MultiModalDataItems) ImageSize, MultiModalDataItems)
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
@ -275,7 +275,7 @@ class TarsierMultiModalProcessor(BaseMultiModalProcessor[_I_Tarsier]):
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
hf_config = self.info.get_hf_config() hf_config = self.info.get_hf_config()
image_token_id = hf_config.image_token_index # The <IMAGE> token ID image_token_id = hf_config.image_token_index # The <IMAGE> token ID

View File

@ -41,7 +41,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding) ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalInputs, PlaceholderRange) MultiModalInputs, PlaceholderRange)
from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems
@ -237,7 +237,7 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
): ):
""" """
Given the original multi-modal items for this modality Given the original multi-modal items for this modality
@ -372,7 +372,7 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
mm_tokens_per_modality["num_image_patches"] mm_tokens_per_modality["num_image_patches"]
) if "num_image_patches" in mm_tokens_per_modality else None ) if "num_image_patches" in mm_tokens_per_modality else None
processed_data['num_image_patches'] = num_image_patches processed_data['num_image_patches'] = num_image_patches
mm_kwargs = MultiModalKwargs.from_hf_inputs( mm_kwargs = MultiModalKwargsItems.from_hf_inputs(
processed_data, processed_data,
self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs, self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs,
num_image_patches), num_image_patches),
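A self-contained sketch of this construction path, with a made-up pixel_values tensor and field config. MultiModalKwargs.from_hf_inputs still exists after this commit but is deprecated; the items are built first and get_data() is called only when the batched view is needed.

import torch
from transformers import BatchFeature

from vllm.multimodal.inputs import (MultiModalFieldConfig,
                                    MultiModalKwargsItems)

# Hypothetical processor output for two images.
hf_inputs = BatchFeature(data={"pixel_values": torch.zeros(2, 3, 336, 336)})
config_by_key = {"pixel_values": MultiModalFieldConfig.batched("image")}

items = MultiModalKwargsItems.from_hf_inputs(hf_inputs, config_by_key)
assert len(items["image"]) == 2  # one MultiModalKwargsItem per image
mm_kwargs = items.get_data()     # batched tensors for forward()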

View File

@ -23,7 +23,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs, NestedTensors) MultiModalKwargsItems, NestedTensors)
from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptReplacement, BaseProcessingInfo, PromptReplacement,
@ -194,7 +194,7 @@ class UltravoxMultiModalProcessor(
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, Any], hf_processor_mm_kwargs: Mapping[str, Any],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
@ -203,7 +203,8 @@ class UltravoxMultiModalProcessor(
# Each audio can be split into multiple chunks. # Each audio can be split into multiple chunks.
# chunks_start_idx[i] indicates the start index of the chunks # chunks_start_idx[i] indicates the start index of the chunks
# belonging to the i-th audio. # belonging to the i-th audio.
num_chunks = out_mm_kwargs.get("audio_num_chunks", torch.zeros(0)) out_mm_data = out_mm_kwargs.get_data()
num_chunks = out_mm_data.get("audio_num_chunks", torch.zeros(0))
chunks_start_idx: torch.Tensor = torch.cumsum(num_chunks, chunks_start_idx: torch.Tensor = torch.cumsum(num_chunks,
dim=0, dim=0,
dtype=torch.int32) dtype=torch.int32)
@ -213,7 +214,7 @@ class UltravoxMultiModalProcessor(
def get_replacement_ultravox(item_idx: int): def get_replacement_ultravox(item_idx: int):
start = chunks_start_idx[item_idx] start = chunks_start_idx[item_idx]
end = chunks_start_idx[item_idx + 1] end = chunks_start_idx[item_idx + 1]
audio_token_len = out_mm_kwargs["audio_token_len"][start:end].sum() audio_token_len = out_mm_data["audio_token_len"][start:end].sum()
return [replacement_id] * int(audio_token_len) # type: ignore return [replacement_id] * int(audio_token_len) # type: ignore
return [ return [
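A sketch of how the chunked-audio bookkeeping above recovers a per-audio token count. The prepended zero for the chunk start indices is not visible in the hunk and is assumed from the start/end indexing; the audio_num_chunks and audio_token_len keys come from the Ultravox processor.

import torch

from vllm.multimodal.inputs import MultiModalKwargsItems

def audio_token_len_per_item(
        out_mm_kwargs: MultiModalKwargsItems) -> list[int]:
    out_mm_data = out_mm_kwargs.get_data()
    # audio_num_chunks[i] is the number of chunks of the i-th audio.
    num_chunks = out_mm_data.get("audio_num_chunks", torch.zeros(0))
    starts = torch.cumsum(num_chunks, dim=0, dtype=torch.int32)
    starts = torch.cat([torch.zeros(1, dtype=torch.int32), starts])
    if len(starts) == 1:
        return []
    token_len = out_mm_data["audio_token_len"]
    return [
        int(token_len[starts[i]:starts[i + 1]].sum())
        for i in range(len(starts) - 1)
    ]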

View File

@ -31,7 +31,7 @@ from vllm.model_executor.models.whisper import WhisperEncoder
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs, NestedTensors) MultiModalKwargsItems, NestedTensors)
from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems, from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems,
MultiModalDataParser) MultiModalDataParser)
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
@ -259,7 +259,7 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo]
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
@ -289,7 +289,8 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo]
tokenization_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object],
*, *,
return_mm_hashes: bool, return_mm_hashes: bool,
) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]: ) -> tuple[list[int], MultiModalKwargsItems, Optional[MultiModalHashes],
bool]:
prompt_ids, mm_kwargs, mm_hashes, _ = super( prompt_ids, mm_kwargs, mm_hashes, _ = super(
)._cached_apply_hf_processor( )._cached_apply_hf_processor(
prompt=prompt, prompt=prompt,

View File

@ -33,7 +33,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs) MultiModalKwargsItems)
from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser
from vllm.multimodal.processing import (BaseProcessingInfo, from vllm.multimodal.processing import (BaseProcessingInfo,
EncDecMultiModalProcessor, EncDecMultiModalProcessor,
@ -728,7 +728,7 @@ class WhisperMultiModalProcessor(
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
num_tokens = self.info.get_num_audio_tokens() num_tokens = self.info.get_num_audio_tokens()
return [ return [

View File

@ -4,7 +4,8 @@ from .base import MultiModalPlaceholderMap
from .hasher import MultiModalHashDict, MultiModalHasher from .hasher import MultiModalHashDict, MultiModalHasher
from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins, from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins,
MultiModalDataDict, MultiModalKwargs, MultiModalDataDict, MultiModalKwargs,
MultiModalPlaceholderDict, NestedTensors) MultiModalKwargsItems, MultiModalPlaceholderDict,
NestedTensors)
from .registry import MultiModalRegistry from .registry import MultiModalRegistry
MULTIMODAL_REGISTRY = MultiModalRegistry() MULTIMODAL_REGISTRY = MultiModalRegistry()
@ -25,6 +26,7 @@ __all__ = [
"MultiModalHashDict", "MultiModalHashDict",
"MultiModalHasher", "MultiModalHasher",
"MultiModalKwargs", "MultiModalKwargs",
"MultiModalKwargsItems",
"MultiModalPlaceholderDict", "MultiModalPlaceholderDict",
"MultiModalPlaceholderMap", "MultiModalPlaceholderMap",
"NestedTensors", "NestedTensors",

View File

@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Generic, NamedTuple, TypeVar
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.sequence import SequenceGroupMetadata from vllm.sequence import SequenceGroupMetadata
from .inputs import MultiModalKwargs, NestedTensors, PlaceholderRange from .inputs import MultiModalKwargs, PlaceholderRange
_T = TypeVar("_T") _T = TypeVar("_T")
@ -56,8 +56,7 @@ class MultiModalPlaceholderMap:
@classmethod @classmethod
def from_seq_group( def from_seq_group(
cls, seq_group: "SequenceGroupMetadata", positions: range cls, seq_group: "SequenceGroupMetadata", positions: range
) -> tuple[dict[str, NestedTensors], dict[str, ) -> tuple[MultiModalKwargs, dict[str, "MultiModalPlaceholderMap"]]:
"MultiModalPlaceholderMap"]]:
""" """
Returns the multi-modal items that intersect with the portion of a Returns the multi-modal items that intersect with the portion of a
prompt (``seq_group``) represented by ``positions``, as well as a prompt (``seq_group``) represented by ``positions``, as well as a
@ -100,7 +99,7 @@ class MultiModalPlaceholderMap:
seq_mm_placeholders = seq_group.multi_modal_placeholders seq_mm_placeholders = seq_group.multi_modal_placeholders
if not seq_mm_data or not seq_mm_placeholders: if not seq_mm_data or not seq_mm_placeholders:
return MultiModalKwargs().get_data(), {} return MultiModalKwargs(), {}
placeholder_maps = dict[str, MultiModalPlaceholderMap]() placeholder_maps = dict[str, MultiModalPlaceholderMap]()
@ -117,8 +116,6 @@ class MultiModalPlaceholderMap:
placeholder_maps[modality] = placeholder_map placeholder_maps[modality] = placeholder_map
seq_mm_data = seq_mm_data if isinstance(
seq_mm_data, dict) else seq_mm_data.get_data()
return seq_mm_data, placeholder_maps return seq_mm_data, placeholder_maps
def append_items_from_seq_group( def append_items_from_seq_group(

View File

@ -11,7 +11,9 @@ from vllm.logger import init_logger
from vllm.utils import GiB_bytes, LRUCache from vllm.utils import GiB_bytes, LRUCache
from vllm.utils.jsontree import json_map_leaves, json_reduce_leaves from vllm.utils.jsontree import json_map_leaves, json_reduce_leaves
from .inputs import MultiModalKwargs, MultiModalKwargsItem, NestedTensors from .inputs import (MultiModalFieldElem, MultiModalKwargs,
MultiModalKwargsItem, MultiModalKwargsItems,
NestedTensors)
logger = init_logger(__name__) logger = init_logger(__name__)
@ -26,8 +28,9 @@ class MultiModalCacheItemMetadata:
MultiModalCacheValue = Union[ MultiModalCacheValue = Union[
MultiModalKwargs, MultiModalKwargsItems,
MultiModalKwargsItem, MultiModalKwargsItem,
MultiModalKwargs,
Mapping[str, NestedTensors], Mapping[str, NestedTensors],
MultiModalCacheItemMetadata, MultiModalCacheItemMetadata,
] ]
@ -44,14 +47,16 @@ class MultiModalCache:
*, *,
debug: bool = False, debug: bool = False,
) -> int: ) -> int:
# MultiModalKwargs is not a subclass of dict if isinstance(leaf, MultiModalFieldElem):
if isinstance(leaf, MultiModalKwargs): return cls.get_item_size(leaf.data) # type: ignore
return cls.get_item_size(leaf.get_data(), debug=debug)
# MultiModalKwargsItem is not a subclass of dict # These are not subclasses of dict
if isinstance(leaf, MultiModalKwargsItems):
return cls.get_item_size(leaf.data) # type: ignore
if isinstance(leaf, MultiModalKwargsItem): if isinstance(leaf, MultiModalKwargsItem):
leaf_data = {k: v.data for k, v in leaf.items()} return cls.get_item_size(leaf.data) # type: ignore
return cls.get_item_size(leaf_data, debug=debug) if isinstance(leaf, MultiModalKwargs):
return cls.get_item_size(leaf.data) # type: ignore
# sys.getsizeof doesn't work for tensors # sys.getsizeof doesn't work for tensors
if isinstance(leaf, torch.Tensor): if isinstance(leaf, torch.Tensor):
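A tiny usage sketch of the size accounting after this change; the vllm.multimodal.cache module path and the get_item_size classmethod call are assumed from the hunk, and the tensor shape is arbitrary.

import torch

from vllm.multimodal.cache import MultiModalCache
from vllm.multimodal.inputs import MultiModalKwargs

# MultiModalKwargs is now a plain dict of tensors, so the estimator unwraps
# it via .data and measures the tensor leaves directly.
kwargs = MultiModalKwargs({"pixel_values": torch.zeros(2, 3, 224, 224)})
size_in_bytes = MultiModalCache.get_item_size(kwargs)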

View File

@@ -11,7 +11,7 @@ from typing import (TYPE_CHECKING, Any, Literal, Optional, TypedDict, TypeVar,
                     Union, cast, final)
 
 import numpy as np
-from typing_extensions import NotRequired, TypeAlias
+from typing_extensions import NotRequired, TypeAlias, deprecated
 
 from vllm.utils import LazyLoader, full_groupby, is_list_of
 from vllm.utils.jsontree import JSONTree, json_map_leaves
@@ -656,7 +656,7 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
     def __init__(self, data: Mapping[str, MultiModalFieldElem] = {}) -> None:
         super().__init__(data)
 
-        modalities = {elem.modality for elem in self.data.values()}
+        modalities = {elem.modality for elem in self.values()}
         assert len(modalities) == 1, f"Found different modalities={modalities}"
         self._modality = next(iter(modalities))
 
@@ -668,16 +668,11 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
         return {key: elem.data for key, elem in self.items()}
 
 
-class MultiModalKwargs:
+class MultiModalKwargsItems(UserDict[str, Sequence[MultiModalKwargsItem]]):
     """
-    A dictionary that represents the keyword arguments to
-    [`torch.nn.Module.forward`][].
-
-    The metadata `items` enables us to obtain the keyword arguments
-    corresponding to each data item in
-    [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems], via
-    [`get_item`][vllm.multimodal.inputs.MultiModalKwargs.get_item] and
-    [`get_items`][vllm.multimodal.inputs.MultiModalKwargs.get_items].
+    A dictionary of
+    [`MultiModalKwargsItem`][vllm.multimodal.inputs.MultiModalKwargsItem]s
+    by modality.
     """
 
     @staticmethod
@@ -712,19 +707,64 @@ class MultiModalKwargs:
             elems = [v[item_idx] for v in elems_in_modality.values()]
             items.append(MultiModalKwargsItem.from_elems(elems))
 
-        return MultiModalKwargs(items)
+        return MultiModalKwargsItems.from_seq(items)
 
-    def __init__(self, items: Sequence[MultiModalKwargsItem] = ()) -> None:
-        super().__init__()
-
-        items_by_modality = full_groupby(items, key=lambda x: x.modality)
-        self._items_by_modality = dict(items_by_modality)
-        self._data: Optional[dict[str, NestedTensors]] = None
-
-    @property
-    def modalities(self):
-        return self._items_by_modality.keys()
+    @staticmethod
+    def from_seq(items: Sequence[MultiModalKwargsItem]):
+        items_by_modality = full_groupby(items, key=lambda x: x.modality)
+        return MultiModalKwargsItems(items_by_modality)
+
+    def __getitem__(self, modality: str):
+        if modality not in self:
+            raise KeyError(f"Modality {modality!r} not found. "
+                           f"Available modalities: {set(self.keys())}")
+
+        return super().__getitem__(modality)
+
+    def get_data(self, *, pin_memory: bool = False) -> "MultiModalKwargs":
+        elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list)
+        for items in self.values():
+            for item in items:
+                for key, elem in item.items():
+                    elems_by_key[key].append(elem)
+
+        return MultiModalKwargs({
+            key:
+            elems[0].field.reduce_data(elems, pin_memory=pin_memory)
+            for key, elems in elems_by_key.items() if len(elems) > 0
+        })
+
+
+class MultiModalKwargs(UserDict[str, NestedTensors]):
+    """
+    A dictionary that represents the keyword arguments to
+    [`torch.nn.Module.forward`][].
+    """
+
+    @staticmethod
+    @deprecated("`MultiModalKwargs.from_hf_inputs` is deprecated and "
+                "will be removed in v0.13. "
+                "Please use `MultiModalKwargsItems.from_hf_inputs` and "
+                "access the tensor data using `.get_data()`.")
+    def from_hf_inputs(
+        hf_inputs: "BatchFeature",
+        config_by_key: Mapping[str, MultiModalFieldConfig],
+    ):
+        return MultiModalKwargsItems.from_hf_inputs(hf_inputs, config_by_key) \
+            .get_data()
+
+    @staticmethod
+    @deprecated("`MultiModalKwargs.from_items` is deprecated and "
+                "will be removed in v0.13. "
+                "Please use `MultiModalKwargsItems.from_seq` and "
+                "access the tensor data using `.get_data()`.")
+    def from_items(
+        items: Sequence[MultiModalKwargsItem],
+        *,
+        pin_memory: bool = False,
+    ):
+        return MultiModalKwargsItems.from_seq(items) \
+            .get_data(pin_memory=pin_memory)
 
     @staticmethod
     def _try_stack(nested_tensors: NestedTensors,
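(Example, not part of the diff.) A sketch of the intended call pattern after this split, assuming a processor that emits a single batched `pixel_values` tensor; the field name and shapes are illustrative, while `MultiModalFieldConfig.batched` is the existing vLLM factory:

import torch
from transformers import BatchFeature

from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargsItems

hf_inputs = BatchFeature({"pixel_values": torch.zeros(2, 3, 336, 336)})  # two images
fields = {"pixel_values": MultiModalFieldConfig.batched("image")}

items = MultiModalKwargsItems.from_hf_inputs(hf_inputs, fields)
assert len(items["image"]) == 2             # one MultiModalKwargsItem per image
per_item = items["image"][0].get_data()     # replaces mm_kwargs.get_item("image", 0)

# Old (deprecated): MultiModalKwargs.from_hf_inputs(hf_inputs, fields)
# New: reduce the items to forward() kwargs explicitly.
mm_kwargs = items.get_data()
assert "pixel_values" in mm_kwargs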
@@ -813,92 +853,24 @@ class MultiModalKwargs:
 
         return cast(BatchedTensorInputs, json_mapped)
 
-    def keys(self):
-        return self.get_data().keys()
-
-    def values(self):
-        return self.get_data().values()
-
-    def items(self):
-        return self.get_data().items()
-
-    def get(self, key: str, /, default=None):
-        return self.get_data().get(key, default)
-
-    def pop(self, key: str, *args, **kwargs):
-        data = dict(self.get_data())
-        res = data.pop(key, *args, **kwargs)
-
-        for items in self._items_by_modality.values():
-            for item in items:
-                item.pop(key, *args, **kwargs)
-
-        self._data = None
-        return res
-
-    def __iter__(self):
-        return iter(self.get_data())
-
     def __getitem__(self, key: str):
-        return self.get_data()[key]
+        if key not in self:
+            raise KeyError(f"Keyword argument {key!r} not found. "
+                           f"Available keys: {set(self.keys())}")
+
+        return super().__getitem__(key)
 
     def __eq__(self, other: object) -> bool:
         if not isinstance(other, self.__class__):
             return False
-        return self._items_by_modality == other._items_by_modality
-
-    def _validate_modality(self, method_name: str, modality: str) -> None:
-        if not self._items_by_modality:
-            raise RuntimeError(
-                f"`{method_name}` is not supported when "
-                "MultiModalKwargs is not initialized with `items`")
-
-        if modality not in self._items_by_modality:
-            available_modalities = set(self._items_by_modality.keys())
-            raise KeyError(f"Modality {modality!r} not found. "
-                           f"Available modalities: {available_modalities}")
-
-    def get_item_count(self, modality: str) -> int:
-        """Get the number of items belonging to a modality."""
-        self._validate_modality("get_item_count", modality)
-        return len(self._items_by_modality[modality])
-
-    def get_item(self, modality: str, item_index: int) -> MultiModalKwargsItem:
-        """
-        Get the keyword arguments corresponding to an item identified by
-        its modality and index.
-        """
-        self._validate_modality("get_item", modality)
-        return self._items_by_modality[modality][item_index]
-
-    def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]:
-        """
-        Get the keyword arguments corresponding to each item belonging to
-        a modality.
-        """
-        self._validate_modality("get_items", modality)
-        return self._items_by_modality[modality]
-
-    def get_data(self,
-                 *,
-                 pin_memory: bool = False) -> dict[str, NestedTensors]:
-        if self._data is not None:
-            return self._data
-
-        elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list)
-        for items in self._items_by_modality.values():
-            for item in items:
-                for key, elem in item.items():
-                    elems_by_key[key].append(elem)
-
-        data = {
-            key: elems[0].field.reduce_data(elems, pin_memory=pin_memory)
-            for key, elems in elems_by_key.items() if len(elems) > 0
-        }
-
-        self._data = data
-        return data
+
+        for k in self:
+            if k not in other:
+                return False
+            if not nested_tensors_equal(self[k], other[k]):
+                return False
+
+        return True
 
 
 MultiModalPlaceholderDict: TypeAlias = Mapping[str, Sequence[PlaceholderRange]]
@@ -926,7 +898,7 @@ class MultiModalInputs(TypedDict):
     token_type_ids: NotRequired[list[int]]
     """The token type IDs of the prompt."""
 
-    mm_kwargs: MultiModalKwargs
+    mm_kwargs: MultiModalKwargsItems
    """Keyword arguments to be directly passed to the model after batching."""
 
     mm_hashes: Optional["MultiModalHashDict"]
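(Example, not part of the diff.) With `MultiModalInputs.mm_kwargs` now carrying the per-item form, the plain `MultiModalKwargs` produced by `get_data()` behaves as an ordinary dict of tensors with stricter lookups and tensor-aware equality; a small sketch:

import torch

from vllm.multimodal.inputs import MultiModalKwargs

a = MultiModalKwargs({"pixel_values": torch.ones(2, 3)})
b = MultiModalKwargs({"pixel_values": torch.ones(2, 3)})
assert a == b                          # compares nested tensors element-wise
assert a["pixel_values"].shape == (2, 3)

try:
    a["input_features"]                # unknown kwarg
except KeyError as exc:
    print(exc)                         # message lists the available keys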

View File

@@ -16,7 +16,7 @@ from vllm.utils import LazyLoader, is_list_of
 from .audio import AudioResampler
 from .inputs import (AudioItem, HfAudioItem, HfImageItem, HfVideoItem,
                      ImageItem, ModalityData, MultiModalDataDict,
-                     MultiModalFieldConfig, MultiModalKwargs, VideoItem)
+                     MultiModalFieldConfig, MultiModalKwargsItems, VideoItem)
 
 _T = TypeVar("_T")
 _I = TypeVar("_I")
@@ -157,19 +157,16 @@ class DictEmbeddingItems(ModalityDataItems[Mapping[str, torch.Tensor],
         self.fields_config = fields_config
         self.required_fields = required_fields
 
-        self._kwargs = MultiModalKwargs.from_hf_inputs(
+        self._kwargs = MultiModalKwargsItems.from_hf_inputs(
             BatchFeature(dict(data)),
             fields_config,
         )
 
     def get_count(self) -> int:
-        return self._kwargs.get_item_count(self.modality)
+        return len(self._kwargs[self.modality])
 
     def get(self, index: int) -> Mapping[str, torch.Tensor]:
-        return {
-            k: v.data
-            for k, v in self._kwargs.get_item(self.modality, index).items()
-        }
+        return self._kwargs[self.modality][index].get_data()
 
     def get_processor_data(self) -> Mapping[str, object]:
         return {}

View File

@@ -23,8 +23,9 @@ from vllm.utils import flatten_2d_lists, full_groupby
 from .cache import MultiModalCache
 from .hasher import MultiModalHasher
 from .inputs import (MultiModalDataDict, MultiModalEncDecInputs,
-                     MultiModalFieldConfig, MultiModalInputs, MultiModalKwargs,
-                     MultiModalKwargsItem, PlaceholderRange)
+                     MultiModalFieldConfig, MultiModalInputs,
+                     MultiModalKwargsItem, MultiModalKwargsItems,
+                     PlaceholderRange)
 from .parse import (DictEmbeddingItems, EmbeddingItems, MultiModalDataItems,
                     MultiModalDataParser)
@@ -985,7 +986,7 @@ _I = TypeVar("_I", bound=BaseProcessingInfo)
 MultiModalHashes = dict[str, list[str]]
 """
 A collection of hashes with a similar structure as
-[`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs].
+[`MultiModalKwargsItems`][vllm.multimodal.inputs.MultiModalKwargsItems].
 """
@@ -1095,7 +1096,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         """
         Given the original multi-modal items for this modality
@@ -1361,7 +1362,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         self,
         cache: ProcessingCache,
         mm_cache_items_or_hashes: dict[str, list[_CacheItemOrHash]],
-        mm_missing_kwargs: MultiModalKwargs,
+        mm_missing_kwargs: MultiModalKwargsItems,
     ) -> dict[str, list[MultiModalKwargsItem]]:
         mm_missing_next_idx = defaultdict[str, int](lambda: 0)
@@ -1369,10 +1370,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         for modality, items_or_hashes in mm_cache_items_or_hashes.items():
             for item_or_hash in items_or_hashes:
                 if isinstance(item_or_hash, str):
-                    kw_item = mm_missing_kwargs.get_item(
-                        modality,
-                        mm_missing_next_idx[modality],
-                    )
+                    kw_item = mm_missing_kwargs[modality][
+                        mm_missing_next_idx[modality]]
                     cache.put(item_or_hash, kw_item)
                     mm_missing_next_idx[modality] += 1
                 else:
@@ -1390,7 +1389,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         tokenization_kwargs: Mapping[str, object],
         *,
         return_mm_hashes: bool,
-    ) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]:
+    ) -> tuple[list[int], MultiModalKwargsItems, Optional[MultiModalHashes],
+               bool]:
         (
             prompt_ids,
             mm_processed_data,
@@ -1403,7 +1403,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
             enable_hf_prompt_update=True,
         )
 
-        mm_kwargs = MultiModalKwargs.from_hf_inputs(
+        mm_kwargs = MultiModalKwargsItems.from_hf_inputs(
             mm_processed_data,
             self._get_mm_fields_config(mm_processed_data,
                                        hf_processor_mm_kwargs),
@@ -1423,7 +1423,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         tokenization_kwargs: Mapping[str, object],
         *,
         return_mm_hashes: bool,
-    ) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]:
+    ) -> tuple[list[int], MultiModalKwargsItems, Optional[MultiModalHashes],
+               bool]:
         """
         Apply the HF processor on the full prompt text,
         caching the results and reusing cached results.
@@ -1468,7 +1469,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
             enable_hf_prompt_update=False,
         )
 
-        mm_missing_kwargs = MultiModalKwargs.from_hf_inputs(
+        mm_missing_kwargs = MultiModalKwargsItems.from_hf_inputs(
             mm_missing_processed_data,
             self._get_mm_fields_config(mm_missing_processed_data,
                                        hf_processor_mm_kwargs),
@@ -1480,7 +1481,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
             mm_missing_kwargs=mm_missing_kwargs,
         )
 
-        mm_kwargs = MultiModalKwargs([
+        mm_kwargs = MultiModalKwargsItems.from_seq([
             item for cache_items in mm_cache_items_merged.values()
             for item in cache_items
         ])
@@ -1585,14 +1586,11 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
     def _validate_mm_kwargs(
         self,
-        mm_kwargs: MultiModalKwargs,
+        mm_kwargs: MultiModalKwargsItems,
         mm_item_counts: Mapping[str, int],
     ) -> None:
         for modality, item_count in mm_item_counts.items():
-            if modality in mm_kwargs.modalities:
-                items = mm_kwargs.get_items(modality)
-            else:
-                items = []
+            items = mm_kwargs.get(modality, [])
 
             if len(items) != item_count:
                 raise RuntimeError(
@@ -1630,7 +1628,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         prompt_ids: list[int],
-        mm_kwargs: MultiModalKwargs,
+        mm_kwargs: MultiModalKwargsItems,
         is_update_applied: bool,
     ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]:
         unbound_prompt_updates = self._get_prompt_updates(
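(Example, not part of the diff.) The count check in `_validate_mm_kwargs` collapses to a plain mapping lookup because the items are keyed by modality; a standalone sketch with hypothetical names:

from collections.abc import Mapping, Sequence

def validate_item_counts(
    mm_kwargs: Mapping[str, Sequence[object]],  # modality -> parsed items
    mm_item_counts: Mapping[str, int],          # modality -> expected count
) -> None:
    for modality, expected in mm_item_counts.items():
        items = mm_kwargs.get(modality, [])
        if len(items) != expected:
            raise RuntimeError(f"Expected {expected} {modality} items, "
                               f"but found {len(items)}")

validate_item_counts({"image": [object(), object()]}, {"image": 2})  # passes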

View File

@@ -13,7 +13,7 @@ import vllm.envs as envs
 from vllm.logger import init_logger
 
 from .inputs import (MultiModalDataDict, MultiModalEncDecInputs,
-                     MultiModalInputs, MultiModalKwargs,
+                     MultiModalInputs, MultiModalKwargsItems,
                      MultiModalPlaceholderDict)
 from .processing import (BaseMultiModalProcessor, BaseProcessingInfo,
                          EncDecMultiModalProcessor)
@@ -43,7 +43,7 @@ class DummyDecoderData(NamedTuple):
     """Dummy data used for profiling."""
 
     prompt_token_ids: list[int]
-    multi_modal_data: MultiModalKwargs
+    multi_modal_data: MultiModalKwargsItems
     multi_modal_placeholders: MultiModalPlaceholderDict

View File

@@ -32,11 +32,13 @@ _M = TypeVar("_M")
 if TYPE_CHECKING:
     from .inputs import (BatchedTensorInputs, MultiModalKwargs,
-                         MultiModalKwargsItem, MultiModalPlaceholderDict)
+                         MultiModalKwargsItem, MultiModalKwargsItems,
+                         MultiModalPlaceholderDict)
 else:
     BatchedTensorInputs = Any
     MultiModalKwargs = Any
     MultiModalKwargsItem = Any
+    MultiModalKwargsItems = Any
     MultiModalPlaceholderDict = Any
 
 global_thread_pool = ThreadPoolExecutor(
@@ -359,18 +361,20 @@ def argsort_mm_positions(
             "`group_mm_kwargs_by_modality` and will be removed in v0.13. "
             "Please use `group_mm_kwargs_by_modality` instead.")
 def group_mm_inputs_by_modality(
-        mm_inputs: list[MultiModalKwargs]) -> list[list[MultiModalKwargs]]:
+    mm_inputs: list[MultiModalKwargsItems]
+) -> list[list[MultiModalKwargsItems]]:
     if not mm_inputs:
         return []
 
-    def modality_group_func(mm_input: MultiModalKwargs) -> Union[str, int]:
+    def modality_group_func(
+            mm_input: MultiModalKwargsItems) -> Union[str, int]:
         # If the input has multiple modalities, return a id as the unique key
         # for the mm_input input.
-        if len(mm_input.modalities) > 1:
+        if len(mm_input) > 1:
             return id(mm_input)
 
-        elif len(mm_input.modalities) == 1:
-            return list(mm_input.modalities)[0]
+        elif len(mm_input) == 1:
+            return next(iter(mm_input.keys()))
 
         # FIXME(Isotr0py): Modality of mm_input from legacy pipeline is empty,
         # this is used to make InternVL with legacy pipeline still work with v1.
@@ -397,12 +401,12 @@ def group_mm_kwargs_by_modality(
     Yields:
         A tuple `(modality, num_items, grouped_kwargs)`.
     """
-    from vllm.multimodal.inputs import MultiModalKwargs
+    from vllm.multimodal.inputs import MultiModalKwargs, MultiModalKwargsItems
 
     for modality, items in groupby(mm_kwargs, key=lambda item: item.modality):
         items_lst = list(items)
 
-        # mm_kwargs_group = MultiModalKwargs(items_lst) \
+        # mm_kwargs_group = MultiModalKwargsItems.from_items(items_lst) \
         #     .get_data(pin_memory=pin_memory)
 
         # if device is not None:
@@ -417,7 +421,10 @@ def group_mm_kwargs_by_modality(
         # We will also need to update each model to remove `flatten_bn`.
         mm_kwargs_group = MultiModalKwargs.as_kwargs(
             MultiModalKwargs.batch(
-                [MultiModalKwargs([item]) for item in items_lst],
+                [
+                    MultiModalKwargsItems.from_seq([item]).get_data()
+                    for item in items_lst
+                ],
                 pin_memory=pin_memory,
             ),
             device=device,
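(Example, not part of the diff.) A sketch of driving the helper above from a `MultiModalKwargsItems`, assuming `group_mm_kwargs_by_modality` is imported from `vllm.multimodal.utils` where this diff defines it and that the flattened items are already ordered by modality; field names and shapes are illustrative:

import torch
from transformers import BatchFeature

from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargsItems
from vllm.multimodal.utils import group_mm_kwargs_by_modality

kwargs_items = MultiModalKwargsItems.from_hf_inputs(
    BatchFeature({"pixel_values": torch.zeros(2, 3, 336, 336)}),
    {"pixel_values": MultiModalFieldConfig.batched("image")},
)

# Flatten per-modality items into one list of MultiModalKwargsItem.
flat_items = [item for items in kwargs_items.values() for item in items]

for modality, num_items, grouped in group_mm_kwargs_by_modality(
        flat_items, device="cpu", pin_memory=False):
    # `grouped` is the batched kwargs dict for this modality's encoder.
    print(modality, num_items, list(grouped))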

View File

@@ -22,7 +22,6 @@ from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import RequestOutputKind, SamplingParams
 
 if TYPE_CHECKING:
-    from vllm.multimodal.inputs import NestedTensors
     from vllm.v1.worker.kv_connector_model_runner_mixin import (
         KVConnectorOutput)
@@ -523,7 +522,7 @@ class Sequence:
     @property
     def multi_modal_data(self) -> MultiModalKwargs:
         if self.inputs["type"] == "multimodal":
-            return self.inputs["mm_kwargs"]
+            return self.inputs["mm_kwargs"].get_data()
 
         return MultiModalKwargs()
@@ -979,8 +978,7 @@ class SequenceGroupMetadata(
     state: Optional[SequenceGroupState] = msgspec.field(
         default_factory=lambda: SequenceGroupState())
     token_type_ids: Optional[list[int]] = None
-    multi_modal_data: Optional[Union[MultiModalKwargs,
-                                     dict[str, "NestedTensors"]]] = None
+    multi_modal_data: Optional[MultiModalKwargs] = None
     multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None
     encoder_seq_data: Optional[SequenceData] = None
     cross_block_table: Optional[list[int]] = None

View File

@@ -310,7 +310,7 @@ class Processor:
         sorted_mm_idxs = argsort_mm_positions(decoder_mm_positions)
 
         orig_sorted_mm_inputs = [
-            decoder_mm_inputs.get_item(modality, idx)
+            decoder_mm_inputs[modality][idx]
             for modality, idx in sorted_mm_idxs
         ]
         sorted_mm_positions = [
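(Example, not part of the diff.) The indexing change above relies on `argsort_mm_positions` yielding `(modality, index)` pairs in prompt order; a toy sketch with placeholder strings instead of real items:

# Hypothetical placeholder order in the prompt and the parsed items per modality.
sorted_mm_idxs = [("image", 0), ("audio", 0), ("image", 1)]
decoder_mm_inputs = {"image": ["img_0", "img_1"], "audio": ["aud_0"]}

# decoder_mm_inputs[modality][idx] replaces decoder_mm_inputs.get_item(modality, idx)
orig_sorted_mm_inputs = [
    decoder_mm_inputs[modality][idx] for modality, idx in sorted_mm_idxs
]
assert orig_sorted_mm_inputs == ["img_0", "aud_0", "img_1"]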

View File

@@ -18,12 +18,15 @@ from msgspec import msgpack
 from vllm import envs
 from vllm.logger import init_logger
+# yapf: disable
 from vllm.multimodal.inputs import (BaseMultiModalField,
                                     MultiModalBatchedField,
                                     MultiModalFieldConfig, MultiModalFieldElem,
                                     MultiModalFlatField, MultiModalKwargs,
                                     MultiModalKwargsItem,
+                                    MultiModalKwargsItems,
                                     MultiModalSharedField, NestedTensors)
+# yapf: enable
 from vllm.v1.engine import UtilityResult
 
 logger = init_logger(__name__)
@@ -116,12 +119,11 @@ class MsgpackEncoder:
         if isinstance(obj, MultiModalKwargsItem):
             return self._encode_mm_item(obj)
 
+        if isinstance(obj, MultiModalKwargsItems):
+            return self._encode_mm_items(obj)
+
         if isinstance(obj, MultiModalKwargs):
-            return [
-                self._encode_mm_item(item)
-                for itemlist in obj._items_by_modality.values()
-                for item in itemlist
-            ]
+            return self._encode_mm_kwargs(obj)
 
         if isinstance(obj, UtilityResult):
             result = obj.result
@@ -183,6 +185,12 @@ class MsgpackEncoder:
         dtype = str(obj.dtype).removeprefix("torch.")
         return dtype, obj.shape, data
 
+    def _encode_mm_items(self, items: MultiModalKwargsItems) -> dict[str, Any]:
+        return {
+            modality: [self._encode_mm_item(item) for item in itemlist]
+            for modality, itemlist in items.items()
+        }
+
     def _encode_mm_item(self,
                         item: MultiModalKwargsItem) -> list[dict[str, Any]]:
         return [self._encode_mm_field_elem(elem) for elem in item.values()]
@@ -200,6 +208,12 @@ class MsgpackEncoder:
             self._encode_mm_field(elem.field),
         }
 
+    def _encode_mm_kwargs(self, kw: MultiModalKwargs) -> dict[str, Any]:
+        return {
+            modality: self._encode_nested_tensors(data)
+            for modality, data in kw.items()
+        }
+
     def _encode_nested_tensors(self, nt: NestedTensors) -> Any:
         if isinstance(nt, torch.Tensor):
             return self._encode_tensor(nt)
@@ -260,8 +274,10 @@ class MsgpackDecoder:
             return slice(*obj)
         if issubclass(t, MultiModalKwargsItem):
             return self._decode_mm_item(obj)
+        if issubclass(t, MultiModalKwargsItems):
+            return self._decode_mm_items(obj)
         if issubclass(t, MultiModalKwargs):
-            return MultiModalKwargs(self._decode_mm_items(obj))
+            return self._decode_mm_kwargs(obj)
         if t is UtilityResult:
             return self._decode_utility_result(obj)
         return obj
@@ -315,8 +331,11 @@ class MsgpackDecoder:
         # Convert back to proper shape & type
         return arr.view(torch_dtype).view(shape)
 
-    def _decode_mm_items(self, obj: list[Any]) -> list[MultiModalKwargsItem]:
-        return [self._decode_mm_item(v) for v in obj]
+    def _decode_mm_items(self, obj: dict[str, Any]) -> MultiModalKwargsItems:
+        return MultiModalKwargsItems({
+            modality: [self._decode_mm_item(item) for item in itemlist]
+            for modality, itemlist in obj.items()
+        })
 
     def _decode_mm_item(self, obj: list[Any]) -> MultiModalKwargsItem:
         return MultiModalKwargsItem.from_elems(
@@ -339,6 +358,12 @@ class MsgpackDecoder:
             obj["field"] = factory_meth(None, *field_args).field
         return MultiModalFieldElem(**obj)
 
+    def _decode_mm_kwargs(self, obj: dict[str, Any]) -> MultiModalKwargs:
+        return MultiModalKwargs({
+            modality: self._decode_nested_tensors(data)
+            for modality, data in obj.items()
+        })
+
     def _decode_nested_tensors(self, obj: Any) -> NestedTensors:
         if isinstance(obj, (int, float)):
             # Although it violates NestedTensors type, MultiModalKwargs
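(Example, not part of the diff.) A standalone sketch of the wire shape these hooks produce: `MultiModalKwargsItems` travels as modality -> list of encoded items, while the plain `MultiModalKwargs` travels as kwarg name -> encoded tensors. The `encode_item`/`decode_item` callables below stand in for `_encode_mm_item`/`_decode_mm_item`:

from typing import Any, Callable

def encode_items(items: dict[str, list[Any]],
                 encode_item: Callable[[Any], Any]) -> dict[str, Any]:
    # Mirrors MsgpackEncoder._encode_mm_items: modality -> [encoded item, ...]
    return {m: [encode_item(it) for it in lst] for m, lst in items.items()}

def decode_items(obj: dict[str, Any],
                 decode_item: Callable[[Any], Any]) -> dict[str, list[Any]]:
    # Mirrors MsgpackDecoder._decode_mm_items (minus the MultiModalKwargsItems wrapper)
    return {m: [decode_item(it) for it in lst] for m, lst in obj.items()}

wire = encode_items({"image": [{"pixel_values": [1, 2]}]}, encode_item=dict)
assert decode_items(wire, decode_item=dict) == {"image": [{"pixel_values": [1, 2]}]}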

View File

@@ -10,8 +10,8 @@ import torch
 from typing_extensions import deprecated
 
 from vllm.lora.request import LoRARequest
-from vllm.multimodal.inputs import (MultiModalKwargs, MultiModalKwargsItem,
-                                    PlaceholderRange)
+from vllm.multimodal.inputs import (MultiModalKwargsItem,
+                                    MultiModalKwargsItems, PlaceholderRange)
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams, SamplingType
 from vllm.utils import swap_dict_values
@@ -57,8 +57,10 @@ class CachedRequestState:
     @property
     @deprecated("`mm_inputs` is superseded by `mm_kwargs` and will be "
                 "removed in v0.13. Please use `mm_kwargs` instead.")
-    def mm_inputs(self) -> list[MultiModalKwargs]:
-        return [MultiModalKwargs([item]) for item in self.mm_kwargs]
+    def mm_inputs(self) -> list[MultiModalKwargsItems]:
+        return [
+            MultiModalKwargsItems.from_seq([item]) for item in self.mm_kwargs
+        ]
 
     def get_token_id(self, idx: int) -> int:
         if idx < self.num_prompt_tokens:

View File

@@ -2218,11 +2218,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         dummy_mm_data = dummy_decoder_data.multi_modal_data
 
         # Result in the maximum GPU consumption of the model
-        dummy_mm_item = dummy_mm_data.get_item(modality=modality, item_index=0)
+        dummy_mm_item = dummy_mm_data[modality][0]
+        dummy_mm_items = [dummy_mm_item] * max_items_per_batch
 
         return next(mm_kwargs_group
                     for _, _, mm_kwargs_group in group_mm_kwargs_by_modality(
-                        [dummy_mm_item] * max_items_per_batch,
+                        dummy_mm_items,
                         device=self.device,
                         pin_memory=self.pin_memory,
                     ))

View File

@@ -1824,11 +1824,12 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         dummy_mm_data = dummy_decoder_data.multi_modal_data
 
         # Result in the maximum GPU consumption of the model
-        dummy_mm_item = dummy_mm_data.get_item(modality=modality, item_index=0)
+        dummy_mm_item = dummy_mm_data[modality][0]
+        dummy_mm_items = [dummy_mm_item] * max_items_per_batch
 
         return next(grouped_mm_kwargs
                     for _, _, grouped_mm_kwargs in group_mm_kwargs_by_modality(
-                        [dummy_mm_item] * max_items_per_batch,
+                        dummy_mm_items,
                         device=self.device,
                         pin_memory=self.pin_memory,
                     ))