[Refactor] Define MultiModalKwargsItems separate from MultiModalKwargs (#23053)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 5c79b0d648
commit 27e8d1ea3e
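For orientation, every hunk below follows from one API split: processor outputs under "mm_kwargs" are now a MultiModalKwargsItems container of per-modality items, and the flat tensor dict that used to be indexed directly is obtained via get_data(). The accessors used in this sketch (from_seq-built containers, modality indexing, per-item field elems with .data, get_data()) all appear in the diff; the surrounding function, key names, and merge_length are illustrative assumptions, not part of the commit:

    # Hedged sketch of the refactored API, inferred from the hunks below.
    from vllm.multimodal.inputs import MultiModalKwargsItems

    def inspect_outputs(processed_inputs, merge_length: int):
        # processor.apply(...)["mm_kwargs"] now yields MultiModalKwargsItems:
        # a mapping of modality -> list of per-item MultiModalKwargsItem.
        mm_kwargs: MultiModalKwargsItems = processed_inputs["mm_kwargs"]

        # Per-item access: index by modality, then by item position; each
        # field elem exposes its tensor via .data.
        out_item = mm_kwargs["image"][0]
        grid_thw = out_item["image_grid_thw"].data

        # Batched access: get_data() flattens everything back into the plain
        # keyword-to-tensor dict that MultiModalKwargs used to be indexed as.
        mm_data = mm_kwargs.get_data()
        pixel_values = mm_data["pixel_values"]

        return int(grid_thw.prod()) // merge_length, pixel_values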
@@ -77,6 +77,7 @@ Internal data structures.
 - [vllm.multimodal.inputs.MultiModalFieldElem][]
 - [vllm.multimodal.inputs.MultiModalFieldConfig][]
 - [vllm.multimodal.inputs.MultiModalKwargsItem][]
+- [vllm.multimodal.inputs.MultiModalKwargsItems][]
 - [vllm.multimodal.inputs.MultiModalKwargs][]
 - [vllm.multimodal.inputs.MultiModalInputs][]
@@ -629,7 +629,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index

@@ -778,7 +778,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_config = self.info.get_hf_config()
         bos_token_id = hf_config.bos_token_id
@@ -370,10 +370,16 @@ def _assert_inputs_equal(
     if ignore_mm_keys is None:
         ignore_mm_keys = set()

-    assert "mm_kwargs" in a and "mm_kwargs" in b, msg
+    a_rest = {k: v for k, v in a.items() if k != "mm_kwargs"}
+    b_rest = {k: v for k, v in b.items() if k != "mm_kwargs"}
+
+    assert a_rest == b_rest, msg
+
+    a_data = a["mm_kwargs"].get_data()
+    b_data = b["mm_kwargs"].get_data()
+
     for key in ignore_mm_keys:
-        a["mm_kwargs"].pop(key, None)
-        b["mm_kwargs"].pop(key, None)
+        a_data.pop(key, None)
+        b_data.pop(key, None)

-    assert a == b, msg
+    assert a_data == b_data, msg
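Put differently, equality is now asserted on the flattened tensor dicts rather than on the wrapper objects, which no longer behave like plain dicts. A hedged restatement using only the accessors visible in this diff (it assumes plain torch.Tensor values; real outputs may contain nested tensors needing a recursive comparison):

    import torch

    # Illustrative sketch, not part of this commit.
    def mm_kwargs_equal(a: dict, b: dict) -> bool:
        a_data = a["mm_kwargs"].get_data()  # modality-flattened tensors
        b_data = b["mm_kwargs"].get_data()
        return a_data.keys() == b_data.keys() and all(
            torch.equal(a_data[k], b_data[k]) for k in a_data)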
@@ -45,7 +45,8 @@ def test_processor_override(
     video_token_id = tokenizer.convert_tokens_to_ids(hf_processor.video_token)
     video_tok_count = processed_inputs["prompt_token_ids"].count(
        video_token_id)
-    grid_t, _, _ = processed_inputs["mm_kwargs"]["video_grid_thw"][0]
+    grid_t, _, _ = processed_inputs["mm_kwargs"].get_data(
+    )["video_grid_thw"][0]

     assert grid_t == expected_grid_t
     assert video_tok_count == expected_toks_per_frame * grid_t

@@ -108,7 +108,8 @@ def _run_check(
     # Ensure we have the right number of placeholders per num_crops size
     image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
     img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
+    pixel_shape = processed_inputs["mm_kwargs"].get_data(
+    )["pixel_values_flat"].shape

     assert img_tok_count == 256 * total_expected_num_patches
     assert pixel_shape[0] == total_expected_num_patches

@@ -68,7 +68,8 @@ def _run_check(
     # Ensure we have the right number of placeholders per num_crops size
     image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
     img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
+    pixel_shape = processed_inputs["mm_kwargs"].get_data(
+    )["pixel_values_flat"].shape

     assert img_tok_count == 256 * total_expected_num_patches
     assert pixel_shape[0] == total_expected_num_patches
@@ -51,14 +51,14 @@ def test_processor_override(
         prompt = encode_tokens(tokenizer, prompt)

     processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
-    mm_kwargs = processed_inputs["mm_kwargs"]
+    mm_data = processed_inputs["mm_kwargs"].get_data()

     # place holder replacements
     prompt_token_ids = processed_inputs["prompt_token_ids"]
     assert prompt_token_ids.count(config.boi_token_index) == num_imgs
     assert prompt_token_ids.count(config.eoi_token_index) == num_imgs
     assert prompt_token_ids.count(vocab[hf_processor.image_token]) == num_imgs
-    aspect_ratios = mm_kwargs["aspect_ratios"]
+    aspect_ratios = mm_data["aspect_ratios"]
     num_x_separators = num_y_separators = 0
     for tiles_y, tiles_x in aspect_ratios:
         if tiles_x * tiles_y > 1:

@@ -80,6 +80,6 @@ def test_processor_override(
     num_patches_per_chunk = processor.info.get_patch_per_chunk(
         config.vision_config)
     assert prompt_token_ids.count(config.image_token_index) \
-        == mm_kwargs["patches_per_image"].sum() * num_patches_per_chunk
-    assert mm_kwargs["pixel_values"].shape[0] \
-        == mm_kwargs["patches_per_image"].sum()
+        == sum(mm_data["patches_per_image"]) * num_patches_per_chunk
+    assert len(mm_data["pixel_values"]) \
+        == sum(mm_data["patches_per_image"])
@@ -49,18 +49,18 @@ def test_profiling(
     encoder_seq_lens = [len(dummy_encoder_data.prompt_token_ids)
                         ] * max_num_seqs

-    mm_kwargs = processor.apply(
+    mm_data = processor.apply(
         prompt=dummy_mm_data.prompt,
         mm_data=dummy_mm_data.mm_data,
         hf_processor_mm_kwargs=dict(),
-    )["mm_kwargs"]
+    )["mm_kwargs"].get_data()

     # Get the actual number of encoder tokens for each sample.
     # Because attn_metadata.encoder_seq_lens only counts the last
     # group of images for each sample, which is used to cheat the
     # block manager to allocate blocks for those images only.
     # See MllamaMultiModalProcessor for more details.
-    num_tiles = [[t] for t in mm_kwargs.pop("num_tiles")]
+    num_tiles = [[t] for t in mm_data.pop("num_tiles")]
     num_tokens_per_tile = calc_token_per_chunk(image_size)
     actual_encoder_seq_lens = [
         sum(num_tile) * num_tokens_per_tile for num_tile in num_tiles

@@ -38,21 +38,21 @@ def test_profiling(model_id: str, max_model_len: int):

     hf_config = ctx.get_hf_config(Llama4Config)

-    mm_kwargs = processor.apply(
+    mm_data = processor.apply(
         prompt=dummy_mm_data.prompt,
         mm_data=dummy_mm_data.mm_data,
         hf_processor_mm_kwargs=dict(),
-    )["mm_kwargs"]
+    )["mm_kwargs"].get_data()

     image_size = hf_config.vision_config.image_size
     patch_size = hf_config.vision_config.patch_size
     downsample_ratio = int(
         round(1.0 / (hf_config.vision_config.pixel_shuffle_ratio**2)))
     tokens_per_patch = ((image_size // patch_size)**2) // downsample_ratio
-    chunks_per_image = prod(mm_kwargs["patches_per_image"])
+    chunks_per_image = prod(mm_data["patches_per_image"])
     total_num_patches = chunks_per_image * tokens_per_patch
-    num_tiles = mm_kwargs["aspect_ratios"][0][0] * mm_kwargs["aspect_ratios"][
-        0][1]  # x-y seperator tokens
+    num_tiles = mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][
+        1]  # x-y seperator tokens
     total_tokens = total_num_patches.item() + num_tiles.item(
     ) + 3  # image start, image, image end
@@ -70,7 +70,8 @@ def _run_check(
     # Ensure we have the right number of placeholders per num_crops size
     image_token_id = tokenizer.convert_tokens_to_ids("<image>")
     img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
+    pixel_shape = processed_inputs["mm_kwargs"].get_data(
+    )["pixel_values_flat"].shape
     print("Image token count:", img_tok_count, "Pixel shape:", pixel_shape)
     assert img_tok_count == 256 * total_expected_num_patches
     assert pixel_shape[0] == total_expected_num_patches

@@ -48,7 +48,8 @@ def test_processor_override(
     hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
     image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token)
     img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape
+    pixel_shape = processed_inputs["mm_kwargs"].get_data(
+    )["pixel_values"].shape

     assert img_tok_count == expected_toks_per_img * num_imgs
     assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs
@@ -128,7 +128,7 @@ def create_batched_mm_kwargs(
     )["mm_kwargs"]
     items = [
         item for modality in supported_mm_limits
-        for item in mm_kwargs.get_items(modality)
+        for item in mm_kwargs[modality]
     ]
     return group_mm_kwargs_by_modality(items)
@@ -4,8 +4,8 @@ import pytest
 import torch

 from vllm.multimodal.cache import MultiModalCache, MultiModalCacheItemMetadata
-from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs,
-                                    MultiModalKwargsItem,
+from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargsItem,
+                                    MultiModalKwargsItems,
                                     MultiModalSharedField)

@@ -24,8 +24,8 @@ def _dummy_item(modality: str, size_by_key: dict[str, int]):
     ])


-def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]):
-    return MultiModalKwargs([
+def _dummy_items(size_by_key_modality: dict[str, dict[str, int]]):
+    return MultiModalKwargsItems.from_seq([
         _dummy_item(modality, size_by_key)
         for modality, size_by_key in size_by_key_modality.items()
     ])

@@ -37,7 +37,8 @@ def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]):
     [
         (_dummy_item("a", {"a1": 100}), 100),
         (_dummy_item("a", {"a1": 100, "a2": 110}), 210),
-        (_dummy_kw({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460),  # noqa: E501
+        (_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460),  # noqa: E501
+        (_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}).get_data(), 460),  # noqa: E501
     ],
 )
 # yapf: enable
@@ -11,7 +11,8 @@ import torch

 from vllm.multimodal.inputs import (MultiModalBatchedField,
                                     MultiModalFieldElem, MultiModalFlatField,
-                                    MultiModalKwargs, MultiModalKwargsItem,
+                                    MultiModalKwargsItem,
+                                    MultiModalKwargsItems,
                                     MultiModalSharedField, NestedTensors)
 from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder

@@ -96,7 +97,7 @@ def test_encode_decode(monkeypatch: pytest.MonkeyPatch):


 class MyRequest(msgspec.Struct):
-    mm: Optional[list[MultiModalKwargs]]
+    mm: Optional[list[MultiModalKwargsItems]]


 def test_multimodal_kwargs():

@@ -119,7 +120,7 @@ def test_multimodal_kwargs():
     audio = MultiModalKwargsItem.from_elems([e1])
     video = MultiModalKwargsItem.from_elems([e2])
     image = MultiModalKwargsItem.from_elems([e3, e4])
-    mm = MultiModalKwargs([audio, video, image])
+    mm = MultiModalKwargsItems.from_seq([audio, video, image])

     # pack mm kwargs into a mock request so that it can be decoded properly
     req = MyRequest([mm])
@@ -133,19 +134,22 @@ def test_multimodal_kwargs():

     total_len = sum(memoryview(x).cast("B").nbytes for x in encoded)

-    # expected total encoding length, should be 14255, +-20 for minor changes
-    assert 14250 <= total_len <= 14300
-    decoded: MultiModalKwargs = decoder.decode(encoded).mm[0]
+    # expected total encoding length, should be 14306, +-20 for minor changes
+    assert 14275 <= total_len <= 14325
+    decoded = decoder.decode(encoded).mm[0]
+    assert isinstance(decoded, MultiModalKwargsItems)

     # check all modalities were recovered and do some basic sanity checks
-    assert len(decoded.modalities) == 3
-    images = decoded.get_items("image")
+    assert len(decoded) == 3
+    images = decoded["image"]
     assert len(images) == 1
-    assert len(images[0].items()) == 2
+    assert list(images[0].keys()) == ["i0", "i1"]

     # check the tensor contents and layout in the main dict
-    assert all(nested_equal(mm[k], decoded[k]) for k in mm)
+    mm_data = mm.get_data()
+    decoded_data = decoded.get_data()
+    assert all(nested_equal(mm_data[k], decoded_data[k]) for k in mm_data)


 def nested_equal(a: NestedTensors, b: NestedTensors):
@@ -4,11 +4,12 @@
 from array import array
 from typing import Any, Type

+from vllm.multimodal.inputs import MultiModalKwargs
 from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE


 def encode_hook(obj: Any) -> Any:
-    """Custom msgspec enc hook that supports array types.
+    """Custom msgspec enc hook that supports array types and MultiModalKwargs.

     See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder
     """

@@ -17,10 +18,12 @@ def encode_hook(obj: Any) -> Any:
             f"vLLM array type should use '{VLLM_TOKEN_ID_ARRAY_TYPE}' type. "
             f"Given array has a type code of {obj.typecode}.")
         return obj.tobytes()
+    if isinstance(obj, MultiModalKwargs):
+        return dict(obj)


 def decode_hook(type: Type, obj: Any) -> Any:
-    """Custom msgspec dec hook that supports array types.
+    """Custom msgspec dec hook that supports array types and MultiModalKwargs.

     See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder
     """

@@ -28,3 +31,5 @@ def decode_hook(type: Type, obj: Any) -> Any:
         deserialized = array(VLLM_TOKEN_ID_ARRAY_TYPE)
         deserialized.frombytes(obj)
         return deserialized
+    if type is MultiModalKwargs:
+        return MultiModalKwargs(obj)
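For context on how hooks like these are consumed, a usage sketch against msgspec's real enc_hook/dec_hook parameters, with encode_hook and decode_hook as defined above. The TokenBatch struct and its payload are hypothetical, made up for illustration only:

    from array import array

    import msgspec

    from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE

    # Hypothetical struct for illustration; not part of vLLM.
    class TokenBatch(msgspec.Struct):
        token_ids: array  # not natively supported by msgspec -> hooks kick in

    encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
    decoder = msgspec.msgpack.Decoder(TokenBatch, dec_hook=decode_hook)

    batch = TokenBatch(token_ids=array(VLLM_TOKEN_ID_ARRAY_TYPE, [1, 2, 3]))
    roundtripped = decoder.decode(encoder.encode(batch))
    assert list(roundtripped.token_ids) == [1, 2, 3]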
@@ -22,7 +22,7 @@ from vllm.model_executor.model_loader.weight_utils import (
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo, PromptReplacement,

@@ -470,7 +470,7 @@ class AriaMultiModalProcessor(BaseMultiModalProcessor[AriaProcessingInfo]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index

@@ -18,7 +18,7 @@ from transformers.models.got_ocr2.image_processing_got_ocr2 import (
 from vllm.config import VllmConfig
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargs
+from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems
 from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
                                    MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,

@@ -242,7 +242,7 @@ class AyaVisionMultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         image_token = hf_processor.image_token
@@ -15,7 +15,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo, PromptIndexTargets,

@@ -492,7 +492,7 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         tokenizer = self.info.get_tokenizer()
         vocab = tokenizer.get_vocab()

@@ -31,7 +31,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo, PromptReplacement,

@@ -151,7 +151,7 @@ class ChameleonMultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         tokenizer = self.info.get_tokenizer()
@@ -21,7 +21,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.quantization.awq import AWQConfig
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargs
+from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems
 from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
                                    MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,

@@ -241,7 +241,7 @@ class Cohere2VisionMultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         image_token = hf_processor.image_token

@@ -21,7 +21,7 @@ from vllm.model_executor.model_loader.utils import set_default_torch_dtype
 from vllm.model_executor.models.transformers import replace_linear_class
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs, NestedTensors)
+                                    MultiModalKwargsItems, NestedTensors)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize, MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,

@@ -252,7 +252,7 @@ class DeepseekVL2MultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
@@ -291,7 +291,8 @@ class DeepseekVL2MultiModalProcessor(
         tokenization_kwargs: Mapping[str, object],
         *,
         return_mm_hashes: bool,
-    ) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]:
+    ) -> tuple[list[int], MultiModalKwargsItems, Optional[MultiModalHashes],
+               bool]:
         # The processor logic is different for len(images) <= 2 vs > 2
         # Since the processing cache assumes that the processor output is
         # invariant of how many images are passed per prompt, we only
@@ -21,7 +21,7 @@ from vllm.model_executor.models.bart import (BartDecoder, BartEncoder,
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseProcessingInfo,
                                         EncDecMultiModalProcessor,

@@ -860,7 +860,7 @@ class Florence2MultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_config = self.info.get_hf_config()
         pad_token_id = hf_config.pad_token_id

@@ -32,7 +32,7 @@ from vllm.model_executor.models.persimmon import PersimmonForCausalLM
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
                                    MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,

@@ -226,7 +226,7 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_config = self.info.get_hf_config()
         bos_token_id = hf_config.bos_token_id
@@ -17,7 +17,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
                                    MultiModalDataItems)
 # yapf: disable

@@ -311,7 +311,7 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, Any],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         image_token = hf_processor.boi_token

@@ -24,7 +24,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import (ImageProcessorItems, MultiModalDataItems,
                                    MultiModalDataParser)
 # yapf: disable

@@ -209,7 +209,7 @@ class Gemma3nMultiModalProcessor(BaseMultiModalProcessor[Gemma3nProcessingInfo]
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, Any],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
@@ -59,7 +59,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs, VideoItem)
+                                    MultiModalKwargsItems, VideoItem)
 from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
                                    MultiModalDataParser)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,

@@ -1158,7 +1158,7 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, Any],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         image_processor = self.info.get_image_processor(
@@ -1175,14 +1175,16 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
         merge_length = image_processor.merge_size**2

         def get_image_replacement_glm4v(item_idx: int):
-            grid_thw = out_mm_kwargs["image_grid_thw"][item_idx]
+            out_item = out_mm_kwargs["image"][item_idx]
+            grid_thw = out_item["image_grid_thw"].data
             assert isinstance(grid_thw, torch.Tensor)

             num_tokens = int(grid_thw.prod()) // merge_length
             return [hf_processor.image_token_id] * num_tokens

         def get_video_replacement_glm4v(item_idx: int):
-            grid_thw = out_mm_kwargs["video_grid_thw"][item_idx]
+            out_item = out_mm_kwargs["video"][item_idx]
+            grid_thw = out_item["video_grid_thw"].data
             assert isinstance(grid_thw, torch.Tensor)

             video, metadata = mm_items["video"][item_idx]
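The same per-item pattern recurs in most of the model processors below: index the items container by modality, pick the item by position, then read the field elem's .data. A standalone sketch of that pattern (the key name, merge_length, and token id are illustrative values, not taken from any one model):

    import torch

    # Illustrative sketch of the recurring replacement-callback pattern.
    def make_image_replacement(out_mm_kwargs, merge_length: int,
                               image_token_id: int):

        def get_replacement(item_idx: int):
            # Pre-refactor style was a batched lookup across all items:
            #     grid_thw = out_mm_kwargs["image_grid_thw"][item_idx]
            # Post-refactor: fetch the item, then the field elem's tensor.
            out_item = out_mm_kwargs["image"][item_idx]
            grid_thw = out_item["image_grid_thw"].data
            assert isinstance(grid_thw, torch.Tensor)

            num_tokens = int(grid_thw.prod()) // merge_length
            return [image_token_id] * num_tokens

        return get_replacement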
@@ -30,7 +30,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo, PromptReplacement,

@@ -503,7 +503,7 @@ class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_config = self.info.get_hf_config()

@@ -40,7 +40,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems,
                                    MultiModalDataParser)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,

@@ -118,7 +118,7 @@ class GraniteSpeechMultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> list[PromptUpdate]:
         processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         tokenizer = self.info.get_tokenizer()
@@ -17,7 +17,7 @@ from transformers import PretrainedConfig

 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalKwargs
+from vllm.multimodal.inputs import MultiModalKwargsItems
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    MultiModalDataItems)
 from vllm.multimodal.processing import (MultiModalHashes, PromptReplacement,

@@ -425,18 +425,19 @@ class H2OVLMultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)

-        if "image_num_patches" in out_mm_kwargs:
-            image_num_patches = out_mm_kwargs["image_num_patches"]
+        out_mm_data = out_mm_kwargs.get_data()
+        if "image_num_patches" in out_mm_data:
+            image_num_patches = out_mm_data["image_num_patches"]
             assert isinstance(image_num_patches, torch.Tensor)
             image_num_patches = image_num_patches.tolist()
-        elif "image_embeds" in out_mm_kwargs:
+        elif "image_embeds" in out_mm_data:
             # TODO: Use image size information in dictionary embedding inputs
             # to compute num_patches (similar to Qwen2-VL)
-            image_num_patches = [None] * len(out_mm_kwargs["image_embeds"])
+            image_num_patches = [None] * len(out_mm_data["image_embeds"])
         else:
             image_num_patches = []
@@ -479,7 +480,8 @@ class H2OVLMultiModalProcessor(
         tokenization_kwargs: Mapping[str, object],
         *,
         return_mm_hashes: bool,
-    ) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]:
+    ) -> tuple[list[int], MultiModalKwargsItems, Optional[MultiModalHashes],
+               bool]:
         # The processor logic is different for len(images) <= 1 vs > 1
         # Since the processing cache assumes that the processor output is
         # invariant of how many images are passed per prompt, we only
@@ -34,7 +34,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import ImageSize, MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo, ProcessingCache,

@@ -295,7 +295,7 @@ class HCXVisionMultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_config = self.info.get_hf_config()
         placeholder = {
@@ -306,21 +306,22 @@ class HCXVisionMultiModalProcessor(
         def get_replacement_hyperclovax(
             item_idx: int,
             modality: str,
-            out_mm_kwargs: MultiModalKwargs,
+            out_mm_kwargs: MultiModalKwargsItems,
         ):
-            num_tokens = None
+            out_item = out_mm_kwargs[modality][item_idx]
+
             if modality == "image":
+                lens = out_item["vision_query_lengths_images"].data
                 num_tokens = self.info.get_num_image_tokens(
-                    vision_query_length=out_mm_kwargs[
-                        "vision_query_lengths_images"][item_idx], )
-            if modality == "video":
+                    vision_query_length=lens)
+            elif modality == "video":
+                lens = out_item["vision_query_lengths_videos"].data
                 num_tokens = self.info.get_num_video_tokens(
-                    vision_query_length=out_mm_kwargs[
-                        "vision_query_lengths_videos"][item_idx], )
-            assert isinstance(num_tokens, int)
-            return [
-                placeholder[modality],
-            ] * num_tokens
+                    vision_query_length=lens)
+            else:
+                raise NotImplementedError(modality)
+
+            return [placeholder[modality]] * num_tokens

         return [
             PromptReplacement(
@@ -34,7 +34,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import ImageProcessorItems, ImageSize
 # yapf conflicts with isort for this block
 # yapf: disable

@@ -374,7 +374,7 @@ class Idefics3MultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         image_token, _, _ = self.info._get_image_token(hf_processor)

@@ -24,7 +24,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs, NestedTensors)
+                                    MultiModalKwargsItems, NestedTensors)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize, MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,

@@ -399,7 +399,7 @@ class InternS1MultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         img_context_token = hf_processor.image_token
@@ -407,15 +407,16 @@ class InternS1MultiModalProcessor(
         end_image_token = hf_processor.end_image_token
         video_token = hf_processor.video_token

-        if "video_num_patches" in out_mm_kwargs:
-            video_num_patches = out_mm_kwargs["video_num_patches"]
+        out_mm_data = out_mm_kwargs.get_data()
+        if "video_num_patches" in out_mm_data:
+            video_num_patches = out_mm_data["video_num_patches"]
             assert isinstance(video_num_patches, torch.Tensor)
             video_num_patches = video_num_patches.tolist()
         else:
             video_num_patches = []

-        if "image_num_patches" in out_mm_kwargs:
-            image_num_patches = out_mm_kwargs["image_num_patches"]
+        if "image_num_patches" in out_mm_data:
+            image_num_patches = out_mm_data["image_num_patches"]
             assert isinstance(image_num_patches, torch.Tensor)
             image_num_patches = image_num_patches.tolist()
         else:
@@ -28,7 +28,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import convert_image_mode
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs, NestedTensors)
+                                    MultiModalKwargsItems, NestedTensors)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize, MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,

@@ -797,18 +797,19 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)

-        if "image_num_patches" in out_mm_kwargs:
-            image_num_patches = out_mm_kwargs["image_num_patches"]
+        out_mm_data = out_mm_kwargs.get_data()
+        if "image_num_patches" in out_mm_data:
+            image_num_patches = out_mm_data["image_num_patches"]
             assert isinstance(image_num_patches, torch.Tensor)
             image_num_patches = image_num_patches.tolist()
-        elif "image_embeds" in out_mm_kwargs:
+        elif "image_embeds" in out_mm_data:
             # TODO: Use image size information in dictionary embedding inputs
             # to compute num_patches (similar to Qwen2-VL)
-            image_num_patches = [None] * len(out_mm_kwargs["image_embeds"])
+            image_num_patches = [None] * len(out_mm_data["image_embeds"])
         else:
             image_num_patches = []
@@ -966,15 +967,19 @@ class InternVLMultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
-        prompt_repl: list[PromptUpdate] = super()._get_prompt_updates(
-            mm_items, hf_processor_mm_kwargs, out_mm_kwargs)
+        prompt_repl = super()._get_prompt_updates(
+            mm_items=mm_items,
+            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+            out_mm_kwargs=out_mm_kwargs,
+        )

         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)

-        if "video_num_patches" in out_mm_kwargs:
-            video_num_patches = out_mm_kwargs["video_num_patches"]
+        out_mm_data = out_mm_kwargs.get_data()
+        if "video_num_patches" in out_mm_data:
+            video_num_patches = out_mm_data["video_num_patches"]
             assert isinstance(video_num_patches, torch.Tensor)
             video_num_patches = video_num_patches.tolist()
         else:

@@ -992,12 +997,15 @@ class InternVLMultiModalProcessor(
                 video_context_token=hf_processor.video_token)

         if self.info.supports_video:
-            prompt_repl.append(
+            prompt_repl = [
+                *prompt_repl,
                 PromptReplacement(
                     modality="video",
                     target="<video>",
                     replacement=get_video_replacement_internvl,
-                ))
+                )
+            ]

         return prompt_repl
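Rebuilding the list with [*prompt_repl, ...] instead of calling .append reflects the widened typing above: super()._get_prompt_updates() is declared to return Sequence[PromptUpdate], which need not be mutable. A generic illustration of the idiom (types simplified to strings; not vLLM code):

    from typing import Sequence

    def extend_updates(base: Sequence[str], extra: str) -> Sequence[str]:
        # `base` may be any Sequence (e.g. a tuple), so base.append(extra)
        # is not guaranteed to exist; build a new list instead.
        return [*base, extra]

    assert extend_updates(("a", "b"), "c") == ["a", "b", "c"]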
@@ -33,7 +33,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors
 from vllm.multimodal.inputs import (ImageItem, ModalityData,
                                     MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs, VideoItem)
+                                    MultiModalKwargsItems, VideoItem)
 from vllm.multimodal.parse import (DictEmbeddingItems, ImageSize,
                                    ModalityDataItems, MultiModalDataItems,
                                    MultiModalDataParser)

@@ -1192,7 +1192,7 @@ class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, Any],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         image_processor = self.info.get_image_processor(

@@ -1208,7 +1208,8 @@ class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]):
         merge_length = image_processor.merge_size**2

         def get_replacement_keye(item_idx: int, modality: str):
-            grid_thw = out_mm_kwargs[f"{modality}_grid_thw"][item_idx]
+            out_item = out_mm_kwargs[modality][item_idx]
+            grid_thw = out_item[f"{modality}_grid_thw"].data
             assert isinstance(grid_thw, torch.Tensor)

             num_tokens = int(grid_thw.prod()) // merge_length
@@ -69,7 +69,7 @@ from vllm.model_executor.models.utils import merge_multimodal_embeddings
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs, NestedTensors)
+                                    MultiModalKwargsItems, NestedTensors)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,

@@ -239,7 +239,7 @@ class KimiVLMultiModalProcessor(BaseMultiModalProcessor[KimiVLProcessingInfo]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, Any],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         image_token_id = self.info.image_token_id

@@ -23,7 +23,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputs, MultiModalKwargs)
+                                    MultiModalInputs, MultiModalKwargsItems)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize, MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,

@@ -250,7 +250,7 @@ class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor[_I]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index

@@ -343,7 +343,7 @@ class PixtralHFMultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         hf_config = self.info.get_hf_config()
@@ -16,7 +16,7 @@ from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
                                    VideoEmbeddingItems, VideoProcessorItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,

@@ -185,7 +185,7 @@ class LlavaNextVideoMultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_config = self.info.get_hf_config()
         video_token_id = hf_config.video_token_index

@@ -18,7 +18,7 @@ from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
                                    VideoEmbeddingItems, VideoProcessorItems)
 from vllm.multimodal.processing import PromptReplacement, PromptUpdate

@@ -372,7 +372,7 @@ class LlavaOnevisionMultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         image_repls = super()._get_prompt_updates(
             mm_items=mm_items,
@@ -40,7 +40,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.quantization.gptq import GPTQConfig
 from vllm.model_executor.layers.quantization.gptq_marlin import (
     GPTQMarlinConfig)
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     NestedTensors)
 from vllm.multimodal.parse import (AudioItem, AudioProcessorItems,

@@ -316,7 +316,7 @@ class MiniCPMOMultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         base_updates = super()._get_prompt_updates(
             mm_items=mm_items,

@@ -48,7 +48,7 @@ from vllm.model_executor.models.minicpm import MiniCPMForCausalLM
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     NestedTensors)
 from vllm.multimodal.parse import (DictEmbeddingItems, ImageItem,

@@ -694,7 +694,7 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         placeholders = [("image", self.info.image_pattern),
                         ("video", self.info.video_pattern)]
@@ -23,7 +23,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
                                    MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,

@@ -265,7 +265,7 @@ class Mistral3MultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         hf_config = self.info.get_hf_config()

@@ -56,7 +56,8 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs,
-                                    MultiModalFieldConfig, MultiModalKwargs)
+                                    MultiModalFieldConfig,
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
                                    MultiModalDataItems)
 from vllm.multimodal.processing import (BaseProcessingInfo,
@@ -217,7 +218,7 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]
         # Set encoder prompt length based on the number of tiles.
         # This tells the block manager to allocate correct number
         # of slots for encoder tokens.
-        num_tiles = mm_inputs["mm_kwargs"]["num_tiles"]
+        num_tiles = mm_inputs["mm_kwargs"].get_data()["num_tiles"]
         decode_tiles = num_tiles[num_encode_images:num_images].sum().item()
         num_tokens = decode_tiles * token_per_chunk
         mm_inputs["encoder_prompt_token_ids"] = [image_token_id

@@ -302,7 +303,7 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         token_per_chunk = self.info.get_token_per_chunk_from_config()
         image_token_id = self.info.get_hf_config().image_token_index

@@ -44,7 +44,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs, NestedTensors)
+                                    MultiModalKwargsItems, NestedTensors)
 from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
                                    MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -646,13 +646,8 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo]
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> list[PromptUpdate]:
-        assert (
-            mm_items.get_count("image", strict=False) == 0
-            or "aspect_ratios" in out_mm_kwargs
-        ), "Transformers expect to include aspect_ratios in out_mm_kwargs"
-
         config = self.info.get_hf_config()
         vision_config = config.vision_config

@@ -662,7 +657,8 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo]
         img_patch_token = hf_processor.img_patch_token

         def get_replacement(item_idx: int):
-            aspect_ratio = out_mm_kwargs["aspect_ratios"][item_idx]
+            out_item = out_mm_kwargs["image"][item_idx]
+            aspect_ratio = out_item["aspect_ratios"].data

             repl = hf_processor._prompt_split_image(
                 aspect_ratio=aspect_ratio,
@@ -42,7 +42,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
                                    MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,

@@ -1282,7 +1282,7 @@ class MolmoMultiModalProcessor(BaseMultiModalProcessor[MolmoProcessingInfo]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)

@@ -16,7 +16,7 @@ from transformers import PretrainedConfig

 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargs
+from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    MultiModalDataItems)
 from vllm.multimodal.processing import (PromptReplacement, PromptUpdate,
@@ -106,18 +106,19 @@ class NVLMMultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)

-        if "image_num_patches" in out_mm_kwargs:
-            image_num_patches = out_mm_kwargs["image_num_patches"]
+        out_mm_data = out_mm_kwargs.get_data()
+        if "image_num_patches" in out_mm_data:
+            image_num_patches = out_mm_data["image_num_patches"]
             assert isinstance(image_num_patches, torch.Tensor)
             image_num_patches = image_num_patches.tolist()
-        elif "image_embeds" in out_mm_kwargs:
+        elif "image_embeds" in out_mm_data:
             # TODO: Use image size information in dictionary embedding inputs
             # to compute num_patches (similar to Qwen2-VL)
-            image_num_patches = [None] * len(out_mm_kwargs["image_embeds"])
+            image_num_patches = [None] * len(out_mm_data["image_embeds"])
         else:
             image_num_patches = []
@@ -42,7 +42,7 @@ from vllm.model_executor.models.utils import (AutoWeightsLoader, flatten_bn,
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import ImageSize, MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo, PromptReplacement)

@@ -375,11 +375,12 @@ class OvisMultiModalProcessor(BaseMultiModalProcessor[OvisProcessingInfo]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> list[PromptReplacement]:

-        def get_replacement_ovis(item_idx):
-            grid = out_mm_kwargs["grids"][item_idx]
+        def get_replacement_ovis(item_idx: int):
+            out_item = out_mm_kwargs["image"][item_idx]
+            grid = out_item["grids"].data

             hf_processor = self.info.get_hf_processor()
             return hf_processor.construct_image_placeholders(grid)
@@ -12,7 +12,7 @@ from vllm.logger import init_logger
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputs, MultiModalKwargs)
+                                    MultiModalInputs, MultiModalKwargsItems)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,

@@ -146,7 +146,7 @@ class PaliGemmaMultiModalProcessor(
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index

@@ -32,7 +32,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs)
+                                    MultiModalKwargsItems)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize, MultiModalDataItems)
 # yapf conflicts with isort for this block

@@ -410,7 +410,7 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, Any],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         image_tokens: list[str] = hf_processor.img_tokens  # type: ignore
@@ -30,7 +30,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs, NestedTensors)
+                                    MultiModalKwargsItems, NestedTensors)
 from vllm.multimodal.parse import (AudioProcessorItems, ImageEmbeddingItems,
                                    ImageProcessorItems, ImageSize,
                                    MultiModalDataItems, MultiModalDataParser)

@@ -1029,7 +1029,7 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, Any],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         tokenizer = self.info.get_tokenizer()
         image_token_id = tokenizer.vocab[tokenizer.image_token]

@@ -21,7 +21,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargs, NestedTensors)
+                                    MultiModalKwargsItems, NestedTensors)
 from vllm.multimodal.parse import (AudioProcessorItems, ImageEmbeddingItems,
                                    ImageProcessorItems, ImageSize,
                                    MultiModalDataItems, MultiModalDataParser)

@@ -802,7 +802,7 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
         self,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, Any],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         image_tokens: list[str] = self.info.image_tokens  # type: ignore
         audio_tokens: list[str] = self.info.audio_tokens  # type: ignore
@ -33,7 +33,7 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems
|
||||
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||
NestedTensors)
|
||||
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
|
||||
@ -273,7 +273,7 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
out_mm_kwargs: MultiModalKwargs,
|
||||
out_mm_kwargs: MultiModalKwargsItems,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
|
||||
|
||||
@ -309,7 +309,8 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]
|
||||
tokenization_kwargs: Mapping[str, object],
|
||||
*,
|
||||
return_mm_hashes: bool,
|
||||
) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]:
|
||||
) -> tuple[list[int], MultiModalKwargsItems, Optional[MultiModalHashes],
|
||||
bool]:
|
||||
(
|
||||
prompt_ids,
|
||||
mm_kwargs,
|
||||
|
||||
@ -34,7 +34,8 @@ from vllm.model_executor.models.utils import AutoWeightsLoader
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||
MultiModalFieldElem, MultiModalInputs,
|
||||
MultiModalKwargs, MultiModalKwargsItem,
|
||||
MultiModalKwargsItem,
|
||||
MultiModalKwargsItems,
|
||||
MultiModalSharedField, PlaceholderRange)
|
||||
from vllm.multimodal.parse import MultiModalDataItems
|
||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||
@ -88,7 +89,7 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor):
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
out_mm_kwargs: MultiModalKwargs,
|
||||
out_mm_kwargs: MultiModalKwargsItems,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
return []
|
||||
|
||||
@ -136,7 +137,7 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor):
|
||||
type="multimodal",
|
||||
prompt=prompt,
|
||||
prompt_token_ids=[1],
|
||||
mm_kwargs=MultiModalKwargs(multimodal_kwargs_items),
|
||||
mm_kwargs=MultiModalKwargsItems.from_seq(multimodal_kwargs_items),
|
||||
mm_hashes=None,
|
||||
mm_placeholders=mm_placeholders,
|
||||
)
|
||||
|
||||
@ -54,7 +54,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.inputs import (ImageItem, ModalityData,
|
||||
MultiModalDataDict, MultiModalFieldConfig,
|
||||
MultiModalKwargs, NestedTensors)
|
||||
MultiModalKwargsItems, NestedTensors)
|
||||
from vllm.multimodal.parse import (AudioProcessorItems, DictEmbeddingItems,
|
||||
ModalityDataItems, MultiModalDataItems,
|
||||
MultiModalDataParser)
|
||||
@ -265,7 +265,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
prompt_ids: list[int],
|
||||
mm_kwargs: MultiModalKwargs,
|
||||
mm_kwargs: MultiModalKwargsItems,
|
||||
is_update_applied: bool,
|
||||
) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]:
|
||||
"""
|
||||
@ -325,7 +325,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, Any],
|
||||
out_mm_kwargs: MultiModalKwargs,
|
||||
out_mm_kwargs: MultiModalKwargsItems,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
|
||||
tokenizer = self.info.get_tokenizer()
|
||||
@ -340,8 +340,9 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
|
||||
image_token_id = vocab[image_token]
|
||||
video_token_id = vocab[video_token]
|
||||
|
||||
audio_feature_lengths = out_mm_kwargs.get("audio_feature_lengths")
|
||||
feature_attention_mask = out_mm_kwargs.get("feature_attention_mask")
|
||||
out_mm_data = out_mm_kwargs.get_data()
|
||||
audio_feature_lengths = out_mm_data.get("audio_feature_lengths")
|
||||
feature_attention_mask = out_mm_data.get("feature_attention_mask")
|
||||
if audio_feature_lengths is None and feature_attention_mask is None:
|
||||
audio_output_lengths = []
|
||||
elif audio_feature_lengths is not None:
|
||||
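Where a processor still wants key-level access across all items, as in the hunk above, it goes through get_data(), which collates the per-item field elements back into one kwargs mapping. A minimal sketch, reusing the Qwen2.5-Omni audio fields shown here:

    out_mm_data = out_mm_kwargs.get_data()              # kwarg name -> batched tensor
    lengths = out_mm_data.get("audio_feature_lengths")  # None when no audio was processed
    mask = out_mm_data.get("feature_attention_mask")    # likewise optional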
@ -371,7 +372,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
return [audio_token_id] * num_features

def get_replacement_qwen2_vision(item_idx: int, modality: str):
grid_thw = out_mm_kwargs[f"{modality}_grid_thw"][item_idx]
grid_thw = out_mm_data[f"{modality}_grid_thw"][item_idx]
assert isinstance(grid_thw, torch.Tensor)
merge_length = image_processor.merge_size**2

@ -387,7 +388,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(

audio_num_features = audio_output_lengths[audio_in_video_item_idx +
item_idx]
video_grid_thw = out_mm_kwargs["video_grid_thw"][item_idx]
video_grid_thw = out_mm_data["video_grid_thw"][item_idx]

audio_in_video_item_idx += 1

@ -37,7 +37,7 @@ from vllm.config import VllmConfig
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs)
MultiModalKwargsItems)
from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems,
MultiModalDataParser)
from vllm.multimodal.processing import (BaseMultiModalProcessor,

@ -182,7 +182,7 @@ class Qwen2AudioMultiModalProcessor(
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]:
processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
tokenizer = self.info.get_tokenizer()

@ -199,7 +199,8 @@ class Qwen2AudioMultiModalProcessor(
audio_bos_id = vocab[audio_bos_token]
audio_eos_id = vocab[audio_eos_token]

feature_attention_mask = out_mm_kwargs.get("feature_attention_mask")
out_mm_data = out_mm_kwargs.get_data()
feature_attention_mask = out_mm_data.get("feature_attention_mask")
if feature_attention_mask is None:
audio_output_lengths = []
else:

@ -58,7 +58,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (ImageItem, ModalityData,
MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs, VideoItem)
MultiModalKwargsItems, VideoItem)
from vllm.multimodal.parse import (DictEmbeddingItems, ImageSize,
ModalityDataItems, MultiModalDataItems,
MultiModalDataParser)

@ -975,7 +975,7 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, Any],
out_mm_kwargs: MultiModalKwargs,
out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]:
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
image_processor = self.info.get_image_processor(

@ -991,7 +991,8 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
merge_length = image_processor.merge_size**2

def get_replacement_qwen2vl(item_idx: int, modality: str):
grid_thw = out_mm_kwargs[f"{modality}_grid_thw"][item_idx]
out_item = out_mm_kwargs[modality][item_idx]
grid_thw = out_item[f"{modality}_grid_thw"].data
assert isinstance(grid_thw, torch.Tensor)

num_tokens = int(grid_thw.prod()) // merge_length
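For reference, the placeholder count above follows directly from the grid. A worked example with hypothetical numbers (a merge_size of 2 is an assumption for illustration, not a value taken from this diff):

    import torch

    grid_thw = torch.tensor([1, 32, 32])  # hypothetical (t, h, w) patch grid
    merge_length = 2 ** 2                 # image_processor.merge_size ** 2
    num_tokens = int(grid_thw.prod()) // merge_length  # 1024 // 4 == 256 placeholders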
@ -33,7 +33,7 @@ from vllm.model_executor.layers.resampler import Resampler2, get_abs_pos
from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs)
MultiModalKwargsItems)
from vllm.multimodal.parse import MultiModalDataItems
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptReplacement,

@ -627,7 +627,7 @@ class QwenVLMultiModalProcessor(BaseMultiModalProcessor[QwenVLProcessingInfo]):
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]:
tokenizer = self.info.get_tokenizer()
special_tokens: dict[str,

@ -26,7 +26,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs, NestedTensors)
MultiModalKwargsItems, NestedTensors)
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
ImageSize, MultiModalDataItems)
from vllm.multimodal.processing import (BaseMultiModalProcessor,

@ -552,18 +552,19 @@ class SkyworkR1VMultiModalProcessor(
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]:
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)

if "image_num_patches" in out_mm_kwargs:
image_num_patches = out_mm_kwargs["image_num_patches"]
out_mm_data = out_mm_kwargs.get_data()
if "image_num_patches" in out_mm_data:
image_num_patches = out_mm_data["image_num_patches"]
assert isinstance(image_num_patches, torch.Tensor)
image_num_patches = image_num_patches.tolist()
elif "image_embeds" in out_mm_kwargs:
elif "image_embeds" in out_mm_data:
# TODO: Use image size information in dictionary embedding inputs
# to compute num_patches (similar to Qwen2-VL)
image_num_patches = [None] * len(out_mm_kwargs["image_embeds"])
image_num_patches = [None] * len(out_mm_data["image_embeds"])
else:
image_num_patches = []

@ -28,7 +28,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs, NestedTensors)
MultiModalKwargsItems, NestedTensors)
from vllm.multimodal.parse import ImageSize, MultiModalDataItems
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptReplacement,

@ -520,20 +520,18 @@ class Step3VLMultiModalProcessor(BaseMultiModalProcessor[Step3VLProcessingInfo]
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, Any],
out_mm_kwargs: MultiModalKwargs,
out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]:
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
image_placeholder_token_id = hf_processor.image_token_id
batch_num_patches = out_mm_kwargs["num_patches"].tolist()

def get_replacement_step1o(item_idx: int):
img_out = out_mm_kwargs.get_item("image", item_idx)
num_patches = batch_num_patches[item_idx]
out_item = out_mm_kwargs["image"][item_idx]
num_patches = int(out_item["num_patches"].data)
if num_patches > 0:
patch_newline_mask = img_out["patch_newline_mask"].data.tolist(
)
patch_newline_mask = out_item["patch_newline_mask"].data
image_repl_ids = hf_processor._get_image_repl_features(
1, num_patches, patch_newline_mask)[1]
1, num_patches, patch_newline_mask.tolist())[1]
else:
image_repl_ids = hf_processor._get_image_repl_features(
1, 0, None)[1]

@ -25,7 +25,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.models.llava import LlavaDummyInputsBuilder
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargsItems
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
ImageSize, MultiModalDataItems)
from vllm.multimodal.processing import (BaseMultiModalProcessor,

@ -275,7 +275,7 @@ class TarsierMultiModalProcessor(BaseMultiModalProcessor[_I_Tarsier]):
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]:
hf_config = self.info.get_hf_config()
image_token_id = hf_config.image_token_index # The <IMAGE> token ID

@ -41,7 +41,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalInputs, PlaceholderRange)
from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems

@ -237,7 +237,7 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
out_mm_kwargs: MultiModalKwargsItems,
):
"""
Given the original multi-modal items for this modality

@ -372,7 +372,7 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
mm_tokens_per_modality["num_image_patches"]
) if "num_image_patches" in mm_tokens_per_modality else None
processed_data['num_image_patches'] = num_image_patches
mm_kwargs = MultiModalKwargs.from_hf_inputs(
mm_kwargs = MultiModalKwargsItems.from_hf_inputs(
processed_data,
self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs,
num_image_patches),

@ -23,7 +23,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs, NestedTensors)
MultiModalKwargsItems, NestedTensors)
from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptReplacement,

@ -194,7 +194,7 @@ class UltravoxMultiModalProcessor(
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, Any],
out_mm_kwargs: MultiModalKwargs,
out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]:
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)

@ -203,7 +203,8 @@ class UltravoxMultiModalProcessor(
# Each audio can be split into multiple chunks.
# chunks_start_idx[i] indicates the start index of the chunks
# belonging to the i-th audio.
num_chunks = out_mm_kwargs.get("audio_num_chunks", torch.zeros(0))
out_mm_data = out_mm_kwargs.get_data()
num_chunks = out_mm_data.get("audio_num_chunks", torch.zeros(0))
chunks_start_idx: torch.Tensor = torch.cumsum(num_chunks,
dim=0,
dtype=torch.int32)

@ -213,7 +214,7 @@ class UltravoxMultiModalProcessor(
def get_replacement_ultravox(item_idx: int):
start = chunks_start_idx[item_idx]
end = chunks_start_idx[item_idx + 1]
audio_token_len = out_mm_kwargs["audio_token_len"][start:end].sum()
audio_token_len = out_mm_data["audio_token_len"][start:end].sum()
return [replacement_id] * int(audio_token_len) # type: ignore

return [

@ -31,7 +31,7 @@ from vllm.model_executor.models.whisper import WhisperEncoder
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs, NestedTensors)
MultiModalKwargsItems, NestedTensors)
from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems,
MultiModalDataParser)
from vllm.multimodal.processing import (BaseMultiModalProcessor,

@ -259,7 +259,7 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo]
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]:
processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)

@ -289,7 +289,8 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo]
tokenization_kwargs: Mapping[str, object],
*,
return_mm_hashes: bool,
) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]:
) -> tuple[list[int], MultiModalKwargsItems, Optional[MultiModalHashes],
bool]:
prompt_ids, mm_kwargs, mm_hashes, _ = super(
)._cached_apply_hf_processor(
prompt=prompt,

@ -33,7 +33,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs)
MultiModalKwargsItems)
from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser
from vllm.multimodal.processing import (BaseProcessingInfo,
EncDecMultiModalProcessor,

@ -728,7 +728,7 @@ class WhisperMultiModalProcessor(
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]:
num_tokens = self.info.get_num_audio_tokens()
return [

@ -4,7 +4,8 @@ from .base import MultiModalPlaceholderMap
from .hasher import MultiModalHashDict, MultiModalHasher
from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins,
MultiModalDataDict, MultiModalKwargs,
MultiModalPlaceholderDict, NestedTensors)
MultiModalKwargsItems, MultiModalPlaceholderDict,
NestedTensors)
from .registry import MultiModalRegistry

MULTIMODAL_REGISTRY = MultiModalRegistry()

@ -25,6 +26,7 @@ __all__ = [
"MultiModalHashDict",
"MultiModalHasher",
"MultiModalKwargs",
"MultiModalKwargsItems",
"MultiModalPlaceholderDict",
"MultiModalPlaceholderMap",
"NestedTensors",

@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Generic, NamedTuple, TypeVar
if TYPE_CHECKING:
from vllm.sequence import SequenceGroupMetadata

from .inputs import MultiModalKwargs, NestedTensors, PlaceholderRange
from .inputs import MultiModalKwargs, PlaceholderRange

_T = TypeVar("_T")

@ -56,8 +56,7 @@ class MultiModalPlaceholderMap:
@classmethod
def from_seq_group(
cls, seq_group: "SequenceGroupMetadata", positions: range
) -> tuple[dict[str, NestedTensors], dict[str,
"MultiModalPlaceholderMap"]]:
) -> tuple[MultiModalKwargs, dict[str, "MultiModalPlaceholderMap"]]:
"""
Returns the multi-modal items that intersect with the portion of a
prompt (``seq_group``) represented by ``positions``, as well as a

@ -100,7 +99,7 @@ class MultiModalPlaceholderMap:
seq_mm_placeholders = seq_group.multi_modal_placeholders

if not seq_mm_data or not seq_mm_placeholders:
return MultiModalKwargs().get_data(), {}
return MultiModalKwargs(), {}

placeholder_maps = dict[str, MultiModalPlaceholderMap]()

@ -117,8 +116,6 @@ class MultiModalPlaceholderMap:

placeholder_maps[modality] = placeholder_map

seq_mm_data = seq_mm_data if isinstance(
seq_mm_data, dict) else seq_mm_data.get_data()
return seq_mm_data, placeholder_maps

def append_items_from_seq_group(

@ -11,7 +11,9 @@ from vllm.logger import init_logger
from vllm.utils import GiB_bytes, LRUCache
from vllm.utils.jsontree import json_map_leaves, json_reduce_leaves

from .inputs import MultiModalKwargs, MultiModalKwargsItem, NestedTensors
from .inputs import (MultiModalFieldElem, MultiModalKwargs,
MultiModalKwargsItem, MultiModalKwargsItems,
NestedTensors)

logger = init_logger(__name__)

@ -26,8 +28,9 @@ class MultiModalCacheItemMetadata:


MultiModalCacheValue = Union[
MultiModalKwargs,
MultiModalKwargsItems,
MultiModalKwargsItem,
MultiModalKwargs,
Mapping[str, NestedTensors],
MultiModalCacheItemMetadata,
]

@ -44,14 +47,16 @@ class MultiModalCache:
*,
debug: bool = False,
) -> int:
# MultiModalKwargs is not a subclass of dict
if isinstance(leaf, MultiModalKwargs):
return cls.get_item_size(leaf.get_data(), debug=debug)
if isinstance(leaf, MultiModalFieldElem):
return cls.get_item_size(leaf.data) # type: ignore

# MultiModalKwargsItem is not a subclass of dict
# These are not subclasses of dict
if isinstance(leaf, MultiModalKwargsItems):
return cls.get_item_size(leaf.data) # type: ignore
if isinstance(leaf, MultiModalKwargsItem):
leaf_data = {k: v.data for k, v in leaf.items()}
return cls.get_item_size(leaf_data, debug=debug)
return cls.get_item_size(leaf.data) # type: ignore
if isinstance(leaf, MultiModalKwargs):
return cls.get_item_size(leaf.data) # type: ignore

# sys.getsizeof doesn't work for tensors
if isinstance(leaf, torch.Tensor):
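The size accounting above now unwraps each wrapper type before it reaches the tensor leaves. A minimal sketch of the unwrapping chain, assuming a single image item with a hypothetical pixel_values field (the final tensor-sizing step is not shown in this hunk; element_size() * nelement() is one plausible measure):

    item = items["image"][0]      # MultiModalKwargsItems -> MultiModalKwargsItem
    elem = item["pixel_values"]   # MultiModalKwargsItem -> MultiModalFieldElem
    tensor = elem.data            # MultiModalFieldElem -> torch.Tensor
    nbytes = tensor.element_size() * tensor.nelement()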
@ -11,7 +11,7 @@ from typing import (TYPE_CHECKING, Any, Literal, Optional, TypedDict, TypeVar,
Union, cast, final)

import numpy as np
from typing_extensions import NotRequired, TypeAlias
from typing_extensions import NotRequired, TypeAlias, deprecated

from vllm.utils import LazyLoader, full_groupby, is_list_of
from vllm.utils.jsontree import JSONTree, json_map_leaves

@ -656,7 +656,7 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
def __init__(self, data: Mapping[str, MultiModalFieldElem] = {}) -> None:
super().__init__(data)

modalities = {elem.modality for elem in self.data.values()}
modalities = {elem.modality for elem in self.values()}
assert len(modalities) == 1, f"Found different modalities={modalities}"
self._modality = next(iter(modalities))

@ -668,16 +668,11 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
return {key: elem.data for key, elem in self.items()}


class MultiModalKwargs:
class MultiModalKwargsItems(UserDict[str, Sequence[MultiModalKwargsItem]]):
"""
A dictionary that represents the keyword arguments to
[`torch.nn.Module.forward`][].

The metadata `items` enables us to obtain the keyword arguments
corresponding to each data item in
[`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems], via
[`get_item`][vllm.multimodal.inputs.MultiModalKwargs.get_item] and
[`get_items`][vllm.multimodal.inputs.MultiModalKwargs.get_items].
A dictionary of
[`MultiModalKwargsItem`][vllm.multimodal.inputs.MultiModalKwargsItem]s
by modality.
"""

@staticmethod

@ -712,19 +707,64 @@ class MultiModalKwargs:
elems = [v[item_idx] for v in elems_in_modality.values()]
items.append(MultiModalKwargsItem.from_elems(elems))

return MultiModalKwargs(items)

def __init__(self, items: Sequence[MultiModalKwargsItem] = ()) -> None:
super().__init__()
return MultiModalKwargsItems.from_seq(items)

@staticmethod
def from_seq(items: Sequence[MultiModalKwargsItem]):
items_by_modality = full_groupby(items, key=lambda x: x.modality)
self._items_by_modality = dict(items_by_modality)
return MultiModalKwargsItems(items_by_modality)

self._data: Optional[dict[str, NestedTensors]] = None
def __getitem__(self, modality: str):
if modality not in self:
raise KeyError(f"Modality {modality!r} not found. "
f"Available modalities: {set(self.keys())}")

@property
def modalities(self):
return self._items_by_modality.keys()
return super().__getitem__(modality)

def get_data(self, *, pin_memory: bool = False) -> "MultiModalKwargs":
elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list)
for items in self.values():
for item in items:
for key, elem in item.items():
elems_by_key[key].append(elem)

return MultiModalKwargs({
key:
elems[0].field.reduce_data(elems, pin_memory=pin_memory)
for key, elems in elems_by_key.items() if len(elems) > 0
})


class MultiModalKwargs(UserDict[str, NestedTensors]):
"""
A dictionary that represents the keyword arguments to
[`torch.nn.Module.forward`][].
"""

@staticmethod
@deprecated("`MultiModalKwargs.from_hf_inputs` is deprecated and "
"will be removed in v0.13. "
"Please use `MultiModalKwargsItems.from_hf_inputs` and "
"access the tensor data using `.get_data()`.")
def from_hf_inputs(
hf_inputs: "BatchFeature",
config_by_key: Mapping[str, MultiModalFieldConfig],
):
return MultiModalKwargsItems.from_hf_inputs(hf_inputs, config_by_key) \
.get_data()

@staticmethod
@deprecated("`MultiModalKwargs.from_items` is deprecated and "
"will be removed in v0.13. "
"Please use `MultiModalKwargsItems.from_seq` and "
"access the tensor data using `.get_data()`.")
def from_items(
items: Sequence[MultiModalKwargsItem],
*,
pin_memory: bool = False,
):
return MultiModalKwargsItems.from_seq(items) \
.get_data(pin_memory=pin_memory)

@staticmethod
def _try_stack(nested_tensors: NestedTensors,
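Taken together, the split gives two views over the same processed data: an item view for caching and prompt updates, and a tensor view for model forward. A minimal usage sketch, assuming image_item is a MultiModalKwargsItem produced by a processor and carries a hypothetical pixel_values field:

    # item view: modality -> list of per-item kwargs
    items = MultiModalKwargsItems.from_seq([image_item])
    elem = items["image"][0]["pixel_values"]   # MultiModalFieldElem for one item

    # tensor view: plain kwargs for torch.nn.Module.forward, reduced across items
    data = items.get_data(pin_memory=True)     # a MultiModalKwargs (UserDict)
    pixel_values = data["pixel_values"]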
@ -813,92 +853,24 @@ class MultiModalKwargs:

return cast(BatchedTensorInputs, json_mapped)

def keys(self):
return self.get_data().keys()

def values(self):
return self.get_data().values()

def items(self):
return self.get_data().items()

def get(self, key: str, /, default=None):
return self.get_data().get(key, default)

def pop(self, key: str, *args, **kwargs):
data = dict(self.get_data())
res = data.pop(key, *args, **kwargs)

for items in self._items_by_modality.values():
for item in items:
item.pop(key, *args, **kwargs)

self._data = None

return res

def __iter__(self):
return iter(self.get_data())

def __getitem__(self, key: str):
return self.get_data()[key]
if key not in self:
raise KeyError(f"Keyword argument {key!r} not found. "
f"Available keys: {set(self.keys())}")

return super().__getitem__(key)

def __eq__(self, other: object) -> bool:
if not isinstance(other, self.__class__):
return False

return self._items_by_modality == other._items_by_modality
for k in self:
if k not in other:
return False
if not nested_tensors_equal(self[k], other[k]):
return False

def _validate_modality(self, method_name: str, modality: str) -> None:
if not self._items_by_modality:
raise RuntimeError(
f"`{method_name}` is not supported when "
"MultiModalKwargs is not initialized with `items`")

if modality not in self._items_by_modality:
available_modalities = set(self._items_by_modality.keys())
raise KeyError(f"Modality {modality!r} not found. "
f"Available modalities: {available_modalities}")

def get_item_count(self, modality: str) -> int:
"""Get the number of items belonging to a modality."""
self._validate_modality("get_item_count", modality)
return len(self._items_by_modality[modality])

def get_item(self, modality: str, item_index: int) -> MultiModalKwargsItem:
"""
Get the keyword arguments corresponding to an item identified by
its modality and index.
"""
self._validate_modality("get_item", modality)
return self._items_by_modality[modality][item_index]

def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]:
"""
Get the keyword arguments corresponding to each item belonging to
a modality.
"""
self._validate_modality("get_items", modality)
return self._items_by_modality[modality]

def get_data(self,
*,
pin_memory: bool = False) -> dict[str, NestedTensors]:
if self._data is not None:
return self._data

elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list)
for items in self._items_by_modality.values():
for item in items:
for key, elem in item.items():
elems_by_key[key].append(elem)

data = {
key: elems[0].field.reduce_data(elems, pin_memory=pin_memory)
for key, elems in elems_by_key.items() if len(elems) > 0
}
self._data = data
return data
return True


MultiModalPlaceholderDict: TypeAlias = Mapping[str, Sequence[PlaceholderRange]]
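With the delegating accessors above removed, MultiModalKwargs behaves as an ordinary mapping of batched tensors, and equality is checked key by key via nested_tensors_equal. A minimal sketch of the resulting semantics (key name and shape are illustrative):

    import torch

    a = MultiModalKwargs({"pixel_values": torch.zeros(1, 3)})
    b = MultiModalKwargs({"pixel_values": torch.zeros(1, 3)})
    assert a == b                  # element-wise comparison of the nested tensors
    assert a["pixel_values"].shape == (1, 3)  # plain dict access, no lazy get_data()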
@ -926,7 +898,7 @@ class MultiModalInputs(TypedDict):
token_type_ids: NotRequired[list[int]]
"""The token type IDs of the prompt."""

mm_kwargs: MultiModalKwargs
mm_kwargs: MultiModalKwargsItems
"""Keyword arguments to be directly passed to the model after batching."""

mm_hashes: Optional["MultiModalHashDict"]

@ -16,7 +16,7 @@ from vllm.utils import LazyLoader, is_list_of
from .audio import AudioResampler
from .inputs import (AudioItem, HfAudioItem, HfImageItem, HfVideoItem,
ImageItem, ModalityData, MultiModalDataDict,
MultiModalFieldConfig, MultiModalKwargs, VideoItem)
MultiModalFieldConfig, MultiModalKwargsItems, VideoItem)

_T = TypeVar("_T")
_I = TypeVar("_I")

@ -157,19 +157,16 @@ class DictEmbeddingItems(ModalityDataItems[Mapping[str, torch.Tensor],
self.fields_config = fields_config
self.required_fields = required_fields

self._kwargs = MultiModalKwargs.from_hf_inputs(
self._kwargs = MultiModalKwargsItems.from_hf_inputs(
BatchFeature(dict(data)),
fields_config,
)

def get_count(self) -> int:
return self._kwargs.get_item_count(self.modality)
return len(self._kwargs[self.modality])

def get(self, index: int) -> Mapping[str, torch.Tensor]:
return {
k: v.data
for k, v in self._kwargs.get_item(self.modality, index).items()
}
return self._kwargs[self.modality][index].get_data()

def get_processor_data(self) -> Mapping[str, object]:
return {}

@ -23,8 +23,9 @@ from vllm.utils import flatten_2d_lists, full_groupby
from .cache import MultiModalCache
from .hasher import MultiModalHasher
from .inputs import (MultiModalDataDict, MultiModalEncDecInputs,
MultiModalFieldConfig, MultiModalInputs, MultiModalKwargs,
MultiModalKwargsItem, PlaceholderRange)
MultiModalFieldConfig, MultiModalInputs,
MultiModalKwargsItem, MultiModalKwargsItems,
PlaceholderRange)
from .parse import (DictEmbeddingItems, EmbeddingItems, MultiModalDataItems,
MultiModalDataParser)

@ -985,7 +986,7 @@ _I = TypeVar("_I", bound=BaseProcessingInfo)
MultiModalHashes = dict[str, list[str]]
"""
A collection of hashes with a similar structure as
[`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs].
[`MultiModalKwargsItems`][vllm.multimodal.inputs.MultiModalKwargsItems].
"""


@ -1095,7 +1096,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]:
"""
Given the original multi-modal items for this modality

@ -1361,7 +1362,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
self,
cache: ProcessingCache,
mm_cache_items_or_hashes: dict[str, list[_CacheItemOrHash]],
mm_missing_kwargs: MultiModalKwargs,
mm_missing_kwargs: MultiModalKwargsItems,
) -> dict[str, list[MultiModalKwargsItem]]:
mm_missing_next_idx = defaultdict[str, int](lambda: 0)

@ -1369,10 +1370,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
for modality, items_or_hashes in mm_cache_items_or_hashes.items():
for item_or_hash in items_or_hashes:
if isinstance(item_or_hash, str):
kw_item = mm_missing_kwargs.get_item(
modality,
mm_missing_next_idx[modality],
)
kw_item = mm_missing_kwargs[modality][
mm_missing_next_idx[modality]]
cache.put(item_or_hash, kw_item)
mm_missing_next_idx[modality] += 1
else:

@ -1390,7 +1389,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
tokenization_kwargs: Mapping[str, object],
*,
return_mm_hashes: bool,
) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]:
) -> tuple[list[int], MultiModalKwargsItems, Optional[MultiModalHashes],
bool]:
(
prompt_ids,
mm_processed_data,

@ -1403,7 +1403,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
enable_hf_prompt_update=True,
)

mm_kwargs = MultiModalKwargs.from_hf_inputs(
mm_kwargs = MultiModalKwargsItems.from_hf_inputs(
mm_processed_data,
self._get_mm_fields_config(mm_processed_data,
hf_processor_mm_kwargs),

@ -1423,7 +1423,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
tokenization_kwargs: Mapping[str, object],
*,
return_mm_hashes: bool,
) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]:
) -> tuple[list[int], MultiModalKwargsItems, Optional[MultiModalHashes],
bool]:
"""
Apply the HF processor on the full prompt text,
caching the results and reusing cached results.

@ -1468,7 +1469,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
enable_hf_prompt_update=False,
)

mm_missing_kwargs = MultiModalKwargs.from_hf_inputs(
mm_missing_kwargs = MultiModalKwargsItems.from_hf_inputs(
mm_missing_processed_data,
self._get_mm_fields_config(mm_missing_processed_data,
hf_processor_mm_kwargs),

@ -1480,7 +1481,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
mm_missing_kwargs=mm_missing_kwargs,
)

mm_kwargs = MultiModalKwargs([
mm_kwargs = MultiModalKwargsItems.from_seq([
item for cache_items in mm_cache_items_merged.values()
for item in cache_items
])

@ -1585,14 +1586,11 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):

def _validate_mm_kwargs(
self,
mm_kwargs: MultiModalKwargs,
mm_kwargs: MultiModalKwargsItems,
mm_item_counts: Mapping[str, int],
) -> None:
for modality, item_count in mm_item_counts.items():
if modality in mm_kwargs.modalities:
items = mm_kwargs.get_items(modality)
else:
items = []
items = mm_kwargs.get(modality, [])

if len(items) != item_count:
raise RuntimeError(

@ -1630,7 +1628,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
prompt_ids: list[int],
mm_kwargs: MultiModalKwargs,
mm_kwargs: MultiModalKwargsItems,
is_update_applied: bool,
) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]:
unbound_prompt_updates = self._get_prompt_updates(

@ -13,7 +13,7 @@ import vllm.envs as envs
from vllm.logger import init_logger

from .inputs import (MultiModalDataDict, MultiModalEncDecInputs,
MultiModalInputs, MultiModalKwargs,
MultiModalInputs, MultiModalKwargsItems,
MultiModalPlaceholderDict)
from .processing import (BaseMultiModalProcessor, BaseProcessingInfo,
EncDecMultiModalProcessor)

@ -43,7 +43,7 @@ class DummyDecoderData(NamedTuple):
"""Dummy data used for profiling."""

prompt_token_ids: list[int]
multi_modal_data: MultiModalKwargs
multi_modal_data: MultiModalKwargsItems
multi_modal_placeholders: MultiModalPlaceholderDict


@ -32,11 +32,13 @@ _M = TypeVar("_M")

if TYPE_CHECKING:
from .inputs import (BatchedTensorInputs, MultiModalKwargs,
MultiModalKwargsItem, MultiModalPlaceholderDict)
MultiModalKwargsItem, MultiModalKwargsItems,
MultiModalPlaceholderDict)
else:
BatchedTensorInputs = Any
MultiModalKwargs = Any
MultiModalKwargsItem = Any
MultiModalKwargsItems = Any
MultiModalPlaceholderDict = Any

global_thread_pool = ThreadPoolExecutor(

@ -359,18 +361,20 @@ def argsort_mm_positions(
"`group_mm_kwargs_by_modality` and will be removed in v0.13. "
"Please use `group_mm_kwargs_by_modality` instead.")
def group_mm_inputs_by_modality(
mm_inputs: list[MultiModalKwargs]) -> list[list[MultiModalKwargs]]:
mm_inputs: list[MultiModalKwargsItems]
) -> list[list[MultiModalKwargsItems]]:
if not mm_inputs:
return []

def modality_group_func(mm_input: MultiModalKwargs) -> Union[str, int]:
def modality_group_func(
mm_input: MultiModalKwargsItems) -> Union[str, int]:
# If the input has multiple modalities, return a id as the unique key
# for the mm_input input.
if len(mm_input.modalities) > 1:
if len(mm_input) > 1:
return id(mm_input)

elif len(mm_input.modalities) == 1:
return list(mm_input.modalities)[0]
elif len(mm_input) == 1:
return next(iter(mm_input.keys()))

# FIXME(Isotr0py): Modality of mm_input from legacy pipeline is empty,
# this is used to make InternVL with legacy pipeline still work with v1.

@ -397,12 +401,12 @@ def group_mm_kwargs_by_modality(
Yields:
A tuple `(modality, num_items, grouped_kwargs)`.
"""
from vllm.multimodal.inputs import MultiModalKwargs
from vllm.multimodal.inputs import MultiModalKwargs, MultiModalKwargsItems

for modality, items in groupby(mm_kwargs, key=lambda item: item.modality):
items_lst = list(items)

# mm_kwargs_group = MultiModalKwargs(items_lst) \
# mm_kwargs_group = MultiModalKwargsItems.from_items(items_lst) \
# .get_data(pin_memory=pin_memory)

# if device is not None:

@ -417,7 +421,10 @@ def group_mm_kwargs_by_modality(
# We will also need to update each model to remove `flatten_bn`.
mm_kwargs_group = MultiModalKwargs.as_kwargs(
MultiModalKwargs.batch(
[MultiModalKwargs([item]) for item in items_lst],
[
MultiModalKwargsItems.from_seq([item]).get_data()
for item in items_lst
],
pin_memory=pin_memory,
),
device=device,
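For reference, a rough sketch of how a runner consumes this helper after the change; only the grouping call comes from this diff, and the model invocation is illustrative:

    for modality, num_items, mm_kwargs_group in group_mm_kwargs_by_modality(
            mm_kwargs_items,  # a flat list of MultiModalKwargsItem
            device=device,
            pin_memory=True,
    ):
        # mm_kwargs_group holds batched tensors for num_items items of one modality
        embeddings = model.get_multimodal_embeddings(**mm_kwargs_group)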
@ -22,7 +22,6 @@ from vllm.pooling_params import PoolingParams
from vllm.sampling_params import RequestOutputKind, SamplingParams

if TYPE_CHECKING:
from vllm.multimodal.inputs import NestedTensors
from vllm.v1.worker.kv_connector_model_runner_mixin import (
KVConnectorOutput)

@ -523,7 +522,7 @@ class Sequence:
@property
def multi_modal_data(self) -> MultiModalKwargs:
if self.inputs["type"] == "multimodal":
return self.inputs["mm_kwargs"]
return self.inputs["mm_kwargs"].get_data()

return MultiModalKwargs()

@ -979,8 +978,7 @@ class SequenceGroupMetadata(
state: Optional[SequenceGroupState] = msgspec.field(
default_factory=lambda: SequenceGroupState())
token_type_ids: Optional[list[int]] = None
multi_modal_data: Optional[Union[MultiModalKwargs,
dict[str, "NestedTensors"]]] = None
multi_modal_data: Optional[MultiModalKwargs] = None
multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None
encoder_seq_data: Optional[SequenceData] = None
cross_block_table: Optional[list[int]] = None

@ -310,7 +310,7 @@ class Processor:
sorted_mm_idxs = argsort_mm_positions(decoder_mm_positions)

orig_sorted_mm_inputs = [
decoder_mm_inputs.get_item(modality, idx)
decoder_mm_inputs[modality][idx]
for modality, idx in sorted_mm_idxs
]
sorted_mm_positions = [

@ -18,12 +18,15 @@ from msgspec import msgpack

from vllm import envs
from vllm.logger import init_logger
# yapf: disable
from vllm.multimodal.inputs import (BaseMultiModalField,
MultiModalBatchedField,
MultiModalFieldConfig, MultiModalFieldElem,
MultiModalFlatField, MultiModalKwargs,
MultiModalKwargsItem,
MultiModalKwargsItems,
MultiModalSharedField, NestedTensors)
# yapf: enable
from vllm.v1.engine import UtilityResult

logger = init_logger(__name__)

@ -116,12 +119,11 @@ class MsgpackEncoder:
if isinstance(obj, MultiModalKwargsItem):
return self._encode_mm_item(obj)

if isinstance(obj, MultiModalKwargsItems):
return self._encode_mm_items(obj)

if isinstance(obj, MultiModalKwargs):
return [
self._encode_mm_item(item)
for itemlist in obj._items_by_modality.values()
for item in itemlist
]
return self._encode_mm_kwargs(obj)

if isinstance(obj, UtilityResult):
result = obj.result

@ -183,6 +185,12 @@ class MsgpackEncoder:
dtype = str(obj.dtype).removeprefix("torch.")
return dtype, obj.shape, data

def _encode_mm_items(self, items: MultiModalKwargsItems) -> dict[str, Any]:
return {
modality: [self._encode_mm_item(item) for item in itemlist]
for modality, itemlist in items.items()
}

def _encode_mm_item(self,
item: MultiModalKwargsItem) -> list[dict[str, Any]]:
return [self._encode_mm_field_elem(elem) for elem in item.values()]

@ -200,6 +208,12 @@ class MsgpackEncoder:
self._encode_mm_field(elem.field),
}

def _encode_mm_kwargs(self, kw: MultiModalKwargs) -> dict[str, Any]:
return {
modality: self._encode_nested_tensors(data)
for modality, data in kw.items()
}

def _encode_nested_tensors(self, nt: NestedTensors) -> Any:
if isinstance(nt, torch.Tensor):
return self._encode_tensor(nt)

@ -260,8 +274,10 @@ class MsgpackDecoder:
return slice(*obj)
if issubclass(t, MultiModalKwargsItem):
return self._decode_mm_item(obj)
if issubclass(t, MultiModalKwargsItems):
return self._decode_mm_items(obj)
if issubclass(t, MultiModalKwargs):
return MultiModalKwargs(self._decode_mm_items(obj))
return self._decode_mm_kwargs(obj)
if t is UtilityResult:
return self._decode_utility_result(obj)
return obj

@ -315,8 +331,11 @@ class MsgpackDecoder:
# Convert back to proper shape & type
return arr.view(torch_dtype).view(shape)

def _decode_mm_items(self, obj: list[Any]) -> list[MultiModalKwargsItem]:
return [self._decode_mm_item(v) for v in obj]
def _decode_mm_items(self, obj: dict[str, Any]) -> MultiModalKwargsItems:
return MultiModalKwargsItems({
modality: [self._decode_mm_item(item) for item in itemlist]
for modality, itemlist in obj.items()
})

def _decode_mm_item(self, obj: list[Any]) -> MultiModalKwargsItem:
return MultiModalKwargsItem.from_elems(

@ -339,6 +358,12 @@ class MsgpackDecoder:
obj["field"] = factory_meth(None, *field_args).field
return MultiModalFieldElem(**obj)

def _decode_mm_kwargs(self, obj: dict[str, Any]) -> MultiModalKwargs:
return MultiModalKwargs({
modality: self._decode_nested_tensors(data)
for modality, data in obj.items()
})

def _decode_nested_tensors(self, obj: Any) -> NestedTensors:
if isinstance(obj, (int, float)):
# Although it violates NestedTensors type, MultiModalKwargs
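A rough round-trip sketch of the resulting wire format, assuming the encoder/decoder pair defined in this file (constructor details elided; items is a MultiModalKwargsItems):

    encoder = MsgpackEncoder()
    decoder = MsgpackDecoder(MultiModalKwargsItems)

    payload = encoder.encode(items)     # serialized as modality -> [encoded items]
    restored = decoder.decode(payload)  # reconstructed MultiModalKwargsItems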
@ -10,8 +10,8 @@ import torch
from typing_extensions import deprecated

from vllm.lora.request import LoRARequest
from vllm.multimodal.inputs import (MultiModalKwargs, MultiModalKwargsItem,
PlaceholderRange)
from vllm.multimodal.inputs import (MultiModalKwargsItem,
MultiModalKwargsItems, PlaceholderRange)
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams, SamplingType
from vllm.utils import swap_dict_values

@ -57,8 +57,10 @@ class CachedRequestState:
@property
@deprecated("`mm_inputs` is superseded by `mm_kwargs` and will be "
"removed in v0.13. Please use `mm_kwargs` instead.")
def mm_inputs(self) -> list[MultiModalKwargs]:
return [MultiModalKwargs([item]) for item in self.mm_kwargs]
def mm_inputs(self) -> list[MultiModalKwargsItems]:
return [
MultiModalKwargsItems.from_seq([item]) for item in self.mm_kwargs
]

def get_token_id(self, idx: int) -> int:
if idx < self.num_prompt_tokens:

@ -2218,11 +2218,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
dummy_mm_data = dummy_decoder_data.multi_modal_data

# Result in the maximum GPU consumption of the model
dummy_mm_item = dummy_mm_data.get_item(modality=modality, item_index=0)
dummy_mm_item = dummy_mm_data[modality][0]
dummy_mm_items = [dummy_mm_item] * max_items_per_batch

return next(mm_kwargs_group
for _, _, mm_kwargs_group in group_mm_kwargs_by_modality(
[dummy_mm_item] * max_items_per_batch,
dummy_mm_items,
device=self.device,
pin_memory=self.pin_memory,
))

@ -1824,11 +1824,12 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
dummy_mm_data = dummy_decoder_data.multi_modal_data

# Result in the maximum GPU consumption of the model
dummy_mm_item = dummy_mm_data.get_item(modality=modality, item_index=0)
dummy_mm_item = dummy_mm_data[modality][0]
dummy_mm_items = [dummy_mm_item] * max_items_per_batch

return next(grouped_mm_kwargs
for _, _, grouped_mm_kwargs in group_mm_kwargs_by_modality(
[dummy_mm_item] * max_items_per_batch,
dummy_mm_items,
device=self.device,
pin_memory=self.pin_memory,
))