[0/N] Rename MultiModalInputs to MultiModalKwargs (#10040)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
  parent d7edca1dee
  commit e0191a95d8
@@ -53,7 +53,7 @@ Base Classes
 
 .. autodata:: vllm.multimodal.MultiModalDataDict
 
-.. autoclass:: vllm.multimodal.MultiModalInputs
+.. autoclass:: vllm.multimodal.MultiModalKwargs
     :members:
     :show-inheritance:
@@ -6,7 +6,7 @@ import torch
 from PIL.Image import Image
 
 from vllm.inputs import InputContext, token_inputs
-from vllm.multimodal.base import MultiModalInputs
+from vllm.multimodal.base import MultiModalKwargs
 from vllm.multimodal.utils import cached_get_tokenizer
 
 from .....conftest import IMAGE_ASSETS
@@ -96,7 +96,7 @@ def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
     mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data)
     # Ensure that we get the appropriately shaped pixel_values
     # for images and image embeddings, respectively.
-    assert isinstance(mapped_img_data, MultiModalInputs)
+    assert isinstance(mapped_img_data, MultiModalKwargs)
     assert "pixel_values" in mapped_img_data
     assert mapped_img_data["pixel_values"].shape == expected_shape
@@ -1,6 +1,6 @@
 import torch
 
-from vllm.multimodal.base import MultiModalInputs, NestedTensors
+from vllm.multimodal.base import MultiModalKwargs, NestedTensors
 
 
 def assert_nested_tensors_equal(expected: NestedTensors,
@@ -13,8 +13,8 @@ def assert_nested_tensors_equal(expected: NestedTensors,
             assert_nested_tensors_equal(expected_item, actual_item)
 
 
-def assert_multimodal_inputs_equal(expected: MultiModalInputs,
-                                   actual: MultiModalInputs):
+def assert_multimodal_inputs_equal(expected: MultiModalKwargs,
+                                   actual: MultiModalKwargs):
     assert set(expected.keys()) == set(actual.keys())
     for key in expected:
         assert_nested_tensors_equal(expected[key], actual[key])
@@ -22,7 +22,7 @@ def assert_multimodal_inputs_equal(expected: MultiModalInputs,
 
 def test_multimodal_input_batch_single_tensor():
     t = torch.rand([1, 2])
-    result = MultiModalInputs.batch([{"image": t}])
+    result = MultiModalKwargs.batch([{"image": t}])
     assert_multimodal_inputs_equal(result, {"image": t.unsqueeze(0)})
@@ -30,7 +30,7 @@ def test_multimodal_input_batch_multiple_tensors():
     a = torch.rand([1, 1, 2])
     b = torch.rand([1, 1, 2])
     c = torch.rand([1, 1, 2])
-    result = MultiModalInputs.batch([{"image": a}, {"image": b}, {"image": c}])
+    result = MultiModalKwargs.batch([{"image": a}, {"image": b}, {"image": c}])
     assert_multimodal_inputs_equal(result, {"image": torch.stack([a, b, c])})
@@ -38,7 +38,7 @@ def test_multimodal_input_batch_multiple_heterogeneous_tensors():
     a = torch.rand([1, 2, 2])
     b = torch.rand([1, 3, 2])
     c = torch.rand([1, 4, 2])
-    result = MultiModalInputs.batch([{"image": a}, {"image": b}, {"image": c}])
+    result = MultiModalKwargs.batch([{"image": a}, {"image": b}, {"image": c}])
     assert_multimodal_inputs_equal(result, {"image": [a, b, c]})
@@ -46,7 +46,7 @@ def test_multimodal_input_batch_nested_tensors():
     a = torch.rand([2, 3])
     b = torch.rand([2, 3])
     c = torch.rand([2, 3])
-    result = MultiModalInputs.batch([{
+    result = MultiModalKwargs.batch([{
         "image": [a]
     }, {
         "image": [b]
@@ -65,7 +65,7 @@ def test_multimodal_input_batch_heterogeneous_lists():
     a = torch.rand([1, 2, 3])
     b = torch.rand([1, 2, 3])
     c = torch.rand([1, 2, 3])
-    result = MultiModalInputs.batch([{"image": [a, b]}, {"image": [c]}])
+    result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c]}])
     assert_multimodal_inputs_equal(
         result,
         {"image": [torch.stack([a, b]), c.unsqueeze(0)]})
@@ -76,7 +76,7 @@ def test_multimodal_input_batch_multiple_batchable_lists():
     b = torch.rand([1, 2, 3])
     c = torch.rand([1, 2, 3])
     d = torch.rand([1, 2, 3])
-    result = MultiModalInputs.batch([{"image": [a, b]}, {"image": [c, d]}])
+    result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c, d]}])
     assert_multimodal_inputs_equal(
         result,
         {"image": torch.stack([torch.stack([a, b]),
@@ -88,8 +88,8 @@ def test_multimodal_input_batch_mixed_stacking_depths():
     b = torch.rand([1, 3, 3])
     c = torch.rand([1, 4, 3])
 
-    result = MultiModalInputs.batch([{"image": [a, b]}, {"image": [c]}])
+    result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c]}])
     assert_multimodal_inputs_equal(result, {"image": [[a, b], c.unsqueeze(0)]})
 
-    result = MultiModalInputs.batch([{"image": [a]}, {"image": [b, c]}])
+    result = MultiModalKwargs.batch([{"image": [a]}, {"image": [b, c]}])
     assert_multimodal_inputs_equal(result, {"image": [a.unsqueeze(0), [b, c]]})
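The tests above pin down the batching behaviour that is unchanged by the rename: values that share a shape across items are stacked into one tensor, while mismatched shapes fall back to a plain list. A minimal sketch of that contract, not part of this commit, using only MultiModalKwargs.batch as exercised above:

import torch

from vllm.multimodal import MultiModalKwargs

# Same shape in every item: the "image" entry is stacked into one tensor.
same = MultiModalKwargs.batch([{"image": torch.rand(1, 2, 2)},
                               {"image": torch.rand(1, 2, 2)}])
assert same["image"].shape == (2, 1, 2, 2)

# Mismatched shapes: batching falls back to a list of the original tensors.
mixed = MultiModalKwargs.batch([{"image": torch.rand(1, 2, 2)},
                                {"image": torch.rand(1, 3, 2)}])
assert isinstance(mixed["image"], list)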
@@ -30,7 +30,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.multimodal.base import MultiModalData
 from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
@@ -74,7 +74,7 @@ def mm_input_mapper_for_glmv(
         raise
     pixel_values = raw_batch_data['images']
 
-    return MultiModalInputs({'pixel_values': pixel_values})
+    return MultiModalKwargs({'pixel_values': pixel_values})
 
 
 def merge_glm_vision_embeddings(
@@ -34,7 +34,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.models.persimmon import PersimmonForCausalLM
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.base import MultiModalInputs
+from vllm.multimodal.base import MultiModalKwargs
 from vllm.multimodal.image import cached_get_image_processor
 from vllm.multimodal.utils import (cached_get_tokenizer,
                                    consecutive_placeholder_ranges)
@@ -218,7 +218,7 @@ def input_mapper_for_fuyu(ctx: InputContext, data: object):
         ])
 
     # image has been processed with prompt in input processor
-    return MultiModalInputs({"pixel_values": data})
+    return MultiModalKwargs({"pixel_values": data})
 
 
 @MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_fuyu)
@@ -16,7 +16,7 @@ from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
                          token_inputs)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.base import MultiModalInputs
+from vllm.multimodal.base import MultiModalKwargs
 from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.utils import is_list_of
 
@@ -324,12 +324,12 @@ class H2OVLInputPipeline(InternVLInputPipeline):
         data: object,
         *,
         max_dynamic_patch: Optional[int] = None,
-    ) -> MultiModalInputs:
+    ) -> MultiModalKwargs:
 
         # NOTE: Preprocessing for the image data is done in the
         # 'input_processor' function during actual inference.
         if isinstance(data, dict):
-            return MultiModalInputs(data)
+            return MultiModalKwargs(data)
 
         # The section below is only used with dummy data during
         # memory profiling.
@@ -347,7 +347,7 @@ class H2OVLInputPipeline(InternVLInputPipeline):
            pixel_values = [image_pixel_values_mapper(img) for img in data]
 
        else:
-            return MultiModalInputs({"image_embeds": data})
+            return MultiModalKwargs({"image_embeds": data})
        model_config = ctx.model_config
        tokenizer = cached_get_tokenizer(
            model_config.tokenizer,
@@ -359,7 +359,7 @@ class H2OVLInputPipeline(InternVLInputPipeline):
            return_tensors="pt",
        )[0]
 
-        return MultiModalInputs({
+        return MultiModalKwargs({
            "pixel_values": pixel_values,
            "image_token_id": image_token_id
        })
@@ -36,7 +36,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.multimodal.image import cached_get_image_processor
 from vllm.sequence import IntermediateTensors, SequenceData
 from vllm.transformers_utils.processor import cached_get_processor
@@ -127,7 +127,7 @@ def input_mapper_for_idefics3(
         logger.error("Failed to process image (%s)", data)
         raise
 
-    return MultiModalInputs(batch_data)
+    return MultiModalKwargs(batch_data)
 
 
 def _resize_output_size(height: int,
@@ -26,7 +26,7 @@ from vllm.model_executor.models.intern_vit import (InternVisionModel,
                                                    InternVisionPatchModel)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.base import MultiModalInputs
+from vllm.multimodal.base import MultiModalKwargs
 from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.sequence import IntermediateTensors
 from vllm.utils import is_list_of
@@ -346,7 +346,7 @@ class InternVLInputPipeline:
            # we can't stack here because images may have different num_patches
            data = [image_pixel_values_mapper(img) for img in data]
        else:
-            return MultiModalInputs({"image_embeds": data})
+            return MultiModalKwargs({"image_embeds": data})
        model_config = ctx.model_config
        tokenizer = cached_get_tokenizer(
            model_config.tokenizer,
@@ -355,7 +355,7 @@ class InternVLInputPipeline:
            add_special_tokens=False,
            return_tensors="pt")[0]
 
-        return MultiModalInputs({
+        return MultiModalKwargs({
            "pixel_values": data,
            "image_token_id": image_token_id
        })
@@ -52,7 +52,7 @@ from vllm.model_executor.models.qwen2 import Qwen2Model
 from vllm.model_executor.models.utils import LLMWrapper
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.base import MultiModalInputs
+from vllm.multimodal.base import MultiModalKwargs
 from vllm.multimodal.image import cached_get_image_processor
 from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.sequence import IntermediateTensors, SequenceData
@@ -374,7 +374,7 @@ def input_mapper_for_minicpmv(ctx: InputContext, data: object):
         batch_data["slice_start_id"] = data[0]["slice_start_id"]
         batch_data["slice_end_id"] = data[0]["slice_end_id"]
 
-    return MultiModalInputs(batch_data)
+    return MultiModalKwargs(batch_data)
 
 
 class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
@@ -1162,7 +1162,7 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal):
 
     def _parse_and_validate_image_input(self, **kwargs: object):
         # tensor with the same shape will be batched together by
-        # MultiModalInputs.batch, so pixel_values here can be:
+        # MultiModalKwargs.batch, so pixel_values here can be:
         #   - List[List[torch.Tensor]]:
         #       with shape (num_tiles, 3, image_res, image_res)
         #   - List[torch.Tensor]:
@@ -37,7 +37,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
                            SequenceData)
@@ -866,7 +866,7 @@ def image_input_mapper_for_molmo(
     ctx: InputContext,
     data: object,
 ):
-    return MultiModalInputs(data)
+    return MultiModalKwargs(data)
 
 
 def dummy_data_for_molmo(ctx: InputContext, seq_len: int,
@@ -30,7 +30,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.utils import merge_multimodal_embeddings
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.base import MultiModalInputs
+from vllm.multimodal.base import MultiModalKwargs
 from vllm.multimodal.utils import (cached_get_tokenizer,
                                    consecutive_placeholder_ranges)
 from vllm.sequence import IntermediateTensors, SequenceData
@@ -94,8 +94,8 @@ def dummy_data_for_pixtral(ctx: InputContext, seq_len: int,
 
 
 def input_mapper_for_pixtral(ctx: InputContext,
-                             data: object) -> MultiModalInputs:
-    """Maps the input data to its MultiModalInputs (if any).
+                             data: object) -> MultiModalKwargs:
+    """Maps the input data to its MultiModalKwargs (if any).
 
     Args:
         ctx: Context of the loaded model.
@@ -103,7 +103,7 @@ def input_mapper_for_pixtral(ctx: InputContext,
         to pixel_values in .forward() for a visual QWenLMHeadModel model.
 
     Returns:
-        MultiModalInputs containing the stacked normalized images tensor or
+        MultiModalKwargs containing the stacked normalized images tensor or
         image embeddings.
     """
     # Early exit if we have provided an image to a language only Qwen model
@@ -121,7 +121,7 @@ def input_mapper_for_pixtral(ctx: InputContext,
                                                    dtype=torch.float16)
         images.append(image)
 
-    return MultiModalInputs({"images": images})
+    return MultiModalKwargs({"images": images})
 
 
 def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs):
@@ -43,7 +43,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.base import MultiModalInputs
+from vllm.multimodal.base import MultiModalKwargs
 from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.sequence import IntermediateTensors, SequenceData
 from vllm.utils import is_list_of
@@ -722,8 +722,8 @@ def input_processor_for_qwen(ctx: InputContext,
                         multi_modal_data=multi_modal_data)
 
 
-def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs:
-    """Maps the input data to its MultiModalInputs (if any).
+def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalKwargs:
+    """Maps the input data to its MultiModalKwargs (if any).
 
     Args:
         ctx: Context of the loaded model.
@@ -731,7 +731,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs:
         to pixel_values in .forward() for a visual QWenLMHeadModel model.
 
     Returns:
-        MultiModalInputs containing the stacked normalized images tensor or
+        MultiModalKwargs containing the stacked normalized images tensor or
         image embeddings.
     """
     # Early exit if we have provided an image to a language only Qwen model
@@ -740,7 +740,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs:
         logger.warning(
            "Images were provided but this model has no visual config; "
            "multimodal inputs will not be forwarded to the model.")
-        return MultiModalInputs()
+        return MultiModalKwargs()
 
     model_config = ctx.model_config
     tokenizer = cached_get_tokenizer(
@@ -784,7 +784,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs:
         data = [data]
     transformed_images = [transform(datum) for datum in data]
     pixel_values = torch.stack(transformed_images, dim=0)
-    return MultiModalInputs({"pixel_values": pixel_values})
+    return MultiModalKwargs({"pixel_values": pixel_values})
 
 
 def build_normalization_transform(image_size: int) -> transforms.Compose:
@@ -42,7 +42,7 @@ from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.models.qwen2 import Qwen2Model
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.multimodal.utils import consecutive_placeholder_ranges
 from vllm.sequence import IntermediateTensors, SequenceData
 
@@ -221,13 +221,13 @@ def input_processor_for_qwen2_audio(
 def input_mapper_for_qwen2_audio(
     ctx: InputContext,
     multi_modal_data: Union[np.ndarray, List[np.ndarray]],
-) -> MultiModalInputs:
+) -> MultiModalKwargs:
     """Input mapper for Qwen2-Audio."""
     if not isinstance(multi_modal_data, list):
         multi_modal_data = [multi_modal_data]
 
     if len(multi_modal_data) == 0:
-        return MultiModalInputs()
+        return MultiModalKwargs()
 
     processor = cached_get_processor(ctx.model_config.model)
     audio_feature_extractor = processor.feature_extractor
@@ -254,7 +254,7 @@ def input_mapper_for_qwen2_audio(
         logger.error("Failed to process audio (%s)", multi_modal_data)
         raise
 
-    return MultiModalInputs(batch_data)
+    return MultiModalKwargs(batch_data)
 
 
 @INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen2_audio)
@@ -57,7 +57,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.qwen2 import Qwen2Model
 from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict,
-                             MultiModalInputs)
+                             MultiModalKwargs)
 from vllm.multimodal.base import MultiModalData
 from vllm.multimodal.image import cached_get_image_processor
 from vllm.multimodal.utils import cached_get_tokenizer
@@ -576,10 +576,10 @@ def mm_input_mapper_for_qwen2_vl(
     *,
     min_pixels: Optional[int] = None,
     max_pixels: Optional[int] = None,
-) -> MultiModalInputs:
+) -> MultiModalKwargs:
     """Input mapper for Qwen2-VL."""
     if data_type_key == "image" and isinstance(data, dict):
-        return MultiModalInputs({
+        return MultiModalKwargs({
             "image_embeds": data.get("image_embeds"),
             "image_grid_thw": data.get("image_grid_thw"),
         })
@@ -613,7 +613,7 @@ def mm_input_mapper_for_qwen2_vl(
         logger.error("Failed to process image (%s)", data)
         raise
 
-    return MultiModalInputs(batch_data)
+    return MultiModalKwargs(batch_data)
 
 
 image_input_mapper_for_qwen2_vl = partial(mm_input_mapper_for_qwen2_vl,
@@ -24,7 +24,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.model_loader.loader import DefaultModelLoader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalInputs,
+from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs,
                              NestedTensors)
 from vllm.multimodal.utils import (cached_get_tokenizer,
                                    consecutive_placeholder_ranges,
@@ -116,11 +116,11 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object):
         data = [data]
 
     if len(data) == 0:
-        return MultiModalInputs()
+        return MultiModalKwargs()
 
     # If the audio inputs are embeddings, no need for preprocessing
     if is_list_of(data, torch.Tensor, check="all"):
-        return MultiModalInputs({"audio_embeds": data})
+        return MultiModalKwargs({"audio_embeds": data})
 
     audio_features = []
     for audio_input in data:
@@ -154,7 +154,7 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object):
         # Remove the batch dimension because we're wrapping it in a list.
         audio_features.append(single_audio_features.squeeze(0))
 
-    return MultiModalInputs({"audio_features": audio_features})
+    return MultiModalKwargs({"audio_features": audio_features})
 
 
 def input_processor_for_ultravox(ctx: InputContext, inputs: DecoderOnlyInputs):
@@ -1,5 +1,5 @@
 from .base import (BatchedTensorInputs, MultiModalDataBuiltins,
-                   MultiModalDataDict, MultiModalInputs,
+                   MultiModalDataDict, MultiModalKwargs,
                    MultiModalPlaceholderDict, MultiModalPlaceholderMap,
                    MultiModalPlugin, NestedTensors)
 from .registry import MultiModalRegistry
@@ -17,7 +17,7 @@ __all__ = [
     "BatchedTensorInputs",
     "MultiModalDataBuiltins",
     "MultiModalDataDict",
-    "MultiModalInputs",
+    "MultiModalKwargs",
     "MultiModalPlaceholderDict",
     "MultiModalPlaceholderMap",
     "MultiModalPlugin",
@@ -25,3 +25,18 @@ __all__ = [
     "MULTIMODAL_REGISTRY",
     "MultiModalRegistry",
 ]
+
+
+def __getattr__(name: str):
+    import warnings
+
+    if name == "MultiModalInputs":
+        msg = ("MultiModalInputs has been renamed to MultiModalKwargs. "
+               "The original name will take another meaning in an upcoming "
+               "version.")
+
+        warnings.warn(DeprecationWarning(msg), stacklevel=2)
+
+        return MultiModalKwargs
+
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
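The module-level __getattr__ added above keeps the old spelling importable for now: looking up MultiModalInputs on vllm.multimodal emits a DeprecationWarning and hands back MultiModalKwargs. A small illustrative sketch, not part of the commit, of what callers can expect:

import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # The old name is resolved through the __getattr__ shim above.
    from vllm.multimodal import MultiModalInputs, MultiModalKwargs

# The legacy name is just an alias for the new class, plus a warning.
assert MultiModalInputs is MultiModalKwargs
assert any(issubclass(w.category, DeprecationWarning) for w in caught)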
@@ -1,5 +1,5 @@
 from vllm.inputs.registry import InputContext
-from vllm.multimodal.base import MultiModalInputs, MultiModalPlugin
+from vllm.multimodal.base import MultiModalKwargs, MultiModalPlugin
 
 
 class AudioPlugin(MultiModalPlugin):
@@ -9,7 +9,7 @@ class AudioPlugin(MultiModalPlugin):
         return "audio"
 
     def _default_input_mapper(self, ctx: InputContext, data: object,
-                              **mm_processor_kwargs) -> MultiModalInputs:
+                              **mm_processor_kwargs) -> MultiModalKwargs:
         raise NotImplementedError("There is no default audio input mapper")
 
     def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
@@ -30,15 +30,15 @@ Uses a list instead of a tensor if the dimensions of each element do not match.
 BatchedTensorInputs: TypeAlias = Dict[str, NestedTensors]
 """
 A dictionary containing nested tensors which have been batched via
-:meth:`MultiModalInputs.batch`.
+:meth:`MultiModalKwargs.batch`.
 """
 
 
-class _MultiModalInputsBase(UserDict[str, NestedTensors]):
+class _MultiModalKwargsBase(UserDict[str, NestedTensors]):
     pass
 
 
-class MultiModalInputs(_MultiModalInputsBase):
+class MultiModalKwargs(_MultiModalKwargsBase):
     """
     A dictionary that represents the keyword arguments to
     :meth:`~torch.nn.Module.forward`.
@@ -58,7 +58,7 @@ class MultiModalInputs(_MultiModalInputsBase):
         if isinstance(nested_tensors, (int, float)):
             return torch.tensor(nested_tensors)
 
-        stacked = [MultiModalInputs._try_stack(t) for t in nested_tensors]
+        stacked = [MultiModalKwargs._try_stack(t) for t in nested_tensors]
         if not is_list_of(stacked, torch.Tensor, check="all"):
             # Only tensors (not lists) can be stacked.
             return stacked
@@ -71,7 +71,7 @@ class MultiModalInputs(_MultiModalInputsBase):
         return torch.stack(tensors_)
 
     @staticmethod
-    def batch(inputs_list: List["MultiModalInputs"]) -> BatchedTensorInputs:
+    def batch(inputs_list: List["MultiModalKwargs"]) -> BatchedTensorInputs:
         """
         Batch multiple inputs together into a dictionary.
 
@@ -95,7 +95,7 @@ class MultiModalInputs(_MultiModalInputsBase):
                 item_lists[k].append(v)
 
         return {
-            k: MultiModalInputs._try_stack(item_list)
+            k: MultiModalKwargs._try_stack(item_list)
             for k, item_list in item_lists.items()
         }
 
@@ -177,7 +177,7 @@ A dictionary containing placeholder ranges.
 """
 
 MultiModalInputMapper = Callable[[InputContext, MultiModalData[object]],
-                                 MultiModalInputs]
+                                 MultiModalKwargs]
 """
 Return a dictionary to be passed as keyword arguments to
 :meth:`~torch.nn.Module.forward`. This is similar in concept to tokenizers
@@ -226,7 +226,7 @@ class MultiModalPlugin(ABC):
         ctx: InputContext,
         data: MultiModalData[object],
         **mm_processor_kwargs,
-    ) -> MultiModalInputs:
+    ) -> MultiModalKwargs:
         """
         Return a dictionary to be passed as keyword arguments to
         :meth:`~torch.nn.Module.forward`. This is similar in concept to
@@ -275,7 +275,7 @@ class MultiModalPlugin(ABC):
         model_config: "ModelConfig",
         data: MultiModalData[object],
         mm_processor_kwargs: Dict[str, Any],
-    ) -> MultiModalInputs:
+    ) -> MultiModalKwargs:
         """
         Transform the data into a dictionary of model inputs using the
         input mapper registered for that model.
@@ -585,3 +585,18 @@ class MultiModalPlaceholderMap:
 
         return MultiModalPlaceholderMap.IndexMap(src=src_indices,
                                                  dest=dest_indices)
+
+
+def __getattr__(name: str):
+    import warnings
+
+    if name == "MultiModalInputs":
+        msg = ("MultiModalInputs has been renamed to MultiModalKwargs. "
+               "The original name will take another meaning in an upcoming "
+               "version.")
+
+        warnings.warn(DeprecationWarning(msg), stacklevel=2)
+
+        return MultiModalKwargs
+
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
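Because MultiModalKwargs is still a thin UserDict[str, NestedTensors], the per-model input mappers earlier in the diff keep returning the same dictionary of tensors, only under the new name, and MultiModalInputMapper now advertises that return type. A minimal sketch under that assumption (the toy_input_mapper name and tensor shape are hypothetical):

import torch

from vllm.multimodal import MultiModalKwargs


def toy_input_mapper(pixel_values: torch.Tensor) -> MultiModalKwargs:
    # The mapper output is a plain mapping from argument name to tensor,
    # later batched with MultiModalKwargs.batch and fed to forward().
    return MultiModalKwargs({"pixel_values": pixel_values})


mm = toy_input_mapper(torch.rand(1, 3, 224, 224))
assert "pixel_values" in mm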
@@ -10,7 +10,7 @@ from vllm.logger import init_logger
 from vllm.transformers_utils.processor import get_image_processor
 from vllm.utils import is_list_of
 
-from .base import MultiModalData, MultiModalInputs, MultiModalPlugin
+from .base import MultiModalData, MultiModalKwargs, MultiModalPlugin
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig
@@ -43,12 +43,12 @@ class ImagePlugin(MultiModalPlugin):
         ctx: InputContext,
         data: MultiModalData[object],
         **mm_processor_kwargs,
-    ) -> MultiModalInputs:
+    ) -> MultiModalKwargs:
         model_config = ctx.model_config
 
         # Processed by input processor
         if isinstance(data, BatchFeature):
-            return MultiModalInputs(data.data)
+            return MultiModalKwargs(data.data)
 
         # PIL image
         if isinstance(data, Image.Image) or is_list_of(data, Image.Image):
@@ -78,11 +78,11 @@ class ImagePlugin(MultiModalPlugin):
                              type(image_processor).__name__)
                 raise
 
-            return MultiModalInputs(batch_data)
+            return MultiModalKwargs(batch_data)
 
         # Image embedding
         elif isinstance(data, torch.Tensor) or is_list_of(data, torch.Tensor):
-            return MultiModalInputs({"image_embeds": data})
+            return MultiModalKwargs({"image_embeds": data})
 
         raise TypeError(f"Invalid image type: {type(data)}")
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional, Sequence
 from vllm.logger import init_logger
 
 from .audio import AudioPlugin
-from .base import (MultiModalDataDict, MultiModalInputMapper, MultiModalInputs,
+from .base import (MultiModalDataDict, MultiModalInputMapper, MultiModalKwargs,
                    MultiModalPlugin, MultiModalTokensCalc, NestedTensors)
 from .image import ImagePlugin
 from .video import VideoPlugin
@@ -103,7 +103,7 @@ class MultiModalRegistry:
         model_config: "ModelConfig",
         data: MultiModalDataDict,
         mm_processor_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> MultiModalInputs:
+    ) -> MultiModalKwargs:
         """
         Apply an input mapper to the data passed to the model.
 
@@ -139,7 +139,7 @@ class MultiModalRegistry:
 
             merged_dict[input_key] = input_tensor
 
-        return MultiModalInputs(merged_dict)
+        return MultiModalKwargs(merged_dict)
 
     def create_input_mapper(self, model_config: "ModelConfig"):
         """
@@ -9,7 +9,7 @@ from vllm.transformers_utils.processor import get_video_processor
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.utils import is_list_of
 
-from .base import MultiModalData, MultiModalInputs
+from .base import MultiModalData, MultiModalKwargs
 from .image import ImagePlugin
 
 if TYPE_CHECKING:
@@ -55,7 +55,7 @@ class VideoPlugin(ImagePlugin):
         ctx: InputContext,
         data: MultiModalData[object],
         **mm_processor_kwargs,
-    ) -> MultiModalInputs:
+    ) -> MultiModalKwargs:
         model_config = ctx.model_config
 
         if isinstance(data, list) and len(data) == 1:
@@ -79,7 +79,7 @@ class VideoPlugin(ImagePlugin):
                 logger.error("Failed to process video (%s)", data)
                 raise
 
-            return MultiModalInputs(batch_data)
+            return MultiModalKwargs(batch_data)
 
         raise TypeError(f"Invalid video type: {type(data)}")
@@ -18,7 +18,7 @@ except (ModuleNotFoundError, ImportError) as err:
         "CUDA and ROCm flash attention backend.") from err
 
 from vllm.logger import init_logger
-from vllm.multimodal import MultiModalInputs
+from vllm.multimodal import MultiModalKwargs
 from vllm.sequence import ExecuteModelRequest, IntermediateTensors
 from vllm.worker.model_runner import (ModelInputForGPUWithSamplingMetadata,
                                       ModelRunner)
@@ -280,7 +280,7 @@ class TP1DraftModelRunner(ModelRunner):
                 kv_caches=kv_caches,
                 attn_metadata=model_input.attn_metadata,
                 intermediate_tensors=intermediate_tensors,
-                **MultiModalInputs.as_kwargs(multi_modal_kwargs,
+                **MultiModalKwargs.as_kwargs(multi_modal_kwargs,
                                              device=self.device),
                 **kwargs,
             )
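The worker-side hunks above and below are mechanical: every call site that splatted MultiModalInputs.as_kwargs(...) into the model's forward() now spells the class MultiModalKwargs, with the call shape unchanged. A hedged sketch of the pattern (the model, input_ids, positions, and batched_mm_inputs names are hypothetical; only the as_kwargs call mirrors the diff):

from vllm.multimodal import MultiModalKwargs


def run_forward(model, input_ids, positions, batched_mm_inputs, device):
    # Move the batched multimodal tensors to the target device and pass
    # them to forward() as keyword arguments, as the runner hunks do.
    return model(
        input_ids=input_ids,
        positions=positions,
        **MultiModalKwargs.as_kwargs(batched_mm_inputs, device=device),
    )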
@@ -5,7 +5,7 @@ import torch
 
 from vllm.attention import AttentionMetadata
 from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.multimodal import MultiModalInputs
+from vllm.multimodal import MultiModalKwargs
 from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
 from vllm.utils import make_tensor_with_pad
 from vllm.worker.cpu_model_runner import (CPUModelRunner,
@@ -287,7 +287,7 @@ class CPUEncoderDecoderModelRunner(CPUModelRunner):
             kv_caches,
             "attn_metadata":
             model_input.attn_metadata,
-            **MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {},
+            **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {},
                                          device=self.device),
             "intermediate_tensors":
             intermediate_tensors,
@@ -15,7 +15,7 @@ from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.model_loader import get_model
 from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
-                             MultiModalInputs, MultiModalPlaceholderMap)
+                             MultiModalKwargs, MultiModalPlaceholderMap)
 from vllm.sequence import (IntermediateTensors, SequenceData,
                            SequenceGroupMetadata)
 from vllm.utils import make_tensor_with_pad
@@ -200,7 +200,7 @@ class ModelInputForCPUBuilder(ModelRunnerInputBuilderBase[ModelInputForCPU]):
 
         slot_mapping: List[int] = []
         seq_lens: List[int] = []
-        multi_modal_inputs_list: List[MultiModalInputs] = []
+        multi_model_kwargs_list: List[MultiModalKwargs] = []
         multi_modal_placeholder_maps: Dict[
             str,
             MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
@@ -225,7 +225,7 @@ class ModelInputForCPUBuilder(ModelRunnerInputBuilderBase[ModelInputForCPU]):
                     ._compute_multi_modal_input(
                         seq_group_metadata, seq_data, computed_len,
                         seq_group_metadata.mm_processor_kwargs)
-                multi_modal_inputs_list.append(mm_kwargs)
+                multi_model_kwargs_list.append(mm_kwargs)
                 for modality, placeholder_map in placeholder_maps.items():
                     multi_modal_placeholder_maps[modality].extend(
                         placeholder_map)
@@ -297,7 +297,7 @@ class ModelInputForCPUBuilder(ModelRunnerInputBuilderBase[ModelInputForCPU]):
             multi_modal_placeholder_index_maps=placeholder_index_maps,
         )
 
-        multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list)
+        multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list)
 
         return (input_tokens, input_positions, attn_metadata, seq_lens,
                 multi_modal_kwargs)
@@ -520,7 +520,7 @@ class CPUModelRunner(ModelRunnerBase[ModelInputForCPU]):
             kv_caches,
             "attn_metadata":
             model_input.attn_metadata,
-            **MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {},
+            **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {},
                                          device=self.device),
             "intermediate_tensors":
             intermediate_tensors,
@@ -8,7 +8,7 @@ from vllm.distributed import get_pp_group
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.pooling_metadata import PoolingMetadata
-from vllm.multimodal import MultiModalInputs
+from vllm.multimodal import MultiModalKwargs
 from vllm.pooling_params import PoolingParams
 from vllm.sequence import (IntermediateTensors, PoolerOutput, SequenceData,
                            SequenceGroupMetadata)
@@ -104,7 +104,7 @@ class EmbeddingModelRunner(
             kv_caches=kv_caches,
             attn_metadata=model_input.attn_metadata,
             intermediate_tensors=intermediate_tensors,
-            **MultiModalInputs.as_kwargs(multi_modal_kwargs,
+            **MultiModalKwargs.as_kwargs(multi_modal_kwargs,
                                          device=self.device))
 
         if (self.observability_config is not None
@@ -18,7 +18,7 @@ from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.model_loader.utils import get_architecture_class_name
-from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalInputs,
+from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs,
                              MultiModalRegistry)
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import (IntermediateTensors, PoolerOutput,
@@ -206,7 +206,7 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
             kv_caches=kv_caches,
             attn_metadata=model_input.attn_metadata,
             intermediate_tensors=intermediate_tensors,
-            **MultiModalInputs.as_kwargs(multi_modal_kwargs,
+            **MultiModalKwargs.as_kwargs(multi_modal_kwargs,
                                          device=self.device),
             **seqlen_agnostic_kwargs)
@@ -36,7 +36,7 @@ from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.model_loader import get_model
 from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
-                             MultiModalInputs)
+                             MultiModalKwargs)
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import (IntermediateTensors, SequenceData,
                            SequenceGroupMetadata)
@@ -716,7 +716,7 @@ class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]):
         context_lens: List[int] = []
         query_lens: List[int] = []
         prefix_block_tables: List[List[int]] = []
-        multi_modal_inputs_list: List[MultiModalInputs] = []
+        multi_model_kwargs_list: List[MultiModalKwargs] = []
 
         if len(seq_group_metadata_list) == 0:
             return PreparePromptMetadata.empty()
@@ -777,7 +777,7 @@ class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]):
             mm_data = seq_group_metadata.multi_modal_data
             if mm_data:
                 mm_kwargs = self.multi_modal_input_mapper(mm_data)
-                multi_modal_inputs_list.append(mm_kwargs)
+                multi_model_kwargs_list.append(mm_kwargs)
 
             if seq_group_metadata.block_tables is None:
                 # During memory profiling, the block tables are not initialized
@@ -876,7 +876,7 @@ class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]):
             multi_modal_placeholder_index_maps=
             None  # FIXME(kzawora): mutli-modality will not work here
         )
-        multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list)
+        multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list)
 
         return PreparePromptMetadata(input_tokens=input_tokens,
                                      input_positions=input_positions,
@@ -38,7 +38,7 @@ from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
 from vllm.model_executor.models import supports_lora, supports_multimodal
 from vllm.model_executor.models.utils import set_cpu_offload_max_bytes
 from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
-                             MultiModalInputs, MultiModalPlaceholderMap,
+                             MultiModalKwargs, MultiModalPlaceholderMap,
                              MultiModalRegistry)
 from vllm.platforms import current_platform
 from vllm.prompt_adapter.layers import PromptAdapterMapping
@@ -252,7 +252,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
 
         # Multi-modal inputs.
-        multi_modal_inputs: Optional[MultiModalInputs] = None,
+        multi_model_kwargs: Optional[MultiModalKwargs] = None,
         multi_modal_placeholder_maps: Optional[Dict[
             str, MultiModalPlaceholderMap]] = None,
 
@@ -373,7 +373,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
             prompt_adapter_prompt_mapping or [])
 
         self.prompt_adapter_request = prompt_adapter_request
-        self.multi_modal_inputs = multi_modal_inputs
+        self.multi_model_kwargs = multi_model_kwargs
         self.multi_modal_placeholder_maps = multi_modal_placeholder_maps
         self.prefix_cache_hit = prefix_cache_hit
 
@@ -661,7 +661,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
         mm_kwargs = self.multi_modal_input_mapper(
             mm_data,
             mm_processor_kwargs=seq_group_metadata.mm_processor_kwargs)
-        inter_data.multi_modal_inputs = mm_kwargs
+        inter_data.multi_model_kwargs = mm_kwargs
         inter_data.multi_modal_placeholder_maps = placeholder_maps
 
         # special processing for mrope position deltas.
@@ -935,11 +935,11 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
         )
 
         # Multi-modal data.
-        multi_modal_inputs_list = [
-            data.multi_modal_inputs for data in self.inter_data_list
-            if data.multi_modal_inputs is not None
+        multi_model_kwargs_list = [
+            data.multi_model_kwargs for data in self.inter_data_list
+            if data.multi_model_kwargs is not None
         ]
-        multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list)
+        multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list)
 
         return self.model_input_cls(
             input_tokens=input_tokens_tensor,
@@ -1649,7 +1649,7 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
             kv_caches=kv_caches,
             attn_metadata=model_input.attn_metadata,
             intermediate_tensors=intermediate_tensors,
-            **MultiModalInputs.as_kwargs(multi_modal_kwargs,
+            **MultiModalKwargs.as_kwargs(multi_modal_kwargs,
                                          device=self.device),
             **seqlen_agnostic_kwargs)
@@ -13,7 +13,7 @@ from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.model_loader.neuron import get_neuron_model
 from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
-                             MultiModalInputs)
+                             MultiModalKwargs)
 from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
 from vllm.utils import is_pin_memory_available, make_tensor_with_pad
 from vllm.worker.model_runner_base import ModelRunnerBase, ModelRunnerInputBase
@@ -122,7 +122,7 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
         input_block_ids: List[int] = []
 
         seq_lens: List[int] = []
-        multi_modal_inputs_list: List[MultiModalInputs] = []
+        multi_model_kwargs_list: List[MultiModalKwargs] = []
         for seq_group_metadata in seq_group_metadata_list:
             assert seq_group_metadata.is_prompt
             seq_ids = list(seq_group_metadata.seq_data.keys())
@@ -149,7 +149,7 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
                    mm_data,
                    mm_processor_kwargs=seq_group_metadata.mm_processor_kwargs,
                )
-                multi_modal_inputs_list.append(mm_kwargs)
+                multi_model_kwargs_list.append(mm_kwargs)
 
         max_seq_len = max(seq_lens)
         assert max_seq_len > 0
@@ -167,7 +167,7 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
                                            dtype=torch.long,
                                            device=self.device)
 
-        multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list)
+        multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list)
 
         return (input_tokens, input_positions, input_block_ids, seq_lens,
                 multi_modal_kwargs)
@@ -314,7 +314,7 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
             input_ids=model_input.input_tokens,
             positions=model_input.input_positions,
             input_block_ids=model_input.input_block_ids,
-            **MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {},
+            **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {},
                                          device=self.device),
         )
@@ -13,7 +13,7 @@ from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.model_loader.openvino import get_model
 from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
-                             MultiModalInputs, MultiModalPlaceholderMap)
+                             MultiModalKwargs, MultiModalPlaceholderMap)
 from vllm.sequence import SequenceGroupMetadata
 from vllm.worker.model_runner_base import ModelRunnerBase
 
@@ -102,7 +102,7 @@ class OpenVINOModelRunner(ModelRunnerBase):
         seq_lens: List[int] = []
         past_lens: List[int] = []
         query_lens: List[int] = []
-        multi_modal_inputs_list: List[MultiModalInputs] = []
+        multi_model_kwargs_list: List[MultiModalKwargs] = []
         multi_modal_placeholder_maps: Dict[
             str,
             MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
@@ -226,7 +226,7 @@ class OpenVINOModelRunner(ModelRunnerBase):
                        mm_data,
                        mm_processor_kwargs=seq_group_metadata.
                        mm_processor_kwargs)
-                    multi_modal_inputs_list.append(mm_kwargs)
+                    multi_model_kwargs_list.append(mm_kwargs)
 
                    for modality, placeholder_map in placeholder_maps.items():
                        multi_modal_placeholder_maps[modality].extend(
@@ -275,7 +275,7 @@ class OpenVINOModelRunner(ModelRunnerBase):
             multi_modal_placeholder_index_maps=placeholder_index_maps,
         )
 
-        multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list)
+        multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list)
 
         return ModelInput(
             input_tokens,
@@ -341,7 +341,7 @@ class OpenVINOModelRunner(ModelRunnerBase):
             kv_caches,
             "attn_metadata":
             attn_metadata,
-            **MultiModalInputs.as_kwargs(multi_modal_kwargs or {},
+            **MultiModalKwargs.as_kwargs(multi_modal_kwargs or {},
                                          device=self.device),
         }
@@ -18,7 +18,7 @@ from vllm.model_executor import SamplingMetadataCache
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.model_loader import get_model
 from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
-                             MultiModalInputs, MultiModalPlaceholderMap,
+                             MultiModalKwargs, MultiModalPlaceholderMap,
                              MultiModalRegistry)
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
@@ -160,7 +160,7 @@ class ModelInputForXPUBuilder(ModelRunnerInputBuilderBase[ModelInputForXPU]):
         input_positions: List[int] = []
         slot_mapping: List[int] = []
         seq_lens: List[int] = []
-        multi_modal_inputs_list: List[MultiModalInputs] = []
+        multi_model_kwargs_list: List[MultiModalKwargs] = []
         multi_modal_placeholder_maps: Dict[
             str,
             MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
@@ -192,7 +192,7 @@ class ModelInputForXPUBuilder(ModelRunnerInputBuilderBase[ModelInputForXPU]):
                     .from_seq_group(seq_group_metadata, positions_range)
 
                 mm_kwargs = self.runner.multi_modal_input_mapper(mm_data)
-                multi_modal_inputs_list.append(mm_kwargs)
+                multi_model_kwargs_list.append(mm_kwargs)
 
                 for modality, placeholder_map in placeholder_maps.items():
                     multi_modal_placeholder_maps[modality].extend(
@@ -264,7 +264,7 @@ class ModelInputForXPUBuilder(ModelRunnerInputBuilderBase[ModelInputForXPU]):
             block_tables=torch.tensor([], device=self.device, dtype=torch.int),
         )
 
-        multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list)
+        multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list)
 
         return (input_tokens, input_positions, attn_metadata, seq_lens,
                 multi_modal_kwargs)
@@ -565,7 +565,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
             kv_caches=kv_caches,
             attn_metadata=model_input.attn_metadata,
             intermediate_tensors=intermediate_tensors,
-            **MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {},
+            **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {},
                                          device=self.device))
         # Compute the logits in the last pipeline stage.
         if not get_pp_group().is_last_rank: