Mirror of https://git.datalinker.icu/vllm-project/vllm.git
[Misc] Rename MultiModalInputsV2 -> MultiModalInputs (#12244)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

parent 2fc6944c5e
commit 96912550c8
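This commit is a pure rename: the MultiModalInputsV2 TypedDict becomes MultiModalInputs, and every reference is updated in lockstep, from the Sphinx autoclass directive and the input type aliases through InputPreprocessor, the per-model multi-modal processors (BLIP-2, Chameleon, Fuyu, Mantis, Phi-3-Vision, Qwen2-Audio), and the BaseMultiModalProcessor and MultiModalProfiler plumbing. For downstream code the migration is a one-line import change; a minimal sketch (the compatibility alias below is my illustration, not something this commit ships):

```python
# After this commit, import the TypedDict under its new name.
from vllm.multimodal.inputs import MultiModalInputs

# Hypothetical local shim for code that still spells the old name;
# NOT provided by vLLM itself.
MultiModalInputsV2 = MultiModalInputs
```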
@@ -43,7 +43,7 @@
 ```
 
 ```{eval-rst}
-.. autoclass:: vllm.multimodal.inputs.MultiModalInputsV2
+.. autoclass:: vllm.multimodal.inputs.MultiModalInputs
     :members:
     :show-inheritance:
 ```
@@ -9,7 +9,7 @@ from typing_extensions import NotRequired, TypedDict, TypeVar, assert_never
 if TYPE_CHECKING:
     from vllm.multimodal import (MultiModalDataDict, MultiModalKwargs,
                                  MultiModalPlaceholderDict)
-    from vllm.multimodal.inputs import MultiModalInputsV2
+    from vllm.multimodal.inputs import MultiModalInputs
 
 
 class TextPrompt(TypedDict):
@@ -207,7 +207,7 @@ def token_inputs(
     return inputs
 
 
-DecoderOnlyInputs = Union[TokenInputs, "MultiModalInputsV2"]
+DecoderOnlyInputs = Union[TokenInputs, "MultiModalInputs"]
 """
 The inputs in :class:`~vllm.LLMEngine` before they are
 passed to the model executor.
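DecoderOnlyInputs remains a discriminated union; only the forward reference inside it changes. A hedged sketch of narrowing that union by its "type" key, in the style of the SingletonInputsAdapter hunk further down (the helper function is illustrative, and it assumes DecoderOnlyInputs is re-exported from vllm.inputs as in this version of vLLM):

```python
from typing_extensions import assert_never

from vllm.inputs import DecoderOnlyInputs  # Union[TokenInputs, "MultiModalInputs"]


def count_prompt_tokens(inputs: DecoderOnlyInputs) -> int:
    # Both union members carry a "type" discriminator and prompt_token_ids.
    if inputs["type"] == "token":
        return len(inputs["prompt_token_ids"])
    if inputs["type"] == "multimodal":
        return len(inputs["prompt_token_ids"])
    assert_never(inputs)  # type: ignore[arg-type]
```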
@@ -222,14 +222,14 @@ class EncoderDecoderInputs(TypedDict):
 
     This specifies the required data for encoder-decoder models.
     """
-    encoder: Union[TokenInputs, "MultiModalInputsV2"]
+    encoder: Union[TokenInputs, "MultiModalInputs"]
     """The inputs for the encoder portion."""
 
-    decoder: Union[TokenInputs, "MultiModalInputsV2"]
+    decoder: Union[TokenInputs, "MultiModalInputs"]
     """The inputs for the decoder portion."""
 
 
-SingletonInputs = Union[TokenInputs, "MultiModalInputsV2"]
+SingletonInputs = Union[TokenInputs, "MultiModalInputs"]
 """
 A processed :class:`SingletonPrompt` which can be passed to
 :class:`vllm.sequence.Sequence`.
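EncoderDecoderInputs now holds the renamed union on both sides. A small sketch of pairing token-only encoder and decoder inputs with the token_inputs() helper defined just above this hunk (assuming both names are re-exported from vllm.inputs, as in this version of vLLM):

```python
from vllm.inputs import EncoderDecoderInputs, token_inputs

# Token-only example; either side could equally be a MultiModalInputs dict.
enc_dec = EncoderDecoderInputs(
    encoder=token_inputs(prompt_token_ids=[101, 2023, 102]),
    decoder=token_inputs(prompt_token_ids=[0]),
)
```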
@@ -311,7 +311,7 @@ class SingletonInputsAdapter:
             return inputs.get("multi_modal_hashes", [])
 
         if inputs["type"] == "multimodal":
-            # only the case when we use MultiModalInputsV2
+            # only the case when we use MultiModalInputs
             return inputs.get("mm_hashes", [])  # type: ignore[return-value]
 
         assert_never(inputs)  # type: ignore[arg-type]
@@ -7,7 +7,7 @@ from vllm.config import ModelConfig
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
-from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputsV2
+from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputs
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
 
@@ -247,7 +247,7 @@ class InputPreprocessor:
         mm_data: MultiModalDataDict,
         mm_processor_kwargs: Optional[Mapping[str, object]],
         lora_request: Optional[LoRARequest],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         """
         Apply the model's multi-modal processor to a multi-modal prompt,
         returning the corresponding token IDs and metadata.
@@ -271,7 +271,7 @@
         mm_data: MultiModalDataDict,
         mm_processor_kwargs: Optional[Mapping[str, object]],
         lora_request: Optional[LoRARequest],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         """Async version of :meth:`_process_multimodal`."""
         tokenizer_group = self.get_tokenizer_group()
         tokenizer = await tokenizer_group.get_lora_tokenizer_async(lora_request
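Both preprocessor paths now advertise the renamed return type; judging from the hunks above, _process_multimodal_async differs from its sync counterpart only in awaiting the LoRA tokenizer lookup before applying the multi-modal processor.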
@@ -15,7 +15,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputsV2, MultiModalKwargs,
+                                    MultiModalInputs, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -490,7 +490,7 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
 
         # Only <image> tokens should be considered as placeholders,
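The same delegate-then-filter override recurs in the Chameleon, Fuyu, Phi-3-Vision, and Qwen2-Audio hunks below, each keeping only its own placeholder token (<image>, |SPEAKER|, <|image|>, <|AUDIO|>). A self-contained sketch of the shape, using a hypothetical stand-in base class rather than vLLM's:

```python
class Base:
    """Stand-in for BaseMultiModalProcessor (illustrative, not vLLM code)."""

    def apply(self, prompt, mm_data, hf_processor_mm_kwargs):
        # Pretend this runs the HF processor and computes placeholder ranges.
        return {"type": "multimodal", "prompt": prompt,
                "prompt_token_ids": [], "mm_placeholders": {}}


class ExampleProcessor(Base):
    def apply(self, prompt, mm_data, hf_processor_mm_kwargs):
        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
        # Post-filter result["mm_placeholders"] so that only the model's own
        # placeholder tokens (e.g. <image>) are counted, as the comment in
        # each model's hunk describes.
        return result
```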
@@ -29,7 +29,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputsV2, MultiModalKwargs,
+                                    MultiModalInputs, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -159,7 +159,7 @@ class ChameleonMultiModalProcessor(
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
 
         # Only <image> tokens should be considered as placeholders,
@@ -31,7 +31,7 @@ from vllm.model_executor.models.persimmon import PersimmonForCausalLM
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputsV2, MultiModalKwargs,
+                                    MultiModalInputs, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
 from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
                                    MultiModalDataItems)
@@ -232,7 +232,7 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
 
         # Only |SPEAKER| (image) tokens should be considered as placeholders,
@@ -24,7 +24,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputsV2, MultiModalKwargs,
+                                    MultiModalInputs, MultiModalKwargs,
                                     NestedTensors)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize, MultiModalDataItems)
@@ -746,7 +746,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index
 
@@ -805,7 +805,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
             for modality, placeholders in mm_placeholders.items()
         }
 
-        return MultiModalInputsV2(
+        return MultiModalInputs(
             type="multimodal",
             prompt=prompt,
             prompt_token_ids=prompt_ids,
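MantisMultiModalProcessor builds the renamed TypedDict directly, mirroring the return site in BaseMultiModalProcessor.apply (the -1169 hunk below). Since a TypedDict is a plain dict at runtime, the constructor-style call is just keyword-argument dict building; an abbreviated sketch using only the keys visible in this diff (the remaining required fields are elided, so this is illustrative rather than complete):

```python
from vllm.multimodal.inputs import MultiModalInputs

# Abbreviated: real call sites also pass fields such as the multi-modal
# kwargs and placeholder maps, which are truncated in this diff.
inputs = MultiModalInputs(
    type="multimodal",
    prompt="<image> Describe the picture.",
    prompt_token_ids=[1, 2, 3],
)
```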
@@ -31,7 +31,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputsV2, MultiModalKwargs,
+                                    MultiModalInputs, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize, MultiModalDataItems)
@@ -484,7 +484,7 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
 
         # Only <|image|> tokens should be considered as placeholders,
@@ -37,7 +37,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputsV2, MultiModalKwargs,
+                                    MultiModalInputs, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
 from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems,
                                    MultiModalDataParser)
@@ -245,7 +245,7 @@ class Qwen2AudioMultiModalProcessor(
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
 
         # Only <|AUDIO|> tokens should be considered as placeholders,
@@ -491,7 +491,7 @@ A dictionary containing placeholder ranges for each modality.
 """
 
 
-class MultiModalInputsV2(TypedDict):
+class MultiModalInputs(TypedDict):
     """
     Represents the outputs of
    :class:`vllm.multimodal.processing.BaseMultiModalProcessor`,
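Because MultiModalInputs is a TypedDict, the rename has no runtime footprint: instances are plain dicts, which is why the call sites in this diff discriminate on the "type" key rather than on the class itself.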
@@ -18,8 +18,8 @@ from vllm.utils import LRUCache, flatten_2d_lists, full_groupby
 
 from .hasher import MultiModalHasher
 from .inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                     MultiModalInputsV2, MultiModalKwargs,
-                     MultiModalKwargsItem, PlaceholderRange)
+                     MultiModalInputs, MultiModalKwargs, MultiModalKwargsItem,
+                     PlaceholderRange)
 from .parse import MultiModalDataItems, MultiModalDataParser
 
 if TYPE_CHECKING:
@@ -609,7 +609,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         prompt: str,
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         return self.apply(prompt, mm_data, hf_processor_mm_kwargs)
 
     def _get_data_parser(self) -> MultiModalDataParser:
@@ -1067,7 +1067,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         """
         Process multi-modal inputs to be used in vLLM.
 
@@ -1169,7 +1169,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
             for modality, placeholders in mm_placeholders.items()
         }
 
-        return MultiModalInputsV2(
+        return MultiModalInputs(
            type="multimodal",
            prompt=prompt,
            prompt_token_ids=prompt_ids,
@@ -11,7 +11,7 @@ import vllm.envs as envs
 from vllm.inputs import DummyData
 from vllm.logger import init_logger
 
-from .inputs import MultiModalDataDict, MultiModalInputsV2
+from .inputs import MultiModalDataDict, MultiModalInputs
 from .processing import BaseMultiModalProcessor, BaseProcessingInfo
 
 logger = init_logger(__name__)
@@ -131,7 +131,7 @@ class MultiModalProfiler(Generic[_I]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         factory = self.dummy_inputs
         processor_inputs = factory.get_dummy_processor_inputs(
             seq_len, mm_counts)