diff --git a/tests/v1/engine/test_processor_multi_modal_uuids.py b/tests/v1/engine/test_processor_multi_modal_uuids.py index 970a59eca8ec..955c74d262a0 100644 --- a/tests/v1/engine/test_processor_multi_modal_uuids.py +++ b/tests/v1/engine/test_processor_multi_modal_uuids.py @@ -152,8 +152,8 @@ def test_multi_modal_uuids_accepts_none_and_passes_through( *, tokenization_kwargs=None, lora_request=None, - mm_hash_overrides=None): - captured["mm_hash_overrides"] = mm_hash_overrides + mm_uuids=None): + captured["mm_uuids"] = mm_uuids # Minimal processed inputs for decoder-only flow return {"type": "token", "prompt_token_ids": [1]} @@ -180,7 +180,7 @@ def test_multi_modal_uuids_accepts_none_and_passes_through( params=SamplingParams(), ) - assert captured["mm_hash_overrides"] == mm_uuids + assert captured["mm_uuids"] == mm_uuids def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch): @@ -196,8 +196,8 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch): *, tokenization_kwargs=None, lora_request=None, - mm_hash_overrides=None): - captured["mm_hash_overrides"] = mm_hash_overrides + mm_uuids=None): + captured["mm_uuids"] = mm_uuids return {"type": "token", "prompt_token_ids": [1]} monkeypatch.setattr(processor.input_preprocessor, @@ -223,7 +223,7 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch): ) # Expect request-id-based overrides are passed through - assert captured["mm_hash_overrides"] == { + assert captured["mm_uuids"] == { "image": [f"{request_id}-image-0", f"{request_id}-image-1"], "video": [f"{request_id}-video-0"], } diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index ec82be831e0d..22287aa6f41e 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -258,8 +258,7 @@ class InputPreprocessor: tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalInputs: """ Apply the model's multi-modal processor to a multi-modal prompt, @@ -281,7 +280,7 @@ class InputPreprocessor: mm_data, hf_processor_mm_kwargs=mm_processor_kwargs, tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) mm_hashes = mm_input["mm_hashes"] @@ -302,8 +301,7 @@ class InputPreprocessor: tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalInputs: """ Async version of @@ -325,7 +323,7 @@ class InputPreprocessor: mm_data, hf_processor_mm_kwargs=mm_processor_kwargs, tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) mm_hashes = mm_input["mm_hashes"] @@ -390,8 +388,7 @@ class InputPreprocessor: tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> Union[TokenInputs, MultiModalInputs]: prompt_token_ids = self._truncate_inputs( parsed_content["prompt_token_ids"], tokenization_kwargs) @@ -404,7 +401,7 @@ class InputPreprocessor: parsed_content.get("mm_processor_kwargs"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - 
mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) else: inputs = token_inputs(prompt_token_ids=prompt_token_ids) @@ -420,8 +417,7 @@ class InputPreprocessor: tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> Union[TokenInputs, MultiModalInputs]: prompt_token_ids = self._truncate_inputs( parsed_content["prompt_token_ids"], tokenization_kwargs) @@ -434,7 +430,7 @@ class InputPreprocessor: parsed_content.get("mm_processor_kwargs"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) else: inputs = token_inputs(prompt_token_ids=prompt_token_ids, ) @@ -450,8 +446,7 @@ class InputPreprocessor: tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> Union[TokenInputs, MultiModalInputs]: prompt_text = parsed_content["prompt"] @@ -463,7 +458,7 @@ class InputPreprocessor: parsed_content.get("mm_processor_kwargs"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) else: prompt_token_ids = self._tokenize_prompt( @@ -487,8 +482,7 @@ class InputPreprocessor: tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> Union[TokenInputs, MultiModalInputs]: prompt_text = parsed_content["prompt"] @@ -500,7 +494,7 @@ class InputPreprocessor: parsed_content.get("mm_processor_kwargs"), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) else: prompt_token_ids = await self._tokenize_prompt_async( @@ -524,8 +518,7 @@ class InputPreprocessor: tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> SingletonInputs: """ Extract the singleton inputs from a prompt. 
@@ -547,21 +540,21 @@ class InputPreprocessor: return self._process_tokens( parsed["content"], lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) if parsed["type"] == "text": return self._process_text( parsed["content"], tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) if parsed["type"] == "str": return self._process_text( TextPrompt(prompt=parsed["content"]), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) assert_never(parsed) @@ -572,8 +565,7 @@ class InputPreprocessor: tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> SingletonInputs: """ Async version of @@ -587,21 +579,21 @@ class InputPreprocessor: return await self._process_tokens_async( parsed["content"], lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) if parsed["type"] == "text": return await self._process_text_async( parsed["content"], tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) if parsed["type"] == "str": return await self._process_text_async( TextPrompt(prompt=parsed["content"]), tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) assert_never(parsed) @@ -712,8 +704,7 @@ class InputPreprocessor: prompt: PromptType, tokenization_kwargs: Optional[dict[str, Any]] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> EncoderDecoderInputs: """ For encoder/decoder models only: @@ -755,7 +746,7 @@ class InputPreprocessor: encoder_inputs = self._prompt_to_llm_inputs( prompt["encoder_prompt"], tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) if (decoder_input := prompt["decoder_prompt"]) is None: decoder_inputs = None @@ -771,7 +762,7 @@ class InputPreprocessor: inputs = self._prompt_to_llm_inputs( prompt, tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) if self.model_config.is_multimodal_model: # Encoder-Decoder Multimodal model @@ -788,8 +779,7 @@ class InputPreprocessor: prompt: PromptType, tokenization_kwargs: Optional[dict[str, Any]] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> EncoderDecoderInputs: """ Async version of @@ -802,7 +792,7 @@ class InputPreprocessor: encoder_task = self._prompt_to_llm_inputs_async( prompt["encoder_prompt"], tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) if (decoder_input := prompt["decoder_prompt"]) is None: @@ -812,7 +802,7 @@ class InputPreprocessor: decoder_task = self._prompt_to_llm_inputs_async( decoder_input, tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) encoder_inputs, decoder_inputs = await asyncio.gather( @@ -828,7 +818,7 @@ class InputPreprocessor: inputs = await self._prompt_to_llm_inputs_async( prompt, tokenization_kwargs=tokenization_kwargs, - 
mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) if self.model_config.is_multimodal_model: # Encoder-Decoder Multimodal model @@ -856,8 +846,7 @@ class InputPreprocessor: tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> DecoderOnlyInputs: """ For decoder-only models: @@ -878,7 +867,7 @@ class InputPreprocessor: prompt, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) return self._build_decoder_only_llm_inputs(prompt_comps) @@ -889,8 +878,7 @@ class InputPreprocessor: tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> DecoderOnlyInputs: """ Async version of @@ -900,7 +888,7 @@ class InputPreprocessor: prompt, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) return self._build_decoder_only_llm_inputs(prompt_comps) @@ -911,8 +899,7 @@ class InputPreprocessor: tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> ProcessorInputs: """Preprocess the input prompt.""" if self.model_config.is_encoder_decoder: @@ -921,7 +908,7 @@ class InputPreprocessor: return self._process_encoder_decoder_prompt( prompt, tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) if is_explicit_encoder_decoder_prompt(prompt): @@ -933,7 +920,7 @@ class InputPreprocessor: prompt, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) async def preprocess_async( @@ -942,8 +929,7 @@ class InputPreprocessor: tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> ProcessorInputs: """ Async version of @@ -955,7 +941,7 @@ class InputPreprocessor: return await self._process_encoder_decoder_prompt_async( prompt, tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) if is_explicit_encoder_decoder_prompt(prompt): @@ -967,7 +953,7 @@ class InputPreprocessor: prompt, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) def clear_cache(self) -> None: diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 5eab02b17151..d7ae8206baca 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -21,7 +21,8 @@ from vllm.model_executor.model_loader.utils import set_default_torch_dtype from vllm.model_executor.models.transformers import replace_linear_class from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargsItems, NestedTensors) + MultiModalKwargsItems, MultiModalUUIDDict, + NestedTensors) from vllm.multimodal.parse import 
(ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -290,7 +291,7 @@ class DeepseekVL2MultiModalProcessor( mm_data_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: # The processor logic is different for len(images) <= 2 vs > 2 # Since the processing cache assumes that the processor output is @@ -302,7 +303,7 @@ class DeepseekVL2MultiModalProcessor( mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) return super()._cached_apply_hf_processor( @@ -310,7 +311,7 @@ class DeepseekVL2MultiModalProcessor( mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 306775af6806..b42df3ad8650 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -17,7 +17,7 @@ from transformers import PretrainedConfig from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalKwargsItems +from vllm.multimodal.inputs import MultiModalKwargsItems, MultiModalUUIDDict from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, MultiModalDataItems) from vllm.multimodal.processing import (MultiModalProcessingInfo, @@ -479,7 +479,7 @@ class H2OVLMultiModalProcessor( mm_data_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: # The processor logic is different for len(images) <= 1 vs > 1 # Since the processing cache assumes that the processor output is @@ -491,7 +491,7 @@ class H2OVLMultiModalProcessor( mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) return super()._cached_apply_hf_processor( @@ -499,7 +499,7 @@ class H2OVLMultiModalProcessor( mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 8a847a6180f3..d692b2783048 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -24,7 +24,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.cache import BaseMultiModalProcessorCache from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputs, MultiModalKwargsItems) + MultiModalInputs, MultiModalKwargsItems, + MultiModalUUIDDict) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import 
(BaseMultiModalProcessor, @@ -795,7 +796,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor): mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalInputs: hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index @@ -810,7 +811,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor): mm_data, hf_processor_mm_kwargs, tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides) + mm_uuids=mm_uuids) mm_items = self._to_mm_items(mm_data) mm_item_counts = mm_items.get_all_counts() diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index f441287a4d08..68aa16f8b9ec 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -57,7 +57,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs, MultiModalFieldConfig, - MultiModalKwargsItems) + MultiModalKwargsItems, MultiModalUUIDDict) from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseProcessingInfo, @@ -184,13 +184,13 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo] mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalEncDecInputs: mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs, tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides) + mm_uuids=mm_uuids) image_token_id = self.info.get_hf_config().image_token_index # Check that the number of image tokens in the decoder prompt matches diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index b74a09ee92c3..d6eec77ebcee 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -12,7 +12,8 @@ from vllm.logger import init_logger from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputs, MultiModalKwargsItems) + MultiModalInputs, MultiModalKwargsItems, + MultiModalUUIDDict) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -203,13 +204,13 @@ class PaliGemmaMultiModalProcessor( mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalInputs: mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs, tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides) + mm_uuids=mm_uuids) prompt_token_ids = mm_inputs["prompt_token_ids"] tokenizer = self.info.get_tokenizer() diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index e7f5799a8006..142d3251bc67 100644 --- a/vllm/model_executor/models/pixtral.py +++ 
b/vllm/model_executor/models/pixtral.py @@ -35,7 +35,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - NestedTensors) + MultiModalUUIDDict, NestedTensors) from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -316,14 +316,14 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo] mm_data_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: prompt_ids, mm_info, _ = super()._cached_apply_hf_processor( prompt=prompt, mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) # NOTE: The tokens are already inserted by the chat template diff --git a/vllm/model_executor/models/terratorch.py b/vllm/model_executor/models/terratorch.py index 453da1a51d98..b9dfa8e9b6f5 100644 --- a/vllm/model_executor/models/terratorch.py +++ b/vllm/model_executor/models/terratorch.py @@ -36,7 +36,7 @@ from vllm.multimodal.cache import MultiModalProcessorOnlyCache from vllm.multimodal.inputs import (ImageItem, ModalityData, MultiModalDataDict, MultiModalFieldConfig, MultiModalInputs, MultiModalKwargsItems, - PlaceholderRange) + MultiModalUUIDDict, PlaceholderRange) from vllm.multimodal.parse import (DictEmbeddingItems, ModalityDataItems, MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -164,7 +164,7 @@ class TerratorchMultiModalProcessor(BaseMultiModalProcessor): mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalInputs: if "image" in mm_data: image_data = mm_data["image"] @@ -177,7 +177,7 @@ class TerratorchMultiModalProcessor(BaseMultiModalProcessor): mm_hashes = self._hash_mm_items(mm_items, hf_processor_mm_kwargs, tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides) + mm_uuids=mm_uuids) mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]} mm_processed_data = BatchFeature(image_data) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 5ad0482330ec..a386f47e1929 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -44,7 +44,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputs, PlaceholderRange) + MultiModalInputs, MultiModalUUIDDict, + PlaceholderRange) from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo) @@ -347,7 +348,7 @@ class 
MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]): mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalInputs: """ Process multi-modal inputs to be used in vLLM. @@ -415,9 +416,8 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]): num_image_patches), ) # Use overrides if provided; fallback to data-dependent hashing. - mm_hashes = (mm_hash_overrides if mm_hash_overrides is not None else - self._hash_mm_items(mm_items, hf_processor_mm_kwargs, - tokenization_kwargs)) + mm_hashes = (mm_uuids if mm_uuids is not None else self._hash_mm_items( + mm_items, hf_processor_mm_kwargs, tokenization_kwargs)) return MultiModalInputs( type="multimodal", diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index f3731b389cfe..2831c4df78ba 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -31,7 +31,8 @@ from vllm.model_executor.models.whisper import WhisperEncoder from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargsItems, NestedTensors) + MultiModalKwargsItems, MultiModalUUIDDict, + NestedTensors) from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -290,14 +291,14 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo] mm_data_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], - mm_hash_overrides: Optional[dict[str, list[str]]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: prompt_ids, mm_info, _ = super()._cached_apply_hf_processor( prompt=prompt, mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) # NOTE: The tokens are already inserted by the chat template diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 0531b7bd9f0a..e5db356b635f 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1022,13 +1022,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalInputs: return self.apply(prompt, mm_data, hf_processor_mm_kwargs, - mm_hash_overrides=mm_hash_overrides) + mm_uuids=mm_uuids) def _get_data_parser(self) -> MultiModalDataParser: """ @@ -1364,8 +1363,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalHashes: """Create MM hashes to be returned (only used in V1). 
@@ -1376,30 +1374,30 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): model_id = self.info.model_id hashes: MultiModalHashes = {} - mm_hash_overrides = mm_hash_overrides or {} + mm_uuids = mm_uuids or {} for modality, items in mm_items.items(): - if modality in mm_hash_overrides: - mm_hashes = mm_hash_overrides[modality] - if isinstance(mm_hashes, str): - mm_hashes = [mm_hashes] + if modality in mm_uuids: + mm_uuids_per_modality = mm_uuids[modality] + if isinstance(mm_uuids_per_modality, str): + mm_uuids_per_modality = [mm_uuids_per_modality] # For None entries, compute a hash; otherwise, use provided ID. computed: list[str] = [] for i, item in enumerate(items): - mm_hash = mm_hashes[i] + item_uuid = mm_uuids_per_modality[i] - # NOTE: Even if a mm_hash is provided, we still compute a + # NOTE: Even if an item_uuid is provided, we still compute a # hash if `hf_processor_mm_kwargs` or `tokenization_kwargs` # are provided. This is because the processed multimodal # inputs can be different depending on the processor kwargs. - if mm_hash is None or \ + if item_uuid is None or \ hf_processor_mm_kwargs or \ tokenization_kwargs: # NOTE: use provided hash string to hash with kwargs # if available for better performance. - item = mm_hash if mm_hash is not None else item + item = item_uuid if item_uuid is not None else item computed.append( MultiModalHasher.hash_kwargs( model_id=model_id, @@ -1407,7 +1405,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): **hf_processor_mm_kwargs, **tokenization_kwargs)) else: - computed.append(mm_hash) + computed.append(item_uuid) hashes[modality] = computed else: hashes[modality] = [ @@ -1514,8 +1512,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: ( prompt_ids, @@ -1539,7 +1536,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_hashes = self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs, tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides) + mm_uuids=mm_uuids) mm_prompt_updates = self._get_mm_prompt_updates( mm_data_items, @@ -1562,8 +1559,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> tuple[list[int], MultiModalProcessingInfo, bool]: """ Apply the HF processor on the full prompt text, @@ -1578,13 +1574,13 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) mm_hashes = self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs, tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides) + mm_uuids=mm_uuids) mm_missing_data_items = self._get_cache_missing_items( cache=cache, @@ -1785,8 +1781,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalInputs: """ Process multi-modal inputs to
be used in vLLM. @@ -1815,7 +1810,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_items, hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) # NOTE: tokenization_kwargs are not required to init processor @@ -1901,8 +1896,7 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]): hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Optional[Mapping[str, object]] = None, *, - mm_hash_overrides: Optional[Union[dict[str, list[str]], - MultiModalUUIDDict]] = None, + mm_uuids: Optional[MultiModalUUIDDict] = None, ) -> MultiModalEncDecInputs: """ Process multi-modal inputs to be used in vLLM. @@ -1917,7 +1911,7 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]): mm_data, hf_processor_mm_kwargs, tokenization_kwargs, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) return self._get_enc_dec_inputs( diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index baade243140d..8ce070e4d6fb 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -12,7 +12,7 @@ from vllm.inputs.preprocess import InputPreprocessor from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal.cache import processor_cache_from_config -from vllm.multimodal.inputs import MultiModalFeatureSpec +from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalUUIDDict from vllm.multimodal.processing import EncDecMultiModalProcessor from vllm.multimodal.utils import argsort_mm_positions from vllm.pooling_params import PoolingParams @@ -276,11 +276,11 @@ class Processor: # Remember that this backend was set automatically params.guided_decoding.backend_was_auto = True - def _maybe_build_mm_hash_overrides( + def _maybe_build_mm_uuids( self, request_id: str, prompt: PromptType, - ) -> Optional[dict[str, list[str]]]: + ) -> Optional[MultiModalUUIDDict]: """Build per-item multimodal hash overrides when enabled. In this case, multimodal data items are identified by their request id, modality and index rather than their content. @@ -303,13 +303,13 @@ class Processor: if not mm_data: return None - overrides: dict[str, list[str]] = {} + mm_uuids: MultiModalUUIDDict = {} for modality, data in mm_data.items(): n = len(data) if isinstance(data, list) else 1 - overrides[modality] = [ + mm_uuids[modality] = [ f"{request_id}-{modality}-{i}" for i in range(n) ] - return overrides + return mm_uuids def process_inputs( self, @@ -351,16 +351,15 @@ class Processor: if (self.model_config.multimodal_config and self.model_config.multimodal_config.mm_processor_cache_gb == 0 and not self.cache_config.enable_prefix_caching): - mm_hash_overrides = self._maybe_build_mm_hash_overrides( - request_id, prompt) + mm_uuids = self._maybe_build_mm_uuids(request_id, prompt) else: # Otherwise, use user-provided uuids as multimodal hash overrides # if provided. self._validate_multi_modal_uuids(prompt) if isinstance(prompt, dict): - mm_hash_overrides = prompt.get("multi_modal_uuids") + mm_uuids = prompt.get("multi_modal_uuids") else: - mm_hash_overrides = None + mm_uuids = None # Process inputs, which includes: # 1. Tokenize text prompt, with LoRA request if one exists. @@ -370,7 +369,7 @@ class Processor: prompt, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - mm_hash_overrides=mm_hash_overrides, + mm_uuids=mm_uuids, ) from vllm.platforms import current_platform current_platform.validate_request(