From 73e0225ee9defc6fa57db3ac53d178b8a3fc169b Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Thu, 27 Feb 2025 21:00:45 -0700 Subject: [PATCH] [Bugfix] Check that number of images matches number of <|image|> tokens with mllama (#13911) Signed-off-by: Travis Johnson --- .../vision_language/test_mllama.py | 5 ++-- vllm/model_executor/models/mllama.py | 24 ++++++++++++++++++- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py index 202516f4c2097..4fee04fdb7b6f 100644 --- a/tests/models/encoder_decoder/vision_language/test_mllama.py +++ b/tests/models/encoder_decoder/vision_language/test_mllama.py @@ -479,8 +479,9 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens, # Regression tests for https://github.com/vllm-project/vllm/issues/10648 - # Number of image tags is greater than the number of images provided - prompt = "<|begin_of_text|><|image|><|image|> Compare the two images" # noqa: E501 + # Number of groups of image tokens is greater than the number of images + # provided (the whitespace between the tags is necessary) + prompt = "<|begin_of_text|><|image|> <|image|> Compare the two images" # noqa: E501 image = stop_sign with pytest.raises(ValueError): vllm_model.generate_greedy_logprobs([prompt], diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 7122fea2b3a80..2a829bf0e61e7 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -54,7 +54,8 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs +from vllm.multimodal.inputs import (MultiModalEncDecInputs, + MultiModalFieldConfig, MultiModalKwargs) from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataDict, MultiModalDataItems) from vllm.multimodal.processing import (BaseProcessingInfo, @@ -169,6 +170,27 @@ class MllamaDummyInputsBuilder(BaseDummyInputsBuilder[MllamaProcessingInfo]): class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo] ): + def apply( + self, + prompt: Union[str, list[int]], + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalEncDecInputs: + mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs) + + # Check that the number of image tokens in the decoder prompt matches + # the number of images provided in mm_data + num_image_tokens = mm_inputs['prompt_token_ids'].count( + self.info.get_hf_config().image_token_index) + image_data = mm_data.get("image", []) + num_images = 1 if isinstance(image_data, Image) else len(image_data) + if num_image_tokens != num_images: + raise ValueError( + f"The number of image tokens ({num_image_tokens}) must be" + f" the same as the number of images ({num_images})") + + return mm_inputs + def _call_hf_processor( self, prompt: str,