From b7e8e4e6be42e32332bf54da459508938d9ff02a Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 5 Oct 2025 18:10:20 +0800 Subject: [PATCH] [Bugfix] Always apply MM processor even when no MM items are passed (#26240) Signed-off-by: DarkLight1337 --- tests/conftest.py | 29 ++++++----- tests/test_inputs.py | 50 +++++++++++++++++++ vllm/inputs/preprocess.py | 16 ++++-- vllm/model_executor/models/phi3v.py | 4 +- .../models/qwen2_5_omni_thinker.py | 1 + vllm/multimodal/processing.py | 32 ++++++++---- 6 files changed, 102 insertions(+), 30 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index c61a8f8dd539a..fd48c66341bb6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -46,7 +46,6 @@ from vllm.connections import global_http_connection from vllm.distributed import (cleanup_dist_env_and_memory, init_distributed_environment, initialize_model_parallel) -from vllm.inputs import TextPrompt from vllm.logger import init_logger from vllm.logprobs import Logprob from vllm.multimodal.utils import fetch_image @@ -760,17 +759,24 @@ class VllmRunner: images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, - ) -> list[TextPrompt]: - + ) -> list[dict[str, Any]]: if any(x is not None and len(x) != len(prompts) for x in [images, videos, audios]): raise ValueError( "All non-None multimodal inputs must have the same length as " "prompts") - inputs = [] + inputs = list[dict[str, Any]]() for i, prompt in enumerate(prompts): - multi_modal_data = {} + prompt_dict = dict[str, Any]() + if isinstance(prompt, str): + prompt_dict["prompt"] = prompt + elif isinstance(prompt, list): + prompt_dict["prompt_token_ids"] = prompt + else: + prompt_dict["prompt_embeds"] = prompt + + multi_modal_data = dict[str, Any]() if images is not None and (image := images[i]) is not None: multi_modal_data["image"] = image if videos is not None and (video := videos[i]) is not None: @@ -778,17 +784,10 @@ class VllmRunner: if audios is not None and (audio := audios[i]) is not None: multi_modal_data["audio"] = audio - text_prompt_kwargs: dict[str, Any] = { - "multi_modal_data": multi_modal_data or None - } - if isinstance(prompt, str): - text_prompt_kwargs["prompt"] = prompt - elif isinstance(prompt, list): - text_prompt_kwargs["prompt_token_ids"] = prompt - else: - text_prompt_kwargs["prompt_embeds"] = prompt + if multi_modal_data: + prompt_dict["multi_modal_data"] = multi_modal_data - inputs.append(TextPrompt(**text_prompt_kwargs)) + inputs.append(prompt_dict) return inputs diff --git a/tests/test_inputs.py b/tests/test_inputs.py index 10a18e2d871fb..02cd103795742 100644 --- a/tests/test_inputs.py +++ b/tests/test_inputs.py @@ -3,8 +3,11 @@ import pytest +from vllm.config import ModelConfig from vllm.inputs import zip_enc_dec_prompts from vllm.inputs.parse import parse_raw_prompts +from vllm.inputs.preprocess import InputPreprocessor +from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs pytestmark = pytest.mark.cpu_test @@ -80,3 +83,50 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs): assert zipped['encoder_prompt'] == enc assert zipped['decoder_prompt'] == dec assert zipped['mm_processor_kwargs'] == exp_kwargs + + +@pytest.mark.parametrize("model_id", [ + "facebook/opt-125m", +]) +@pytest.mark.parametrize("prompt", [ + { + "prompt": "", + "multi_modal_data": { + "dummy": [] + }, + }, + { + "prompt_token_ids": [], + "multi_modal_data": { + "dummy": [] + }, + }, +]) +def 
test_preprocessor_text_no_mm_inputs(model_id, prompt): + model_config = ModelConfig(model=model_id) + tokenizer = init_tokenizer_from_configs(model_config) + input_preprocessor = InputPreprocessor(model_config, tokenizer) + + with pytest.raises(ValueError, match="does not support multimodal inputs"): + input_preprocessor.preprocess(prompt) + + +@pytest.mark.parametrize("model_id", [ + "facebook/chameleon-7b", +]) +@pytest.mark.parametrize("prompt", [ + "", + { + "prompt_token_ids": [] + }, +]) +def test_preprocessor_always_mm_code_path(model_id, prompt): + model_config = ModelConfig(model=model_id) + tokenizer = init_tokenizer_from_configs(model_config) + input_preprocessor = InputPreprocessor(model_config, tokenizer) + + # HF processor adds sep token + sep_token_id = tokenizer.vocab[tokenizer.sep_token] + + processed_inputs = input_preprocessor.preprocess(prompt) + assert sep_token_id in processed_inputs["prompt_token_ids"] diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 65460b46cb5a6..c82daf39be7ad 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -314,15 +314,19 @@ class InputPreprocessor: parsed_content["prompt_token_ids"], tokenization_kwargs) inputs: Union[TokenInputs, MultiModalInputs] - if multi_modal_data := parsed_content.get("multi_modal_data"): + if self.model_config.is_multimodal_model: inputs = self._process_multimodal( prompt_token_ids, - multi_modal_data, + parsed_content.get("multi_modal_data", {}), parsed_content.get("mm_processor_kwargs"), tokenization_kwargs=tokenization_kwargs, mm_uuids=mm_uuids, ) else: + if parsed_content.get("multi_modal_data"): + raise ValueError( + "This model does not support multimodal inputs") + inputs = token_inputs(prompt_token_ids) if cache_salt := parsed_content.get("cache_salt"): @@ -340,15 +344,19 @@ class InputPreprocessor: prompt_text = parsed_content["prompt"] inputs: Union[TokenInputs, MultiModalInputs] - if multi_modal_data := parsed_content.get("multi_modal_data"): + if self.model_config.is_multimodal_model: inputs = self._process_multimodal( prompt_text, - multi_modal_data, + parsed_content.get("multi_modal_data", {}), parsed_content.get("mm_processor_kwargs"), tokenization_kwargs=tokenization_kwargs, mm_uuids=mm_uuids, ) else: + if parsed_content.get("multi_modal_data"): + raise ValueError( + "This model does not support multimodal inputs") + prompt_token_ids = self._tokenize_prompt( prompt_text, tokenization_kwargs=tokenization_kwargs, diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 2415f3696f001..df5f0f0039d3c 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -507,8 +507,8 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]): ) # Keep the behavior in line with HF processor - if token_ids[:2] == tokenizer.encode(" <|image|>", - add_special_tokens=False): + if len(mm_prompt_updates) and (token_ids[:2] == tokenizer.encode( + " <|image|>", add_special_tokens=False)): token_ids = [token_ids[0], *token_ids[2:]] placeholders = { modality: [ diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index b5c2aee7f2314..219769b07b08e 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -331,6 +331,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor( """ mm_item_counts = mm_items.get_all_counts() self._validate_mm_kwargs(mm_kwargs, mm_item_counts) + 
self._validate_mm_updates(mm_prompt_updates, mm_item_counts)
 
         use_audio_in_video = False
         if "video" in mm_kwargs:
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index bc998dc2785f0..21fa467b3331e 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1946,6 +1946,24 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
                 "model (usually arising from an inconsistency between "
                 "`_call_hf_processor` and `_get_mm_fields_config`).")
 
+    def _validate_mm_updates(
+        self,
+        mm_updates: MultiModalPromptUpdates,
+        mm_item_counts: Mapping[str, int],
+    ) -> None:
+        for modality, item_count in mm_item_counts.items():
+            placeholders = mm_updates.get(modality, [])
+
+            if len(placeholders) != item_count:
+                raise RuntimeError(
+                    f"Expected there to be {item_count} prompt updates "
+                    f"corresponding to {item_count} {modality} items, but "
+                    f"instead found {len(placeholders)} prompt updates! "
+                    "This is likely because you forgot to include input "
+                    "placeholder tokens (e.g., `<image>`, `<|image_pad|>`) "
+                    "in the prompt. If the model has a chat template, make "
+                    "sure you have applied it before calling `LLM.generate`.")
+
     def _validate_mm_placeholders(
         self,
         mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]],
@@ -1955,17 +1973,12 @@
             placeholders = mm_placeholders.get(modality, [])
 
             if len(placeholders) != item_count:
-                # NOTE: If you are a model developer, this can also arise from
-                # an inconsistency between `_call_hf_processor` and
-                # `_get_mm_fields_config` implementations
                 raise RuntimeError(
-                    f"Expected there to be {item_count} prompt updates "
+                    f"Expected there to be {item_count} prompt placeholders "
                     f"corresponding to {item_count} {modality} items, but "
-                    f"instead found {len(placeholders)} prompt updates! "
-                    "This is likely because you forgot to include input "
-                    "placeholder tokens (e.g., `<image>`, `<|image_pad|>`) "
-                    "in the prompt. If the model has a chat template, make "
-                    "sure you have applied it before calling `LLM.generate`.")
+                    f"instead found {len(placeholders)} prompt placeholders! "
+                    "Make sure the implementation of `_call_hf_processor` and "
+                    "`_get_mm_fields_config` are consistent with each other.")
 
     def _maybe_apply_prompt_updates(
         self,
@@ -1977,6 +1990,7 @@
     ) -> tuple[list[int], Mapping[str, list[PlaceholderFeaturesInfo]]]:
         mm_item_counts = mm_items.get_all_counts()
         self._validate_mm_kwargs(mm_kwargs, mm_item_counts)
+        self._validate_mm_updates(mm_prompt_updates, mm_item_counts)
 
         if is_update_applied:
             mm_placeholders = self._find_mm_placeholders(
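
A minimal usage sketch of the new behavior (this assumes the patch is applied;
the model IDs and calls mirror the tests added in tests/test_inputs.py above):

    from vllm.config import ModelConfig
    from vllm.inputs.preprocess import InputPreprocessor
    from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs

    # Text-only model: multi_modal_data, even an empty dummy entry, is now
    # rejected explicitly instead of being passed through.
    text_config = ModelConfig(model="facebook/opt-125m")
    text_preprocessor = InputPreprocessor(
        text_config, init_tokenizer_from_configs(text_config))
    try:
        text_preprocessor.preprocess({
            "prompt": "",
            "multi_modal_data": {"dummy": []},
        })
    except ValueError as exc:
        print(exc)  # "This model does not support multimodal inputs"

    # Multimodal model: the multimodal processor now runs even when no MM
    # items are passed, so processor-added special tokens (e.g. Chameleon's
    # sep token) appear in the resulting prompt_token_ids.
    mm_config = ModelConfig(model="facebook/chameleon-7b")
    mm_tokenizer = init_tokenizer_from_configs(mm_config)
    mm_preprocessor = InputPreprocessor(mm_config, mm_tokenizer)

    processed = mm_preprocessor.preprocess("")
    assert mm_tokenizer.vocab[mm_tokenizer.sep_token] in processed["prompt_token_ids"]

The second case is the bugfix itself: the multimodal code path is no longer
skipped just because the prompt carries no multimodal items.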