From 7f064491f80ba20e782f33f4da566ec7da5118d7 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Mon, 17 Nov 2025 06:49:25 -0800
Subject: [PATCH] [Bugfix][Perf] Revert applying HF processor on text-only
 inputs for multimodal models (#28858)

Signed-off-by: Roger Wang
---
 tests/test_inputs.py      | 35 +++++++----------------------------
 vllm/inputs/preprocess.py | 14 ++++----------
 2 files changed, 11 insertions(+), 38 deletions(-)

diff --git a/tests/test_inputs.py b/tests/test_inputs.py
index 50a273016ab8..b1fb4e06a690 100644
--- a/tests/test_inputs.py
+++ b/tests/test_inputs.py
@@ -86,34 +86,6 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
     assert zipped["mm_processor_kwargs"] == exp_kwargs
 
 
-@pytest.mark.parametrize(
-    "model_id",
-    [
-        "facebook/opt-125m",
-    ],
-)
-@pytest.mark.parametrize(
-    "prompt",
-    [
-        {
-            "prompt": "",
-            "multi_modal_data": {"dummy": []},
-        },
-        {
-            "prompt_token_ids": [],
-            "multi_modal_data": {"dummy": []},
-        },
-    ],
-)
-def test_preprocessor_text_no_mm_inputs(model_id, prompt):
-    model_config = ModelConfig(model=model_id)
-    tokenizer = init_tokenizer_from_configs(model_config)
-    input_preprocessor = InputPreprocessor(model_config, tokenizer)
-
-    with pytest.raises(ValueError, match="does not support multimodal inputs"):
-        input_preprocessor.preprocess(prompt)
-
-
 @pytest.mark.parametrize(
     "model_id",
     [
@@ -127,6 +99,13 @@ def test_preprocessor_text_no_mm_inputs(model_id, prompt):
         {"prompt_token_ids": []},
     ],
 )
+@pytest.mark.skip(
+    reason=(
+        "Applying huggingface processor on text inputs results in "
+        "significant performance regression for multimodal models. "
+        "See https://github.com/vllm-project/vllm/issues/26320"
+    )
+)
 def test_preprocessor_always_mm_code_path(model_id, prompt):
     model_config = ModelConfig(model=model_id)
     tokenizer = init_tokenizer_from_configs(model_config)
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 80d5322a34c3..839c13868a16 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -348,18 +348,15 @@ class InputPreprocessor:
         )
 
         inputs: TokenInputs | MultiModalInputs
-        if self.model_config.is_multimodal_model:
+        if multi_modal_data := parsed_content.get("multi_modal_data"):
             inputs = self._process_multimodal(
                 prompt_token_ids,
-                parsed_content.get("multi_modal_data") or {},
+                multi_modal_data,
                 parsed_content.get("mm_processor_kwargs") or {},
                 tokenization_kwargs=tokenization_kwargs,
                 mm_uuids=mm_uuids,
             )
         else:
-            if parsed_content.get("multi_modal_data"):
-                raise ValueError("This model does not support multimodal inputs")
-
             inputs = token_inputs(prompt_token_ids)
 
         if cache_salt := parsed_content.get("cache_salt"):
@@ -377,18 +374,15 @@ class InputPreprocessor:
         prompt_text = parsed_content["prompt"]
 
         inputs: TokenInputs | MultiModalInputs
-        if self.model_config.is_multimodal_model:
+        if multi_modal_data := parsed_content.get("multi_modal_data"):
             inputs = self._process_multimodal(
                 prompt_text,
-                parsed_content.get("multi_modal_data") or {},
+                multi_modal_data,
                 parsed_content.get("mm_processor_kwargs") or {},
                 tokenization_kwargs=tokenization_kwargs,
                 mm_uuids=mm_uuids,
             )
         else:
-            if parsed_content.get("multi_modal_data"):
-                raise ValueError("This model does not support multimodal inputs")
-
             prompt_token_ids = self._tokenize_prompt(
                 prompt_text,
                 tokenization_kwargs=tokenization_kwargs,
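
Reviewer note, not part of the patch: the core change in vllm/inputs/preprocess.py
is that routing to the multimodal path is now keyed on the presence of non-empty
multi_modal_data in the parsed prompt, rather than on
model_config.is_multimodal_model, so text-only prompts sent to multimodal models
skip the HF processor entirely. The sketch below illustrates only that gating;
route_prompt is a hypothetical stand-in helper, not vLLM's actual API.

    # Minimal sketch of the walrus-operator gating introduced by this patch.
    # `parsed_content` mirrors the dict shape used in preprocess.py; a missing
    # key and an empty dict are both falsy, so both cases take the cheap
    # text-only path instead of invoking the HF processor.
    def route_prompt(parsed_content: dict) -> str:
        if multi_modal_data := parsed_content.get("multi_modal_data"):
            return f"multimodal path, modalities={list(multi_modal_data)}"
        return "text-only path, plain tokenization"

    print(route_prompt({"prompt": "hi"}))                                    # text-only
    print(route_prompt({"prompt": "hi", "multi_modal_data": {}}))            # text-only
    print(route_prompt({"prompt": "hi", "multi_modal_data": {"image": 1}}))  # multimodal

One behavioral consequence, reflected in the removed test: a text-only model that
receives multi_modal_data no longer raises the up-front ValueError("This model
does not support multimodal inputs"); such inputs are now handed to
_process_multimodal regardless of model type.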