From 7f064491f80ba20e782f33f4da566ec7da5118d7 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Mon, 17 Nov 2025 06:49:25 -0800
Subject: [PATCH] [Bugfix][Perf] Revert applying HF processor on text-only
 inputs for multimodal models (#28858)

Signed-off-by: Roger Wang
---
 tests/test_inputs.py      | 35 +++++++----------------------------
 vllm/inputs/preprocess.py | 14 ++++----------
 2 files changed, 11 insertions(+), 38 deletions(-)

diff --git a/tests/test_inputs.py b/tests/test_inputs.py
index 50a273016ab8..b1fb4e06a690 100644
--- a/tests/test_inputs.py
+++ b/tests/test_inputs.py
@@ -86,34 +86,6 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
     assert zipped["mm_processor_kwargs"] == exp_kwargs
 
 
-@pytest.mark.parametrize(
-    "model_id",
-    [
-        "facebook/opt-125m",
-    ],
-)
-@pytest.mark.parametrize(
-    "prompt",
-    [
-        {
-            "prompt": "",
-            "multi_modal_data": {"dummy": []},
-        },
-        {
-            "prompt_token_ids": [],
-            "multi_modal_data": {"dummy": []},
-        },
-    ],
-)
-def test_preprocessor_text_no_mm_inputs(model_id, prompt):
-    model_config = ModelConfig(model=model_id)
-    tokenizer = init_tokenizer_from_configs(model_config)
-    input_preprocessor = InputPreprocessor(model_config, tokenizer)
-
-    with pytest.raises(ValueError, match="does not support multimodal inputs"):
-        input_preprocessor.preprocess(prompt)
-
-
 @pytest.mark.parametrize(
     "model_id",
     [
@@ -127,6 +99,13 @@ def test_preprocessor_text_no_mm_inputs(model_id, prompt):
         {"prompt_token_ids": []},
     ],
 )
+@pytest.mark.skip(
+    reason=(
+        "Applying huggingface processor on text inputs results in "
+        "significant performance regression for multimodal models. "
+        "See https://github.com/vllm-project/vllm/issues/26320"
+    )
+)
 def test_preprocessor_always_mm_code_path(model_id, prompt):
     model_config = ModelConfig(model=model_id)
     tokenizer = init_tokenizer_from_configs(model_config)
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 80d5322a34c3..839c13868a16 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -348,18 +348,15 @@ class InputPreprocessor:
         )
 
         inputs: TokenInputs | MultiModalInputs
-        if self.model_config.is_multimodal_model:
+        if multi_modal_data := parsed_content.get("multi_modal_data"):
             inputs = self._process_multimodal(
                 prompt_token_ids,
-                parsed_content.get("multi_modal_data") or {},
+                multi_modal_data,
                 parsed_content.get("mm_processor_kwargs") or {},
                 tokenization_kwargs=tokenization_kwargs,
                 mm_uuids=mm_uuids,
             )
         else:
-            if parsed_content.get("multi_modal_data"):
-                raise ValueError("This model does not support multimodal inputs")
-
             inputs = token_inputs(prompt_token_ids)
 
         if cache_salt := parsed_content.get("cache_salt"):
@@ -377,18 +374,15 @@ class InputPreprocessor:
         prompt_text = parsed_content["prompt"]
 
         inputs: TokenInputs | MultiModalInputs
-        if self.model_config.is_multimodal_model:
+        if multi_modal_data := parsed_content.get("multi_modal_data"):
             inputs = self._process_multimodal(
                 prompt_text,
-                parsed_content.get("multi_modal_data") or {},
+                multi_modal_data,
                 parsed_content.get("mm_processor_kwargs") or {},
                 tokenization_kwargs=tokenization_kwargs,
                 mm_uuids=mm_uuids,
             )
         else:
-            if parsed_content.get("multi_modal_data"):
-                raise ValueError("This model does not support multimodal inputs")
-
             prompt_token_ids = self._tokenize_prompt(
                 prompt_text,
                 tokenization_kwargs=tokenization_kwargs,
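
Reviewer note, not part of the patch: the core change in vllm/inputs/preprocess.py
is that routing to the multimodal path is now keyed on the presence of non-empty
multi_modal_data in the parsed prompt, rather than on
model_config.is_multimodal_model, so text-only prompts sent to multimodal models
skip the HF processor entirely. The sketch below illustrates only that gating;
route_prompt is a hypothetical stand-in helper, not vLLM's actual API.

    # Minimal sketch of the walrus-operator gating introduced by this patch.
    # `parsed_content` mirrors the dict shape used in preprocess.py; a missing
    # key and an empty dict are both falsy, so both cases take the cheap
    # text-only path instead of invoking the HF processor.
    def route_prompt(parsed_content: dict) -> str:
        if multi_modal_data := parsed_content.get("multi_modal_data"):
            return f"multimodal path, modalities={list(multi_modal_data)}"
        return "text-only path, plain tokenization"

    print(route_prompt({"prompt": "hi"}))                                    # text-only
    print(route_prompt({"prompt": "hi", "multi_modal_data": {}}))            # text-only
    print(route_prompt({"prompt": "hi", "multi_modal_data": {"image": 1}}))  # multimodal

One behavioral consequence, reflected in the removed test: a text-only model that
receives multi_modal_data no longer raises the up-front ValueError("This model
does not support multimodal inputs"); such inputs are now handed to
_process_multimodal regardless of model type.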