[Bugfix][Perf] Revert applying HF processor on text-only inputs for multimodal models (#28858)

Signed-off-by: Roger Wang <hey@rogerw.io>

parent 64e39d667c
commit 7f064491f8
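In effect, the revert changes prompt routing in InputPreprocessor: a text-only prompt no longer passes through the Hugging Face processor just because the model is multimodal; only prompts that actually carry multi_modal_data take that path. A minimal standalone sketch of the before/after dispatch, assuming hypothetical helper names route_before/route_after (illustrative Python only, not vLLM's actual API):

    def route_before(parsed: dict, is_multimodal_model: bool) -> str:
        # Pre-revert routing: every prompt to a multimodal model went
        # through the HF processor, even with no media attached.
        if is_multimodal_model:
            return "_process_multimodal"
        return "token_inputs"

    def route_after(parsed: dict, is_multimodal_model: bool) -> str:
        # Post-revert routing: only prompts that actually carry
        # multi_modal_data pay the HF processor cost.
        if parsed.get("multi_modal_data"):
            return "_process_multimodal"
        return "token_inputs"

    text_only = {"prompt": "Describe the weather."}
    assert route_before(text_only, is_multimodal_model=True) == "_process_multimodal"
    assert route_after(text_only, is_multimodal_model=True) == "token_inputs"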
@@ -86,34 +86,6 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
     assert zipped["mm_processor_kwargs"] == exp_kwargs
 
 
-@pytest.mark.parametrize(
-    "model_id",
-    [
-        "facebook/opt-125m",
-    ],
-)
-@pytest.mark.parametrize(
-    "prompt",
-    [
-        {
-            "prompt": "",
-            "multi_modal_data": {"dummy": []},
-        },
-        {
-            "prompt_token_ids": [],
-            "multi_modal_data": {"dummy": []},
-        },
-    ],
-)
-def test_preprocessor_text_no_mm_inputs(model_id, prompt):
-    model_config = ModelConfig(model=model_id)
-    tokenizer = init_tokenizer_from_configs(model_config)
-    input_preprocessor = InputPreprocessor(model_config, tokenizer)
-
-    with pytest.raises(ValueError, match="does not support multimodal inputs"):
-        input_preprocessor.preprocess(prompt)
-
-
 @pytest.mark.parametrize(
     "model_id",
     [
@@ -127,6 +99,13 @@ def test_preprocessor_text_no_mm_inputs(model_id, prompt):
         {"prompt_token_ids": []},
     ],
 )
+@pytest.mark.skip(
+    reason=(
+        "Applying huggingface processor on text inputs results in "
+        "significant performance regression for multimodal models. "
+        "See https://github.com/vllm-project/vllm/issues/26320"
+    )
+)
 def test_preprocessor_always_mm_code_path(model_id, prompt):
     model_config = ModelConfig(model=model_id)
     tokenizer = init_tokenizer_from_configs(model_config)
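Note that this test is kept but marked skipped rather than deleted, so it can be re-enabled if the regression in issue #26320 is resolved; skipped tests still appear in pytest reports. Assuming the test lives in a file like tests/test_inputs.py (the file path is not shown in this extraction), an invocation with -rs would surface the skip reason:

    pytest tests/test_inputs.py -k test_preprocessor_always_mm_code_path -rs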
@@ -348,18 +348,15 @@ class InputPreprocessor:
         )
 
         inputs: TokenInputs | MultiModalInputs
-        if self.model_config.is_multimodal_model:
+        if multi_modal_data := parsed_content.get("multi_modal_data"):
             inputs = self._process_multimodal(
                 prompt_token_ids,
-                parsed_content.get("multi_modal_data") or {},
+                multi_modal_data,
                 parsed_content.get("mm_processor_kwargs") or {},
                 tokenization_kwargs=tokenization_kwargs,
                 mm_uuids=mm_uuids,
             )
         else:
-            if parsed_content.get("multi_modal_data"):
-                raise ValueError("This model does not support multimodal inputs")
-
             inputs = token_inputs(prompt_token_ids)
 
         if cache_salt := parsed_content.get("cache_salt"):
@@ -377,18 +374,15 @@ class InputPreprocessor:
         prompt_text = parsed_content["prompt"]
 
         inputs: TokenInputs | MultiModalInputs
-        if self.model_config.is_multimodal_model:
+        if multi_modal_data := parsed_content.get("multi_modal_data"):
             inputs = self._process_multimodal(
                 prompt_text,
-                parsed_content.get("multi_modal_data") or {},
+                multi_modal_data,
                 parsed_content.get("mm_processor_kwargs") or {},
                 tokenization_kwargs=tokenization_kwargs,
                 mm_uuids=mm_uuids,
             )
         else:
-            if parsed_content.get("multi_modal_data"):
-                raise ValueError("This model does not support multimodal inputs")
-
             prompt_token_ids = self._tokenize_prompt(
                 prompt_text,
                 tokenization_kwargs=tokenization_kwargs,
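Both preprocessor hunks make the same two changes: the branch condition now keys on whether the parsed prompt actually contains multi_modal_data, bound once via an assignment expression, rather than on whether the model is multimodal; and the explicit ValueError guard (together with its test removed above) is dropped. A minimal illustration of the assignment-expression guard, independent of vLLM:

    parsed_content = {"prompt": "hello"}  # no "multi_modal_data" key present

    if multi_modal_data := parsed_content.get("multi_modal_data"):
        branch = "multimodal"  # .get() returned a truthy value, now bound locally
    else:
        branch = "text-only"  # .get() returned None (or an empty container)

    assert branch == "text-only"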