[Bugfix][Perf] Revert applying HF processor on text-only inputs for multimodal models (#28858)

Signed-off-by: Roger Wang <hey@rogerw.io>
Roger Wang authored 2025-11-17 06:49:25 -08:00, committed by GitHub
parent 64e39d667c
commit 7f064491f8
2 changed files with 11 additions and 38 deletions
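In effect, the revert makes input preprocessing route on whether a request actually carries multi_modal_data instead of on whether the model is multimodal, so text-only prompts to multimodal models no longer pass through the Hugging Face processor (the regression tracked in https://github.com/vllm-project/vllm/issues/26320). Below is a minimal standalone sketch of that control flow; the helper names are stand-ins for vLLM internals, not the real API:

from typing import Any


def _token_inputs(prompt_token_ids: list[int]) -> dict[str, Any]:
    # Stand-in for the cheap tokenizer-only path (token_inputs in the diff).
    return {"type": "token", "prompt_token_ids": prompt_token_ids}


def _process_multimodal(token_ids: list[int], mm_data: dict[str, Any]) -> dict[str, Any]:
    # Stand-in for InputPreprocessor._process_multimodal, which invokes the
    # expensive Hugging Face processor.
    return {"type": "multimodal", "prompt_token_ids": token_ids, "mm_data": mm_data}


def preprocess_tokens(parsed_content: dict[str, Any]) -> dict[str, Any]:
    # Post-revert routing: pay for the HF processor only when multimodal
    # data is actually attached to the request.
    if multi_modal_data := parsed_content.get("multi_modal_data"):
        return _process_multimodal(parsed_content["prompt_token_ids"], multi_modal_data)
    return _token_inputs(parsed_content["prompt_token_ids"])


# Text-only request: fast path, no HF processor involved.
assert preprocess_tokens({"prompt_token_ids": [1, 2, 3]})["type"] == "token"
# Request with attached media: still takes the multimodal path.
assert preprocess_tokens(
    {"prompt_token_ids": [1], "multi_modal_data": {"image": ["<pixels>"]}}
)["type"] == "multimodal"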


@@ -86,34 +86,6 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
     assert zipped["mm_processor_kwargs"] == exp_kwargs
 
 
-@pytest.mark.parametrize(
-    "model_id",
-    [
-        "facebook/opt-125m",
-    ],
-)
-@pytest.mark.parametrize(
-    "prompt",
-    [
-        {
-            "prompt": "",
-            "multi_modal_data": {"dummy": []},
-        },
-        {
-            "prompt_token_ids": [],
-            "multi_modal_data": {"dummy": []},
-        },
-    ],
-)
-def test_preprocessor_text_no_mm_inputs(model_id, prompt):
-    model_config = ModelConfig(model=model_id)
-    tokenizer = init_tokenizer_from_configs(model_config)
-    input_preprocessor = InputPreprocessor(model_config, tokenizer)
-
-    with pytest.raises(ValueError, match="does not support multimodal inputs"):
-        input_preprocessor.preprocess(prompt)
-
-
 @pytest.mark.parametrize(
     "model_id",
     [
@@ -127,6 +99,13 @@ def test_preprocessor_text_no_mm_inputs(model_id, prompt):
         {"prompt_token_ids": []},
     ],
 )
+@pytest.mark.skip(
+    reason=(
+        "Applying huggingface processor on text inputs results in "
+        "significant performance regression for multimodal models. "
+        "See https://github.com/vllm-project/vllm/issues/26320"
+    )
+)
 def test_preprocessor_always_mm_code_path(model_id, prompt):
     model_config = ModelConfig(model=model_id)
     tokenizer = init_tokenizer_from_configs(model_config)
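The skip marker keeps the regression test in the tree while recording why it cannot currently run. For reference, a self-contained illustration of the same pattern (hypothetical test name); running pytest with -rs prints the reason string alongside the SKIPPED entry:

import pytest


@pytest.mark.skip(
    reason="Known perf regression; see https://github.com/vllm-project/vllm/issues/26320"
)
def test_hypothetical_mm_code_path():
    # Not executed while the marker is present; pytest -rs reports it as
    # SKIPPED together with the reason above.
    raise AssertionError("unreachable while skipped")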


@@ -348,18 +348,15 @@ class InputPreprocessor:
         )
 
         inputs: TokenInputs | MultiModalInputs
-        if self.model_config.is_multimodal_model:
+        if multi_modal_data := parsed_content.get("multi_modal_data"):
             inputs = self._process_multimodal(
                 prompt_token_ids,
-                parsed_content.get("multi_modal_data") or {},
+                multi_modal_data,
                 parsed_content.get("mm_processor_kwargs") or {},
                 tokenization_kwargs=tokenization_kwargs,
                 mm_uuids=mm_uuids,
             )
         else:
-            if parsed_content.get("multi_modal_data"):
-                raise ValueError("This model does not support multimodal inputs")
-
             inputs = token_inputs(prompt_token_ids)
 
         if cache_salt := parsed_content.get("cache_salt"):
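Note that the new condition routes on truthiness, so a missing key, an explicit None, and an empty dict all fall through to the token-only branch, matching what the old parsed_content.get("multi_modal_data") or {} normalization produced. A quick standalone check of that behavior:

def route(parsed_content: dict) -> str:
    # Mirrors the condition introduced in the hunk above.
    if multi_modal_data := parsed_content.get("multi_modal_data"):
        return "multimodal:" + repr(multi_modal_data)
    return "token-only"


assert route({}) == "token-only"                          # key absent
assert route({"multi_modal_data": None}) == "token-only"  # explicit None
assert route({"multi_modal_data": {}}) == "token-only"    # empty dict
assert route({"multi_modal_data": {"image": [0]}}).startswith("multimodal")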
@@ -377,18 +374,15 @@ class InputPreprocessor:
         prompt_text = parsed_content["prompt"]
 
         inputs: TokenInputs | MultiModalInputs
-        if self.model_config.is_multimodal_model:
+        if multi_modal_data := parsed_content.get("multi_modal_data"):
             inputs = self._process_multimodal(
                 prompt_text,
-                parsed_content.get("multi_modal_data") or {},
+                multi_modal_data,
                 parsed_content.get("mm_processor_kwargs") or {},
                 tokenization_kwargs=tokenization_kwargs,
                 mm_uuids=mm_uuids,
             )
         else:
-            if parsed_content.get("multi_modal_data"):
-                raise ValueError("This model does not support multimodal inputs")
-
             prompt_token_ids = self._tokenize_prompt(
                 prompt_text,
                 tokenization_kwargs=tokenization_kwargs,
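A behavioral consequence worth noting: the eager "does not support multimodal inputs" check disappears from both branches, which is why test_preprocessor_text_no_mm_inputs is deleted above. A text-only model handed multi_modal_data now enters the multimodal path, and any rejection presumably surfaces further down the stack rather than at this check. A schematic before/after comparison (stub logic, not the real classes):

def old_route(is_multimodal_model: bool, parsed_content: dict) -> str:
    # Pre-revert: gate on the model, so even pure-text requests paid for
    # the HF processor on multimodal models.
    if is_multimodal_model:
        return "hf_processor"
    if parsed_content.get("multi_modal_data"):
        raise ValueError("This model does not support multimodal inputs")
    return "tokenize_only"


def new_route(is_multimodal_model: bool, parsed_content: dict) -> str:
    # Post-revert: gate on the data; the eager ValueError is gone from
    # this layer (the model flag is deliberately unused here).
    if parsed_content.get("multi_modal_data"):
        return "hf_processor"
    return "tokenize_only"


# Text-only prompt on a multimodal model: slow before, fast after.
assert old_route(True, {"prompt_token_ids": []}) == "hf_processor"
assert new_route(True, {"prompt_token_ids": []}) == "tokenize_only"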