[bugfix] Missing cached item in beam search (#27874)
Signed-off-by: fake0fan <645327136@qq.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>

parent 70bfbd7b16
commit 103a468bbf
@@ -345,22 +345,7 @@ class OpenAIServing:
         if is_explicit_encoder_decoder_prompt(prompt):
             raise NotImplementedError
-        else:
-            processed_inputs = processor.input_preprocessor._prompt_to_llm_inputs(
-                prompt
-            )
-
-        if processed_inputs["type"] == "embeds":
-            raise NotImplementedError
-
-        # This is a workaround to fix multimodal beam search; this is a
-        # bandaid fix for 2 small problems:
-        # 1. Multi_modal_data on the processed_inputs currently resolves to
-        # `None`.
-        # 2. preprocessing above expands the multimodal placeholders. However,
-        # this happens again in generation, so the double expansion causes
-        # a mismatch.
-        # TODO - would be ideal to handle this more gracefully.
 
         prompt_text: str | None
         prompt_token_ids: list[int]
         multi_modal_data: MultiModalDataDict | None
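
To make the removal concrete, here is a minimal, self-contained sketch of the read pattern the fix switches to. It is not taken from the commit: extract_cached_fields and demo_prompt are hypothetical names, and the prompt.get("prompt") key is an assumption. Rather than re-running _prompt_to_llm_inputs, beam search pulls the already-cached fields off the prompt dict, matching the prompt.get(...) calls in the second hunk below, so multimodal placeholders are expanded only once, during generation.

from typing import Any

def extract_cached_fields(
    prompt: dict[str, Any],
) -> tuple[str | None, list[int], Any]:
    # Read the fields an earlier pipeline stage already cached on the
    # prompt dict instead of invoking the input preprocessor again, which
    # returned multi_modal_data as None (the "missing cached item") and
    # expanded multimodal placeholders a second time.
    prompt_text: str | None = prompt.get("prompt")
    prompt_token_ids: list[int] = prompt.get("prompt_token_ids", [])
    multi_modal_data = prompt.get("multi_modal_data")
    return prompt_text, prompt_token_ids, multi_modal_data

# Hypothetical prompt dict shaped like the cached item beam search receives.
demo_prompt = {
    "prompt": "USER: <image>\nWhat is shown here? ASSISTANT:",
    "prompt_token_ids": [1, 3148, 1001, 29901],
    "multi_modal_data": {"image": "<decoded image object>"},
}
print(extract_cached_fields(demo_prompt))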
@@ -373,9 +358,16 @@ class OpenAIServing:
         prompt_token_ids = prompt.get("prompt_token_ids", [])  # type: ignore
         multi_modal_data = prompt.get("multi_modal_data")  # type: ignore
 
-        mm_processor_kwargs: dict[str, Any] | None = processed_inputs.get(
-            "mm_processor_kwargs"
-        )  # type: ignore
+        mm_processor_kwargs: dict[str, Any] | None = None
+        # This is a workaround to fix multimodal beam search; this is a
+        # bandaid fix for 2 small problems:
+        # 1. Multi_modal_data on the processed_inputs currently resolves to
+        # `None`.
+        # 2. preprocessing above expands the multimodal placeholders. However,
+        # this happens again in generation, so the double expansion causes
+        # a mismatch.
+        # TODO - would be ideal to handle this more gracefully.
 
         tokenized_length = len(prompt_token_ids)
 
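
For context on where these components flow next, a hedged sketch follows. vllm's TokensPrompt (a TypedDict in vllm.inputs) accepts prompt_token_ids, multi_modal_data, and mm_processor_kwargs keys; the helper build_beam_search_prompt below is hypothetical and not part of this commit, but it illustrates why leaving mm_processor_kwargs as None and handing generation the unexpanded token ids avoids the double placeholder expansion described in the comment.

from typing import Any

from vllm.inputs import TokensPrompt
from vllm.multimodal import MultiModalDataDict

def build_beam_search_prompt(
    prompt_token_ids: list[int],
    multi_modal_data: MultiModalDataDict | None = None,
    mm_processor_kwargs: dict[str, Any] | None = None,
) -> TokensPrompt:
    # Pass the raw token ids plus the original multi_modal_data so that
    # generation expands the multimodal placeholders exactly once.
    engine_prompt = TokensPrompt(prompt_token_ids=prompt_token_ids)
    if multi_modal_data is not None:
        engine_prompt["multi_modal_data"] = multi_modal_data
    if mm_processor_kwargs is not None:
        engine_prompt["mm_processor_kwargs"] = mm_processor_kwargs
    return engine_prompt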