From 92b0ce2ac75e251fe683f5b720f07001782054ff Mon Sep 17 00:00:00 2001 From: Chauncey Date: Tue, 11 Mar 2025 02:24:51 +0800 Subject: [PATCH] [Bugfix][v1] fixed llava-hf/llava-1.5-7b-hf is broken on V1 (#14554) Signed-off-by: chaunceyjiang Signed-off-by: DarkLight1337 Co-authored-by: DarkLight1337 --- vllm/model_executor/models/llava.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index e83dfd320bb67..0c0d8e109c92e 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -783,15 +783,19 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): if image_input is None: return None vision_embeddings = self._process_image_input(image_input) - if kwargs.get("v0_path", False): + + if kwargs.get("v0_path", False) or \ + image_input.get("feat_is_patch") is None or \ + image_input.get("embed_is_patch") is None: + # The path is used for pixtral (V0 only) and llava (V0/V1) return vision_embeddings - else: - nested_emb = [ - self._get_mm_embeds(*args) for args in zip( - vision_embeddings, image_input["feat_is_patch"], - image_input["num_crops"], image_input["embed_is_patch"]) - ] - return flatten_2d_lists(nested_emb) + + nested_emb = [ + self._get_mm_embeds(*args) for args in zip( + vision_embeddings, image_input["feat_is_patch"], + image_input["num_crops"], image_input["embed_is_patch"]) + ] + return flatten_2d_lists(nested_emb) def get_input_embeddings( self,