diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index 42c769f79e202..f4b7e4478c164 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -669,8 +669,11 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
         image_embeds = self.extract_feature(image_input["data"])
 
         patches_per_image = image_input["patches_per_image"]
+
+        # Only one image in the current batch
         if len(patches_per_image) == 1:
-            image_embeds = image_embeds.unsqueeze(0)
+            image_embeds = image_embeds.view(
+                -1, self.config.text_config.hidden_size).unsqueeze(0)
             return image_embeds
 
         # NOTE: Image embeddings are split into separate tensors for each image