From 8dd0671baca64527ddc53dbe93a78b800fdc3e54 Mon Sep 17 00:00:00 2001
From: Jin Huang <jinhuang1992@gmail.com>
Date: Tue, 13 May 2025 03:10:07 -0400
Subject: [PATCH] [Bugfix][V1] Only get input embeddings with multi-modal
 models on the first PP rank (#17916)

Signed-off-by: Jin Huang <jinhun@amazon.com>
Co-authored-by: Jin Huang <jinhun@amazon.com>
---
 vllm/v1/worker/gpu_model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index bd833735b6953..31895cc0832aa 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1107,7 +1107,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         else:
             mm_embeds = []
 
-        if self.is_multimodal_model:
+        if self.is_multimodal_model and get_pp_group().is_first_rank:
             # NOTE(woosuk): To unify token ids and soft tokens (vision
             # embeddings), we always use embeddings (rather than token ids)
             # as input to the multimodal model, even when the input is text.