From 6de3d431d911eff03a3e1b69233f68b5e7f8a472 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Sat, 27 Sep 2025 19:17:58 -0700
Subject: [PATCH] [MM] Optimize memory profiling for scattered multimodal
 embeddings (#25810)

Signed-off-by: Roger Wang
Signed-off-by: simon-mo
---
 vllm/v1/worker/gpu_model_runner.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index e277c69e9611..e7cd418f115d 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -3351,6 +3351,23 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 expected_num_items=max_mm_items_per_batch,
             )
 
+            # NOTE: This happens when encoder cache needs to store
+            # the embeddings that encoder outputs are scattered onto.
+            # In this case we create dummy embeddings of size
+            # (encoder_budget, hidden_size) and scatter encoder
+            # output into it.
+            encoder_output_shape = dummy_encoder_outputs[0].shape
+            if encoder_output_shape[0] < encoder_budget:
+                expanded_outputs = []
+                for output in dummy_encoder_outputs:
+                    expanded = output.new_zeros(
+                        (encoder_budget, encoder_output_shape[-1]))
+                    num_tokens = output.shape[0]
+                    expanded[:num_tokens].copy_(output)
+                    expanded_outputs.append(expanded)
+
+                dummy_encoder_outputs = expanded_outputs
+
             # Cache the dummy encoder outputs.
             self.encoder_cache["tmp"] = dict(
                 enumerate(dummy_encoder_outputs))