From c2c661af9be413fb22adc59fc17fe5f5a680b313 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Tue, 25 Nov 2025 04:38:36 -0800
Subject: [PATCH] [Bugfix] Fix overallocation in MM profiling (#29386)

Signed-off-by: Roger Wang
---
 vllm/v1/worker/gpu_model_runner.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 0ce6c4a3204b..e78d3c71af77 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -4245,14 +4245,18 @@ class GPUModelRunner(
                 # NOTE: This happens when encoder cache needs to store
                 # the embeddings that encoder outputs are scattered onto.
                 # In this case we create dummy embeddings of size
-                # (encode_budget, hidden_size) and scatter encoder
-                # output into it.
+                # (max_tokens_for_modality, hidden_size) and scatter
+                # encoder output into it.
                 encoder_output_shape = dummy_encoder_outputs[0].shape
-                if encoder_output_shape[0] < encoder_budget:
+                max_mm_tokens_per_item = mm_budget.max_tokens_by_modality[
+                    dummy_modality
+                ]
+                if encoder_output_shape[0] < max_mm_tokens_per_item:
+                    encoder_hidden_size = encoder_output_shape[-1]
                     expanded_outputs = []
                     for output in dummy_encoder_outputs:
                         expanded = output.new_zeros(
-                            (encoder_budget, encoder_output_shape[-1])
+                            (max_mm_tokens_per_item, encoder_hidden_size)
                         )
                         num_tokens = output.shape[0]
                         expanded[:num_tokens].copy_(output)