From 6bbf1795b73a89a72672785c41a046ac6db9d54f Mon Sep 17 00:00:00 2001
From: B-201
Date: Wed, 9 Jul 2025 11:15:44 +0800
Subject: [PATCH] [Misc] Fix the size of batched_dummy_mm_inputs in
 profile_run (#20434)

Signed-off-by: bk-201
---
 tests/models/registry.py           |  3 ++-
 vllm/v1/worker/gpu_model_runner.py | 12 +++++++-----
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/tests/models/registry.py b/tests/models/registry.py
index 48302f9d66484..04fff03862fce 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -412,7 +412,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                                       hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]}),  # noqa: E501
     "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"),  # noqa: E501
     "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"),  # noqa: E501
-    "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct"),  # noqa: E501
+    "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct",  # noqa: E501
+                                                          max_model_len=4096),
     "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B"),
     "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ"),  # noqa: E501
     "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B"),
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 8658d7d916f00..ef03626cf14dc 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2219,8 +2219,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             encoder_budget = min(self.max_num_encoder_input_tokens,
                                  self.encoder_cache_size)
 
-            max_num_mm_items_encoder_budget = cdiv(encoder_budget,
-                                                   max_tokens_per_mm_item)
+            max_num_mm_items_encoder_budget = encoder_budget // \
+                max_tokens_per_mm_item
 
             # Check how many items of this modality can be supported by
             # the decoder budget.
@@ -2233,8 +2233,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             max_num_mm_items_decoder_budget = self.max_num_reqs * \
                 max_mm_items_per_req
 
-            max_num_mm_items = min(max_num_mm_items_encoder_budget,
-                                   max_num_mm_items_decoder_budget)
+            max_num_mm_items = max(
+                1,
+                min(max_num_mm_items_encoder_budget,
+                    max_num_mm_items_decoder_budget))
 
             logger.info(
                 "Encoder cache will be initialized with a budget of %s tokens,"
@@ -2244,7 +2246,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             # Create dummy batch of multimodal inputs.
             dummy_mm_kwargs = self.mm_registry.get_decoder_dummy_data(
                 model_config=self.model_config,
-                seq_len=self.max_num_tokens,
+                seq_len=max_tokens_per_mm_item,
                 mm_counts={
                     dummy_data_modality: 1
                 },
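
Note for reviewers: the sketch below is illustrative only, not part of the patch, and its numeric values are invented. It walks through the budget arithmetic this change fixes: with ceiling division (cdiv), the profiling batch could include one multimodal item more than the encoder cache budget actually covers; floor division counts only items that fit entirely, and the max(1, ...) clamp keeps at least one item when even a single worst-case item exceeds the budget.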
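
# Illustrative sketch of the budget arithmetic (not vLLM code; the
# numbers are invented for demonstration).

def cdiv(a: int, b: int) -> int:
    # Ceiling division, as used before this patch.
    return -(a // -b)

encoder_budget = 8192          # tokens the encoder cache can hold
max_tokens_per_mm_item = 4900  # worst-case tokens for one multimodal item

# Before: cdiv rounds up, admitting an item that does not fully fit.
before = cdiv(encoder_budget, max_tokens_per_mm_item)    # == 2
assert before * max_tokens_per_mm_item > encoder_budget  # 9800 > 8192

# After: floor division counts only items that fit entirely ...
after = encoder_budget // max_tokens_per_mm_item          # == 1
assert after * max_tokens_per_mm_item <= encoder_budget   # 4900 <= 8192

# ... and the max(1, ...) clamp keeps profiling meaningful when even one
# item exceeds the budget (floor division alone would yield 0 items).
decoder_budget = 16  # stand-in for max_num_reqs * max_mm_items_per_req
max_num_mm_items = max(1, min(after, decoder_budget))
print(before, after, max_num_mm_items)  # 2 1 1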
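
The last hunk follows the same reasoning: each dummy item in the profiling batch is assumed to take up to max_tokens_per_mm_item tokens, so the dummy data is now generated with seq_len=max_tokens_per_mm_item rather than the whole-batch self.max_num_tokens, keeping the size of each item in batched_dummy_mm_inputs consistent with the per-item figure used in the budget math above.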