From 6bbf1795b73a89a72672785c41a046ac6db9d54f Mon Sep 17 00:00:00 2001
From: B-201
Date: Wed, 9 Jul 2025 11:15:44 +0800
Subject: [PATCH] [Misc] Fix the size of batched_dummy_mm_inputs in
 profile_run (#20434)

Signed-off-by: bk-201
---
 tests/models/registry.py           |  3 ++-
 vllm/v1/worker/gpu_model_runner.py | 12 +++++++-----
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/tests/models/registry.py b/tests/models/registry.py
index 48302f9d66484..04fff03862fce 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -412,7 +412,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                                       hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]}),  # noqa: E501
     "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"),  # noqa: E501
     "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"),  # noqa: E501
-    "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct"),  # noqa: E501
+    "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct",  # noqa: E501
+                                                          max_model_len=4096),
     "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B"),
     "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ"),  # noqa: E501
     "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B"),
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 8658d7d916f00..ef03626cf14dc 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2219,8 +2219,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             encoder_budget = min(self.max_num_encoder_input_tokens,
                                  self.encoder_cache_size)
 
-            max_num_mm_items_encoder_budget = cdiv(encoder_budget,
-                                                   max_tokens_per_mm_item)
+            max_num_mm_items_encoder_budget = encoder_budget // \
+                max_tokens_per_mm_item
 
             # Check how many items of this modality can be supported by
             # the decoder budget.
@@ -2233,8 +2233,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             max_num_mm_items_decoder_budget = self.max_num_reqs * \
                 max_mm_items_per_req
 
-            max_num_mm_items = min(max_num_mm_items_encoder_budget,
-                                   max_num_mm_items_decoder_budget)
+            max_num_mm_items = max(
+                1,
+                min(max_num_mm_items_encoder_budget,
+                    max_num_mm_items_decoder_budget))
 
             logger.info(
                 "Encoder cache will be initialized with a budget of %s tokens,"
@@ -2244,7 +2246,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             # Create dummy batch of multimodal inputs.
             dummy_mm_kwargs = self.mm_registry.get_decoder_dummy_data(
                 model_config=self.model_config,
-                seq_len=self.max_num_tokens,
+                seq_len=max_tokens_per_mm_item,
                 mm_counts={
                     dummy_data_modality: 1
                 },
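
Note for reviewers: the sketch below is illustrative only, not part of the patch, and its numeric values are invented. It walks through the budget arithmetic this change fixes: with ceiling division (cdiv), the profiling batch could include one multimodal item more than the encoder cache budget actually covers; floor division counts only items that fit entirely, and the max(1, ...) clamp keeps at least one item when even a single worst-case item exceeds the budget.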
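
# Illustrative sketch of the budget arithmetic (not vLLM code; the
# numbers are invented for demonstration).

def cdiv(a: int, b: int) -> int:
    # Ceiling division, as used before this patch.
    return -(a // -b)

encoder_budget = 8192          # tokens the encoder cache can hold
max_tokens_per_mm_item = 4900  # worst-case tokens for one multimodal item

# Before: cdiv rounds up, admitting an item that does not fully fit.
before = cdiv(encoder_budget, max_tokens_per_mm_item)    # == 2
assert before * max_tokens_per_mm_item > encoder_budget  # 9800 > 8192

# After: floor division counts only items that fit entirely ...
after = encoder_budget // max_tokens_per_mm_item          # == 1
assert after * max_tokens_per_mm_item <= encoder_budget   # 4900 <= 8192

# ... and the max(1, ...) clamp keeps profiling meaningful when even one
# item exceeds the budget (floor division alone would yield 0 items).
decoder_budget = 16  # stand-in for max_num_reqs * max_mm_items_per_req
max_num_mm_items = max(1, min(after, decoder_budget))
print(before, after, max_num_mm_items)  # 2 1 1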
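
The last hunk follows the same reasoning: each dummy item in the profiling batch is assumed to take up to max_tokens_per_mm_item tokens, so the dummy data is now generated with seq_len=max_tokens_per_mm_item rather than the whole-batch self.max_num_tokens, keeping the size of each item in batched_dummy_mm_inputs consistent with the per-item figure used in the budget math above.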