mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-31 04:37:02 +08:00
[Misc] Fix the size of batched_dummy_mm_inputs in profile_run (#20434)
Signed-off-by: bk-201 <joy25810@foxmail.com>
This commit is contained in:
parent
9e0ef888f0
commit
6bbf1795b7
@@ -412,7 +412,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]}), # noqa: E501
|
||||
"Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501
|
||||
"Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501
|
||||
-"Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct"), # noqa: E501
|
||||
+"Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct", # noqa: E501
|
||||
+max_model_len=4096),
|
||||
"Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B"),
|
||||
"Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ"), # noqa: E501
|
||||
"SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B"),
|
||||
|
||||
@@ -2219,8 +2219,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
encoder_budget = min(self.max_num_encoder_input_tokens,
|
||||
self.encoder_cache_size)
|
||||
|
||||
-max_num_mm_items_encoder_budget = cdiv(encoder_budget,
|
||||
-max_tokens_per_mm_item)
|
||||
+max_num_mm_items_encoder_budget = encoder_budget // \
|
||||
+max_tokens_per_mm_item
|
||||
|
||||
# Check how many items of this modality can be supported by
|
||||
# the decoder budget.
|
||||
@@ -2233,8 +2233,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
max_num_mm_items_decoder_budget = self.max_num_reqs * \
|
||||
max_mm_items_per_req
|
||||
|
||||
-max_num_mm_items = min(max_num_mm_items_encoder_budget,
|
||||
-max_num_mm_items_decoder_budget)
|
||||
+max_num_mm_items = max(
|
||||
+1,
|
||||
+min(max_num_mm_items_encoder_budget,
|
||||
+max_num_mm_items_decoder_budget))
|
||||
|
||||
logger.info(
|
||||
"Encoder cache will be initialized with a budget of %s tokens,"
|
||||
@@ -2244,7 +2246,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
# Create dummy batch of multimodal inputs.
|
||||
dummy_mm_kwargs = self.mm_registry.get_decoder_dummy_data(
|
||||
model_config=self.model_config,
|
||||
-seq_len=self.max_num_tokens,
|
||||
+seq_len=max_tokens_per_mm_item,
|
||||
mm_counts={
|
||||
dummy_data_modality: 1
|
||||
},
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user