From 2c5302fadd81c06f61e5a3973ed4c0e6a4a2be40 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sat, 21 Jun 2025 13:01:07 -0700
Subject: [PATCH] [Multimodal] Optimize Qwen2/2.5-VL startup time (#19756)

Signed-off-by: Woosuk Kwon
Signed-off-by: Roger Wang
Co-authored-by: Roger Wang
---
 vllm/model_executor/models/qwen2_vl.py |  8 ++++++++
 vllm/multimodal/processing.py          | 21 +++++++++++++++++++++
 vllm/multimodal/profiling.py           | 22 +++++++++++++++++++++-
 3 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 7a6ebe10c516..899fc57c7a0e 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -823,6 +823,14 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 
+    def get_max_tokens_per_item(
+            self, seq_len: int,
+            mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
+
+        max_image_tokens = self.get_max_image_tokens()
+        max_video_tokens = self.get_max_video_tokens(seq_len, mm_counts)
+        return {"image": max_image_tokens, "video": max_video_tokens}
+
     def _get_vision_info(
         self,
         *,
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 5cfca57bffee..38f3a7cb932f 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1100,6 +1100,27 @@ class BaseProcessingInfo:
 
         return allowed_limits
 
+    def get_max_tokens_per_item(
+            self, seq_len: int,
+            mm_counts: Optional[Mapping[str,
+                                        int]]) -> Optional[Mapping[str, int]]:
+        """Return the maximum number of tokens per item for each modality.
+        By default, returns `None`. When `None` is returned, vLLM will generate
+        dummy inputs (images/videos) at the maximum possible sizes and process
+        them to determine the maximum token count per modality.
+        This approach works but can be very slow for certain models (e.g.,
+        Qwen2.5-VL), leading to very long startup times. For better performance,
+        each model can override this method to return pre-computed maximum token
+        counts, avoiding the need for dummy input generation and processing.
+
+        NOTE: The maximum number of tokens per item of each modality returned
+        by this method should respect the model's maximum sequence length and
+        the maximum number of items allowed for each modality, and should agree
+        with dummy inputs (images/videos) at their maximum possible sizes.
+
+        """
+        return None
+
 
 _I = TypeVar("_I", bound=BaseProcessingInfo)
 
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 1faecb7bd24a..67bcb31f23f7 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -253,6 +253,26 @@ class MultiModalProfiler(Generic[_I]):
         seq_len: int,
         mm_counts: Optional[Mapping[str, int]] = None,
     ) -> Mapping[str, int]:
-        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
+        max_tokens_per_item = self.processing_info.get_max_tokens_per_item(
+            seq_len=seq_len, mm_counts=mm_counts)
+        if max_tokens_per_item is not None:
+            if mm_counts is None:
+                total_mm_tokens = sum(max_tokens_per_item.values())
+            else:
+                total_mm_tokens = sum(max_tokens_per_item[k] * mm_counts[k]
+                                      for k in max_tokens_per_item.keys()
+                                      & mm_counts.keys())
+            if total_mm_tokens > seq_len:
+                logger.warning_once(
+                    "The sequence length (%d) is smaller than the pre-defined"
+                    " worst-case total number of multimodal tokens (%d). "
+                    "This may cause certain multi-modal inputs to fail during "
+                    "inference. To avoid this, you should increase "
+                    "`max_model_len` or reduce `mm_counts`.",
+                    seq_len,
+                    total_mm_tokens,
+                )
+            return max_tokens_per_item
+        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
         return self._get_mm_num_tokens(mm_inputs)
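
For readers unfamiliar with the hook, here is a minimal standalone sketch (not part of the patch and not vLLM code; the class names, token counts, and fallback below are illustrative assumptions) of how a model-specific override of the new `get_max_tokens_per_item` hook lets a profiler-style caller skip dummy-input generation:

# Illustrative sketch only: mirrors the hook pattern introduced by the patch.
from typing import Mapping, Optional


class BaseInfoSketch:
    def get_max_tokens_per_item(
            self, seq_len: int,
            mm_counts: Optional[Mapping[str, int]]) -> Optional[Mapping[str, int]]:
        # Default behavior: no pre-computed counts, so the caller falls back to
        # building and processing worst-case dummy inputs (slow for some models).
        return None


class Qwen2VLLikeInfoSketch(BaseInfoSketch):
    def get_max_tokens_per_item(
            self, seq_len: int,
            mm_counts: Optional[Mapping[str, int]]) -> Optional[Mapping[str, int]]:
        # Hypothetical pre-computed worst-case token counts per modality.
        return {"image": 16384, "video": min(seq_len, 32768)}


def get_mm_max_tokens_sketch(
        info: BaseInfoSketch, seq_len: int,
        mm_counts: Optional[Mapping[str, int]]) -> Mapping[str, int]:
    per_item = info.get_max_tokens_per_item(seq_len, mm_counts)
    if per_item is not None:
        # Fast path introduced by the patch: use the pre-computed counts
        # directly (the real code also warns if their total exceeds seq_len).
        return per_item
    # Slow path (elided here): build dummy inputs at maximum size and count tokens.
    raise NotImplementedError("dummy-input profiling fallback")


if __name__ == "__main__":
    counts = get_mm_max_tokens_sketch(Qwen2VLLikeInfoSketch(),
                                      seq_len=8192,
                                      mm_counts={"image": 1, "video": 1})
    print(counts)  # {'image': 16384, 'video': 8192}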