From 2c5302fadd81c06f61e5a3973ed4c0e6a4a2be40 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sat, 21 Jun 2025 13:01:07 -0700
Subject: [PATCH] [Multimodal] Optimize Qwen2/2.5-VL startup time (#19756)

Signed-off-by: Woosuk Kwon
Signed-off-by: Roger Wang
Co-authored-by: Roger Wang
---
 vllm/model_executor/models/qwen2_vl.py |  8 ++++++++
 vllm/multimodal/processing.py          | 21 +++++++++++++++++++++
 vllm/multimodal/profiling.py           | 22 +++++++++++++++++++++-
 3 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 7a6ebe10c516..899fc57c7a0e 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -823,6 +823,14 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 
+    def get_max_tokens_per_item(
+            self, seq_len: int,
+            mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
+
+        max_image_tokens = self.get_max_image_tokens()
+        max_video_tokens = self.get_max_video_tokens(seq_len, mm_counts)
+        return {"image": max_image_tokens, "video": max_video_tokens}
+
     def _get_vision_info(
         self,
         *,
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 5cfca57bffee..38f3a7cb932f 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1100,6 +1100,27 @@ class BaseProcessingInfo:
 
         return allowed_limits
 
+    def get_max_tokens_per_item(
+            self, seq_len: int,
+            mm_counts: Optional[Mapping[str,
+                                        int]]) -> Optional[Mapping[str, int]]:
+        """Return the maximum number of tokens per item for each modality.
+        By default, returns `None`. When `None` is returned, vLLM will generate
+        dummy inputs (images/videos) at the maximum possible sizes and process
+        them to determine the maximum token count per modality.
+        This approach works but can be very slow for certain models (e.g.,
+        Qwen2.5-VL), leading to very long startup times. For better performance,
+        each model can override this method to return pre-computed maximum token
+        counts, avoiding the need for dummy input generation and processing.
+
+        NOTE: The maximum number of tokens per item of each modality returned
+        by this method should respect the model's maximum sequence length and
+        the maximum number of items allowed for each modality, and should agree
+        with dummy inputs (images/videos) at their maximum possible sizes.
+
+        """
+        return None
+
 
 _I = TypeVar("_I", bound=BaseProcessingInfo)
 
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 1faecb7bd24a..67bcb31f23f7 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -253,6 +253,26 @@ class MultiModalProfiler(Generic[_I]):
         seq_len: int,
         mm_counts: Optional[Mapping[str, int]] = None,
     ) -> Mapping[str, int]:
-        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
+        max_tokens_per_item = self.processing_info.get_max_tokens_per_item(
+            seq_len=seq_len, mm_counts=mm_counts)
+        if max_tokens_per_item is not None:
+            if mm_counts is None:
+                total_mm_tokens = sum(max_tokens_per_item.values())
+            else:
+                total_mm_tokens = sum(max_tokens_per_item[k] * mm_counts[k]
+                                      for k in max_tokens_per_item.keys()
+                                      & mm_counts.keys())
+            if total_mm_tokens > seq_len:
+                logger.warning_once(
+                    "The sequence length (%d) is smaller than the pre-defined"
+                    " worst-case total number of multimodal tokens (%d). "
+                    "This may cause certain multi-modal inputs to fail during "
+                    "inference. To avoid this, you should increase "
+                    "`max_model_len` or reduce `mm_counts`.",
+                    seq_len,
+                    total_mm_tokens,
+                )
+            return max_tokens_per_item
+        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
         return self._get_mm_num_tokens(mm_inputs)
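
For readers unfamiliar with the hook, here is a minimal standalone sketch (not part of the patch and not vLLM code; the class names, token counts, and fallback below are illustrative assumptions) of how a model-specific override of the new `get_max_tokens_per_item` hook lets a profiler-style caller skip dummy-input generation:

# Illustrative sketch only: mirrors the hook pattern introduced by the patch.
from typing import Mapping, Optional


class BaseInfoSketch:
    def get_max_tokens_per_item(
            self, seq_len: int,
            mm_counts: Optional[Mapping[str, int]]) -> Optional[Mapping[str, int]]:
        # Default behavior: no pre-computed counts, so the caller falls back to
        # building and processing worst-case dummy inputs (slow for some models).
        return None


class Qwen2VLLikeInfoSketch(BaseInfoSketch):
    def get_max_tokens_per_item(
            self, seq_len: int,
            mm_counts: Optional[Mapping[str, int]]) -> Optional[Mapping[str, int]]:
        # Hypothetical pre-computed worst-case token counts per modality.
        return {"image": 16384, "video": min(seq_len, 32768)}


def get_mm_max_tokens_sketch(
        info: BaseInfoSketch, seq_len: int,
        mm_counts: Optional[Mapping[str, int]]) -> Mapping[str, int]:
    per_item = info.get_max_tokens_per_item(seq_len, mm_counts)
    if per_item is not None:
        # Fast path introduced by the patch: use the pre-computed counts
        # directly (the real code also warns if their total exceeds seq_len).
        return per_item
    # Slow path (elided here): build dummy inputs at maximum size and count tokens.
    raise NotImplementedError("dummy-input profiling fallback")


if __name__ == "__main__":
    counts = get_mm_max_tokens_sketch(Qwen2VLLikeInfoSketch(),
                                      seq_len=8192,
                                      mm_counts={"image": 1, "video": 1})
    print(counts)  # {'image': 16384, 'video': 8192}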