[Multimodal] Optimize Qwen2/2.5-VL startup time (#19756)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Signed-off-by: Roger Wang <hey@rogerw.me> Co-authored-by: Roger Wang <hey@rogerw.me>
2025-12-11 14:25:01 +08:00 · 2025-06-21 13:01:07 -07:00 · 2025-06-21 13:01:07 -07:00 · 2c5302fadd
commit 2c5302fadd
parent caa680fd2e
3 changed files with 50 additions and 1 deletions
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@ -823,6 +823,14 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None, "video": None}
    def get_max_tokens_per_item(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> Optional[Mapping[str, int]]:
        """Report the per-item token ceiling for each supported modality.

        The image ceiling is taken from `get_max_image_tokens()` and the
        video ceiling from `get_max_video_tokens(seq_len, mm_counts)`, so
        no dummy inputs have to be generated and processed to obtain them.

        Args:
            seq_len: Sequence length, forwarded to the video computation.
            mm_counts: Per-modality item counts, forwarded to the video
                computation.

        Returns:
            A mapping with exactly the keys `"image"` and `"video"`.
        """
        # Dict literals evaluate in order: image lookup first, then video.
        return {
            "image": self.get_max_image_tokens(),
            "video": self.get_max_video_tokens(seq_len, mm_counts),
        }
    def _get_vision_info(
        self,
        *,
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@ -1100,6 +1100,27 @@ class BaseProcessingInfo:
        return allowed_limits
    def get_max_tokens_per_item(
        self,
        seq_len: int,
        mm_counts: Optional[Mapping[str, int]],
    ) -> Optional[Mapping[str, int]]:
        """Return the maximum number of tokens per item for each modality.

        The default implementation returns `None`. In that case vLLM
        generates dummy inputs (images/videos) at the maximum possible
        sizes and processes them to determine the maximum token count per
        modality.

        That fallback works but can be very slow for certain models (e.g.,
        Qwen2.5-VL), leading to a very long startup time. For better
        performance, a model can override this method to return
        pre-computed maximum token counts, which avoids dummy input
        generation and processing.

        NOTE: The per-item maxima returned by an override should respect
        the model's maximum sequence length and the maximum number of
        items allowed for each modality, and should agree with what dummy
        inputs (images/videos) at maximum possible sizes would produce.

        Args:
            seq_len: The sequence length to respect.
            mm_counts: Number of items per modality, or `None`.

        Returns:
            `None` in this base implementation.
        """
        # Nothing is pre-computed at the base level.
        return None
 # Type variable constrained (via `bound=`) to `BaseProcessingInfo`.
 _I = TypeVar("_I", bound=BaseProcessingInfo)
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@ -253,6 +253,26 @@ class MultiModalProfiler(Generic[_I]):
        seq_len: int,
        mm_counts: Optional[Mapping[str, int]] = None,
    ) -> Mapping[str, int]:
-        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
+        max_tokens_per_item = self.processing_info.get_max_tokens_per_item(
            seq_len=seq_len, mm_counts=mm_counts)
        if max_tokens_per_item is not None:
            if mm_counts is None:
                total_mm_tokens = sum(max_tokens_per_item.values())
            else:
                total_mm_tokens = sum(max_tokens_per_item[k] * mm_counts[k]
                                      for k in max_tokens_per_item.keys()
                                      & mm_counts.keys())
            if total_mm_tokens > seq_len:
                logger.warning_once(
                    "The sequence length (%d) is smaller than the pre-defined"
                    " wosrt-case total number of multimodal tokens (%d). "
                    "This may cause certain multi-modal inputs to fail during "
                    "inference. To avoid this, you should increase "
                    "`max_model_len` or reduce `mm_counts`.",
                    seq_len,
                    total_mm_tokens,
                )
            return max_tokens_per_item
        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
        return self._get_mm_num_tokens(mm_inputs)