[Core] Rename get_max_tokens_per_item for backward compatibility (#20630)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Author: Cyrus Leung
Date: 2025-07-09 07:11:30 +08:00 (committed via GitHub)
Parent: c438183e99
Commit: 32dffc2772
3 changed files with 30 additions and 19 deletions


@@ -823,10 +823,11 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 
-    def get_max_tokens_per_item(
-            self, seq_len: int,
-            mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
         max_image_tokens = self.get_max_image_tokens()
         max_video_tokens = self.get_max_video_tokens(seq_len, mm_counts)
         return {"image": max_image_tokens, "video": max_video_tokens}


@@ -1100,24 +1100,29 @@ class BaseProcessingInfo:
         return allowed_limits
 
-    def get_max_tokens_per_item(
-            self, seq_len: int,
-            mm_counts: Optional[Mapping[str,
-                                        int]]) -> Optional[Mapping[str, int]]:
-        """Return the maximum number of tokens per item of for each modality.
-
-        By default, returns `None`. When `None` is returned, vLLM will generate
-        dummy inputs (images/videos) at maximum possible sizes and process them
-        to determine the maximum token count per modality.
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Optional[Mapping[str, int]]:
+        """
+        Return the maximum number of tokens per item of for each modality.
+
+        When `None` (the default) is returned, vLLM will generate dummy inputs
+        (images/videos) at maximum possible sizes and process them to determine
+        the maximum token count per modality.
 
         This approach works but can be very slow for certain models (e.g.,
         Qwen2.5-VL), leading to very long startup time. For better performance,
         each model can override this method to return pre-computed maximum token
         counts, avoiding the need for dummy input generation and processing.
 
-        NOTE: The maximum number of tokens per item of each modality returned
-        from this function should respect to the model maximum sequence length
-        and the maximum number of items of each modality allowed, and agrees
-        with dummy inputs (images/videos) at maximum possible sizes.
+        Note:
+            The maximum number of tokens per item of each modality returned
+            from this function should respect the model's maximum sequence
+            length and the maximum number of items of each modality allowed,
+            and agree with dummy inputs (images/videos) at maximum possible
+            sizes.
         """
         return None
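The new docstring spells out the override contract: returning a mapping lets vLLM skip dummy-input profiling entirely. A minimal sketch of such an override, assuming a hypothetical model whose per-item maxima are known ahead of time (the class name and token counts below are illustrative, not part of the commit):

    from collections.abc import Mapping
    from typing import Optional

    # BaseProcessingInfo is the class from the diff above.
    class MyModelProcessingInfo(BaseProcessingInfo):

        def get_mm_max_tokens_per_item(
            self,
            seq_len: int,
            mm_counts: Mapping[str, int],
        ) -> Optional[Mapping[str, int]]:
            # Returning a concrete mapping (rather than the default None)
            # avoids generating and processing maximum-size dummy inputs
            # at startup.
            return {"image": 1024, "video": 4096}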


@@ -258,8 +258,13 @@ class MultiModalProfiler(Generic[_I]):
         seq_len: int,
         mm_counts: Optional[Mapping[str, int]] = None,
     ) -> Mapping[str, int]:
-        max_tokens_per_item = self.processing_info.get_max_tokens_per_item(
-            seq_len=seq_len, mm_counts=mm_counts)
+        if mm_counts is None:
+            mm_counts = self.get_mm_limits()
+
+        max_tokens_per_item = self.processing_info.get_mm_max_tokens_per_item(
+            seq_len=seq_len,
+            mm_counts=mm_counts,
+        )
         if max_tokens_per_item is not None:
             if mm_counts is None:
                 total_mm_tokens = sum(max_tokens_per_item.values())
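Putting the profiler change together: `mm_counts` is now normalized before the call, so the renamed hook always receives a concrete mapping, and a `None` return value still selects the slow dummy-input path. A condensed sketch of the resulting dispatch; the enclosing method name, the fallback helper, and the elision of the total-token bookkeeping visible above are all assumptions for illustration:

    def get_mm_max_tokens(self, seq_len, mm_counts=None):
        if mm_counts is None:
            mm_counts = self.get_mm_limits()

        per_item = self.processing_info.get_mm_max_tokens_per_item(
            seq_len=seq_len,
            mm_counts=mm_counts,
        )
        if per_item is not None:
            # Fast path: the model supplied pre-computed per-item maxima.
            return per_item
        # Slow path: profile maximum-size dummy inputs (hypothetical
        # helper name, for illustration only).
        return self._profile_dummy_mm_tokens(seq_len, mm_counts)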