[Core] Rename get_max_tokens_per_item for backward compatibility (#20630)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Author: Cyrus Leung
Date: 2025-07-09 07:11:30 +08:00 (committed via GitHub)
Parent: c438183e99
Commit: 32dffc2772
3 changed files with 30 additions and 19 deletions


@@ -823,10 +823,11 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 
-    def get_max_tokens_per_item(
-            self, seq_len: int,
-            mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
         max_image_tokens = self.get_max_image_tokens()
         max_video_tokens = self.get_max_video_tokens(seq_len, mm_counts)
         return {"image": max_image_tokens, "video": max_video_tokens}


@@ -1100,24 +1100,29 @@ class BaseProcessingInfo:
         return allowed_limits
 
-    def get_max_tokens_per_item(
-            self, seq_len: int,
-            mm_counts: Optional[Mapping[str,
-                                        int]]) -> Optional[Mapping[str, int]]:
-        """Return the maximum number of tokens per item of for each modality.
-
-        By default, returns `None`. When `None` is returned, vLLM will generate
-        dummy inputs (images/videos) at maximum possible sizes and process them
-        to determine the maximum token count per modality.
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Optional[Mapping[str, int]]:
+        """
+        Return the maximum number of tokens per item of for each modality.
+
+        When `None` (the default) is returned, vLLM will generate dummy inputs
+        (images/videos) at maximum possible sizes and process them to determine
+        the maximum token count per modality.
 
         This approach works but can be very slow for certain models (e.g.,
         Qwen2.5-VL), leading to very long startup time. For better performance,
         each model can override this method to return pre-computed maximum token
         counts, avoiding the need for dummy input generation and processing.
 
-        NOTE: The maximum number of tokens per item of each modality returned
-        from this function should respect to the model maximum sequence length
-        and the maximum number of items of each modality allowed, and agrees
-        with dummy inputs (images/videos) at maximum possible sizes.
+        Note:
+            The maximum number of tokens per item of each modality returned
+            from this function should respect the model's maximum sequence
+            length and the maximum number of items of each modality allowed,
+            and agree with dummy inputs (images/videos) at maximum possible
+            sizes.
         """
         return None
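The new docstring spells out the override contract: returning a mapping lets vLLM skip dummy-input profiling entirely. A minimal sketch of such an override, assuming a hypothetical model whose per-item maxima are known ahead of time (the class name and token counts below are illustrative, not part of the commit):

    from collections.abc import Mapping
    from typing import Optional

    # BaseProcessingInfo is the class from the diff above.
    class MyModelProcessingInfo(BaseProcessingInfo):

        def get_mm_max_tokens_per_item(
            self,
            seq_len: int,
            mm_counts: Mapping[str, int],
        ) -> Optional[Mapping[str, int]]:
            # Returning a concrete mapping (rather than the default None)
            # avoids generating and processing maximum-size dummy inputs
            # at startup.
            return {"image": 1024, "video": 4096}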


@@ -258,8 +258,13 @@ class MultiModalProfiler(Generic[_I]):
         seq_len: int,
         mm_counts: Optional[Mapping[str, int]] = None,
     ) -> Mapping[str, int]:
-        max_tokens_per_item = self.processing_info.get_max_tokens_per_item(
-            seq_len=seq_len, mm_counts=mm_counts)
+        if mm_counts is None:
+            mm_counts = self.get_mm_limits()
+
+        max_tokens_per_item = self.processing_info.get_mm_max_tokens_per_item(
+            seq_len=seq_len,
+            mm_counts=mm_counts,
+        )
         if max_tokens_per_item is not None:
             if mm_counts is None:
                 total_mm_tokens = sum(max_tokens_per_item.values())
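Putting the profiler change together: `mm_counts` is now normalized before the call, so the renamed hook always receives a concrete mapping, and a `None` return value still selects the slow dummy-input path. A condensed sketch of the resulting dispatch; the enclosing method name, the fallback helper, and the elision of the total-token bookkeeping visible above are all assumptions for illustration:

    def get_mm_max_tokens(self, seq_len, mm_counts=None):
        if mm_counts is None:
            mm_counts = self.get_mm_limits()

        per_item = self.processing_info.get_mm_max_tokens_per_item(
            seq_len=seq_len,
            mm_counts=mm_counts,
        )
        if per_item is not None:
            # Fast path: the model supplied pre-computed per-item maxima.
            return per_item
        # Slow path: profile maximum-size dummy inputs (hypothetical
        # helper name, for illustration only).
        return self._profile_dummy_mm_tokens(seq_len, mm_counts)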