diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 41b38b855ebf..ad63bb4af4e9 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -823,10 +823,11 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 
-    def get_max_tokens_per_item(
-            self, seq_len: int,
-            mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
-
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
         max_image_tokens = self.get_max_image_tokens()
         max_video_tokens = self.get_max_video_tokens(seq_len, mm_counts)
         return {"image": max_image_tokens, "video": max_video_tokens}
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index aa7889fc3cc5..78d244a6b4fc 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1100,24 +1100,29 @@ class BaseProcessingInfo:
 
         return allowed_limits
 
-    def get_max_tokens_per_item(
-            self, seq_len: int,
-            mm_counts: Optional[Mapping[str,
-                                        int]]) -> Optional[Mapping[str, int]]:
-        """Return the maximum number of tokens per item of for each modality.
-        By default, returns `None`. When `None` is returned, vLLM will generate
-        dummy inputs (images/videos) at maximum possible sizes and process them
-        to determine the maximum token count per modality.
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Optional[Mapping[str, int]]:
+        """
+        Return the maximum number of tokens per item for each modality.
+
+        When `None` (the default) is returned, vLLM will generate dummy inputs
+        (images/videos) at maximum possible sizes and process them to determine
+        the maximum token count per modality.
+
         This approach works but can be very slow for certain models (e.g.,
         Qwen2.5-VL), leading to very long startup time. For better performance,
         each model can override this method to return pre-computed maximum token
         counts, avoiding the need for dummy input generation and processing.
 
-        NOTE: The maximum number of tokens per item of each modality returned
-        from this function should respect to the model maximum sequence length
-        and the maximum number of items of each modality allowed, and agrees
-        with dummy inputs (images/videos) at maximum possible sizes.
-
+        Note:
+            The maximum number of tokens per item of each modality returned
+            from this function should respect the model's maximum sequence
+            length and the maximum number of items of each modality allowed,
+            and agree with dummy inputs (images/videos) at maximum possible
+            sizes.
         """
         return None
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index fb5a7b64c419..cdec783ef9cf 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -258,8 +258,13 @@ class MultiModalProfiler(Generic[_I]):
         seq_len: int,
         mm_counts: Optional[Mapping[str, int]] = None,
     ) -> Mapping[str, int]:
-        max_tokens_per_item = self.processing_info.get_max_tokens_per_item(
-            seq_len=seq_len, mm_counts=mm_counts)
+        if mm_counts is None:
+            mm_counts = self.get_mm_limits()
+
+        max_tokens_per_item = self.processing_info.get_mm_max_tokens_per_item(
+            seq_len=seq_len,
+            mm_counts=mm_counts,
+        )
         if max_tokens_per_item is not None:
             if mm_counts is None:
                 total_mm_tokens = sum(max_tokens_per_item.values())
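
For illustration, a model that wants to opt in to the fast path introduced here would override the new hook on its processing-info class, mirroring the Qwen2-VL change above. The sketch below is not part of the patch: MyModelProcessingInfo and its get_max_image_tokens helper are hypothetical stand-ins for an image-only model, and the other methods a real ProcessingInfo subclass must implement are omitted for brevity.

from collections.abc import Mapping

from vllm.multimodal.processing import BaseProcessingInfo


class MyModelProcessingInfo(BaseProcessingInfo):
    # Hypothetical example; a real subclass also implements the remaining
    # ProcessingInfo methods (e.g. get_supported_mm_limits).

    def get_max_image_tokens(self) -> int:
        # Hypothetical helper: the largest number of placeholder tokens a
        # single image can occupy for this model.
        return 1024

    def get_mm_max_tokens_per_item(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> Mapping[str, int]:
        # Returning a mapping (rather than the default None) lets the
        # profiler skip dummy-input generation and processing at startup.
        return {"image": self.get_max_image_tokens()}

With this override in place, MultiModalProfiler.get_mm_max_contiguous_tokens takes the pre-computed branch instead of building maximum-size dummy images, which is the startup-time saving the docstring describes.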