[Multimodal] Optimize Qwen2/2.5-VL startup time (#19756)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Roger Wang <hey@rogerw.me>
Co-authored-by: Roger Wang <hey@rogerw.me>
parent caa680fd2e
commit 2c5302fadd
@@ -823,6 +823,14 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 
+    def get_max_tokens_per_item(
+            self, seq_len: int,
+            mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
+
+        max_image_tokens = self.get_max_image_tokens()
+        max_video_tokens = self.get_max_video_tokens(seq_len, mm_counts)
+        return {"image": max_image_tokens, "video": max_video_tokens}
+
     def _get_vision_info(
         self,
         *,
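For context on where such pre-computed numbers come from: in Qwen2/2.5-VL-style vision encoders, the per-image token count is roughly the patch count at the largest allowed resolution divided by the square of the spatial merge factor. The sketch below is a back-of-envelope estimate only; max_pixels, patch_size, and spatial_merge_size are assumed defaults, not values read from this diff, and estimate_max_image_tokens is a hypothetical helper.

# Back-of-envelope sketch; all constants are assumptions, not from the commit.
def estimate_max_image_tokens(max_pixels: int = 1280 * 28 * 28,
                              patch_size: int = 14,
                              spatial_merge_size: int = 2) -> int:
    # Patches produced by the vision encoder at the largest allowed resolution.
    max_patches = max_pixels // (patch_size * patch_size)
    # Neighbouring patches are merged spatially before reaching the language
    # model, so it sees max_patches / merge_size**2 tokens per image.
    return max_patches // (spatial_merge_size ** 2)


print(estimate_max_image_tokens())  # 1280 with the assumed defaults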
@@ -1100,6 +1100,27 @@ class BaseProcessingInfo:
 
         return allowed_limits
 
+    def get_max_tokens_per_item(
+            self, seq_len: int,
+            mm_counts: Optional[Mapping[str,
+                                        int]]) -> Optional[Mapping[str, int]]:
+        """Return the maximum number of tokens per item for each modality.
+        By default, returns `None`. When `None` is returned, vLLM will generate
+        dummy inputs (images/videos) at maximum possible sizes and process them
+        to determine the maximum token count per modality.
+        This approach works but can be very slow for certain models (e.g.,
+        Qwen2.5-VL), leading to very long startup time. For better performance,
+        each model can override this method to return pre-computed maximum token
+        counts, avoiding the need for dummy input generation and processing.
+
+        NOTE: The maximum number of tokens per item of each modality returned
+        from this function should respect the model's maximum sequence length
+        and the maximum number of items of each modality allowed, and agree
+        with dummy inputs (images/videos) at maximum possible sizes.
+
+        """
+        return None
+
 
 _I = TypeVar("_I", bound=BaseProcessingInfo)
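To illustrate the override pattern this docstring describes, a hypothetical model-specific ProcessingInfo subclass might return fixed per-modality caps instead of letting vLLM profile worst-case dummy inputs. Only the hook name and signature come from the diff above; the class name, import path, and numbers below are assumptions.

# Sketch only: MyModelProcessingInfo and the returned numbers are hypothetical.
# BaseProcessingInfo is assumed importable from vllm.multimodal.processing,
# where the hunk above adds this hook.
from collections.abc import Mapping
from typing import Optional

from vllm.multimodal.processing import BaseProcessingInfo


class MyModelProcessingInfo(BaseProcessingInfo):

    def get_max_tokens_per_item(
            self, seq_len: int,
            mm_counts: Optional[Mapping[str, int]]
    ) -> Optional[Mapping[str, int]]:
        # Returning a mapping lets vLLM skip generating and processing
        # worst-case dummy images/videos during startup profiling.
        return {"image": 1280, "video": min(16384, seq_len)}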
@@ -253,6 +253,26 @@ class MultiModalProfiler(Generic[_I]):
         seq_len: int,
         mm_counts: Optional[Mapping[str, int]] = None,
     ) -> Mapping[str, int]:
-        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
+        max_tokens_per_item = self.processing_info.get_max_tokens_per_item(
+            seq_len=seq_len, mm_counts=mm_counts)
+        if max_tokens_per_item is not None:
+            if mm_counts is None:
+                total_mm_tokens = sum(max_tokens_per_item.values())
+            else:
+                total_mm_tokens = sum(max_tokens_per_item[k] * mm_counts[k]
+                                      for k in max_tokens_per_item.keys()
+                                      & mm_counts.keys())
+            if total_mm_tokens > seq_len:
+                logger.warning_once(
+                    "The sequence length (%d) is smaller than the pre-defined"
+                    " worst-case total number of multimodal tokens (%d). "
+                    "This may cause certain multi-modal inputs to fail during "
+                    "inference. To avoid this, you should increase "
+                    "`max_model_len` or reduce `mm_counts`.",
+                    seq_len,
+                    total_mm_tokens,
+                )
+            return max_tokens_per_item
+
+        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
         return self._get_mm_num_tokens(mm_inputs)
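The early-return branch above reduces to simple accounting: each modality's per-item cap is weighted by the number of items allowed for that modality, and the sum is compared against the sequence length. A minimal standalone sketch of that arithmetic follows; the function name and the example numbers are illustrative, not from the commit.

from collections.abc import Mapping


def worst_case_mm_tokens(max_tokens_per_item: Mapping[str, int],
                         mm_counts: Mapping[str, int]) -> int:
    # Same accounting as the diff: only modalities present in both mappings
    # contribute, each weighted by its allowed item count.
    return sum(max_tokens_per_item[k] * mm_counts[k]
               for k in max_tokens_per_item.keys() & mm_counts.keys())


# Illustrative values: 4 images at 1,280 tokens each plus 1 video at 16,384
# tokens gives 21,504 worst-case multimodal tokens; the profiler would warn
# if this exceeded the model's maximum sequence length.
print(worst_case_mm_tokens({"image": 1280, "video": 16384},
                           {"image": 4, "video": 1}))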