mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-17 06:45:01 +08:00
[Core] Rename get_max_tokens_per_item for backward compatibility (#20630)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
parent
c438183e99
commit
32dffc2772
@ -823,10 +823,11 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
|
|||||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||||
return {"image": None, "video": None}
|
return {"image": None, "video": None}
|
||||||
|
|
||||||
def get_max_tokens_per_item(
|
def get_mm_max_tokens_per_item(
|
||||||
self, seq_len: int,
|
self,
|
||||||
mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
|
seq_len: int,
|
||||||
|
mm_counts: Mapping[str, int],
|
||||||
|
) -> Mapping[str, int]:
|
||||||
max_image_tokens = self.get_max_image_tokens()
|
max_image_tokens = self.get_max_image_tokens()
|
||||||
max_video_tokens = self.get_max_video_tokens(seq_len, mm_counts)
|
max_video_tokens = self.get_max_video_tokens(seq_len, mm_counts)
|
||||||
return {"image": max_image_tokens, "video": max_video_tokens}
|
return {"image": max_image_tokens, "video": max_video_tokens}
|
||||||
|
|||||||
@ -1100,24 +1100,29 @@ class BaseProcessingInfo:
|
|||||||
|
|
||||||
return allowed_limits
|
return allowed_limits
|
||||||
|
|
||||||
def get_max_tokens_per_item(
|
def get_mm_max_tokens_per_item(
|
||||||
self, seq_len: int,
|
self,
|
||||||
mm_counts: Optional[Mapping[str,
|
seq_len: int,
|
||||||
int]]) -> Optional[Mapping[str, int]]:
|
mm_counts: Mapping[str, int],
|
||||||
"""Return the maximum number of tokens per item of for each modality.
|
) -> Optional[Mapping[str, int]]:
|
||||||
By default, returns `None`. When `None` is returned, vLLM will generate
|
"""
|
||||||
dummy inputs (images/videos) at maximum possible sizes and process them
|
Return the maximum number of tokens per item for each modality.
|
||||||
to determine the maximum token count per modality.
|
|
||||||
|
When `None` (the default) is returned, vLLM will generate dummy inputs
|
||||||
|
(images/videos) at maximum possible sizes and process them to determine
|
||||||
|
the maximum token count per modality.
|
||||||
|
|
||||||
This approach works but can be very slow for certain models (e.g.,
|
This approach works but can be very slow for certain models (e.g.,
|
||||||
Qwen2.5-VL), leading to very long startup time. For better performance,
|
Qwen2.5-VL), leading to very long startup time. For better performance,
|
||||||
each model can override this method to return pre-computed maximum token
|
each model can override this method to return pre-computed maximum token
|
||||||
counts, avoiding the need for dummy input generation and processing.
|
counts, avoiding the need for dummy input generation and processing.
|
||||||
|
|
||||||
NOTE: The maximum number of tokens per item of each modality returned
|
Note:
|
||||||
from this function should respect to the model maximum sequence length
|
The maximum number of tokens per item of each modality returned
|
||||||
and the maximum number of items of each modality allowed, and agrees
|
from this function should respect the model's maximum sequence
|
||||||
with dummy inputs (images/videos) at maximum possible sizes.
|
length and the maximum number of items of each modality allowed,
|
||||||
|
and agree with dummy inputs (images/videos) at maximum possible
|
||||||
|
sizes.
|
||||||
"""
|
"""
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|||||||
@ -258,8 +258,13 @@ class MultiModalProfiler(Generic[_I]):
|
|||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Optional[Mapping[str, int]] = None,
|
mm_counts: Optional[Mapping[str, int]] = None,
|
||||||
) -> Mapping[str, int]:
|
) -> Mapping[str, int]:
|
||||||
max_tokens_per_item = self.processing_info.get_max_tokens_per_item(
|
if mm_counts is None:
|
||||||
seq_len=seq_len, mm_counts=mm_counts)
|
mm_counts = self.get_mm_limits()
|
||||||
|
|
||||||
|
max_tokens_per_item = self.processing_info.get_mm_max_tokens_per_item(
|
||||||
|
seq_len=seq_len,
|
||||||
|
mm_counts=mm_counts,
|
||||||
|
)
|
||||||
if max_tokens_per_item is not None:
|
if max_tokens_per_item is not None:
|
||||||
if mm_counts is None:
|
if mm_counts is None:
|
||||||
total_mm_tokens = sum(max_tokens_per_item.values())
|
total_mm_tokens = sum(max_tokens_per_item.values())
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user