Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2026-01-23 14:04:29 +08:00)
[Misc] Clean up MM profiling warnings (#25222)

Signed-off-by: Roger Wang <hey@rogerw.io>

Parent: 1a0a04dae9
Commit: 31a8a2a7bc
```diff
@@ -234,19 +234,6 @@ class MultiModalProfiler(Generic[_I]):
         prompt_token_ids = mm_inputs["prompt_token_ids"]
         total_len = len(prompt_token_ids)
 
-        # V0 does not support chunked prefill.
-        if total_len > seq_len and not envs.VLLM_USE_V1:
-            # `max_num_batched_tokens` is defined by `SchedulerConfig`
-            logger.warning_once(
-                "The sequence length used for profiling (max_num_batched_tokens / max_num_seqs = %d) "  # noqa: E501
-                "is too short to hold the multi-modal embeddings in the worst case (%d tokens in total, out of which %s are reserved for multi-modal embeddings). "  # noqa: E501
-                "This may cause certain multi-modal inputs to fail during inference, even when the input text is short. "  # noqa: E501
-                "To avoid this, you should increase `max_model_len`, reduce `max_num_seqs`, and/or reduce `mm_counts`.",  # noqa: E501
-                seq_len,
-                total_len,
-                str(self._get_mm_num_tokens(mm_inputs)),
-            )
-
         if total_len < seq_len:
             prompt_token_ids.extend([0] * (seq_len - total_len))
 
```
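For context on what this hunk drops and what it keeps, here is a minimal standalone sketch of the same logic: the removed V0 check compared the dummy profiling prompt against the per-sequence budget, while the surviving lines right-pad the prompt to the full profiling length. `check_profiling_fit` and its arguments are hypothetical names for illustration, not a vLLM API.

```python
# Sketch of the removed check plus the retained padding, with made-up values.
def check_profiling_fit(prompt_token_ids: list[int],
                        seq_len: int,
                        mm_num_tokens: dict[str, int]) -> list[int]:
    total_len = len(prompt_token_ids)

    # The removed warning fired on V0 when the dummy prompt built for
    # memory profiling exceeded the per-sequence budget
    # (max_num_batched_tokens / max_num_seqs).
    if total_len > seq_len:
        print(f"profiling prompt ({total_len} tokens, of which "
              f"{mm_num_tokens} are multi-modal) exceeds seq_len={seq_len}")

    # Kept by this commit: right-pad the dummy prompt with token id 0
    # so it fills the full profiling length.
    if total_len < seq_len:
        prompt_token_ids = prompt_token_ids + [0] * (seq_len - total_len)
    return prompt_token_ids


# Example: a 6-token dummy prompt padded to a 10-token profiling length.
padded = check_profiling_fit([1, 2, 3, 4, 5, 6], 10, {"image": 4})
assert len(padded) == 10 and padded[-1] == 0
```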
```diff
@@ -270,22 +257,6 @@ class MultiModalProfiler(Generic[_I]):
             mm_counts=mm_counts,
         )
         if max_tokens_per_item is not None:
-            if mm_counts is None:
-                total_mm_tokens = sum(max_tokens_per_item.values())
-            else:
-                total_mm_tokens = sum(max_tokens_per_item[k] * mm_counts[k]
-                                      for k in max_tokens_per_item.keys()
-                                      & mm_counts.keys())
-            if total_mm_tokens > seq_len:
-                logger.warning_once(
-                    "The sequence length (%d) is smaller than the pre-defined"
-                    " worst-case total number of multimodal tokens (%d). "
-                    "This may cause certain multi-modal inputs to fail during "
-                    "inference. To avoid this, you should increase "
-                    "`max_model_len` or reduce `mm_counts`.",
-                    seq_len,
-                    total_mm_tokens,
-                )
             return max_tokens_per_item
 
         mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
```
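The block removed here computed a worst-case multimodal token total and warned when it exceeded the sequence length. The sketch below reproduces that accounting in isolation; the token counts and `seq_len` are made-up values for illustration only.

```python
# Worst-case accounting as in the removed block: per-item token maxima
# multiplied by requested item counts, summed over shared modalities.
max_tokens_per_item = {"image": 576, "video": 1568}
mm_counts = {"image": 4, "audio": 1}

# `dict.keys() & dict.keys()` is a set intersection, so modalities
# missing from either mapping (here "video" and "audio") are skipped.
total_mm_tokens = sum(max_tokens_per_item[k] * mm_counts[k]
                      for k in max_tokens_per_item.keys() & mm_counts.keys())
assert total_mm_tokens == 576 * 4  # only "image" appears in both

seq_len = 2048
if total_mm_tokens > seq_len:
    # This is the condition under which the removed logger.warning_once fired.
    print(f"worst-case mm tokens ({total_mm_tokens}) exceed seq_len ({seq_len})")
```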