Print the warning only once (#16193)

Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Gregory Shtrasberg 2025-04-07 14:30:06 -04:00 committed by GitHub
parent 66d433b94f
commit ad434d4cfe


@@ -216,17 +216,18 @@ class MultiModalProfiler(Generic[_I]):
         # Encoder-decoder multimodal models only support v0
         if total_len > seq_len:
             # `max_num_batched_tokens` is defined by `SchedulerConfig`
-            logger.warning(
+            logger.warning_once(
                 "The encoder sequence length used for profiling ("
-                "max_num_batched_tokens / max_num_seqs = %d) is too short "
+                f"max_num_batched_tokens / max_num_seqs = {seq_len}) "
+                " is too short "
                 "to hold the multi-modal embeddings in the worst case "
-                "(%d tokens in total, out of which %s are reserved for "
+                f"({total_len} tokens in total, out of which "
+                f"{total_placeholders_by_modality} are reserved for "
                 "multi-modal embeddings). This may cause certain "
                 "multi-modal inputs to fail during inference, even when "
                 "the input text is short. To avoid this, you should "
                 "increase `max_model_len`, reduce `max_num_seqs`, "
-                "and/or reduce `mm_counts`.", seq_len, total_len,
-                total_placeholders_by_modality)
+                "and/or reduce `mm_counts`.")

             processor = cast(EncDecMultiModalProcessor, self.processor)
             if processor.pad_dummy_encoder_prompt:
@@ -251,17 +252,18 @@ class MultiModalProfiler(Generic[_I]):
         # V0 does not support chunked prefill.
         if total_len > seq_len and not envs.VLLM_USE_V1:
             # `max_num_batched_tokens` is defined by `SchedulerConfig`
-            logger.warning(
+            logger.warning_once(
                 "The sequence length used for profiling ("
-                "max_num_batched_tokens / max_num_seqs = %d) is too short "
+                f"max_num_batched_tokens / max_num_seqs = {seq_len}) "
+                "is too short "
                 "to hold the multi-modal embeddings in the worst case "
-                "(%d tokens in total, out of which %s are reserved for "
+                f"({total_len} tokens in total, out of which "
+                f"{total_placeholders_by_modality} are reserved for "
                 "multi-modal embeddings). This may cause certain "
                 "multi-modal inputs to fail during inference, even when "
                 "the input text is short. To avoid this, you should "
                 "increase `max_model_len`, reduce `max_num_seqs`, "
-                "and/or reduce `mm_counts`.", seq_len, total_len,
-                total_placeholders_by_modality)
+                "and/or reduce `mm_counts`.")

         if total_len < seq_len:
             prompt_token_ids.extend([0] * (seq_len - total_len))
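
The effect of the change is that each profiling warning is emitted at most once per process rather than on every profiling call, and the message is fully rendered with f-strings before it reaches the logger. For context, below is a minimal sketch of how a warning_once-style helper can deduplicate messages; it is illustrative only and is not vLLM's actual logging code (the `_OnceLogger` adapter and the sample messages are invented for this example).

import logging

class _OnceLogger(logging.LoggerAdapter):
    """Minimal sketch of a warning_once-style helper (not vLLM's actual
    implementation): each distinct warning string is logged at most once
    per process."""

    def __init__(self, logger: logging.Logger):
        super().__init__(logger, extra={})
        self._seen: set[str] = set()

    def warning_once(self, msg: str) -> None:
        # Deduplicate on the fully formatted message; repeat calls are no-ops.
        if msg not in self._seen:
            self._seen.add(msg)
            self.logger.warning(msg)

logger = _OnceLogger(logging.getLogger(__name__))
logger.warning_once("profiling sequence length may be too short")
logger.warning_once("profiling sequence length may be too short")  # suppressed

Because the caller now passes a single fully formatted string (hence the switch from %-style arguments to f-strings in this diff), a helper like the one sketched above can deduplicate simply by keying on the message text itself.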