Fix Whisper crash caused by invalid ``max_num_batched_tokens`` config (#17853)

Signed-off-by: inkcherry <mingzhi.liu@intel.com>
2026-05-31 15:17:10 +08:00 · 2025-05-09 17:16:26 +08:00 · 2025-05-09 17:16:26 +08:00 · 5b2dcbf0b8
commit 5b2dcbf0b8
parent 6e4a93e3f7
1 changed files with 14 additions and 0 deletions
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2050,6 +2050,13 @@ class SchedulerConfig:
                    _MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
                )
            # When using default settings,
            # Ensure max_num_batched_tokens does not exceed model limit.
            # Some models (e.g., Whisper) have embeddings tied to max length.
            self.max_num_batched_tokens = min(
                self.max_num_seqs * self.max_model_len,
                self.max_num_batched_tokens)
        self.max_num_encoder_input_tokens = self.max_num_batched_tokens
        self.encoder_cache_size = self.max_num_batched_tokens
@@ -2090,6 +2097,13 @@ class SchedulerConfig:
                "be greater than or equal to max_num_seqs "
                f"({self.max_num_seqs}).")
        if self.max_num_batched_tokens > self.max_num_seqs * self.max_model_len:
            logger.warning(
                "max_num_batched_tokens (%d) exceeds max_num_seqs"
                "* max_model_len (%d). This may lead to unexpected behavior.",
                self.max_num_batched_tokens,
                self.max_num_seqs * self.max_model_len)
        if self.num_lookahead_slots < 0:
            raise ValueError(
                "num_lookahead_slots "