Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-15 19:06:17 +08:00)
[BugFix] Raise error when max_model_len is larger than KV cache (#2163)
This commit is contained in:
parent 3ec8c25cd0
commit 8041b7305e
@@ -227,6 +227,14 @@ class LLMEngine:
             raise ValueError("No available memory for the cache blocks. "
                              "Try increasing `gpu_memory_utilization` when "
                              "initializing the engine.")
+        max_seq_len = self.cache_config.block_size * num_gpu_blocks
+        if self.model_config.max_model_len > max_seq_len:
+            raise ValueError(
+                f"The model's max seq len ({self.model_config.max_model_len}) "
+                "is larger than the maximum number of tokens that can be "
+                f"stored in KV cache ({max_seq_len}). Try increasing "
+                "`gpu_memory_utilization` or decreasing `max_model_len` when "
+                "initializing the engine.")
 
         self.cache_config.num_gpu_blocks = num_gpu_blocks
         self.cache_config.num_cpu_blocks = num_cpu_blocks
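For illustration, a minimal standalone sketch of the check this commit introduces. This is not vLLM's actual API: the helper name check_kv_cache_fits and the numbers used below (block_size=16, num_gpu_blocks=512, max_model_len=16384) are hypothetical example values, chosen only to show the arithmetic behind the new error.

# Sketch of the new check; names and numbers are illustrative, not vLLM's
# real configuration objects.
def check_kv_cache_fits(max_model_len: int, block_size: int,
                        num_gpu_blocks: int) -> None:
    # Each KV-cache block holds `block_size` tokens, so a single sequence can
    # occupy at most block_size * num_gpu_blocks tokens of cache.
    max_seq_len = block_size * num_gpu_blocks
    if max_model_len > max_seq_len:
        raise ValueError(
            f"The model's max seq len ({max_model_len}) is larger than the "
            f"maximum number of tokens that can be stored in KV cache "
            f"({max_seq_len}). Try increasing `gpu_memory_utilization` or "
            f"decreasing `max_model_len` when initializing the engine.")

# 512 blocks of 16 tokens each store at most 8192 tokens:
check_kv_cache_fits(max_model_len=8192, block_size=16, num_gpu_blocks=512)   # passes
check_kv_cache_fits(max_model_len=16384, block_size=16, num_gpu_blocks=512)  # raises ValueError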