From 8041b7305e93a8626d85cb23b3fcb995882867c1 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 17 Dec 2023 17:08:23 -0800
Subject: [PATCH] [BugFix] Raise error when max_model_len is larger than KV
 cache (#2163)

---
 vllm/engine/llm_engine.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index d91ab1430735c..d6e388bf135b2 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -227,6 +227,14 @@ class LLMEngine:
             raise ValueError("No available memory for the cache blocks. "
                              "Try increasing `gpu_memory_utilization` when "
                              "initializing the engine.")
+        max_seq_len = self.cache_config.block_size * num_gpu_blocks
+        if self.model_config.max_model_len > max_seq_len:
+            raise ValueError(
+                f"The model's max seq len ({self.model_config.max_model_len}) "
+                "is larger than the maximum number of tokens that can be "
+                f"stored in KV cache ({max_seq_len}). Try increasing "
+                "`gpu_memory_utilization` or decreasing `max_model_len` when "
+                "initializing the engine.")
         self.cache_config.num_gpu_blocks = num_gpu_blocks
         self.cache_config.num_cpu_blocks = num_cpu_blocks
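
The patch fails fast when the whole KV cache cannot hold even one sequence of the configured context length: the cache stores at most `block_size * num_gpu_blocks` tokens in total, so `max_model_len` must not exceed that budget. As a minimal sketch of the same arithmetic outside the engine (the function name `check_kv_cache_capacity` and the example numbers are illustrative, not part of the patch):

```python
def check_kv_cache_capacity(max_model_len: int, block_size: int,
                            num_gpu_blocks: int) -> None:
    """Standalone restatement of the check added in this patch.

    The KV cache holds at most block_size * num_gpu_blocks tokens, so a
    single sequence of max_model_len tokens must fit within that budget;
    otherwise the engine should refuse to start instead of failing later.
    """
    max_seq_len = block_size * num_gpu_blocks
    if max_model_len > max_seq_len:
        raise ValueError(
            f"The model's max seq len ({max_model_len}) is larger than the "
            "maximum number of tokens that can be stored in KV cache "
            f"({max_seq_len}). Try increasing `gpu_memory_utilization` or "
            "decreasing `max_model_len` when initializing the engine.")


# Example: 16-token blocks and 512 free GPU blocks give an 8192-token budget.
check_kv_cache_capacity(max_model_len=8192, block_size=16, num_gpu_blocks=512)
# A 16384-token context length now raises ValueError at init instead of
# failing mid-generation.
check_kv_cache_capacity(max_model_len=16384, block_size=16, num_gpu_blocks=512)
```

In the engine itself, `num_gpu_blocks` comes from profiling free GPU memory under `gpu_memory_utilization`, which is why the error message suggests raising that setting or lowering `max_model_len`.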