From 1d24ccb96cdfbb42fc2b0f0591df82727d9537c6 Mon Sep 17 00:00:00 2001
From: Zhuohan Li
Date: Thu, 22 Jun 2023 15:30:06 +0800
Subject: [PATCH] [Fix] Better error message when there is OOM during cache
 initialization (#203)

---
 vllm/engine/llm_engine.py | 6 ++++++
 vllm/outputs.py           | 1 +
 2 files changed, 7 insertions(+)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 3668dd7ee37f..c4aea06ba12f 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -127,6 +127,12 @@ class LLMEngine:
         # FIXME(woosuk): Change to debug log.
         logger.info(f'# GPU blocks: {num_gpu_blocks}, '
                     f'# CPU blocks: {num_cpu_blocks}')
+
+        if num_gpu_blocks <= 0 or num_cpu_blocks <= 0:
+            raise ValueError("No available memory for the cache blocks. "
+                             "Try increasing `gpu_memory_utilization` when "
+                             "initializing the engine.")
+
         self.cache_config.num_gpu_blocks = num_gpu_blocks
         self.cache_config.num_cpu_blocks = num_cpu_blocks
 
diff --git a/vllm/outputs.py b/vllm/outputs.py
index ebb5c19df0ad..384ca020985d 100644
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -53,6 +53,7 @@ class RequestOutput:
         prompt: The prompt string of the request.
         prompt_token_ids: The token IDs of the prompt.
         outputs: The output sequences of the request.
+        finished: Whether the whole request is finished.
     """
     def __init__(
         self,
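
For context, a minimal usage sketch (not part of the patch) of how a caller might react to the new error: retry with a higher `gpu_memory_utilization` once the engine reports that no cache blocks fit. The model name and utilization value below are illustrative assumptions, not taken from the patch.

    # sketch.py -- hypothetical example, assumes the vLLM `LLM` entrypoint
    from vllm import LLM

    try:
        # Default memory budget; may leave too little room for KV-cache blocks.
        llm = LLM(model="facebook/opt-125m")
    except ValueError as err:
        # LLMEngine now raises this when num_gpu_blocks or num_cpu_blocks is <= 0.
        print(f"Cache initialization failed: {err}")
        # Retry, reserving a larger fraction of GPU memory for the engine.
        llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.95)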