From 0b7db411b5af4bf8a3a0cf989daa024ba5401ac1 Mon Sep 17 00:00:00 2001
From: Zhuohan Li
Date: Mon, 26 Jun 2023 11:16:13 -0700
Subject: [PATCH] [Bug] Fix the OOM condition for CPU cache (#260)

---
 vllm/engine/llm_engine.py | 2 +-
 vllm/worker/worker.py     | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index c4aea06ba12f..e9d616febc14 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -128,7 +128,7 @@ class LLMEngine:
 
         logger.info(f'# GPU blocks: {num_gpu_blocks}, '
                     f'# CPU blocks: {num_cpu_blocks}')
-        if num_gpu_blocks <= 0 or num_cpu_blocks <= 0:
+        if num_gpu_blocks <= 0:
             raise ValueError("No available memory for the cache blocks. "
                              "Try increasing `gpu_memory_utilization` when "
                              "initializing the engine.")
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 2beafc670418..b97011206ee2 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -113,6 +113,8 @@ class Worker:
         num_gpu_blocks = int((total_gpu_memory * gpu_memory_utilization
                               - peak_memory) // cache_block_size)
         num_cpu_blocks = int(cpu_swap_space // cache_block_size)
+        num_gpu_blocks = max(num_gpu_blocks, 0)
+        num_cpu_blocks = max(num_cpu_blocks, 0)
         torch.cuda.empty_cache()
 
         # Reset the seed to ensure that the random state is not affected by
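
For illustration, a minimal sketch of the arithmetic this patch guards against.
The numbers below (a 16 GiB GPU, a 9 GiB profiled peak, swapping disabled) are
assumed values chosen to trigger both edge cases, not output from a real
profiling run:

    # Hypothetical values for illustration only.
    total_gpu_memory = 16 * 1024**3       # 16 GiB of GPU memory
    gpu_memory_utilization = 0.5          # budget: 8 GiB
    peak_memory = 9 * 1024**3             # profiled peak exceeds the budget
    cpu_swap_space = 0                    # user disabled CPU swapping
    cache_block_size = 2 * 1024**2        # 2 MiB per KV-cache block (assumed)

    num_gpu_blocks = int((total_gpu_memory * gpu_memory_utilization
                          - peak_memory) // cache_block_size)  # -512
    num_cpu_blocks = int(cpu_swap_space // cache_block_size)   # 0
    # The added clamping keeps the block counts well-defined:
    num_gpu_blocks = max(num_gpu_blocks, 0)  # 0
    num_cpu_blocks = max(num_cpu_blocks, 0)  # 0
    # With the relaxed check, num_cpu_blocks == 0 (no swap space) is a legal
    # configuration; only a non-positive GPU block count is still fatal,
    # since then no KV cache fits on the GPU at all.
    if num_gpu_blocks <= 0:
        raise ValueError("No available memory for the cache blocks. "
                         "Try increasing `gpu_memory_utilization` when "
                         "initializing the engine.")

Before this change, setting cpu_swap_space to 0 tripped the
`num_cpu_blocks <= 0` branch of the check and raised the ValueError even when
the GPU had ample room for cache blocks; after it, zero CPU blocks simply
means swapping is unavailable.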