From bf5f121c0284a2a06483b585f0d49e8508c69573 Mon Sep 17 00:00:00 2001
From: Zhuohan Li
Date: Sun, 18 Jun 2023 17:33:50 +0800
Subject: [PATCH] Reduce GPU memory utilization to make sure OOM doesn't happen (#153)

---
 vllm/engine/arg_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 55936cf0986ab..10e6070b42c73 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -21,7 +21,7 @@ class EngineArgs:
     tensor_parallel_size: int = 1
     block_size: int = 16
     swap_space: int = 4 # GiB
-    gpu_memory_utilization: float = 0.95
+    gpu_memory_utilization: float = 0.90
     max_num_batched_tokens: int = 2560
     max_num_seqs: int = 256
     disable_log_stats: bool = False