From bf5f121c0284a2a06483b585f0d49e8508c69573 Mon Sep 17 00:00:00 2001
From: Zhuohan Li <zhuohan123@gmail.com>
Date: Sun, 18 Jun 2023 17:33:50 +0800
Subject: [PATCH] Reduce GPU memory utilization to make sure OOM doesn't happen
 (#153)

---
 vllm/engine/arg_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 55936cf0986ab..10e6070b42c73 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -21,7 +21,7 @@ class EngineArgs:
     tensor_parallel_size: int = 1
     block_size: int = 16
     swap_space: int = 4  # GiB
-    gpu_memory_utilization: float = 0.95
+    gpu_memory_utilization: float = 0.90
     max_num_batched_tokens: int = 2560
     max_num_seqs: int = 256
     disable_log_stats: bool = False