Reduce GPU memory utilization to make sure OOM doesn't happen (#153)

2026-07-10 20:57:21 +08:00 · 2023-06-18 17:33:50 +08:00 · 2023-06-18 17:33:50 +08:00 · bf5f121c02
commit bf5f121c02
parent bec7b2dc26
1 changed file with 1 addition and 1 deletion
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -21,7 +21,7 @@ class EngineArgs:
    tensor_parallel_size: int = 1
    block_size: int = 16
    swap_space: int = 4  # GiB
-    gpu_memory_utilization: float = 0.95
+    gpu_memory_utilization: float = 0.90
    max_num_batched_tokens: int = 2560
    max_num_seqs: int = 256
    disable_log_stats: bool = False