From bc0644574ca12d754a031596bdcfe8e1f0e6ab39 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Tue, 19 Sep 2023 22:16:04 -0700
Subject: [PATCH] Add gpu_memory_utilization and swap_space to LLM (#1090)

---
 vllm/entrypoints/llm.py | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index ef07ac212d4ae..6361364c75e30 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -37,12 +37,22 @@ class LLM:
             the `torch_dtype` attribute specified in the model config file.
             However, if the `torch_dtype` in the config is `float32`, we will
             use `float16` instead.
-        seed: The seed to initialize the random number generator for sampling.
         quantization: The method used to quantize the model weights. Currently,
             we support "awq". If None, we assume the model weights are not
             quantized and use `dtype` to determine the data type of the weights.
         revision: The specific model version to use. It can be a branch name,
             a tag name, or a commit id.
+        seed: The seed to initialize the random number generator for sampling.
+        gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to
+            reserve for the model weights, activations, and KV cache. Higher
+            values will increase the KV cache size and thus improve the model's
+            throughput. However, if the value is too high, it may cause out-of-
+            memory (OOM) errors.
+        swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
+            This can be used for temporarily storing the states of the requests
+            when their `best_of` sampling parameters are larger than 1. If all
+            requests will have `best_of=1`, you can safely set this to 0.
+            Otherwise, too small values may cause out-of-memory (OOM) errors.
     """
 
     def __init__(
@@ -53,8 +63,11 @@ class LLM:
         trust_remote_code: bool = False,
         tensor_parallel_size: int = 1,
         dtype: str = "auto",
-        seed: int = 0,
         quantization: Optional[str] = None,
+        revision: Optional[str] = None,
+        seed: int = 0,
+        gpu_memory_utilization: float = 0.9,
+        swap_space: int = 4,
         **kwargs,
     ) -> None:
         if "disable_log_stats" not in kwargs:
@@ -66,8 +79,11 @@ class LLM:
             trust_remote_code=trust_remote_code,
             tensor_parallel_size=tensor_parallel_size,
             dtype=dtype,
-            seed=seed,
             quantization=quantization,
+            revision=revision,
+            seed=seed,
+            gpu_memory_utilization=gpu_memory_utilization,
+            swap_space=swap_space,
             **kwargs,
         )
         self.llm_engine = LLMEngine.from_engine_args(engine_args)
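
Usage sketch (illustrative, not part of the patch): a minimal example of how the
two new parameters are passed when constructing `LLM`. The model name
"facebook/opt-125m", the prompt, and the chosen values (0.8, 8 GiB) are
assumptions for illustration; only `gpu_memory_utilization`, `swap_space`,
`seed`, and `revision` come from the signature added above.

    from vllm import LLM, SamplingParams

    # Illustrative values: reserve 80% of GPU memory for the weights,
    # activations, and KV cache, and 8 GiB of CPU swap per GPU for
    # requests with best_of > 1 (placeholder model name).
    llm = LLM(
        model="facebook/opt-125m",
        gpu_memory_utilization=0.8,
        swap_space=8,
        seed=0,
    )

    # best_of=2 exercises the swap space described in the new docstring.
    sampling_params = SamplingParams(temperature=0.8, best_of=2)
    outputs = llm.generate(["Hello, my name is"], sampling_params)
    print(outputs[0].outputs[0].text)

With `best_of=1` everywhere, `swap_space=0` is safe per the docstring; the
defaults (0.9 and 4 GiB) are simply forwarded to `EngineArgs` unchanged.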