From fbe66e1d0b8d1445cb3204150afac74ab075e559 Mon Sep 17 00:00:00 2001
From: orellavie1212 <126397224+orellavie1212@users.noreply.github.com>
Date: Mon, 18 Sep 2023 21:04:21 +0300
Subject: [PATCH] Add support for the `quantization` argument to the LLM class (#1080)

---
 vllm/entrypoints/llm.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 6c2afe9e7272..ef07ac212d4a 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -38,6 +38,9 @@ class LLM:
             However, if the `torch_dtype` in the config is `float32`, we will
             use `float16` instead.
         seed: The seed to initialize the random number generator for sampling.
+        quantization: The method used to quantize the model weights. Currently,
+            we support "awq". If None, we assume the model weights are not
+            quantized and use `dtype` to determine the data type of the weights.
         revision: The specific model version to use. It can be a branch name,
             a tag name, or a commit id.
     """
@@ -51,6 +54,7 @@ class LLM:
         tensor_parallel_size: int = 1,
         dtype: str = "auto",
         seed: int = 0,
+        quantization: Optional[str] = None,
         **kwargs,
     ) -> None:
         if "disable_log_stats" not in kwargs:
@@ -63,6 +67,7 @@ class LLM:
             tensor_parallel_size=tensor_parallel_size,
             dtype=dtype,
             seed=seed,
+            quantization=quantization,
             **kwargs,
         )
         self.llm_engine = LLMEngine.from_engine_args(engine_args)