From 30fb0956dfee6113765ccb527f4f06703c75bf47 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Sun, 17 Dec 2023 01:56:16 -0800
Subject: [PATCH] [Minor] Add more detailed explanation on `quantization`
 argument (#2145)

---
 vllm/engine/arg_utils.py | 7 ++++++-
 vllm/entrypoints/llm.py  | 7 ++++---
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 511ce03fde215..7a571ceefbc85 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -183,7 +183,12 @@ class EngineArgs:
                             type=str,
                             choices=['awq', 'gptq', 'squeezellm', None],
                             default=None,
-                            help='Method used to quantize the weights')
+                            help='Method used to quantize the weights. If '
+                            'None, we first check the `quantization_config` '
+                            'attribute in the model config file. If that is '
+                            'None, we assume the model weights are not '
+                            'quantized and use `dtype` to determine the data '
+                            'type of the weights.')
         parser.add_argument('--enforce-eager',
                             action='store_true',
                             help='Always use eager-mode PyTorch. If False, '
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index f24fa56825340..0700298b03a3d 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -38,9 +38,10 @@ class LLM:
             However, if the `torch_dtype` in the config is `float32`, we will
             use `float16` instead.
         quantization: The method used to quantize the model weights. Currently,
-            we support "awq", "gptq" and "squeezellm". If None, we assume the
-            model weights are not quantized and use `dtype` to determine the
-            data type of the weights.
+            we support "awq", "gptq" and "squeezellm". If None, we first check
+            the `quantization_config` attribute in the model config file. If
+            that is None, we assume the model weights are not quantized and use
+            `dtype` to determine the data type of the weights.
         revision: The specific model version to use. It can be a branch name,
             a tag name, or a commit id.
         tokenizer_revision: The specific tokenizer version to use. It can be a