From 30fb0956dfee6113765ccb527f4f06703c75bf47 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 17 Dec 2023 01:56:16 -0800 Subject: [PATCH] [Minor] Add more detailed explanation on `quantization` argument (#2145) --- vllm/engine/arg_utils.py | 7 ++++++- vllm/entrypoints/llm.py | 7 ++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 511ce03fde215..7a571ceefbc85 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -183,7 +183,12 @@ class EngineArgs: type=str, choices=['awq', 'gptq', 'squeezellm', None], default=None, - help='Method used to quantize the weights') + help='Method used to quantize the weights. If ' + 'None, we first check the `quantization_config` ' + 'attribute in the model config file. If that is ' + 'None, we assume the model weights are not ' + 'quantized and use `dtype` to determine the data ' + 'type of the weights.') parser.add_argument('--enforce-eager', action='store_true', help='Always use eager-mode PyTorch. If False, ' diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index f24fa56825340..0700298b03a3d 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -38,9 +38,10 @@ class LLM: However, if the `torch_dtype` in the config is `float32`, we will use `float16` instead. quantization: The method used to quantize the model weights. Currently, - we support "awq", "gptq" and "squeezellm". If None, we assume the - model weights are not quantized and use `dtype` to determine the - data type of the weights. + we support "awq", "gptq" and "squeezellm". If None, we first check + the `quantization_config` attribute in the model config file. If + that is None, we assume the model weights are not quantized and use + `dtype` to determine the data type of the weights. revision: The specific model version to use. It can be a branch name, a tag name, or a commit id. tokenizer_revision: The specific tokenizer version to use. It can be a