diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 511ce03fde21..7a571ceefbc8 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -183,7 +183,12 @@ class EngineArgs:
                             type=str,
                             choices=['awq', 'gptq', 'squeezellm', None],
                             default=None,
-                            help='Method used to quantize the weights')
+                            help='Method used to quantize the weights. If '
+                            'None, we first check the `quantization_config` '
+                            'attribute in the model config file. If that is '
+                            'None, we assume the model weights are not '
+                            'quantized and use `dtype` to determine the data '
+                            'type of the weights.')
         parser.add_argument('--enforce-eager',
                             action='store_true',
                             help='Always use eager-mode PyTorch. If False, '
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index f24fa5682534..0700298b03a3 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -38,9 +38,10 @@ class LLM:
             However, if the `torch_dtype` in the config is `float32`, we will
             use `float16` instead.
         quantization: The method used to quantize the model weights. Currently,
-            we support "awq", "gptq" and "squeezellm". If None, we assume the
-            model weights are not quantized and use `dtype` to determine the
-            data type of the weights.
+            we support "awq", "gptq" and "squeezellm". If None, we first check
+            the `quantization_config` attribute in the model config file. If
+            that is None, we assume the model weights are not quantized and use
+            `dtype` to determine the data type of the weights.
         revision: The specific model version to use. It can be a branch name,
             a tag name, or a commit id.
         tokenizer_revision: The specific tokenizer version to use. It can be a
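
The updated help text and docstring describe a three-step resolution order: an explicit `quantization` argument wins, otherwise the model config file's `quantization_config` attribute is consulted, and only if both are absent are the weights treated as unquantized with `dtype` deciding the weight data type. The sketch below illustrates that order; it is not vLLM's actual implementation. The helper name `resolve_quantization` and the assumption that `quantization_config` is a dict carrying a `"quant_method"` key are illustrative only.

```python
# Minimal sketch of the resolution order described by this patch.
# Assumption: the HF-style model config may contain a `quantization_config`
# dict with a "quant_method" key; the helper name is hypothetical.
from typing import Optional


def resolve_quantization(cli_quantization: Optional[str],
                         hf_config: dict) -> Optional[str]:
    """Return the quantization method, or None for unquantized weights."""
    # 1. An explicit --quantization flag always wins.
    if cli_quantization is not None:
        return cli_quantization
    # 2. Otherwise, check the model config file's `quantization_config`.
    quant_config = hf_config.get("quantization_config")
    if quant_config is not None:
        return quant_config.get("quant_method")
    # 3. Neither source specifies a method: the weights are not quantized,
    #    and `dtype` alone determines the weight data type.
    return None


# Example with a hypothetical AWQ-style config:
config = {"quantization_config": {"quant_method": "awq", "bits": 4}}
assert resolve_quantization(None, config) == "awq"
assert resolve_quantization("gptq", config) == "gptq"  # CLI overrides config
assert resolve_quantization(None, {}) is None          # fall back to dtype
```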