Added support for quantization in the LLM module (#1080)

2026-01-27 15:12:38 +08:00 · 2023-09-18 21:04:21 +03:00 · 2023-09-18 21:04:21 +03:00 · fbe66e1d0b
commit fbe66e1d0b
parent 90979c38f8
1 changed files with 5 additions and 0 deletions
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -38,6 +38,9 @@ class LLM:
            However, if the `torch_dtype` in the config is `float32`, we will
            use `float16` instead.
        seed: The seed to initialize the random number generator for sampling.
+        quantization: The method used to quantize the model weights. Currently,
+            we support "awq". If None, we assume the model weights are not
+            quantized and use `dtype` to determine the data type of the weights.
        revision: The specific model version to use. It can be a branch name,
            a tag name, or a commit id.
    """
@@ -51,6 +54,7 @@ class LLM:
        tensor_parallel_size: int = 1,
        dtype: str = "auto",
        seed: int = 0,
+        quantization: Optional[str] = None,
        **kwargs,
    ) -> None:
        if "disable_log_stats" not in kwargs:
@@ -63,6 +67,7 @@ class LLM:
            tensor_parallel_size=tensor_parallel_size,
            dtype=dtype,
            seed=seed,
+            quantization=quantization,
            **kwargs,
        )
        self.llm_engine = LLMEngine.from_engine_args(engine_args)