From 8516999495114926c9838c2d6e0feb580d4d983f Mon Sep 17 00:00:00 2001
From: Casper
Date: Sun, 5 Nov 2023 06:43:39 +0100
Subject: [PATCH] Add Quantization and AutoAWQ to docs (#1235)

---
 docs/source/index.rst                 |  6 +++
 docs/source/quantization/auto_awq.rst | 69 +++++++++++++++++++++++++++
 2 files changed, 75 insertions(+)
 create mode 100644 docs/source/quantization/auto_awq.rst

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 60a5b07f32fbb..eb98aa6049bfb 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -73,3 +73,9 @@ Documentation
 
    models/supported_models
    models/adding_model
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Quantization
+
+   quantization/auto_awq
\ No newline at end of file
diff --git a/docs/source/quantization/auto_awq.rst b/docs/source/quantization/auto_awq.rst
new file mode 100644
index 0000000000000..0a2b442399981
--- /dev/null
+++ b/docs/source/quantization/auto_awq.rst
@@ -0,0 +1,69 @@
+.. _auto_awq:
+
+AutoAWQ
+==================
+
+To create a new 4-bit quantized model, you can leverage `AutoAWQ <https://github.com/casper-hansen/AutoAWQ>`_.
+Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%.
+The main benefits are lower latency and memory usage.
+
+You can quantize your own models by installing AutoAWQ or picking one of the `400+ models on Huggingface <https://huggingface.co/models?search=awq>`_.
+
+.. code-block:: console
+
+   $ pip install autoawq
+
+After installing AutoAWQ, you are ready to quantize a model. Here is an example of how to quantize Vicuna 7B v1.5:
+
+.. code-block:: python
+
+   from awq import AutoAWQForCausalLM
+   from transformers import AutoTokenizer
+
+   model_path = 'lmsys/vicuna-7b-v1.5'
+   quant_path = 'vicuna-7b-v1.5-awq'
+   quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
+
+   # Load model
+   model = AutoAWQForCausalLM.from_pretrained(model_path, **{"low_cpu_mem_usage": True})
+   tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+
+   # Quantize
+   model.quantize(tokenizer, quant_config=quant_config)
+
+   # Save quantized model
+   model.save_quantized(quant_path)
+   tokenizer.save_pretrained(quant_path)
+
+To run an AWQ model with vLLM, you can use `TheBloke/Llama-2-7b-Chat-AWQ <https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ>`_ with the following command:
+
+.. code-block:: console
+
+   $ python examples/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq
+
+AWQ models are also supported directly through the LLM entrypoint:
+
+.. code-block:: python
+
+   from vllm import LLM, SamplingParams
+
+   # Sample prompts.
+   prompts = [
+       "Hello, my name is",
+       "The president of the United States is",
+       "The capital of France is",
+       "The future of AI is",
+   ]
+   # Create a sampling params object.
+   sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+   # Create an LLM.
+   llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ")
+   # Generate texts from the prompts. The output is a list of RequestOutput objects
+   # that contain the prompt, generated text, and other information.
+   outputs = llm.generate(prompts, sampling_params)
+   # Print the outputs.
+   for output in outputs:
+       prompt = output.prompt
+       generated_text = output.outputs[0].text
+       print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")