mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-09 14:41:26 +08:00
[doc] add install tips (#17373)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
This commit is contained in:
parent
584f5fb4c6
commit
2ac74d098e
@@ -44,6 +44,12 @@ To produce performant FP8 quantized models with vLLM, you'll need to install the
|
|||||||
pip install llmcompressor
|
pip install llmcompressor
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
|
||||||
|
|
||||||
|
```console
|
||||||
|
pip install vllm lm-eval==0.4.4
|
||||||
|
```
|
||||||
|
|
||||||
## Quantization Process
|
## Quantization Process
|
||||||
|
|
||||||
The quantization process involves three main steps:
|
The quantization process involves three main steps:
|
||||||
@@ -86,7 +92,7 @@ recipe = QuantizationModifier(
|
|||||||
# Apply the quantization algorithm.
|
# Apply the quantization algorithm.
|
||||||
oneshot(model=model, recipe=recipe)
|
oneshot(model=model, recipe=recipe)
|
||||||
|
|
||||||
# Save the model.
|
# Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic
|
||||||
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
|
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
|
||||||
model.save_pretrained(SAVE_DIR)
|
model.save_pretrained(SAVE_DIR)
|
||||||
tokenizer.save_pretrained(SAVE_DIR)
|
tokenizer.save_pretrained(SAVE_DIR)
|
||||||
@@ -94,12 +100,6 @@ tokenizer.save_pretrained(SAVE_DIR)
|
|||||||
|
|
||||||
### 3. Evaluating Accuracy
|
### 3. Evaluating Accuracy
|
||||||
|
|
||||||
Install `vllm` and `lm-evaluation-harness`:
|
|
||||||
|
|
||||||
```console
|
|
||||||
pip install vllm lm-eval==0.4.4
|
|
||||||
```
|
|
||||||
|
|
||||||
Load and run the model in `vllm`:
|
Load and run the model in `vllm`:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
|||||||
@@ -18,6 +18,12 @@ To use INT4 quantization with vLLM, you'll need to install the [llm-compressor](
|
|||||||
pip install llmcompressor
|
pip install llmcompressor
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
|
||||||
|
|
||||||
|
```console
|
||||||
|
pip install vllm lm-eval==0.4.4
|
||||||
|
```
|
||||||
|
|
||||||
## Quantization Process
|
## Quantization Process
|
||||||
|
|
||||||
The quantization process involves four main steps:
|
The quantization process involves four main steps:
|
||||||
@@ -87,7 +93,7 @@ oneshot(
|
|||||||
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
|
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Save the compressed model
|
# Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128
|
||||||
SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
|
SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
|
||||||
model.save_pretrained(SAVE_DIR, save_compressed=True)
|
model.save_pretrained(SAVE_DIR, save_compressed=True)
|
||||||
tokenizer.save_pretrained(SAVE_DIR)
|
tokenizer.save_pretrained(SAVE_DIR)
|
||||||
|
|||||||
@@ -19,6 +19,12 @@ To use INT8 quantization with vLLM, you'll need to install the [llm-compressor](
|
|||||||
pip install llmcompressor
|
pip install llmcompressor
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
|
||||||
|
|
||||||
|
```console
|
||||||
|
pip install vllm lm-eval==0.4.4
|
||||||
|
```
|
||||||
|
|
||||||
## Quantization Process
|
## Quantization Process
|
||||||
|
|
||||||
The quantization process involves four main steps:
|
The quantization process involves four main steps:
|
||||||
@@ -91,7 +97,7 @@ oneshot(
|
|||||||
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
|
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Save the compressed model
|
# Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token
|
||||||
SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
|
SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
|
||||||
model.save_pretrained(SAVE_DIR, save_compressed=True)
|
model.save_pretrained(SAVE_DIR, save_compressed=True)
|
||||||
tokenizer.save_pretrained(SAVE_DIR)
|
tokenizer.save_pretrained(SAVE_DIR)
|
||||||
|
|||||||
@@ -126,7 +126,7 @@ oneshot(
|
|||||||
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
|
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Save quantized model
|
# Save quantized model: Llama-3.1-8B-Instruct-FP8-KV
|
||||||
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
|
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
|
||||||
model.save_pretrained(SAVE_DIR, save_compressed=True)
|
model.save_pretrained(SAVE_DIR, save_compressed=True)
|
||||||
tokenizer.save_pretrained(SAVE_DIR)
|
tokenizer.save_pretrained(SAVE_DIR)
|
||||||
|
|||||||
@@ -19,6 +19,12 @@ pip install amd-quark
|
|||||||
You can refer to [Quark installation guide](https://quark.docs.amd.com/latest/install.html)
|
You can refer to [Quark installation guide](https://quark.docs.amd.com/latest/install.html)
|
||||||
for more installation details.
|
for more installation details.
|
||||||
|
|
||||||
|
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
|
||||||
|
|
||||||
|
```console
|
||||||
|
pip install vllm lm-eval==0.4.4
|
||||||
|
```
|
||||||
|
|
||||||
## Quantization Process
|
## Quantization Process
|
||||||
|
|
||||||
After installing Quark, we will use an example to illustrate how to use Quark.
|
After installing Quark, we will use an example to illustrate how to use Quark.
|
||||||
@@ -150,6 +156,7 @@ LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"]
|
|||||||
export_config = ExporterConfig(json_export_config=JsonExporterConfig())
|
export_config = ExporterConfig(json_export_config=JsonExporterConfig())
|
||||||
export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP
|
export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP
|
||||||
|
|
||||||
|
# Model: Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant
|
||||||
EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
|
EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
|
||||||
exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
|
exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user