mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-22 09:51:19 +08:00
[doc] Fold long code blocks to improve readability (#19926)
Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com>
This commit is contained in:
parent
493c275352
commit
f17aec0d63
@ -91,7 +91,7 @@ source to unblock the update process.
|
|||||||
### FlashInfer
|
### FlashInfer
|
||||||
Here is how to build and install it from source with torch2.7.0+cu128 in vLLM [Dockerfile](https://github.com/vllm-project/vllm/blob/27bebcd89792d5c4b08af7a65095759526f2f9e1/docker/Dockerfile#L259-L271):
|
Here is how to build and install it from source with torch2.7.0+cu128 in vLLM [Dockerfile](https://github.com/vllm-project/vllm/blob/27bebcd89792d5c4b08af7a65095759526f2f9e1/docker/Dockerfile#L259-L271):
|
||||||
|
|
||||||
```
|
```bash
|
||||||
export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'
|
export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'
|
||||||
export FLASHINFER_ENABLE_SM90=1
|
export FLASHINFER_ENABLE_SM90=1
|
||||||
uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.6.post1"
|
uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.6.post1"
|
||||||
@ -105,14 +105,14 @@ team if you want to get the package published there.
|
|||||||
### xFormers
|
### xFormers
|
||||||
Similar to FlashInfer, here is how to build and install xFormers from source:
|
Similar to FlashInfer, here is how to build and install xFormers from source:
|
||||||
|
|
||||||
```
|
```bash
|
||||||
export TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
|
export TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
|
||||||
MAX_JOBS=16 uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30"
|
MAX_JOBS=16 uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30"
|
||||||
```
|
```
|
||||||
|
|
||||||
### Mamba
|
### Mamba
|
||||||
|
|
||||||
```
|
```bash
|
||||||
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
|
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@ -16,35 +16,33 @@ vllm {chat,complete,serve,bench,collect-env,run-batch}
|
|||||||
|
|
||||||
Start the vLLM OpenAI Compatible API server.
|
Start the vLLM OpenAI Compatible API server.
|
||||||
|
|
||||||
Examples:
|
??? Examples
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Start with a model
|
# Start with a model
|
||||||
vllm serve meta-llama/Llama-2-7b-hf
|
vllm serve meta-llama/Llama-2-7b-hf
|
||||||
|
|
||||||
# Specify the port
|
# Specify the port
|
||||||
vllm serve meta-llama/Llama-2-7b-hf --port 8100
|
vllm serve meta-llama/Llama-2-7b-hf --port 8100
|
||||||
|
|
||||||
# Check with --help for more options
|
# Check with --help for more options
|
||||||
# To list all groups
|
# To list all groups
|
||||||
vllm serve --help=listgroup
|
vllm serve --help=listgroup
|
||||||
|
|
||||||
# To view an argument group
|
# To view an argument group
|
||||||
vllm serve --help=ModelConfig
|
vllm serve --help=ModelConfig
|
||||||
|
|
||||||
# To view a single argument
|
# To view a single argument
|
||||||
vllm serve --help=max-num-seqs
|
vllm serve --help=max-num-seqs
|
||||||
|
|
||||||
# To search by keyword
|
# To search by keyword
|
||||||
vllm serve --help=max
|
vllm serve --help=max
|
||||||
```
|
```
|
||||||
|
|
||||||
## chat
|
## chat
|
||||||
|
|
||||||
Generate chat completions via the running API server.
|
Generate chat completions via the running API server.
|
||||||
|
|
||||||
Examples:
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Directly connect to localhost API without arguments
|
# Directly connect to localhost API without arguments
|
||||||
vllm chat
|
vllm chat
|
||||||
@ -60,8 +58,6 @@ vllm chat --quick "hi"
|
|||||||
|
|
||||||
Generate text completions based on the given prompt via the running API server.
|
Generate text completions based on the given prompt via the running API server.
|
||||||
|
|
||||||
Examples:
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Directly connect to localhost API without arguments
|
# Directly connect to localhost API without arguments
|
||||||
vllm complete
|
vllm complete
|
||||||
@ -73,6 +69,8 @@ vllm complete --url http://{vllm-serve-host}:{vllm-serve-port}/v1
|
|||||||
vllm complete --quick "The future of AI is"
|
vllm complete --quick "The future of AI is"
|
||||||
```
|
```
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
## bench
|
## bench
|
||||||
|
|
||||||
Run benchmark tests for latency, online serving throughput, and offline inference throughput.
|
Run benchmark tests for latency, online serving throughput, and offline inference throughput.
|
||||||
@ -89,8 +87,6 @@ vllm bench {latency, serve, throughput}
|
|||||||
|
|
||||||
Benchmark the latency of a single batch of requests.
|
Benchmark the latency of a single batch of requests.
|
||||||
|
|
||||||
Example:
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
vllm bench latency \
|
vllm bench latency \
|
||||||
--model meta-llama/Llama-3.2-1B-Instruct \
|
--model meta-llama/Llama-3.2-1B-Instruct \
|
||||||
@ -104,8 +100,6 @@ vllm bench latency \
|
|||||||
|
|
||||||
Benchmark the online serving throughput.
|
Benchmark the online serving throughput.
|
||||||
|
|
||||||
Example:
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
vllm bench serve \
|
vllm bench serve \
|
||||||
--model meta-llama/Llama-3.2-1B-Instruct \
|
--model meta-llama/Llama-3.2-1B-Instruct \
|
||||||
@ -120,8 +114,6 @@ vllm bench serve \
|
|||||||
|
|
||||||
Benchmark offline inference throughput.
|
Benchmark offline inference throughput.
|
||||||
|
|
||||||
Example:
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
vllm bench throughput \
|
vllm bench throughput \
|
||||||
--model meta-llama/Llama-3.2-1B-Instruct \
|
--model meta-llama/Llama-3.2-1B-Instruct \
|
||||||
@ -143,7 +135,8 @@ vllm collect-env
|
|||||||
|
|
||||||
Run batch prompts and write results to file.
|
Run batch prompts and write results to file.
|
||||||
|
|
||||||
Examples:
|
<details>
|
||||||
|
<summary>Examples</summary>
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Running with a local file
|
# Running with a local file
|
||||||
@ -159,6 +152,8 @@ vllm run-batch \
|
|||||||
--model meta-llama/Meta-Llama-3-8B-Instruct
|
--model meta-llama/Meta-Llama-3-8B-Instruct
|
||||||
```
|
```
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
## More Help
|
## More Help
|
||||||
|
|
||||||
For detailed options of any subcommand, use:
|
For detailed options of any subcommand, use:
|
||||||
|
|||||||
@ -57,19 +57,21 @@ By default, we optimize model inference using CUDA graphs which take up extra me
|
|||||||
|
|
||||||
You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage:
|
You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from vllm import LLM
|
|
||||||
from vllm.config import CompilationConfig, CompilationLevel
|
|
||||||
|
|
||||||
llm = LLM(
|
```python
|
||||||
model="meta-llama/Llama-3.1-8B-Instruct",
|
from vllm import LLM
|
||||||
compilation_config=CompilationConfig(
|
from vllm.config import CompilationConfig, CompilationLevel
|
||||||
level=CompilationLevel.PIECEWISE,
|
|
||||||
# By default, it goes up to max_num_seqs
|
llm = LLM(
|
||||||
cudagraph_capture_sizes=[1, 2, 4, 8, 16],
|
model="meta-llama/Llama-3.1-8B-Instruct",
|
||||||
),
|
compilation_config=CompilationConfig(
|
||||||
)
|
level=CompilationLevel.PIECEWISE,
|
||||||
```
|
# By default, it goes up to max_num_seqs
|
||||||
|
cudagraph_capture_sizes=[1, 2, 4, 8, 16],
|
||||||
|
),
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
You can disable graph capturing completely via the `enforce_eager` flag:
|
You can disable graph capturing completely via the `enforce_eager` flag:
|
||||||
|
|
||||||
@ -127,18 +129,20 @@ reduce the size of the processed multi-modal inputs, which in turn saves memory.
|
|||||||
|
|
||||||
Here are some examples:
|
Here are some examples:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from vllm import LLM
|
|
||||||
|
|
||||||
# Available for Qwen2-VL series models
|
```python
|
||||||
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
from vllm import LLM
|
||||||
mm_processor_kwargs={
|
|
||||||
"max_pixels": 768 * 768, # Default is 1280 * 28 * 28
|
|
||||||
})
|
|
||||||
|
|
||||||
# Available for InternVL series models
|
# Available for Qwen2-VL series models
|
||||||
llm = LLM(model="OpenGVLab/InternVL2-2B",
|
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||||
mm_processor_kwargs={
|
mm_processor_kwargs={
|
||||||
"max_dynamic_patch": 4, # Default is 12
|
"max_pixels": 768 * 768, # Default is 1280 * 28 * 28
|
||||||
})
|
})
|
||||||
```
|
|
||||||
|
# Available for InternVL series models
|
||||||
|
llm = LLM(model="OpenGVLab/InternVL2-2B",
|
||||||
|
mm_processor_kwargs={
|
||||||
|
"max_dynamic_patch": 4, # Default is 12
|
||||||
|
})
|
||||||
|
```
|
||||||
|
|||||||
@ -7,6 +7,8 @@ vLLM uses the following environment variables to configure the system:
|
|||||||
|
|
||||||
All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables).
|
All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables).
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
--8<-- "vllm/envs.py:env-vars-definition"
|
|
||||||
```
|
```python
|
||||||
|
--8<-- "vllm/envs.py:env-vars-definition"
|
||||||
|
```
|
||||||
|
|||||||
@ -93,25 +93,27 @@ For additional features and advanced configurations, refer to the official [MkDo
|
|||||||
|
|
||||||
## Testing
|
## Testing
|
||||||
|
|
||||||
```bash
|
??? note "Commands"
|
||||||
pip install -r requirements/dev.txt
|
|
||||||
|
|
||||||
# Linting, formatting and static type checking
|
```bash
|
||||||
pre-commit install --hook-type pre-commit --hook-type commit-msg
|
pip install -r requirements/dev.txt
|
||||||
|
|
||||||
# You can manually run pre-commit with
|
# Linting, formatting and static type checking
|
||||||
pre-commit run --all-files
|
pre-commit install --hook-type pre-commit --hook-type commit-msg
|
||||||
|
|
||||||
# To manually run something from CI that does not run
|
# You can manually run pre-commit with
|
||||||
# locally by default, you can run:
|
pre-commit run --all-files
|
||||||
pre-commit run mypy-3.9 --hook-stage manual --all-files
|
|
||||||
|
|
||||||
# Unit tests
|
# To manually run something from CI that does not run
|
||||||
pytest tests/
|
# locally by default, you can run:
|
||||||
|
pre-commit run mypy-3.9 --hook-stage manual --all-files
|
||||||
|
|
||||||
# Run tests for a single test file with detailed output
|
# Unit tests
|
||||||
pytest -s -v tests/test_logger.py
|
pytest tests/
|
||||||
```
|
|
||||||
|
# Run tests for a single test file with detailed output
|
||||||
|
pytest -s -v tests/test_logger.py
|
||||||
|
```
|
||||||
|
|
||||||
!!! tip
|
!!! tip
|
||||||
Since the <gh-file:docker/Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12.
|
Since the <gh-file:docker/Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12.
|
||||||
|
|||||||
@ -27,33 +27,35 @@ All vLLM modules within the model must include a `prefix` argument in their cons
|
|||||||
|
|
||||||
The initialization code should look like this:
|
The initialization code should look like this:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from torch import nn
|
|
||||||
from vllm.config import VllmConfig
|
|
||||||
from vllm.attention import Attention
|
|
||||||
|
|
||||||
class MyAttention(nn.Module):
|
```python
|
||||||
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
from torch import nn
|
||||||
super().__init__()
|
from vllm.config import VllmConfig
|
||||||
self.attn = Attention(prefix=f"{prefix}.attn")
|
from vllm.attention import Attention
|
||||||
|
|
||||||
class MyDecoderLayer(nn.Module):
|
class MyAttention(nn.Module):
|
||||||
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.self_attn = MyAttention(prefix=f"{prefix}.self_attn")
|
self.attn = Attention(prefix=f"{prefix}.attn")
|
||||||
|
|
||||||
class MyModel(nn.Module):
|
class MyDecoderLayer(nn.Module):
|
||||||
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.layers = nn.ModuleList(
|
self.self_attn = MyAttention(prefix=f"{prefix}.self_attn")
|
||||||
[MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)]
|
|
||||||
)
|
|
||||||
|
|
||||||
class MyModelForCausalLM(nn.Module):
|
class MyModel(nn.Module):
|
||||||
def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
|
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.model = MyModel(vllm_config, prefix=f"{prefix}.model")
|
self.layers = nn.ModuleList(
|
||||||
```
|
[MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)]
|
||||||
|
)
|
||||||
|
|
||||||
|
class MyModelForCausalLM(nn.Module):
|
||||||
|
def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
|
||||||
|
super().__init__()
|
||||||
|
self.model = MyModel(vllm_config, prefix=f"{prefix}.model")
|
||||||
|
```
|
||||||
|
|
||||||
### Computation Code
|
### Computation Code
|
||||||
|
|
||||||
|
|||||||
@ -25,59 +25,63 @@ Further update the model as follows:
|
|||||||
|
|
||||||
- Implement [get_multimodal_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings] that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs.
|
- Implement [get_multimodal_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings] that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs.
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
class YourModelForImage2Seq(nn.Module):
|
|
||||||
...
|
|
||||||
|
|
||||||
def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
|
```python
|
||||||
|
class YourModelForImage2Seq(nn.Module):
|
||||||
|
...
|
||||||
|
|
||||||
assert self.vision_encoder is not None
|
def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
|
||||||
image_features = self.vision_encoder(image_input)
|
|
||||||
return self.multi_modal_projector(image_features)
|
|
||||||
|
|
||||||
def get_multimodal_embeddings(
|
assert self.vision_encoder is not None
|
||||||
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
|
image_features = self.vision_encoder(image_input)
|
||||||
|
return self.multi_modal_projector(image_features)
|
||||||
|
|
||||||
# Validate the multimodal input keyword arguments
|
def get_multimodal_embeddings(
|
||||||
image_input = self._parse_and_validate_image_input(**kwargs)
|
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
|
||||||
if image_input is None:
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Run multimodal inputs through encoder and projector
|
# Validate the multimodal input keyword arguments
|
||||||
vision_embeddings = self._process_image_input(image_input)
|
image_input = self._parse_and_validate_image_input(**kwargs)
|
||||||
return vision_embeddings
|
if image_input is None:
|
||||||
```
|
return None
|
||||||
|
|
||||||
|
# Run multimodal inputs through encoder and projector
|
||||||
|
vision_embeddings = self._process_image_input(image_input)
|
||||||
|
return vision_embeddings
|
||||||
|
```
|
||||||
|
|
||||||
!!! important
|
!!! important
|
||||||
The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g., image) of the request.
|
The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g., image) of the request.
|
||||||
|
|
||||||
- Implement [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings] to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings.
|
- Implement [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings] to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings.
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from .utils import merge_multimodal_embeddings
|
|
||||||
|
|
||||||
class YourModelForImage2Seq(nn.Module):
|
```python
|
||||||
...
|
from .utils import merge_multimodal_embeddings
|
||||||
|
|
||||||
def get_input_embeddings(
|
class YourModelForImage2Seq(nn.Module):
|
||||||
self,
|
...
|
||||||
input_ids: torch.Tensor,
|
|
||||||
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
|
|
||||||
) -> torch.Tensor:
|
|
||||||
|
|
||||||
# `get_input_embeddings` should already be implemented for the language
|
def get_input_embeddings(
|
||||||
# model as one of the requirements of basic vLLM model implementation.
|
self,
|
||||||
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
|
input_ids: torch.Tensor,
|
||||||
|
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
|
||||||
|
) -> torch.Tensor:
|
||||||
|
|
||||||
if multimodal_embeddings is not None:
|
# `get_input_embeddings` should already be implemented for the language
|
||||||
inputs_embeds = merge_multimodal_embeddings(
|
# model as one of the requirements of basic vLLM model implementation.
|
||||||
input_ids=input_ids,
|
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
|
||||||
inputs_embeds=inputs_embeds,
|
|
||||||
multimodal_embeddings=multimodal_embeddings,
|
|
||||||
placeholder_token_id=self.config.image_token_index)
|
|
||||||
|
|
||||||
return inputs_embeds
|
if multimodal_embeddings is not None:
|
||||||
```
|
inputs_embeds = merge_multimodal_embeddings(
|
||||||
|
input_ids=input_ids,
|
||||||
|
inputs_embeds=inputs_embeds,
|
||||||
|
multimodal_embeddings=multimodal_embeddings,
|
||||||
|
placeholder_token_id=self.config.image_token_index)
|
||||||
|
|
||||||
|
return inputs_embeds
|
||||||
|
```
|
||||||
|
|
||||||
- Implement [get_language_model][vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model] getter to provide stable access to the underlying language model.
|
- Implement [get_language_model][vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model] getter to provide stable access to the underlying language model.
|
||||||
|
|
||||||
@ -135,42 +139,46 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
|
|||||||
|
|
||||||
Looking at the code of HF's `LlavaForConditionalGeneration`:
|
Looking at the code of HF's `LlavaForConditionalGeneration`:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544
|
|
||||||
n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
|
|
||||||
n_image_features = image_features.shape[0] * image_features.shape[1]
|
|
||||||
|
|
||||||
if n_image_tokens != n_image_features:
|
```python
|
||||||
raise ValueError(
|
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544
|
||||||
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
|
n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
|
||||||
|
n_image_features = image_features.shape[0] * image_features.shape[1]
|
||||||
|
|
||||||
|
if n_image_tokens != n_image_features:
|
||||||
|
raise ValueError(
|
||||||
|
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
|
||||||
|
)
|
||||||
|
special_image_mask = (
|
||||||
|
(input_ids == self.config.image_token_index)
|
||||||
|
.unsqueeze(-1)
|
||||||
|
.expand_as(inputs_embeds)
|
||||||
|
.to(inputs_embeds.device)
|
||||||
)
|
)
|
||||||
special_image_mask = (
|
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
||||||
(input_ids == self.config.image_token_index)
|
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
|
||||||
.unsqueeze(-1)
|
```
|
||||||
.expand_as(inputs_embeds)
|
|
||||||
.to(inputs_embeds.device)
|
|
||||||
)
|
|
||||||
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
|
||||||
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
|
|
||||||
```
|
|
||||||
|
|
||||||
The number of placeholder feature tokens per image is `image_features.shape[1]`.
|
The number of placeholder feature tokens per image is `image_features.shape[1]`.
|
||||||
`image_features` is calculated inside the `get_image_features` method:
|
`image_features` is calculated inside the `get_image_features` method:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300
|
|
||||||
image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
|
|
||||||
|
|
||||||
selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
|
```python
|
||||||
if vision_feature_select_strategy == "default":
|
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300
|
||||||
selected_image_feature = selected_image_feature[:, 1:]
|
image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
|
||||||
elif vision_feature_select_strategy == "full":
|
|
||||||
selected_image_feature = selected_image_feature
|
selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
|
||||||
else:
|
if vision_feature_select_strategy == "default":
|
||||||
raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
|
selected_image_feature = selected_image_feature[:, 1:]
|
||||||
image_features = self.multi_modal_projector(selected_image_feature)
|
elif vision_feature_select_strategy == "full":
|
||||||
return image_features
|
selected_image_feature = selected_image_feature
|
||||||
```
|
else:
|
||||||
|
raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
|
||||||
|
image_features = self.multi_modal_projector(selected_image_feature)
|
||||||
|
return image_features
|
||||||
|
```
|
||||||
|
|
||||||
We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower
|
We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower
|
||||||
(`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model).
|
(`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model).
|
||||||
@ -193,20 +201,22 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
|
|||||||
|
|
||||||
To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`:
|
To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257
|
|
||||||
target_dtype = self.patch_embedding.weight.dtype
|
|
||||||
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
|
|
||||||
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
|
|
||||||
|
|
||||||
class_embeds = self.class_embedding.expand(batch_size, 1, -1)
|
```python
|
||||||
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
|
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257
|
||||||
if interpolate_pos_encoding:
|
target_dtype = self.patch_embedding.weight.dtype
|
||||||
embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
|
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
|
||||||
else:
|
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
|
||||||
embeddings = embeddings + self.position_embedding(self.position_ids)
|
|
||||||
return embeddings
|
class_embeds = self.class_embedding.expand(batch_size, 1, -1)
|
||||||
```
|
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
|
||||||
|
if interpolate_pos_encoding:
|
||||||
|
embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
|
||||||
|
else:
|
||||||
|
embeddings = embeddings + self.position_embedding(self.position_ids)
|
||||||
|
return embeddings
|
||||||
|
```
|
||||||
|
|
||||||
We can infer that `embeddings.shape[1] == self.num_positions`, where
|
We can infer that `embeddings.shape[1] == self.num_positions`, where
|
||||||
|
|
||||||
@ -218,55 +228,59 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
|
|||||||
|
|
||||||
Overall, the number of placeholder feature tokens for an image can be calculated as:
|
Overall, the number of placeholder feature tokens for an image can be calculated as:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
def get_num_image_tokens(
|
|
||||||
self,
|
|
||||||
*,
|
|
||||||
image_width: int,
|
|
||||||
image_height: int,
|
|
||||||
) -> int:
|
|
||||||
hf_config = self.get_hf_config()
|
|
||||||
hf_processor = self.get_hf_processor()
|
|
||||||
|
|
||||||
image_size = hf_config.vision_config.image_size
|
```python
|
||||||
patch_size = hf_config.vision_config.patch_size
|
def get_num_image_tokens(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
image_width: int,
|
||||||
|
image_height: int,
|
||||||
|
) -> int:
|
||||||
|
hf_config = self.get_hf_config()
|
||||||
|
hf_processor = self.get_hf_processor()
|
||||||
|
|
||||||
num_image_tokens = (image_size // patch_size) ** 2 + 1
|
image_size = hf_config.vision_config.image_size
|
||||||
if hf_processor.vision_feature_select_strategy == "default":
|
patch_size = hf_config.vision_config.patch_size
|
||||||
num_image_tokens -= 1
|
|
||||||
|
|
||||||
return num_image_tokens
|
num_image_tokens = (image_size // patch_size) ** 2 + 1
|
||||||
```
|
if hf_processor.vision_feature_select_strategy == "default":
|
||||||
|
num_image_tokens -= 1
|
||||||
|
|
||||||
|
return num_image_tokens
|
||||||
|
```
|
||||||
|
|
||||||
Notice that the number of image tokens doesn't depend on the image width and height.
|
Notice that the number of image tokens doesn't depend on the image width and height.
|
||||||
We can simply use a dummy `image_size` to calculate the multimodal profiling data:
|
We can simply use a dummy `image_size` to calculate the multimodal profiling data:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
# NOTE: In actuality, this is usually implemented as part of the
|
|
||||||
# model's subclass of `BaseProcessingInfo`, but we show it as is
|
|
||||||
# here for simplicity.
|
|
||||||
def get_image_size_with_most_features(self) -> ImageSize:
|
|
||||||
hf_config = self.get_hf_config()
|
|
||||||
width = height = hf_config.image_size
|
|
||||||
return ImageSize(width=width, height=height)
|
|
||||||
|
|
||||||
def get_dummy_mm_data(
|
```python
|
||||||
self,
|
# NOTE: In actuality, this is usually implemented as part of the
|
||||||
seq_len: int,
|
# model's subclass of `BaseProcessingInfo`, but we show it as is
|
||||||
mm_counts: Mapping[str, int],
|
# here for simplicity.
|
||||||
) -> MultiModalDataDict:
|
def get_image_size_with_most_features(self) -> ImageSize:
|
||||||
num_images = mm_counts.get("image", 0)
|
hf_config = self.get_hf_config()
|
||||||
|
width = height = hf_config.image_size
|
||||||
|
return ImageSize(width=width, height=height)
|
||||||
|
|
||||||
target_width, target_height = \
|
def get_dummy_mm_data(
|
||||||
self.info.get_image_size_with_most_features()
|
self,
|
||||||
|
seq_len: int,
|
||||||
|
mm_counts: Mapping[str, int],
|
||||||
|
) -> MultiModalDataDict:
|
||||||
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
return {
|
target_width, target_height = \
|
||||||
"image":
|
self.info.get_image_size_with_most_features()
|
||||||
self._get_dummy_images(width=target_width,
|
|
||||||
height=target_height,
|
return {
|
||||||
num_images=num_images)
|
"image":
|
||||||
}
|
self._get_dummy_images(width=target_width,
|
||||||
```
|
height=target_height,
|
||||||
|
num_images=num_images)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
For the text, we simply expand the multimodal image token from the model config to match the desired number of images.
|
For the text, we simply expand the multimodal image token from the model config to match the desired number of images.
|
||||||
|
|
||||||
@ -284,21 +298,23 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
|
|||||||
|
|
||||||
Looking at the code of HF's `FuyuForCausalLM`:
|
Looking at the code of HF's `FuyuForCausalLM`:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/modeling_fuyu.py#L311-L322
|
|
||||||
if image_patches is not None and past_key_values is None:
|
```python
|
||||||
patch_embeddings = [
|
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/modeling_fuyu.py#L311-L322
|
||||||
self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype))
|
if image_patches is not None and past_key_values is None:
|
||||||
.squeeze(0)
|
patch_embeddings = [
|
||||||
.to(inputs_embeds.device)
|
self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype))
|
||||||
for patch in image_patches
|
.squeeze(0)
|
||||||
]
|
.to(inputs_embeds.device)
|
||||||
inputs_embeds = self.gather_continuous_embeddings(
|
for patch in image_patches
|
||||||
word_embeddings=inputs_embeds,
|
]
|
||||||
continuous_embeddings=patch_embeddings,
|
inputs_embeds = self.gather_continuous_embeddings(
|
||||||
image_patch_input_indices=image_patches_indices,
|
word_embeddings=inputs_embeds,
|
||||||
)
|
continuous_embeddings=patch_embeddings,
|
||||||
```
|
image_patch_input_indices=image_patches_indices,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
The number of placeholder feature tokens for the `i`th item in the batch is `patch_embeddings[i].shape[0]`,
|
The number of placeholder feature tokens for the `i`th item in the batch is `patch_embeddings[i].shape[0]`,
|
||||||
which is the same as `image_patches[i].shape[0]`, i.e. `num_total_patches`.
|
which is the same as `image_patches[i].shape[0]`, i.e. `num_total_patches`.
|
||||||
@ -312,92 +328,98 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
|
|||||||
In `FuyuImageProcessor.preprocess`, the images are resized and padded to the target `FuyuImageProcessor.size`,
|
In `FuyuImageProcessor.preprocess`, the images are resized and padded to the target `FuyuImageProcessor.size`,
|
||||||
returning the dimensions after resizing (but before padding) as metadata.
|
returning the dimensions after resizing (but before padding) as metadata.
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L541-L544
|
|
||||||
image_encoding = self.image_processor.preprocess(images, **output_kwargs["images_kwargs"])
|
|
||||||
batch_images = image_encoding["images"]
|
|
||||||
image_unpadded_heights = image_encoding["image_unpadded_heights"]
|
|
||||||
image_unpadded_widths = image_encoding["image_unpadded_widths"]
|
|
||||||
|
|
||||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L480-L
|
```python
|
||||||
if do_resize:
|
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L541-L544
|
||||||
batch_images = [
|
image_encoding = self.image_processor.preprocess(images, **output_kwargs["images_kwargs"])
|
||||||
[self.resize(image, size=size, input_data_format=input_data_format) for image in images]
|
batch_images = image_encoding["images"]
|
||||||
for images in batch_images
|
image_unpadded_heights = image_encoding["image_unpadded_heights"]
|
||||||
]
|
image_unpadded_widths = image_encoding["image_unpadded_widths"]
|
||||||
|
|
||||||
image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images]
|
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L480-L
|
||||||
image_unpadded_heights = [[image_size[0]] for image_size in image_sizes]
|
if do_resize:
|
||||||
image_unpadded_widths = [[image_size[1]] for image_size in image_sizes]
|
batch_images = [
|
||||||
|
[self.resize(image, size=size, input_data_format=input_data_format) for image in images]
|
||||||
if do_pad:
|
for images in batch_images
|
||||||
batch_images = [
|
|
||||||
[
|
|
||||||
self.pad_image(
|
|
||||||
image,
|
|
||||||
size=size,
|
|
||||||
mode=padding_mode,
|
|
||||||
constant_values=padding_value,
|
|
||||||
input_data_format=input_data_format,
|
|
||||||
)
|
|
||||||
for image in images
|
|
||||||
]
|
]
|
||||||
for images in batch_images
|
|
||||||
]
|
image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images]
|
||||||
```
|
image_unpadded_heights = [[image_size[0]] for image_size in image_sizes]
|
||||||
|
image_unpadded_widths = [[image_size[1]] for image_size in image_sizes]
|
||||||
|
|
||||||
|
if do_pad:
|
||||||
|
batch_images = [
|
||||||
|
[
|
||||||
|
self.pad_image(
|
||||||
|
image,
|
||||||
|
size=size,
|
||||||
|
mode=padding_mode,
|
||||||
|
constant_values=padding_value,
|
||||||
|
input_data_format=input_data_format,
|
||||||
|
)
|
||||||
|
for image in images
|
||||||
|
]
|
||||||
|
for images in batch_images
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
In `FuyuImageProcessor.preprocess_with_tokenizer_info`, the images are split into patches based on this metadata:
|
In `FuyuImageProcessor.preprocess_with_tokenizer_info`, the images are split into patches based on this metadata:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L425
|
|
||||||
model_image_input = self.image_processor.preprocess_with_tokenizer_info(
|
|
||||||
image_input=tensor_batch_images,
|
|
||||||
image_present=image_present,
|
|
||||||
image_unpadded_h=image_unpadded_heights,
|
|
||||||
image_unpadded_w=image_unpadded_widths,
|
|
||||||
image_placeholder_id=image_placeholder_id,
|
|
||||||
image_newline_id=image_newline_id,
|
|
||||||
variable_sized=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L638-L658
|
```python
|
||||||
image_height, image_width = image.shape[1], image.shape[2]
|
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L425
|
||||||
if variable_sized: # variable_sized=True
|
model_image_input = self.image_processor.preprocess_with_tokenizer_info(
|
||||||
new_h = min(
|
image_input=tensor_batch_images,
|
||||||
image_height,
|
image_present=image_present,
|
||||||
math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height,
|
image_unpadded_h=image_unpadded_heights,
|
||||||
|
image_unpadded_w=image_unpadded_widths,
|
||||||
|
image_placeholder_id=image_placeholder_id,
|
||||||
|
image_newline_id=image_newline_id,
|
||||||
|
variable_sized=True,
|
||||||
)
|
)
|
||||||
new_w = min(
|
|
||||||
image_width,
|
|
||||||
math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width,
|
|
||||||
)
|
|
||||||
image = image[:, :new_h, :new_w]
|
|
||||||
image_height, image_width = new_h, new_w
|
|
||||||
|
|
||||||
num_patches = self.get_num_patches(image_height=image_height, image_width=image_width)
|
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L638-L658
|
||||||
tensor_of_image_ids = torch.full(
|
image_height, image_width = image.shape[1], image.shape[2]
|
||||||
[num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device
|
if variable_sized: # variable_sized=True
|
||||||
)
|
new_h = min(
|
||||||
patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0)
|
image_height,
|
||||||
assert num_patches == patches.shape[0]
|
math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height,
|
||||||
```
|
)
|
||||||
|
new_w = min(
|
||||||
|
image_width,
|
||||||
|
math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width,
|
||||||
|
)
|
||||||
|
image = image[:, :new_h, :new_w]
|
||||||
|
image_height, image_width = new_h, new_w
|
||||||
|
|
||||||
|
num_patches = self.get_num_patches(image_height=image_height, image_width=image_width)
|
||||||
|
tensor_of_image_ids = torch.full(
|
||||||
|
[num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device
|
||||||
|
)
|
||||||
|
patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0)
|
||||||
|
assert num_patches == patches.shape[0]
|
||||||
|
```
|
||||||
|
|
||||||
The number of patches is in turn defined by `FuyuImageProcessor.get_num_patches`:
|
The number of patches is in turn defined by `FuyuImageProcessor.get_num_patches`:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L552-L562
|
|
||||||
patch_size = patch_size if patch_size is not None else self.patch_size
|
|
||||||
patch_height, patch_width = self.patch_size["height"], self.patch_size["width"]
|
|
||||||
|
|
||||||
if image_height % patch_height != 0:
|
```python
|
||||||
raise ValueError(f"{image_height=} must be divisible by {patch_height}")
|
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L552-L562
|
||||||
if image_width % patch_width != 0:
|
patch_size = patch_size if patch_size is not None else self.patch_size
|
||||||
raise ValueError(f"{image_width=} must be divisible by {patch_width}")
|
patch_height, patch_width = self.patch_size["height"], self.patch_size["width"]
|
||||||
|
|
||||||
num_patches_per_dim_h = image_height // patch_height
|
if image_height % patch_height != 0:
|
||||||
num_patches_per_dim_w = image_width // patch_width
|
raise ValueError(f"{image_height=} must be divisible by {patch_height}")
|
||||||
num_patches = num_patches_per_dim_h * num_patches_per_dim_w
|
if image_width % patch_width != 0:
|
||||||
```
|
raise ValueError(f"{image_width=} must be divisible by {patch_width}")
|
||||||
|
|
||||||
|
num_patches_per_dim_h = image_height // patch_height
|
||||||
|
num_patches_per_dim_w = image_width // patch_width
|
||||||
|
num_patches = num_patches_per_dim_h * num_patches_per_dim_w
|
||||||
|
```
|
||||||
|
|
||||||
These image patches correspond to placeholder tokens (`|SPEAKER|`). So, we just need to maximize the number of image patches. Since input images are first resized
|
These image patches correspond to placeholder tokens (`|SPEAKER|`). So, we just need to maximize the number of image patches. Since input images are first resized
|
||||||
to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`.
|
to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`.
|
||||||
@ -419,23 +441,25 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
|
|||||||
|
|
||||||
For the multimodal image profiling data, the logic is very similar to LLaVA:
|
For the multimodal image profiling data, the logic is very similar to LLaVA:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
def get_dummy_mm_data(
|
|
||||||
self,
|
|
||||||
seq_len: int,
|
|
||||||
mm_counts: Mapping[str, int],
|
|
||||||
) -> MultiModalDataDict:
|
|
||||||
target_width, target_height = \
|
|
||||||
self.info.get_image_size_with_most_features()
|
|
||||||
num_images = mm_counts.get("image", 0)
|
|
||||||
|
|
||||||
return {
|
```python
|
||||||
"image":
|
def get_dummy_mm_data(
|
||||||
self._get_dummy_images(width=target_width,
|
self,
|
||||||
height=target_height,
|
seq_len: int,
|
||||||
num_images=num_images)
|
mm_counts: Mapping[str, int],
|
||||||
}
|
) -> MultiModalDataDict:
|
||||||
```
|
target_width, target_height = \
|
||||||
|
self.info.get_image_size_with_most_features()
|
||||||
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"image":
|
||||||
|
self._get_dummy_images(width=target_width,
|
||||||
|
height=target_height,
|
||||||
|
num_images=num_images)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
## 4. Specify processing details
|
## 4. Specify processing details
|
||||||
|
|
||||||
@ -455,6 +479,7 @@ return a schema of the tensors outputted by the HF processor that are related to
|
|||||||
The output of `CLIPImageProcessor` is a simple tensor with shape
|
The output of `CLIPImageProcessor` is a simple tensor with shape
|
||||||
`(num_images, num_channels, image_height, image_width)`:
|
`(num_images, num_channels, image_height, image_width)`:
|
||||||
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/image_processing_clip.py#L339-L345
|
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/image_processing_clip.py#L339-L345
|
||||||
images = [
|
images = [
|
||||||
@ -505,35 +530,37 @@ return a schema of the tensors outputted by the HF processor that are related to
|
|||||||
In order to support the use of [MultiModalFieldConfig.batched][] like in LLaVA,
|
In order to support the use of [MultiModalFieldConfig.batched][] like in LLaVA,
|
||||||
we remove the extra batch dimension by overriding [BaseMultiModalProcessor._call_hf_processor][]:
|
we remove the extra batch dimension by overriding [BaseMultiModalProcessor._call_hf_processor][]:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
def _call_hf_processor(
|
|
||||||
self,
|
|
||||||
prompt: str,
|
|
||||||
mm_data: Mapping[str, object],
|
|
||||||
mm_kwargs: Mapping[str, object],
|
|
||||||
) -> BatchFeature:
|
|
||||||
processed_outputs = super()._call_hf_processor(
|
|
||||||
prompt=prompt,
|
|
||||||
mm_data=mm_data,
|
|
||||||
mm_kwargs=mm_kwargs,
|
|
||||||
)
|
|
||||||
|
|
||||||
image_patches = processed_outputs.get("image_patches")
|
```python
|
||||||
if image_patches is not None:
|
def _call_hf_processor(
|
||||||
images = mm_data["images"]
|
self,
|
||||||
assert isinstance(images, list)
|
prompt: str,
|
||||||
|
mm_data: Mapping[str, object],
|
||||||
|
mm_kwargs: Mapping[str, object],
|
||||||
|
) -> BatchFeature:
|
||||||
|
processed_outputs = super()._call_hf_processor(
|
||||||
|
prompt=prompt,
|
||||||
|
mm_data=mm_data,
|
||||||
|
mm_kwargs=mm_kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
# Original output: (1, num_images, Pn, Px * Py * C)
|
image_patches = processed_outputs.get("image_patches")
|
||||||
# New output: (num_images, Pn, Px * Py * C)
|
if image_patches is not None:
|
||||||
assert (isinstance(image_patches, list)
|
images = mm_data["images"]
|
||||||
and len(image_patches) == 1)
|
assert isinstance(images, list)
|
||||||
assert (isinstance(image_patches[0], torch.Tensor)
|
|
||||||
and len(image_patches[0]) == len(images))
|
|
||||||
|
|
||||||
processed_outputs["image_patches"] = image_patches[0]
|
# Original output: (1, num_images, Pn, Px * Py * C)
|
||||||
|
# New output: (num_images, Pn, Px * Py * C)
|
||||||
|
assert (isinstance(image_patches, list)
|
||||||
|
and len(image_patches) == 1)
|
||||||
|
assert (isinstance(image_patches[0], torch.Tensor)
|
||||||
|
and len(image_patches[0]) == len(images))
|
||||||
|
|
||||||
return processed_outputs
|
processed_outputs["image_patches"] = image_patches[0]
|
||||||
```
|
|
||||||
|
return processed_outputs
|
||||||
|
```
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling
|
Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling
|
||||||
@ -573,35 +600,37 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
|
|||||||
It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`).
|
It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`).
|
||||||
Based on this, we override [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] as follows:
|
Based on this, we override [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] as follows:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
def _get_prompt_updates(
|
|
||||||
self,
|
|
||||||
mm_items: MultiModalDataItems,
|
|
||||||
hf_processor_mm_kwargs: Mapping[str, object],
|
|
||||||
out_mm_kwargs: MultiModalKwargs,
|
|
||||||
) -> Sequence[PromptUpdate]:
|
|
||||||
hf_config = self.info.get_hf_config()
|
|
||||||
image_token_id = hf_config.image_token_index
|
|
||||||
|
|
||||||
def get_replacement(item_idx: int):
|
```python
|
||||||
images = mm_items.get_items("image", ImageProcessorItems)
|
def _get_prompt_updates(
|
||||||
|
self,
|
||||||
|
mm_items: MultiModalDataItems,
|
||||||
|
hf_processor_mm_kwargs: Mapping[str, object],
|
||||||
|
out_mm_kwargs: MultiModalKwargs,
|
||||||
|
) -> Sequence[PromptUpdate]:
|
||||||
|
hf_config = self.info.get_hf_config()
|
||||||
|
image_token_id = hf_config.image_token_index
|
||||||
|
|
||||||
image_size = images.get_image_size(item_idx)
|
def get_replacement(item_idx: int):
|
||||||
num_image_tokens = self.info.get_num_image_tokens(
|
images = mm_items.get_items("image", ImageProcessorItems)
|
||||||
image_width=image_size.width,
|
|
||||||
image_height=image_size.height,
|
|
||||||
)
|
|
||||||
|
|
||||||
return [image_token_id] * num_image_tokens
|
image_size = images.get_image_size(item_idx)
|
||||||
|
num_image_tokens = self.info.get_num_image_tokens(
|
||||||
|
image_width=image_size.width,
|
||||||
|
image_height=image_size.height,
|
||||||
|
)
|
||||||
|
|
||||||
return [
|
return [image_token_id] * num_image_tokens
|
||||||
PromptReplacement(
|
|
||||||
modality="image",
|
return [
|
||||||
target=[image_token_id],
|
PromptReplacement(
|
||||||
replacement=get_replacement,
|
modality="image",
|
||||||
),
|
target=[image_token_id],
|
||||||
]
|
replacement=get_replacement,
|
||||||
```
|
),
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
=== "Handling additional tokens: Fuyu"
|
=== "Handling additional tokens: Fuyu"
|
||||||
|
|
||||||
@ -616,117 +645,90 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
|
|||||||
|
|
||||||
We define a helper function to return `ncols` and `nrows` directly:
|
We define a helper function to return `ncols` and `nrows` directly:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
def get_image_feature_grid_size(
|
|
||||||
self,
|
|
||||||
*,
|
|
||||||
image_width: int,
|
|
||||||
image_height: int,
|
|
||||||
) -> tuple[int, int]:
|
|
||||||
image_processor = self.get_image_processor()
|
|
||||||
target_width = image_processor.size["width"]
|
|
||||||
target_height = image_processor.size["height"]
|
|
||||||
patch_width = image_processor.patch_size["width"]
|
|
||||||
patch_height = image_processor.patch_size["height"]
|
|
||||||
|
|
||||||
if not (image_width <= target_width and image_height <= target_height):
|
```python
|
||||||
height_scale_factor = target_height / image_height
|
def get_image_feature_grid_size(
|
||||||
width_scale_factor = target_width / image_width
|
self,
|
||||||
optimal_scale_factor = min(height_scale_factor, width_scale_factor)
|
*,
|
||||||
|
image_width: int,
|
||||||
|
image_height: int,
|
||||||
|
) -> tuple[int, int]:
|
||||||
|
image_processor = self.get_image_processor()
|
||||||
|
target_width = image_processor.size["width"]
|
||||||
|
target_height = image_processor.size["height"]
|
||||||
|
patch_width = image_processor.patch_size["width"]
|
||||||
|
patch_height = image_processor.patch_size["height"]
|
||||||
|
|
||||||
image_height = int(image_height * optimal_scale_factor)
|
if not (image_width <= target_width and image_height <= target_height):
|
||||||
image_width = int(image_width * optimal_scale_factor)
|
height_scale_factor = target_height / image_height
|
||||||
|
width_scale_factor = target_width / image_width
|
||||||
|
optimal_scale_factor = min(height_scale_factor, width_scale_factor)
|
||||||
|
|
||||||
ncols = math.ceil(image_width / patch_width)
|
image_height = int(image_height * optimal_scale_factor)
|
||||||
nrows = math.ceil(image_height / patch_height)
|
image_width = int(image_width * optimal_scale_factor)
|
||||||
return ncols, nrows
|
|
||||||
```
|
ncols = math.ceil(image_width / patch_width)
|
||||||
|
nrows = math.ceil(image_height / patch_height)
|
||||||
|
return ncols, nrows
|
||||||
|
```
|
||||||
|
|
||||||
Based on this, we can initially define our replacement tokens as:
|
Based on this, we can initially define our replacement tokens as:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
def get_replacement(item_idx: int):
|
|
||||||
images = mm_items.get_items("image", ImageProcessorItems)
|
|
||||||
image_size = images.get_image_size(item_idx)
|
|
||||||
|
|
||||||
ncols, nrows = self.info.get_image_feature_grid_size(
|
```python
|
||||||
image_width=image_size.width,
|
def get_replacement(item_idx: int):
|
||||||
image_height=image_size.height,
|
images = mm_items.get_items("image", ImageProcessorItems)
|
||||||
)
|
image_size = images.get_image_size(item_idx)
|
||||||
|
|
||||||
# `_IMAGE_TOKEN_ID` corresponds to `|SPEAKER|`
|
ncols, nrows = self.info.get_image_feature_grid_size(
|
||||||
# `_NEWLINE_TOKEN_ID` corresponds to `|NEWLINE|`
|
image_width=image_size.width,
|
||||||
return ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
|
image_height=image_size.height,
|
||||||
```
|
)
|
||||||
|
|
||||||
|
# `_IMAGE_TOKEN_ID` corresponds to `|SPEAKER|`
|
||||||
|
# `_NEWLINE_TOKEN_ID` corresponds to `|NEWLINE|`
|
||||||
|
return ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
|
||||||
|
```
|
||||||
|
|
||||||
However, this is not entirely correct. After `FuyuImageProcessor.preprocess_with_tokenizer_info` is called,
|
However, this is not entirely correct. After `FuyuImageProcessor.preprocess_with_tokenizer_info` is called,
|
||||||
a BOS token (`<s>`) is also added to the prompt:
|
a BOS token (`<s>`) is also added to the prompt:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L435
|
|
||||||
model_image_input = self.image_processor.preprocess_with_tokenizer_info(
|
```python
|
||||||
image_input=tensor_batch_images,
|
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L435
|
||||||
image_present=image_present,
|
model_image_input = self.image_processor.preprocess_with_tokenizer_info(
|
||||||
image_unpadded_h=image_unpadded_heights,
|
image_input=tensor_batch_images,
|
||||||
image_unpadded_w=image_unpadded_widths,
|
image_present=image_present,
|
||||||
image_placeholder_id=image_placeholder_id,
|
image_unpadded_h=image_unpadded_heights,
|
||||||
image_newline_id=image_newline_id,
|
image_unpadded_w=image_unpadded_widths,
|
||||||
variable_sized=True,
|
image_placeholder_id=image_placeholder_id,
|
||||||
)
|
image_newline_id=image_newline_id,
|
||||||
prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
|
variable_sized=True,
|
||||||
tokenizer=self.tokenizer,
|
)
|
||||||
prompts=prompts,
|
prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
|
||||||
scale_factors=scale_factors,
|
tokenizer=self.tokenizer,
|
||||||
max_tokens_to_generate=self.max_tokens_to_generate,
|
prompts=prompts,
|
||||||
max_position_embeddings=self.max_position_embeddings,
|
scale_factors=scale_factors,
|
||||||
add_BOS=True,
|
max_tokens_to_generate=self.max_tokens_to_generate,
|
||||||
add_beginning_of_answer_token=True,
|
max_position_embeddings=self.max_position_embeddings,
|
||||||
)
|
add_BOS=True,
|
||||||
```
|
add_beginning_of_answer_token=True,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
To assign the vision embeddings to only the image tokens, instead of a string,
|
To assign the vision embeddings to only the image tokens, instead of a string,
|
||||||
you can return an instance of [PromptUpdateDetails][vllm.multimodal.processing.PromptUpdateDetails]:
|
you can return an instance of [PromptUpdateDetails][vllm.multimodal.processing.PromptUpdateDetails]:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
hf_config = self.info.get_hf_config()
|
|
||||||
bos_token_id = hf_config.bos_token_id # `<s>`
|
|
||||||
assert isinstance(bos_token_id, int)
|
|
||||||
|
|
||||||
def get_replacement_fuyu(item_idx: int):
|
```python
|
||||||
images = mm_items.get_items("image", ImageProcessorItems)
|
|
||||||
image_size = images.get_image_size(item_idx)
|
|
||||||
|
|
||||||
ncols, nrows = self.info.get_image_feature_grid_size(
|
|
||||||
image_width=image_size.width,
|
|
||||||
image_height=image_size.height,
|
|
||||||
)
|
|
||||||
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
|
|
||||||
[_NEWLINE_TOKEN_ID]) * nrows
|
|
||||||
|
|
||||||
return PromptUpdateDetails.select_token_id(
|
|
||||||
image_tokens + [bos_token_id],
|
|
||||||
embed_token_id=_IMAGE_TOKEN_ID,
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the tokenized prompt,
|
|
||||||
we can search for it to conduct the replacement at the start of the string:
|
|
||||||
|
|
||||||
```python
|
|
||||||
def _get_prompt_updates(
|
|
||||||
self,
|
|
||||||
mm_items: MultiModalDataItems,
|
|
||||||
hf_processor_mm_kwargs: Mapping[str, object],
|
|
||||||
out_mm_kwargs: MultiModalKwargs,
|
|
||||||
) -> Sequence[PromptUpdate]:
|
|
||||||
hf_config = self.info.get_hf_config()
|
hf_config = self.info.get_hf_config()
|
||||||
bos_token_id = hf_config.bos_token_id
|
bos_token_id = hf_config.bos_token_id # `<s>`
|
||||||
assert isinstance(bos_token_id, int)
|
assert isinstance(bos_token_id, int)
|
||||||
|
|
||||||
tokenizer = self.info.get_tokenizer()
|
|
||||||
eot_token_id = tokenizer.bos_token_id
|
|
||||||
assert isinstance(eot_token_id, int)
|
|
||||||
|
|
||||||
def get_replacement_fuyu(item_idx: int):
|
def get_replacement_fuyu(item_idx: int):
|
||||||
images = mm_items.get_items("image", ImageProcessorItems)
|
images = mm_items.get_items("image", ImageProcessorItems)
|
||||||
image_size = images.get_image_size(item_idx)
|
image_size = images.get_image_size(item_idx)
|
||||||
@ -742,15 +744,52 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
|
|||||||
image_tokens + [bos_token_id],
|
image_tokens + [bos_token_id],
|
||||||
embed_token_id=_IMAGE_TOKEN_ID,
|
embed_token_id=_IMAGE_TOKEN_ID,
|
||||||
)
|
)
|
||||||
|
```
|
||||||
|
|
||||||
return [
|
Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the tokenized prompt,
|
||||||
PromptReplacement(
|
we can search for it to conduct the replacement at the start of the string:
|
||||||
modality="image",
|
|
||||||
target=[eot_token_id],
|
??? Code
|
||||||
replacement=get_replacement_fuyu,
|
|
||||||
)
|
```python
|
||||||
]
|
def _get_prompt_updates(
|
||||||
```
|
self,
|
||||||
|
mm_items: MultiModalDataItems,
|
||||||
|
hf_processor_mm_kwargs: Mapping[str, object],
|
||||||
|
out_mm_kwargs: MultiModalKwargs,
|
||||||
|
) -> Sequence[PromptUpdate]:
|
||||||
|
hf_config = self.info.get_hf_config()
|
||||||
|
bos_token_id = hf_config.bos_token_id
|
||||||
|
assert isinstance(bos_token_id, int)
|
||||||
|
|
||||||
|
tokenizer = self.info.get_tokenizer()
|
||||||
|
eot_token_id = tokenizer.bos_token_id
|
||||||
|
assert isinstance(eot_token_id, int)
|
||||||
|
|
||||||
|
def get_replacement_fuyu(item_idx: int):
|
||||||
|
images = mm_items.get_items("image", ImageProcessorItems)
|
||||||
|
image_size = images.get_image_size(item_idx)
|
||||||
|
|
||||||
|
ncols, nrows = self.info.get_image_feature_grid_size(
|
||||||
|
image_width=image_size.width,
|
||||||
|
image_height=image_size.height,
|
||||||
|
)
|
||||||
|
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
|
||||||
|
[_NEWLINE_TOKEN_ID]) * nrows
|
||||||
|
|
||||||
|
return PromptUpdateDetails.select_token_id(
|
||||||
|
image_tokens + [bos_token_id],
|
||||||
|
embed_token_id=_IMAGE_TOKEN_ID,
|
||||||
|
)
|
||||||
|
|
||||||
|
return [
|
||||||
|
PromptReplacement(
|
||||||
|
modality="image",
|
||||||
|
target=[eot_token_id],
|
||||||
|
replacement=get_replacement_fuyu,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
## 5. Register processor-related classes
|
## 5. Register processor-related classes
|
||||||
|
|
||||||
|
|||||||
@ -97,26 +97,26 @@ to manually kill the profiler and generate your `nsys-rep` report.
|
|||||||
|
|
||||||
You can view these profiles either as summaries in the CLI, using `nsys stats [profile-file]`, or in the GUI by installing Nsight [locally following the directions here](https://developer.nvidia.com/nsight-systems/get-started).
|
You can view these profiles either as summaries in the CLI, using `nsys stats [profile-file]`, or in the GUI by installing Nsight [locally following the directions here](https://developer.nvidia.com/nsight-systems/get-started).
|
||||||
|
|
||||||
CLI example:
|
??? CLI example
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
nsys stats report1.nsys-rep
|
nsys stats report1.nsys-rep
|
||||||
...
|
...
|
||||||
** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):
|
** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):
|
||||||
|
|
||||||
Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
|
Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
|
||||||
-------- --------------- --------- ----------- ----------- -------- --------- ----------- ----------------------------------------------------------------------------------------------------
|
-------- --------------- --------- ----------- ----------- -------- --------- ----------- ----------------------------------------------------------------------------------------------------
|
||||||
46.3 10,327,352,338 17,505 589,965.9 144,383.0 27,040 3,126,460 944,263.8 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize128x128x64_warpgroupsize1x1x1_execute_segment_k_of…
|
46.3 10,327,352,338 17,505 589,965.9 144,383.0 27,040 3,126,460 944,263.8 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize128x128x64_warpgroupsize1x1x1_execute_segment_k_of…
|
||||||
14.8 3,305,114,764 5,152 641,520.7 293,408.0 287,296 2,822,716 867,124.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize256x128x64_warpgroupsize2x1x1_execute_segment_k_of…
|
14.8 3,305,114,764 5,152 641,520.7 293,408.0 287,296 2,822,716 867,124.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize256x128x64_warpgroupsize2x1x1_execute_segment_k_of…
|
||||||
12.1 2,692,284,876 14,280 188,535.4 83,904.0 19,328 2,862,237 497,999.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize64x128x64_warpgroupsize1x1x1_execute_segment_k_off…
|
12.1 2,692,284,876 14,280 188,535.4 83,904.0 19,328 2,862,237 497,999.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize64x128x64_warpgroupsize1x1x1_execute_segment_k_off…
|
||||||
9.5 2,116,600,578 33,920 62,399.8 21,504.0 15,326 2,532,285 290,954.1 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize64x64x64_warpgroupsize1x1x1_execute_segment_k_off_…
|
9.5 2,116,600,578 33,920 62,399.8 21,504.0 15,326 2,532,285 290,954.1 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize64x64x64_warpgroupsize1x1x1_execute_segment_k_off_…
|
||||||
5.0 1,119,749,165 18,912 59,208.4 9,056.0 6,784 2,578,366 271,581.7 void vllm::act_and_mul_kernel<c10::BFloat16, &vllm::silu_kernel<c10::BFloat16>, (bool)1>(T1 *, cons…
|
5.0 1,119,749,165 18,912 59,208.4 9,056.0 6,784 2,578,366 271,581.7 void vllm::act_and_mul_kernel<c10::BFloat16, &vllm::silu_kernel<c10::BFloat16>, (bool)1>(T1 *, cons…
|
||||||
4.1 916,662,515 21,312 43,011.6 19,776.0 8,928 2,586,205 199,790.1 void cutlass::device_kernel<flash::enable_sm90_or_later<flash::FlashAttnFwdSm90<flash::CollectiveMa…
|
4.1 916,662,515 21,312 43,011.6 19,776.0 8,928 2,586,205 199,790.1 void cutlass::device_kernel<flash::enable_sm90_or_later<flash::FlashAttnFwdSm90<flash::CollectiveMa…
|
||||||
2.6 587,283,113 37,824 15,526.7 3,008.0 2,719 2,517,756 139,091.1 std::enable_if<T2>(int)0&&vllm::_typeConvert<T1>::exists, void>::type vllm::fused_add_rms_norm_kern…
|
2.6 587,283,113 37,824 15,526.7 3,008.0 2,719 2,517,756 139,091.1 std::enable_if<T2>(int)0&&vllm::_typeConvert<T1>::exists, void>::type vllm::fused_add_rms_norm_kern…
|
||||||
1.9 418,362,605 18,912 22,121.5 3,871.0 3,328 2,523,870 175,248.2 void vllm::rotary_embedding_kernel<c10::BFloat16, (bool)1>(const long *, T1 *, T1 *, const T1 *, in…
|
1.9 418,362,605 18,912 22,121.5 3,871.0 3,328 2,523,870 175,248.2 void vllm::rotary_embedding_kernel<c10::BFloat16, (bool)1>(const long *, T1 *, T1 *, const T1 *, in…
|
||||||
0.7 167,083,069 18,880 8,849.7 2,240.0 1,471 2,499,996 101,436.1 void vllm::reshape_and_cache_flash_kernel<__nv_bfloat16, __nv_bfloat16, (vllm::Fp8KVCacheDataType)0…
|
0.7 167,083,069 18,880 8,849.7 2,240.0 1,471 2,499,996 101,436.1 void vllm::reshape_and_cache_flash_kernel<__nv_bfloat16, __nv_bfloat16, (vllm::Fp8KVCacheDataType)0…
|
||||||
...
|
...
|
||||||
```
|
```
|
||||||
|
|
||||||
GUI example:
|
GUI example:
|
||||||
|
|
||||||
|
|||||||
@ -97,19 +97,21 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `--
|
|||||||
flags to speed up build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits.
|
flags to speed up build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits.
|
||||||
Keep an eye on memory usage with parallel jobs as it can be substantial (see example below).
|
Keep an eye on memory usage with parallel jobs as it can be substantial (see example below).
|
||||||
|
|
||||||
```console
|
??? Command
|
||||||
# Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
|
|
||||||
python3 use_existing_torch.py
|
```console
|
||||||
DOCKER_BUILDKIT=1 docker build . \
|
# Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
|
||||||
--file docker/Dockerfile \
|
python3 use_existing_torch.py
|
||||||
--target vllm-openai \
|
DOCKER_BUILDKIT=1 docker build . \
|
||||||
--platform "linux/arm64" \
|
--file docker/Dockerfile \
|
||||||
-t vllm/vllm-gh200-openai:latest \
|
--target vllm-openai \
|
||||||
--build-arg max_jobs=66 \
|
--platform "linux/arm64" \
|
||||||
--build-arg nvcc_threads=2 \
|
-t vllm/vllm-gh200-openai:latest \
|
||||||
--build-arg torch_cuda_arch_list="9.0 10.0+PTX" \
|
--build-arg max_jobs=66 \
|
||||||
--build-arg vllm_fa_cmake_gpu_arches="90-real"
|
--build-arg nvcc_threads=2 \
|
||||||
```
|
--build-arg torch_cuda_arch_list="9.0 10.0+PTX" \
|
||||||
|
--build-arg vllm_fa_cmake_gpu_arches="90-real"
|
||||||
|
```
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
If you are building the `linux/arm64` image on a non-ARM host (e.g., an x86_64 machine), you need to ensure your system is set up for cross-compilation using QEMU. This allows your host machine to emulate ARM64 execution.
|
If you are building the `linux/arm64` image on a non-ARM host (e.g., an x86_64 machine), you need to ensure your system is set up for cross-compilation using QEMU. This allows your host machine to emulate ARM64 execution.
|
||||||
|
|||||||
@ -30,51 +30,53 @@ python -m vllm.entrypoints.openai.api_server \
|
|||||||
|
|
||||||
- Call it with AutoGen:
|
- Call it with AutoGen:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
import asyncio
|
|
||||||
from autogen_core.models import UserMessage
|
```python
|
||||||
from autogen_ext.models.openai import OpenAIChatCompletionClient
|
import asyncio
|
||||||
from autogen_core.models import ModelFamily
|
from autogen_core.models import UserMessage
|
||||||
|
from autogen_ext.models.openai import OpenAIChatCompletionClient
|
||||||
|
from autogen_core.models import ModelFamily
|
||||||
|
|
||||||
|
|
||||||
async def main() -> None:
|
async def main() -> None:
|
||||||
# Create a model client
|
# Create a model client
|
||||||
model_client = OpenAIChatCompletionClient(
|
model_client = OpenAIChatCompletionClient(
|
||||||
model="mistralai/Mistral-7B-Instruct-v0.2",
|
model="mistralai/Mistral-7B-Instruct-v0.2",
|
||||||
base_url="http://{your-vllm-host-ip}:{your-vllm-host-port}/v1",
|
base_url="http://{your-vllm-host-ip}:{your-vllm-host-port}/v1",
|
||||||
api_key="EMPTY",
|
api_key="EMPTY",
|
||||||
model_info={
|
model_info={
|
||||||
"vision": False,
|
"vision": False,
|
||||||
"function_calling": False,
|
"function_calling": False,
|
||||||
"json_output": False,
|
"json_output": False,
|
||||||
"family": ModelFamily.MISTRAL,
|
"family": ModelFamily.MISTRAL,
|
||||||
"structured_output": True,
|
"structured_output": True,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
messages = [UserMessage(content="Write a very short story about a dragon.", source="user")]
|
messages = [UserMessage(content="Write a very short story about a dragon.", source="user")]
|
||||||
|
|
||||||
# Create a stream.
|
# Create a stream.
|
||||||
stream = model_client.create_stream(messages=messages)
|
stream = model_client.create_stream(messages=messages)
|
||||||
|
|
||||||
# Iterate over the stream and print the responses.
|
# Iterate over the stream and print the responses.
|
||||||
print("Streamed responses:")
|
print("Streamed responses:")
|
||||||
async for response in stream:
|
async for response in stream:
|
||||||
if isinstance(response, str):
|
if isinstance(response, str):
|
||||||
# A partial response is a string.
|
# A partial response is a string.
|
||||||
print(response, flush=True, end="")
|
print(response, flush=True, end="")
|
||||||
else:
|
else:
|
||||||
# The last response is a CreateResult object with the complete message.
|
# The last response is a CreateResult object with the complete message.
|
||||||
print("\n\n------------\n")
|
print("\n\n------------\n")
|
||||||
print("The complete response:", flush=True)
|
print("The complete response:", flush=True)
|
||||||
print(response.content, flush=True)
|
print(response.content, flush=True)
|
||||||
|
|
||||||
# Close the client when done.
|
# Close the client when done.
|
||||||
await model_client.close()
|
await model_client.close()
|
||||||
|
|
||||||
|
|
||||||
asyncio.run(main())
|
asyncio.run(main())
|
||||||
```
|
```
|
||||||
|
|
||||||
For details, see the tutorial:
|
For details, see the tutorial:
|
||||||
|
|
||||||
|
|||||||
@ -34,25 +34,27 @@ vllm = "latest"
|
|||||||
|
|
||||||
Next, let us add the code that handles inference for the LLM of your choice (`mistralai/Mistral-7B-Instruct-v0.1` in this example). Add the following code to your `main.py`:
|
Next, let us add the code that handles inference for the LLM of your choice (`mistralai/Mistral-7B-Instruct-v0.1` in this example). Add the following code to your `main.py`:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from vllm import LLM, SamplingParams
|
|
||||||
|
|
||||||
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1")
|
```python
|
||||||
|
from vllm import LLM, SamplingParams
|
||||||
|
|
||||||
def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95):
|
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1")
|
||||||
|
|
||||||
sampling_params = SamplingParams(temperature=temperature, top_p=top_p)
|
def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95):
|
||||||
outputs = llm.generate(prompts, sampling_params)
|
|
||||||
|
|
||||||
# Print the outputs.
|
sampling_params = SamplingParams(temperature=temperature, top_p=top_p)
|
||||||
results = []
|
outputs = llm.generate(prompts, sampling_params)
|
||||||
for output in outputs:
|
|
||||||
prompt = output.prompt
|
|
||||||
generated_text = output.outputs[0].text
|
|
||||||
results.append({"prompt": prompt, "generated_text": generated_text})
|
|
||||||
|
|
||||||
return {"results": results}
|
# Print the outputs.
|
||||||
```
|
results = []
|
||||||
|
for output in outputs:
|
||||||
|
prompt = output.prompt
|
||||||
|
generated_text = output.outputs[0].text
|
||||||
|
results.append({"prompt": prompt, "generated_text": generated_text})
|
||||||
|
|
||||||
|
return {"results": results}
|
||||||
|
```
|
||||||
|
|
||||||
Then, run the following code to deploy it to the cloud:
|
Then, run the following code to deploy it to the cloud:
|
||||||
|
|
||||||
@ -62,47 +64,51 @@ cerebrium deploy
|
|||||||
|
|
||||||
If successful, you should be returned a CURL command that you can use to run inference. Just remember to end the URL with the name of the function you are calling (in our case, `/run`).
|
If successful, you should be returned a CURL command that you can use to run inference. Just remember to end the URL with the name of the function you are calling (in our case, `/run`).
|
||||||
|
|
||||||
```python
|
??? Command
|
||||||
curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
|
|
||||||
-H 'Content-Type: application/json' \
|
```python
|
||||||
-H 'Authorization: <JWT TOKEN>' \
|
curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
|
||||||
--data '{
|
-H 'Content-Type: application/json' \
|
||||||
"prompts": [
|
-H 'Authorization: <JWT TOKEN>' \
|
||||||
"Hello, my name is",
|
--data '{
|
||||||
"The president of the United States is",
|
"prompts": [
|
||||||
"The capital of France is",
|
"Hello, my name is",
|
||||||
"The future of AI is"
|
"The president of the United States is",
|
||||||
]
|
"The capital of France is",
|
||||||
}'
|
"The future of AI is"
|
||||||
```
|
]
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
You should get a response like:
|
You should get a response like:
|
||||||
|
|
||||||
```python
|
??? Response
|
||||||
{
|
|
||||||
"run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
|
```python
|
||||||
"result": {
|
{
|
||||||
"result": [
|
"run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
|
||||||
{
|
"result": {
|
||||||
"prompt": "Hello, my name is",
|
"result": [
|
||||||
"generated_text": " Sarah, and I'm a teacher. I teach elementary school students. One of"
|
{
|
||||||
},
|
"prompt": "Hello, my name is",
|
||||||
{
|
"generated_text": " Sarah, and I'm a teacher. I teach elementary school students. One of"
|
||||||
"prompt": "The president of the United States is",
|
},
|
||||||
"generated_text": " elected every four years. This is a democratic system.\n\n5. What"
|
{
|
||||||
},
|
"prompt": "The president of the United States is",
|
||||||
{
|
"generated_text": " elected every four years. This is a democratic system.\n\n5. What"
|
||||||
"prompt": "The capital of France is",
|
},
|
||||||
"generated_text": " Paris.\n"
|
{
|
||||||
},
|
"prompt": "The capital of France is",
|
||||||
{
|
"generated_text": " Paris.\n"
|
||||||
"prompt": "The future of AI is",
|
},
|
||||||
"generated_text": " bright, but it's important to approach it with a balanced and nuanced perspective."
|
{
|
||||||
}
|
"prompt": "The future of AI is",
|
||||||
]
|
"generated_text": " bright, but it's important to approach it with a balanced and nuanced perspective."
|
||||||
},
|
}
|
||||||
"run_time_ms": 152.53663063049316
|
]
|
||||||
}
|
},
|
||||||
```
|
"run_time_ms": 152.53663063049316
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
You now have an autoscaling endpoint where you only pay for the compute you use!
|
You now have an autoscaling endpoint where you only pay for the compute you use!
|
||||||
|
|||||||
@ -26,75 +26,81 @@ dstack init
|
|||||||
|
|
||||||
Next, to provision a VM instance with LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`:
|
Next, to provision a VM instance with LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`:
|
||||||
|
|
||||||
```yaml
|
??? Config
|
||||||
type: service
|
|
||||||
|
|
||||||
python: "3.11"
|
```yaml
|
||||||
env:
|
type: service
|
||||||
- MODEL=NousResearch/Llama-2-7b-chat-hf
|
|
||||||
port: 8000
|
python: "3.11"
|
||||||
resources:
|
env:
|
||||||
gpu: 24GB
|
- MODEL=NousResearch/Llama-2-7b-chat-hf
|
||||||
commands:
|
port: 8000
|
||||||
- pip install vllm
|
resources:
|
||||||
- vllm serve $MODEL --port 8000
|
gpu: 24GB
|
||||||
model:
|
commands:
|
||||||
format: openai
|
- pip install vllm
|
||||||
type: chat
|
- vllm serve $MODEL --port 8000
|
||||||
name: NousResearch/Llama-2-7b-chat-hf
|
model:
|
||||||
```
|
format: openai
|
||||||
|
type: chat
|
||||||
|
name: NousResearch/Llama-2-7b-chat-hf
|
||||||
|
```
|
||||||
|
|
||||||
Then, run the following CLI for provisioning:
|
Then, run the following CLI for provisioning:
|
||||||
|
|
||||||
```console
|
??? Command
|
||||||
$ dstack run . -f serve.dstack.yml
|
|
||||||
|
|
||||||
⠸ Getting run plan...
|
```console
|
||||||
Configuration serve.dstack.yml
|
$ dstack run . -f serve.dstack.yml
|
||||||
Project deep-diver-main
|
|
||||||
User deep-diver
|
|
||||||
Min resources 2..xCPU, 8GB.., 1xGPU (24GB)
|
|
||||||
Max price -
|
|
||||||
Max duration -
|
|
||||||
Spot policy auto
|
|
||||||
Retry policy no
|
|
||||||
|
|
||||||
# BACKEND REGION INSTANCE RESOURCES SPOT PRICE
|
⠸ Getting run plan...
|
||||||
1 gcp us-central1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804
|
Configuration serve.dstack.yml
|
||||||
2 gcp us-east1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804
|
Project deep-diver-main
|
||||||
3 gcp us-west1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804
|
User deep-diver
|
||||||
...
|
Min resources 2..xCPU, 8GB.., 1xGPU (24GB)
|
||||||
Shown 3 of 193 offers, $5.876 max
|
Max price -
|
||||||
|
Max duration -
|
||||||
|
Spot policy auto
|
||||||
|
Retry policy no
|
||||||
|
|
||||||
Continue? [y/n]: y
|
# BACKEND REGION INSTANCE RESOURCES SPOT PRICE
|
||||||
⠙ Submitting run...
|
1 gcp us-central1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804
|
||||||
⠏ Launching spicy-treefrog-1 (pulling)
|
2 gcp us-east1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804
|
||||||
spicy-treefrog-1 provisioning completed (running)
|
3 gcp us-west1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804
|
||||||
Service is published at ...
|
...
|
||||||
```
|
Shown 3 of 193 offers, $5.876 max
|
||||||
|
|
||||||
|
Continue? [y/n]: y
|
||||||
|
⠙ Submitting run...
|
||||||
|
⠏ Launching spicy-treefrog-1 (pulling)
|
||||||
|
spicy-treefrog-1 provisioning completed (running)
|
||||||
|
Service is published at ...
|
||||||
|
```
|
||||||
|
|
||||||
After the provisioning, you can interact with the model by using the OpenAI SDK:
|
After the provisioning, you can interact with the model by using the OpenAI SDK:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from openai import OpenAI
|
|
||||||
|
|
||||||
client = OpenAI(
|
```python
|
||||||
base_url="https://gateway.<gateway domain>",
|
from openai import OpenAI
|
||||||
api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>"
|
|
||||||
)
|
|
||||||
|
|
||||||
completion = client.chat.completions.create(
|
client = OpenAI(
|
||||||
model="NousResearch/Llama-2-7b-chat-hf",
|
base_url="https://gateway.<gateway domain>",
|
||||||
messages=[
|
api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>"
|
||||||
{
|
)
|
||||||
"role": "user",
|
|
||||||
"content": "Compose a poem that explains the concept of recursion in programming.",
|
|
||||||
}
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
print(completion.choices[0].message.content)
|
completion = client.chat.completions.create(
|
||||||
```
|
model="NousResearch/Llama-2-7b-chat-hf",
|
||||||
|
messages=[
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "Compose a poem that explains the concept of recursion in programming.",
|
||||||
|
}
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
print(completion.choices[0].message.content)
|
||||||
|
```
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision a dstack `Task` instead of a `Service`. The `Task` is for development purposes only. For more hands-on material on how to serve vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm).
|
dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision a dstack `Task` instead of a `Service`. The `Task` is for development purposes only. For more hands-on material on how to serve vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm).
|
||||||
|
|||||||
@ -27,29 +27,29 @@ vllm serve mistralai/Mistral-7B-Instruct-v0.1
|
|||||||
|
|
||||||
- Use the `OpenAIGenerator` and `OpenAIChatGenerator` components in Haystack to query the vLLM server.
|
- Use the `OpenAIGenerator` and `OpenAIChatGenerator` components in Haystack to query the vLLM server.
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from haystack.components.generators.chat import OpenAIChatGenerator
|
|
||||||
from haystack.dataclasses import ChatMessage
|
|
||||||
from haystack.utils import Secret
|
|
||||||
|
|
||||||
generator = OpenAIChatGenerator(
|
```python
|
||||||
# for compatibility with the OpenAI API, a placeholder api_key is needed
|
from haystack.components.generators.chat import OpenAIChatGenerator
|
||||||
api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),
|
from haystack.dataclasses import ChatMessage
|
||||||
model="mistralai/Mistral-7B-Instruct-v0.1",
|
from haystack.utils import Secret
|
||||||
api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
|
|
||||||
generation_kwargs = {"max_tokens": 512}
|
|
||||||
)
|
|
||||||
|
|
||||||
response = generator.run(
|
generator = OpenAIChatGenerator(
|
||||||
messages=[ChatMessage.from_user("Hi. Can you help me plan my next trip to Italy?")]
|
# for compatibility with the OpenAI API, a placeholder api_key is needed
|
||||||
)
|
api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),
|
||||||
|
model="mistralai/Mistral-7B-Instruct-v0.1",
|
||||||
|
api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
|
||||||
|
generation_kwargs = {"max_tokens": 512}
|
||||||
|
)
|
||||||
|
|
||||||
print("-"*30)
|
response = generator.run(
|
||||||
print(response)
|
messages=[ChatMessage.from_user("Hi. Can you help me plan my next trip to Italy?")]
|
||||||
print("-"*30)
|
)
|
||||||
```
|
|
||||||
|
|
||||||
Output e.g.:
|
print("-"*30)
|
||||||
|
print(response)
|
||||||
|
print("-"*30)
|
||||||
|
```
|
||||||
|
|
||||||
```console
|
```console
|
||||||
------------------------------
|
------------------------------
|
||||||
|
|||||||
@ -34,21 +34,23 @@ vllm serve qwen/Qwen1.5-0.5B-Chat
|
|||||||
|
|
||||||
- Call it with litellm:
|
- Call it with litellm:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
import litellm
|
|
||||||
|
|
||||||
messages = [{ "content": "Hello, how are you?","role": "user"}]
|
```python
|
||||||
|
import litellm
|
||||||
|
|
||||||
# hosted_vllm is prefix key word and necessary
|
messages = [{ "content": "Hello, how are you?","role": "user"}]
|
||||||
response = litellm.completion(
|
|
||||||
model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name
|
|
||||||
messages=messages,
|
|
||||||
api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
|
|
||||||
temperature=0.2,
|
|
||||||
max_tokens=80)
|
|
||||||
|
|
||||||
print(response)
|
# hosted_vllm is prefix key word and necessary
|
||||||
```
|
response = litellm.completion(
|
||||||
|
model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name
|
||||||
|
messages=messages,
|
||||||
|
api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
|
||||||
|
temperature=0.2,
|
||||||
|
max_tokens=80)
|
||||||
|
|
||||||
|
print(response)
|
||||||
|
```
|
||||||
|
|
||||||
### Embeddings
|
### Embeddings
|
||||||
|
|
||||||
|
|||||||
@ -17,99 +17,101 @@ vLLM can be deployed with [LWS](https://github.com/kubernetes-sigs/lws) on Kuber
|
|||||||
|
|
||||||
Deploy the following yaml file `lws.yaml`
|
Deploy the following yaml file `lws.yaml`
|
||||||
|
|
||||||
```yaml
|
??? Yaml
|
||||||
apiVersion: leaderworkerset.x-k8s.io/v1
|
|
||||||
kind: LeaderWorkerSet
|
```yaml
|
||||||
metadata:
|
apiVersion: leaderworkerset.x-k8s.io/v1
|
||||||
name: vllm
|
kind: LeaderWorkerSet
|
||||||
spec:
|
metadata:
|
||||||
replicas: 2
|
name: vllm
|
||||||
leaderWorkerTemplate:
|
spec:
|
||||||
size: 2
|
replicas: 2
|
||||||
restartPolicy: RecreateGroupOnPodRestart
|
leaderWorkerTemplate:
|
||||||
leaderTemplate:
|
size: 2
|
||||||
metadata:
|
restartPolicy: RecreateGroupOnPodRestart
|
||||||
labels:
|
leaderTemplate:
|
||||||
role: leader
|
metadata:
|
||||||
spec:
|
labels:
|
||||||
containers:
|
role: leader
|
||||||
- name: vllm-leader
|
spec:
|
||||||
image: docker.io/vllm/vllm-openai:latest
|
containers:
|
||||||
env:
|
- name: vllm-leader
|
||||||
- name: HUGGING_FACE_HUB_TOKEN
|
image: docker.io/vllm/vllm-openai:latest
|
||||||
value: <your-hf-token>
|
env:
|
||||||
command:
|
- name: HUGGING_FACE_HUB_TOKEN
|
||||||
- sh
|
value: <your-hf-token>
|
||||||
- -c
|
command:
|
||||||
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
|
- sh
|
||||||
python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
|
- -c
|
||||||
resources:
|
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
|
||||||
limits:
|
python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
|
||||||
nvidia.com/gpu: "8"
|
resources:
|
||||||
memory: 1124Gi
|
limits:
|
||||||
ephemeral-storage: 800Gi
|
nvidia.com/gpu: "8"
|
||||||
requests:
|
memory: 1124Gi
|
||||||
ephemeral-storage: 800Gi
|
ephemeral-storage: 800Gi
|
||||||
cpu: 125
|
requests:
|
||||||
ports:
|
ephemeral-storage: 800Gi
|
||||||
- containerPort: 8080
|
cpu: 125
|
||||||
readinessProbe:
|
ports:
|
||||||
tcpSocket:
|
- containerPort: 8080
|
||||||
port: 8080
|
readinessProbe:
|
||||||
initialDelaySeconds: 15
|
tcpSocket:
|
||||||
periodSeconds: 10
|
port: 8080
|
||||||
volumeMounts:
|
initialDelaySeconds: 15
|
||||||
- mountPath: /dev/shm
|
periodSeconds: 10
|
||||||
name: dshm
|
volumeMounts:
|
||||||
volumes:
|
- mountPath: /dev/shm
|
||||||
- name: dshm
|
name: dshm
|
||||||
emptyDir:
|
volumes:
|
||||||
medium: Memory
|
- name: dshm
|
||||||
sizeLimit: 15Gi
|
emptyDir:
|
||||||
workerTemplate:
|
medium: Memory
|
||||||
spec:
|
sizeLimit: 15Gi
|
||||||
containers:
|
workerTemplate:
|
||||||
- name: vllm-worker
|
spec:
|
||||||
image: docker.io/vllm/vllm-openai:latest
|
containers:
|
||||||
command:
|
- name: vllm-worker
|
||||||
- sh
|
image: docker.io/vllm/vllm-openai:latest
|
||||||
- -c
|
command:
|
||||||
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
|
- sh
|
||||||
resources:
|
- -c
|
||||||
limits:
|
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
|
||||||
nvidia.com/gpu: "8"
|
resources:
|
||||||
memory: 1124Gi
|
limits:
|
||||||
ephemeral-storage: 800Gi
|
nvidia.com/gpu: "8"
|
||||||
requests:
|
memory: 1124Gi
|
||||||
ephemeral-storage: 800Gi
|
ephemeral-storage: 800Gi
|
||||||
cpu: 125
|
requests:
|
||||||
env:
|
ephemeral-storage: 800Gi
|
||||||
- name: HUGGING_FACE_HUB_TOKEN
|
cpu: 125
|
||||||
value: <your-hf-token>
|
env:
|
||||||
volumeMounts:
|
- name: HUGGING_FACE_HUB_TOKEN
|
||||||
- mountPath: /dev/shm
|
value: <your-hf-token>
|
||||||
name: dshm
|
volumeMounts:
|
||||||
volumes:
|
- mountPath: /dev/shm
|
||||||
- name: dshm
|
name: dshm
|
||||||
emptyDir:
|
volumes:
|
||||||
medium: Memory
|
- name: dshm
|
||||||
sizeLimit: 15Gi
|
emptyDir:
|
||||||
---
|
medium: Memory
|
||||||
apiVersion: v1
|
sizeLimit: 15Gi
|
||||||
kind: Service
|
---
|
||||||
metadata:
|
apiVersion: v1
|
||||||
name: vllm-leader
|
kind: Service
|
||||||
spec:
|
metadata:
|
||||||
ports:
|
name: vllm-leader
|
||||||
- name: http
|
spec:
|
||||||
port: 8080
|
ports:
|
||||||
protocol: TCP
|
- name: http
|
||||||
targetPort: 8080
|
port: 8080
|
||||||
selector:
|
protocol: TCP
|
||||||
leaderworkerset.sigs.k8s.io/name: vllm
|
targetPort: 8080
|
||||||
role: leader
|
selector:
|
||||||
type: ClusterIP
|
leaderworkerset.sigs.k8s.io/name: vllm
|
||||||
```
|
role: leader
|
||||||
|
type: ClusterIP
|
||||||
|
```
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
kubectl apply -f lws.yaml
|
kubectl apply -f lws.yaml
|
||||||
@ -175,25 +177,27 @@ curl http://localhost:8080/v1/completions \
|
|||||||
|
|
||||||
The output should be similar to the following
|
The output should be similar to the following
|
||||||
|
|
||||||
```text
|
??? Output
|
||||||
{
|
|
||||||
"id": "cmpl-1bb34faba88b43f9862cfbfb2200949d",
|
```text
|
||||||
"object": "text_completion",
|
|
||||||
"created": 1715138766,
|
|
||||||
"model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
|
|
||||||
"choices": [
|
|
||||||
{
|
{
|
||||||
"index": 0,
|
"id": "cmpl-1bb34faba88b43f9862cfbfb2200949d",
|
||||||
"text": " top destination for foodies, with",
|
"object": "text_completion",
|
||||||
"logprobs": null,
|
"created": 1715138766,
|
||||||
"finish_reason": "length",
|
"model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
|
||||||
"stop_reason": null
|
"choices": [
|
||||||
|
{
|
||||||
|
"index": 0,
|
||||||
|
"text": " top destination for foodies, with",
|
||||||
|
"logprobs": null,
|
||||||
|
"finish_reason": "length",
|
||||||
|
"stop_reason": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"usage": {
|
||||||
|
"prompt_tokens": 5,
|
||||||
|
"total_tokens": 12,
|
||||||
|
"completion_tokens": 7
|
||||||
|
}
|
||||||
}
|
}
|
||||||
],
|
```
|
||||||
"usage": {
|
|
||||||
"prompt_tokens": 5,
|
|
||||||
"total_tokens": 12,
|
|
||||||
"completion_tokens": 7
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|||||||
@ -24,48 +24,50 @@ sky check
|
|||||||
|
|
||||||
See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm/serve.yaml).
|
See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm/serve.yaml).
|
||||||
|
|
||||||
```yaml
|
??? Yaml
|
||||||
resources:
|
|
||||||
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
|
|
||||||
use_spot: True
|
|
||||||
disk_size: 512 # Ensure model checkpoints can fit.
|
|
||||||
disk_tier: best
|
|
||||||
ports: 8081 # Expose to internet traffic.
|
|
||||||
|
|
||||||
envs:
|
```yaml
|
||||||
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
|
resources:
|
||||||
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
|
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
|
||||||
|
use_spot: True
|
||||||
|
disk_size: 512 # Ensure model checkpoints can fit.
|
||||||
|
disk_tier: best
|
||||||
|
ports: 8081 # Expose to internet traffic.
|
||||||
|
|
||||||
setup: |
|
envs:
|
||||||
conda create -n vllm python=3.10 -y
|
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
|
||||||
conda activate vllm
|
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
|
||||||
|
|
||||||
pip install vllm==0.4.0.post1
|
setup: |
|
||||||
# Install Gradio for web UI.
|
conda create -n vllm python=3.10 -y
|
||||||
pip install gradio openai
|
conda activate vllm
|
||||||
pip install flash-attn==2.5.7
|
|
||||||
|
|
||||||
run: |
|
pip install vllm==0.4.0.post1
|
||||||
conda activate vllm
|
# Install Gradio for web UI.
|
||||||
echo 'Starting vllm api server...'
|
pip install gradio openai
|
||||||
python -u -m vllm.entrypoints.openai.api_server \
|
pip install flash-attn==2.5.7
|
||||||
--port 8081 \
|
|
||||||
--model $MODEL_NAME \
|
|
||||||
--trust-remote-code \
|
|
||||||
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
|
|
||||||
2>&1 | tee api_server.log &
|
|
||||||
|
|
||||||
echo 'Waiting for vllm api server to start...'
|
run: |
|
||||||
while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
|
conda activate vllm
|
||||||
|
echo 'Starting vllm api server...'
|
||||||
|
python -u -m vllm.entrypoints.openai.api_server \
|
||||||
|
--port 8081 \
|
||||||
|
--model $MODEL_NAME \
|
||||||
|
--trust-remote-code \
|
||||||
|
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
|
||||||
|
2>&1 | tee api_server.log &
|
||||||
|
|
||||||
echo 'Starting gradio server...'
|
echo 'Waiting for vllm api server to start...'
|
||||||
git clone https://github.com/vllm-project/vllm.git || true
|
while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
|
||||||
python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
|
|
||||||
-m $MODEL_NAME \
|
echo 'Starting gradio server...'
|
||||||
--port 8811 \
|
git clone https://github.com/vllm-project/vllm.git || true
|
||||||
--model-url http://localhost:8081/v1 \
|
python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
|
||||||
--stop-token-ids 128009,128001
|
-m $MODEL_NAME \
|
||||||
```
|
--port 8811 \
|
||||||
|
--model-url http://localhost:8081/v1 \
|
||||||
|
--stop-token-ids 128009,128001
|
||||||
|
```
|
||||||
|
|
||||||
Start serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...):
|
Start serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...):
|
||||||
|
|
||||||
@ -93,68 +95,67 @@ HF_TOKEN="your-huggingface-token" \
|
|||||||
|
|
||||||
SkyPilot can scale up the service to multiple service replicas with built-in autoscaling, load-balancing and fault-tolerance. You can do it by adding a services section to the YAML file.
|
SkyPilot can scale up the service to multiple service replicas with built-in autoscaling, load-balancing and fault-tolerance. You can do it by adding a services section to the YAML file.
|
||||||
|
|
||||||
```yaml
|
??? Yaml
|
||||||
service:
|
|
||||||
replicas: 2
|
|
||||||
# An actual request for readiness probe.
|
|
||||||
readiness_probe:
|
|
||||||
path: /v1/chat/completions
|
|
||||||
post_data:
|
|
||||||
model: $MODEL_NAME
|
|
||||||
messages:
|
|
||||||
- role: user
|
|
||||||
content: Hello! What is your name?
|
|
||||||
max_completion_tokens: 1
|
|
||||||
```
|
|
||||||
|
|
||||||
<details>
|
```yaml
|
||||||
<summary>Click to see the full recipe YAML</summary>
|
service:
|
||||||
|
replicas: 2
|
||||||
```yaml
|
# An actual request for readiness probe.
|
||||||
service:
|
readiness_probe:
|
||||||
replicas: 2
|
path: /v1/chat/completions
|
||||||
# An actual request for readiness probe.
|
post_data:
|
||||||
readiness_probe:
|
model: $MODEL_NAME
|
||||||
path: /v1/chat/completions
|
messages:
|
||||||
post_data:
|
- role: user
|
||||||
model: $MODEL_NAME
|
content: Hello! What is your name?
|
||||||
messages:
|
|
||||||
- role: user
|
|
||||||
content: Hello! What is your name?
|
|
||||||
max_completion_tokens: 1
|
max_completion_tokens: 1
|
||||||
|
```
|
||||||
|
|
||||||
resources:
|
??? Yaml
|
||||||
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
|
|
||||||
use_spot: True
|
|
||||||
disk_size: 512 # Ensure model checkpoints can fit.
|
|
||||||
disk_tier: best
|
|
||||||
ports: 8081 # Expose to internet traffic.
|
|
||||||
|
|
||||||
envs:
|
```yaml
|
||||||
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
|
service:
|
||||||
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
|
replicas: 2
|
||||||
|
# An actual request for readiness probe.
|
||||||
|
readiness_probe:
|
||||||
|
path: /v1/chat/completions
|
||||||
|
post_data:
|
||||||
|
model: $MODEL_NAME
|
||||||
|
messages:
|
||||||
|
- role: user
|
||||||
|
content: Hello! What is your name?
|
||||||
|
max_completion_tokens: 1
|
||||||
|
|
||||||
setup: |
|
resources:
|
||||||
conda create -n vllm python=3.10 -y
|
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
|
||||||
conda activate vllm
|
use_spot: True
|
||||||
|
disk_size: 512 # Ensure model checkpoints can fit.
|
||||||
|
disk_tier: best
|
||||||
|
ports: 8081 # Expose to internet traffic.
|
||||||
|
|
||||||
pip install vllm==0.4.0.post1
|
envs:
|
||||||
# Install Gradio for web UI.
|
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
|
||||||
pip install gradio openai
|
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
|
||||||
pip install flash-attn==2.5.7
|
|
||||||
|
|
||||||
run: |
|
setup: |
|
||||||
conda activate vllm
|
conda create -n vllm python=3.10 -y
|
||||||
echo 'Starting vllm api server...'
|
conda activate vllm
|
||||||
python -u -m vllm.entrypoints.openai.api_server \
|
|
||||||
--port 8081 \
|
|
||||||
--model $MODEL_NAME \
|
|
||||||
--trust-remote-code \
|
|
||||||
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
|
|
||||||
2>&1 | tee api_server.log
|
|
||||||
```
|
|
||||||
|
|
||||||
</details>
|
pip install vllm==0.4.0.post1
|
||||||
|
# Install Gradio for web UI.
|
||||||
|
pip install gradio openai
|
||||||
|
pip install flash-attn==2.5.7
|
||||||
|
|
||||||
|
run: |
|
||||||
|
conda activate vllm
|
||||||
|
echo 'Starting vllm api server...'
|
||||||
|
python -u -m vllm.entrypoints.openai.api_server \
|
||||||
|
--port 8081 \
|
||||||
|
--model $MODEL_NAME \
|
||||||
|
--trust-remote-code \
|
||||||
|
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
|
||||||
|
2>&1 | tee api_server.log
|
||||||
|
```
|
||||||
|
|
||||||
Start serving the Llama-3 8B model on multiple replicas:
|
Start serving the Llama-3 8B model on multiple replicas:
|
||||||
|
|
||||||
@ -170,8 +171,7 @@ Wait until the service is ready:
|
|||||||
watch -n10 sky serve status vllm
|
watch -n10 sky serve status vllm
|
||||||
```
|
```
|
||||||
|
|
||||||
<details>
|
Example outputs:
|
||||||
<summary>Example outputs:</summary>
|
|
||||||
|
|
||||||
```console
|
```console
|
||||||
Services
|
Services
|
||||||
@ -184,29 +184,29 @@ vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) R
|
|||||||
vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4
|
vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4
|
||||||
```
|
```
|
||||||
|
|
||||||
</details>
|
|
||||||
|
|
||||||
After the service is READY, you can find a single endpoint for the service and access the service with the endpoint:
|
After the service is READY, you can find a single endpoint for the service and access the service with the endpoint:
|
||||||
|
|
||||||
```console
|
??? Commands
|
||||||
ENDPOINT=$(sky serve status --endpoint 8081 vllm)
|
|
||||||
curl -L http://$ENDPOINT/v1/chat/completions \
|
```bash
|
||||||
-H "Content-Type: application/json" \
|
ENDPOINT=$(sky serve status --endpoint 8081 vllm)
|
||||||
-d '{
|
curl -L http://$ENDPOINT/v1/chat/completions \
|
||||||
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
|
-H "Content-Type: application/json" \
|
||||||
"messages": [
|
-d '{
|
||||||
{
|
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
|
||||||
"role": "system",
|
"messages": [
|
||||||
"content": "You are a helpful assistant."
|
{
|
||||||
},
|
"role": "system",
|
||||||
{
|
"content": "You are a helpful assistant."
|
||||||
"role": "user",
|
},
|
||||||
"content": "Who are you?"
|
{
|
||||||
}
|
"role": "user",
|
||||||
],
|
"content": "Who are you?"
|
||||||
"stop_token_ids": [128009, 128001]
|
}
|
||||||
}'
|
],
|
||||||
```
|
"stop_token_ids": [128009, 128001]
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
To enable autoscaling, you could replace the `replicas` with the following configs in `service`:
|
To enable autoscaling, you could replace the `replicas` with the following configs in `service`:
|
||||||
|
|
||||||
@ -220,57 +220,54 @@ service:
|
|||||||
|
|
||||||
This will scale the service up when the QPS exceeds 2 for each replica.
|
This will scale the service up when the QPS exceeds 2 for each replica.
|
||||||
|
|
||||||
<details>
|
??? Yaml
|
||||||
<summary>Click to see the full recipe YAML</summary>
|
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
service:
|
service:
|
||||||
replica_policy:
|
replica_policy:
|
||||||
min_replicas: 2
|
min_replicas: 2
|
||||||
max_replicas: 4
|
max_replicas: 4
|
||||||
target_qps_per_replica: 2
|
target_qps_per_replica: 2
|
||||||
# An actual request for readiness probe.
|
# An actual request for readiness probe.
|
||||||
readiness_probe:
|
readiness_probe:
|
||||||
path: /v1/chat/completions
|
path: /v1/chat/completions
|
||||||
post_data:
|
post_data:
|
||||||
model: $MODEL_NAME
|
model: $MODEL_NAME
|
||||||
messages:
|
messages:
|
||||||
- role: user
|
- role: user
|
||||||
content: Hello! What is your name?
|
content: Hello! What is your name?
|
||||||
max_completion_tokens: 1
|
max_completion_tokens: 1
|
||||||
|
|
||||||
resources:
|
resources:
|
||||||
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
|
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
|
||||||
use_spot: True
|
use_spot: True
|
||||||
disk_size: 512 # Ensure model checkpoints can fit.
|
disk_size: 512 # Ensure model checkpoints can fit.
|
||||||
disk_tier: best
|
disk_tier: best
|
||||||
ports: 8081 # Expose to internet traffic.
|
ports: 8081 # Expose to internet traffic.
|
||||||
|
|
||||||
envs:
|
envs:
|
||||||
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
|
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
|
||||||
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
|
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
|
||||||
|
|
||||||
setup: |
|
setup: |
|
||||||
conda create -n vllm python=3.10 -y
|
conda create -n vllm python=3.10 -y
|
||||||
conda activate vllm
|
conda activate vllm
|
||||||
|
|
||||||
pip install vllm==0.4.0.post1
|
pip install vllm==0.4.0.post1
|
||||||
# Install Gradio for web UI.
|
# Install Gradio for web UI.
|
||||||
pip install gradio openai
|
pip install gradio openai
|
||||||
pip install flash-attn==2.5.7
|
pip install flash-attn==2.5.7
|
||||||
|
|
||||||
run: |
|
run: |
|
||||||
conda activate vllm
|
conda activate vllm
|
||||||
echo 'Starting vllm api server...'
|
echo 'Starting vllm api server...'
|
||||||
python -u -m vllm.entrypoints.openai.api_server \
|
python -u -m vllm.entrypoints.openai.api_server \
|
||||||
--port 8081 \
|
--port 8081 \
|
||||||
--model $MODEL_NAME \
|
--model $MODEL_NAME \
|
||||||
--trust-remote-code \
|
--trust-remote-code \
|
||||||
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
|
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
|
||||||
2>&1 | tee api_server.log
|
2>&1 | tee api_server.log
|
||||||
```
|
```
|
||||||
|
|
||||||
</details>
|
|
||||||
|
|
||||||
To update the service with the new config:
|
To update the service with the new config:
|
||||||
|
|
||||||
@ -288,38 +285,35 @@ sky serve down vllm
|
|||||||
|
|
||||||
It is also possible to access the Llama-3 service with a separate GUI frontend, so that user requests sent to the GUI will be load-balanced across replicas.
|
It is also possible to access the Llama-3 service with a separate GUI frontend, so that user requests sent to the GUI will be load-balanced across replicas.
|
||||||
|
|
||||||
<details>
|
??? Yaml
|
||||||
<summary>Click to see the full GUI YAML</summary>
|
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
envs:
|
envs:
|
||||||
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
|
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
|
||||||
ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm.
|
ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm.
|
||||||
|
|
||||||
resources:
|
resources:
|
||||||
cpus: 2
|
cpus: 2
|
||||||
|
|
||||||
setup: |
|
setup: |
|
||||||
conda create -n vllm python=3.10 -y
|
conda create -n vllm python=3.10 -y
|
||||||
conda activate vllm
|
conda activate vllm
|
||||||
|
|
||||||
# Install Gradio for web UI.
|
# Install Gradio for web UI.
|
||||||
pip install gradio openai
|
pip install gradio openai
|
||||||
|
|
||||||
run: |
|
run: |
|
||||||
conda activate vllm
|
conda activate vllm
|
||||||
export PATH=$PATH:/sbin
|
export PATH=$PATH:/sbin
|
||||||
|
|
||||||
echo 'Starting gradio server...'
|
echo 'Starting gradio server...'
|
||||||
git clone https://github.com/vllm-project/vllm.git || true
|
git clone https://github.com/vllm-project/vllm.git || true
|
||||||
python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
|
python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
|
||||||
-m $MODEL_NAME \
|
-m $MODEL_NAME \
|
||||||
--port 8811 \
|
--port 8811 \
|
||||||
--model-url http://$ENDPOINT/v1 \
|
--model-url http://$ENDPOINT/v1 \
|
||||||
--stop-token-ids 128009,128001 | tee ~/gradio.log
|
--stop-token-ids 128009,128001 | tee ~/gradio.log
|
||||||
```
|
```
|
||||||
|
|
||||||
</details>
|
|
||||||
|
|
||||||
1. Start the chat web UI:
|
1. Start the chat web UI:
|
||||||
|
|
||||||
|
|||||||
@ -60,22 +60,22 @@ And then you can send out a query to the OpenAI-compatible API to check the avai
|
|||||||
curl -o- http://localhost:30080/models
|
curl -o- http://localhost:30080/models
|
||||||
```
|
```
|
||||||
|
|
||||||
Expected output:
|
??? Output
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
|
||||||
"object": "list",
|
|
||||||
"data": [
|
|
||||||
{
|
{
|
||||||
"id": "facebook/opt-125m",
|
"object": "list",
|
||||||
"object": "model",
|
"data": [
|
||||||
"created": 1737428424,
|
{
|
||||||
"owned_by": "vllm",
|
"id": "facebook/opt-125m",
|
||||||
"root": null
|
"object": "model",
|
||||||
|
"created": 1737428424,
|
||||||
|
"owned_by": "vllm",
|
||||||
|
"root": null
|
||||||
|
}
|
||||||
|
]
|
||||||
}
|
}
|
||||||
]
|
```
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
To send an actual completion request, you can issue a curl request to the OpenAI-compatible `/completions` endpoint:
|
To send an actual completion request, you can issue a curl request to the OpenAI-compatible `/completions` endpoint:
|
||||||
|
|
||||||
@ -89,23 +89,23 @@ curl -X POST http://localhost:30080/completions \
|
|||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
|
|
||||||
Expected output:
|
??? Output
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
|
||||||
"id": "completion-id",
|
|
||||||
"object": "text_completion",
|
|
||||||
"created": 1737428424,
|
|
||||||
"model": "facebook/opt-125m",
|
|
||||||
"choices": [
|
|
||||||
{
|
{
|
||||||
"text": " there was a brave knight who...",
|
"id": "completion-id",
|
||||||
"index": 0,
|
"object": "text_completion",
|
||||||
"finish_reason": "length"
|
"created": 1737428424,
|
||||||
|
"model": "facebook/opt-125m",
|
||||||
|
"choices": [
|
||||||
|
{
|
||||||
|
"text": " there was a brave knight who...",
|
||||||
|
"index": 0,
|
||||||
|
"finish_reason": "length"
|
||||||
|
}
|
||||||
|
]
|
||||||
}
|
}
|
||||||
]
|
```
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Uninstall
|
### Uninstall
|
||||||
|
|
||||||
@ -121,23 +121,25 @@ sudo helm uninstall vllm
|
|||||||
|
|
||||||
The core vLLM production stack configuration is managed with YAML. Here is the example configuration used in the installation above:
|
The core vLLM production stack configuration is managed with YAML. Here is the example configuration used in the installation above:
|
||||||
|
|
||||||
```yaml
|
??? Yaml
|
||||||
servingEngineSpec:
|
|
||||||
runtimeClassName: ""
|
|
||||||
modelSpec:
|
|
||||||
- name: "opt125m"
|
|
||||||
repository: "vllm/vllm-openai"
|
|
||||||
tag: "latest"
|
|
||||||
modelURL: "facebook/opt-125m"
|
|
||||||
|
|
||||||
replicaCount: 1
|
```yaml
|
||||||
|
servingEngineSpec:
|
||||||
|
runtimeClassName: ""
|
||||||
|
modelSpec:
|
||||||
|
- name: "opt125m"
|
||||||
|
repository: "vllm/vllm-openai"
|
||||||
|
tag: "latest"
|
||||||
|
modelURL: "facebook/opt-125m"
|
||||||
|
|
||||||
requestCPU: 6
|
replicaCount: 1
|
||||||
requestMemory: "16Gi"
|
|
||||||
requestGPU: 1
|
|
||||||
|
|
||||||
pvcStorage: "10Gi"
|
requestCPU: 6
|
||||||
```
|
requestMemory: "16Gi"
|
||||||
|
requestGPU: 1
|
||||||
|
|
||||||
|
pvcStorage: "10Gi"
|
||||||
|
```
|
||||||
|
|
||||||
In this YAML configuration:
|
In this YAML configuration:
|
||||||
* **`modelSpec`** includes:
|
* **`modelSpec`** includes:
|
||||||
|
|||||||
@ -29,85 +29,89 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
|
|||||||
|
|
||||||
First, create a Kubernetes PVC and Secret for downloading and storing the Hugging Face model:
|
First, create a Kubernetes PVC and Secret for downloading and storing the Hugging Face model:
|
||||||
|
|
||||||
```bash
|
??? Config
|
||||||
cat <<EOF |kubectl apply -f -
|
|
||||||
apiVersion: v1
|
```bash
|
||||||
kind: PersistentVolumeClaim
|
cat <<EOF |kubectl apply -f -
|
||||||
metadata:
|
apiVersion: v1
|
||||||
name: vllm-models
|
kind: PersistentVolumeClaim
|
||||||
spec:
|
metadata:
|
||||||
accessModes:
|
name: vllm-models
|
||||||
- ReadWriteOnce
|
spec:
|
||||||
volumeMode: Filesystem
|
accessModes:
|
||||||
resources:
|
- ReadWriteOnce
|
||||||
requests:
|
volumeMode: Filesystem
|
||||||
storage: 50Gi
|
resources:
|
||||||
---
|
requests:
|
||||||
apiVersion: v1
|
storage: 50Gi
|
||||||
kind: Secret
|
---
|
||||||
metadata:
|
apiVersion: v1
|
||||||
name: hf-token-secret
|
kind: Secret
|
||||||
type: Opaque
|
metadata:
|
||||||
data:
|
name: hf-token-secret
|
||||||
token: $(HF_TOKEN)
|
type: Opaque
|
||||||
EOF
|
data:
|
||||||
```
|
token: $(HF_TOKEN)
|
||||||
|
EOF
|
||||||
|
```
|
||||||
|
|
||||||
Next, start the vLLM server as a Kubernetes Deployment and Service:
|
Next, start the vLLM server as a Kubernetes Deployment and Service:
|
||||||
|
|
||||||
```bash
|
??? Config
|
||||||
cat <<EOF |kubectl apply -f -
|
|
||||||
apiVersion: apps/v1
|
```bash
|
||||||
kind: Deployment
|
cat <<EOF |kubectl apply -f -
|
||||||
metadata:
|
apiVersion: apps/v1
|
||||||
name: vllm-server
|
kind: Deployment
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app.kubernetes.io/name: vllm
|
|
||||||
template:
|
|
||||||
metadata:
|
metadata:
|
||||||
labels:
|
name: vllm-server
|
||||||
app.kubernetes.io/name: vllm
|
|
||||||
spec:
|
spec:
|
||||||
containers:
|
replicas: 1
|
||||||
- name: vllm
|
selector:
|
||||||
image: vllm/vllm-openai:latest
|
matchLabels:
|
||||||
command: ["/bin/sh", "-c"]
|
app.kubernetes.io/name: vllm
|
||||||
args: [
|
template:
|
||||||
"vllm serve meta-llama/Llama-3.2-1B-Instruct"
|
metadata:
|
||||||
]
|
labels:
|
||||||
env:
|
app.kubernetes.io/name: vllm
|
||||||
- name: HUGGING_FACE_HUB_TOKEN
|
spec:
|
||||||
valueFrom:
|
containers:
|
||||||
secretKeyRef:
|
- name: vllm
|
||||||
name: hf-token-secret
|
image: vllm/vllm-openai:latest
|
||||||
key: token
|
command: ["/bin/sh", "-c"]
|
||||||
ports:
|
args: [
|
||||||
- containerPort: 8000
|
"vllm serve meta-llama/Llama-3.2-1B-Instruct"
|
||||||
volumeMounts:
|
]
|
||||||
|
env:
|
||||||
|
- name: HUGGING_FACE_HUB_TOKEN
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: hf-token-secret
|
||||||
|
key: token
|
||||||
|
ports:
|
||||||
|
- containerPort: 8000
|
||||||
|
volumeMounts:
|
||||||
|
- name: llama-storage
|
||||||
|
mountPath: /root/.cache/huggingface
|
||||||
|
volumes:
|
||||||
- name: llama-storage
|
- name: llama-storage
|
||||||
mountPath: /root/.cache/huggingface
|
persistentVolumeClaim:
|
||||||
volumes:
|
claimName: vllm-models
|
||||||
- name: llama-storage
|
---
|
||||||
persistentVolumeClaim:
|
apiVersion: v1
|
||||||
claimName: vllm-models
|
kind: Service
|
||||||
---
|
metadata:
|
||||||
apiVersion: v1
|
name: vllm-server
|
||||||
kind: Service
|
spec:
|
||||||
metadata:
|
selector:
|
||||||
name: vllm-server
|
app.kubernetes.io/name: vllm
|
||||||
spec:
|
ports:
|
||||||
selector:
|
- protocol: TCP
|
||||||
app.kubernetes.io/name: vllm
|
port: 8000
|
||||||
ports:
|
targetPort: 8000
|
||||||
- protocol: TCP
|
type: ClusterIP
|
||||||
port: 8000
|
EOF
|
||||||
targetPort: 8000
|
```
|
||||||
type: ClusterIP
|
|
||||||
EOF
|
|
||||||
```
|
|
||||||
|
|
||||||
We can verify that the vLLM server has started successfully via the logs (this might take a couple of minutes to download the model):
|
We can verify that the vLLM server has started successfully via the logs (this might take a couple of minutes to download the model):
|
||||||
|
|
||||||
@ -128,6 +132,9 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
|
|||||||
|
|
||||||
The PVC is used to store the model cache and is optional; you can use hostPath or other storage options instead.
|
The PVC is used to store the model cache and is optional; you can use hostPath or other storage options instead.
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Yaml</summary>
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: PersistentVolumeClaim
|
kind: PersistentVolumeClaim
|
||||||
@ -144,6 +151,8 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
|
|||||||
volumeMode: Filesystem
|
volumeMode: Filesystem
|
||||||
```
|
```
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
The Secret is optional and only required for accessing gated models; you can skip this step if you are not using gated models.
|
The Secret is optional and only required for accessing gated models; you can skip this step if you are not using gated models.
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
@ -156,13 +165,16 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
|
|||||||
stringData:
|
stringData:
|
||||||
token: "REPLACE_WITH_TOKEN"
|
token: "REPLACE_WITH_TOKEN"
|
||||||
```
|
```
|
||||||
|
|
||||||
Next, create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model.
|
Next, create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model.
|
||||||
|
|
||||||
Here are two examples for using NVIDIA GPU and AMD GPU.
|
Here are two examples for using NVIDIA GPU and AMD GPU.
|
||||||
|
|
||||||
NVIDIA GPU:
|
NVIDIA GPU:
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Yaml</summary>
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
apiVersion: apps/v1
|
apiVersion: apps/v1
|
||||||
kind: Deployment
|
kind: Deployment
|
||||||
@ -233,10 +245,15 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
|
|||||||
periodSeconds: 5
|
periodSeconds: 5
|
||||||
```
|
```
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
AMD GPU:
|
AMD GPU:
|
||||||
|
|
||||||
You can refer to the `deployment.yaml` below if using AMD ROCm GPU like MI300X.
|
You can refer to the `deployment.yaml` below if using AMD ROCm GPU like MI300X.
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Yaml</summary>
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
apiVersion: apps/v1
|
apiVersion: apps/v1
|
||||||
kind: Deployment
|
kind: Deployment
|
||||||
@ -305,12 +322,17 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
|
|||||||
mountPath: /dev/shm
|
mountPath: /dev/shm
|
||||||
```
|
```
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
You can get the full example with steps and sample yaml files from <https://github.com/ROCm/k8s-device-plugin/tree/master/example/vllm-serve>.
|
You can get the full example with steps and sample yaml files from <https://github.com/ROCm/k8s-device-plugin/tree/master/example/vllm-serve>.
|
||||||
|
|
||||||
2. Create a Kubernetes Service for vLLM
|
2. Create a Kubernetes Service for vLLM
|
||||||
|
|
||||||
Next, create a Kubernetes Service file to expose the `mistral-7b` deployment:
|
Next, create a Kubernetes Service file to expose the `mistral-7b` deployment:
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Yaml</summary>
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: Service
|
kind: Service
|
||||||
@ -330,6 +352,8 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
|
|||||||
type: ClusterIP
|
type: ClusterIP
|
||||||
```
|
```
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
3. Deploy and Test
|
3. Deploy and Test
|
||||||
|
|
||||||
Apply the deployment and service configurations using `kubectl apply -f <filename>`:
|
Apply the deployment and service configurations using `kubectl apply -f <filename>`:
|
||||||
|
|||||||
@ -36,23 +36,25 @@ docker build . -f Dockerfile.nginx --tag nginx-lb
|
|||||||
|
|
||||||
Create a file named `nginx_conf/nginx.conf`. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another `server vllmN:8000 max_fails=3 fail_timeout=10000s;` entry to `upstream backend`.
|
Create a file named `nginx_conf/nginx.conf`. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another `server vllmN:8000 max_fails=3 fail_timeout=10000s;` entry to `upstream backend`.
|
||||||
|
|
||||||
```console
|
??? Config
|
||||||
upstream backend {
|
|
||||||
least_conn;
|
```console
|
||||||
server vllm0:8000 max_fails=3 fail_timeout=10000s;
|
upstream backend {
|
||||||
server vllm1:8000 max_fails=3 fail_timeout=10000s;
|
least_conn;
|
||||||
}
|
server vllm0:8000 max_fails=3 fail_timeout=10000s;
|
||||||
server {
|
server vllm1:8000 max_fails=3 fail_timeout=10000s;
|
||||||
listen 80;
|
|
||||||
location / {
|
|
||||||
proxy_pass http://backend;
|
|
||||||
proxy_set_header Host $host;
|
|
||||||
proxy_set_header X-Real-IP $remote_addr;
|
|
||||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
|
||||||
proxy_set_header X-Forwarded-Proto $scheme;
|
|
||||||
}
|
}
|
||||||
}
|
server {
|
||||||
```
|
listen 80;
|
||||||
|
location / {
|
||||||
|
proxy_pass http://backend;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
proxy_set_header X-Real-IP $remote_addr;
|
||||||
|
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||||
|
proxy_set_header X-Forwarded-Proto $scheme;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
[](){ #nginxloadbalancer-nginx-vllm-container }
|
[](){ #nginxloadbalancer-nginx-vllm-container }
|
||||||
|
|
||||||
@ -93,30 +95,32 @@ Notes:
|
|||||||
- The example below assumes a GPU backend is used. If you are using a CPU backend, remove `--gpus device=ID` and add the `VLLM_CPU_KVCACHE_SPACE` and `VLLM_CPU_OMP_THREADS_BIND` environment variables to the docker run command.
|
- The example below assumes a GPU backend is used. If you are using a CPU backend, remove `--gpus device=ID` and add the `VLLM_CPU_KVCACHE_SPACE` and `VLLM_CPU_OMP_THREADS_BIND` environment variables to the docker run command.
|
||||||
- Adjust the model name that you want to use in your vLLM servers if you don't want to use `Llama-2-7b-chat-hf`.
|
- Adjust the model name that you want to use in your vLLM servers if you don't want to use `Llama-2-7b-chat-hf`.
|
||||||
|
|
||||||
```console
|
??? Commands
|
||||||
mkdir -p ~/.cache/huggingface/hub/
|
|
||||||
hf_cache_dir=~/.cache/huggingface/
|
```console
|
||||||
docker run \
|
mkdir -p ~/.cache/huggingface/hub/
|
||||||
-itd \
|
hf_cache_dir=~/.cache/huggingface/
|
||||||
--ipc host \
|
docker run \
|
||||||
--network vllm_nginx \
|
-itd \
|
||||||
--gpus device=0 \
|
--ipc host \
|
||||||
--shm-size=10.24gb \
|
--network vllm_nginx \
|
||||||
-v $hf_cache_dir:/root/.cache/huggingface/ \
|
--gpus device=0 \
|
||||||
-p 8081:8000 \
|
--shm-size=10.24gb \
|
||||||
--name vllm0 vllm \
|
-v $hf_cache_dir:/root/.cache/huggingface/ \
|
||||||
--model meta-llama/Llama-2-7b-chat-hf
|
-p 8081:8000 \
|
||||||
docker run \
|
--name vllm0 vllm \
|
||||||
-itd \
|
--model meta-llama/Llama-2-7b-chat-hf
|
||||||
--ipc host \
|
docker run \
|
||||||
--network vllm_nginx \
|
-itd \
|
||||||
--gpus device=1 \
|
--ipc host \
|
||||||
--shm-size=10.24gb \
|
--network vllm_nginx \
|
||||||
-v $hf_cache_dir:/root/.cache/huggingface/ \
|
--gpus device=1 \
|
||||||
-p 8082:8000 \
|
--shm-size=10.24gb \
|
||||||
--name vllm1 vllm \
|
-v $hf_cache_dir:/root/.cache/huggingface/ \
|
||||||
--model meta-llama/Llama-2-7b-chat-hf
|
-p 8082:8000 \
|
||||||
```
|
--name vllm1 vllm \
|
||||||
|
--model meta-llama/Llama-2-7b-chat-hf
|
||||||
|
```
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
If you are behind a proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`.
|
If you are behind a proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`.
|
||||||
|
|||||||
@ -22,31 +22,33 @@ server.
|
|||||||
|
|
||||||
Here is a sample of `LLM` class usage:
|
Here is a sample of `LLM` class usage:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from vllm import LLM, SamplingParams
|
|
||||||
|
|
||||||
# Define a list of input prompts
|
```python
|
||||||
prompts = [
|
from vllm import LLM, SamplingParams
|
||||||
"Hello, my name is",
|
|
||||||
"The capital of France is",
|
|
||||||
"The largest ocean is",
|
|
||||||
]
|
|
||||||
|
|
||||||
# Define sampling parameters
|
# Define a list of input prompts
|
||||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
prompts = [
|
||||||
|
"Hello, my name is",
|
||||||
|
"The capital of France is",
|
||||||
|
"The largest ocean is",
|
||||||
|
]
|
||||||
|
|
||||||
# Initialize the LLM engine with the OPT-125M model
|
# Define sampling parameters
|
||||||
llm = LLM(model="facebook/opt-125m")
|
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||||
|
|
||||||
# Generate outputs for the input prompts
|
# Initialize the LLM engine with the OPT-125M model
|
||||||
outputs = llm.generate(prompts, sampling_params)
|
llm = LLM(model="facebook/opt-125m")
|
||||||
|
|
||||||
# Print the generated outputs
|
# Generate outputs for the input prompts
|
||||||
for output in outputs:
|
outputs = llm.generate(prompts, sampling_params)
|
||||||
prompt = output.prompt
|
|
||||||
generated_text = output.outputs[0].text
|
# Print the generated outputs
|
||||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
for output in outputs:
|
||||||
```
|
prompt = output.prompt
|
||||||
|
generated_text = output.outputs[0].text
|
||||||
|
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||||
|
```
|
||||||
|
|
||||||
More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs.
|
More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs.
|
||||||
|
|
||||||
@ -178,32 +180,34 @@ vision-language model.
|
|||||||
|
|
||||||
To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one:
|
To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
class MyOldModel(nn.Module):
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
config,
|
|
||||||
cache_config: Optional[CacheConfig] = None,
|
|
||||||
quant_config: Optional[QuantizationConfig] = None,
|
|
||||||
lora_config: Optional[LoRAConfig] = None,
|
|
||||||
prefix: str = "",
|
|
||||||
) -> None:
|
|
||||||
...
|
|
||||||
|
|
||||||
from vllm.config import VllmConfig
|
```python
|
||||||
class MyNewModel(MyOldModel):
|
class MyOldModel(nn.Module):
|
||||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
def __init__(
|
||||||
config = vllm_config.model_config.hf_config
|
self,
|
||||||
cache_config = vllm_config.cache_config
|
config,
|
||||||
quant_config = vllm_config.quant_config
|
cache_config: Optional[CacheConfig] = None,
|
||||||
lora_config = vllm_config.lora_config
|
quant_config: Optional[QuantizationConfig] = None,
|
||||||
super().__init__(config, cache_config, quant_config, lora_config, prefix)
|
lora_config: Optional[LoRAConfig] = None,
|
||||||
|
prefix: str = "",
|
||||||
|
) -> None:
|
||||||
|
...
|
||||||
|
|
||||||
if __version__ >= "0.6.4":
|
from vllm.config import VllmConfig
|
||||||
MyModel = MyNewModel
|
class MyNewModel(MyOldModel):
|
||||||
else:
|
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||||
MyModel = MyOldModel
|
config = vllm_config.model_config.hf_config
|
||||||
```
|
cache_config = vllm_config.cache_config
|
||||||
|
quant_config = vllm_config.quant_config
|
||||||
|
lora_config = vllm_config.lora_config
|
||||||
|
super().__init__(config, cache_config, quant_config, lora_config, prefix)
|
||||||
|
|
||||||
|
if __version__ >= "0.6.4":
|
||||||
|
MyModel = MyNewModel
|
||||||
|
else:
|
||||||
|
MyModel = MyOldModel
|
||||||
|
```
|
||||||
|
|
||||||
This way, the model can work with both old and new versions of vLLM.
|
This way, the model can work with both old and new versions of vLLM.
|
||||||
|
|
||||||
|
|||||||
@ -448,27 +448,29 @@ elements of the entire head for all context tokens. However, overall,
|
|||||||
all results for output have been calculated but are just stored in
|
all results for output have been calculated but are just stored in
|
||||||
different thread register memory.
|
different thread register memory.
|
||||||
|
|
||||||
```cpp
|
??? Code
|
||||||
float* out_smem = reinterpret_cast<float*>(shared_mem);
|
|
||||||
for (int i = NUM_WARPS; i > 1; i /= 2) {
|
|
||||||
// Upper warps write to shared memory.
|
|
||||||
...
|
|
||||||
float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
|
|
||||||
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
|
|
||||||
...
|
|
||||||
dst[row_idx] = accs[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
// Lower warps update the output.
|
```cpp
|
||||||
const float* src = &out_smem[warp_idx * HEAD_SIZE];
|
float* out_smem = reinterpret_cast<float*>(shared_mem);
|
||||||
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
|
for (int i = NUM_WARPS; i > 1; i /= 2) {
|
||||||
|
// Upper warps write to shared memory.
|
||||||
...
|
...
|
||||||
accs[i] += src[row_idx];
|
float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
|
||||||
}
|
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
|
||||||
|
...
|
||||||
|
dst[row_idx] = accs[i];
|
||||||
|
}
|
||||||
|
|
||||||
// Write out the accs.
|
// Lower warps update the output.
|
||||||
}
|
const float* src = &out_smem[warp_idx * HEAD_SIZE];
|
||||||
```
|
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
|
||||||
|
...
|
||||||
|
accs[i] += src[row_idx];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write out the accs.
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
## Output
|
## Output
|
||||||
|
|
||||||
|
|||||||
@ -13,28 +13,30 @@ Plugins are user-registered code that vLLM executes. Given vLLM's architecture (
|
|||||||
|
|
||||||
vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin:
|
vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
# inside `setup.py` file
|
|
||||||
from setuptools import setup
|
|
||||||
|
|
||||||
setup(name='vllm_add_dummy_model',
|
```python
|
||||||
version='0.1',
|
# inside `setup.py` file
|
||||||
packages=['vllm_add_dummy_model'],
|
from setuptools import setup
|
||||||
entry_points={
|
|
||||||
'vllm.general_plugins':
|
|
||||||
["register_dummy_model = vllm_add_dummy_model:register"]
|
|
||||||
})
|
|
||||||
|
|
||||||
# inside `vllm_add_dummy_model.py` file
|
setup(name='vllm_add_dummy_model',
|
||||||
def register():
|
version='0.1',
|
||||||
from vllm import ModelRegistry
|
packages=['vllm_add_dummy_model'],
|
||||||
|
entry_points={
|
||||||
|
'vllm.general_plugins':
|
||||||
|
["register_dummy_model = vllm_add_dummy_model:register"]
|
||||||
|
})
|
||||||
|
|
||||||
if "MyLlava" not in ModelRegistry.get_supported_archs():
|
# inside `vllm_add_dummy_model.py` file
|
||||||
ModelRegistry.register_model(
|
def register():
|
||||||
"MyLlava",
|
from vllm import ModelRegistry
|
||||||
"vllm_add_dummy_model.my_llava:MyLlava",
|
|
||||||
)
|
if "MyLlava" not in ModelRegistry.get_supported_archs():
|
||||||
```
|
ModelRegistry.register_model(
|
||||||
|
"MyLlava",
|
||||||
|
"vllm_add_dummy_model.my_llava:MyLlava",
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html).
|
For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html).
|
||||||
|
|
||||||
|
|||||||
@ -29,24 +29,26 @@ We can now submit the prompts and call `llm.generate` with the `lora_request` pa
|
|||||||
of `LoRARequest` is a human identifiable name, the second parameter is a globally unique ID for the adapter and
|
of `LoRARequest` is a human identifiable name, the second parameter is a globally unique ID for the adapter and
|
||||||
the third parameter is the path to the LoRA adapter.
|
the third parameter is the path to the LoRA adapter.
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
sampling_params = SamplingParams(
|
|
||||||
temperature=0,
|
|
||||||
max_tokens=256,
|
|
||||||
stop=["[/assistant]"]
|
|
||||||
)
|
|
||||||
|
|
||||||
prompts = [
|
```python
|
||||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
|
sampling_params = SamplingParams(
|
||||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
|
temperature=0,
|
||||||
]
|
max_tokens=256,
|
||||||
|
stop=["[/assistant]"]
|
||||||
|
)
|
||||||
|
|
||||||
outputs = llm.generate(
|
prompts = [
|
||||||
prompts,
|
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
|
||||||
sampling_params,
|
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
|
||||||
lora_request=LoRARequest("sql_adapter", 1, sql_lora_path)
|
]
|
||||||
)
|
|
||||||
```
|
outputs = llm.generate(
|
||||||
|
prompts,
|
||||||
|
sampling_params,
|
||||||
|
lora_request=LoRARequest("sql_adapter", 1, sql_lora_path)
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
|
Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
|
||||||
|
|
||||||
@ -68,24 +70,26 @@ The server entrypoint accepts all other LoRA configuration parameters (`max_lora
|
|||||||
etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along
|
etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along
|
||||||
with its base model (if `jq` is not installed, you can follow [this guide](https://jqlang.org/download/) to install it):
|
with its base model (if `jq` is not installed, you can follow [this guide](https://jqlang.org/download/) to install it):
|
||||||
|
|
||||||
```bash
|
??? Command
|
||||||
curl localhost:8000/v1/models | jq .
|
|
||||||
{
|
```bash
|
||||||
"object": "list",
|
curl localhost:8000/v1/models | jq .
|
||||||
"data": [
|
{
|
||||||
{
|
"object": "list",
|
||||||
"id": "meta-llama/Llama-2-7b-hf",
|
"data": [
|
||||||
"object": "model",
|
{
|
||||||
...
|
"id": "meta-llama/Llama-2-7b-hf",
|
||||||
},
|
"object": "model",
|
||||||
{
|
...
|
||||||
"id": "sql-lora",
|
},
|
||||||
"object": "model",
|
{
|
||||||
...
|
"id": "sql-lora",
|
||||||
}
|
"object": "model",
|
||||||
]
|
...
|
||||||
}
|
}
|
||||||
```
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
Requests can specify the LoRA adapter as if it were any other model via the `model` request parameter. The requests will be
|
Requests can specify the LoRA adapter as if it were any other model via the `model` request parameter. The requests will be
|
||||||
processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other
|
processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other
|
||||||
@ -168,36 +172,36 @@ Alternatively, follow these example steps to implement your own plugin:
|
|||||||
|
|
||||||
1. Implement the LoRAResolver interface.
|
1. Implement the LoRAResolver interface.
|
||||||
|
|
||||||
Example of a simple S3 LoRAResolver implementation:
|
??? Example of a simple S3 LoRAResolver implementation
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import os
|
import os
|
||||||
import s3fs
|
import s3fs
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
from vllm.lora.resolver import LoRAResolver
|
from vllm.lora.resolver import LoRAResolver
|
||||||
|
|
||||||
class S3LoRAResolver(LoRAResolver):
|
class S3LoRAResolver(LoRAResolver):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.s3 = s3fs.S3FileSystem()
|
self.s3 = s3fs.S3FileSystem()
|
||||||
self.s3_path_format = os.getenv("S3_PATH_TEMPLATE")
|
self.s3_path_format = os.getenv("S3_PATH_TEMPLATE")
|
||||||
self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE")
|
self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE")
|
||||||
|
|
||||||
async def resolve_lora(self, base_model_name, lora_name):
|
async def resolve_lora(self, base_model_name, lora_name):
|
||||||
s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
|
s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
|
||||||
local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
|
local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
|
||||||
|
|
||||||
# Download the LoRA from S3 to the local path
|
# Download the LoRA from S3 to the local path
|
||||||
await self.s3._get(
|
await self.s3._get(
|
||||||
s3_path, local_path, recursive=True, maxdepth=1
|
s3_path, local_path, recursive=True, maxdepth=1
|
||||||
)
|
)
|
||||||
|
|
||||||
lora_request = LoRARequest(
|
lora_request = LoRARequest(
|
||||||
lora_name=lora_name,
|
lora_name=lora_name,
|
||||||
lora_path=local_path,
|
lora_path=local_path,
|
||||||
lora_int_id=abs(hash(lora_name))
|
lora_int_id=abs(hash(lora_name))
|
||||||
)
|
)
|
||||||
return lora_request
|
return lora_request
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Register `LoRAResolver` plugin.
|
2. Register `LoRAResolver` plugin.
|
||||||
|
|
||||||
@ -234,38 +238,40 @@ The new format of `--lora-modules` is mainly to support the display of parent mo
|
|||||||
- The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter.
|
- The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter.
|
||||||
- The `root` field points to the artifact location of the LoRA adapter.
|
- The `root` field points to the artifact location of the LoRA adapter.
|
||||||
|
|
||||||
```bash
|
??? Command output
|
||||||
$ curl http://localhost:8000/v1/models
|
|
||||||
|
|
||||||
{
|
```bash
|
||||||
"object": "list",
|
$ curl http://localhost:8000/v1/models
|
||||||
"data": [
|
|
||||||
{
|
{
|
||||||
"id": "meta-llama/Llama-2-7b-hf",
|
"object": "list",
|
||||||
"object": "model",
|
"data": [
|
||||||
"created": 1715644056,
|
|
||||||
"owned_by": "vllm",
|
|
||||||
"root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/",
|
|
||||||
"parent": null,
|
|
||||||
"permission": [
|
|
||||||
{
|
{
|
||||||
.....
|
"id": "meta-llama/Llama-2-7b-hf",
|
||||||
|
"object": "model",
|
||||||
|
"created": 1715644056,
|
||||||
|
"owned_by": "vllm",
|
||||||
|
"root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/",
|
||||||
|
"parent": null,
|
||||||
|
"permission": [
|
||||||
|
{
|
||||||
|
.....
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "sql-lora",
|
||||||
|
"object": "model",
|
||||||
|
"created": 1715644056,
|
||||||
|
"owned_by": "vllm",
|
||||||
|
"root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/",
|
||||||
|
"parent": "meta-llama/Llama-2-7b-hf",
|
||||||
|
"permission": [
|
||||||
|
{
|
||||||
|
....
|
||||||
|
}
|
||||||
|
]
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
}
|
||||||
{
|
```
|
||||||
"id": "sql-lora",
|
|
||||||
"object": "model",
|
|
||||||
"created": 1715644056,
|
|
||||||
"owned_by": "vllm",
|
|
||||||
"root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/",
|
|
||||||
"parent": "meta-llama/Llama-2-7b-hf",
|
|
||||||
"permission": [
|
|
||||||
{
|
|
||||||
....
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|||||||
@ -20,111 +20,117 @@ To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]:
|
|||||||
|
|
||||||
You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples:
|
You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from vllm import LLM
|
|
||||||
|
|
||||||
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
|
```python
|
||||||
|
from vllm import LLM
|
||||||
|
|
||||||
# Refer to the HuggingFace repo for the correct format to use
|
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
|
||||||
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
|
|
||||||
|
|
||||||
# Load the image using PIL.Image
|
# Refer to the HuggingFace repo for the correct format to use
|
||||||
image = PIL.Image.open(...)
|
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
|
||||||
|
|
||||||
# Single prompt inference
|
# Load the image using PIL.Image
|
||||||
outputs = llm.generate({
|
image = PIL.Image.open(...)
|
||||||
"prompt": prompt,
|
|
||||||
"multi_modal_data": {"image": image},
|
|
||||||
})
|
|
||||||
|
|
||||||
for o in outputs:
|
# Single prompt inference
|
||||||
generated_text = o.outputs[0].text
|
outputs = llm.generate({
|
||||||
print(generated_text)
|
"prompt": prompt,
|
||||||
|
"multi_modal_data": {"image": image},
|
||||||
|
})
|
||||||
|
|
||||||
# Batch inference
|
for o in outputs:
|
||||||
image_1 = PIL.Image.open(...)
|
generated_text = o.outputs[0].text
|
||||||
image_2 = PIL.Image.open(...)
|
print(generated_text)
|
||||||
outputs = llm.generate(
|
|
||||||
[
|
|
||||||
{
|
|
||||||
"prompt": "USER: <image>\nWhat is the content of this image?\nASSISTANT:",
|
|
||||||
"multi_modal_data": {"image": image_1},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"prompt": "USER: <image>\nWhat's the color of this image?\nASSISTANT:",
|
|
||||||
"multi_modal_data": {"image": image_2},
|
|
||||||
}
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
for o in outputs:
|
# Batch inference
|
||||||
generated_text = o.outputs[0].text
|
image_1 = PIL.Image.open(...)
|
||||||
print(generated_text)
|
image_2 = PIL.Image.open(...)
|
||||||
```
|
outputs = llm.generate(
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"prompt": "USER: <image>\nWhat is the content of this image?\nASSISTANT:",
|
||||||
|
"multi_modal_data": {"image": image_1},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"prompt": "USER: <image>\nWhat's the color of this image?\nASSISTANT:",
|
||||||
|
"multi_modal_data": {"image": image_2},
|
||||||
|
}
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
for o in outputs:
|
||||||
|
generated_text = o.outputs[0].text
|
||||||
|
print(generated_text)
|
||||||
|
```
|
||||||
|
|
||||||
Full example: <gh-file:examples/offline_inference/vision_language.py>
|
Full example: <gh-file:examples/offline_inference/vision_language.py>
|
||||||
|
|
||||||
To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
|
To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from vllm import LLM
|
|
||||||
|
|
||||||
llm = LLM(
|
```python
|
||||||
model="microsoft/Phi-3.5-vision-instruct",
|
from vllm import LLM
|
||||||
trust_remote_code=True, # Required to load Phi-3.5-vision
|
|
||||||
max_model_len=4096, # Otherwise, it may not fit in smaller GPUs
|
|
||||||
limit_mm_per_prompt={"image": 2}, # The maximum number to accept
|
|
||||||
)
|
|
||||||
|
|
||||||
# Refer to the HuggingFace repo for the correct format to use
|
llm = LLM(
|
||||||
prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n"
|
model="microsoft/Phi-3.5-vision-instruct",
|
||||||
|
trust_remote_code=True, # Required to load Phi-3.5-vision
|
||||||
|
max_model_len=4096, # Otherwise, it may not fit in smaller GPUs
|
||||||
|
limit_mm_per_prompt={"image": 2}, # The maximum number to accept
|
||||||
|
)
|
||||||
|
|
||||||
# Load the images using PIL.Image
|
# Refer to the HuggingFace repo for the correct format to use
|
||||||
image1 = PIL.Image.open(...)
|
prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n"
|
||||||
image2 = PIL.Image.open(...)
|
|
||||||
|
|
||||||
outputs = llm.generate({
|
# Load the images using PIL.Image
|
||||||
"prompt": prompt,
|
image1 = PIL.Image.open(...)
|
||||||
"multi_modal_data": {
|
image2 = PIL.Image.open(...)
|
||||||
"image": [image1, image2]
|
|
||||||
},
|
|
||||||
})
|
|
||||||
|
|
||||||
for o in outputs:
|
outputs = llm.generate({
|
||||||
generated_text = o.outputs[0].text
|
"prompt": prompt,
|
||||||
print(generated_text)
|
"multi_modal_data": {
|
||||||
```
|
"image": [image1, image2]
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
for o in outputs:
|
||||||
|
generated_text = o.outputs[0].text
|
||||||
|
print(generated_text)
|
||||||
|
```
|
||||||
|
|
||||||
Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
|
Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
|
||||||
|
|
||||||
Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
|
Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from vllm import LLM
|
|
||||||
|
|
||||||
# Specify the maximum number of frames per video to be 4. This can be changed.
|
```python
|
||||||
llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
|
from vllm import LLM
|
||||||
|
|
||||||
# Create the request payload.
|
# Specify the maximum number of frames per video to be 4. This can be changed.
|
||||||
video_frames = ... # load your video making sure it only has the number of frames specified earlier.
|
llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
|
||||||
message = {
|
|
||||||
"role": "user",
|
|
||||||
"content": [
|
|
||||||
{"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."},
|
|
||||||
],
|
|
||||||
}
|
|
||||||
for i in range(len(video_frames)):
|
|
||||||
base64_image = encode_image(video_frames[i]) # base64 encoding.
|
|
||||||
new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
|
|
||||||
message["content"].append(new_image)
|
|
||||||
|
|
||||||
# Perform inference and log output.
|
# Create the request payload.
|
||||||
outputs = llm.chat([message])
|
video_frames = ... # load your video making sure it only has the number of frames specified earlier.
|
||||||
|
message = {
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
{"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
for i in range(len(video_frames)):
|
||||||
|
base64_image = encode_image(video_frames[i]) # base64 encoding.
|
||||||
|
new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
|
||||||
|
message["content"].append(new_image)
|
||||||
|
|
||||||
for o in outputs:
|
# Perform inference and log output.
|
||||||
generated_text = o.outputs[0].text
|
outputs = llm.chat([message])
|
||||||
print(generated_text)
|
|
||||||
```
|
for o in outputs:
|
||||||
|
generated_text = o.outputs[0].text
|
||||||
|
print(generated_text)
|
||||||
|
```
|
||||||
|
|
||||||
### Video Inputs
|
### Video Inputs
|
||||||
|
|
||||||
@ -144,68 +150,72 @@ Full example: <gh-file:examples/offline_inference/audio_language.py>
|
|||||||
To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
|
To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
|
||||||
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
|
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from vllm import LLM
|
|
||||||
|
|
||||||
# Inference with image embeddings as input
|
```python
|
||||||
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
|
from vllm import LLM
|
||||||
|
|
||||||
# Refer to the HuggingFace repo for the correct format to use
|
# Inference with image embeddings as input
|
||||||
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
|
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
|
||||||
|
|
||||||
# Embeddings for single image
|
# Refer to the HuggingFace repo for the correct format to use
|
||||||
# torch.Tensor of shape (1, image_feature_size, hidden_size of LM)
|
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
|
||||||
image_embeds = torch.load(...)
|
|
||||||
|
|
||||||
outputs = llm.generate({
|
# Embeddings for single image
|
||||||
"prompt": prompt,
|
# torch.Tensor of shape (1, image_feature_size, hidden_size of LM)
|
||||||
"multi_modal_data": {"image": image_embeds},
|
image_embeds = torch.load(...)
|
||||||
})
|
|
||||||
|
|
||||||
for o in outputs:
|
outputs = llm.generate({
|
||||||
generated_text = o.outputs[0].text
|
"prompt": prompt,
|
||||||
print(generated_text)
|
"multi_modal_data": {"image": image_embeds},
|
||||||
```
|
})
|
||||||
|
|
||||||
|
for o in outputs:
|
||||||
|
generated_text = o.outputs[0].text
|
||||||
|
print(generated_text)
|
||||||
|
```
|
||||||
|
|
||||||
For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings:
|
For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
# Construct the prompt based on your model
|
|
||||||
prompt = ...
|
|
||||||
|
|
||||||
# Embeddings for multiple images
|
```python
|
||||||
# torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM)
|
# Construct the prompt based on your model
|
||||||
image_embeds = torch.load(...)
|
prompt = ...
|
||||||
|
|
||||||
# Qwen2-VL
|
# Embeddings for multiple images
|
||||||
llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
|
# torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM)
|
||||||
mm_data = {
|
image_embeds = torch.load(...)
|
||||||
"image": {
|
|
||||||
"image_embeds": image_embeds,
|
# Qwen2-VL
|
||||||
# image_grid_thw is needed to calculate positional encoding.
|
llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
|
||||||
"image_grid_thw": torch.load(...), # torch.Tensor of shape (1, 3),
|
mm_data = {
|
||||||
|
"image": {
|
||||||
|
"image_embeds": image_embeds,
|
||||||
|
# image_grid_thw is needed to calculate positional encoding.
|
||||||
|
"image_grid_thw": torch.load(...), # torch.Tensor of shape (1, 3),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
# MiniCPM-V
|
# MiniCPM-V
|
||||||
llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4})
|
llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4})
|
||||||
mm_data = {
|
mm_data = {
|
||||||
"image": {
|
"image": {
|
||||||
"image_embeds": image_embeds,
|
"image_embeds": image_embeds,
|
||||||
# image_sizes is needed to calculate details of the sliced image.
|
# image_sizes is needed to calculate details of the sliced image.
|
||||||
"image_sizes": [image.size for image in images], # list of image sizes
|
"image_sizes": [image.size for image in images], # list of image sizes
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
outputs = llm.generate({
|
outputs = llm.generate({
|
||||||
"prompt": prompt,
|
"prompt": prompt,
|
||||||
"multi_modal_data": mm_data,
|
"multi_modal_data": mm_data,
|
||||||
})
|
})
|
||||||
|
|
||||||
for o in outputs:
|
for o in outputs:
|
||||||
generated_text = o.outputs[0].text
|
generated_text = o.outputs[0].text
|
||||||
print(generated_text)
|
print(generated_text)
|
||||||
```
|
```
|
||||||
|
|
||||||
## Online Serving
|
## Online Serving
|
||||||
|
|
||||||
@ -235,51 +245,53 @@ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
|
|||||||
|
|
||||||
Then, you can use the OpenAI client as follows:
|
Then, you can use the OpenAI client as follows:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from openai import OpenAI
|
|
||||||
|
|
||||||
openai_api_key = "EMPTY"
|
```python
|
||||||
openai_api_base = "http://localhost:8000/v1"
|
from openai import OpenAI
|
||||||
|
|
||||||
client = OpenAI(
|
openai_api_key = "EMPTY"
|
||||||
api_key=openai_api_key,
|
openai_api_base = "http://localhost:8000/v1"
|
||||||
base_url=openai_api_base,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Single-image input inference
|
client = OpenAI(
|
||||||
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
api_key=openai_api_key,
|
||||||
|
base_url=openai_api_base,
|
||||||
|
)
|
||||||
|
|
||||||
chat_response = client.chat.completions.create(
|
# Single-image input inference
|
||||||
model="microsoft/Phi-3.5-vision-instruct",
|
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
||||||
messages=[{
|
|
||||||
"role": "user",
|
|
||||||
"content": [
|
|
||||||
# NOTE: The prompt formatting with the image token `<image>` is not needed
|
|
||||||
# since the prompt will be processed automatically by the API server.
|
|
||||||
{"type": "text", "text": "What’s in this image?"},
|
|
||||||
{"type": "image_url", "image_url": {"url": image_url}},
|
|
||||||
],
|
|
||||||
}],
|
|
||||||
)
|
|
||||||
print("Chat completion output:", chat_response.choices[0].message.content)
|
|
||||||
|
|
||||||
# Multi-image input inference
|
chat_response = client.chat.completions.create(
|
||||||
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
|
model="microsoft/Phi-3.5-vision-instruct",
|
||||||
image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
|
messages=[{
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
# NOTE: The prompt formatting with the image token `<image>` is not needed
|
||||||
|
# since the prompt will be processed automatically by the API server.
|
||||||
|
{"type": "text", "text": "What’s in this image?"},
|
||||||
|
{"type": "image_url", "image_url": {"url": image_url}},
|
||||||
|
],
|
||||||
|
}],
|
||||||
|
)
|
||||||
|
print("Chat completion output:", chat_response.choices[0].message.content)
|
||||||
|
|
||||||
chat_response = client.chat.completions.create(
|
# Multi-image input inference
|
||||||
model="microsoft/Phi-3.5-vision-instruct",
|
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
|
||||||
messages=[{
|
image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
|
||||||
"role": "user",
|
|
||||||
"content": [
|
chat_response = client.chat.completions.create(
|
||||||
{"type": "text", "text": "What are the animals in these images?"},
|
model="microsoft/Phi-3.5-vision-instruct",
|
||||||
{"type": "image_url", "image_url": {"url": image_url_duck}},
|
messages=[{
|
||||||
{"type": "image_url", "image_url": {"url": image_url_lion}},
|
"role": "user",
|
||||||
],
|
"content": [
|
||||||
}],
|
{"type": "text", "text": "What are the animals in these images?"},
|
||||||
)
|
{"type": "image_url", "image_url": {"url": image_url_duck}},
|
||||||
print("Chat completion output:", chat_response.choices[0].message.content)
|
{"type": "image_url", "image_url": {"url": image_url_lion}},
|
||||||
```
|
],
|
||||||
|
}],
|
||||||
|
)
|
||||||
|
print("Chat completion output:", chat_response.choices[0].message.content)
|
||||||
|
```
|
||||||
|
|
||||||
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
|
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
|
||||||
|
|
||||||
@ -311,44 +323,46 @@ vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model
|
|||||||
|
|
||||||
Then, you can use the OpenAI client as follows:
|
Then, you can use the OpenAI client as follows:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from openai import OpenAI
|
|
||||||
|
|
||||||
openai_api_key = "EMPTY"
|
```python
|
||||||
openai_api_base = "http://localhost:8000/v1"
|
from openai import OpenAI
|
||||||
|
|
||||||
client = OpenAI(
|
openai_api_key = "EMPTY"
|
||||||
api_key=openai_api_key,
|
openai_api_base = "http://localhost:8000/v1"
|
||||||
base_url=openai_api_base,
|
|
||||||
)
|
|
||||||
|
|
||||||
video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
|
client = OpenAI(
|
||||||
|
api_key=openai_api_key,
|
||||||
|
base_url=openai_api_base,
|
||||||
|
)
|
||||||
|
|
||||||
## Use video url in the payload
|
video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
|
||||||
chat_completion_from_url = client.chat.completions.create(
|
|
||||||
messages=[{
|
## Use video url in the payload
|
||||||
"role":
|
chat_completion_from_url = client.chat.completions.create(
|
||||||
"user",
|
messages=[{
|
||||||
"content": [
|
"role":
|
||||||
{
|
"user",
|
||||||
"type": "text",
|
"content": [
|
||||||
"text": "What's in this video?"
|
{
|
||||||
},
|
"type": "text",
|
||||||
{
|
"text": "What's in this video?"
|
||||||
"type": "video_url",
|
|
||||||
"video_url": {
|
|
||||||
"url": video_url
|
|
||||||
},
|
},
|
||||||
},
|
{
|
||||||
],
|
"type": "video_url",
|
||||||
}],
|
"video_url": {
|
||||||
model=model,
|
"url": video_url
|
||||||
max_completion_tokens=64,
|
},
|
||||||
)
|
},
|
||||||
|
],
|
||||||
|
}],
|
||||||
|
model=model,
|
||||||
|
max_completion_tokens=64,
|
||||||
|
)
|
||||||
|
|
||||||
result = chat_completion_from_url.choices[0].message.content
|
result = chat_completion_from_url.choices[0].message.content
|
||||||
print("Chat completion output from image url:", result)
|
print("Chat completion output from image url:", result)
|
||||||
```
|
```
|
||||||
|
|
||||||
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
|
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
|
||||||
|
|
||||||
@ -373,84 +387,88 @@ vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b
|
|||||||
|
|
||||||
Then, you can use the OpenAI client as follows:
|
Then, you can use the OpenAI client as follows:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
import base64
|
|
||||||
import requests
|
|
||||||
from openai import OpenAI
|
|
||||||
from vllm.assets.audio import AudioAsset
|
|
||||||
|
|
||||||
def encode_base64_content_from_url(content_url: str) -> str:
|
```python
|
||||||
"""Encode a content retrieved from a remote url to base64 format."""
|
import base64
|
||||||
|
import requests
|
||||||
|
from openai import OpenAI
|
||||||
|
from vllm.assets.audio import AudioAsset
|
||||||
|
|
||||||
with requests.get(content_url) as response:
|
def encode_base64_content_from_url(content_url: str) -> str:
|
||||||
response.raise_for_status()
|
"""Encode a content retrieved from a remote url to base64 format."""
|
||||||
result = base64.b64encode(response.content).decode('utf-8')
|
|
||||||
|
|
||||||
return result
|
with requests.get(content_url) as response:
|
||||||
|
response.raise_for_status()
|
||||||
|
result = base64.b64encode(response.content).decode('utf-8')
|
||||||
|
|
||||||
openai_api_key = "EMPTY"
|
return result
|
||||||
openai_api_base = "http://localhost:8000/v1"
|
|
||||||
|
|
||||||
client = OpenAI(
|
openai_api_key = "EMPTY"
|
||||||
api_key=openai_api_key,
|
openai_api_base = "http://localhost:8000/v1"
|
||||||
base_url=openai_api_base,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Any format supported by librosa is supported
|
client = OpenAI(
|
||||||
audio_url = AudioAsset("winning_call").url
|
api_key=openai_api_key,
|
||||||
audio_base64 = encode_base64_content_from_url(audio_url)
|
base_url=openai_api_base,
|
||||||
|
)
|
||||||
|
|
||||||
chat_completion_from_base64 = client.chat.completions.create(
|
# Any format supported by librosa is supported
|
||||||
messages=[{
|
audio_url = AudioAsset("winning_call").url
|
||||||
"role": "user",
|
audio_base64 = encode_base64_content_from_url(audio_url)
|
||||||
"content": [
|
|
||||||
{
|
chat_completion_from_base64 = client.chat.completions.create(
|
||||||
"type": "text",
|
messages=[{
|
||||||
"text": "What's in this audio?"
|
"role": "user",
|
||||||
},
|
"content": [
|
||||||
{
|
{
|
||||||
"type": "input_audio",
|
"type": "text",
|
||||||
"input_audio": {
|
"text": "What's in this audio?"
|
||||||
"data": audio_base64,
|
|
||||||
"format": "wav"
|
|
||||||
},
|
},
|
||||||
},
|
{
|
||||||
],
|
"type": "input_audio",
|
||||||
}],
|
"input_audio": {
|
||||||
model=model,
|
"data": audio_base64,
|
||||||
max_completion_tokens=64,
|
"format": "wav"
|
||||||
)
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}],
|
||||||
|
model=model,
|
||||||
|
max_completion_tokens=64,
|
||||||
|
)
|
||||||
|
|
||||||
result = chat_completion_from_base64.choices[0].message.content
|
result = chat_completion_from_base64.choices[0].message.content
|
||||||
print("Chat completion output from input audio:", result)
|
print("Chat completion output from input audio:", result)
|
||||||
```
|
```
|
||||||
|
|
||||||
Alternatively, you can pass `audio_url`, which is the audio counterpart of `image_url` for image input:
|
Alternatively, you can pass `audio_url`, which is the audio counterpart of `image_url` for image input:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
chat_completion_from_url = client.chat.completions.create(
|
|
||||||
messages=[{
|
|
||||||
"role": "user",
|
|
||||||
"content": [
|
|
||||||
{
|
|
||||||
"type": "text",
|
|
||||||
"text": "What's in this audio?"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "audio_url",
|
|
||||||
"audio_url": {
|
|
||||||
"url": audio_url
|
|
||||||
},
|
|
||||||
},
|
|
||||||
],
|
|
||||||
}],
|
|
||||||
model=model,
|
|
||||||
max_completion_tokens=64,
|
|
||||||
)
|
|
||||||
|
|
||||||
result = chat_completion_from_url.choices[0].message.content
|
```python
|
||||||
print("Chat completion output from audio url:", result)
|
chat_completion_from_url = client.chat.completions.create(
|
||||||
```
|
messages=[{
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": "What's in this audio?"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "audio_url",
|
||||||
|
"audio_url": {
|
||||||
|
"url": audio_url
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}],
|
||||||
|
model=model,
|
||||||
|
max_completion_tokens=64,
|
||||||
|
)
|
||||||
|
|
||||||
|
result = chat_completion_from_url.choices[0].message.content
|
||||||
|
print("Chat completion output from audio url:", result)
|
||||||
|
```
|
||||||
|
|
||||||
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
|
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
|
||||||
|
|
||||||
@ -470,61 +488,63 @@ pass a tensor of shape to the corresponding field of the multi-modal dictionary.
|
|||||||
For image embeddings, you can pass the base64-encoded tensor to the `image_embeds` field.
|
For image embeddings, you can pass the base64-encoded tensor to the `image_embeds` field.
|
||||||
The following example demonstrates how to pass image embeddings to the OpenAI server:
|
The following example demonstrates how to pass image embeddings to the OpenAI server:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
image_embedding = torch.load(...)
|
|
||||||
grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct
|
|
||||||
|
|
||||||
buffer = io.BytesIO()
|
```python
|
||||||
torch.save(image_embedding, buffer)
|
image_embedding = torch.load(...)
|
||||||
buffer.seek(0)
|
grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct
|
||||||
binary_data = buffer.read()
|
|
||||||
base64_image_embedding = base64.b64encode(binary_data).decode('utf-8')
|
|
||||||
|
|
||||||
client = OpenAI(
|
buffer = io.BytesIO()
|
||||||
# defaults to os.environ.get("OPENAI_API_KEY")
|
torch.save(image_embedding, buffer)
|
||||||
api_key=openai_api_key,
|
buffer.seek(0)
|
||||||
base_url=openai_api_base,
|
binary_data = buffer.read()
|
||||||
)
|
base64_image_embedding = base64.b64encode(binary_data).decode('utf-8')
|
||||||
|
|
||||||
# Basic usage - this is equivalent to the LLaVA example for offline inference
|
client = OpenAI(
|
||||||
model = "llava-hf/llava-1.5-7b-hf"
|
# defaults to os.environ.get("OPENAI_API_KEY")
|
||||||
embeds = {
|
api_key=openai_api_key,
|
||||||
"type": "image_embeds",
|
base_url=openai_api_base,
|
||||||
"image_embeds": f"{base64_image_embedding}"
|
)
|
||||||
}
|
|
||||||
|
|
||||||
# Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
|
# Basic usage - this is equivalent to the LLaVA example for offline inference
|
||||||
model = "Qwen/Qwen2-VL-2B-Instruct"
|
model = "llava-hf/llava-1.5-7b-hf"
|
||||||
embeds = {
|
embeds = {
|
||||||
"type": "image_embeds",
|
"type": "image_embeds",
|
||||||
"image_embeds": {
|
"image_embeds": f"{base64_image_embedding}"
|
||||||
"image_embeds": f"{base64_image_embedding}" , # Required
|
}
|
||||||
"image_grid_thw": f"{base64_image_grid_thw}" # Required by Qwen/Qwen2-VL-2B-Instruct
|
|
||||||
},
|
# Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
|
||||||
}
|
model = "Qwen/Qwen2-VL-2B-Instruct"
|
||||||
model = "openbmb/MiniCPM-V-2_6"
|
embeds = {
|
||||||
embeds = {
|
"type": "image_embeds",
|
||||||
"type": "image_embeds",
|
"image_embeds": {
|
||||||
"image_embeds": {
|
"image_embeds": f"{base64_image_embedding}" , # Required
|
||||||
"image_embeds": f"{base64_image_embedding}" , # Required
|
"image_grid_thw": f"{base64_image_grid_thw}" # Required by Qwen/Qwen2-VL-2B-Instruct
|
||||||
"image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6
|
|
||||||
},
|
|
||||||
}
|
|
||||||
chat_completion = client.chat.completions.create(
|
|
||||||
messages=[
|
|
||||||
{"role": "system", "content": "You are a helpful assistant."},
|
|
||||||
{"role": "user", "content": [
|
|
||||||
{
|
|
||||||
"type": "text",
|
|
||||||
"text": "What's in this image?",
|
|
||||||
},
|
},
|
||||||
embeds,
|
}
|
||||||
],
|
model = "openbmb/MiniCPM-V-2_6"
|
||||||
},
|
embeds = {
|
||||||
],
|
"type": "image_embeds",
|
||||||
model=model,
|
"image_embeds": {
|
||||||
)
|
"image_embeds": f"{base64_image_embedding}" , # Required
|
||||||
```
|
"image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6
|
||||||
|
},
|
||||||
|
}
|
||||||
|
chat_completion = client.chat.completions.create(
|
||||||
|
messages=[
|
||||||
|
{"role": "system", "content": "You are a helpful assistant."},
|
||||||
|
{"role": "user", "content": [
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": "What's in this image?",
|
||||||
|
},
|
||||||
|
embeds,
|
||||||
|
],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
model=model,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
Only one message can contain `{"type": "image_embeds"}`.
|
Only one message can contain `{"type": "image_embeds"}`.
|
||||||
|
|||||||
@ -15,29 +15,31 @@ pip install autoawq
|
|||||||
|
|
||||||
After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:
|
After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from awq import AutoAWQForCausalLM
|
|
||||||
from transformers import AutoTokenizer
|
|
||||||
|
|
||||||
model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
|
```python
|
||||||
quant_path = 'mistral-instruct-v0.2-awq'
|
from awq import AutoAWQForCausalLM
|
||||||
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
# Load model
|
model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
|
||||||
model = AutoAWQForCausalLM.from_pretrained(
|
quant_path = 'mistral-instruct-v0.2-awq'
|
||||||
model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
|
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
|
||||||
)
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
|
||||||
|
|
||||||
# Quantize
|
# Load model
|
||||||
model.quantize(tokenizer, quant_config=quant_config)
|
model = AutoAWQForCausalLM.from_pretrained(
|
||||||
|
model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
|
||||||
|
)
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
||||||
|
|
||||||
# Save quantized model
|
# Quantize
|
||||||
model.save_quantized(quant_path)
|
model.quantize(tokenizer, quant_config=quant_config)
|
||||||
tokenizer.save_pretrained(quant_path)
|
|
||||||
|
|
||||||
print(f'Model is quantized and saved at "{quant_path}"')
|
# Save quantized model
|
||||||
```
|
model.save_quantized(quant_path)
|
||||||
|
tokenizer.save_pretrained(quant_path)
|
||||||
|
|
||||||
|
print(f'Model is quantized and saved at "{quant_path}"')
|
||||||
|
```
|
||||||
|
|
||||||
To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:
|
To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:
|
||||||
|
|
||||||
@ -49,27 +51,29 @@ python examples/offline_inference/llm_engine_example.py \
|
|||||||
|
|
||||||
AWQ models are also supported directly through the LLM entrypoint:
|
AWQ models are also supported directly through the LLM entrypoint:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from vllm import LLM, SamplingParams
|
|
||||||
|
|
||||||
# Sample prompts.
|
```python
|
||||||
prompts = [
|
from vllm import LLM, SamplingParams
|
||||||
"Hello, my name is",
|
|
||||||
"The president of the United States is",
|
|
||||||
"The capital of France is",
|
|
||||||
"The future of AI is",
|
|
||||||
]
|
|
||||||
# Create a sampling params object.
|
|
||||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
|
||||||
|
|
||||||
# Create an LLM.
|
# Sample prompts.
|
||||||
llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ")
|
prompts = [
|
||||||
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
"Hello, my name is",
|
||||||
# that contain the prompt, generated text, and other information.
|
"The president of the United States is",
|
||||||
outputs = llm.generate(prompts, sampling_params)
|
"The capital of France is",
|
||||||
# Print the outputs.
|
"The future of AI is",
|
||||||
for output in outputs:
|
]
|
||||||
prompt = output.prompt
|
# Create a sampling params object.
|
||||||
generated_text = output.outputs[0].text
|
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
|
||||||
```
|
# Create an LLM.
|
||||||
|
llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ")
|
||||||
|
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||||
|
# that contain the prompt, generated text, and other information.
|
||||||
|
outputs = llm.generate(prompts, sampling_params)
|
||||||
|
# Print the outputs.
|
||||||
|
for output in outputs:
|
||||||
|
prompt = output.prompt
|
||||||
|
generated_text = output.outputs[0].text
|
||||||
|
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||||
|
```
|
||||||
|
|||||||
@ -43,17 +43,19 @@ llm = LLM(
|
|||||||
|
|
||||||
## Read gptq format checkpoint
|
## Read gptq format checkpoint
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from vllm import LLM
|
|
||||||
import torch
|
|
||||||
|
|
||||||
# "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint.
|
```python
|
||||||
model_id = "hxbgsyxh/llama-13b-4bit-g-1"
|
from vllm import LLM
|
||||||
llm = LLM(
|
import torch
|
||||||
model=model_id,
|
|
||||||
dtype=torch.float16,
|
# "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint.
|
||||||
trust_remote_code=True,
|
model_id = "hxbgsyxh/llama-13b-4bit-g-1"
|
||||||
quantization="bitblas",
|
llm = LLM(
|
||||||
max_model_len=1024
|
model=model_id,
|
||||||
)
|
dtype=torch.float16,
|
||||||
```
|
trust_remote_code=True,
|
||||||
|
quantization="bitblas",
|
||||||
|
max_model_len=1024
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|||||||
@ -58,22 +58,24 @@ For FP8 quantization, we can recover accuracy with simple RTN quantization. We r
|
|||||||
|
|
||||||
Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow.
|
Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow.
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from llmcompressor.transformers import oneshot
|
|
||||||
from llmcompressor.modifiers.quantization import QuantizationModifier
|
|
||||||
|
|
||||||
# Configure the simple PTQ quantization
|
```python
|
||||||
recipe = QuantizationModifier(
|
from llmcompressor.transformers import oneshot
|
||||||
targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
|
from llmcompressor.modifiers.quantization import QuantizationModifier
|
||||||
|
|
||||||
# Apply the quantization algorithm.
|
# Configure the simple PTQ quantization
|
||||||
oneshot(model=model, recipe=recipe)
|
recipe = QuantizationModifier(
|
||||||
|
targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
|
||||||
|
|
||||||
# Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic
|
# Apply the quantization algorithm.
|
||||||
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
|
oneshot(model=model, recipe=recipe)
|
||||||
model.save_pretrained(SAVE_DIR)
|
|
||||||
tokenizer.save_pretrained(SAVE_DIR)
|
# Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic
|
||||||
```
|
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
|
||||||
|
model.save_pretrained(SAVE_DIR)
|
||||||
|
tokenizer.save_pretrained(SAVE_DIR)
|
||||||
|
```
|
||||||
|
|
||||||
### 3. Evaluating Accuracy
|
### 3. Evaluating Accuracy
|
||||||
|
|
||||||
|
|||||||
@ -41,42 +41,44 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
|
|||||||
|
|
||||||
You can also use the GGUF model directly through the LLM entrypoint:
|
You can also use the GGUF model directly through the LLM entrypoint:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from vllm import LLM, SamplingParams
|
|
||||||
|
|
||||||
# In this script, we demonstrate how to pass input to the chat method:
|
```python
|
||||||
conversation = [
|
from vllm import LLM, SamplingParams
|
||||||
{
|
|
||||||
"role": "system",
|
|
||||||
"content": "You are a helpful assistant"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": "Hello"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"role": "assistant",
|
|
||||||
"content": "Hello! How can I assist you today?"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": "Write an essay about the importance of higher education.",
|
|
||||||
},
|
|
||||||
]
|
|
||||||
|
|
||||||
# Create a sampling params object.
|
# In this script, we demonstrate how to pass input to the chat method:
|
||||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
conversation = [
|
||||||
|
{
|
||||||
|
"role": "system",
|
||||||
|
"content": "You are a helpful assistant"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "Hello"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": "Hello! How can I assist you today?"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "Write an essay about the importance of higher education.",
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
# Create an LLM.
|
# Create a sampling params object.
|
||||||
llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
|
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||||
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
|
|
||||||
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
|
||||||
# that contain the prompt, generated text, and other information.
|
|
||||||
outputs = llm.chat(conversation, sampling_params)
|
|
||||||
|
|
||||||
# Print the outputs.
|
# Create an LLM.
|
||||||
for output in outputs:
|
llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
|
||||||
prompt = output.prompt
|
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
|
||||||
generated_text = output.outputs[0].text
|
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
# that contain the prompt, generated text, and other information.
|
||||||
```
|
outputs = llm.chat(conversation, sampling_params)
|
||||||
|
|
||||||
|
# Print the outputs.
|
||||||
|
for output in outputs:
|
||||||
|
prompt = output.prompt
|
||||||
|
generated_text = output.outputs[0].text
|
||||||
|
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||||
|
```
|
||||||
|
|||||||
@ -31,28 +31,30 @@ After installing GPTQModel, you are ready to quantize a model. Please refer to t
|
|||||||
|
|
||||||
Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
|
Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from datasets import load_dataset
|
|
||||||
from gptqmodel import GPTQModel, QuantizeConfig
|
|
||||||
|
|
||||||
model_id = "meta-llama/Llama-3.2-1B-Instruct"
|
```python
|
||||||
quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit"
|
from datasets import load_dataset
|
||||||
|
from gptqmodel import GPTQModel, QuantizeConfig
|
||||||
|
|
||||||
calibration_dataset = load_dataset(
|
model_id = "meta-llama/Llama-3.2-1B-Instruct"
|
||||||
"allenai/c4",
|
quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit"
|
||||||
data_files="en/c4-train.00001-of-01024.json.gz",
|
|
||||||
split="train"
|
|
||||||
).select(range(1024))["text"]
|
|
||||||
|
|
||||||
quant_config = QuantizeConfig(bits=4, group_size=128)
|
calibration_dataset = load_dataset(
|
||||||
|
"allenai/c4",
|
||||||
|
data_files="en/c4-train.00001-of-01024.json.gz",
|
||||||
|
split="train"
|
||||||
|
).select(range(1024))["text"]
|
||||||
|
|
||||||
model = GPTQModel.load(model_id, quant_config)
|
quant_config = QuantizeConfig(bits=4, group_size=128)
|
||||||
|
|
||||||
# increase `batch_size` to match gpu/vram specs to speed up quantization
|
model = GPTQModel.load(model_id, quant_config)
|
||||||
model.quantize(calibration_dataset, batch_size=2)
|
|
||||||
|
|
||||||
model.save(quant_path)
|
# increase `batch_size` to match gpu/vram specs to speed up quantization
|
||||||
```
|
model.quantize(calibration_dataset, batch_size=2)
|
||||||
|
|
||||||
|
model.save(quant_path)
|
||||||
|
```
|
||||||
|
|
||||||
## Running a quantized model with vLLM
|
## Running a quantized model with vLLM
|
||||||
|
|
||||||
@ -67,32 +69,34 @@ python examples/offline_inference/llm_engine_example.py \
|
|||||||
|
|
||||||
GPTQModel quantized models are also supported directly through the LLM entrypoint:
|
GPTQModel quantized models are also supported directly through the LLM entrypoint:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from vllm import LLM, SamplingParams
|
|
||||||
|
|
||||||
# Sample prompts.
|
```python
|
||||||
prompts = [
|
from vllm import LLM, SamplingParams
|
||||||
"Hello, my name is",
|
|
||||||
"The president of the United States is",
|
|
||||||
"The capital of France is",
|
|
||||||
"The future of AI is",
|
|
||||||
]
|
|
||||||
|
|
||||||
# Create a sampling params object.
|
# Sample prompts.
|
||||||
sampling_params = SamplingParams(temperature=0.6, top_p=0.9)
|
prompts = [
|
||||||
|
"Hello, my name is",
|
||||||
|
"The president of the United States is",
|
||||||
|
"The capital of France is",
|
||||||
|
"The future of AI is",
|
||||||
|
]
|
||||||
|
|
||||||
# Create an LLM.
|
# Create a sampling params object.
|
||||||
llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")
|
sampling_params = SamplingParams(temperature=0.6, top_p=0.9)
|
||||||
|
|
||||||
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
# Create an LLM.
|
||||||
# that contain the prompt, generated text, and other information.
|
llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")
|
||||||
outputs = llm.generate(prompts, sampling_params)
|
|
||||||
|
|
||||||
# Print the outputs.
|
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||||
print("-"*50)
|
# that contain the prompt, generated text, and other information.
|
||||||
for output in outputs:
|
outputs = llm.generate(prompts, sampling_params)
|
||||||
prompt = output.prompt
|
|
||||||
generated_text = output.outputs[0].text
|
# Print the outputs.
|
||||||
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
|
|
||||||
print("-"*50)
|
print("-"*50)
|
||||||
```
|
for output in outputs:
|
||||||
|
prompt = output.prompt
|
||||||
|
generated_text = output.outputs[0].text
|
||||||
|
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
|
||||||
|
print("-"*50)
|
||||||
|
```
|
||||||
|
|||||||
@ -53,51 +53,55 @@ When quantizing weights to INT4, you need sample data to estimate the weight upd
|
|||||||
It's best to use calibration data that closely matches your deployment data.
|
It's best to use calibration data that closely matches your deployment data.
|
||||||
For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
|
For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from datasets import load_dataset
|
|
||||||
|
|
||||||
NUM_CALIBRATION_SAMPLES = 512
|
```python
|
||||||
MAX_SEQUENCE_LENGTH = 2048
|
from datasets import load_dataset
|
||||||
|
|
||||||
# Load and preprocess the dataset
|
NUM_CALIBRATION_SAMPLES = 512
|
||||||
ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
|
MAX_SEQUENCE_LENGTH = 2048
|
||||||
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
|
|
||||||
|
|
||||||
def preprocess(example):
|
# Load and preprocess the dataset
|
||||||
return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
|
ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
|
||||||
ds = ds.map(preprocess)
|
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
|
||||||
|
|
||||||
def tokenize(sample):
|
def preprocess(example):
|
||||||
return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
|
return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
|
||||||
ds = ds.map(tokenize, remove_columns=ds.column_names)
|
ds = ds.map(preprocess)
|
||||||
```
|
|
||||||
|
def tokenize(sample):
|
||||||
|
return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
|
||||||
|
ds = ds.map(tokenize, remove_columns=ds.column_names)
|
||||||
|
```
|
||||||
|
|
||||||
### 3. Applying Quantization
|
### 3. Applying Quantization
|
||||||
|
|
||||||
Now, apply the quantization algorithms:
|
Now, apply the quantization algorithms:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from llmcompressor.transformers import oneshot
|
|
||||||
from llmcompressor.modifiers.quantization import GPTQModifier
|
|
||||||
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
|
|
||||||
|
|
||||||
# Configure the quantization algorithms
|
```python
|
||||||
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
|
from llmcompressor.transformers import oneshot
|
||||||
|
from llmcompressor.modifiers.quantization import GPTQModifier
|
||||||
|
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
|
||||||
|
|
||||||
# Apply quantization
|
# Configure the quantization algorithms
|
||||||
oneshot(
|
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
|
||||||
model=model,
|
|
||||||
dataset=ds,
|
|
||||||
recipe=recipe,
|
|
||||||
max_seq_length=MAX_SEQUENCE_LENGTH,
|
|
||||||
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128
|
# Apply quantization
|
||||||
SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
|
oneshot(
|
||||||
model.save_pretrained(SAVE_DIR, save_compressed=True)
|
model=model,
|
||||||
tokenizer.save_pretrained(SAVE_DIR)
|
dataset=ds,
|
||||||
```
|
recipe=recipe,
|
||||||
|
max_seq_length=MAX_SEQUENCE_LENGTH,
|
||||||
|
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128
|
||||||
|
SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
|
||||||
|
model.save_pretrained(SAVE_DIR, save_compressed=True)
|
||||||
|
tokenizer.save_pretrained(SAVE_DIR)
|
||||||
|
```
|
||||||
|
|
||||||
This process creates a W4A16 model with weights quantized to 4-bit integers.
|
This process creates a W4A16 model with weights quantized to 4-bit integers.
|
||||||
|
|
||||||
@ -137,34 +141,36 @@ $ lm_eval --model vllm \
|
|||||||
|
|
||||||
The following is an example of an expanded quantization recipe you can tune to your own use case:
|
The following is an example of an expanded quantization recipe you can tune to your own use case:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from compressed_tensors.quantization import (
|
|
||||||
QuantizationArgs,
|
```python
|
||||||
QuantizationScheme,
|
from compressed_tensors.quantization import (
|
||||||
QuantizationStrategy,
|
QuantizationArgs,
|
||||||
QuantizationType,
|
QuantizationScheme,
|
||||||
)
|
QuantizationStrategy,
|
||||||
recipe = GPTQModifier(
|
QuantizationType,
|
||||||
targets="Linear",
|
)
|
||||||
config_groups={
|
recipe = GPTQModifier(
|
||||||
"config_group": QuantizationScheme(
|
targets="Linear",
|
||||||
targets=["Linear"],
|
config_groups={
|
||||||
weights=QuantizationArgs(
|
"config_group": QuantizationScheme(
|
||||||
num_bits=4,
|
targets=["Linear"],
|
||||||
type=QuantizationType.INT,
|
weights=QuantizationArgs(
|
||||||
strategy=QuantizationStrategy.GROUP,
|
num_bits=4,
|
||||||
group_size=128,
|
type=QuantizationType.INT,
|
||||||
symmetric=True,
|
strategy=QuantizationStrategy.GROUP,
|
||||||
dynamic=False,
|
group_size=128,
|
||||||
actorder="weight",
|
symmetric=True,
|
||||||
|
dynamic=False,
|
||||||
|
actorder="weight",
|
||||||
|
),
|
||||||
),
|
),
|
||||||
),
|
},
|
||||||
},
|
ignore=["lm_head"],
|
||||||
ignore=["lm_head"],
|
update_size=NUM_CALIBRATION_SAMPLES,
|
||||||
update_size=NUM_CALIBRATION_SAMPLES,
|
dampening_frac=0.01
|
||||||
dampening_frac=0.01
|
)
|
||||||
)
|
```
|
||||||
```
|
|
||||||
|
|
||||||
## Troubleshooting and Support
|
## Troubleshooting and Support
|
||||||
|
|
||||||
|
|||||||
@ -54,54 +54,60 @@ When quantizing activations to INT8, you need sample data to estimate the activa
|
|||||||
It's best to use calibration data that closely matches your deployment data.
|
It's best to use calibration data that closely matches your deployment data.
|
||||||
For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
|
For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from datasets import load_dataset
|
|
||||||
|
|
||||||
NUM_CALIBRATION_SAMPLES = 512
|
```python
|
||||||
MAX_SEQUENCE_LENGTH = 2048
|
from datasets import load_dataset
|
||||||
|
|
||||||
# Load and preprocess the dataset
|
NUM_CALIBRATION_SAMPLES = 512
|
||||||
ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
|
MAX_SEQUENCE_LENGTH = 2048
|
||||||
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
|
|
||||||
|
|
||||||
def preprocess(example):
|
# Load and preprocess the dataset
|
||||||
return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
|
ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
|
||||||
ds = ds.map(preprocess)
|
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
|
||||||
|
|
||||||
def tokenize(sample):
|
def preprocess(example):
|
||||||
return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
|
return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
|
||||||
ds = ds.map(tokenize, remove_columns=ds.column_names)
|
ds = ds.map(preprocess)
|
||||||
```
|
|
||||||
|
def tokenize(sample):
|
||||||
|
return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
|
||||||
|
ds = ds.map(tokenize, remove_columns=ds.column_names)
|
||||||
|
```
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
### 3. Applying Quantization
|
### 3. Applying Quantization
|
||||||
|
|
||||||
Now, apply the quantization algorithms:
|
Now, apply the quantization algorithms:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from llmcompressor.transformers import oneshot
|
|
||||||
from llmcompressor.modifiers.quantization import GPTQModifier
|
|
||||||
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
|
|
||||||
|
|
||||||
# Configure the quantization algorithms
|
```python
|
||||||
recipe = [
|
from llmcompressor.transformers import oneshot
|
||||||
SmoothQuantModifier(smoothing_strength=0.8),
|
from llmcompressor.modifiers.quantization import GPTQModifier
|
||||||
GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
|
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
|
||||||
]
|
|
||||||
|
|
||||||
# Apply quantization
|
# Configure the quantization algorithms
|
||||||
oneshot(
|
recipe = [
|
||||||
model=model,
|
SmoothQuantModifier(smoothing_strength=0.8),
|
||||||
dataset=ds,
|
GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
|
||||||
recipe=recipe,
|
]
|
||||||
max_seq_length=MAX_SEQUENCE_LENGTH,
|
|
||||||
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token
|
# Apply quantization
|
||||||
SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
|
oneshot(
|
||||||
model.save_pretrained(SAVE_DIR, save_compressed=True)
|
model=model,
|
||||||
tokenizer.save_pretrained(SAVE_DIR)
|
dataset=ds,
|
||||||
```
|
recipe=recipe,
|
||||||
|
max_seq_length=MAX_SEQUENCE_LENGTH,
|
||||||
|
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token
|
||||||
|
SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
|
||||||
|
model.save_pretrained(SAVE_DIR, save_compressed=True)
|
||||||
|
tokenizer.save_pretrained(SAVE_DIR)
|
||||||
|
```
|
||||||
|
|
||||||
This process creates a W8A8 model with weights and activations quantized to 8-bit integers.
|
This process creates a W8A8 model with weights and activations quantized to 8-bit integers.
|
||||||
|
|
||||||
|
|||||||
@ -14,24 +14,26 @@ You can quantize HuggingFace models using the example scripts provided in the Te
|
|||||||
|
|
||||||
Below is an example showing how to quantize a model using modelopt's PTQ API:
|
Below is an example showing how to quantize a model using modelopt's PTQ API:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
import modelopt.torch.quantization as mtq
|
|
||||||
from transformers import AutoModelForCausalLM
|
|
||||||
|
|
||||||
# Load the model from HuggingFace
|
```python
|
||||||
model = AutoModelForCausalLM.from_pretrained("<path_or_model_id>")
|
import modelopt.torch.quantization as mtq
|
||||||
|
from transformers import AutoModelForCausalLM
|
||||||
|
|
||||||
# Select the quantization config, for example, FP8
|
# Load the model from HuggingFace
|
||||||
config = mtq.FP8_DEFAULT_CFG
|
model = AutoModelForCausalLM.from_pretrained("<path_or_model_id>")
|
||||||
|
|
||||||
# Define a forward loop function for calibration
|
# Select the quantization config, for example, FP8
|
||||||
def forward_loop(model):
|
config = mtq.FP8_DEFAULT_CFG
|
||||||
for data in calib_set:
|
|
||||||
model(data)
|
|
||||||
|
|
||||||
# PTQ with in-place replacement of quantized modules
|
# Define a forward loop function for calibration
|
||||||
model = mtq.quantize(model, config, forward_loop)
|
def forward_loop(model):
|
||||||
```
|
for data in calib_set:
|
||||||
|
model(data)
|
||||||
|
|
||||||
|
# PTQ with in-place replacement of quantized modules
|
||||||
|
model = mtq.quantize(model, config, forward_loop)
|
||||||
|
```
|
||||||
|
|
||||||
After the model is quantized, you can export it to a quantized checkpoint using the export API:
|
After the model is quantized, you can export it to a quantized checkpoint using the export API:
|
||||||
|
|
||||||
@ -48,31 +50,33 @@ with torch.inference_mode():
|
|||||||
|
|
||||||
The quantized checkpoint can then be deployed with vLLM. As an example, the following code shows how to deploy `nvidia/Llama-3.1-8B-Instruct-FP8`, which is the FP8 quantized checkpoint derived from `meta-llama/Llama-3.1-8B-Instruct`, using vLLM:
|
The quantized checkpoint can then be deployed with vLLM. As an example, the following code shows how to deploy `nvidia/Llama-3.1-8B-Instruct-FP8`, which is the FP8 quantized checkpoint derived from `meta-llama/Llama-3.1-8B-Instruct`, using vLLM:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from vllm import LLM, SamplingParams
|
|
||||||
|
|
||||||
def main():
|
```python
|
||||||
|
from vllm import LLM, SamplingParams
|
||||||
|
|
||||||
model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
|
def main():
|
||||||
# Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
|
|
||||||
llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
|
|
||||||
|
|
||||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
|
model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
|
||||||
|
# Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
|
||||||
|
llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
|
||||||
|
|
||||||
prompts = [
|
sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
|
||||||
"Hello, my name is",
|
|
||||||
"The president of the United States is",
|
|
||||||
"The capital of France is",
|
|
||||||
"The future of AI is",
|
|
||||||
]
|
|
||||||
|
|
||||||
outputs = llm.generate(prompts, sampling_params)
|
prompts = [
|
||||||
|
"Hello, my name is",
|
||||||
|
"The president of the United States is",
|
||||||
|
"The capital of France is",
|
||||||
|
"The future of AI is",
|
||||||
|
]
|
||||||
|
|
||||||
for output in outputs:
|
outputs = llm.generate(prompts, sampling_params)
|
||||||
prompt = output.prompt
|
|
||||||
generated_text = output.outputs[0].text
|
|
||||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
for output in outputs:
|
||||||
main()
|
prompt = output.prompt
|
||||||
```
|
generated_text = output.outputs[0].text
|
||||||
|
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
```
|
||||||
|
|||||||
@ -35,20 +35,22 @@ Studies have shown that FP8 E4M3 quantization typically only minimally degrades
|
|||||||
|
|
||||||
Here is an example of how to enable FP8 quantization:
|
Here is an example of how to enable FP8 quantization:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
# To calculate kv cache scales on the fly enable the calculate_kv_scales
|
|
||||||
# parameter
|
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
```python
|
||||||
|
# To calculate kv cache scales on the fly enable the calculate_kv_scales
|
||||||
|
# parameter
|
||||||
|
|
||||||
sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
|
from vllm import LLM, SamplingParams
|
||||||
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
|
|
||||||
kv_cache_dtype="fp8",
|
sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
|
||||||
calculate_kv_scales=True)
|
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
|
||||||
prompt = "London is the capital of"
|
kv_cache_dtype="fp8",
|
||||||
out = llm.generate(prompt, sampling_params)[0].outputs[0].text
|
calculate_kv_scales=True)
|
||||||
print(out)
|
prompt = "London is the capital of"
|
||||||
```
|
out = llm.generate(prompt, sampling_params)[0].outputs[0].text
|
||||||
|
print(out)
|
||||||
|
```
|
||||||
|
|
||||||
The `kv_cache_dtype` argument specifies the data type for KV cache storage:
|
The `kv_cache_dtype` argument specifies the data type for KV cache storage:
|
||||||
- `"auto"`: Uses the model's default "unquantized" data type
|
- `"auto"`: Uses the model's default "unquantized" data type
|
||||||
@ -71,67 +73,69 @@ pip install llmcompressor
|
|||||||
|
|
||||||
Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models can use this same pattern):
|
Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models can use this same pattern):
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from datasets import load_dataset
|
|
||||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
||||||
from llmcompressor.transformers import oneshot
|
|
||||||
|
|
||||||
# Select model and load it
|
```python
|
||||||
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
|
from datasets import load_dataset
|
||||||
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
from llmcompressor.transformers import oneshot
|
||||||
|
|
||||||
# Select calibration dataset
|
# Select model and load it
|
||||||
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
|
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
|
||||||
DATASET_SPLIT = "train_sft"
|
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
||||||
|
|
||||||
# Configure calibration parameters
|
# Select calibration dataset
|
||||||
NUM_CALIBRATION_SAMPLES = 512 # 512 samples is a good starting point
|
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
|
||||||
MAX_SEQUENCE_LENGTH = 2048
|
DATASET_SPLIT = "train_sft"
|
||||||
|
|
||||||
# Load and preprocess dataset
|
# Configure calibration parameters
|
||||||
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
|
NUM_CALIBRATION_SAMPLES = 512 # 512 samples is a good starting point
|
||||||
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
|
MAX_SEQUENCE_LENGTH = 2048
|
||||||
|
|
||||||
def process_and_tokenize(example):
|
# Load and preprocess dataset
|
||||||
text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
|
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
|
||||||
return tokenizer(
|
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
|
||||||
text,
|
|
||||||
padding=False,
|
def process_and_tokenize(example):
|
||||||
max_length=MAX_SEQUENCE_LENGTH,
|
text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
|
||||||
truncation=True,
|
return tokenizer(
|
||||||
add_special_tokens=False,
|
text,
|
||||||
|
padding=False,
|
||||||
|
max_length=MAX_SEQUENCE_LENGTH,
|
||||||
|
truncation=True,
|
||||||
|
add_special_tokens=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
ds = ds.map(process_and_tokenize, remove_columns=ds.column_names)
|
||||||
|
|
||||||
|
# Configure quantization settings
|
||||||
|
recipe = """
|
||||||
|
quant_stage:
|
||||||
|
quant_modifiers:
|
||||||
|
QuantizationModifier:
|
||||||
|
kv_cache_scheme:
|
||||||
|
num_bits: 8
|
||||||
|
type: float
|
||||||
|
strategy: tensor
|
||||||
|
dynamic: false
|
||||||
|
symmetric: true
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Apply quantization
|
||||||
|
oneshot(
|
||||||
|
model=model,
|
||||||
|
dataset=ds,
|
||||||
|
recipe=recipe,
|
||||||
|
max_seq_length=MAX_SEQUENCE_LENGTH,
|
||||||
|
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
|
||||||
)
|
)
|
||||||
|
|
||||||
ds = ds.map(process_and_tokenize, remove_columns=ds.column_names)
|
# Save quantized model: Llama-3.1-8B-Instruct-FP8-KV
|
||||||
|
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
|
||||||
# Configure quantization settings
|
model.save_pretrained(SAVE_DIR, save_compressed=True)
|
||||||
recipe = """
|
tokenizer.save_pretrained(SAVE_DIR)
|
||||||
quant_stage:
|
```
|
||||||
quant_modifiers:
|
|
||||||
QuantizationModifier:
|
|
||||||
kv_cache_scheme:
|
|
||||||
num_bits: 8
|
|
||||||
type: float
|
|
||||||
strategy: tensor
|
|
||||||
dynamic: false
|
|
||||||
symmetric: true
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Apply quantization
|
|
||||||
oneshot(
|
|
||||||
model=model,
|
|
||||||
dataset=ds,
|
|
||||||
recipe=recipe,
|
|
||||||
max_seq_length=MAX_SEQUENCE_LENGTH,
|
|
||||||
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Save quantized model: Llama-3.1-8B-Instruct-FP8-KV
|
|
||||||
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
|
|
||||||
model.save_pretrained(SAVE_DIR, save_compressed=True)
|
|
||||||
tokenizer.save_pretrained(SAVE_DIR)
|
|
||||||
```
|
|
||||||
|
|
||||||
The above script will create a folder in your current directory containing your quantized model (e.g., `Llama-3.1-8B-Instruct-FP8-KV`) with calibrated scales.
|
The above script will create a folder in your current directory containing your quantized model (e.g., `Llama-3.1-8B-Instruct-FP8-KV`) with calibrated scales.
|
||||||
|
|
||||||
|
|||||||
@ -42,20 +42,22 @@ The Quark quantization process can be listed for 5 steps as below:
|
|||||||
Quark uses [Transformers](https://huggingface.co/docs/transformers/en/index)
|
Quark uses [Transformers](https://huggingface.co/docs/transformers/en/index)
|
||||||
to fetch model and tokenizer.
|
to fetch model and tokenizer.
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
|
||||||
|
|
||||||
MODEL_ID = "meta-llama/Llama-2-70b-chat-hf"
|
```python
|
||||||
MAX_SEQ_LEN = 512
|
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||||
|
|
||||||
model = AutoModelForCausalLM.from_pretrained(
|
MODEL_ID = "meta-llama/Llama-2-70b-chat-hf"
|
||||||
MODEL_ID, device_map="auto", torch_dtype="auto",
|
MAX_SEQ_LEN = 512
|
||||||
)
|
|
||||||
model.eval()
|
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, model_max_length=MAX_SEQ_LEN)
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
tokenizer.pad_token = tokenizer.eos_token
|
MODEL_ID, device_map="auto", torch_dtype="auto",
|
||||||
```
|
)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, model_max_length=MAX_SEQ_LEN)
|
||||||
|
tokenizer.pad_token = tokenizer.eos_token
|
||||||
|
```
|
||||||
|
|
||||||
### 2. Prepare the Calibration Dataloader
|
### 2. Prepare the Calibration Dataloader
|
||||||
|
|
||||||
@ -63,22 +65,24 @@ Quark uses the [PyTorch Dataloader](https://pytorch.org/tutorials/beginner/basic
|
|||||||
to load calibration data. For more details about how to use calibration datasets efficiently, please refer
|
to load calibration data. For more details about how to use calibration datasets efficiently, please refer
|
||||||
to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calibration_datasets.html).
|
to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calibration_datasets.html).
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from datasets import load_dataset
|
|
||||||
from torch.utils.data import DataLoader
|
|
||||||
|
|
||||||
BATCH_SIZE = 1
|
```python
|
||||||
NUM_CALIBRATION_DATA = 512
|
from datasets import load_dataset
|
||||||
|
from torch.utils.data import DataLoader
|
||||||
|
|
||||||
# Load the dataset and get calibration data.
|
BATCH_SIZE = 1
|
||||||
dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
|
NUM_CALIBRATION_DATA = 512
|
||||||
text_data = dataset["text"][:NUM_CALIBRATION_DATA]
|
|
||||||
|
|
||||||
tokenized_outputs = tokenizer(text_data, return_tensors="pt",
|
# Load the dataset and get calibration data.
|
||||||
padding=True, truncation=True, max_length=MAX_SEQ_LEN)
|
dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
|
||||||
calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
|
text_data = dataset["text"][:NUM_CALIBRATION_DATA]
|
||||||
batch_size=BATCH_SIZE, drop_last=True)
|
|
||||||
```
|
tokenized_outputs = tokenizer(text_data, return_tensors="pt",
|
||||||
|
padding=True, truncation=True, max_length=MAX_SEQ_LEN)
|
||||||
|
calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
|
||||||
|
batch_size=BATCH_SIZE, drop_last=True)
|
||||||
|
```
|
||||||
|
|
||||||
### 3. Set the Quantization Configuration
|
### 3. Set the Quantization Configuration
|
||||||
|
|
||||||
@ -94,42 +98,44 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
|
|||||||
AutoSmoothQuant config file for Llama is
|
AutoSmoothQuant config file for Llama is
|
||||||
`examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`.
|
`examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`.
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from quark.torch.quantization import (Config, QuantizationConfig,
|
|
||||||
FP8E4M3PerTensorSpec,
|
|
||||||
load_quant_algo_config_from_file)
|
|
||||||
|
|
||||||
# Define fp8/per-tensor/static spec.
|
```python
|
||||||
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
|
from quark.torch.quantization import (Config, QuantizationConfig,
|
||||||
is_dynamic=False).to_quantization_spec()
|
FP8E4M3PerTensorSpec,
|
||||||
|
load_quant_algo_config_from_file)
|
||||||
|
|
||||||
# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
|
# Define fp8/per-tensor/static spec.
|
||||||
global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
|
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
|
||||||
weight=FP8_PER_TENSOR_SPEC)
|
is_dynamic=False).to_quantization_spec()
|
||||||
|
|
||||||
# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
|
# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
|
||||||
KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
|
global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
|
||||||
kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
|
weight=FP8_PER_TENSOR_SPEC)
|
||||||
kv_cache_quant_config = {name :
|
|
||||||
QuantizationConfig(input_tensors=global_quant_config.input_tensors,
|
|
||||||
weight=global_quant_config.weight,
|
|
||||||
output_tensors=KV_CACHE_SPEC)
|
|
||||||
for name in kv_cache_layer_names_for_llama}
|
|
||||||
layer_quant_config = kv_cache_quant_config.copy()
|
|
||||||
|
|
||||||
# Define algorithm config by config file.
|
# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
|
||||||
LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE =
|
KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
|
||||||
'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
|
kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
|
||||||
algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)
|
kv_cache_quant_config = {name :
|
||||||
|
QuantizationConfig(input_tensors=global_quant_config.input_tensors,
|
||||||
|
weight=global_quant_config.weight,
|
||||||
|
output_tensors=KV_CACHE_SPEC)
|
||||||
|
for name in kv_cache_layer_names_for_llama}
|
||||||
|
layer_quant_config = kv_cache_quant_config.copy()
|
||||||
|
|
||||||
EXCLUDE_LAYERS = ["lm_head"]
|
# Define algorithm config by config file.
|
||||||
quant_config = Config(
|
LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE =
|
||||||
global_quant_config=global_quant_config,
|
'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
|
||||||
layer_quant_config=layer_quant_config,
|
algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)
|
||||||
kv_cache_quant_config=kv_cache_quant_config,
|
|
||||||
exclude=EXCLUDE_LAYERS,
|
EXCLUDE_LAYERS = ["lm_head"]
|
||||||
algo_config=algo_config)
|
quant_config = Config(
|
||||||
```
|
global_quant_config=global_quant_config,
|
||||||
|
layer_quant_config=layer_quant_config,
|
||||||
|
kv_cache_quant_config=kv_cache_quant_config,
|
||||||
|
exclude=EXCLUDE_LAYERS,
|
||||||
|
algo_config=algo_config)
|
||||||
|
```
|
||||||
|
|
||||||
### 4. Quantize the Model and Export
|
### 4. Quantize the Model and Export
|
||||||
|
|
||||||
@ -139,63 +145,67 @@ HuggingFace `safetensors`, you can refer to
|
|||||||
[HuggingFace format exporting](https://quark.docs.amd.com/latest/pytorch/export/quark_export_hf.html)
|
[HuggingFace format exporting](https://quark.docs.amd.com/latest/pytorch/export/quark_export_hf.html)
|
||||||
for more exporting format details.
|
for more exporting format details.
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
import torch
|
|
||||||
from quark.torch import ModelQuantizer, ModelExporter
|
|
||||||
from quark.torch.export import ExporterConfig, JsonExporterConfig
|
|
||||||
|
|
||||||
# Apply quantization.
|
```python
|
||||||
quantizer = ModelQuantizer(quant_config)
|
import torch
|
||||||
quant_model = quantizer.quantize_model(model, calib_dataloader)
|
from quark.torch import ModelQuantizer, ModelExporter
|
||||||
|
from quark.torch.export import ExporterConfig, JsonExporterConfig
|
||||||
|
|
||||||
# Freeze quantized model to export.
|
# Apply quantization.
|
||||||
freezed_model = quantizer.freeze(model)
|
quantizer = ModelQuantizer(quant_config)
|
||||||
|
quant_model = quantizer.quantize_model(model, calib_dataloader)
|
||||||
|
|
||||||
# Define export config.
|
# Freeze quantized model to export.
|
||||||
LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"]
|
freezed_model = quantizer.freeze(model)
|
||||||
export_config = ExporterConfig(json_export_config=JsonExporterConfig())
|
|
||||||
export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP
|
|
||||||
|
|
||||||
# Model: Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant
|
# Define export config.
|
||||||
EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
|
LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"]
|
||||||
exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
|
export_config = ExporterConfig(json_export_config=JsonExporterConfig())
|
||||||
with torch.no_grad():
|
export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP
|
||||||
exporter.export_safetensors_model(freezed_model,
|
|
||||||
quant_config=quant_config, tokenizer=tokenizer)
|
# Model: Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant
|
||||||
```
|
EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
|
||||||
|
exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
|
||||||
|
with torch.no_grad():
|
||||||
|
exporter.export_safetensors_model(freezed_model,
|
||||||
|
quant_config=quant_config, tokenizer=tokenizer)
|
||||||
|
```
|
||||||
|
|
||||||
### 5. Evaluation in vLLM
|
### 5. Evaluation in vLLM
|
||||||
|
|
||||||
Now, you can load and run the Quark quantized model directly through the LLM entrypoint:
|
Now, you can load and run the Quark quantized model directly through the LLM entrypoint:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from vllm import LLM, SamplingParams
|
|
||||||
|
|
||||||
# Sample prompts.
|
```python
|
||||||
prompts = [
|
from vllm import LLM, SamplingParams
|
||||||
"Hello, my name is",
|
|
||||||
"The president of the United States is",
|
|
||||||
"The capital of France is",
|
|
||||||
"The future of AI is",
|
|
||||||
]
|
|
||||||
# Create a sampling params object.
|
|
||||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
|
||||||
|
|
||||||
# Create an LLM.
|
# Sample prompts.
|
||||||
llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
|
prompts = [
|
||||||
kv_cache_dtype='fp8',quantization='quark')
|
"Hello, my name is",
|
||||||
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
"The president of the United States is",
|
||||||
# that contain the prompt, generated text, and other information.
|
"The capital of France is",
|
||||||
outputs = llm.generate(prompts, sampling_params)
|
"The future of AI is",
|
||||||
# Print the outputs.
|
]
|
||||||
print("\nGenerated Outputs:\n" + "-" * 60)
|
# Create a sampling params object.
|
||||||
for output in outputs:
|
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||||
prompt = output.prompt
|
|
||||||
generated_text = output.outputs[0].text
|
# Create an LLM.
|
||||||
print(f"Prompt: {prompt!r}")
|
llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
|
||||||
print(f"Output: {generated_text!r}")
|
kv_cache_dtype='fp8',quantization='quark')
|
||||||
print("-" * 60)
|
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||||
```
|
# that contain the prompt, generated text, and other information.
|
||||||
|
outputs = llm.generate(prompts, sampling_params)
|
||||||
|
# Print the outputs.
|
||||||
|
print("\nGenerated Outputs:\n" + "-" * 60)
|
||||||
|
for output in outputs:
|
||||||
|
prompt = output.prompt
|
||||||
|
generated_text = output.outputs[0].text
|
||||||
|
print(f"Prompt: {prompt!r}")
|
||||||
|
print(f"Output: {generated_text!r}")
|
||||||
|
print("-" * 60)
|
||||||
|
```
|
||||||
|
|
||||||
Or, you can use `lm_eval` to evaluate accuracy:
|
Or, you can use `lm_eval` to evaluate accuracy:
|
||||||
|
|
||||||
|
|||||||
@ -15,26 +15,28 @@ pip install \
|
|||||||
## Quantizing HuggingFace Models
|
## Quantizing HuggingFace Models
|
||||||
You can quantize your own HuggingFace model with torchao, e.g. [transformers](https://huggingface.co/docs/transformers/main/en/quantization/torchao) and [diffusers](https://huggingface.co/docs/diffusers/en/quantization/torchao), and save the checkpoint to the HuggingFace Hub like [this](https://huggingface.co/jerryzh168/llama3-8b-int8wo) with the following example code:
|
You can quantize your own HuggingFace model with torchao, e.g. [transformers](https://huggingface.co/docs/transformers/main/en/quantization/torchao) and [diffusers](https://huggingface.co/docs/diffusers/en/quantization/torchao), and save the checkpoint to the HuggingFace Hub like [this](https://huggingface.co/jerryzh168/llama3-8b-int8wo) with the following example code:
|
||||||
|
|
||||||
```Python
|
??? Code
|
||||||
import torch
|
|
||||||
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
|
|
||||||
from torchao.quantization import Int8WeightOnlyConfig
|
|
||||||
|
|
||||||
model_name = "meta-llama/Meta-Llama-3-8B"
|
```Python
|
||||||
quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
|
import torch
|
||||||
quantized_model = AutoModelForCausalLM.from_pretrained(
|
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
|
||||||
model_name,
|
from torchao.quantization import Int8WeightOnlyConfig
|
||||||
torch_dtype="auto",
|
|
||||||
device_map="auto",
|
|
||||||
quantization_config=quantization_config
|
|
||||||
)
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
|
||||||
input_text = "What are we having for dinner?"
|
|
||||||
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
|
|
||||||
|
|
||||||
hub_repo = # YOUR HUB REPO ID
|
model_name = "meta-llama/Meta-Llama-3-8B"
|
||||||
tokenizer.push_to_hub(hub_repo)
|
quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
|
||||||
quantized_model.push_to_hub(hub_repo, safe_serialization=False)
|
quantized_model = AutoModelForCausalLM.from_pretrained(
|
||||||
```
|
model_name,
|
||||||
|
torch_dtype="auto",
|
||||||
|
device_map="auto",
|
||||||
|
quantization_config=quantization_config
|
||||||
|
)
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||||
|
input_text = "What are we having for dinner?"
|
||||||
|
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
|
||||||
|
|
||||||
|
hub_repo = # YOUR HUB REPO ID
|
||||||
|
tokenizer.push_to_hub(hub_repo)
|
||||||
|
quantized_model.push_to_hub(hub_repo, safe_serialization=False)
|
||||||
|
```
|
||||||
|
|
||||||
Alternatively, you can use the [TorchAO Quantization space](https://huggingface.co/spaces/medmekk/TorchAO_Quantization) for quantizing models with a simple UI.
|
Alternatively, you can use the [TorchAO Quantization space](https://huggingface.co/spaces/medmekk/TorchAO_Quantization) for quantizing models with a simple UI.
|
||||||
|
|||||||
@ -33,34 +33,36 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
|
|||||||
|
|
||||||
Next, make a request to the model that should return the reasoning content in the response.
|
Next, make a request to the model that should return the reasoning content in the response.
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from openai import OpenAI
|
|
||||||
|
|
||||||
# Modify OpenAI's API key and API base to use vLLM's API server.
|
```python
|
||||||
openai_api_key = "EMPTY"
|
from openai import OpenAI
|
||||||
openai_api_base = "http://localhost:8000/v1"
|
|
||||||
|
|
||||||
client = OpenAI(
|
# Modify OpenAI's API key and API base to use vLLM's API server.
|
||||||
api_key=openai_api_key,
|
openai_api_key = "EMPTY"
|
||||||
base_url=openai_api_base,
|
openai_api_base = "http://localhost:8000/v1"
|
||||||
)
|
|
||||||
|
|
||||||
models = client.models.list()
|
client = OpenAI(
|
||||||
model = models.data[0].id
|
api_key=openai_api_key,
|
||||||
|
base_url=openai_api_base,
|
||||||
|
)
|
||||||
|
|
||||||
# Round 1
|
models = client.models.list()
|
||||||
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
|
model = models.data[0].id
|
||||||
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
|
|
||||||
# For Qwen3 series, if you want to disable thinking in reasoning mode, add:
|
|
||||||
# extra_body={"chat_template_kwargs": {"enable_thinking": False}}
|
|
||||||
response = client.chat.completions.create(model=model, messages=messages)
|
|
||||||
|
|
||||||
reasoning_content = response.choices[0].message.reasoning_content
|
# Round 1
|
||||||
content = response.choices[0].message.content
|
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
|
||||||
|
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
|
||||||
|
# For Qwen3 series, if you want to disable thinking in reasoning mode, add:
|
||||||
|
# extra_body={"chat_template_kwargs": {"enable_thinking": False}}
|
||||||
|
response = client.chat.completions.create(model=model, messages=messages)
|
||||||
|
|
||||||
print("reasoning_content:", reasoning_content)
|
reasoning_content = response.choices[0].message.reasoning_content
|
||||||
print("content:", content)
|
content = response.choices[0].message.content
|
||||||
```
|
|
||||||
|
print("reasoning_content:", reasoning_content)
|
||||||
|
print("content:", content)
|
||||||
|
```
|
||||||
|
|
||||||
The `reasoning_content` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion.
|
The `reasoning_content` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion.
|
||||||
|
|
||||||
@ -68,77 +70,81 @@ The `reasoning_content` field contains the reasoning steps that led to the final
|
|||||||
|
|
||||||
Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming).
|
Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming).
|
||||||
|
|
||||||
```json
|
??? Json
|
||||||
{
|
|
||||||
"id": "chatcmpl-123",
|
```json
|
||||||
"object": "chat.completion.chunk",
|
{
|
||||||
"created": 1694268190,
|
"id": "chatcmpl-123",
|
||||||
"model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
|
"object": "chat.completion.chunk",
|
||||||
"system_fingerprint": "fp_44709d6fcb",
|
"created": 1694268190,
|
||||||
"choices": [
|
"model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
|
||||||
{
|
"system_fingerprint": "fp_44709d6fcb",
|
||||||
"index": 0,
|
"choices": [
|
||||||
"delta": {
|
{
|
||||||
"role": "assistant",
|
"index": 0,
|
||||||
"reasoning_content": "is",
|
"delta": {
|
||||||
},
|
"role": "assistant",
|
||||||
"logprobs": null,
|
"reasoning_content": "is",
|
||||||
"finish_reason": null
|
},
|
||||||
}
|
"logprobs": null,
|
||||||
]
|
"finish_reason": null
|
||||||
}
|
}
|
||||||
```
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
The OpenAI Python client library does not officially support the `reasoning_content` attribute for streaming output, but the client supports extra attributes in the response. You can use `hasattr` to check whether the `reasoning_content` attribute is present in the response. For example:
|
The OpenAI Python client library does not officially support the `reasoning_content` attribute for streaming output, but the client supports extra attributes in the response. You can use `hasattr` to check whether the `reasoning_content` attribute is present in the response. For example:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from openai import OpenAI
|
|
||||||
|
|
||||||
# Modify OpenAI's API key and API base to use vLLM's API server.
|
```python
|
||||||
openai_api_key = "EMPTY"
|
from openai import OpenAI
|
||||||
openai_api_base = "http://localhost:8000/v1"
|
|
||||||
|
|
||||||
client = OpenAI(
|
# Modify OpenAI's API key and API base to use vLLM's API server.
|
||||||
api_key=openai_api_key,
|
openai_api_key = "EMPTY"
|
||||||
base_url=openai_api_base,
|
openai_api_base = "http://localhost:8000/v1"
|
||||||
)
|
|
||||||
|
|
||||||
models = client.models.list()
|
client = OpenAI(
|
||||||
model = models.data[0].id
|
api_key=openai_api_key,
|
||||||
|
base_url=openai_api_base,
|
||||||
|
)
|
||||||
|
|
||||||
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
|
models = client.models.list()
|
||||||
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
|
model = models.data[0].id
|
||||||
# For Qwen3 series, if you want to disable thinking in reasoning mode, add:
|
|
||||||
# extra_body={"chat_template_kwargs": {"enable_thinking": False}}
|
|
||||||
stream = client.chat.completions.create(model=model,
|
|
||||||
messages=messages,
|
|
||||||
stream=True)
|
|
||||||
|
|
||||||
print("client: Start streaming chat completions...")
|
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
|
||||||
printed_reasoning_content = False
|
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
|
||||||
printed_content = False
|
# For Qwen3 series, if you want to disable thinking in reasoning mode, add:
|
||||||
|
# extra_body={"chat_template_kwargs": {"enable_thinking": False}}
|
||||||
|
stream = client.chat.completions.create(model=model,
|
||||||
|
messages=messages,
|
||||||
|
stream=True)
|
||||||
|
|
||||||
for chunk in stream:
|
print("client: Start streaming chat completions...")
|
||||||
reasoning_content = None
|
printed_reasoning_content = False
|
||||||
content = None
|
printed_content = False
|
||||||
# Check the content is reasoning_content or content
|
|
||||||
if hasattr(chunk.choices[0].delta, "reasoning_content"):
|
|
||||||
reasoning_content = chunk.choices[0].delta.reasoning_content
|
|
||||||
elif hasattr(chunk.choices[0].delta, "content"):
|
|
||||||
content = chunk.choices[0].delta.content
|
|
||||||
|
|
||||||
if reasoning_content is not None:
|
for chunk in stream:
|
||||||
if not printed_reasoning_content:
|
reasoning_content = None
|
||||||
printed_reasoning_content = True
|
content = None
|
||||||
print("reasoning_content:", end="", flush=True)
|
# Check the content is reasoning_content or content
|
||||||
print(reasoning_content, end="", flush=True)
|
if hasattr(chunk.choices[0].delta, "reasoning_content"):
|
||||||
elif content is not None:
|
reasoning_content = chunk.choices[0].delta.reasoning_content
|
||||||
if not printed_content:
|
elif hasattr(chunk.choices[0].delta, "content"):
|
||||||
printed_content = True
|
content = chunk.choices[0].delta.content
|
||||||
print("\ncontent:", end="", flush=True)
|
|
||||||
# Extract and print the content
|
if reasoning_content is not None:
|
||||||
print(content, end="", flush=True)
|
if not printed_reasoning_content:
|
||||||
```
|
printed_reasoning_content = True
|
||||||
|
print("reasoning_content:", end="", flush=True)
|
||||||
|
print(reasoning_content, end="", flush=True)
|
||||||
|
elif content is not None:
|
||||||
|
if not printed_content:
|
||||||
|
printed_content = True
|
||||||
|
print("\ncontent:", end="", flush=True)
|
||||||
|
# Extract and print the content
|
||||||
|
print(content, end="", flush=True)
|
||||||
|
```
|
||||||
|
|
||||||
Remember to check whether the `reasoning_content` exists in the response before accessing it. You can check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
|
Remember to check whether the `reasoning_content` exists in the response before accessing it. You can check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
|
||||||
|
|
||||||
@ -146,41 +152,43 @@ Remember to check whether the `reasoning_content` exists in the response before
|
|||||||
|
|
||||||
The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`.
|
The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`.
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from openai import OpenAI
|
|
||||||
|
|
||||||
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
|
```python
|
||||||
|
from openai import OpenAI
|
||||||
|
|
||||||
tools = [{
|
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
|
||||||
"type": "function",
|
|
||||||
"function": {
|
tools = [{
|
||||||
"name": "get_weather",
|
"type": "function",
|
||||||
"description": "Get the current weather in a given location",
|
"function": {
|
||||||
"parameters": {
|
"name": "get_weather",
|
||||||
"type": "object",
|
"description": "Get the current weather in a given location",
|
||||||
"properties": {
|
"parameters": {
|
||||||
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
|
"type": "object",
|
||||||
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
|
"properties": {
|
||||||
},
|
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
|
||||||
"required": ["location", "unit"]
|
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
|
||||||
|
},
|
||||||
|
"required": ["location", "unit"]
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}]
|
||||||
}]
|
|
||||||
|
|
||||||
response = client.chat.completions.create(
|
response = client.chat.completions.create(
|
||||||
model=client.models.list().data[0].id,
|
model=client.models.list().data[0].id,
|
||||||
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
|
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
|
||||||
tools=tools,
|
tools=tools,
|
||||||
tool_choice="auto"
|
tool_choice="auto"
|
||||||
)
|
)
|
||||||
|
|
||||||
print(response)
|
print(response)
|
||||||
tool_call = response.choices[0].message.tool_calls[0].function
|
tool_call = response.choices[0].message.tool_calls[0].function
|
||||||
|
|
||||||
print(f"reasoning_content: {response.choices[0].message.reasoning_content}")
|
print(f"reasoning_content: {response.choices[0].message.reasoning_content}")
|
||||||
print(f"Function called: {tool_call.name}")
|
print(f"Function called: {tool_call.name}")
|
||||||
print(f"Arguments: {tool_call.arguments}")
|
print(f"Arguments: {tool_call.arguments}")
|
||||||
```
|
```
|
||||||
|
|
||||||
For more examples, please refer to <gh-file:examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py>.
|
For more examples, please refer to <gh-file:examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py>.
|
||||||
|
|
||||||
@ -192,85 +200,89 @@ For more examples, please refer to <gh-file:examples/online_serving/openai_chat_
|
|||||||
|
|
||||||
You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.
|
You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
# import the required packages
|
|
||||||
|
|
||||||
from vllm.reasoning import ReasoningParser, ReasoningParserManager
|
```python
|
||||||
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
|
# import the required packages
|
||||||
DeltaMessage)
|
|
||||||
|
|
||||||
# define a reasoning parser and register it to vllm
|
from vllm.reasoning import ReasoningParser, ReasoningParserManager
|
||||||
# the name list in register_module can be used
|
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
|
||||||
# in --reasoning-parser.
|
DeltaMessage)
|
||||||
@ReasoningParserManager.register_module(["example"])
|
|
||||||
class ExampleParser(ReasoningParser):
|
|
||||||
def __init__(self, tokenizer: AnyTokenizer):
|
|
||||||
super().__init__(tokenizer)
|
|
||||||
|
|
||||||
def extract_reasoning_content_streaming(
|
# define a reasoning parser and register it to vllm
|
||||||
self,
|
# the name list in register_module can be used
|
||||||
previous_text: str,
|
# in --reasoning-parser.
|
||||||
current_text: str,
|
@ReasoningParserManager.register_module(["example"])
|
||||||
delta_text: str,
|
class ExampleParser(ReasoningParser):
|
||||||
previous_token_ids: Sequence[int],
|
def __init__(self, tokenizer: AnyTokenizer):
|
||||||
current_token_ids: Sequence[int],
|
super().__init__(tokenizer)
|
||||||
delta_token_ids: Sequence[int],
|
|
||||||
) -> Union[DeltaMessage, None]:
|
|
||||||
"""
|
|
||||||
Instance method that should be implemented for extracting reasoning
|
|
||||||
from an incomplete response; for use when handling reasoning calls and
|
|
||||||
streaming. Has to be an instance method because it requires state -
|
|
||||||
the current tokens/diffs, but also the information about what has
|
|
||||||
previously been parsed and extracted (see constructor)
|
|
||||||
"""
|
|
||||||
|
|
||||||
def extract_reasoning_content(
|
def extract_reasoning_content_streaming(
|
||||||
self, model_output: str, request: ChatCompletionRequest
|
self,
|
||||||
) -> tuple[Optional[str], Optional[str]]:
|
previous_text: str,
|
||||||
"""
|
current_text: str,
|
||||||
Extract reasoning content from a complete model-generated string.
|
delta_text: str,
|
||||||
|
previous_token_ids: Sequence[int],
|
||||||
|
current_token_ids: Sequence[int],
|
||||||
|
delta_token_ids: Sequence[int],
|
||||||
|
) -> Union[DeltaMessage, None]:
|
||||||
|
"""
|
||||||
|
Instance method that should be implemented for extracting reasoning
|
||||||
|
from an incomplete response; for use when handling reasoning calls and
|
||||||
|
streaming. Has to be an instance method because it requires state -
|
||||||
|
the current tokens/diffs, but also the information about what has
|
||||||
|
previously been parsed and extracted (see constructor)
|
||||||
|
"""
|
||||||
|
|
||||||
Used for non-streaming responses where we have the entire model response
|
def extract_reasoning_content(
|
||||||
available before sending to the client.
|
self, model_output: str, request: ChatCompletionRequest
|
||||||
|
) -> tuple[Optional[str], Optional[str]]:
|
||||||
|
"""
|
||||||
|
Extract reasoning content from a complete model-generated string.
|
||||||
|
|
||||||
Parameters:
|
Used for non-streaming responses where we have the entire model response
|
||||||
model_output: str
|
available before sending to the client.
|
||||||
The model-generated string to extract reasoning content from.
|
|
||||||
|
|
||||||
request: ChatCompletionRequest
|
Parameters:
|
||||||
The request object that was used to generate the model_output.
|
model_output: str
|
||||||
|
The model-generated string to extract reasoning content from.
|
||||||
|
|
||||||
Returns:
|
request: ChatCompletionRequest
|
||||||
tuple[Optional[str], Optional[str]]
|
The request object that was used to generate the model_output.
|
||||||
A tuple containing the reasoning content and the content.
|
|
||||||
"""
|
Returns:
|
||||||
```
|
tuple[Optional[str], Optional[str]]
|
||||||
|
A tuple containing the reasoning content and the content.
|
||||||
|
"""
|
||||||
|
```
|
||||||
|
|
||||||
Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.
|
Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
@dataclass
|
|
||||||
class DeepSeekReasoner(Reasoner):
|
|
||||||
"""
|
|
||||||
Reasoner for DeepSeek R series models.
|
|
||||||
"""
|
|
||||||
start_token_id: int
|
|
||||||
end_token_id: int
|
|
||||||
|
|
||||||
start_token: str = "<think>"
|
```python
|
||||||
end_token: str = "</think>"
|
@dataclass
|
||||||
|
class DeepSeekReasoner(Reasoner):
|
||||||
|
"""
|
||||||
|
Reasoner for DeepSeek R series models.
|
||||||
|
"""
|
||||||
|
start_token_id: int
|
||||||
|
end_token_id: int
|
||||||
|
|
||||||
@classmethod
|
start_token: str = "<think>"
|
||||||
def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner:
|
end_token: str = "</think>"
|
||||||
return cls(start_token_id=tokenizer.encode(
|
|
||||||
"<think>", add_special_tokens=False)[0],
|
|
||||||
end_token_id=tokenizer.encode("</think>",
|
|
||||||
add_special_tokens=False)[0])
|
|
||||||
|
|
||||||
def is_reasoning_end(self, input_ids: list[int]) -> bool:
|
@classmethod
|
||||||
return self.end_token_id in input_ids
|
def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner:
|
||||||
...
|
return cls(start_token_id=tokenizer.encode(
|
||||||
```
|
"<think>", add_special_tokens=False)[0],
|
||||||
|
end_token_id=tokenizer.encode("</think>",
|
||||||
|
add_special_tokens=False)[0])
|
||||||
|
|
||||||
|
def is_reasoning_end(self, input_ids: list[int]) -> bool:
|
||||||
|
return self.end_token_id in input_ids
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
A structured output engine such as [xgrammar](https://github.com/mlc-ai/xgrammar) will use `end_token_id` to check whether the reasoning content is present in the model output and skip the structured output if that is the case.
|
A structured output engine such as [xgrammar](https://github.com/mlc-ai/xgrammar) will use `end_token_id` to check whether the reasoning content is present in the model output and skip the structured output if that is the case.
|
||||||
|
|
||||||
|
|||||||
@ -18,29 +18,31 @@ Speculative decoding is a technique which improves inter-token latency in memory
|
|||||||
|
|
||||||
The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time.
|
The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time.
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from vllm import LLM, SamplingParams
|
|
||||||
|
|
||||||
prompts = [
|
```python
|
||||||
"The future of AI is",
|
from vllm import LLM, SamplingParams
|
||||||
]
|
|
||||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
|
||||||
|
|
||||||
llm = LLM(
|
prompts = [
|
||||||
model="facebook/opt-6.7b",
|
"The future of AI is",
|
||||||
tensor_parallel_size=1,
|
]
|
||||||
speculative_config={
|
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||||
"model": "facebook/opt-125m",
|
|
||||||
"num_speculative_tokens": 5,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
outputs = llm.generate(prompts, sampling_params)
|
|
||||||
|
|
||||||
for output in outputs:
|
llm = LLM(
|
||||||
prompt = output.prompt
|
model="facebook/opt-6.7b",
|
||||||
generated_text = output.outputs[0].text
|
tensor_parallel_size=1,
|
||||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
speculative_config={
|
||||||
```
|
"model": "facebook/opt-125m",
|
||||||
|
"num_speculative_tokens": 5,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
outputs = llm.generate(prompts, sampling_params)
|
||||||
|
|
||||||
|
for output in outputs:
|
||||||
|
prompt = output.prompt
|
||||||
|
generated_text = output.outputs[0].text
|
||||||
|
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||||
|
```
|
||||||
|
|
||||||
To do the same in online mode, launch the server:
|
To do the same in online mode, launch the server:
|
||||||
|
|
||||||
@ -60,69 +62,73 @@ python -m vllm.entrypoints.openai.api_server \
|
|||||||
|
|
||||||
Then use a client:
|
Then use a client:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from openai import OpenAI
|
|
||||||
|
|
||||||
# Modify OpenAI's API key and API base to use vLLM's API server.
|
```python
|
||||||
openai_api_key = "EMPTY"
|
from openai import OpenAI
|
||||||
openai_api_base = "http://localhost:8000/v1"
|
|
||||||
|
|
||||||
client = OpenAI(
|
# Modify OpenAI's API key and API base to use vLLM's API server.
|
||||||
# defaults to os.environ.get("OPENAI_API_KEY")
|
openai_api_key = "EMPTY"
|
||||||
api_key=openai_api_key,
|
openai_api_base = "http://localhost:8000/v1"
|
||||||
base_url=openai_api_base,
|
|
||||||
)
|
|
||||||
|
|
||||||
models = client.models.list()
|
client = OpenAI(
|
||||||
model = models.data[0].id
|
# defaults to os.environ.get("OPENAI_API_KEY")
|
||||||
|
api_key=openai_api_key,
|
||||||
|
base_url=openai_api_base,
|
||||||
|
)
|
||||||
|
|
||||||
# Completion API
|
models = client.models.list()
|
||||||
stream = False
|
model = models.data[0].id
|
||||||
completion = client.completions.create(
|
|
||||||
model=model,
|
|
||||||
prompt="The future of AI is",
|
|
||||||
echo=False,
|
|
||||||
n=1,
|
|
||||||
stream=stream,
|
|
||||||
)
|
|
||||||
|
|
||||||
print("Completion results:")
|
# Completion API
|
||||||
if stream:
|
stream = False
|
||||||
for c in completion:
|
completion = client.completions.create(
|
||||||
print(c)
|
model=model,
|
||||||
else:
|
prompt="The future of AI is",
|
||||||
print(completion)
|
echo=False,
|
||||||
```
|
n=1,
|
||||||
|
stream=stream,
|
||||||
|
)
|
||||||
|
|
||||||
|
print("Completion results:")
|
||||||
|
if stream:
|
||||||
|
for c in completion:
|
||||||
|
print(c)
|
||||||
|
else:
|
||||||
|
print(completion)
|
||||||
|
```
|
||||||
|
|
||||||
## Speculating by matching n-grams in the prompt
|
## Speculating by matching n-grams in the prompt
|
||||||
|
|
||||||
The following code configures vLLM to use speculative decoding where proposals are generated by
|
The following code configures vLLM to use speculative decoding where proposals are generated by
|
||||||
matching n-grams in the prompt. For more information read [this thread.](https://x.com/joao_gante/status/1747322413006643259)
|
matching n-grams in the prompt. For more information read [this thread.](https://x.com/joao_gante/status/1747322413006643259)
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from vllm import LLM, SamplingParams
|
|
||||||
|
|
||||||
prompts = [
|
```python
|
||||||
"The future of AI is",
|
from vllm import LLM, SamplingParams
|
||||||
]
|
|
||||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
|
||||||
|
|
||||||
llm = LLM(
|
prompts = [
|
||||||
model="facebook/opt-6.7b",
|
"The future of AI is",
|
||||||
tensor_parallel_size=1,
|
]
|
||||||
speculative_config={
|
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||||
"method": "ngram",
|
|
||||||
"num_speculative_tokens": 5,
|
|
||||||
"prompt_lookup_max": 4,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
outputs = llm.generate(prompts, sampling_params)
|
|
||||||
|
|
||||||
for output in outputs:
|
llm = LLM(
|
||||||
prompt = output.prompt
|
model="facebook/opt-6.7b",
|
||||||
generated_text = output.outputs[0].text
|
tensor_parallel_size=1,
|
||||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
speculative_config={
|
||||||
```
|
"method": "ngram",
|
||||||
|
"num_speculative_tokens": 5,
|
||||||
|
"prompt_lookup_max": 4,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
outputs = llm.generate(prompts, sampling_params)
|
||||||
|
|
||||||
|
for output in outputs:
|
||||||
|
prompt = output.prompt
|
||||||
|
generated_text = output.outputs[0].text
|
||||||
|
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||||
|
```
|
||||||
|
|
||||||
## Speculating using MLP speculators
|
## Speculating using MLP speculators
|
||||||
|
|
||||||
@ -131,29 +137,31 @@ draft models that conditioning draft predictions on both context vectors and sam
|
|||||||
For more information see [this blog](https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/) or
|
For more information see [this blog](https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/) or
|
||||||
[this technical report](https://arxiv.org/abs/2404.19124).
|
[this technical report](https://arxiv.org/abs/2404.19124).
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from vllm import LLM, SamplingParams
|
|
||||||
|
|
||||||
prompts = [
|
```python
|
||||||
"The future of AI is",
|
from vllm import LLM, SamplingParams
|
||||||
]
|
|
||||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
|
||||||
|
|
||||||
llm = LLM(
|
prompts = [
|
||||||
model="meta-llama/Meta-Llama-3.1-70B-Instruct",
|
"The future of AI is",
|
||||||
tensor_parallel_size=4,
|
]
|
||||||
speculative_config={
|
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||||
"model": "ibm-ai-platform/llama3-70b-accelerator",
|
|
||||||
"draft_tensor_parallel_size": 1,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
outputs = llm.generate(prompts, sampling_params)
|
|
||||||
|
|
||||||
for output in outputs:
|
llm = LLM(
|
||||||
prompt = output.prompt
|
model="meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||||
generated_text = output.outputs[0].text
|
tensor_parallel_size=4,
|
||||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
speculative_config={
|
||||||
```
|
"model": "ibm-ai-platform/llama3-70b-accelerator",
|
||||||
|
"draft_tensor_parallel_size": 1,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
outputs = llm.generate(prompts, sampling_params)
|
||||||
|
|
||||||
|
for output in outputs:
|
||||||
|
prompt = output.prompt
|
||||||
|
generated_text = output.outputs[0].text
|
||||||
|
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||||
|
```
|
||||||
|
|
||||||
Note that these speculative models currently need to be run without tensor parallelism, although
|
Note that these speculative models currently need to be run without tensor parallelism, although
|
||||||
it is possible to run the main model using tensor parallelism (see example above). Since the
|
it is possible to run the main model using tensor parallelism (see example above). Since the
|
||||||
@ -177,31 +185,33 @@ A variety of speculative models of this type are available on HF hub:
|
|||||||
The following code configures vLLM to use speculative decoding where proposals are generated by
|
The following code configures vLLM to use speculative decoding where proposals are generated by
|
||||||
an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](gh-file:examples/offline_inference/eagle.py).
|
an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](gh-file:examples/offline_inference/eagle.py).
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from vllm import LLM, SamplingParams
|
|
||||||
|
|
||||||
prompts = [
|
```python
|
||||||
"The future of AI is",
|
from vllm import LLM, SamplingParams
|
||||||
]
|
|
||||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
|
||||||
|
|
||||||
llm = LLM(
|
prompts = [
|
||||||
model="meta-llama/Meta-Llama-3-8B-Instruct",
|
"The future of AI is",
|
||||||
tensor_parallel_size=4,
|
]
|
||||||
speculative_config={
|
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||||
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
|
|
||||||
"draft_tensor_parallel_size": 1,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
outputs = llm.generate(prompts, sampling_params)
|
llm = LLM(
|
||||||
|
model="meta-llama/Meta-Llama-3-8B-Instruct",
|
||||||
|
tensor_parallel_size=4,
|
||||||
|
speculative_config={
|
||||||
|
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
|
||||||
|
"draft_tensor_parallel_size": 1,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
for output in outputs:
|
outputs = llm.generate(prompts, sampling_params)
|
||||||
prompt = output.prompt
|
|
||||||
generated_text = output.outputs[0].text
|
|
||||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
|
||||||
|
|
||||||
```
|
for output in outputs:
|
||||||
|
prompt = output.prompt
|
||||||
|
generated_text = output.outputs[0].text
|
||||||
|
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
A few important things to consider when using the EAGLE based draft models:
|
A few important things to consider when using the EAGLE based draft models:
|
||||||
|
|
||||||
|
|||||||
@ -33,39 +33,43 @@ text.
|
|||||||
|
|
||||||
Now let's see an example for each of the cases, starting with `guided_choice`, as it's the easiest one:
|
Now let's see an example for each of the cases, starting with `guided_choice`, as it's the easiest one:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from openai import OpenAI
|
|
||||||
client = OpenAI(
|
|
||||||
base_url="http://localhost:8000/v1",
|
|
||||||
api_key="-",
|
|
||||||
)
|
|
||||||
model = client.models.list().data[0].id
|
|
||||||
|
|
||||||
completion = client.chat.completions.create(
|
```python
|
||||||
model=model,
|
from openai import OpenAI
|
||||||
messages=[
|
client = OpenAI(
|
||||||
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
|
base_url="http://localhost:8000/v1",
|
||||||
],
|
api_key="-",
|
||||||
extra_body={"guided_choice": ["positive", "negative"]},
|
)
|
||||||
)
|
model = client.models.list().data[0].id
|
||||||
print(completion.choices[0].message.content)
|
|
||||||
```
|
completion = client.chat.completions.create(
|
||||||
|
model=model,
|
||||||
|
messages=[
|
||||||
|
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
|
||||||
|
],
|
||||||
|
extra_body={"guided_choice": ["positive", "negative"]},
|
||||||
|
)
|
||||||
|
print(completion.choices[0].message.content)
|
||||||
|
```
|
||||||
|
|
||||||
The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template:
|
The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
completion = client.chat.completions.create(
|
|
||||||
model=model,
|
```python
|
||||||
messages=[
|
completion = client.chat.completions.create(
|
||||||
{
|
model=model,
|
||||||
"role": "user",
|
messages=[
|
||||||
"content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n",
|
{
|
||||||
}
|
"role": "user",
|
||||||
],
|
"content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n",
|
||||||
extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
|
}
|
||||||
)
|
],
|
||||||
print(completion.choices[0].message.content)
|
extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
|
||||||
```
|
)
|
||||||
|
print(completion.choices[0].message.content)
|
||||||
|
```
|
||||||
|
|
||||||
One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats.
|
One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats.
|
||||||
For this we can use the `guided_json` parameter in two different ways:
|
For this we can use the `guided_json` parameter in two different ways:
|
||||||
@ -75,41 +79,43 @@ For this we can use the `guided_json` parameter in two different ways:
|
|||||||
|
|
||||||
The next example shows how to use the `guided_json` parameter with a Pydantic model:
|
The next example shows how to use the `guided_json` parameter with a Pydantic model:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from pydantic import BaseModel
|
|
||||||
from enum import Enum
|
|
||||||
|
|
||||||
class CarType(str, Enum):
|
```python
|
||||||
sedan = "sedan"
|
from pydantic import BaseModel
|
||||||
suv = "SUV"
|
from enum import Enum
|
||||||
truck = "Truck"
|
|
||||||
coupe = "Coupe"
|
|
||||||
|
|
||||||
class CarDescription(BaseModel):
|
class CarType(str, Enum):
|
||||||
brand: str
|
sedan = "sedan"
|
||||||
model: str
|
suv = "SUV"
|
||||||
car_type: CarType
|
truck = "Truck"
|
||||||
|
coupe = "Coupe"
|
||||||
|
|
||||||
json_schema = CarDescription.model_json_schema()
|
class CarDescription(BaseModel):
|
||||||
|
brand: str
|
||||||
|
model: str
|
||||||
|
car_type: CarType
|
||||||
|
|
||||||
completion = client.chat.completions.create(
|
json_schema = CarDescription.model_json_schema()
|
||||||
model=model,
|
|
||||||
messages=[
|
completion = client.chat.completions.create(
|
||||||
{
|
model=model,
|
||||||
"role": "user",
|
messages=[
|
||||||
"content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
|
{
|
||||||
}
|
"role": "user",
|
||||||
],
|
"content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
|
||||||
"response_format": {
|
}
|
||||||
"type": "json_schema",
|
],
|
||||||
"json_schema": {
|
"response_format": {
|
||||||
"name": "car-description",
|
"type": "json_schema",
|
||||||
"schema": CarDescription.model_json_schema()
|
"json_schema": {
|
||||||
|
"name": "car-description",
|
||||||
|
"schema": CarDescription.model_json_schema()
|
||||||
|
},
|
||||||
},
|
},
|
||||||
},
|
)
|
||||||
)
|
print(completion.choices[0].message.content)
|
||||||
print(completion.choices[0].message.content)
|
```
|
||||||
```
|
|
||||||
|
|
||||||
!!! tip
|
!!! tip
|
||||||
While not strictly necessary, normally it's better to indicate in the prompt the
|
While not strictly necessary, normally it's better to indicate in the prompt the
|
||||||
@ -121,33 +127,35 @@ difficult to use, but it´s really powerful. It allows us to define complete
|
|||||||
languages like SQL queries. It works by using a context free EBNF grammar.
|
languages like SQL queries. It works by using a context free EBNF grammar.
|
||||||
As an example, we can use it to define a specific format of simplified SQL queries:
|
As an example, we can use it to define a specific format of simplified SQL queries:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
simplified_sql_grammar = """
|
|
||||||
root ::= select_statement
|
|
||||||
|
|
||||||
select_statement ::= "SELECT " column " from " table " where " condition
|
```python
|
||||||
|
simplified_sql_grammar = """
|
||||||
|
root ::= select_statement
|
||||||
|
|
||||||
column ::= "col_1 " | "col_2 "
|
select_statement ::= "SELECT " column " from " table " where " condition
|
||||||
|
|
||||||
table ::= "table_1 " | "table_2 "
|
column ::= "col_1 " | "col_2 "
|
||||||
|
|
||||||
condition ::= column "= " number
|
table ::= "table_1 " | "table_2 "
|
||||||
|
|
||||||
number ::= "1 " | "2 "
|
condition ::= column "= " number
|
||||||
"""
|
|
||||||
|
|
||||||
completion = client.chat.completions.create(
|
number ::= "1 " | "2 "
|
||||||
model=model,
|
"""
|
||||||
messages=[
|
|
||||||
{
|
completion = client.chat.completions.create(
|
||||||
"role": "user",
|
model=model,
|
||||||
"content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
|
messages=[
|
||||||
}
|
{
|
||||||
],
|
"role": "user",
|
||||||
extra_body={"guided_grammar": simplified_sql_grammar},
|
"content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
|
||||||
)
|
}
|
||||||
print(completion.choices[0].message.content)
|
],
|
||||||
```
|
extra_body={"guided_grammar": simplified_sql_grammar},
|
||||||
|
)
|
||||||
|
print(completion.choices[0].message.content)
|
||||||
|
```
|
||||||
|
|
||||||
See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)
|
See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)
|
||||||
|
|
||||||
@ -161,34 +169,36 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r
|
|||||||
|
|
||||||
Note that you can use reasoning with any provided structured outputs feature. The following uses one with JSON schema:
|
Note that you can use reasoning with any provided structured outputs feature. The following uses one with JSON schema:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from pydantic import BaseModel
|
|
||||||
|
```python
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
class People(BaseModel):
|
class People(BaseModel):
|
||||||
name: str
|
name: str
|
||||||
age: int
|
age: int
|
||||||
|
|
||||||
|
|
||||||
completion = client.chat.completions.create(
|
completion = client.chat.completions.create(
|
||||||
model=model,
|
model=model,
|
||||||
messages=[
|
messages=[
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
"content": "Generate a JSON with the name and age of one random person.",
|
"content": "Generate a JSON with the name and age of one random person.",
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
response_format={
|
response_format={
|
||||||
"type": "json_schema",
|
"type": "json_schema",
|
||||||
"json_schema": {
|
"json_schema": {
|
||||||
"name": "people",
|
"name": "people",
|
||||||
"schema": People.model_json_schema()
|
"schema": People.model_json_schema()
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
print("reasoning_content: ", completion.choices[0].message.reasoning_content)
|
print("reasoning_content: ", completion.choices[0].message.reasoning_content)
|
||||||
print("content: ", completion.choices[0].message.content)
|
print("content: ", completion.choices[0].message.content)
|
||||||
```
|
```
|
||||||
|
|
||||||
See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)
|
See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)
|
||||||
|
|
||||||
@ -202,33 +212,33 @@ For the following examples, vLLM was setup using `vllm serve meta-llama/Llama-3.
|
|||||||
|
|
||||||
Here is a simple example demonstrating how to get structured output using Pydantic models:
|
Here is a simple example demonstrating how to get structured output using Pydantic models:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from pydantic import BaseModel
|
|
||||||
from openai import OpenAI
|
|
||||||
|
|
||||||
class Info(BaseModel):
|
```python
|
||||||
name: str
|
from pydantic import BaseModel
|
||||||
age: int
|
from openai import OpenAI
|
||||||
|
|
||||||
client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
|
class Info(BaseModel):
|
||||||
model = client.models.list().data[0].id
|
name: str
|
||||||
completion = client.beta.chat.completions.parse(
|
age: int
|
||||||
model=model,
|
|
||||||
messages=[
|
|
||||||
{"role": "system", "content": "You are a helpful assistant."},
|
|
||||||
{"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"},
|
|
||||||
],
|
|
||||||
response_format=Info,
|
|
||||||
)
|
|
||||||
|
|
||||||
message = completion.choices[0].message
|
client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
|
||||||
print(message)
|
model = client.models.list().data[0].id
|
||||||
assert message.parsed
|
completion = client.beta.chat.completions.parse(
|
||||||
print("Name:", message.parsed.name)
|
model=model,
|
||||||
print("Age:", message.parsed.age)
|
messages=[
|
||||||
```
|
{"role": "system", "content": "You are a helpful assistant."},
|
||||||
|
{"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"},
|
||||||
|
],
|
||||||
|
response_format=Info,
|
||||||
|
)
|
||||||
|
|
||||||
Output:
|
message = completion.choices[0].message
|
||||||
|
print(message)
|
||||||
|
assert message.parsed
|
||||||
|
print("Name:", message.parsed.name)
|
||||||
|
print("Age:", message.parsed.age)
|
||||||
|
```
|
||||||
|
|
||||||
```console
|
```console
|
||||||
ParsedChatCompletionMessage[Testing](content='{"name": "Cameron", "age": 28}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=Testing(name='Cameron', age=28))
|
ParsedChatCompletionMessage[Testing](content='{"name": "Cameron", "age": 28}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=Testing(name='Cameron', age=28))
|
||||||
@ -238,35 +248,37 @@ Age: 28
|
|||||||
|
|
||||||
Here is a more complex example using nested Pydantic models to handle a step-by-step math solution:
|
Here is a more complex example using nested Pydantic models to handle a step-by-step math solution:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from typing import List
|
|
||||||
from pydantic import BaseModel
|
|
||||||
from openai import OpenAI
|
|
||||||
|
|
||||||
class Step(BaseModel):
|
```python
|
||||||
explanation: str
|
from typing import List
|
||||||
output: str
|
from pydantic import BaseModel
|
||||||
|
from openai import OpenAI
|
||||||
|
|
||||||
class MathResponse(BaseModel):
|
class Step(BaseModel):
|
||||||
steps: list[Step]
|
explanation: str
|
||||||
final_answer: str
|
output: str
|
||||||
|
|
||||||
completion = client.beta.chat.completions.parse(
|
class MathResponse(BaseModel):
|
||||||
model=model,
|
steps: list[Step]
|
||||||
messages=[
|
final_answer: str
|
||||||
{"role": "system", "content": "You are a helpful expert math tutor."},
|
|
||||||
{"role": "user", "content": "Solve 8x + 31 = 2."},
|
|
||||||
],
|
|
||||||
response_format=MathResponse,
|
|
||||||
)
|
|
||||||
|
|
||||||
message = completion.choices[0].message
|
completion = client.beta.chat.completions.parse(
|
||||||
print(message)
|
model=model,
|
||||||
assert message.parsed
|
messages=[
|
||||||
for i, step in enumerate(message.parsed.steps):
|
{"role": "system", "content": "You are a helpful expert math tutor."},
|
||||||
print(f"Step #{i}:", step)
|
{"role": "user", "content": "Solve 8x + 31 = 2."},
|
||||||
print("Answer:", message.parsed.final_answer)
|
],
|
||||||
```
|
response_format=MathResponse,
|
||||||
|
)
|
||||||
|
|
||||||
|
message = completion.choices[0].message
|
||||||
|
print(message)
|
||||||
|
assert message.parsed
|
||||||
|
for i, step in enumerate(message.parsed.steps):
|
||||||
|
print(f"Step #{i}:", step)
|
||||||
|
print("Answer:", message.parsed.final_answer)
|
||||||
|
```
|
||||||
|
|
||||||
Output:
|
Output:
|
||||||
|
|
||||||
@ -296,19 +308,21 @@ These parameters can be used in the same way as the parameters from the Online
|
|||||||
Serving examples above. One example for the usage of the `choice` parameter is
|
Serving examples above. One example for the usage of the `choice` parameter is
|
||||||
shown below:
|
shown below:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from vllm import LLM, SamplingParams
|
|
||||||
from vllm.sampling_params import GuidedDecodingParams
|
|
||||||
|
|
||||||
llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct")
|
```python
|
||||||
|
from vllm import LLM, SamplingParams
|
||||||
|
from vllm.sampling_params import GuidedDecodingParams
|
||||||
|
|
||||||
guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
|
llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct")
|
||||||
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
|
|
||||||
outputs = llm.generate(
|
guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
|
||||||
prompts="Classify this sentiment: vLLM is wonderful!",
|
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
|
||||||
sampling_params=sampling_params,
|
outputs = llm.generate(
|
||||||
)
|
prompts="Classify this sentiment: vLLM is wonderful!",
|
||||||
print(outputs[0].outputs[0].text)
|
sampling_params=sampling_params,
|
||||||
```
|
)
|
||||||
|
print(outputs[0].outputs[0].text)
|
||||||
|
```
|
||||||
|
|
||||||
See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)
|
See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)
|
||||||
|
|||||||
@ -15,44 +15,46 @@ vllm serve meta-llama/Llama-3.1-8B-Instruct \
|
|||||||
|
|
||||||
Next, make a request to the model that should result in it using the available tools:
|
Next, make a request to the model that should result in it using the available tools:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from openai import OpenAI
|
|
||||||
import json
|
|
||||||
|
|
||||||
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
|
```python
|
||||||
|
from openai import OpenAI
|
||||||
|
import json
|
||||||
|
|
||||||
def get_weather(location: str, unit: str):
|
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
|
||||||
return f"Getting the weather for {location} in {unit}..."
|
|
||||||
tool_functions = {"get_weather": get_weather}
|
|
||||||
|
|
||||||
tools = [{
|
def get_weather(location: str, unit: str):
|
||||||
"type": "function",
|
return f"Getting the weather for {location} in {unit}..."
|
||||||
"function": {
|
tool_functions = {"get_weather": get_weather}
|
||||||
"name": "get_weather",
|
|
||||||
"description": "Get the current weather in a given location",
|
tools = [{
|
||||||
"parameters": {
|
"type": "function",
|
||||||
"type": "object",
|
"function": {
|
||||||
"properties": {
|
"name": "get_weather",
|
||||||
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
|
"description": "Get the current weather in a given location",
|
||||||
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
|
"parameters": {
|
||||||
},
|
"type": "object",
|
||||||
"required": ["location", "unit"]
|
"properties": {
|
||||||
|
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
|
||||||
|
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
|
||||||
|
},
|
||||||
|
"required": ["location", "unit"]
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}]
|
||||||
}]
|
|
||||||
|
|
||||||
response = client.chat.completions.create(
|
response = client.chat.completions.create(
|
||||||
model=client.models.list().data[0].id,
|
model=client.models.list().data[0].id,
|
||||||
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
|
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
|
||||||
tools=tools,
|
tools=tools,
|
||||||
tool_choice="auto"
|
tool_choice="auto"
|
||||||
)
|
)
|
||||||
|
|
||||||
tool_call = response.choices[0].message.tool_calls[0].function
|
tool_call = response.choices[0].message.tool_calls[0].function
|
||||||
print(f"Function called: {tool_call.name}")
|
print(f"Function called: {tool_call.name}")
|
||||||
print(f"Arguments: {tool_call.arguments}")
|
print(f"Arguments: {tool_call.arguments}")
|
||||||
print(f"Result: {get_weather(**json.loads(tool_call.arguments))}")
|
print(f"Result: {get_weather(**json.loads(tool_call.arguments))}")
|
||||||
```
|
```
|
||||||
|
|
||||||
Example output:
|
Example output:
|
||||||
|
|
||||||
@ -301,49 +303,51 @@ A tool parser plugin is a Python file containing one or more ToolParser implemen
|
|||||||
|
|
||||||
Here is a summary of a plugin file:
|
Here is a summary of a plugin file:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
|
|
||||||
# import the required packages
|
```python
|
||||||
|
|
||||||
# define a tool parser and register it to vllm
|
# import the required packages
|
||||||
# the name list in register_module can be used
|
|
||||||
# in --tool-call-parser. you can define as many
|
|
||||||
# tool parsers as you want here.
|
|
||||||
@ToolParserManager.register_module(["example"])
|
|
||||||
class ExampleToolParser(ToolParser):
|
|
||||||
def __init__(self, tokenizer: AnyTokenizer):
|
|
||||||
super().__init__(tokenizer)
|
|
||||||
|
|
||||||
# adjust request. e.g.: set skip special tokens
|
# define a tool parser and register it to vllm
|
||||||
# to False for tool call output.
|
# the name list in register_module can be used
|
||||||
def adjust_request(
|
# in --tool-call-parser. you can define as many
|
||||||
self, request: ChatCompletionRequest) -> ChatCompletionRequest:
|
# tool parsers as you want here.
|
||||||
return request
|
@ToolParserManager.register_module(["example"])
|
||||||
|
class ExampleToolParser(ToolParser):
|
||||||
|
def __init__(self, tokenizer: AnyTokenizer):
|
||||||
|
super().__init__(tokenizer)
|
||||||
|
|
||||||
# implement the tool call parse for stream call
|
# adjust request. e.g.: set skip special tokens
|
||||||
def extract_tool_calls_streaming(
|
# to False for tool call output.
|
||||||
self,
|
def adjust_request(
|
||||||
previous_text: str,
|
self, request: ChatCompletionRequest) -> ChatCompletionRequest:
|
||||||
current_text: str,
|
return request
|
||||||
delta_text: str,
|
|
||||||
previous_token_ids: Sequence[int],
|
|
||||||
current_token_ids: Sequence[int],
|
|
||||||
delta_token_ids: Sequence[int],
|
|
||||||
request: ChatCompletionRequest,
|
|
||||||
) -> Union[DeltaMessage, None]:
|
|
||||||
return delta
|
|
||||||
|
|
||||||
# implement the tool parse for non-stream call
|
# implement the tool call parse for stream call
|
||||||
def extract_tool_calls(
|
def extract_tool_calls_streaming(
|
||||||
self,
|
self,
|
||||||
model_output: str,
|
previous_text: str,
|
||||||
request: ChatCompletionRequest,
|
current_text: str,
|
||||||
) -> ExtractedToolCallInformation:
|
delta_text: str,
|
||||||
return ExtractedToolCallInformation(tools_called=False,
|
previous_token_ids: Sequence[int],
|
||||||
tool_calls=[],
|
current_token_ids: Sequence[int],
|
||||||
content=text)
|
delta_token_ids: Sequence[int],
|
||||||
|
request: ChatCompletionRequest,
|
||||||
|
) -> Union[DeltaMessage, None]:
|
||||||
|
return delta
|
||||||
|
|
||||||
```
|
# implement the tool parse for non-stream call
|
||||||
|
def extract_tool_calls(
|
||||||
|
self,
|
||||||
|
model_output: str,
|
||||||
|
request: ChatCompletionRequest,
|
||||||
|
) -> ExtractedToolCallInformation:
|
||||||
|
return ExtractedToolCallInformation(tools_called=False,
|
||||||
|
tool_calls=[],
|
||||||
|
content=text)
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
Then you can use this plugin in the command line like this.
|
Then you can use this plugin in the command line like this.
|
||||||
|
|
||||||
|
|||||||
@ -76,21 +76,23 @@ Currently, there are no pre-built CPU wheels.
|
|||||||
|
|
||||||
### Build image from source
|
### Build image from source
|
||||||
|
|
||||||
```console
|
??? Commands
|
||||||
$ docker build -f docker/Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai .
|
|
||||||
|
|
||||||
# Launching OpenAI server
|
```console
|
||||||
$ docker run --rm \
|
$ docker build -f docker/Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai .
|
||||||
--privileged=true \
|
|
||||||
--shm-size=4g \
|
# Launching OpenAI server
|
||||||
-p 8000:8000 \
|
$ docker run --rm \
|
||||||
-e VLLM_CPU_KVCACHE_SPACE=<KV cache space> \
|
--privileged=true \
|
||||||
-e VLLM_CPU_OMP_THREADS_BIND=<CPU cores for inference> \
|
--shm-size=4g \
|
||||||
vllm-cpu-env \
|
-p 8000:8000 \
|
||||||
--model=meta-llama/Llama-3.2-1B-Instruct \
|
-e VLLM_CPU_KVCACHE_SPACE=<KV cache space> \
|
||||||
--dtype=bfloat16 \
|
-e VLLM_CPU_OMP_THREADS_BIND=<CPU cores for inference> \
|
||||||
other vLLM OpenAI server arguments
|
vllm-cpu-env \
|
||||||
```
|
--model=meta-llama/Llama-3.2-1B-Instruct \
|
||||||
|
--dtype=bfloat16 \
|
||||||
|
other vLLM OpenAI server arguments
|
||||||
|
```
|
||||||
|
|
||||||
!!! tip
|
!!! tip
|
||||||
For ARM or Apple silicon, use `docker/Dockerfile.arm`
|
For ARM or Apple silicon, use `docker/Dockerfile.arm`
|
||||||
@ -144,32 +146,34 @@ vllm serve facebook/opt-125m
|
|||||||
|
|
||||||
- If using the vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread to each physical CPU core, either by setting `VLLM_CPU_OMP_THREADS_BIND` or by relying on the auto thread binding feature that is used by default. For example, on a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:
|
- If using the vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread to each physical CPU core, either by setting `VLLM_CPU_OMP_THREADS_BIND` or by relying on the auto thread binding feature that is used by default. For example, on a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:
|
||||||
|
|
||||||
```console
|
??? Commands
|
||||||
$ lscpu -e # check the mapping between logical CPU cores and physical CPU cores
|
|
||||||
|
|
||||||
# The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core.
|
```console
|
||||||
CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ
|
$ lscpu -e # check the mapping between logical CPU cores and physical CPU cores
|
||||||
0 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000
|
|
||||||
1 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000
|
|
||||||
2 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000
|
|
||||||
3 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000
|
|
||||||
4 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000
|
|
||||||
5 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000
|
|
||||||
6 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000
|
|
||||||
7 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000
|
|
||||||
8 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000
|
|
||||||
9 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000
|
|
||||||
10 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000
|
|
||||||
11 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000
|
|
||||||
12 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000
|
|
||||||
13 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000
|
|
||||||
14 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000
|
|
||||||
15 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000
|
|
||||||
|
|
||||||
# On this platform, it is recommended to only bind OpenMP threads on logical CPU cores 0-7 or 8-15
|
# The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core.
|
||||||
$ export VLLM_CPU_OMP_THREADS_BIND=0-7
|
CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ
|
||||||
$ python examples/offline_inference/basic/basic.py
|
0 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000
|
||||||
```
|
1 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000
|
||||||
|
2 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000
|
||||||
|
3 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000
|
||||||
|
4 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000
|
||||||
|
5 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000
|
||||||
|
6 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000
|
||||||
|
7 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000
|
||||||
|
8 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000
|
||||||
|
9 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000
|
||||||
|
10 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000
|
||||||
|
11 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000
|
||||||
|
12 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000
|
||||||
|
13 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000
|
||||||
|
14 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000
|
||||||
|
15 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000
|
||||||
|
|
||||||
|
# On this platform, it is recommended to only bind OpenMP threads on logical CPU cores 0-7 or 8-15
|
||||||
|
$ export VLLM_CPU_OMP_THREADS_BIND=0-7
|
||||||
|
$ python examples/offline_inference/basic/basic.py
|
||||||
|
```
|
||||||
|
|
||||||
- If using the vLLM CPU backend on a multi-socket machine with NUMA, be sure to set the CPU cores with `VLLM_CPU_OMP_THREADS_BIND` so that cross-NUMA-node memory access is avoided.
|
- If using the vLLM CPU backend on a multi-socket machine with NUMA, be sure to set the CPU cores with `VLLM_CPU_OMP_THREADS_BIND` so that cross-NUMA-node memory access is avoided.
|
||||||
|
|
||||||
|
|||||||
@ -90,24 +90,26 @@ Currently, there are no pre-built ROCm wheels.
|
|||||||
|
|
||||||
4. Build vLLM. For example, vLLM on ROCM 6.3 can be built with the following steps:
|
4. Build vLLM. For example, vLLM on ROCM 6.3 can be built with the following steps:
|
||||||
|
|
||||||
```bash
|
??? Commands
|
||||||
pip install --upgrade pip
|
|
||||||
|
|
||||||
# Build & install AMD SMI
|
```bash
|
||||||
pip install /opt/rocm/share/amd_smi
|
pip install --upgrade pip
|
||||||
|
|
||||||
# Install dependencies
|
# Build & install AMD SMI
|
||||||
pip install --upgrade numba \
|
pip install /opt/rocm/share/amd_smi
|
||||||
scipy \
|
|
||||||
huggingface-hub[cli,hf_transfer] \
|
|
||||||
setuptools_scm
|
|
||||||
pip install "numpy<2"
|
|
||||||
pip install -r requirements/rocm.txt
|
|
||||||
|
|
||||||
# Build vLLM for MI210/MI250/MI300.
|
# Install dependencies
|
||||||
export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
|
pip install --upgrade numba \
|
||||||
python3 setup.py develop
|
scipy \
|
||||||
```
|
huggingface-hub[cli,hf_transfer] \
|
||||||
|
setuptools_scm
|
||||||
|
pip install "numpy<2"
|
||||||
|
pip install -r requirements/rocm.txt
|
||||||
|
|
||||||
|
# Build vLLM for MI210/MI250/MI300.
|
||||||
|
export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
|
||||||
|
python3 setup.py develop
|
||||||
|
```
|
||||||
|
|
||||||
This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.
|
This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.
|
||||||
|
|
||||||
@ -201,19 +203,21 @@ DOCKER_BUILDKIT=1 docker build \
|
|||||||
|
|
||||||
To run the above docker image `vllm-rocm`, use the below command:
|
To run the above docker image `vllm-rocm`, use the below command:
|
||||||
|
|
||||||
```console
|
??? Command
|
||||||
docker run -it \
|
|
||||||
--network=host \
|
```console
|
||||||
--group-add=video \
|
docker run -it \
|
||||||
--ipc=host \
|
--network=host \
|
||||||
--cap-add=SYS_PTRACE \
|
--group-add=video \
|
||||||
--security-opt seccomp=unconfined \
|
--ipc=host \
|
||||||
--device /dev/kfd \
|
--cap-add=SYS_PTRACE \
|
||||||
--device /dev/dri \
|
--security-opt seccomp=unconfined \
|
||||||
-v <path/to/model>:/app/model \
|
--device /dev/kfd \
|
||||||
vllm-rocm \
|
--device /dev/dri \
|
||||||
bash
|
-v <path/to/model>:/app/model \
|
||||||
```
|
vllm-rocm \
|
||||||
|
bash
|
||||||
|
```
|
||||||
|
|
||||||
Where the `<path/to/model>` is the location where the model is stored, for example, the weights for llama2 or llama3 models.
|
Where the `<path/to/model>` is the location where the model is stored, for example, the weights for llama2 or llama3 models.
|
||||||
|
|
||||||
|
|||||||
@ -200,7 +200,7 @@ INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 1
|
|||||||
|
|
||||||
`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, the interval between `min` and `step` has special handling: `min` is multiplied by consecutive powers of two until `step` is reached. We call this the ramp-up phase, and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes.
|
`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, the interval between `min` and `step` has special handling: `min` is multiplied by consecutive powers of two until `step` is reached. We call this the ramp-up phase, and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes.
|
||||||
|
|
||||||
Example (with ramp-up)
|
Example (with ramp-up):
|
||||||
|
|
||||||
```text
|
```text
|
||||||
min = 2, step = 32, max = 64
|
min = 2, step = 32, max = 64
|
||||||
@ -209,7 +209,7 @@ min = 2, step = 32, max = 64
|
|||||||
=> buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64)
|
=> buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64)
|
||||||
```
|
```
|
||||||
|
|
||||||
Example (without ramp-up)
|
Example (without ramp-up):
|
||||||
|
|
||||||
```text
|
```text
|
||||||
min = 128, step = 128, max = 512
|
min = 128, step = 128, max = 512
|
||||||
@ -232,19 +232,21 @@ As an example, if a request of 3 sequences, with max sequence length of 412 come
|
|||||||
|
|
||||||
Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup:
|
Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup:
|
||||||
|
|
||||||
```text
|
??? Logs
|
||||||
INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB
|
|
||||||
INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB
|
```text
|
||||||
INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB
|
INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB
|
||||||
...
|
INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB
|
||||||
INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
|
INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB
|
||||||
INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB
|
...
|
||||||
INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB
|
INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
|
||||||
INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB
|
INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB
|
||||||
...
|
INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB
|
||||||
INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB
|
INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB
|
||||||
INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
|
...
|
||||||
```
|
INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB
|
||||||
|
INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
|
||||||
|
```
|
||||||
|
|
||||||
This example uses the same buckets as in the [Bucketing Mechanism][gaudi-bucketing-mechanism] section. Each output line corresponds to the execution of a single bucket. When a bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations.
|
This example uses the same buckets as in the [Bucketing Mechanism][gaudi-bucketing-mechanism] section. Each output line corresponds to the execution of a single bucket. When a bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations.
|
||||||
|
|
||||||
@ -279,37 +281,39 @@ When there's large amount of requests pending, vLLM scheduler will attempt to fi
|
|||||||
|
|
||||||
Each described step is logged by vLLM server, as follows (negative values correspond to memory being released):
|
Each described step is logged by vLLM server, as follows (negative values correspond to memory being released):
|
||||||
|
|
||||||
```text
|
??? Logs
|
||||||
INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024]
|
|
||||||
INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)]
|
```text
|
||||||
INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048]
|
INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024]
|
||||||
INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
|
INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)]
|
||||||
INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used)
|
INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048]
|
||||||
INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used)
|
INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
|
||||||
INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used)
|
INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used)
|
||||||
INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used)
|
INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used)
|
||||||
INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache
|
INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used)
|
||||||
INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0
|
INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used)
|
||||||
INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used)
|
INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache
|
||||||
INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB
|
INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0
|
||||||
...
|
INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used)
|
||||||
INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
|
INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB
|
||||||
INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3)
|
...
|
||||||
INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
|
INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
|
||||||
...
|
INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3)
|
||||||
INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB
|
INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
|
||||||
INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB
|
...
|
||||||
...
|
INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB
|
||||||
INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB
|
INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB
|
||||||
INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB
|
...
|
||||||
INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB
|
INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB
|
||||||
INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB
|
INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB
|
||||||
INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB
|
INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB
|
||||||
INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)]
|
INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB
|
||||||
INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
|
INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB
|
||||||
INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory
|
INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)]
|
||||||
INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used)
|
INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
|
||||||
```
|
INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory
|
||||||
|
INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used)
|
||||||
|
```
|
||||||
|
|
||||||
### Recommended vLLM Parameters
|
### Recommended vLLM Parameters
|
||||||
|
|
||||||
|
|||||||
@ -147,20 +147,22 @@ curl http://localhost:8000/v1/completions \
|
|||||||
|
|
||||||
Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` Python package:
|
Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` Python package:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from openai import OpenAI
|
|
||||||
|
|
||||||
# Modify OpenAI's API key and API base to use vLLM's API server.
|
```python
|
||||||
openai_api_key = "EMPTY"
|
from openai import OpenAI
|
||||||
openai_api_base = "http://localhost:8000/v1"
|
|
||||||
client = OpenAI(
|
# Modify OpenAI's API key and API base to use vLLM's API server.
|
||||||
api_key=openai_api_key,
|
openai_api_key = "EMPTY"
|
||||||
base_url=openai_api_base,
|
openai_api_base = "http://localhost:8000/v1"
|
||||||
)
|
client = OpenAI(
|
||||||
completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct",
|
api_key=openai_api_key,
|
||||||
prompt="San Francisco is a")
|
base_url=openai_api_base,
|
||||||
print("Completion result:", completion)
|
)
|
||||||
```
|
completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct",
|
||||||
|
prompt="San Francisco is a")
|
||||||
|
print("Completion result:", completion)
|
||||||
|
```
|
||||||
|
|
||||||
A more detailed client example can be found here: <gh-file:examples/online_serving/openai_completion_client.py>
|
A more detailed client example can be found here: <gh-file:examples/online_serving/openai_completion_client.py>
|
||||||
|
|
||||||
@ -184,26 +186,28 @@ curl http://localhost:8000/v1/chat/completions \
|
|||||||
|
|
||||||
Alternatively, you can use the `openai` Python package:
|
Alternatively, you can use the `openai` Python package:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from openai import OpenAI
|
|
||||||
# Set OpenAI's API key and API base to use vLLM's API server.
|
|
||||||
openai_api_key = "EMPTY"
|
|
||||||
openai_api_base = "http://localhost:8000/v1"
|
|
||||||
|
|
||||||
client = OpenAI(
|
```python
|
||||||
api_key=openai_api_key,
|
from openai import OpenAI
|
||||||
base_url=openai_api_base,
|
# Set OpenAI's API key and API base to use vLLM's API server.
|
||||||
)
|
openai_api_key = "EMPTY"
|
||||||
|
openai_api_base = "http://localhost:8000/v1"
|
||||||
|
|
||||||
chat_response = client.chat.completions.create(
|
client = OpenAI(
|
||||||
model="Qwen/Qwen2.5-1.5B-Instruct",
|
api_key=openai_api_key,
|
||||||
messages=[
|
base_url=openai_api_base,
|
||||||
{"role": "system", "content": "You are a helpful assistant."},
|
)
|
||||||
{"role": "user", "content": "Tell me a joke."},
|
|
||||||
]
|
chat_response = client.chat.completions.create(
|
||||||
)
|
model="Qwen/Qwen2.5-1.5B-Instruct",
|
||||||
print("Chat response:", chat_response)
|
messages=[
|
||||||
```
|
{"role": "system", "content": "You are a helpful assistant."},
|
||||||
|
{"role": "user", "content": "Tell me a joke."},
|
||||||
|
]
|
||||||
|
)
|
||||||
|
print("Chat response:", chat_response)
|
||||||
|
```
|
||||||
|
|
||||||
## On Attention Backends
|
## On Attention Backends
|
||||||
|
|
||||||
|
|||||||
@ -85,35 +85,37 @@ and automatically applies the model's [chat template](https://huggingface.co/doc
|
|||||||
In general, only instruction-tuned models have a chat template.
|
In general, only instruction-tuned models have a chat template.
|
||||||
Base models may perform poorly as they are not trained to respond to the chat conversation.
|
Base models may perform poorly as they are not trained to respond to the chat conversation.
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from vllm import LLM
|
|
||||||
|
|
||||||
llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
|
```python
|
||||||
conversation = [
|
from vllm import LLM
|
||||||
{
|
|
||||||
"role": "system",
|
|
||||||
"content": "You are a helpful assistant"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": "Hello"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"role": "assistant",
|
|
||||||
"content": "Hello! How can I assist you today?"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": "Write an essay about the importance of higher education.",
|
|
||||||
},
|
|
||||||
]
|
|
||||||
outputs = llm.chat(conversation)
|
|
||||||
|
|
||||||
for output in outputs:
|
llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
|
||||||
prompt = output.prompt
|
conversation = [
|
||||||
generated_text = output.outputs[0].text
|
{
|
||||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
"role": "system",
|
||||||
```
|
"content": "You are a helpful assistant"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "Hello"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": "Hello! How can I assist you today?"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "Write an essay about the importance of higher education.",
|
||||||
|
},
|
||||||
|
]
|
||||||
|
outputs = llm.chat(conversation)
|
||||||
|
|
||||||
|
for output in outputs:
|
||||||
|
prompt = output.prompt
|
||||||
|
generated_text = output.outputs[0].text
|
||||||
|
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||||
|
```
|
||||||
|
|
||||||
A code example can be found here: <gh-file:examples/offline_inference/basic/chat.py>
|
A code example can be found here: <gh-file:examples/offline_inference/basic/chat.py>
|
||||||
|
|
||||||
|
|||||||
@ -70,7 +70,10 @@ To make your model compatible with the Transformers backend, it needs:
|
|||||||
2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention.
|
2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention.
|
||||||
3. `MyModel` must contain `_supports_attention_backend = True`.
|
3. `MyModel` must contain `_supports_attention_backend = True`.
|
||||||
|
|
||||||
```python title="modeling_my_model.py"
|
<details>
|
||||||
|
<summary>modeling_my_model.py</summary>
|
||||||
|
|
||||||
|
```python
|
||||||
|
|
||||||
from transformers import PreTrainedModel
|
from transformers import PreTrainedModel
|
||||||
from torch import nn
|
from torch import nn
|
||||||
@ -93,6 +96,8 @@ class MyModel(PreTrainedModel):
|
|||||||
_supports_attention_backend = True
|
_supports_attention_backend = True
|
||||||
```
|
```
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
Here is what happens in the background when this model is loaded:
|
Here is what happens in the background when this model is loaded:
|
||||||
|
|
||||||
1. The config is loaded.
|
1. The config is loaded.
|
||||||
@ -103,7 +108,10 @@ That's it!
|
|||||||
|
|
||||||
For your model to be compatible with vLLM's tensor parallel and/or pipeline parallel features, you must add `base_model_tp_plan` and/or `base_model_pp_plan` to your model's config class:
|
For your model to be compatible with vLLM's tensor parallel and/or pipeline parallel features, you must add `base_model_tp_plan` and/or `base_model_pp_plan` to your model's config class:
|
||||||
|
|
||||||
```python title="configuration_my_model.py"
|
<details>
|
||||||
|
<summary>configuration_my_model.py</summary>
|
||||||
|
|
||||||
|
```python
|
||||||
|
|
||||||
from transformers import PretrainedConfig
|
from transformers import PretrainedConfig
|
||||||
|
|
||||||
@ -123,6 +131,8 @@ class MyConfig(PretrainedConfig):
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
- `base_model_tp_plan` is a `dict` that maps fully qualified layer name patterns to tensor parallel styles (currently only `"colwise"` and `"rowwise"` are supported).
|
- `base_model_tp_plan` is a `dict` that maps fully qualified layer name patterns to tensor parallel styles (currently only `"colwise"` and `"rowwise"` are supported).
|
||||||
- `base_model_pp_plan` is a `dict` that maps direct child layer names to `tuple`s of `list`s of `str`s:
|
- `base_model_pp_plan` is a `dict` that maps direct child layer names to `tuple`s of `list`s of `str`s:
|
||||||
* You only need to do this for layers which are not present on all pipeline stages
|
* You only need to do this for layers which are not present on all pipeline stages
|
||||||
@ -198,6 +208,9 @@ huggingface-cli scan-cache --dir ~/.cache/huggingface/hub
|
|||||||
|
|
||||||
Use the Hugging Face CLI to interactively [delete downloaded model](https://huggingface.co/docs/huggingface_hub/guides/manage-cache#clean-your-cache) from the cache:
|
Use the Hugging Face CLI to interactively [delete downloaded model](https://huggingface.co/docs/huggingface_hub/guides/manage-cache#clean-your-cache) from the cache:
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Commands</summary>
|
||||||
|
|
||||||
```console
|
```console
|
||||||
# The `delete-cache` command requires extra dependencies to work with the TUI.
|
# The `delete-cache` command requires extra dependencies to work with the TUI.
|
||||||
# Please run `pip install huggingface_hub[cli]` to install them.
|
# Please run `pip install huggingface_hub[cli]` to install them.
|
||||||
@ -224,6 +237,8 @@ Start deletion.
|
|||||||
Done. Deleted 1 repo(s) and 0 revision(s) for a total of 438.9M.
|
Done. Deleted 1 repo(s) and 0 revision(s) for a total of 438.9M.
|
||||||
```
|
```
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
#### Using a proxy
|
#### Using a proxy
|
||||||
|
|
||||||
Here are some tips for loading/downloading models from Hugging Face using a proxy:
|
Here are some tips for loading/downloading models from Hugging Face using a proxy:
|
||||||
@ -601,27 +616,29 @@ Specified using `--task generate`.
|
|||||||
|
|
||||||
For the best results, we recommend using the following dependency versions (tested on A10 and L40):
|
For the best results, we recommend using the following dependency versions (tested on A10 and L40):
|
||||||
|
|
||||||
```text
|
??? Dependency versions
|
||||||
# Core vLLM-compatible dependencies with Molmo accuracy setup (tested on L40)
|
|
||||||
torch==2.5.1
|
|
||||||
torchvision==0.20.1
|
|
||||||
transformers==4.48.1
|
|
||||||
tokenizers==0.21.0
|
|
||||||
tiktoken==0.7.0
|
|
||||||
vllm==0.7.0
|
|
||||||
|
|
||||||
# Optional but recommended for improved performance and stability
|
```text
|
||||||
triton==3.1.0
|
# Core vLLM-compatible dependencies with Molmo accuracy setup (tested on L40)
|
||||||
xformers==0.0.28.post3
|
torch==2.5.1
|
||||||
uvloop==0.21.0
|
torchvision==0.20.1
|
||||||
protobuf==5.29.3
|
transformers==4.48.1
|
||||||
openai==1.60.2
|
tokenizers==0.21.0
|
||||||
opencv-python-headless==4.11.0.86
|
tiktoken==0.7.0
|
||||||
pillow==10.4.0
|
vllm==0.7.0
|
||||||
|
|
||||||
# Installed FlashAttention (for float16 only)
|
# Optional but recommended for improved performance and stability
|
||||||
flash-attn>=2.5.6 # Not used in float32, but should be documented
|
triton==3.1.0
|
||||||
```
|
xformers==0.0.28.post3
|
||||||
|
uvloop==0.21.0
|
||||||
|
protobuf==5.29.3
|
||||||
|
openai==1.60.2
|
||||||
|
opencv-python-headless==4.11.0.86
|
||||||
|
pillow==10.4.0
|
||||||
|
|
||||||
|
# Installed FlashAttention (for float16 only)
|
||||||
|
flash-attn>=2.5.6 # Not used in float32, but should be documented
|
||||||
|
```
|
||||||
|
|
||||||
**Note:** Make sure you understand the security implications of using outdated packages.
|
**Note:** Make sure you understand the security implications of using outdated packages.
|
||||||
|
|
||||||
|
|||||||
@ -13,19 +13,21 @@ pip install langchain langchain_community -q
|
|||||||
|
|
||||||
To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`.
|
To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`.
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from langchain_community.llms import VLLM
|
|
||||||
|
|
||||||
llm = VLLM(model="mosaicml/mpt-7b",
|
```python
|
||||||
trust_remote_code=True, # mandatory for hf models
|
from langchain_community.llms import VLLM
|
||||||
max_new_tokens=128,
|
|
||||||
top_k=10,
|
|
||||||
top_p=0.95,
|
|
||||||
temperature=0.8,
|
|
||||||
# tensor_parallel_size=... # for distributed inference
|
|
||||||
)
|
|
||||||
|
|
||||||
print(llm("What is the capital of France ?"))
|
llm = VLLM(model="mosaicml/mpt-7b",
|
||||||
```
|
trust_remote_code=True, # mandatory for hf models
|
||||||
|
max_new_tokens=128,
|
||||||
|
top_k=10,
|
||||||
|
top_p=0.95,
|
||||||
|
temperature=0.8,
|
||||||
|
# tensor_parallel_size=... # for distributed inference
|
||||||
|
)
|
||||||
|
|
||||||
|
print(llm("What is the capital of France ?"))
|
||||||
|
```
|
||||||
|
|
||||||
Please refer to this [Tutorial](https://python.langchain.com/docs/integrations/llms/vllm) for more details.
|
Please refer to this [Tutorial](https://python.langchain.com/docs/integrations/llms/vllm) for more details.
|
||||||
|
|||||||
@ -15,22 +15,24 @@ vllm serve NousResearch/Meta-Llama-3-8B-Instruct \
|
|||||||
|
|
||||||
To call the server, in your preferred text editor, create a script that uses an HTTP client. Include any messages that you want to send to the model. Then run that script. Below is an example script using the [official OpenAI Python client](https://github.com/openai/openai-python).
|
To call the server, in your preferred text editor, create a script that uses an HTTP client. Include any messages that you want to send to the model. Then run that script. Below is an example script using the [official OpenAI Python client](https://github.com/openai/openai-python).
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
from openai import OpenAI
|
|
||||||
client = OpenAI(
|
|
||||||
base_url="http://localhost:8000/v1",
|
|
||||||
api_key="token-abc123",
|
|
||||||
)
|
|
||||||
|
|
||||||
completion = client.chat.completions.create(
|
```python
|
||||||
model="NousResearch/Meta-Llama-3-8B-Instruct",
|
from openai import OpenAI
|
||||||
messages=[
|
client = OpenAI(
|
||||||
{"role": "user", "content": "Hello!"}
|
base_url="http://localhost:8000/v1",
|
||||||
]
|
api_key="token-abc123",
|
||||||
)
|
)
|
||||||
|
|
||||||
print(completion.choices[0].message)
|
completion = client.chat.completions.create(
|
||||||
```
|
model="NousResearch/Meta-Llama-3-8B-Instruct",
|
||||||
|
messages=[
|
||||||
|
{"role": "user", "content": "Hello!"}
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
print(completion.choices[0].message)
|
||||||
|
```
|
||||||
|
|
||||||
!!! tip
|
!!! tip
|
||||||
vLLM supports some parameters that are not supported by OpenAI, `top_k` for example.
|
vLLM supports some parameters that are not supported by OpenAI, `top_k` for example.
|
||||||
@ -147,27 +149,29 @@ with `--enable-request-id-headers`.
|
|||||||
> rather than within the vLLM layer for this reason.
|
> rather than within the vLLM layer for this reason.
|
||||||
> See [this PR](https://github.com/vllm-project/vllm/pull/11529) for more details.
|
> See [this PR](https://github.com/vllm-project/vllm/pull/11529) for more details.
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
completion = client.chat.completions.create(
|
|
||||||
model="NousResearch/Meta-Llama-3-8B-Instruct",
|
|
||||||
messages=[
|
|
||||||
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
|
|
||||||
],
|
|
||||||
extra_headers={
|
|
||||||
"x-request-id": "sentiment-classification-00001",
|
|
||||||
}
|
|
||||||
)
|
|
||||||
print(completion._request_id)
|
|
||||||
|
|
||||||
completion = client.completions.create(
|
```python
|
||||||
model="NousResearch/Meta-Llama-3-8B-Instruct",
|
completion = client.chat.completions.create(
|
||||||
prompt="A robot may not injure a human being",
|
model="NousResearch/Meta-Llama-3-8B-Instruct",
|
||||||
extra_headers={
|
messages=[
|
||||||
"x-request-id": "completion-test",
|
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
|
||||||
}
|
],
|
||||||
)
|
extra_headers={
|
||||||
print(completion._request_id)
|
"x-request-id": "sentiment-classification-00001",
|
||||||
```
|
}
|
||||||
|
)
|
||||||
|
print(completion._request_id)
|
||||||
|
|
||||||
|
completion = client.completions.create(
|
||||||
|
model="NousResearch/Meta-Llama-3-8B-Instruct",
|
||||||
|
prompt="A robot may not injure a human being",
|
||||||
|
extra_headers={
|
||||||
|
"x-request-id": "completion-test",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
print(completion._request_id)
|
||||||
|
```
|
||||||
|
|
||||||
## API Reference
|
## API Reference
|
||||||
|
|
||||||
@ -184,15 +188,19 @@ Code example: <gh-file:examples/online_serving/openai_completion_client.py>
|
|||||||
|
|
||||||
The following [sampling parameters][sampling-params] are supported.
|
The following [sampling parameters][sampling-params] are supported.
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
--8<-- "vllm/entrypoints/openai/protocol.py:completion-sampling-params"
|
|
||||||
```
|
```python
|
||||||
|
--8<-- "vllm/entrypoints/openai/protocol.py:completion-sampling-params"
|
||||||
|
```
|
||||||
|
|
||||||
The following extra parameters are supported:
|
The following extra parameters are supported:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
--8<-- "vllm/entrypoints/openai/protocol.py:completion-extra-params"
|
|
||||||
```
|
```python
|
||||||
|
--8<-- "vllm/entrypoints/openai/protocol.py:completion-extra-params"
|
||||||
|
```
|
||||||
|
|
||||||
[](){ #chat-api }
|
[](){ #chat-api }
|
||||||
|
|
||||||
@ -212,15 +220,19 @@ Code example: <gh-file:examples/online_serving/openai_chat_completion_client.py>
|
|||||||
|
|
||||||
The following [sampling parameters][sampling-params] are supported.
|
The following [sampling parameters][sampling-params] are supported.
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
--8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-sampling-params"
|
|
||||||
```
|
```python
|
||||||
|
--8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-sampling-params"
|
||||||
|
```
|
||||||
|
|
||||||
The following extra parameters are supported:
|
The following extra parameters are supported:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
--8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-extra-params"
|
|
||||||
```
|
```python
|
||||||
|
--8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-extra-params"
|
||||||
|
```
|
||||||
|
|
||||||
[](){ #embeddings-api }
|
[](){ #embeddings-api }
|
||||||
|
|
||||||
@ -259,29 +271,31 @@ and passing a list of `messages` in the request. Refer to the examples below for
|
|||||||
|
|
||||||
Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library:
|
Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
import requests
|
|
||||||
|
|
||||||
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
```python
|
||||||
|
import requests
|
||||||
|
|
||||||
response = requests.post(
|
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
||||||
"http://localhost:8000/v1/embeddings",
|
|
||||||
json={
|
response = requests.post(
|
||||||
"model": "TIGER-Lab/VLM2Vec-Full",
|
"http://localhost:8000/v1/embeddings",
|
||||||
"messages": [{
|
json={
|
||||||
"role": "user",
|
"model": "TIGER-Lab/VLM2Vec-Full",
|
||||||
"content": [
|
"messages": [{
|
||||||
{"type": "image_url", "image_url": {"url": image_url}},
|
"role": "user",
|
||||||
{"type": "text", "text": "Represent the given image."},
|
"content": [
|
||||||
],
|
{"type": "image_url", "image_url": {"url": image_url}},
|
||||||
}],
|
{"type": "text", "text": "Represent the given image."},
|
||||||
"encoding_format": "float",
|
],
|
||||||
},
|
}],
|
||||||
)
|
"encoding_format": "float",
|
||||||
response.raise_for_status()
|
},
|
||||||
response_json = response.json()
|
)
|
||||||
print("Embedding output:", response_json["data"][0]["embedding"])
|
response.raise_for_status()
|
||||||
```
|
response_json = response.json()
|
||||||
|
print("Embedding output:", response_json["data"][0]["embedding"])
|
||||||
|
```
|
||||||
|
|
||||||
=== "DSE-Qwen2-MRL"
|
=== "DSE-Qwen2-MRL"
|
||||||
|
|
||||||
@ -316,15 +330,19 @@ The following [pooling parameters][pooling-params] are supported.
|
|||||||
|
|
||||||
The following extra parameters are supported by default:
|
The following extra parameters are supported by default:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
--8<-- "vllm/entrypoints/openai/protocol.py:embedding-extra-params"
|
|
||||||
```
|
```python
|
||||||
|
--8<-- "vllm/entrypoints/openai/protocol.py:embedding-extra-params"
|
||||||
|
```
|
||||||
|
|
||||||
For chat-like input (i.e. if `messages` is passed), these extra parameters are supported instead:
|
For chat-like input (i.e. if `messages` is passed), these extra parameters are supported instead:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
--8<-- "vllm/entrypoints/openai/protocol.py:chat-embedding-extra-params"
|
|
||||||
```
|
```python
|
||||||
|
--8<-- "vllm/entrypoints/openai/protocol.py:chat-embedding-extra-params"
|
||||||
|
```
|
||||||
|
|
||||||
[](){ #transcriptions-api }
|
[](){ #transcriptions-api }
|
||||||
|
|
||||||
@ -343,15 +361,19 @@ Code example: <gh-file:examples/online_serving/openai_transcription_client.py>
|
|||||||
|
|
||||||
The following [sampling parameters][sampling-params] are supported.
|
The following [sampling parameters][sampling-params] are supported.
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-sampling-params"
|
|
||||||
```
|
```python
|
||||||
|
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-sampling-params"
|
||||||
|
```
|
||||||
|
|
||||||
The following extra parameters are supported:
|
The following extra parameters are supported:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
|
|
||||||
```
|
```python
|
||||||
|
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
|
||||||
|
```
|
||||||
|
|
||||||
[](){ #tokenizer-api }
|
[](){ #tokenizer-api }
|
||||||
|
|
||||||
@ -387,8 +409,6 @@ Code example: <gh-file:examples/online_serving/openai_classification_client.py>
|
|||||||
|
|
||||||
You can classify multiple texts by passing an array of strings:
|
You can classify multiple texts by passing an array of strings:
|
||||||
|
|
||||||
Request:
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl -v "http://127.0.0.1:8000/classify" \
|
curl -v "http://127.0.0.1:8000/classify" \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
@ -401,47 +421,45 @@ curl -v "http://127.0.0.1:8000/classify" \
|
|||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
|
|
||||||
Response:
|
??? Response
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
{
|
|
||||||
"id": "classify-7c87cac407b749a6935d8c7ce2a8fba2",
|
|
||||||
"object": "list",
|
|
||||||
"created": 1745383065,
|
|
||||||
"model": "jason9693/Qwen2.5-1.5B-apeach",
|
|
||||||
"data": [
|
|
||||||
{
|
{
|
||||||
"index": 0,
|
"id": "classify-7c87cac407b749a6935d8c7ce2a8fba2",
|
||||||
"label": "Default",
|
"object": "list",
|
||||||
"probs": [
|
"created": 1745383065,
|
||||||
0.565970778465271,
|
"model": "jason9693/Qwen2.5-1.5B-apeach",
|
||||||
0.4340292513370514
|
"data": [
|
||||||
|
{
|
||||||
|
"index": 0,
|
||||||
|
"label": "Default",
|
||||||
|
"probs": [
|
||||||
|
0.565970778465271,
|
||||||
|
0.4340292513370514
|
||||||
|
],
|
||||||
|
"num_classes": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"index": 1,
|
||||||
|
"label": "Spoiled",
|
||||||
|
"probs": [
|
||||||
|
0.26448777318000793,
|
||||||
|
0.7355121970176697
|
||||||
|
],
|
||||||
|
"num_classes": 2
|
||||||
|
}
|
||||||
],
|
],
|
||||||
"num_classes": 2
|
"usage": {
|
||||||
},
|
"prompt_tokens": 20,
|
||||||
{
|
"total_tokens": 20,
|
||||||
"index": 1,
|
"completion_tokens": 0,
|
||||||
"label": "Spoiled",
|
"prompt_tokens_details": null
|
||||||
"probs": [
|
}
|
||||||
0.26448777318000793,
|
|
||||||
0.7355121970176697
|
|
||||||
],
|
|
||||||
"num_classes": 2
|
|
||||||
}
|
}
|
||||||
],
|
```
|
||||||
"usage": {
|
|
||||||
"prompt_tokens": 20,
|
|
||||||
"total_tokens": 20,
|
|
||||||
"completion_tokens": 0,
|
|
||||||
"prompt_tokens_details": null
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
You can also pass a string directly to the `input` field:
|
You can also pass a string directly to the `input` field:
|
||||||
|
|
||||||
Request:
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl -v "http://127.0.0.1:8000/classify" \
|
curl -v "http://127.0.0.1:8000/classify" \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
@ -451,33 +469,33 @@ curl -v "http://127.0.0.1:8000/classify" \
|
|||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
|
|
||||||
Response:
|
??? Response
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
{
|
|
||||||
"id": "classify-9bf17f2847b046c7b2d5495f4b4f9682",
|
|
||||||
"object": "list",
|
|
||||||
"created": 1745383213,
|
|
||||||
"model": "jason9693/Qwen2.5-1.5B-apeach",
|
|
||||||
"data": [
|
|
||||||
{
|
{
|
||||||
"index": 0,
|
"id": "classify-9bf17f2847b046c7b2d5495f4b4f9682",
|
||||||
"label": "Default",
|
"object": "list",
|
||||||
"probs": [
|
"created": 1745383213,
|
||||||
0.565970778465271,
|
"model": "jason9693/Qwen2.5-1.5B-apeach",
|
||||||
0.4340292513370514
|
"data": [
|
||||||
|
{
|
||||||
|
"index": 0,
|
||||||
|
"label": "Default",
|
||||||
|
"probs": [
|
||||||
|
0.565970778465271,
|
||||||
|
0.4340292513370514
|
||||||
|
],
|
||||||
|
"num_classes": 2
|
||||||
|
}
|
||||||
],
|
],
|
||||||
"num_classes": 2
|
"usage": {
|
||||||
|
"prompt_tokens": 10,
|
||||||
|
"total_tokens": 10,
|
||||||
|
"completion_tokens": 0,
|
||||||
|
"prompt_tokens_details": null
|
||||||
|
}
|
||||||
}
|
}
|
||||||
],
|
```
|
||||||
"usage": {
|
|
||||||
"prompt_tokens": 10,
|
|
||||||
"total_tokens": 10,
|
|
||||||
"completion_tokens": 0,
|
|
||||||
"prompt_tokens_details": null
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Extra parameters
|
#### Extra parameters
|
||||||
|
|
||||||
@ -508,8 +526,6 @@ Code example: <gh-file:examples/online_serving/openai_cross_encoder_score.py>
|
|||||||
|
|
||||||
You can pass a string to both `text_1` and `text_2`, forming a single sentence pair.
|
You can pass a string to both `text_1` and `text_2`, forming a single sentence pair.
|
||||||
|
|
||||||
Request:
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl -X 'POST' \
|
curl -X 'POST' \
|
||||||
'http://127.0.0.1:8000/score' \
|
'http://127.0.0.1:8000/score' \
|
||||||
@ -523,24 +539,24 @@ curl -X 'POST' \
|
|||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
|
|
||||||
Response:
|
??? Response
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
{
|
|
||||||
"id": "score-request-id",
|
|
||||||
"object": "list",
|
|
||||||
"created": 693447,
|
|
||||||
"model": "BAAI/bge-reranker-v2-m3",
|
|
||||||
"data": [
|
|
||||||
{
|
{
|
||||||
"index": 0,
|
"id": "score-request-id",
|
||||||
"object": "score",
|
"object": "list",
|
||||||
"score": 1
|
"created": 693447,
|
||||||
|
"model": "BAAI/bge-reranker-v2-m3",
|
||||||
|
"data": [
|
||||||
|
{
|
||||||
|
"index": 0,
|
||||||
|
"object": "score",
|
||||||
|
"score": 1
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"usage": {}
|
||||||
}
|
}
|
||||||
],
|
```
|
||||||
"usage": {}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Batch inference
|
#### Batch inference
|
||||||
|
|
||||||
@ -548,95 +564,95 @@ You can pass a string to `text_1` and a list to `text_2`, forming multiple sente
|
|||||||
where each pair is built from `text_1` and a string in `text_2`.
|
where each pair is built from `text_1` and a string in `text_2`.
|
||||||
The total number of pairs is `len(text_2)`.
|
The total number of pairs is `len(text_2)`.
|
||||||
|
|
||||||
Request:
|
??? Request
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl -X 'POST' \
|
curl -X 'POST' \
|
||||||
'http://127.0.0.1:8000/score' \
|
'http://127.0.0.1:8000/score' \
|
||||||
-H 'accept: application/json' \
|
-H 'accept: application/json' \
|
||||||
-H 'Content-Type: application/json' \
|
-H 'Content-Type: application/json' \
|
||||||
-d '{
|
-d '{
|
||||||
"model": "BAAI/bge-reranker-v2-m3",
|
"model": "BAAI/bge-reranker-v2-m3",
|
||||||
"text_1": "What is the capital of France?",
|
"text_1": "What is the capital of France?",
|
||||||
"text_2": [
|
"text_2": [
|
||||||
"The capital of Brazil is Brasilia.",
|
"The capital of Brazil is Brasilia.",
|
||||||
"The capital of France is Paris."
|
"The capital of France is Paris."
|
||||||
]
|
]
|
||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
|
|
||||||
Response:
|
??? Response
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
{
|
|
||||||
"id": "score-request-id",
|
|
||||||
"object": "list",
|
|
||||||
"created": 693570,
|
|
||||||
"model": "BAAI/bge-reranker-v2-m3",
|
|
||||||
"data": [
|
|
||||||
{
|
{
|
||||||
"index": 0,
|
"id": "score-request-id",
|
||||||
"object": "score",
|
"object": "list",
|
||||||
"score": 0.001094818115234375
|
"created": 693570,
|
||||||
},
|
"model": "BAAI/bge-reranker-v2-m3",
|
||||||
{
|
"data": [
|
||||||
"index": 1,
|
{
|
||||||
"object": "score",
|
"index": 0,
|
||||||
"score": 1
|
"object": "score",
|
||||||
|
"score": 0.001094818115234375
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"index": 1,
|
||||||
|
"object": "score",
|
||||||
|
"score": 1
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"usage": {}
|
||||||
}
|
}
|
||||||
],
|
```
|
||||||
"usage": {}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
You can pass a list to both `text_1` and `text_2`, forming multiple sentence pairs
|
You can pass a list to both `text_1` and `text_2`, forming multiple sentence pairs
|
||||||
where each pair is built from a string in `text_1` and the corresponding string in `text_2` (similar to `zip()`).
|
where each pair is built from a string in `text_1` and the corresponding string in `text_2` (similar to `zip()`).
|
||||||
The total number of pairs is `len(text_2)`.
|
The total number of pairs is `len(text_2)`.
|
||||||
|
|
||||||
Request:
|
??? Request
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl -X 'POST' \
|
curl -X 'POST' \
|
||||||
'http://127.0.0.1:8000/score' \
|
'http://127.0.0.1:8000/score' \
|
||||||
-H 'accept: application/json' \
|
-H 'accept: application/json' \
|
||||||
-H 'Content-Type: application/json' \
|
-H 'Content-Type: application/json' \
|
||||||
-d '{
|
-d '{
|
||||||
"model": "BAAI/bge-reranker-v2-m3",
|
"model": "BAAI/bge-reranker-v2-m3",
|
||||||
"encoding_format": "float",
|
"encoding_format": "float",
|
||||||
"text_1": [
|
"text_1": [
|
||||||
"What is the capital of Brazil?",
|
"What is the capital of Brazil?",
|
||||||
"What is the capital of France?"
|
"What is the capital of France?"
|
||||||
],
|
],
|
||||||
"text_2": [
|
"text_2": [
|
||||||
"The capital of Brazil is Brasilia.",
|
"The capital of Brazil is Brasilia.",
|
||||||
"The capital of France is Paris."
|
"The capital of France is Paris."
|
||||||
]
|
]
|
||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
|
|
||||||
Response:
|
??? Response
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
{
|
|
||||||
"id": "score-request-id",
|
|
||||||
"object": "list",
|
|
||||||
"created": 693447,
|
|
||||||
"model": "BAAI/bge-reranker-v2-m3",
|
|
||||||
"data": [
|
|
||||||
{
|
{
|
||||||
"index": 0,
|
"id": "score-request-id",
|
||||||
"object": "score",
|
"object": "list",
|
||||||
"score": 1
|
"created": 693447,
|
||||||
},
|
"model": "BAAI/bge-reranker-v2-m3",
|
||||||
{
|
"data": [
|
||||||
"index": 1,
|
{
|
||||||
"object": "score",
|
"index": 0,
|
||||||
"score": 1
|
"object": "score",
|
||||||
|
"score": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"index": 1,
|
||||||
|
"object": "score",
|
||||||
|
"score": 1
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"usage": {}
|
||||||
}
|
}
|
||||||
],
|
```
|
||||||
"usage": {}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Extra parameters
|
#### Extra parameters
|
||||||
|
|
||||||
@ -675,51 +691,51 @@ Code example: <gh-file:examples/online_serving/jinaai_rerank_client.py>
|
|||||||
Note that the `top_n` request parameter is optional and will default to the length of the `documents` field.
|
Note that the `top_n` request parameter is optional and will default to the length of the `documents` field.
|
||||||
Result documents will be sorted by relevance, and the `index` property can be used to determine original order.
|
Result documents will be sorted by relevance, and the `index` property can be used to determine original order.
|
||||||
|
|
||||||
Request:
|
??? Request
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl -X 'POST' \
|
curl -X 'POST' \
|
||||||
'http://127.0.0.1:8000/v1/rerank' \
|
'http://127.0.0.1:8000/v1/rerank' \
|
||||||
-H 'accept: application/json' \
|
-H 'accept: application/json' \
|
||||||
-H 'Content-Type: application/json' \
|
-H 'Content-Type: application/json' \
|
||||||
-d '{
|
-d '{
|
||||||
"model": "BAAI/bge-reranker-base",
|
"model": "BAAI/bge-reranker-base",
|
||||||
"query": "What is the capital of France?",
|
"query": "What is the capital of France?",
|
||||||
"documents": [
|
"documents": [
|
||||||
"The capital of Brazil is Brasilia.",
|
"The capital of Brazil is Brasilia.",
|
||||||
"The capital of France is Paris.",
|
"The capital of France is Paris.",
|
||||||
"Horses and cows are both animals"
|
"Horses and cows are both animals"
|
||||||
]
|
]
|
||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
|
|
||||||
Response:
|
??? Response
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
{
|
|
||||||
"id": "rerank-fae51b2b664d4ed38f5969b612edff77",
|
|
||||||
"model": "BAAI/bge-reranker-base",
|
|
||||||
"usage": {
|
|
||||||
"total_tokens": 56
|
|
||||||
},
|
|
||||||
"results": [
|
|
||||||
{
|
{
|
||||||
"index": 1,
|
"id": "rerank-fae51b2b664d4ed38f5969b612edff77",
|
||||||
"document": {
|
"model": "BAAI/bge-reranker-base",
|
||||||
"text": "The capital of France is Paris."
|
"usage": {
|
||||||
|
"total_tokens": 56
|
||||||
},
|
},
|
||||||
"relevance_score": 0.99853515625
|
"results": [
|
||||||
},
|
{
|
||||||
{
|
"index": 1,
|
||||||
"index": 0,
|
"document": {
|
||||||
"document": {
|
"text": "The capital of France is Paris."
|
||||||
"text": "The capital of Brazil is Brasilia."
|
},
|
||||||
},
|
"relevance_score": 0.99853515625
|
||||||
"relevance_score": 0.0005860328674316406
|
},
|
||||||
|
{
|
||||||
|
"index": 0,
|
||||||
|
"document": {
|
||||||
|
"text": "The capital of Brazil is Brasilia."
|
||||||
|
},
|
||||||
|
"relevance_score": 0.0005860328674316406
|
||||||
|
}
|
||||||
|
]
|
||||||
}
|
}
|
||||||
]
|
```
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Extra parameters
|
#### Extra parameters
|
||||||
|
|
||||||
|
|||||||
@ -12,28 +12,32 @@ vllm serve unsloth/Llama-3.2-1B-Instruct
|
|||||||
|
|
||||||
Then query the endpoint to get the latest metrics from the server:
|
Then query the endpoint to get the latest metrics from the server:
|
||||||
|
|
||||||
```console
|
??? Output
|
||||||
$ curl http://0.0.0.0:8000/metrics
|
|
||||||
|
|
||||||
# HELP vllm:iteration_tokens_total Histogram of number of tokens per engine_step.
|
```console
|
||||||
# TYPE vllm:iteration_tokens_total histogram
|
$ curl http://0.0.0.0:8000/metrics
|
||||||
vllm:iteration_tokens_total_sum{model_name="unsloth/Llama-3.2-1B-Instruct"} 0.0
|
|
||||||
vllm:iteration_tokens_total_bucket{le="1.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
|
# HELP vllm:iteration_tokens_total Histogram of number of tokens per engine_step.
|
||||||
vllm:iteration_tokens_total_bucket{le="8.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
|
# TYPE vllm:iteration_tokens_total histogram
|
||||||
vllm:iteration_tokens_total_bucket{le="16.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
|
vllm:iteration_tokens_total_sum{model_name="unsloth/Llama-3.2-1B-Instruct"} 0.0
|
||||||
vllm:iteration_tokens_total_bucket{le="32.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
|
vllm:iteration_tokens_total_bucket{le="1.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
|
||||||
vllm:iteration_tokens_total_bucket{le="64.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
|
vllm:iteration_tokens_total_bucket{le="8.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
|
||||||
vllm:iteration_tokens_total_bucket{le="128.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
|
vllm:iteration_tokens_total_bucket{le="16.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
|
||||||
vllm:iteration_tokens_total_bucket{le="256.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
|
vllm:iteration_tokens_total_bucket{le="32.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
|
||||||
vllm:iteration_tokens_total_bucket{le="512.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
|
vllm:iteration_tokens_total_bucket{le="64.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
|
||||||
...
|
vllm:iteration_tokens_total_bucket{le="128.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
|
||||||
```
|
vllm:iteration_tokens_total_bucket{le="256.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
|
||||||
|
vllm:iteration_tokens_total_bucket{le="512.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
The following metrics are exposed:
|
The following metrics are exposed:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
--8<-- "vllm/engine/metrics.py:metrics-definitions"
|
|
||||||
```
|
```python
|
||||||
|
--8<-- "vllm/engine/metrics.py:metrics-definitions"
|
||||||
|
```
|
||||||
|
|
||||||
Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
|
Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
|
||||||
but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,
|
but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,
|
||||||
|
|||||||
@ -60,68 +60,70 @@ To identify the particular CUDA operation that causes the error, you can add `--
|
|||||||
|
|
||||||
If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly.
|
If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly.
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
# Test PyTorch NCCL
|
|
||||||
import torch
|
|
||||||
import torch.distributed as dist
|
|
||||||
dist.init_process_group(backend="nccl")
|
|
||||||
local_rank = dist.get_rank() % torch.cuda.device_count()
|
|
||||||
torch.cuda.set_device(local_rank)
|
|
||||||
data = torch.FloatTensor([1,] * 128).to("cuda")
|
|
||||||
dist.all_reduce(data, op=dist.ReduceOp.SUM)
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
value = data.mean().item()
|
|
||||||
world_size = dist.get_world_size()
|
|
||||||
assert value == world_size, f"Expected {world_size}, got {value}"
|
|
||||||
|
|
||||||
print("PyTorch NCCL is successful!")
|
```python
|
||||||
|
# Test PyTorch NCCL
|
||||||
|
import torch
|
||||||
|
import torch.distributed as dist
|
||||||
|
dist.init_process_group(backend="nccl")
|
||||||
|
local_rank = dist.get_rank() % torch.cuda.device_count()
|
||||||
|
torch.cuda.set_device(local_rank)
|
||||||
|
data = torch.FloatTensor([1,] * 128).to("cuda")
|
||||||
|
dist.all_reduce(data, op=dist.ReduceOp.SUM)
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
value = data.mean().item()
|
||||||
|
world_size = dist.get_world_size()
|
||||||
|
assert value == world_size, f"Expected {world_size}, got {value}"
|
||||||
|
|
||||||
# Test PyTorch GLOO
|
print("PyTorch NCCL is successful!")
|
||||||
gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo")
|
|
||||||
cpu_data = torch.FloatTensor([1,] * 128)
|
|
||||||
dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group)
|
|
||||||
value = cpu_data.mean().item()
|
|
||||||
assert value == world_size, f"Expected {world_size}, got {value}"
|
|
||||||
|
|
||||||
print("PyTorch GLOO is successful!")
|
# Test PyTorch GLOO
|
||||||
|
gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo")
|
||||||
|
cpu_data = torch.FloatTensor([1,] * 128)
|
||||||
|
dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group)
|
||||||
|
value = cpu_data.mean().item()
|
||||||
|
assert value == world_size, f"Expected {world_size}, got {value}"
|
||||||
|
|
||||||
if world_size <= 1:
|
print("PyTorch GLOO is successful!")
|
||||||
exit()
|
|
||||||
|
|
||||||
# Test vLLM NCCL, with cuda graph
|
if world_size <= 1:
|
||||||
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
|
exit()
|
||||||
|
|
||||||
pynccl = PyNcclCommunicator(group=gloo_group, device=local_rank)
|
# Test vLLM NCCL, with cuda graph
|
||||||
# pynccl is enabled by default for 0.6.5+,
|
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
|
||||||
# but for 0.6.4 and below, we need to enable it manually.
|
|
||||||
# keep the code for backward compatibility because people
|
pynccl = PyNcclCommunicator(group=gloo_group, device=local_rank)
|
||||||
# prefer to read the latest documentation.
|
# pynccl is enabled by default for 0.6.5+,
|
||||||
pynccl.disabled = False
|
# but for 0.6.4 and below, we need to enable it manually.
|
||||||
|
# keep the code for backward compatibility because people
|
||||||
|
# prefer to read the latest documentation.
|
||||||
|
pynccl.disabled = False
|
||||||
|
|
||||||
|
s = torch.cuda.Stream()
|
||||||
|
with torch.cuda.stream(s):
|
||||||
|
data.fill_(1)
|
||||||
|
out = pynccl.all_reduce(data, stream=s)
|
||||||
|
value = out.mean().item()
|
||||||
|
assert value == world_size, f"Expected {world_size}, got {value}"
|
||||||
|
|
||||||
|
print("vLLM NCCL is successful!")
|
||||||
|
|
||||||
|
g = torch.cuda.CUDAGraph()
|
||||||
|
with torch.cuda.graph(cuda_graph=g, stream=s):
|
||||||
|
out = pynccl.all_reduce(data, stream=torch.cuda.current_stream())
|
||||||
|
|
||||||
s = torch.cuda.Stream()
|
|
||||||
with torch.cuda.stream(s):
|
|
||||||
data.fill_(1)
|
data.fill_(1)
|
||||||
out = pynccl.all_reduce(data, stream=s)
|
g.replay()
|
||||||
|
torch.cuda.current_stream().synchronize()
|
||||||
value = out.mean().item()
|
value = out.mean().item()
|
||||||
assert value == world_size, f"Expected {world_size}, got {value}"
|
assert value == world_size, f"Expected {world_size}, got {value}"
|
||||||
|
|
||||||
print("vLLM NCCL is successful!")
|
print("vLLM NCCL with cuda graph is successful!")
|
||||||
|
|
||||||
g = torch.cuda.CUDAGraph()
|
dist.destroy_process_group(gloo_group)
|
||||||
with torch.cuda.graph(cuda_graph=g, stream=s):
|
dist.destroy_process_group()
|
||||||
out = pynccl.all_reduce(data, stream=torch.cuda.current_stream())
|
```
|
||||||
|
|
||||||
data.fill_(1)
|
|
||||||
g.replay()
|
|
||||||
torch.cuda.current_stream().synchronize()
|
|
||||||
value = out.mean().item()
|
|
||||||
assert value == world_size, f"Expected {world_size}, got {value}"
|
|
||||||
|
|
||||||
print("vLLM NCCL with cuda graph is successful!")
|
|
||||||
|
|
||||||
dist.destroy_process_group(gloo_group)
|
|
||||||
dist.destroy_process_group()
|
|
||||||
```
|
|
||||||
|
|
||||||
If you are testing with a single node, adjust `--nproc-per-node` to the number of GPUs you want to use:
|
If you are testing with a single node, adjust `--nproc-per-node` to the number of GPUs you want to use:
|
||||||
|
|
||||||
@ -165,25 +167,27 @@ WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously
|
|||||||
|
|
||||||
or an error from Python that looks like this:
|
or an error from Python that looks like this:
|
||||||
|
|
||||||
```console
|
??? Logs
|
||||||
RuntimeError:
|
|
||||||
An attempt has been made to start a new process before the
|
|
||||||
current process has finished its bootstrapping phase.
|
|
||||||
|
|
||||||
This probably means that you are not using fork to start your
|
```console
|
||||||
child processes and you have forgotten to use the proper idiom
|
RuntimeError:
|
||||||
in the main module:
|
An attempt has been made to start a new process before the
|
||||||
|
current process has finished its bootstrapping phase.
|
||||||
|
|
||||||
if __name__ == '__main__':
|
This probably means that you are not using fork to start your
|
||||||
freeze_support()
|
child processes and you have forgotten to use the proper idiom
|
||||||
...
|
in the main module:
|
||||||
|
|
||||||
The "freeze_support()" line can be omitted if the program
|
if __name__ == '__main__':
|
||||||
is not going to be frozen to produce an executable.
|
freeze_support()
|
||||||
|
...
|
||||||
|
|
||||||
To fix this issue, refer to the "Safe importing of main module"
|
The "freeze_support()" line can be omitted if the program
|
||||||
section in https://docs.python.org/3/library/multiprocessing.html
|
is not going to be frozen to produce an executable.
|
||||||
```
|
|
||||||
|
To fix this issue, refer to the "Safe importing of main module"
|
||||||
|
section in https://docs.python.org/3/library/multiprocessing.html
|
||||||
|
```
|
||||||
|
|
||||||
then you must update your Python code to guard usage of `vllm` behind an `if
|
then you must update your Python code to guard usage of `vllm` behind an `if
|
||||||
__name__ == '__main__':` block. For example, instead of this:
|
__name__ == '__main__':` block. For example, instead of this:
|
||||||
@ -207,20 +211,22 @@ if __name__ == '__main__':
|
|||||||
|
|
||||||
vLLM heavily depends on `torch.compile` to optimize the model for better performance, which introduces the dependency on the `torch.compile` functionality and the `triton` library. By default, we use `torch.compile` to [optimize some functions](https://github.com/vllm-project/vllm/pull/10406) in the model. Before running vLLM, you can check if `torch.compile` is working as expected by running the following script:
|
vLLM heavily depends on `torch.compile` to optimize the model for better performance, which introduces the dependency on the `torch.compile` functionality and the `triton` library. By default, we use `torch.compile` to [optimize some functions](https://github.com/vllm-project/vllm/pull/10406) in the model. Before running vLLM, you can check if `torch.compile` is working as expected by running the following script:
|
||||||
|
|
||||||
```python
|
??? Code
|
||||||
import torch
|
|
||||||
|
|
||||||
@torch.compile
|
```python
|
||||||
def f(x):
|
import torch
|
||||||
# a simple function to test torch.compile
|
|
||||||
x = x + 1
|
|
||||||
x = x * 2
|
|
||||||
x = x.sin()
|
|
||||||
return x
|
|
||||||
|
|
||||||
x = torch.randn(4, 4).cuda()
|
@torch.compile
|
||||||
print(f(x))
|
def f(x):
|
||||||
```
|
# a simple function to test torch.compile
|
||||||
|
x = x + 1
|
||||||
|
x = x * 2
|
||||||
|
x = x.sin()
|
||||||
|
return x
|
||||||
|
|
||||||
|
x = torch.randn(4, 4).cuda()
|
||||||
|
print(f(x))
|
||||||
|
```
|
||||||
|
|
||||||
If it raises errors from the `torch/_inductor` directory, it usually means you have a custom `triton` library that is not compatible with the version of PyTorch you are using. See [this issue](https://github.com/vllm-project/vllm/issues/12219) for an example.
|
If it raises errors from the `torch/_inductor` directory, it usually means you have a custom `triton` library that is not compatible with the version of PyTorch you are using. See [this issue](https://github.com/vllm-project/vllm/issues/12219) for an example.
|
||||||
|
|
||||||
|
|||||||
@ -10,36 +10,38 @@ The list of data collected by the latest version of vLLM can be found here: <gh-
|
|||||||
|
|
||||||
Here is an example as of v0.4.0:
|
Here is an example as of v0.4.0:
|
||||||
|
|
||||||
```json
|
??? Output
|
||||||
{
|
|
||||||
"uuid": "fbe880e9-084d-4cab-a395-8984c50f1109",
|
```json
|
||||||
"provider": "GCP",
|
{
|
||||||
"num_cpu": 24,
|
"uuid": "fbe880e9-084d-4cab-a395-8984c50f1109",
|
||||||
"cpu_type": "Intel(R) Xeon(R) CPU @ 2.20GHz",
|
"provider": "GCP",
|
||||||
"cpu_family_model_stepping": "6,85,7",
|
"num_cpu": 24,
|
||||||
"total_memory": 101261135872,
|
"cpu_type": "Intel(R) Xeon(R) CPU @ 2.20GHz",
|
||||||
"architecture": "x86_64",
|
"cpu_family_model_stepping": "6,85,7",
|
||||||
"platform": "Linux-5.10.0-28-cloud-amd64-x86_64-with-glibc2.31",
|
"total_memory": 101261135872,
|
||||||
"gpu_count": 2,
|
"architecture": "x86_64",
|
||||||
"gpu_type": "NVIDIA L4",
|
"platform": "Linux-5.10.0-28-cloud-amd64-x86_64-with-glibc2.31",
|
||||||
"gpu_memory_per_device": 23580639232,
|
"gpu_count": 2,
|
||||||
"model_architecture": "OPTForCausalLM",
|
"gpu_type": "NVIDIA L4",
|
||||||
"vllm_version": "0.3.2+cu123",
|
"gpu_memory_per_device": 23580639232,
|
||||||
"context": "LLM_CLASS",
|
"model_architecture": "OPTForCausalLM",
|
||||||
"log_time": 1711663373492490000,
|
"vllm_version": "0.3.2+cu123",
|
||||||
"source": "production",
|
"context": "LLM_CLASS",
|
||||||
"dtype": "torch.float16",
|
"log_time": 1711663373492490000,
|
||||||
"tensor_parallel_size": 1,
|
"source": "production",
|
||||||
"block_size": 16,
|
"dtype": "torch.float16",
|
||||||
"gpu_memory_utilization": 0.9,
|
"tensor_parallel_size": 1,
|
||||||
"quantization": null,
|
"block_size": 16,
|
||||||
"kv_cache_dtype": "auto",
|
"gpu_memory_utilization": 0.9,
|
||||||
"enable_lora": false,
|
"quantization": null,
|
||||||
"enable_prefix_caching": false,
|
"kv_cache_dtype": "auto",
|
||||||
"enforce_eager": false,
|
"enable_lora": false,
|
||||||
"disable_custom_all_reduce": true
|
"enable_prefix_caching": false,
|
||||||
}
|
"enforce_eager": false,
|
||||||
```
|
"disable_custom_all_reduce": true
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
You can preview the collected data by running the following command:
|
You can preview the collected data by running the following command:
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user