Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Woosuk Kwon 2025-09-18 12:13:56 -07:00
commit c1d83f2bae
479 changed files with 19901 additions and 22731 deletions


@ -167,12 +167,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi
#Obsolete currently
##ignore certain Entrypoints/llm tests
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
#fi
# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py


@ -46,24 +46,18 @@ steps:
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/mq_llm_engine
- tests/async_engine
- tests/test_inputs.py
- tests/test_outputs.py
- tests/multimodal
- tests/utils_
- tests/worker
- tests/standalone_tests/lazy_imports.py
- tests/transformers_utils
commands:
- python3 standalone_tests/lazy_imports.py
- pytest -v -s mq_llm_engine # MQLLMEngine
- pytest -v -s async_engine # AsyncLLMEngine
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s multimodal
- pytest -v -s utils_ # Utils
- pytest -v -s worker # Worker
- pytest -v -s transformers_utils # transformers_utils
- label: Python-only Installation Test # 10min
@ -84,25 +78,12 @@ steps:
- vllm/
- tests/basic_correctness/test_basic_correctness
- tests/basic_correctness/test_cpu_offload
- tests/basic_correctness/test_preemption
- tests/basic_correctness/test_cumem.py
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s basic_correctness/test_cumem.py
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Core Test # 22min
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
fast_check: true
source_file_dependencies:
- vllm/core
- vllm/distributed
- tests/core
commands:
- pytest -v -s core
- label: Entrypoints Unit Tests # 5min
timeout_in_minutes: 10
@ -127,8 +108,7 @@ steps:
- tests/entrypoints/offline_mode
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
@ -230,16 +210,14 @@ steps:
num_gpus: 2
source_file_dependencies:
- vllm/
- tests/metrics
- tests/v1/tracing
commands:
- pytest -v -s metrics
- "pip install \
'opentelemetry-sdk>=1.26.0' \
'opentelemetry-api>=1.26.0' \
'opentelemetry-exporter-otlp>=1.26.0' \
'opentelemetry-semantic-conventions-ai>=0.4.1'"
- pytest -v -s tracing
- pytest -v -s v1/tracing
##### fast check tests #####
##### 1 GPU test #####
@ -302,6 +280,7 @@ steps:
# split the test to avoid interference
- pytest -v -s v1/core
- pytest -v -s v1/executor
- pytest -v -s v1/offloading
- pytest -v -s v1/sample
- pytest -v -s v1/logits_processors
- pytest -v -s v1/worker
@ -809,7 +788,7 @@ steps:
# Quantization
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
@ -821,6 +800,20 @@ steps:
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
- label: GPT-OSS Eval (Blackwell)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
gpu: b200
optional: true # disable while debugging
source_file_dependencies:
- tests/evals/gpt_oss
- vllm/model_executor/models/gpt_oss.py
- vllm/model_executor/layers/quantization/mxfp4.py
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 --server-args '--tensor-parallel-size 2'
##### 1 GPU test #####
##### multi gpus test #####
@ -946,7 +939,6 @@ steps:
commands:
- pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py
# - pytest -v -s distributed/test_context_parallel.py # TODO: enable it on Hopper runners or add triton MLA support
- label: LoRA TP Test (Distributed) # 17 min
timeout_in_minutes: 30
@ -1020,9 +1012,21 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
- label: Qwen MoE EP Test # optional
##### H200 test #####
- label: Distributed Tests (H200) # optional
gpu: h200
optional: true
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 /vllm-workspace/examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
##### B200 test #####
- label: Distributed Tests (B200) # optional
gpu: b200
optional: true
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- pytest -v -s tests/distributed/test_context_parallel.py

.github/CODEOWNERS

@ -22,7 +22,7 @@
/vllm/reasoning @aarnphm @chaunceyjiang
/vllm/entrypoints @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/distributed/kv_transfer @NickLucche
/vllm/distributed/kv_transfer @NickLucche @ApostaC
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig changes can have a large user-facing impact,
@ -35,12 +35,12 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/vllm/v1/spec_decode @benchislett @luccafong
/vllm/v1/attention/backends/flashinfer.py @mgoin
/vllm/v1/attention/backends/triton_attn.py @tdoublep
/vllm/v1/core @heheda12345
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
/vllm/v1/kv_cache_interface.py @heheda12345
/vllm/v1/offloading @ApostaC
# Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo
/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
@ -49,16 +49,21 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
/tests/models @DarkLight1337 @ywang96
/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
/tests/prefix_caching @comaniac @KuntaiDu
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
/tests/v1/core @heheda12345
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
/tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/lora @jeejeelee
/tests/models/language/generation/test_hybrid.py @tdoublep
/tests/v1/kv_connector/nixl_integration @NickLucche
/tests/v1/kv_connector/nixl_integration @NickLucche
/tests/v1/kv_connector @ApostaC
/tests/v1/offloading @ApostaC
# Transformers backend
/vllm/model_executor/models/transformers.py @hmellor
/tests/models/test_transformers.py @hmellor
# Docs
/docs @hmellor

.github/mergify.yml

@ -171,7 +171,7 @@ pull_request_rules:
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
- files~=^tests/v1/structured_output/
- files=tests/v1/entrypoints/llm/test_guided_generate.py
- files=tests/v1/entrypoints/llm/test_struct_output_generate.py
- files~=^vllm/v1/structured_output/
actions:
label:
@ -302,3 +302,20 @@ pull_request_rules:
label:
remove:
- needs-rebase
- name: label-kv-connector
description: Automatically apply kv-connector label
conditions:
- or:
- files~=^examples/online_serving/disaggregated[^/]*/.*
- files~=^examples/offline_inference/disaggregated[^/]*/.*
- files~=^examples/others/lmcache/
- files~=^tests/v1/kv_connector/
- files~=^vllm/distributed/kv_transfer/
- title~=(?i)\bP/?D\b
- title~=(?i)NIXL
- title~=(?i)LMCache
actions:
label:
add:
- kv-connector


@ -164,9 +164,7 @@ repos:
name: Validate configuration has default values and that each field has a docstring
entry: python tools/validate_config.py
language: python
types: [python]
pass_filenames: true
files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py
additional_dependencies: [regex]
# Keep `suggestion` last
- id: suggestion
name: Suggestion


@ -175,6 +175,16 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif()
#
# Set CUDA include flags for CXX compiler.
#
if(VLLM_GPU_LANG STREQUAL "CUDA")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include")
if(CUDA_VERSION VERSION_GREATER_EQUAL 13.0)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include/cccl")
endif()
endif()
#
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
@ -298,7 +308,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/cutlass_extensions/common.cpp"
"csrc/attention/mla/cutlass_mla_entry.cu"
"csrc/quantization/fp8/per_token_group_quant.cu")
set_gencode_flags_for_srcs(
@ -585,7 +594,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
set(SRCS
"csrc/attention/mla/cutlass_mla_kernels.cu"
"csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"


@ -1,874 +1,20 @@
# Benchmarking vLLM
# Benchmarks
This README guides you through running benchmark tests with the extensive
datasets supported on vLLM. It's a living document, updated as new features and datasets
become available.
This directory used to contain vLLM's benchmark scripts and utilities for performance testing and evaluation.
## Dataset Overview
## Contents
<table style="width:100%; border-collapse: collapse;">
<thead>
<tr>
<th style="width:15%; text-align: left;">Dataset</th>
<th style="width:10%; text-align: center;">Online</th>
<th style="width:10%; text-align: center;">Offline</th>
<th style="width:65%; text-align: left;">Data Path</th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>ShareGPT</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json</code></td>
</tr>
<tr>
<td><strong>ShareGPT4V (Image)</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td>
<code>wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/blob/main/sharegpt4v_instruct_gpt4-vision_cap100k.json</code>
<br>
<div>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:</div>
<code>wget http://images.cocodataset.org/zips/train2017.zip</code>
</td>
</tr>
<tr>
<td><strong>ShareGPT4Video (Video)</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td>
<code>git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video</code>
</td>
</tr>
<tr>
<td><strong>BurstGPT</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv</code></td>
</tr>
<tr>
<td><strong>Sonnet (deprecated)</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td>Local file: <code>benchmarks/sonnet.txt</code></td>
</tr>
<tr>
<td><strong>Random</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>synthetic</code></td>
</tr>
<tr>
<td><strong>RandomMultiModal (Image/Video)</strong></td>
<td style="text-align: center;">🟡</td>
<td style="text-align: center;">🚧</td>
<td><code>synthetic</code> </td>
</tr>
<tr>
<td><strong>Prefix Repetition</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>synthetic</code></td>
</tr>
<tr>
<td><strong>HuggingFace-VisionArena</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>lmarena-ai/VisionArena-Chat</code></td>
</tr>
<tr>
<td><strong>HuggingFace-InstructCoder</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>likaixin/InstructCoder</code></td>
</tr>
<tr>
<td><strong>HuggingFace-AIMO</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>AI-MO/aimo-validation-aime</code> , <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td>
</tr>
<tr>
<td><strong>HuggingFace-Other</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
</tr>
<tr>
<td><strong>HuggingFace-MTBench</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>philschmid/mt-bench</code></td>
</tr>
<tr>
<td><strong>HuggingFace-Blazedit</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>vdaita/edit_5k_char</code>, <code>vdaita/edit_10k_char</code></td>
</tr>
<tr>
<td><strong>Spec Bench</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl</code></td>
</tr>
<tr>
<td><strong>Custom</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td>Local file: <code>data.jsonl</code></td>
</tr>
</tbody>
</table>
- **Serving benchmarks**: Scripts for testing online inference performance (latency, throughput)
- **Throughput benchmarks**: Scripts for testing offline batch inference performance
- **Specialized benchmarks**: Tools for testing specific features like structured output, prefix caching, long document QA, request prioritization, and multi-modal inference
- **Dataset utilities**: Framework for loading and sampling from various benchmark datasets (ShareGPT, HuggingFace datasets, synthetic data, etc.)
✅: supported
## Usage
🟡: Partial support
For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/contributing/benchmarks.html#benchmark-cli).
🚧: to be supported
For full CLI reference see:
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`.
For local `dataset-path`, please set `hf-name` to its Hugging Face ID like
```bash
--dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat
```
## 🚀 Example - Online Benchmark
<details>
<summary>Show more</summary>
<br/>
First start serving your model
```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B
```
Then run the benchmarking script
```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
vllm bench serve \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \
--dataset-name sharegpt \
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--num-prompts 10
```
If successful, you will see the following output
```text
============ Serving Benchmark Result ============
Successful requests: 10
Benchmark duration (s): 5.78
Total input tokens: 1369
Total generated tokens: 2212
Request throughput (req/s): 1.73
Output token throughput (tok/s): 382.89
Total Token throughput (tok/s): 619.85
---------------Time to First Token----------------
Mean TTFT (ms): 71.54
Median TTFT (ms): 73.88
P99 TTFT (ms): 79.49
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 7.91
Median TPOT (ms): 7.96
P99 TPOT (ms): 8.03
---------------Inter-token Latency----------------
Mean ITL (ms): 7.74
Median ITL (ms): 7.70
P99 ITL (ms): 8.39
==================================================
```
### Custom Dataset
If the dataset you want to benchmark is not yet supported in vLLM, you can still benchmark it using `CustomDataset`. Your data needs to be in `.jsonl` format, with a "prompt" field per entry, e.g., data.jsonl
```json
{"prompt": "What is the capital of India?"}
{"prompt": "What is the capital of Iran?"}
{"prompt": "What is the capital of China?"}
```
```bash
# start server
VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct
```
```bash
# run benchmarking script
vllm bench serve --port 9001 --save-result --save-detailed \
--backend vllm \
--model meta-llama/Llama-3.1-8B-Instruct \
--endpoint /v1/completions \
--dataset-name custom \
--dataset-path <path-to-your-data-jsonl> \
--custom-skip-chat-template \
--num-prompts 80 \
--max-concurrency 1 \
--temperature=0.3 \
--top-p=0.75 \
--result-dir "./log/"
```
You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
### VisionArena Benchmark for Vision Language Models
```bash
# need a model with vision capability here
vllm serve Qwen/Qwen2-VL-7B-Instruct
```
```bash
vllm bench serve \
--backend openai-chat \
--endpoint-type openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path lmarena-ai/VisionArena-Chat \
--hf-split train \
--num-prompts 1000
```
### InstructCoder Benchmark with Speculative Decoding
``` bash
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
--speculative-config $'{"method": "ngram",
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
"prompt_lookup_min": 2}'
```
``` bash
vllm bench serve \
--model meta-llama/Meta-Llama-3-8B-Instruct \
--dataset-name hf \
--dataset-path likaixin/InstructCoder \
--num-prompts 2048
```
### Spec Bench Benchmark with Speculative Decoding
``` bash
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
--speculative-config $'{"method": "ngram",
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
"prompt_lookup_min": 2}'
```
[SpecBench dataset](https://github.com/hemingkx/Spec-Bench)
Run all categories:
``` bash
# Download the dataset using:
# wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl
vllm bench serve \
--model meta-llama/Meta-Llama-3-8B-Instruct \
--dataset-name spec_bench \
--dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
--num-prompts -1
```
Available categories include `[writing, roleplay, reasoning, math, coding, extraction, stem, humanities, translation, summarization, qa, math_reasoning, rag]`.
Run only a specific category like "summarization":
``` bash
vllm bench serve \
--model meta-llama/Meta-Llama-3-8B-Instruct \
--dataset-name spec_bench \
--dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
--num-prompts -1 \
--spec-bench-category "summarization"
```
### Other HuggingFaceDataset Examples
```bash
vllm serve Qwen/Qwen2-VL-7B-Instruct
```
`lmms-lab/LLaVA-OneVision-Data`:
```bash
vllm bench serve \
--backend openai-chat \
--endpoint-type openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path lmms-lab/LLaVA-OneVision-Data \
--hf-split train \
--hf-subset "chart2text(cauldron)" \
--num-prompts 10
```
`Aeala/ShareGPT_Vicuna_unfiltered`:
```bash
vllm bench serve \
--backend openai-chat \
--endpoint-type openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
--hf-split train \
--num-prompts 10
```
`AI-MO/aimo-validation-aime`:
``` bash
vllm bench serve \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path AI-MO/aimo-validation-aime \
--num-prompts 10 \
--seed 42
```
`philschmid/mt-bench`:
``` bash
vllm bench serve \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path philschmid/mt-bench \
--num-prompts 80
```
`vdaita/edit_5k_char` or `vdaita/edit_10k_char`:
``` bash
vllm bench serve \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path vdaita/edit_5k_char \
--num-prompts 90 \
--blazedit-min-distance 0.01 \
--blazedit-max-distance 0.99
```
### Running With Sampling Parameters
When using OpenAI-compatible backends such as `vllm`, optional sampling
parameters can be specified. Example client command:
```bash
vllm bench serve \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \
--dataset-name sharegpt \
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--top-k 10 \
--top-p 0.9 \
--temperature 0.5 \
--num-prompts 10
```
### Running With Ramp-Up Request Rate
The benchmark tool also supports ramping up the request rate over the
duration of the benchmark run. This can be useful for stress testing the
server or finding the maximum throughput that it can handle, given some latency budget.
Two ramp-up strategies are supported:
- `linear`: Increases the request rate linearly from a start value to an end value.
- `exponential`: Increases the request rate exponentially.
The following arguments can be used to control the ramp-up:
- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
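As an illustration, these flags can be combined with the ShareGPT serving setup shown earlier; the prompt count and request rates below are arbitrary example values:
```bash
vllm bench serve \
  --backend vllm \
  --model NousResearch/Hermes-3-Llama-3.1-8B \
  --endpoint /v1/completions \
  --dataset-name sharegpt \
  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
  --num-prompts 200 \
  --ramp-up-strategy linear \
  --ramp-up-start-rps 1 \
  --ramp-up-end-rps 10
```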
</details>
## 📈 Example - Offline Throughput Benchmark
<details>
<summary>Show more</summary>
<br/>
```bash
vllm bench throughput \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset-name sonnet \
--dataset-path vllm/benchmarks/sonnet.txt \
--num-prompts 10
```
If successful, you will see the following output
```text
Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
Total num prompt tokens: 5014
Total num output tokens: 1500
```
### VisionArena Benchmark for Vision Language Models
```bash
vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path lmarena-ai/VisionArena-Chat \
--num-prompts 1000 \
--hf-split train
```
The `num prompt tokens` now includes image token counts
```text
Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
Total num prompt tokens: 14527
Total num output tokens: 1280
```
### InstructCoder Benchmark with Speculative Decoding
``` bash
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_USE_V1=1 \
vllm bench throughput \
--dataset-name=hf \
--dataset-path=likaixin/InstructCoder \
--model=meta-llama/Meta-Llama-3-8B-Instruct \
--input-len=1000 \
--output-len=100 \
--num-prompts=2048 \
--async-engine \
--speculative-config $'{"method": "ngram",
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
"prompt_lookup_min": 2}'
```
```text
Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
Total num prompt tokens: 261136
Total num output tokens: 204800
```
### Other HuggingFaceDataset Examples
`lmms-lab/LLaVA-OneVision-Data`:
```bash
vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path lmms-lab/LLaVA-OneVision-Data \
--hf-split train \
--hf-subset "chart2text(cauldron)" \
--num-prompts 10
```
`Aeala/ShareGPT_Vicuna_unfiltered`:
```bash
vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
--hf-split train \
--num-prompts 10
```
`AI-MO/aimo-validation-aime`:
```bash
vllm bench throughput \
--model Qwen/QwQ-32B \
--backend vllm \
--dataset-name hf \
--dataset-path AI-MO/aimo-validation-aime \
--hf-split train \
--num-prompts 10
```
Benchmark with LoRA adapters:
``` bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
vllm bench throughput \
--model meta-llama/Llama-2-7b-hf \
--backend vllm \
--dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--dataset_name sharegpt \
--num-prompts 10 \
--max-loras 2 \
--max-lora-rank 8 \
--enable-lora \
--lora-path yard1/llama-2-7b-sql-lora-test
```
</details>
## 🛠️ Example - Structured Output Benchmark
<details>
<summary>Show more</summary>
<br/>
Benchmark the performance of structured output generation (JSON, grammar, regex).
### Server Setup
```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B
```
### JSON Schema Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset json \
--structured-output-ratio 1.0 \
--request-rate 10 \
--num-prompts 1000
```
### Grammar-based Generation Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset grammar \
--structure-type grammar \
--request-rate 10 \
--num-prompts 1000
```
### Regex-based Generation Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset regex \
--request-rate 10 \
--num-prompts 1000
```
### Choice-based Generation Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset choice \
--request-rate 10 \
--num-prompts 1000
```
### XGrammar Benchmark Dataset
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset xgrammar_bench \
--request-rate 10 \
--num-prompts 1000
```
</details>
## 📚 Example - Long Document QA Benchmark
<details>
<summary>Show more</summary>
<br/>
Benchmark the performance of long document question-answering with prefix caching.
### Basic Long Document QA Test
```bash
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 16 \
--document-length 2000 \
--output-len 50 \
--repeat-count 5
```
### Different Repeat Modes
```bash
# Random mode (default) - shuffle prompts randomly
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--document-length 3000 \
--repeat-count 3 \
--repeat-mode random
# Tile mode - repeat entire prompt list in sequence
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--document-length 3000 \
--repeat-count 3 \
--repeat-mode tile
# Interleave mode - repeat each prompt consecutively
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--document-length 3000 \
--repeat-count 3 \
--repeat-mode interleave
```
</details>
## 🗂️ Example - Prefix Caching Benchmark
<details>
<summary>Show more</summary>
<br/>
Benchmark the efficiency of automatic prefix caching.
### Fixed Prompt with Prefix Caching
```bash
python3 benchmarks/benchmark_prefix_caching.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-prompts 1 \
--repeat-count 100 \
--input-length-range 128:256
```
### ShareGPT Dataset with Prefix Caching
```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 benchmarks/benchmark_prefix_caching.py \
--model meta-llama/Llama-2-7b-chat-hf \
--dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
--enable-prefix-caching \
--num-prompts 20 \
--repeat-count 5 \
--input-length-range 128:256
```
### Prefix Repetition Dataset
```bash
vllm bench serve \
--backend openai \
--model meta-llama/Llama-2-7b-chat-hf \
--dataset-name prefix_repetition \
--num-prompts 100 \
--prefix-repetition-prefix-len 512 \
--prefix-repetition-suffix-len 128 \
--prefix-repetition-num-prefixes 5 \
--prefix-repetition-output-len 128
```
</details>
## ⚡ Example - Request Prioritization Benchmark
<details>
<summary>Show more</summary>
<br/>
Benchmark the performance of request prioritization in vLLM.
### Basic Prioritization Test
```bash
python3 benchmarks/benchmark_prioritization.py \
--model meta-llama/Llama-2-7b-chat-hf \
--input-len 128 \
--output-len 64 \
--num-prompts 100 \
--scheduling-policy priority
```
### Multiple Sequences per Prompt
```bash
python3 benchmarks/benchmark_prioritization.py \
--model meta-llama/Llama-2-7b-chat-hf \
--input-len 128 \
--output-len 64 \
--num-prompts 100 \
--scheduling-policy priority \
--n 2
```
</details>
## 👁️ Example - Multi-Modal Benchmark
<details>
<summary>Show more</summary>
<br/>
Benchmark the performance of multi-modal requests in vLLM.
### Images (ShareGPT4V)
Start vLLM:
```bash
python -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dtype bfloat16 \
--limit-mm-per-prompt '{"image": 1}' \
--allowed-local-media-path /path/to/sharegpt4v/images
```
Send requests with images:
```bash
vllm bench serve \
--backend openai-chat \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dataset-name sharegpt \
--dataset-path /path/to/ShareGPT4V/sharegpt4v_instruct_gpt4-vision_cap100k.json \
--num-prompts 100 \
--save-result \
--result-dir ~/vllm_benchmark_results \
--save-detailed \
--endpoint /v1/chat/completions
```
### Videos (ShareGPT4Video)
Start vLLM:
```bash
python -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dtype bfloat16 \
--limit-mm-per-prompt '{"video": 1}' \
--allowed-local-media-path /path/to/sharegpt4video/videos
```
Send requests with videos:
```bash
vllm bench serve \
--backend openai-chat \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dataset-name sharegpt \
--dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \
--num-prompts 100 \
--save-result \
--result-dir ~/vllm_benchmark_results \
--save-detailed \
--endpoint /v1/chat/completions
```
### Synthetic Random Images (random-mm)
Generate synthetic image inputs alongside random text prompts to stress-test vision models without external datasets.
Notes:
- Works only with online benchmark via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
- Video sampling is not yet implemented.
Start the server (example):
```bash
vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
--dtype bfloat16 \
--max-model-len 16384 \
--limit-mm-per-prompt '{"image": 3, "video": 0}' \
--mm-processor-kwargs max_pixels=1003520
```
Run the benchmark. It is recommended to use the flag `--ignore-eos` to simulate real responses. You can set the size of the output via the arg `--random-output-len`.
Ex.1: Fixed number of items and a single image resolution, enforcing generation of approx 40 tokens:
```bash
vllm bench serve \
--backend openai-chat \
--model Qwen/Qwen2.5-VL-3B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name random-mm \
--num-prompts 100 \
--max-concurrency 10 \
--random-prefix-len 25 \
--random-input-len 300 \
--random-output-len 40 \
--random-range-ratio 0.2 \
--random-mm-base-items-per-request 2 \
--random-mm-limit-mm-per-prompt '{"image": 3, "video": 0}' \
--random-mm-bucket-config '{(224, 224, 1): 1.0}' \
--request-rate inf \
--ignore-eos \
--seed 42
```
The number of items per request can be controlled by passing multiple image buckets:
```bash
--random-mm-base-items-per-request 2 \
--random-mm-num-mm-items-range-ratio 0.5 \
--random-mm-limit-mm-per-prompt '{"image": 4, "video": 0}' \
--random-mm-bucket-config '{(256, 256, 1): 0.7, (720, 1280, 1): 0.3}' \
```
Flags specific to `random-mm`:
- `--random-mm-base-items-per-request`: base number of multimodal items per request.
- `--random-mm-num-mm-items-range-ratio`: vary item count uniformly in the closed integer range [floor(n·(1−r)), ceil(n·(1+r))]. Set r=0 to keep it fixed; r=1 allows 0 items.
- `--random-mm-limit-mm-per-prompt`: per-modality hard caps, e.g. '{"image": 3, "video": 0}'.
- `--random-mm-bucket-config`: dict mapping (H, W, T) → probability. Entries with probability 0 are removed; remaining probabilities are renormalized to sum to 1. Use T=1 for images. Set any T>1 for videos (video sampling not yet supported).
Behavioral notes:
- If the requested base item count cannot be satisfied under the provided per-prompt limits, the tool raises an error rather than silently clamping.
How sampling works:
- Determine per-request item count k by sampling uniformly from the integer range defined by `--random-mm-base-items-per-request` and `--random-mm-num-mm-items-range-ratio`, then clamp k to at most the sum of per-modality limits.
- For each of the k items, sample a bucket (H, W, T) according to the normalized probabilities in `--random-mm-bucket-config`, while tracking how many items of each modality have been added.
- If a modality (e.g., image) reaches its limit from `--random-mm-limit-mm-per-prompt`, all buckets of that modality are excluded and the remaining bucket probabilities are renormalized before continuing.
This should be seen as an edge case, and it can be avoided by setting `--random-mm-limit-mm-per-prompt` to a large number. Note that this might result in errors due to the engine config `--limit-mm-per-prompt`.
- The resulting request contains synthetic image data in `multi_modal_data` (OpenAI Chat format). When `random-mm` is used with the OpenAI Chat backend, prompts remain text and MM content is attached via `multi_modal_data`.
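As a rough illustration of the sampling procedure described above, here is a simplified standalone sketch (this is not the benchmark's actual code; the function name `sample_mm_items` and its exact signature are made up for this example):
```python
import math
import random


def sample_mm_items(base_items, range_ratio, limits, bucket_config, rng=None):
    """Sketch: pick multimodal item buckets for one synthetic request.

    base_items    ~ --random-mm-base-items-per-request
    range_ratio   ~ --random-mm-num-mm-items-range-ratio (r)
    limits        ~ --random-mm-limit-mm-per-prompt, e.g. {"image": 3, "video": 0}
    bucket_config ~ {(H, W, T): probability}; T == 1 is an image, T > 1 a video
    """
    rng = rng or random.Random(0)

    def modality(hwt):
        return "image" if hwt[2] == 1 else "video"

    # k is uniform in [floor(n*(1-r)), ceil(n*(1+r))], clamped to the sum of limits.
    lo = math.floor(base_items * (1 - range_ratio))
    hi = math.ceil(base_items * (1 + range_ratio))
    k = min(rng.randint(lo, hi), sum(limits.values()))

    # Drop zero-probability buckets; probabilities are renormalized while sampling.
    buckets = {b: p for b, p in bucket_config.items() if p > 0}
    counts = {"image": 0, "video": 0}
    items = []
    for _ in range(k):
        # Exclude buckets whose modality already hit its per-prompt limit.
        avail = {b: p for b, p in buckets.items()
                 if counts[modality(b)] < limits.get(modality(b), 0)}
        if not avail:
            break
        total = sum(avail.values())
        bucket = rng.choices(list(avail), weights=[p / total for p in avail.values()])[0]
        counts[modality(bucket)] += 1
        items.append(bucket)
    return items


# e.g. base of 2 items, r=0.5, up to 3 images per prompt, two image buckets
print(sample_mm_items(2, 0.5, {"image": 3, "video": 0},
                      {(256, 256, 1): 0.7, (720, 1280, 1): 0.3}))
```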
</details>
- <https://docs.vllm.ai/en/latest/cli/bench/latency.html>
- <https://docs.vllm.ai/en/latest/cli/bench/serve.html>
- <https://docs.vllm.ai/en/latest/cli/bench/throughput.html>


@ -149,3 +149,70 @@ The script follows a systematic process to find the optimal parameters:
4. **Track Best Result**: Throughout the process, the script tracks the parameter combination that has yielded the highest valid throughput so far.
5. **Profile Collection**: For the best-performing run, the script saves the vLLM profiler output, which can be used for deep-dive performance analysis with tools like TensorBoard.
## Batched `auto_tune`
The `batch_auto_tune.sh` script allows you to run multiple `auto_tune.sh` experiments sequentially from a single configuration file. It iterates through a list of parameter sets, executes `auto_tune.sh` for each, and records the results back into the input file.
### Prerequisites
- **jq**: This script requires `jq` to parse the JSON configuration file.
- **gcloud**: If you plan to upload results to Google Cloud Storage, the `gcloud` CLI must be installed and authenticated.
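As an example, on a Debian/Ubuntu machine the prerequisites could be installed roughly as follows (the package manager commands are assumptions about your environment, not requirements of the script):
```bash
# jq is needed to parse the JSON configuration file
sudo apt-get update && sudo apt-get install -y jq

# gcloud is only needed when a GCS upload path is passed; install the
# Google Cloud CLI (https://cloud.google.com/sdk/docs/install) and authenticate:
gcloud auth login
```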
### How to Run
1. **Create a JSON configuration file**: Create a file (e.g., `runs_config.json`) containing an array of JSON objects. Each object defines the parameters for a single `auto_tune.sh` run.
2. **Execute the script**:
```bash
bash batch_auto_tune.sh <path_to_json_file> [gcs_upload_path]
```
- `<path_to_json_file>`: **Required.** Path to your JSON configuration file.
- `[gcs_upload_path]`: **Optional.** A GCS path (e.g., `gs://my-bucket/benchmark-results`) where the detailed results and profiles for each run will be uploaded. If this is empty, the results will be available on the local filesystem (see the log for `RESULT_FILE=/path/to/results/file.txt`).
### Configuration File
The JSON configuration file should contain an array of objects. Each object's keys correspond to the configuration variables for `auto_tune.sh` (see the [Configuration table above](#configuration)). These keys will be converted to uppercase environment variables for each run.
Here is an example `runs_config.json` with two benchmark configurations (set `system` to `TPU` or `GPU` as appropriate):
```json
[
{
"base": "/home/user",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"system": "TPU", # OR GPU
"tp": 8,
"input_len": 128,
"output_len": 2048,
"max_model_len": 2300,
"num_seqs_list": "128 256",
"num_batched_tokens_list": "8192 16384"
},
{
"base": "/home/user",
"model": "meta-llama/Llama-3.1-70B-Instruct",
"system": "TPU", # OR GPU
"tp": 8,
"input_len": 4000,
"output_len": 16,
"max_model_len": 4096,
"num_seqs_list": "64 128",
"num_batched_tokens_list": "4096 8192",
"max_latency_allowed_ms": 500
}
]
```
### Output
The script modifies the input JSON file in place, adding the results of each run to the corresponding object. The following fields are added:
- `run_id`: A unique identifier for the run, derived from the timestamp.
- `status`: The outcome of the run (`SUCCESS`, `FAILURE`, or `WARNING_NO_RESULT_FILE`).
- `results`: The content of the `result.txt` file from the `auto_tune.sh` run.
- `gcs_results`: The GCS URL where the run's artifacts are stored (if a GCS path was provided).
A summary of successful and failed runs is also printed to the console upon completion.
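For illustration, a configuration entry might look roughly like this after a successful run (all values below are placeholders, not real output):
```json
{
  "base": "/home/user",
  "model": "meta-llama/Llama-3.1-8B-Instruct",
  "system": "TPU",
  "tp": 8,
  "input_len": 128,
  "output_len": 2048,
  "max_model_len": 2300,
  "num_seqs_list": "128 256",
  "num_batched_tokens_list": "8192 16384",
  "run_id": "20250918_120000",
  "status": "SUCCESS",
  "results": "<contents of result.txt for this run>",
  "gcs_results": "gs://my-bucket/benchmark-results/20250918_120000"
}
```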


@ -0,0 +1,128 @@
#!/bin/bash
INPUT_JSON="$1"
GCS_PATH="$2" # Optional GCS path for uploading results for each run
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
AUTOTUNE_SCRIPT="$SCRIPT_DIR/auto_tune.sh"
if [[ -z "$INPUT_JSON" ]]; then
echo "Error: Input JSON file not provided."
echo "Usage: $0 <path_to_json_file> [gcs_upload_path]"
exit 1
fi
if [[ ! -f "$INPUT_JSON" ]]; then
echo "Error: File not found at '$INPUT_JSON'"
exit 1
fi
if ! command -v jq &> /dev/null; then
echo "Error: 'jq' command not found. Please install jq to process the JSON input."
exit 1
fi
if [[ -n "$GCS_PATH" ]] && ! command -v gcloud &> /dev/null; then
echo "Error: 'gcloud' command not found, but a GCS_PATH was provided."
exit 1
fi
SUCCESS_COUNT=0
FAILURE_COUNT=0
FAILED_RUNS=()
SCRIPT_START_TIME=$(date +%s)
json_content=$(cat "$INPUT_JSON")
if ! num_runs=$(echo "$json_content" | jq 'length'); then
echo "Error: Invalid JSON in $INPUT_JSON. 'jq' failed to get array length." >&2
exit 1
fi
echo "Found $num_runs benchmark configurations in $INPUT_JSON."
echo "Starting benchmark runs..."
echo "--------------------------------------------------"
for i in $(seq 0 $(($num_runs - 1))); do
run_object=$(echo "$json_content" | jq ".[$i]")
RUN_START_TIME=$(date +%s)
ENV_VARS_ARRAY=()
# Dynamically create env vars from the JSON object's keys
for key in $(echo "$run_object" | jq -r 'keys_unsorted[]'); do
value=$(echo "$run_object" | jq -r ".$key")
var_name=$(echo "$key" | tr '[:lower:]' '[:upper:]' | tr -cd 'A-Z0-9_')
ENV_VARS_ARRAY+=("${var_name}=${value}")
done
echo "Executing run #$((i+1))/$num_runs with parameters: ${ENV_VARS_ARRAY[*]}"
# Execute auto_tune.sh and capture output
RUN_OUTPUT_FILE=$(mktemp)
if env "${ENV_VARS_ARRAY[@]}" bash "$AUTOTUNE_SCRIPT" > >(tee -a "$RUN_OUTPUT_FILE") 2>&1; then
STATUS="SUCCESS"
((SUCCESS_COUNT++))
else
STATUS="FAILURE"
((FAILURE_COUNT++))
FAILED_RUNS+=("Run #$((i+1)): $(echo $run_object | jq -c .)")
fi
RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE")
rm "$RUN_OUTPUT_FILE"
# Parse results and optionally upload them to GCS
RUN_ID=""
RESULTS=""
GCS_RESULTS_URL=""
if [[ "$STATUS" == "SUCCESS" ]]; then
RESULT_FILE_PATH=$(echo "$RUN_OUTPUT" | grep 'RESULT_FILE=' | tail -n 1 | cut -d'=' -f2 | tr -s '/' || true)
if [[ -n "$RESULT_FILE_PATH" && -f "$RESULT_FILE_PATH" ]]; then
RUN_ID=$(basename "$(dirname "$RESULT_FILE_PATH")")
RESULT_DIR=$(dirname "$RESULT_FILE_PATH")
RESULTS=$(cat "$RESULT_FILE_PATH")
if [[ -n "$GCS_PATH" ]]; then
GCS_RESULTS_URL="${GCS_PATH}/${RUN_ID}"
echo "Uploading results to GCS..."
if gcloud storage rsync --recursive "$RESULT_DIR/" "$GCS_RESULTS_URL"; then
echo "GCS upload successful."
else
echo "Warning: GCS upload failed for RUN_ID $RUN_ID."
fi
fi
else
echo "Warning: Could not find result file for a successful run."
STATUS="WARNING_NO_RESULT_FILE"
fi
fi
# Add the results back into the JSON object for this run
json_content=$(echo "$json_content" | jq --argjson i "$i" --arg run_id "$RUN_ID" --arg status "$STATUS" --arg results "$RESULTS" --arg gcs_results "$GCS_RESULTS_URL" \
'.[$i] += {run_id: $run_id, status: $status, results: $results, gcs_results: $gcs_results}')
RUN_END_TIME=$(date +%s)
echo "Run finished in $((RUN_END_TIME - RUN_START_TIME)) seconds. Status: $STATUS"
echo "--------------------------------------------------"
# Save intermediate progress back to the file
echo "$json_content" > "$INPUT_JSON.tmp" && mv "$INPUT_JSON.tmp" "$INPUT_JSON"
done
SCRIPT_END_TIME=$(date +%s)
echo "All benchmark runs completed in $((SCRIPT_END_TIME - SCRIPT_START_TIME)) seconds."
echo
echo "====================== SUMMARY ======================"
echo "Successful runs: $SUCCESS_COUNT"
echo "Failed runs: $FAILURE_COUNT"
echo "==================================================="
if [[ $FAILURE_COUNT -gt 0 ]]; then
echo "Details of failed runs (see JSON file for full parameters):"
for failed in "${FAILED_RUNS[@]}"; do
echo " - $failed"
done
fi
echo "Updated results have been saved to '$INPUT_JSON'."

File diff suppressed because it is too large


@ -696,11 +696,11 @@ def evaluate(ret, args):
return re.match(args.regex, actual) is not None
def _eval_correctness(expected, actual):
if args.structure_type == "guided_json":
if args.structure_type == "json":
return _eval_correctness_json(expected, actual)
elif args.structure_type == "guided_regex":
elif args.structure_type == "regex":
return _eval_correctness_regex(expected, actual)
elif args.structure_type == "guided_choice":
elif args.structure_type == "choice":
return _eval_correctness_choice(expected, actual)
else:
return None
@ -780,18 +780,18 @@ def main(args: argparse.Namespace):
)
if args.dataset == "grammar":
args.structure_type = "guided_grammar"
args.structure_type = "grammar"
elif args.dataset == "regex":
args.structure_type = "guided_regex"
args.structure_type = "regex"
elif args.dataset == "choice":
args.structure_type = "guided_choice"
args.structure_type = "choice"
else:
args.structure_type = "guided_json"
args.structure_type = "json"
if args.no_structured_output:
args.structured_output_ratio = 0
if args.save_results:
result_file_name = f"{args.structured_output_ratio}guided"
result_file_name = f"{args.structured_output_ratio}so"
result_file_name += f"_{backend}"
result_file_name += f"_{args.request_rate}qps"
result_file_name += f"_{args.model.split('/')[-1]}"


@ -2,14 +2,25 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
from typing import Callable
from unittest.mock import patch
import pandas as pd
import torch
from vllm import _custom_ops as ops
from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.triton_utils import triton
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
def with_triton_mode(fn):
"""Temporarily force the Triton fallback path"""
def wrapped(*args, **kwargs):
with patch("vllm.platforms.current_platform.is_cuda", return_value=False):
return fn(*args, **kwargs)
return wrapped
# TODO(luka): use standalone_compile utility
@ -21,78 +32,236 @@ def with_dyn_arg(fn: Callable, arg_index: int, dim_index: int):
return inner
torch._dynamo.config.recompile_limit = 8888
compilation_config = CompilationConfig(custom_ops=["none"])
with set_current_vllm_config(VllmConfig(compilation_config=compilation_config)):
torch_per_token_quant_fp8 = torch.compile(
QuantFP8(False, GroupShape.PER_TOKEN),
fullgraph=True,
dynamic=False, # recompile for different shapes
)
def bench_compile(fn: Callable):
# recompile for different shapes
fwd = torch.compile(fn, fullgraph=True, dynamic=False)
# First dim is explicitly dynamic to simulate vLLM usage
torch_per_token_quant_fp8 = with_dyn_arg(torch_per_token_quant_fp8, 0, 0)
return with_dyn_arg(fwd, 0, 0)
def cuda_per_token_quant_fp8(
input: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
return ops.scaled_fp8_quant(input)
torch._dynamo.config.recompile_limit = 8888
def calculate_diff(batch_size: int, seq_len: int):
"""Calculate difference between Triton and CUDA implementations."""
def calculate_diff(
batch_size: int,
hidden_size: int,
group_shape: GroupShape,
dtype: torch.dtype,
):
"""Calculate the difference between Inductor and CUDA implementations."""
device = torch.device("cuda")
x = torch.rand((batch_size * seq_len, 4096), dtype=torch.float16, device=device)
x = torch.rand((batch_size * hidden_size, 4096), dtype=dtype, device=device)
torch_out, torch_scale = torch_per_token_quant_fp8(x)
cuda_out, cuda_scale = cuda_per_token_quant_fp8(x)
quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False)
if torch.allclose(
cuda_out.to(torch.float32), torch_out.to(torch.float32), rtol=1e-3, atol=1e-5
) and torch.allclose(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5):
torch_out, torch_scale = bench_compile(quant_fp8.forward_native)(x)
torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x)
cuda_out, cuda_scale = quant_fp8.forward_cuda(x)
out_allclose = lambda o1, o2: torch.allclose(
o1.to(torch.float32),
o2.to(torch.float32),
rtol=1e-3,
atol=1e-5,
)
scale_allclose = lambda s1, s2: torch.allclose(s1, s2, rtol=1e-3, atol=1e-5)
if (
out_allclose(cuda_out, torch_out)
and scale_allclose(cuda_scale, torch_scale)
and out_allclose(cuda_out, torch_eager_out)
and scale_allclose(cuda_scale, torch_eager_scale)
):
print("✅ All implementations match")
else:
print("❌ Implementations differ")
batch_size_range = [1, 16, 32, 64, 128]
seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
configs = list(itertools.product(batch_size_range, seq_len_range))
configs = []
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["batch_size", "seq_len"],
x_vals=configs,
line_arg="provider",
line_vals=["torch", "cuda"],
line_names=["Torch", "CUDA"],
styles=[("blue", "-"), ("green", "-")],
ylabel="us",
plot_name="per-token-dynamic-quant-fp8-performance",
args={},
)
)
def benchmark_quantization(batch_size, seq_len, provider):
dtype = torch.float16
def benchmark_quantization(
batch_size,
hidden_size,
provider,
group_shape: GroupShape,
col_major: bool,
dtype: torch.dtype,
):
device = torch.device("cuda")
x = torch.randn(batch_size * seq_len, 4096, device=device, dtype=dtype)
x = torch.randn(batch_size * hidden_size, 4096, device=device, dtype=dtype)
quantiles = [0.5, 0.2, 0.8]
quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major)
if provider == "torch":
fn = lambda: torch_per_token_quant_fp8(x.clone())
fn = lambda: bench_compile(quant_fp8.forward_native)(x.clone())
elif provider == "cuda":
fn = lambda: cuda_per_token_quant_fp8(x.clone())
fn = lambda: quant_fp8.forward_cuda(x.clone())
elif provider == "triton":
if not group_shape.is_per_group():
# Triton only supported for per-group
return 0, 0, 0
fn = lambda: with_triton_mode(quant_fp8.forward_cuda)(x.clone())
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(fn, quantiles=quantiles)
return 1000 * ms, 1000 * max_ms, 1000 * min_ms
# TODO(luka) extract to utils
def compute_geomean_speedups(
df: pd.DataFrame,
baseline_col: str,
speedup_cols: list[str],
groupby_cols: list[str] | None = None,
) -> pd.DataFrame:
"""
Compute geometric mean speedups over a baseline column.
Args:
df: Input dataframe
baseline_col: Column to use as baseline
speedup_cols: Columns to compute speedups for
groupby_cols: Columns to group by. If None, compute over entire df.
Returns:
pd.DataFrame with geometric mean speedups
"""
from scipy.stats import gmean
def geo_speedup(group: pd.DataFrame) -> pd.Series:
ratios = {
col: (group[baseline_col] / group[col]).values for col in speedup_cols
}
return pd.Series({col: gmean(vals) for col, vals in ratios.items()})
if groupby_cols is None:
result = geo_speedup(df).to_frame().T
else:
result = (
df.groupby(groupby_cols)
.apply(geo_speedup, include_groups=False)
.reset_index()
)
return result
if __name__ == "__main__":
calculate_diff(batch_size=4, seq_len=4096)
benchmark_quantization.run(print_data=True)
parser = FlexibleArgumentParser(
description="Benchmark the various implementations of QuantFP8 (dynamic-only)"
)
parser.add_argument("-c", "--check", action="store_true")
parser.add_argument(
"--dtype", type=str, choices=["half", "bfloat16", "float"], default="half"
)
parser.add_argument(
"--hidden-sizes",
type=int,
nargs="+",
default=None,
help="Hidden sizes to benchmark (default: 1,16,64,128,256,512,1024,2048,4096)",
)
parser.add_argument(
"--batch-sizes",
type=int,
nargs="+",
default=None,
help="Batch sizes to benchmark (default: 1,16,32,64,128)",
)
parser.add_argument(
"--group-sizes",
type=int,
nargs="+",
default=None,
help="Group sizes for GroupShape(1,N) to benchmark. "
"Use 0 for PER_TENSOR, -1 for PER_TOKEN (default: 0,-1,64,128)",
)
parser.add_argument(
"--no-column-major",
action="store_true",
help="Disable column-major scales testing",
)
args = parser.parse_args()
assert args
dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
hidden_sizes = args.hidden_sizes or [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
batch_sizes = args.batch_sizes or [1, 16, 32, 64, 128]
if args.group_sizes is not None:
group_shapes = []
for size in args.group_sizes:
if size == 0:
group_shapes.append(GroupShape.PER_TENSOR)
elif size == -1:
group_shapes.append(GroupShape.PER_TOKEN)
else:
group_shapes.append(GroupShape(1, size))
else:
group_shapes = [
GroupShape.PER_TENSOR,
GroupShape.PER_TOKEN,
GroupShape(1, 64),
GroupShape(1, 128),
]
column_major_scales = [False] if args.no_column_major else [True, False]
config_gen = itertools.product(
group_shapes,
column_major_scales,
batch_sizes,
hidden_sizes,
)
# filter out column-major scales for non-group, reverse order
configs.extend(c[::-1] for c in config_gen if (c[0].is_per_group() or not c[1]))
print(f"Running {len(configs)} configurations:")
print(f" Hidden sizes: {hidden_sizes}")
print(f" Batch sizes: {batch_sizes}")
print(f" Group shapes: {[str(g) for g in group_shapes]}")
print(f" Column major scales: {column_major_scales}")
print()
if args.check:
for group_shape in group_shapes:
group_size = group_shape[1]
print(f"{group_size=}")
calculate_diff(
batch_size=4, hidden_size=4096, group_shape=group_shape, dtype=dtype
)
benchmark = triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["hidden_size", "batch_size", "col_major", "group_shape"],
x_vals=configs,
line_arg="provider",
line_vals=["torch", "cuda", "triton"],
line_names=["Torch (Compiled)", "CUDA", "Triton"],
styles=[("blue", "-"), ("green", "-"), ("black", "-")],
ylabel="us",
plot_name="QuantFP8 performance",
args={},
)
)(benchmark_quantization)
df = benchmark.run(print_data=True, dtype=dtype, return_df=True)
# Print geomean speedups
geo_table_grouped = compute_geomean_speedups(
df,
baseline_col="Torch (Compiled)",
speedup_cols=["CUDA", "Triton"],
groupby_cols=["col_major", "group_shape"],
)
print("Speedup over Torch (Compiled)")
print(geo_table_grouped.to_string(index=False))


@ -13,6 +13,10 @@ import torch.utils.benchmark as benchmark
from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.config import (
fp8_w8a8_moe_quant_config,
nvfp4_moe_quant_config,
)
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.scalar_type import scalar_types
@ -140,6 +144,12 @@ def bench_run(
a_fp8_scale: torch.Tensor,
num_repeats: int,
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_fp8_scale,
)
for _ in range(num_repeats):
fused_experts(
a,
@ -147,10 +157,7 @@ def bench_run(
w2,
topk_weights,
topk_ids,
use_fp8_w8a8=True,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_fp8_scale,
quant_config=quant_config,
)
def run_cutlass_moe_fp4(
@ -172,25 +179,27 @@ def bench_run(
device: torch.device,
num_repeats: int,
):
quant_config = nvfp4_moe_quant_config(
a1_gscale=a1_gs,
a2_gscale=a2_gs,
w1_scale=w1_blockscale,
w2_scale=w2_blockscale,
g1_alphas=w1_gs,
g2_alphas=w2_gs,
)
for _ in range(num_repeats):
with nvtx.annotate("cutlass_moe_fp4", color="green"):
cutlass_moe_fp4(
a=a,
a1_gscale=a1_gs,
a2_gscale=a2_gs,
w1_fp4=w1_fp4,
w1_blockscale=w1_blockscale,
w1_alphas=w1_gs,
w2_fp4=w2_fp4,
w2_blockscale=w2_blockscale,
w2_alphas=w2_gs,
topk_weights=topk_weights,
topk_ids=topk_ids,
m=m,
n=n,
k=k,
e=num_experts,
device=device,
quant_config=quant_config,
)
def run_cutlass_from_graph(
@ -211,26 +220,29 @@ def bench_run(
e: int,
device: torch.device,
):
quant_config = nvfp4_moe_quant_config(
a1_gscale=a1_gs,
a2_gscale=a2_gs,
w1_scale=w1_blockscale,
w2_scale=w2_blockscale,
g1_alphas=w1_gs,
g2_alphas=w2_gs,
)
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
return cutlass_moe_fp4(
a=a,
a1_gscale=a1_gs,
w1_fp4=w1_fp4,
w1_blockscale=w1_blockscale,
w1_alphas=w1_alphas,
a2_gscale=a2_gs,
w2_fp4=w2_fp4,
w2_blockscale=w2_blockscale,
w2_alphas=w2_alphas,
topk_weights=topk_weights,
topk_ids=topk_ids,
m=m,
n=n,
k=k,
e=num_experts,
device=device,
quant_config=quant_config,
)
def run_triton_from_graph(
@ -246,16 +258,18 @@ def bench_run(
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_fp8_scale,
)
return fused_experts(
a,
w1,
w2,
topk_weights,
topk_ids,
use_fp8_w8a8=True,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_fp8_scale,
quant_config=quant_config,
)
def replay_graph(graph, num_repeats):


@ -7,6 +7,7 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE
from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_experts,
@ -96,6 +97,11 @@ def bench_run(
a_scale: torch.Tensor,
num_repeats: int,
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_scale,
)
for _ in range(num_repeats):
fused_experts(
a,
@ -103,10 +109,7 @@ def bench_run(
w2,
topk_weights,
topk_ids,
use_fp8_w8a8=True,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_scale,
quant_config=quant_config,
)
def run_cutlass_moe(
@ -125,6 +128,12 @@ def bench_run(
per_act_token: bool,
num_repeats: int,
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
per_act_token_quant=per_act_token,
)
for _ in range(num_repeats):
cutlass_moe_fp8(
a,
@ -132,14 +141,11 @@ def bench_run(
w2,
topk_weights,
topk_ids,
w1_scale,
w2_scale,
ab_strides1,
ab_strides2,
c_strides1,
c_strides2,
per_act_token,
a1_scale=None,
quant_config=quant_config,
)
def run_cutlass_from_graph(
@ -156,6 +162,12 @@ def bench_run(
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
per_act_token_quant=per_act_token,
)
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
@ -165,14 +177,11 @@ def bench_run(
w2_q,
topk_weights,
topk_ids,
w1_scale,
w2_scale,
ab_strides1,
ab_strides2,
c_strides1,
c_strides2,
per_act_token,
a1_scale=None,
quant_config=quant_config,
)
def run_triton_from_graph(
@ -185,6 +194,11 @@ def bench_run(
w2_scale: torch.Tensor,
a_scale: torch.Tensor,
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_scale,
)
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
@ -194,10 +208,7 @@ def bench_run(
w2,
topk_weights,
topk_ids,
use_fp8_w8a8=True,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_scale,
quant_config=quant_config,
)
def replay_graph(graph, num_repeats):

View File

@ -464,7 +464,11 @@ class BenchmarkTensors:
for field_name in LoRAKernelMeta.__dataclass_fields__:
field = getattr(self.lora_kernel_meta, field_name)
assert isinstance(field, torch.Tensor)
setattr(self.lora_kernel_meta, field_name, to_device(field))
setattr(
self.lora_kernel_meta,
field_name,
to_device(field) if field_name != "no_lora_flag_cpu" else field,
)
def metadata(self) -> tuple[int, int, int]:
"""
@ -512,6 +516,7 @@ class BenchmarkTensors:
"lora_token_start_loc": self.lora_kernel_meta.lora_token_start_loc,
"lora_ids": self.lora_kernel_meta.active_lora_ids,
"scaling": 1.0,
"no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
}
def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
@ -552,6 +557,7 @@ class BenchmarkTensors:
"lora_ids": self.lora_kernel_meta.active_lora_ids,
"offset_start": 0,
"add_inputs": add_inputs,
"no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
}
def bench_fn_kwargs(

View File

@ -14,6 +14,10 @@ import ray
import torch
from ray.experimental.tqdm_ray import tqdm
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEQuantConfig,
_get_config_dtype_str,
)
from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config
@ -134,43 +138,36 @@ def benchmark_config(
def run():
from vllm.model_executor.layers.fused_moe import override_config
if use_fp8_w8a8:
quant_dtype = torch.float8_e4m3fn
elif use_int8_w8a16:
quant_dtype = torch.int8
else:
quant_dtype = None
quant_config = FusedMoEQuantConfig.make(
quant_dtype=quant_dtype,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_quant_shape,
)
with override_config(config):
if use_deep_gemm:
topk_weights, topk_ids, token_expert_indices = fused_topk(
x, input_gating, topk, False
)
return fused_experts(
x,
w1,
w2,
topk_weights,
topk_ids,
inplace=True,
use_fp8_w8a8=use_fp8_w8a8,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_quant_shape,
allow_deep_gemm=True,
)
else:
fused_moe(
x,
w1,
w2,
input_gating,
topk,
renormalize=True,
inplace=True,
use_fp8_w8a8=use_fp8_w8a8,
use_int8_w8a16=use_int8_w8a16,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_quant_shape,
)
topk_weights, topk_ids, token_expert_indices = fused_topk(
x, input_gating, topk, renormalize=not use_deep_gemm
)
return fused_experts(
x,
w1,
w2,
topk_weights,
topk_ids,
inplace=True,
quant_config=quant_config,
allow_deep_gemm=use_deep_gemm,
)
# JIT compilation & warmup
run()
@ -414,7 +411,7 @@ class BenchmarkWorker:
use_deep_gemm: bool = False,
) -> tuple[dict[str, int], float]:
current_platform.seed_everything(self.seed)
dtype_str = get_config_dtype_str(
dtype_str = _get_config_dtype_str(
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
)
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
@ -547,7 +544,7 @@ def save_configs(
block_quant_shape: list[int],
save_dir: str,
) -> None:
dtype_str = get_config_dtype_str(
dtype_str = _get_config_dtype_str(
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
)

View File

@ -55,6 +55,107 @@ output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75
----------------------------------------------------------------------------------------------------
```
### JSON configuration file for synthetic conversations generation
The input flag `--input-file` is used to determine the input conversations for the benchmark.<br/>
When the input is a JSON file with the field `"filetype": "generate_conversations"`, the tool will generate synthetic multi-turn (questions and answers) conversations.
The file `generate_multi_turn.json` is an example file.
The file must contain the sections `prompt_input` and `prompt_output`.
The `prompt_input` section must contain `num_turns`, `prefix_num_tokens` and `num_tokens`:
* `num_turns` - Total number of turns in the conversation (both user & assistant).<br/>
The final value will always be rounded to an even number so that each user turn has a reply.
* `prefix_num_tokens` - Tokens added at the start of only the **first user turn** in a conversation (unique per conversation).
* `num_tokens` - Total token length of each **user** message (one turn).
The `prompt_output` section must contain `num_tokens`:
* `num_tokens` - Total token length of each **assistant** message (one turn).
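Putting the pieces together, a minimal input file might look like the sketch below. The concrete values and distributions are illustrative (the shipped `generate_multi_turn.json` is a fuller example), and the distribution objects themselves are described in the next section.

```json
{
    "filetype": "generate_conversations",
    "prompt_input": {
        "num_turns": {
            "distribution": "uniform",
            "min": 2,
            "max": 8
        },
        "prefix_num_tokens": {
            "distribution": "lognormal",
            "average": 1000,
            "max": 5000
        },
        "num_tokens": {
            "distribution": "uniform",
            "min": 12,
            "max": 18
        }
    },
    "prompt_output": {
        "num_tokens": {
            "distribution": "constant",
            "value": 500
        }
    }
}
```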
### Random distributions for synthetic conversations generation
When creating an input JSON file (such as `generate_multi_turn.json`),<br/>
every numeric field (such as `num_turns` or `num_tokens`) requires a distribution.<br/>
The distribution determines how to randomly sample values for the field.
The available distributions are listed below.
**Note:** The optional `max` field (for lognormal, zipf, and poisson) can be used to cap sampled values at an upper bound.<br/>
It can be used to make sure that the total number of tokens in every request does not exceed `--max-model-len`.
#### constant
```json
{
"distribution": "constant",
"value": 500
}
```
* `value` - the fixed integer value (always returns the same number).
#### uniform
```json
{
"distribution": "uniform",
"min": 12,
"max": 18
}
```
* `min` - minimum value (inclusive).
* `max` - maximum value (inclusive); must be equal to or larger than `min`.
#### lognormal
```json
{
"distribution": "lognormal",
"average": 1000,
"max": 5000
}
```
You can parameterize the lognormal distribution in one of two ways:
Using the average and optional median ratio:
* `average` - target average value of the distribution.
* `median_ratio` - the ratio of the median to the average; controls the skewness. Must be in the range (0, 1).
Using the parameters of the underlying normal distribution:
* `mean` - mean of the underlying normal distribution.
* `sigma` - standard deviation of the underlying normal distribution.
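For example, a lognormal field specified directly through `mean` and `sigma` might look like the following sketch (the values are illustrative):

```json
{
    "distribution": "lognormal",
    "mean": 6,
    "sigma": 4,
    "max": 1500
}
```

This form gives full control over the distribution shape, but the resulting token counts are harder to anticipate than with the `average`-based form.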
#### zipf
```json
{
"distribution": "zipf",
"alpha": 1.2,
"max": 100
}
```
* `alpha` - skew parameter (> 1). Larger values produce stronger skew toward smaller integers.
#### poisson
```json
{
"distribution": "poisson",
"alpha": 10,
"max": 50
}
```
* `alpha` - expected value (λ). Also the variance of the distribution.
## ShareGPT Conversations
To run with the ShareGPT data, download the following ShareGPT dataset:

View File

@ -99,21 +99,105 @@ class PoissonDistribution(Distribution):
class LognormalDistribution(Distribution):
def __init__(
self, mean: float, sigma: float, max_val: Optional[int] = None
self,
mean: Optional[float] = None,
sigma: Optional[float] = None,
average: Optional[int] = None,
median_ratio: Optional[float] = None,
max_val: Optional[int] = None,
) -> None:
self.average = average
self.median_ratio = median_ratio
self.max_val = max_val
if average is not None:
if average < 1:
raise ValueError("Lognormal average must be positive")
if mean or sigma:
raise ValueError(
"When using lognormal average, you can't provide mean/sigma"
)
if self.median_ratio is None:
# Default value that provides a relatively wide range of values
self.median_ratio = 0.85
# Calculate mean/sigma of np.random.lognormal based on the average
mean, sigma = self._generate_lognormal_by_median(
target_average=self.average, median_ratio=self.median_ratio
)
else:
if mean is None or sigma is None:
raise ValueError(
"Must provide both mean and sigma if average is not used"
)
if mean <= 0 or sigma < 0:
raise ValueError(
"Lognormal mean must be positive and sigma must be non-negative"
)
# Mean and standard deviation of the underlying normal distribution
# Based on numpy.random.lognormal
self.mean = mean
self.sigma = sigma
self.max_val = max_val
@staticmethod
def _generate_lognormal_by_median(
target_average: int, median_ratio: float
) -> tuple[float, float]:
"""
Compute (mu, sigma) for a lognormal distribution given:
- a target average (mean of the distribution)
- a ratio of median / mean (controls skewness), assume mean > median
Background:
If Z ~ Normal(mu, sigma^2), then X = exp(Z) ~ LogNormal(mu, sigma).
* mean(X) = exp(mu + sigma^2 / 2)
* median(X) = exp(mu)
So:
median / mean = exp(mu) / exp(mu + sigma^2 / 2)
= exp(-sigma^2 / 2)
Rearranging:
sigma^2 = 2 * ln(mean / median)
mu = ln(median)
This gives a unique (mu, sigma) for any valid mean and median.
"""
# Check input validity: median must be smaller than mean
if median_ratio <= 0 or median_ratio >= 1:
raise ValueError("median_ratio must be in range (0, 1)")
target_median = target_average * median_ratio
# Solve sigma^2 = 2 * ln(mean / median)
sigma = np.sqrt(2 * np.log(target_average / target_median))
mu = np.log(target_median)
return mu, sigma
def sample(self, size: int = 1) -> np.ndarray:
samples = np.random.lognormal(mean=self.mean, sigma=self.sigma, size=size)
if self.average is not None:
# Scale to average
samples *= self.average / samples.mean()
if self.max_val:
samples = np.minimum(samples, self.max_val)
return np.round(samples).astype(int)
def __repr__(self) -> str:
return f"LognormalDistribution[{self.mean}, {self.sigma}]"
if self.average:
return (
f"LognormalDistribution[{self.average}, "
f"{self.median_ratio}, {self.max_val}]"
)
return f"LognormalDistribution[{self.mean}, {self.sigma}, {self.max_val}]"
class GenConvArgs(NamedTuple):
@ -173,10 +257,21 @@ def get_random_distribution(
return PoissonDistribution(conf["alpha"], max_val=max_val)
elif distribution == "lognormal":
max_val = conf.get("max", None)
if "average" in conf:
# Infer lognormal mean/sigma (numpy) from input average
median_ratio = conf.get("median_ratio", None)
return LognormalDistribution(
average=conf["average"], median_ratio=median_ratio, max_val=max_val
)
# Use mean/sigma directly (for full control over the distribution)
verify_field_exists(conf, "mean", section, subsection)
verify_field_exists(conf, "sigma", section, subsection)
max_val = conf.get("max", None)
return LognormalDistribution(conf["mean"], conf["sigma"], max_val=max_val)
return LognormalDistribution(
mean=conf["mean"], sigma=conf["sigma"], max_val=max_val
)
elif distribution == "uniform":
verify_field_exists(conf, "min", section, subsection)

View File

@ -15,9 +15,8 @@
},
"prefix_num_tokens": {
"distribution": "lognormal",
"mean": 6,
"sigma": 4,
"max": 1500
"average": 1000,
"max": 5000
},
"num_tokens": {
"distribution": "uniform",

View File

@ -1,38 +0,0 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <torch/all.h>
#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA
void cutlass_mla_decode_sm100a(torch::Tensor const& out,
torch::Tensor const& q_nope,
torch::Tensor const& q_pe,
torch::Tensor const& kv_c_and_k_pe_cache,
torch::Tensor const& seq_lens,
torch::Tensor const& page_table, double scale);
#endif
void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope,
torch::Tensor const& q_pe,
torch::Tensor const& kv_c_and_k_pe_cache,
torch::Tensor const& seq_lens,
torch::Tensor const& page_table, double scale) {
#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA
return cutlass_mla_decode_sm100a(out, q_nope, q_pe, kv_c_and_k_pe_cache,
seq_lens, page_table, scale);
#endif
TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled cutlass MLA");
}

View File

@ -1,225 +0,0 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include "cute/tensor.hpp"
#include "cutlass/cutlass.h"
#include "cutlass/kernel_hardware_info.h"
#include "cutlass_extensions/common.hpp"
#include "device/sm100_mla.hpp"
#include "kernel/sm100_mla_tile_scheduler.hpp"
using namespace cute;
using namespace cutlass::fmha::kernel;
template <typename T, bool PersistenceOption = true>
struct MlaSm100 {
using Element = T;
using ElementAcc = float;
using ElementOut = T;
using TileShape = Shape<_128, _128, Shape<_512, _64>>;
using TileShapeH = cute::tuple_element_t<0, TileShape>;
using TileShapeD = cute::tuple_element_t<2, TileShape>;
// H K (D_latent D_rope) B
using ProblemShape = cute::tuple<TileShapeH, int, TileShapeD, int>;
using StrideQ = cute::tuple<int64_t, _1, int64_t>; // H D B
using StrideK = cute::tuple<int64_t, _1, int64_t>; // K D B
using StrideO = StrideK; // H D B
using StrideLSE = cute::tuple<_1, int>; // H B
using TileScheduler =
std::conditional_t<PersistenceOption, Sm100MlaPersistentTileScheduler,
Sm100MlaIndividualTileScheduler>;
using FmhaKernel =
cutlass::fmha::kernel::Sm100FmhaMlaKernelTmaWarpspecialized<
TileShape, Element, ElementAcc, ElementOut, ElementAcc, TileScheduler,
/*kIsCpAsync=*/true>;
using Fmha = cutlass::fmha::device::MLA<FmhaKernel>;
};
template <typename T>
typename T::Fmha::Arguments args_from_options(
at::Tensor const& out, at::Tensor const& q_nope, at::Tensor const& q_pe,
at::Tensor const& kv_c_and_k_pe_cache, at::Tensor const& seq_lens,
at::Tensor const& page_table, double scale) {
cutlass::KernelHardwareInfo hw_info;
hw_info.device_id = q_nope.device().index();
hw_info.sm_count =
cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
hw_info.device_id);
int batches = q_nope.sizes()[0];
int page_count_per_seq = page_table.sizes()[1];
int page_count_total = kv_c_and_k_pe_cache.sizes()[0];
int page_size = kv_c_and_k_pe_cache.sizes()[1];
int max_seq_len = page_size * page_count_per_seq;
using TileShapeH = typename T::TileShapeH;
using TileShapeD = typename T::TileShapeD;
auto problem_shape =
cute::make_tuple(TileShapeH{}, max_seq_len, TileShapeD{}, batches);
auto [H, K, D, B] = problem_shape;
auto [D_latent, D_rope] = D;
using StrideQ = typename T::StrideQ;
using StrideK = typename T::StrideK;
using StrideO = typename T::StrideO;
using StrideLSE = typename T::StrideLSE;
StrideQ stride_Q_latent = cute::make_tuple(
static_cast<int64_t>(D_latent), _1{}, static_cast<int64_t>(H * D_latent));
StrideQ stride_Q_rope = cute::make_tuple(static_cast<int64_t>(D_rope), _1{},
static_cast<int64_t>(H * D_rope));
StrideK stride_C =
cute::make_tuple(static_cast<int64_t>(D_latent + D_rope), _1{},
static_cast<int64_t>(page_size * (D_latent + D_rope)));
StrideLSE stride_PT = cute::make_stride(_1{}, page_count_per_seq);
StrideLSE stride_LSE = cute::make_tuple(_1{}, static_cast<int>(H));
StrideO stride_O = cute::make_tuple(static_cast<int64_t>(D_latent), _1{},
static_cast<int64_t>(H * D_latent));
using Element = typename T::Element;
using ElementOut = typename T::ElementOut;
using ElementAcc = typename T::ElementAcc;
auto Q_latent_ptr = static_cast<Element*>(q_nope.data_ptr());
auto Q_rope_ptr = static_cast<Element*>(q_pe.data_ptr());
auto C_ptr = static_cast<Element*>(kv_c_and_k_pe_cache.data_ptr());
auto scale_f = static_cast<float>(scale);
typename T::Fmha::Arguments arguments{
problem_shape,
{scale_f, Q_latent_ptr, stride_Q_latent, Q_rope_ptr, stride_Q_rope, C_ptr,
stride_C, C_ptr + D_latent, stride_C,
static_cast<int*>(seq_lens.data_ptr()),
static_cast<int*>(page_table.data_ptr()), stride_PT, page_count_total,
page_size},
{static_cast<ElementOut*>(out.data_ptr()), stride_O,
static_cast<ElementAcc*>(nullptr), stride_LSE},
hw_info,
1, // split_kv
nullptr, // is_var_split_kv
};
// TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
// split_kv automatically based on batch size and sequence length to balance
// workload across available SMs. Consider using var_split_kv for manual
// control if needed.
T::Fmha::set_split_kv(arguments);
return arguments;
}
template <typename Element>
void runMla(at::Tensor const& out, at::Tensor const& q_nope,
at::Tensor const& q_pe, at::Tensor const& kv_c_and_k_pe_cache,
at::Tensor const& seq_lens, at::Tensor const& page_table,
float scale, cudaStream_t stream) {
using MlaSm100Type = MlaSm100<Element>;
typename MlaSm100Type::Fmha fmha;
auto arguments = args_from_options<MlaSm100Type>(
out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, scale);
size_t workspace_size = MlaSm100Type::Fmha::get_workspace_size(arguments);
auto const workspace_options =
torch::TensorOptions().dtype(torch::kUInt8).device(q_nope.device());
auto workspace = torch::empty(workspace_size, workspace_options);
CUTLASS_CHECK(fmha.can_implement(arguments));
CUTLASS_CHECK(fmha.initialize(arguments, workspace.data_ptr(), stream));
CUTLASS_CHECK(fmha.run(arguments, workspace.data_ptr(), stream));
}
void cutlass_mla_decode_sm100a(torch::Tensor const& out,
torch::Tensor const& q_nope,
torch::Tensor const& q_pe,
torch::Tensor const& kv_c_and_k_pe_cache,
torch::Tensor const& seq_lens,
torch::Tensor const& page_table, double scale) {
TORCH_CHECK(q_nope.device().is_cuda(), "q_nope must be on CUDA");
TORCH_CHECK(q_nope.dim() == 3, "q_nope must be a 3D tensor");
TORCH_CHECK(q_pe.dim() == 3, "q_pe must be a 3D tensor");
TORCH_CHECK(kv_c_and_k_pe_cache.dim() == 3,
"kv_c_and_k_pe_cache must be a 3D tensor");
TORCH_CHECK(seq_lens.dim() == 1, "seq_lens must be a 1D tensor");
TORCH_CHECK(page_table.dim() == 2, "page_table must be a 2D tensor");
TORCH_CHECK(out.dim() == 3, "out must be a 3D tensor");
auto B_q_nope = q_nope.size(0);
auto H_q_nope = q_nope.size(1);
auto D_q_nope = q_nope.size(2);
auto B_q_pe = q_pe.size(0);
auto H_q_pe = q_pe.size(1);
auto D_q_pe = q_pe.size(2);
auto B_pt = page_table.size(0);
auto PAGE_NUM = page_table.size(1);
auto PAGE_SIZE = kv_c_and_k_pe_cache.size(1);
auto D_ckv = kv_c_and_k_pe_cache.size(2);
auto B_o = out.size(0);
auto H_o = out.size(1);
auto D_o = out.size(2);
TORCH_CHECK(D_q_nope == 512, "D_q_nope must be equal to 512");
TORCH_CHECK(D_q_pe == 64, "D_q_pe must be equal to 64");
TORCH_CHECK(D_ckv == 576, "D_ckv must be equal to 576");
TORCH_CHECK(H_q_nope == H_q_pe && H_q_nope == H_o && H_o == 128,
"H_q_nope, H_q_pe, and H_o must be equal to 128");
TORCH_CHECK(PAGE_SIZE > 0 && (PAGE_SIZE & (PAGE_SIZE - 1)) == 0,
"PAGE_SIZE must be a power of 2");
TORCH_CHECK(
B_q_nope == B_q_pe && B_q_nope == B_pt && B_q_nope == B_o,
"Batch dims must be same for page_table, q_nope and q_pe, and out");
TORCH_CHECK(PAGE_NUM % (128 / PAGE_SIZE) == 0,
"PAGE_NUM must be divisible by 128 / PAGE_SIZE");
TORCH_CHECK(D_o == 512, "D_o must be equal to 512");
TORCH_CHECK(q_nope.dtype() == at::ScalarType::Half ||
q_nope.dtype() == at::ScalarType::BFloat16 ||
q_nope.dtype() == at::ScalarType::Float8_e4m3fn,
"q_nope must be a half, bfloat16, or float8_e4m3fn tensor");
TORCH_CHECK(kv_c_and_k_pe_cache.dtype() == q_nope.dtype() &&
q_nope.dtype() == q_pe.dtype(),
"kv_c_and_k_pe_cache, q_nope, and q_pe must be the same type");
TORCH_CHECK(seq_lens.dtype() == torch::kInt32,
"seq_lens must be a 32-bit integer tensor");
TORCH_CHECK(page_table.dtype() == torch::kInt32,
"page_table must be a 32-bit integer tensor");
auto in_dtype = q_nope.dtype();
const at::cuda::OptionalCUDAGuard device_guard(device_of(q_nope));
const cudaStream_t stream =
at::cuda::getCurrentCUDAStream(q_nope.get_device());
if (in_dtype == at::ScalarType::Half) {
runMla<cutlass::half_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens,
page_table, scale, stream);
} else if (in_dtype == at::ScalarType::BFloat16) {
runMla<cutlass::bfloat16_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache,
seq_lens, page_table, scale, stream);
} else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
runMla<cutlass::float_e4m3_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache,
seq_lens, page_table, scale, stream);
} else {
TORCH_CHECK(false, "Unsupported input data type of MLA");
}
}

View File

@ -133,6 +133,14 @@ public:
// printf(" sm_count = %d\n", sm_count);
int max_splits = ceil_div(K, 128);
max_splits = min(16, max_splits);
// TODO: This avoids a hang when the batch size is larger than 1 and
// there are more than 4 kv_splits.
// Discuss with NVIDIA how this can be fixed.
if (B > 1) {
max_splits = min(2, max_splits);
}
// printf(" max_splits = %d\n", max_splits);
int sms_per_batch = max(1, sm_count / B);
// printf(" sms_per_batch = %d\n", sms_per_batch);

View File

@ -17,4 +17,8 @@
#warning "unsupported vLLM cpu implementation"
#endif
#ifdef _OPENMP
#include <omp.h>
#endif
#endif

View File

@ -523,7 +523,7 @@ void onednn_mm(torch::Tensor& c, // [M, OC], row-major
CPU_KERNEL_GUARD_IN(onednn_mm)
TORCH_CHECK(a.dim() == 2);
TORCH_CHECK(a.stride(-1) == 1);
TORCH_CHECK(c.is_contiguous());
TORCH_CHECK(c.stride(-1) == 1);
MatMulPrimitiveHandler* ptr =
reinterpret_cast<MatMulPrimitiveHandler*>(handler);

17
csrc/cub_helpers.h Normal file
View File

@ -0,0 +1,17 @@
#pragma once
#ifndef USE_ROCM
#include <cub/cub.cuh>
#if CUB_VERSION >= 200800
#include <cuda/std/functional>
using CubAddOp = cuda::std::plus<>;
using CubMaxOp = cuda::maximum<>;
#else // if CUB_VERSION < 200800
using CubAddOp = cub::Sum;
using CubMaxOp = cub::Max;
#endif // CUB_VERSION
#else
#include <hipcub/hipcub.hpp>
using CubAddOp = cub::Sum;
using CubMaxOp = cub::Max;
#endif // USE_ROCM

View File

@ -1,15 +1,10 @@
#include "type_convert.cuh"
#include "dispatch_utils.h"
#include "cub_helpers.h"
#include <torch/cuda.h>
#include <c10/cuda/CUDAGuard.h>
#ifndef USE_ROCM
#include <cub/cub.cuh>
#else
#include <hipcub/hipcub.hpp>
#endif
namespace vllm {
// TODO(woosuk): Further optimize this kernel.
@ -30,7 +25,7 @@ __global__ void rms_norm_kernel(
using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);
if (threadIdx.x == 0) {
s_variance = rsqrtf(variance / hidden_size + epsilon);
@ -85,7 +80,7 @@ fused_add_rms_norm_kernel(
using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);
if (threadIdx.x == 0) {
s_variance = rsqrtf(variance / hidden_size + epsilon);
@ -126,7 +121,7 @@ fused_add_rms_norm_kernel(
using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);
if (threadIdx.x == 0) {
s_variance = rsqrtf(variance / hidden_size + epsilon);

View File

@ -8,16 +8,11 @@
#include "type_convert.cuh"
#include "quantization/fp8/common.cuh"
#include "dispatch_utils.h"
#include "cub_helpers.h"
#include <torch/cuda.h>
#include <c10/cuda/CUDAGuard.h>
#ifndef USE_ROCM
#include <cub/cub.cuh>
#else
#include <hipcub/hipcub.hpp>
#endif
namespace vllm {
// TODO(woosuk): Further optimize this kernel.
@ -39,7 +34,7 @@ __global__ void rms_norm_static_fp8_quant_kernel(
using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);
if (threadIdx.x == 0) {
s_variance = rsqrtf(variance / hidden_size + epsilon);
@ -100,7 +95,7 @@ fused_add_rms_norm_static_fp8_quant_kernel(
using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);
if (threadIdx.x == 0) {
s_variance = rsqrtf(variance / hidden_size + epsilon);
@ -149,7 +144,7 @@ fused_add_rms_norm_static_fp8_quant_kernel(
using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);
if (threadIdx.x == 0) {
s_variance = rsqrtf(variance / hidden_size + epsilon);

View File

@ -21,6 +21,7 @@
#include <torch/all.h>
#include <cuda_fp16.h>
#include <cuda_bf16.h>
#include <cuda/std/limits>
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
namespace cg = cooperative_groups;
@ -28,7 +29,6 @@ namespace cg = cooperative_groups;
namespace vllm {
namespace moe {
constexpr float kNegInfinity = INFINITY * -1;
constexpr unsigned FULL_WARP_MASK = 0xffffffff;
constexpr int32_t WARP_SIZE = 32;
constexpr int32_t BLOCK_SIZE = 512;
@ -411,14 +411,21 @@ __device__ inline float cuda_cast<float, __nv_bfloat16>(__nv_bfloat16 val) {
return __bfloat162float(val);
}
template <typename T>
__device__ inline T neg_inf() {
// cuda::std::numeric_limits<T>::infinity() returns `0` for [T=bf16 or fp16]
// so we need to cast from fp32
return cuda_cast<T, float>(-cuda::std::numeric_limits<float>::infinity());
}
template <typename T>
__device__ void topk_with_k2(T* output, T const* input,
cg::thread_block_tile<32> const& tile,
int32_t const lane_id,
int const num_experts_per_group) {
// Get the top2 per thread
T largest = -INFINITY;
T second_largest = -INFINITY;
T largest = neg_inf<T>();
T second_largest = neg_inf<T>();
if (num_experts_per_group > WARP_SIZE) {
for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
@ -513,8 +520,8 @@ __global__ void group_idx_and_topk_idx_kernel(
warp_id * topk;
s_topk_idx += warp_id * topk;
T value = kNegInfinity;
T topk_group_value = kNegInfinity;
T value = neg_inf<T>();
T topk_group_value = neg_inf<T>();
int32_t num_equalto_topkth_group;
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
@ -525,11 +532,8 @@ __global__ void group_idx_and_topk_idx_kernel(
if (case_id < num_tokens) {
// calculate group_idx
int32_t target_num_min = WARP_SIZE - n_group + topk_group;
if (lane_id < n_group &&
(isfinite(cuda_cast<float, T>(
group_scores[lane_id])))) // The check is necessary to avoid
// abnormal input
{
// The check is necessary to avoid abnormal input
if (lane_id < n_group && cuda::std::isfinite(group_scores[lane_id])) {
value = group_scores[lane_id];
}
@ -540,11 +544,11 @@ __global__ void group_idx_and_topk_idx_kernel(
__syncwarp(); // Ensure all threads have valid data before reduction
topk_group_value = cg::reduce(tile, value, cg::greater<T>());
if (value == topk_group_value) {
value = kNegInfinity;
value = neg_inf<T>();
}
pre_count_equal_to_top_value = count_equal_to_top_value;
count_equal_to_top_value = __popc(__ballot_sync(
FULL_WARP_MASK, (value == cuda_cast<T, float>(kNegInfinity))));
count_equal_to_top_value =
__popc(__ballot_sync(FULL_WARP_MASK, (value == neg_inf<T>())));
}
num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value;
}
@ -552,11 +556,10 @@ __global__ void group_idx_and_topk_idx_kernel(
warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t,
/* is_stable */ true>
queue((int32_t)topk, -INFINITY);
queue((int32_t)topk, neg_inf<T>());
int count_equalto_topkth_group = 0;
bool if_proceed_next_topk =
(topk_group_value != cuda_cast<T, float>(kNegInfinity));
bool if_proceed_next_topk = topk_group_value != neg_inf<T>();
if (case_id < num_tokens && if_proceed_next_topk) {
for (int i_group = 0; i_group < n_group; i_group++) {
if ((group_scores[i_group] > topk_group_value) ||
@ -566,10 +569,10 @@ __global__ void group_idx_and_topk_idx_kernel(
for (int32_t i = lane_id; i < align_num_experts_per_group;
i += WARP_SIZE) {
T candidates =
(i < num_experts_per_group) && isfinite(cuda_cast<float, T>(
scores_with_bias[offset + i]))
(i < num_experts_per_group) &&
cuda::std::isfinite(scores_with_bias[offset + i])
? scores_with_bias[offset + i]
: cuda_cast<T, float>(kNegInfinity);
: neg_inf<T>();
queue.add(candidates, offset + i);
}
if (group_scores[i_group] == topk_group_value) {
@ -598,7 +601,8 @@ __global__ void group_idx_and_topk_idx_kernel(
if (i < topk) {
s_topk_value[i] = value;
}
topk_sum += reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
topk_sum +=
cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
}
}

View File

@ -20,17 +20,7 @@
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include "../cuda_compat.h"
#ifndef USE_ROCM
#include <cub/util_type.cuh>
#include <cub/cub.cuh>
#include <cuda/std/functional>
using AddOp = cuda::std::plus<float>;
#else
#include <hipcub/util_type.hpp>
#include <hipcub/hipcub.hpp>
using AddOp = cub::Sum;
#endif
#include "../cub_helpers.h"
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
@ -79,7 +69,7 @@ __launch_bounds__(TPB) __global__
threadData = max(static_cast<float>(input[idx]), threadData);
}
const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, cub::Max());
const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, CubMaxOp());
if (threadIdx.x == 0)
{
float_max = maxElem;
@ -94,7 +84,7 @@ __launch_bounds__(TPB) __global__
threadData += exp((static_cast<float>(input[idx]) - float_max));
}
const auto Z = BlockReduce(tmpStorage).Reduce(threadData, AddOp());
const auto Z = BlockReduce(tmpStorage).Reduce(threadData, CubAddOp());
if (threadIdx.x == 0)
{

View File

@ -365,7 +365,6 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
int32_t compute_pipeline_offset_64 = 0;
for (int32_t t = n_tokens_lower; t < n_tokens_upper; ++t) {
__nv_bfloat16 y_max_bf16 = EPS;
__nv_bfloat162 results_bf162[2];
cp_async_wait<NUM_STAGES - 2>();
@ -405,7 +404,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
auto _y_max2 =
__hmax2(__habs2(results_bf162[0]), __habs2(results_bf162[1]));
y_max_bf16 = __hmax(_y_max2.x, _y_max2.y);
__nv_bfloat16 y_max_bf16 = __hmax(EPS, __hmax(_y_max2.x, _y_max2.y));
// An entire group is assigned to a single warp, so a simple warp reduce
// is used.

View File

@ -7,17 +7,10 @@
#include <cmath>
#include "../../cub_helpers.h"
#include "../../dispatch_utils.h"
#include "../vectorization_utils.cuh"
#ifndef USE_ROCM
#include <cub/cub.cuh>
#include <cub/util_type.cuh>
#else
#include <hipcub/hipcub.hpp>
#include <hipcub/util_type.hpp>
#endif
static inline __device__ int8_t float_to_int8_rn(float x) {
#ifdef USE_ROCM
static constexpr auto i8_min =
@ -173,7 +166,7 @@ __global__ void dynamic_scaled_int8_quant_kernel(
});
using BlockReduce = cub::BlockReduce<float, 256>;
__shared__ typename BlockReduce::TempStorage tmp;
float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x);
float block_max = BlockReduce(tmp).Reduce(thread_max, CubMaxOp{}, blockDim.x);
__shared__ float absmax;
if (tid == 0) {
absmax = block_max;

View File

@ -25,6 +25,8 @@
#include "cutlass_extensions/common.hpp"
#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
#include <cuda_runtime.h>
namespace vllm::cutlass_w4a8 {
using namespace cute;
@ -393,6 +395,71 @@ torch::Tensor pack_scale_fp8(torch::Tensor const& scales) {
return packed_scales;
}
/*
GPU-accelerated implementation of cutlass::unified_encode_int4b.
Constructs a lookup table in constant memory to map 8 bits
(two 4-bit values) at a time. Assumes memory is contiguous
and pointers are 16-byte aligned.
*/
__constant__ uint8_t kNibbleLUT[256];
__global__ void unified_encode_int4b_device(const uint8_t* in, uint8_t* out,
size_t nbytes) {
constexpr size_t V = sizeof(uint4); // 16 bytes
const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
const size_t nthreads = size_t(gridDim.x) * blockDim.x;
const size_t nvec = nbytes / V;
// 1-D grid-stride loop over 16-byte chunks
for (size_t vec = tid; vec < nvec; vec += nthreads) {
uint4 v = reinterpret_cast<const uint4*>(in)[vec];
uint8_t* b = reinterpret_cast<uint8_t*>(&v);
#pragma unroll
for (int i = 0; i < int(V); ++i) b[i] = kNibbleLUT[b[i]];
reinterpret_cast<uint4*>(out)[vec] = v;
}
}
static bool upload_lut() {
std::array<uint8_t, 256> lut{};
auto map_nib = [](uint8_t v) -> uint8_t {
// 1..7 -> (8 - v); keep 0 and 8..15
return (v == 0 || (v & 0x8)) ? v : uint8_t(8 - v);
};
for (int b = 0; b < 256; ++b) {
uint8_t lo = b & 0xF;
uint8_t hi = (b >> 4) & 0xF;
lut[b] = uint8_t((map_nib(hi) << 4) | map_nib(lo));
}
cudaError_t e = cudaMemcpyToSymbol(kNibbleLUT, lut.data(), lut.size(),
/*offset=*/0, cudaMemcpyHostToDevice);
return (e == cudaSuccess);
}
static bool unified_encode_int4b(cutlass::int4b_t const* in,
cutlass::int4b_t* out, size_t num_int4_elems) {
// Build/upload LUT
if (!upload_lut()) return false;
static_assert(sizeof(typename cutlass::int4b_t::Storage) == 1,
"int4 storage must be 1 byte");
const size_t nbytes = num_int4_elems >> 1;
auto* in_bytes = reinterpret_cast<uint8_t const*>(in);
auto* out_bytes = reinterpret_cast<uint8_t*>(out);
// kernel launch params
constexpr int block = 256;
const size_t nvec = nbytes / sizeof(uint4); // # of 16B vectors
int grid = int((nvec + block - 1) / block);
if (grid == 0) grid = 1; // ensure we still cover the tail in the kernel
unified_encode_int4b_device<<<grid, block>>>(in_bytes, out_bytes, nbytes);
cudaError_t err = cudaGetLastError();
return (err == cudaSuccess);
}
torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) {
TORCH_CHECK(B.dtype() == torch::kInt32);
TORCH_CHECK(B.dim() == 2);
@ -401,6 +468,7 @@ torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) {
int k = B.size(0) * PackFactor; // logical k
int n = B.size(1);
TORCH_CHECK((n * k) % 32 == 0, "need multiples of 32 int4s for 16B chunks");
auto B_ptr = static_cast<QuantType const*>(B.const_data_ptr());
auto B_packed_ptr = static_cast<QuantType*>(B_packed.data_ptr());
@ -409,7 +477,9 @@ torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) {
LayoutB_Reordered layout_B_reordered =
cute::tile_to_shape(LayoutAtomQuant{}, shape_B);
cutlass::unified_encode_int4b(B_ptr, B_packed_ptr, n * k);
bool ok =
vllm::cutlass_w4a8::unified_encode_int4b(B_ptr, B_packed_ptr, n * k);
TORCH_CHECK(ok, "unified_encode_int4b failed");
cutlass::reorder_tensor(B_packed_ptr, layout_B, layout_B_reordered);
return B_packed;

View File

@ -30,109 +30,41 @@
namespace vllm {
// silu in float32
__device__ __forceinline__ float silu(float x) {
return __fdividef(x, (1.f + __expf(-x)));
}
__device__ __forceinline__ float2 silu2(float2 x) {
return make_float2(silu(x.x), silu(x.y));
}
template <class Type>
__inline__ __device__ PackedVec<Type> compute_silu(PackedVec<Type>& vec,
PackedVec<Type>& vec2) {
__inline__ __device__ PackedVec<Type> compute_silu_mul(PackedVec<Type>& vec,
PackedVec<Type>& vec2) {
PackedVec<Type> result;
using packed_type = typename TypeConverter<Type>::Type;
#pragma unroll
for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; ++i) {
// silu_mul in float32
if constexpr (std::is_same_v<Type, half>) {
half2 val(0.5f, 0.5f);
half2 t0 = __hmul2(vec.elts[i], val);
half2 t1 = __hfma2(h2tanh(t0), val, val);
half2 t2 = __hmul2(vec.elts[i], t1);
result.elts[i] = __hmul2(t2, vec2.elts[i]);
float2 silu_vec = silu2(__half22float2(vec.elts[i]));
result.elts[i] =
__float22half2_rn(__fmul2_rn(silu_vec, __half22float2(vec2.elts[i])));
} else {
__nv_bfloat162 val(0.5f, 0.5f);
__nv_bfloat162 t0 = __hmul2(vec.elts[i], val);
__nv_bfloat162 t1 = __hfma2(h2tanh(t0), val, val);
__nv_bfloat162 t2 = __hmul2(vec.elts[i], t1);
result.elts[i] = __hmul2(t2, vec2.elts[i]);
float2 silu_vec = silu2(__bfloat1622float2(vec.elts[i]));
result.elts[i] = __float22bfloat162_rn(
__fmul2_rn(silu_vec, __bfloat1622float2(vec2.elts[i])));
}
}
return result;
}
// Quantizes the provided PackedVec into the uint32_t output
template <class Type, bool UE8M0_SF = false>
__device__ uint32_t silu_and_cvt_warp_fp16_to_fp4(PackedVec<Type>& vec,
PackedVec<Type>& vec2,
float SFScaleVal,
uint8_t* SFout) {
PackedVec<Type> out_silu = compute_silu(vec, vec2);
// Get absolute maximum values among the local 8 values.
auto localMax = __habs2(out_silu.elts[0]);
// Local maximum value.
#pragma unroll
for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
localMax = __hmax2(localMax, __habs2(out_silu.elts[i]));
}
// Get the absolute maximum among all 16 values (two threads).
localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
// Get the final absolute maximum values.
float vecMax = float(__hmax(localMax.x, localMax.y));
// Get the SF (max value of the vector / max value of e2m1).
// maximum value of e2m1 = 6.0.
// TODO: use half as compute data type.
float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f));
// 8 bits representation of the SF.
uint8_t fp8SFVal;
// Write the SF to global memory (STG.8).
if constexpr (UE8M0_SF) {
// Extract the 8 exponent bits from float32.
// float 32bits = 1 sign bit + 8 exponent bits + 23 mantissa bits.
uint32_t tmp = reinterpret_cast<uint32_t&>(SFValue) >> 23;
fp8SFVal = tmp & 0xff;
// Convert back to fp32.
reinterpret_cast<uint32_t&>(SFValue) = tmp << 23;
} else {
// Here SFValue is always positive, so E4M3 is the same as UE4M3.
__nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue);
reinterpret_cast<__nv_fp8_e4m3&>(fp8SFVal) = tmp;
// Convert back to fp32.
SFValue = float(tmp);
}
// Get the output scale.
// Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) *
// reciprocal(SFScaleVal))
float outputScale =
SFValue != 0 ? reciprocal_approximate_ftz(
SFValue * reciprocal_approximate_ftz(SFScaleVal))
: 0.0f;
if (SFout) {
// Write the SF to global memory (STG.8).
*SFout = fp8SFVal;
}
// Convert the input to float.
float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2];
#pragma unroll
for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
if constexpr (std::is_same_v<Type, half>) {
fp2Vals[i] = __half22float2(out_silu.elts[i]);
} else {
fp2Vals[i] = __bfloat1622float2(out_silu.elts[i]);
}
fp2Vals[i].x *= outputScale;
fp2Vals[i].y *= outputScale;
}
// Convert to e2m1 values.
uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals);
// Write the e2m1 values to global memory.
return e2m1Vec;
}
// Use UE4M3 by default.
template <class Type, bool UE8M0_SF = false>
__global__ void __launch_bounds__(1024, 4)
silu_and_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
float const* SFScale, uint32_t* out,
uint32_t* SFout) {
using PackedVec = PackedVec<Type>;
@ -160,16 +92,18 @@ __global__ void __launch_bounds__(1024, 4)
// Get the output tensor offset.
// Same as inOffset because 8 elements are packed into one uint32_t.
int64_t outOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
;
auto& out_pos = out[outOffset];
// Compute silu and mul
PackedVec out_silu_mul = compute_silu_mul(in_vec, in_vec2);
auto sf_out =
cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
CVT_FP4_NUM_THREADS_PER_SF>(
rowIdx, colIdx, numCols, SFout);
out_pos = silu_and_cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(
in_vec, in_vec2, SFScaleVal, sf_out);
out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(out_silu_mul, SFScaleVal,
sf_out);
}
}
}
@ -204,7 +138,7 @@ void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output, // [..., d]
input.scalar_type(), "silu_and_mul_nvfp4_quant_kernel", [&] {
using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
vllm::silu_and_cvt_fp16_to_fp4<cuda_type><<<grid, block, 0, stream>>>(
vllm::silu_mul_cvt_fp16_to_fp4<cuda_type><<<grid, block, 0, stream>>>(
m, n, input_ptr, input_sf_ptr,
reinterpret_cast<uint32_t*>(output_ptr),
reinterpret_cast<uint32_t*>(sf_out));

View File

@ -1,15 +1,10 @@
#include "common.cuh"
#include "dispatch_utils.h"
#include "../../cub_helpers.h"
#include "../vectorization_utils.cuh"
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/Exceptions.h>
#ifndef USE_ROCM
#include <cub/cub.cuh>
#else
#include <hipcub/hipcub.hpp>
#endif
namespace vllm {
template <typename scalar_t, typename fp8_type>
@ -116,7 +111,7 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel_strided(
using BlockReduce = cub::BlockReduce<float, 256>;
__shared__ typename BlockReduce::TempStorage tmp;
const float block_max =
BlockReduce(tmp).Reduce(absmax_val, cub::Max{}, blockDim.x);
BlockReduce(tmp).Reduce(absmax_val, CubMaxOp{}, blockDim.x);
__shared__ float token_scale;
if (tid == 0) {

View File

@ -8,11 +8,7 @@
#include "quantization/utils.cuh"
#include "quant_conversions.cuh"
#ifndef USE_ROCM
#include <cub/cub.cuh>
#else
#include <hipcub/hipcub.hpp>
#endif
#include "../../cub_helpers.h"
namespace vllm {
@ -36,7 +32,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x);
ss = BlockReduce(reduceStore).Reduce(ss, CubAddOp{}, blockDim.x);
__shared__ float s_rms;
if (threadIdx.x == 0) {
@ -73,7 +69,7 @@ __device__ void compute_dynamic_per_token_scales(
__shared__ typename BlockReduce::TempStorage reduceStore;
block_absmax_val_maybe =
BlockReduce(reduceStore)
.Reduce(block_absmax_val_maybe, cub::Max{}, blockDim.x);
.Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x);
__shared__ float s_token_scale;
if (threadIdx.x == 0) {
@ -169,7 +165,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x);
ss = BlockReduce(reduceStore).Reduce(ss, CubAddOp{}, blockDim.x);
__shared__ float s_rms;
if (threadIdx.x == 0) {
@ -240,7 +236,7 @@ __device__ void compute_dynamic_per_token_scales(
__shared__ typename BlockReduce::TempStorage reduceStore;
block_absmax_val_maybe =
BlockReduce(reduceStore)
.Reduce(block_absmax_val_maybe, cub::Max{}, blockDim.x);
.Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x);
__shared__ float s_token_scale;
if (threadIdx.x == 0) {

View File

@ -510,13 +510,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def("cutlass_sparse_compress(Tensor a) -> Tensor[]");
ops.impl("cutlass_sparse_compress", &cutlass_sparse_compress);
// CUTLASS MLA decode
ops.def(
"cutlass_mla_decode(Tensor! out, Tensor q_nope, Tensor q_pe,"
" Tensor kv_c_and_k_pe_cache, Tensor seq_lens,"
" Tensor page_table, float scale) -> ()");
ops.impl("cutlass_mla_decode", torch::kCUDA, &cutlass_mla_decode);
// SM100 CUTLASS MLA decode
ops.def(
"sm100_cutlass_mla_decode(Tensor! out, Tensor! lse, Tensor q_nope,"

View File

@ -283,6 +283,10 @@ WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM
ARG GDRCOPY_CUDA_VERSION=12.8
# Keep in line with FINAL_BASE_IMAGE
ARG GDRCOPY_OS_VERSION=Ubuntu22_04
SHELL ["/bin/bash", "-c"]
ARG DEADSNAKES_MIRROR_URL
@ -441,13 +445,21 @@ COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
RUN --mount=type=cache,target=/root/.cache/uv \
VLLM_DOCKER_BUILD_CONTEXT=1 /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"}
# Install EP kernels(pplx-kernels and DeepEP), NixL
COPY tools/install_gdrcopy.sh install_gdrcopy.sh
RUN set -eux; \
case "${TARGETPLATFORM}" in \
linux/arm64) UUARCH="aarch64" ;; \
linux/amd64) UUARCH="x64" ;; \
*) echo "Unsupported TARGETPLATFORM: ${TARGETPLATFORM}" >&2; exit 1 ;; \
esac; \
./install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}"; \
rm ./install_gdrcopy.sh
# Install EP kernels (pplx-kernels and DeepEP)
COPY tools/ep_kernels/install_python_libraries.sh install_python_libraries.sh
COPY tools/install_nixl.sh install_nixl.sh
ENV CUDA_HOME=/usr/local/cuda
RUN export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0a+PTX}" \
&& bash install_python_libraries.sh \
&& bash install_nixl.sh --force
&& bash install_python_libraries.sh
#################### vLLM installation IMAGE ####################

View File

@ -29,7 +29,10 @@ ARG VLLM_BRANCH="main"
ONBUILD RUN git clone ${VLLM_REPO} \
&& cd vllm \
&& git fetch -v --prune -- origin ${VLLM_BRANCH} \
&& git checkout FETCH_HEAD
&& git checkout FETCH_HEAD \
&& if [ ${VLLM_REPO} != "https://github.com/vllm-project/vllm.git" ] ; then \
git remote add upstream "https://github.com/vllm-project/vllm.git" \
&& git fetch upstream ; fi
FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
# -----------------------

View File

@ -1,25 +1,23 @@
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.4.1-complete
ARG HIPBLASLT_BRANCH="aa0bda7b"
ARG HIPBLAS_COMMON_BRANCH="9b80ba8e"
ARG LEGACY_HIPBLASLT_OPTION=
ARG TRITON_BRANCH="e5be006"
ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
ARG PYTORCH_BRANCH="f717b2af"
ARG PYTORCH_VISION_BRANCH="v0.21.0"
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
ARG TRITON_BRANCH="f9e5bf54"
ARG TRITON_REPO="https://github.com/ROCm/triton.git"
ARG PYTORCH_BRANCH="b2fb6885"
ARG PYTORCH_VISION_BRANCH="v0.23.0"
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG FA_BRANCH="1a7f4dfa"
ARG FA_BRANCH="0e60e394"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="4822e675"
ARG AITER_BRANCH="2ab9f4cd"
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
FROM ${BASE_IMAGE} AS base
ENV PATH=/opt/rocm/llvm/bin:$PATH
ENV PATH=/opt/rocm/llvm/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ENV ROCM_PATH=/opt/rocm
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx1100;gfx1101;gfx1200;gfx1201
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
ENV AITER_ROCM_ARCH=gfx942;gfx950
ARG PYTHON_VERSION=3.12
@ -45,29 +43,6 @@ RUN apt-get update -y \
RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython
FROM base AS build_hipblaslt
ARG HIPBLASLT_BRANCH
ARG HIPBLAS_COMMON_BRANCH
# Set to "--legacy_hipblas_direct" for ROCm<=6.2
ARG LEGACY_HIPBLASLT_OPTION
RUN git clone https://github.com/ROCm/hipBLAS-common.git
RUN apt-get remove -y hipblaslt && apt-get autoremove -y && apt-get autoclean -y
RUN cd hipBLAS-common \
&& git checkout ${HIPBLAS_COMMON_BRANCH} \
&& mkdir build \
&& cd build \
&& cmake .. \
&& make package \
&& dpkg -i ./*.deb
RUN git clone https://github.com/ROCm/hipBLASLt
RUN cd hipBLASLt \
&& git checkout ${HIPBLASLT_BRANCH} \
&& apt-get install -y llvm-dev \
&& ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
&& cd build/release \
&& make package
RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
FROM base AS build_triton
ARG TRITON_BRANCH
ARG TRITON_REPO
@ -121,13 +96,11 @@ RUN cd aiter \
&& git checkout ${AITER_BRANCH} \
&& git submodule update --init --recursive \
&& pip install -r requirements.txt
RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
FROM base AS debs
RUN mkdir /app/debs
RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
cp /install/*.deb /app/debs
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
@ -138,11 +111,6 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
FROM base AS final
RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
dpkg -i /install/*deb \
&& perl -p -i -e 's/, hipblas-common-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \
&& perl -p -i -e 's/, hipblaslt-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \
&& perl -p -i -e 's/, hipblaslt \([^)]*?\), /, /g' /var/lib/dpkg/status
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
pip install /install/*.whl
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
@ -153,9 +121,6 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
pip install /install/*.whl
ARG BASE_IMAGE
ARG HIPBLAS_COMMON_BRANCH
ARG HIPBLASLT_BRANCH
ARG LEGACY_HIPBLASLT_OPTION
ARG TRITON_BRANCH
ARG TRITON_REPO
ARG PYTORCH_BRANCH
@ -167,9 +132,6 @@ ARG FA_REPO
ARG AITER_BRANCH
ARG AITER_REPO
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
&& echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
&& echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \
&& echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \
&& echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \
&& echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \
@ -177,5 +139,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
&& echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt

View File

@ -14,7 +14,7 @@ API documentation for vLLM's configuration classes.
- [vllm.config.LoRAConfig][]
- [vllm.config.MultiModalConfig][]
- [vllm.config.PoolerConfig][]
- [vllm.config.DecodingConfig][]
- [vllm.config.StructuredOutputsConfig][]
- [vllm.config.ObservabilityConfig][]
- [vllm.config.KVTransferConfig][]
- [vllm.config.CompilationConfig][]
@ -46,7 +46,6 @@ Engine classes for offline and online inference.
Inference parameters for vLLM APIs.
[](){ #sampling-params }
[](){ #pooling-params }
- [vllm.SamplingParams][]
- [vllm.PoolingParams][]

View File

@ -175,6 +175,7 @@ Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to u
Known supported models:
- GLM-4.5V GLM-4.1V (<gh-pr:23168>)
- InternVL (<gh-pr:23909>)
- Kimi-VL (<gh-pr:23817>)
- Llama4 (<gh-pr:18368>)
- MiniCPM-V-2.5 or above (<gh-pr:23327>, <gh-pr:23948>)

View File

@ -26,113 +26,123 @@ See <gh-file:LICENSE>.
## Developing
--8<-- "docs/getting_started/installation/python_env_setup.inc.md"
Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation.
Check out the [building from source][build-from-source] documentation for details.
For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
### Building the docs with MkDocs
#### Introduction to MkDocs
[MkDocs](https://github.com/mkdocs/mkdocs) is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file.
#### Install MkDocs and Plugins
Install MkDocs along with the [plugins](https://github.com/vllm-project/vllm/blob/main/mkdocs.yaml) used in the vLLM documentation, as well as required dependencies:
```bash
uv pip install -r requirements/docs.txt
```
!!! note
Ensure that your Python version is compatible with the plugins (e.g., `mkdocs-awesome-nav` requires Python 3.10+)
#### Verify Installation
Confirm that MkDocs is correctly installed:
```bash
mkdocs --version
```
Example output:
```console
mkdocs, version 1.6.1 from /opt/miniconda3/envs/mkdoc/lib/python3.10/site-packages/mkdocs (Python 3.10)
```
#### Clone the `vLLM` repository
The first step of contributing to vLLM is to clone the GitHub repository:
```bash
git clone https://github.com/vllm-project/vllm.git
cd vllm
```
#### Start the Development Server
Then, configure your Python virtual environment.
MkDocs comes with a built-in dev-server that lets you preview your documentation as you work on it. Make sure you're in the same directory as the `mkdocs.yml` configuration file, and then start the server by running the `mkdocs serve` command:
--8<-- "docs/getting_started/installation/python_env_setup.inc.md"
If you are only developing vLLM's Python code, install vLLM using:
```bash
mkdocs serve
VLLM_USE_PRECOMPILED=1 uv pip install -e .
```
Example output:
If you are developing vLLM's Python and CUDA/C++ code, install vLLM using:
```console
INFO - Documentation built in 106.83 seconds
INFO - [22:02:02] Watching paths for changes: 'docs', 'mkdocs.yaml'
INFO - [22:02:02] Serving on http://127.0.0.1:8000/
```bash
uv pip install -e .
```
#### View in Your Browser
For more details about installing from source and installing for other hardware, check out the [installation instructions](../getting_started/installation/README.md) for your hardware and head to the "Build wheel from source" section.
Open up [http://127.0.0.1:8000/](http://127.0.0.1:8000/) in your browser to see a live preview:.
#### Learn More
For additional features and advanced configurations, refer to the official [MkDocs Documentation](https://www.mkdocs.org/).
## Testing
??? console "Commands"
```bash
# These commands are only for Nvidia CUDA platforms.
uv pip install -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
# Linting, formatting and static type checking
pre-commit install
# You can manually run pre-commit with
pre-commit run --all-files --show-diff-on-failure
# To manually run something from CI that does not run
# locally by default, you can run:
pre-commit run mypy-3.9 --hook-stage manual --all-files
# Unit tests
pytest tests/
# Run tests for a single test file with detailed output
pytest -s -v tests/test_logger.py
```
For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
!!! tip
Since the <gh-file:docker/Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12.
vLLM is compatible with Python versions 3.9 to 3.12. However, vLLM's default [Dockerfile](gh-file:docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12.
Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.
!!! note "Install python3-dev if Python.h is missing"
### Linting
vLLM uses `pre-commit` to lint and format the codebase. See <https://pre-commit.com/#usage> if `pre-commit` is new to you. Setting up `pre-commit` is as easy as:
```bash
uv pip install pre-commit
pre-commit install
```
vLLM's `pre-commit` hooks will now run automatically every time you commit.
!!! tip "Tips"
You can manually run the `pre-commit` hooks using:
```bash
pre-commit run # runs on staged files
pre-commit run -a # runs on all files (short for --all-files)
```
---
Some `pre-commit` hooks only run in CI. If you need to, you can run them locally with:
```bash
pre-commit run --hook-stage manual markdownlint
pre-commit run --hook-stage manual mypy-3.9
```
### Documentation
MkDocs is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file, <gh-file:mkdocs.yaml>.
Get started with:
```bash
uv pip install -r requirements/docs.txt
```
!!! tip
Ensure that your Python version is compatible with the plugins
(e.g., `mkdocs-awesome-nav` requires Python 3.10+)
MkDocs comes with a built-in dev-server that lets you preview your documentation as you work on it.
From the root of the repository, run:
```bash
mkdocs serve # with API ref (~10 minutes)
API_AUTONAV_EXCLUDE=vllm mkdocs serve # API ref off (~15 seconds)
```
Once you see `Serving on http://127.0.0.1:8000/` in the logs, the live preview is ready!
Open <http://127.0.0.1:8000/> in your browser to see it.
For additional features and advanced configurations, refer to the:
- [MkDocs documentation](https://www.mkdocs.org/)
- [Material for MkDocs documentation](https://squidfunk.github.io/mkdocs-material/) (the MkDocs theme we use)
### Testing
vLLM uses `pytest` to test the codebase.
```bash
# Install the test dependencies used in CI (CUDA only)
uv pip install -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
# Install some common test dependencies (hardware agnostic)
uv pip install pytest pytest-asyncio
# Run all tests
pytest tests/
# Run tests for a single test file with detailed output
pytest -s -v tests/test_logger.py
```
!!! tip "Install python3-dev if Python.h is missing"
If any of the above commands fails with `Python.h: No such file or directory`, install
`python3-dev` with `sudo apt install python3-dev`.
!!! note
!!! warning "Warnings"
Currently, the repository is not fully checked by `mypy`.
!!! note
---
Currently, not all unit tests pass when run on CPU platforms. If you don't have access to a GPU
platform to run unit tests locally, rely on the continuous integration system to run the tests for
now.
@ -194,8 +204,7 @@ appropriately to indicate the type of change. Please use one of the following:
The PR needs to meet the following code quality standards:
- We adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html).
- Pass all linter checks.
- The code needs to be well-documented to ensure future contributors can easily
understand the code.
- Include sufficient tests to ensure the project stays correct and robust. This

View File

@ -1,9 +1,787 @@
---
toc_depth: 4
---
# Benchmark Suites
vLLM provides comprehensive benchmarking tools for performance testing and evaluation:
- **[Benchmark CLI]**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing
- **[Performance benchmarks][performance-benchmarks]**: Automated CI benchmarks for development
- **[Nightly benchmarks][nightly-benchmarks]**: Comparative benchmarks against alternatives
[Benchmark CLI]: #benchmark-cli
## Benchmark CLI
This section guides you through running benchmark tests with the extensive
datasets supported on vLLM. It's a living document, updated as new features and datasets
become available.
### Dataset Overview
<style>
th {
min-width: 0 !important;
}
</style>
| Dataset | Online | Offline | Data Path |
|---------|--------|---------|-----------|
| ShareGPT | ✅ | ✅ | `wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json` |
| ShareGPT4V (Image) | ✅ | ✅ | `wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/blob/main/sharegpt4v_instruct_gpt4-vision_cap100k.json`<br>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:<br>`wget http://images.cocodataset.org/zips/train2017.zip` |
| ShareGPT4Video (Video) | ✅ | ✅ | `git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video` |
| BurstGPT | ✅ | ✅ | `wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv` |
| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
| Random | ✅ | ✅ | `synthetic` |
| RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` |
| Prefix Repetition | ✅ | ✅ | `synthetic` |
| HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
| HuggingFace-MMVU | ✅ | ✅ | `yale-nlp/MMVU` |
| HuggingFace-InstructCoder | ✅ | ✅ | `likaixin/InstructCoder` |
| HuggingFace-AIMO | ✅ | ✅ | `AI-MO/aimo-validation-aime`, `AI-MO/NuminaMath-1.5`, `AI-MO/NuminaMath-CoT` |
| HuggingFace-Other | ✅ | ✅ | `lmms-lab/LLaVA-OneVision-Data`, `Aeala/ShareGPT_Vicuna_unfiltered` |
| HuggingFace-MTBench | ✅ | ✅ | `philschmid/mt-bench` |
| HuggingFace-Blazedit | ✅ | ✅ | `vdaita/edit_5k_char`, `vdaita/edit_10k_char` |
| Spec Bench | ✅ | ✅ | `wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl` |
| Custom | ✅ | ✅ | Local file: `data.jsonl` |
Legend:
- ✅ - supported
- 🟡 - partially supported
- 🚧 - to be supported
!!! note
For HuggingFace datasets, `dataset-name` should be set to `hf`.
For a local `dataset-path`, set `hf-name` to its Hugging Face ID, for example:
```bash
--dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat
```
### Examples
#### 🚀 Online Benchmark
<details class="admonition abstract" markdown="1">
<summary>Show more</summary>
First start serving your model
```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B
```
Then run the benchmarking script
```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
vllm bench serve \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \
--dataset-name sharegpt \
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--num-prompts 10
```
If successful, you will see the following output
```text
============ Serving Benchmark Result ============
Successful requests: 10
Benchmark duration (s): 5.78
Total input tokens: 1369
Total generated tokens: 2212
Request throughput (req/s): 1.73
Output token throughput (tok/s): 382.89
Total Token throughput (tok/s): 619.85
---------------Time to First Token----------------
Mean TTFT (ms): 71.54
Median TTFT (ms): 73.88
P99 TTFT (ms): 79.49
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 7.91
Median TPOT (ms): 7.96
P99 TPOT (ms): 8.03
---------------Inter-token Latency----------------
Mean ITL (ms): 7.74
Median ITL (ms): 7.70
P99 ITL (ms): 8.39
==================================================
```
##### Custom Dataset
If the dataset you want to benchmark is not yet supported in vLLM, you can still benchmark it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have a "prompt" field per entry, e.g., `data.jsonl`:
```json
{"prompt": "What is the capital of India?"}
{"prompt": "What is the capital of Iran?"}
{"prompt": "What is the capital of China?"}
```
```bash
# start server
VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct
```
```bash
# run benchmarking script
vllm bench serve --port 9001 --save-result --save-detailed \
--backend vllm \
--model meta-llama/Llama-3.1-8B-Instruct \
--endpoint /v1/completions \
--dataset-name custom \
--dataset-path <path-to-your-data-jsonl> \
--custom-skip-chat-template \
--num-prompts 80 \
--max-concurrency 1 \
--temperature=0.3 \
--top-p=0.75 \
--result-dir "./log/"
```
You can skip applying the chat template if your data already includes it by using `--custom-skip-chat-template`.
##### VisionArena Benchmark for Vision Language Models
```bash
# need a model with vision capability here
vllm serve Qwen/Qwen2-VL-7B-Instruct
```
```bash
vllm bench serve \
--backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path lmarena-ai/VisionArena-Chat \
--hf-split train \
--num-prompts 1000
```
##### InstructCoder Benchmark with Speculative Decoding
``` bash
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
--speculative-config $'{"method": "ngram",
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
"prompt_lookup_min": 2}'
```
``` bash
vllm bench serve \
--model meta-llama/Meta-Llama-3-8B-Instruct \
--dataset-name hf \
--dataset-path likaixin/InstructCoder \
--num-prompts 2048
```
##### Spec Bench Benchmark with Speculative Decoding
``` bash
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
--speculative-config $'{"method": "ngram",
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
"prompt_lookup_min": 2}'
```
[SpecBench dataset](https://github.com/hemingkx/Spec-Bench)
Run all categories:
``` bash
# Download the dataset using:
# wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl
vllm bench serve \
--model meta-llama/Meta-Llama-3-8B-Instruct \
--dataset-name spec_bench \
--dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
--num-prompts -1
```
Available categories include `[writing, roleplay, reasoning, math, coding, extraction, stem, humanities, translation, summarization, qa, math_reasoning, rag]`.
Run only a specific category like "summarization":
``` bash
vllm bench serve \
--model meta-llama/Meta-Llama-3-8B-Instruct \
--dataset-name spec_bench \
--dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
--num-prompts -1 \
--spec-bench-category "summarization"
```
##### Other HuggingFaceDataset Examples
```bash
vllm serve Qwen/Qwen2-VL-7B-Instruct
```
`lmms-lab/LLaVA-OneVision-Data`:
```bash
vllm bench serve \
--backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path lmms-lab/LLaVA-OneVision-Data \
--hf-split train \
--hf-subset "chart2text(cauldron)" \
--num-prompts 10
```
`Aeala/ShareGPT_Vicuna_unfiltered`:
```bash
vllm bench serve \
--backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
--hf-split train \
--num-prompts 10
```
`AI-MO/aimo-validation-aime`:
``` bash
vllm bench serve \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path AI-MO/aimo-validation-aime \
--num-prompts 10 \
--seed 42
```
`philschmid/mt-bench`:
``` bash
vllm bench serve \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path philschmid/mt-bench \
--num-prompts 80
```
`vdaita/edit_5k_char` or `vdaita/edit_10k_char`:
``` bash
vllm bench serve \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path vdaita/edit_5k_char \
--num-prompts 90 \
--blazedit-min-distance 0.01 \
--blazedit-max-distance 0.99
```
##### Running With Sampling Parameters
When using OpenAI-compatible backends such as `vllm`, optional sampling
parameters can be specified. Example client command:
```bash
vllm bench serve \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \
--dataset-name sharegpt \
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--top-k 10 \
--top-p 0.9 \
--temperature 0.5 \
--num-prompts 10
```
##### Running With Ramp-Up Request Rate
The benchmark tool also supports ramping up the request rate over the
duration of the benchmark run. This can be useful for stress testing the
server or finding the maximum throughput that it can handle, given some latency budget.
Two ramp-up strategies are supported:
- `linear`: Increases the request rate linearly from a start value to an end value.
- `exponential`: Increases the request rate exponentially.
The following arguments can be used to control the ramp-up:
- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
</details>
#### 📈 Offline Throughput Benchmark
<details class="admonition abstract" markdown="1">
<summary>Show more</summary>
```bash
vllm bench throughput \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset-name sonnet \
--dataset-path vllm/benchmarks/sonnet.txt \
--num-prompts 10
```
If successful, you will see the following output
```text
Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
Total num prompt tokens: 5014
Total num output tokens: 1500
```
##### VisionArena Benchmark for Vision Language Models
```bash
vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path lmarena-ai/VisionArena-Chat \
--num-prompts 1000 \
--hf-split train
```
The `num prompt tokens` now includes image token counts
```text
Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
Total num prompt tokens: 14527
Total num output tokens: 1280
```
##### InstructCoder Benchmark with Speculative Decoding
``` bash
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_USE_V1=1 \
vllm bench throughput \
--dataset-name=hf \
--dataset-path=likaixin/InstructCoder \
--model=meta-llama/Meta-Llama-3-8B-Instruct \
--input-len=1000 \
--output-len=100 \
--num-prompts=2048 \
--async-engine \
--speculative-config $'{"method": "ngram",
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
"prompt_lookup_min": 2}'
```
```text
Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
Total num prompt tokens: 261136
Total num output tokens: 204800
```
##### Other HuggingFaceDataset Examples
`lmms-lab/LLaVA-OneVision-Data`:
```bash
vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path lmms-lab/LLaVA-OneVision-Data \
--hf-split train \
--hf-subset "chart2text(cauldron)" \
--num-prompts 10
```
`Aeala/ShareGPT_Vicuna_unfiltered`:
```bash
vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
--hf-split train \
--num-prompts 10
```
`AI-MO/aimo-validation-aime`:
```bash
vllm bench throughput \
--model Qwen/QwQ-32B \
--backend vllm \
--dataset-name hf \
--dataset-path AI-MO/aimo-validation-aime \
--hf-split train \
--num-prompts 10
```
Benchmark with LoRA adapters:
``` bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
vllm bench throughput \
--model meta-llama/Llama-2-7b-hf \
--backend vllm \
--dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--dataset_name sharegpt \
--num-prompts 10 \
--max-loras 2 \
--max-lora-rank 8 \
--enable-lora \
--lora-path yard1/llama-2-7b-sql-lora-test
```
</details>
#### 🛠️ Structured Output Benchmark
<details class="admonition abstract" markdown="1">
<summary>Show more</summary>
Benchmark the performance of structured output generation (JSON, grammar, regex).
##### Server Setup
```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B
```
##### JSON Schema Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset json \
--structured-output-ratio 1.0 \
--request-rate 10 \
--num-prompts 1000
```
##### Grammar-based Generation Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset grammar \
--structure-type grammar \
--request-rate 10 \
--num-prompts 1000
```
##### Regex-based Generation Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset regex \
--request-rate 10 \
--num-prompts 1000
```
##### Choice-based Generation Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset choice \
--request-rate 10 \
--num-prompts 1000
```
##### XGrammar Benchmark Dataset
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset xgrammar_bench \
--request-rate 10 \
--num-prompts 1000
```
</details>
#### 📚 Long Document QA Benchmark
<details class="admonition abstract" markdown="1">
<summary>Show more</summary>
Benchmark the performance of long document question-answering with prefix caching.
##### Basic Long Document QA Test
```bash
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 16 \
--document-length 2000 \
--output-len 50 \
--repeat-count 5
```
##### Different Repeat Modes
```bash
# Random mode (default) - shuffle prompts randomly
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--document-length 3000 \
--repeat-count 3 \
--repeat-mode random
# Tile mode - repeat entire prompt list in sequence
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--document-length 3000 \
--repeat-count 3 \
--repeat-mode tile
# Interleave mode - repeat each prompt consecutively
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--document-length 3000 \
--repeat-count 3 \
--repeat-mode interleave
```
</details>
#### 🗂️ Prefix Caching Benchmark
<details class="admonition abstract" markdown="1">
<summary>Show more</summary>
Benchmark the efficiency of automatic prefix caching.
##### Fixed Prompt with Prefix Caching
```bash
python3 benchmarks/benchmark_prefix_caching.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-prompts 1 \
--repeat-count 100 \
--input-length-range 128:256
```
##### ShareGPT Dataset with Prefix Caching
```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 benchmarks/benchmark_prefix_caching.py \
--model meta-llama/Llama-2-7b-chat-hf \
--dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
--enable-prefix-caching \
--num-prompts 20 \
--repeat-count 5 \
--input-length-range 128:256
```
##### Prefix Repetition Dataset
```bash
vllm bench serve \
--backend openai \
--model meta-llama/Llama-2-7b-chat-hf \
--dataset-name prefix_repetition \
--num-prompts 100 \
--prefix-repetition-prefix-len 512 \
--prefix-repetition-suffix-len 128 \
--prefix-repetition-num-prefixes 5 \
--prefix-repetition-output-len 128
```
</details>
#### ⚡ Request Prioritization Benchmark
<details class="admonition abstract" markdown="1">
<summary>Show more</summary>
Benchmark the performance of request prioritization in vLLM.
##### Basic Prioritization Test
```bash
python3 benchmarks/benchmark_prioritization.py \
--model meta-llama/Llama-2-7b-chat-hf \
--input-len 128 \
--output-len 64 \
--num-prompts 100 \
--scheduling-policy priority
```
##### Multiple Sequences per Prompt
```bash
python3 benchmarks/benchmark_prioritization.py \
--model meta-llama/Llama-2-7b-chat-hf \
--input-len 128 \
--output-len 64 \
--num-prompts 100 \
--scheduling-policy priority \
--n 2
```
</details>
#### 👁️ Multi-Modal Benchmark
<details class="admonition abstract" markdown="1">
<summary>Show more</summary>
Benchmark the performance of multi-modal requests in vLLM.
##### Images (ShareGPT4V)
Start vLLM:
```bash
python -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dtype bfloat16 \
--limit-mm-per-prompt '{"image": 1}' \
--allowed-local-media-path /path/to/sharegpt4v/images
```
Send requests with images:
```bash
vllm bench serve \
--backend openai-chat \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dataset-name sharegpt \
--dataset-path /path/to/ShareGPT4V/sharegpt4v_instruct_gpt4-vision_cap100k.json \
--num-prompts 100 \
--save-result \
--result-dir ~/vllm_benchmark_results \
--save-detailed \
--endpoint /v1/chat/completions
```
##### Videos (ShareGPT4Video)
Start vLLM:
```bash
python -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dtype bfloat16 \
--limit-mm-per-prompt '{"video": 1}' \
--allowed-local-media-path /path/to/sharegpt4video/videos
```
Send requests with videos:
```bash
vllm bench serve \
--backend openai-chat \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dataset-name sharegpt \
--dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \
--num-prompts 100 \
--save-result \
--result-dir ~/vllm_benchmark_results \
--save-detailed \
--endpoint /v1/chat/completions
```
##### Synthetic Random Images (random-mm)
Generate synthetic image inputs alongside random text prompts to stress-test vision models without external datasets.
Notes:
- Works only with online benchmark via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
- Video sampling is not yet implemented.
Start the server (example):
```bash
vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
--dtype bfloat16 \
--max-model-len 16384 \
--limit-mm-per-prompt '{"image": 3, "video": 0}' \
--mm-processor-kwargs max_pixels=1003520
```
Run the benchmark. It is recommended to use the flag `--ignore-eos` to simulate real responses. You can set the size of the output via the arg `--random-output-len`.
Example 1: a fixed number of items and a single image resolution, enforcing generation of approximately 40 tokens:
```bash
vllm bench serve \
--backend openai-chat \
--model Qwen/Qwen2.5-VL-3B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name random-mm \
--num-prompts 100 \
--max-concurrency 10 \
--random-prefix-len 25 \
--random-input-len 300 \
--random-output-len 40 \
--random-range-ratio 0.2 \
--random-mm-base-items-per-request 2 \
--random-mm-limit-mm-per-prompt '{"image": 3, "video": 0}' \
--random-mm-bucket-config '{(224, 224, 1): 1.0}' \
--request-rate inf \
--ignore-eos \
--seed 42
```
The number of items per request can be controlled by passing multiple image buckets:
```bash
--random-mm-base-items-per-request 2 \
--random-mm-num-mm-items-range-ratio 0.5 \
--random-mm-limit-mm-per-prompt '{"image": 4, "video": 0}' \
--random-mm-bucket-config '{(256, 256, 1): 0.7, (720, 1280, 1): 0.3}' \
```
Flags specific to `random-mm`:
- `--random-mm-base-items-per-request`: base number of multimodal items per request.
- `--random-mm-num-mm-items-range-ratio`: vary item count uniformly in the closed integer range [floor(n·(1-r)), ceil(n·(1+r))]. Set r=0 to keep it fixed; r=1 allows 0 items.
- `--random-mm-limit-mm-per-prompt`: per-modality hard caps, e.g. '{"image": 3, "video": 0}'.
- `--random-mm-bucket-config`: dict mapping (H, W, T) → probability. Entries with probability 0 are removed; remaining probabilities are renormalized to sum to 1. Use T=1 for images. Set any T>1 for videos (video sampling not yet supported).
Behavioral notes:
- If the requested base item count cannot be satisfied under the provided per-prompt limits, the tool raises an error rather than silently clamping.
How sampling works:
- Determine per-request item count k by sampling uniformly from the integer range defined by `--random-mm-base-items-per-request` and `--random-mm-num-mm-items-range-ratio`, then clamp k to at most the sum of per-modality limits.
- For each of the k items, sample a bucket (H, W, T) according to the normalized probabilities in `--random-mm-bucket-config`, while tracking how many items of each modality have been added.
- If a modality (e.g., image) reaches its limit from `--random-mm-limit-mm-per-prompt`, all buckets of that modality are excluded and the remaining bucket probabilities are renormalized before continuing.
This should be seen as an edge case; it can be avoided by setting `--random-mm-limit-mm-per-prompt` to a large number. Note that this might result in errors due to the engine config `--limit-mm-per-prompt`.
- The resulting request contains synthetic image data in `multi_modal_data` (OpenAI Chat format). When `random-mm` is used with the OpenAI Chat backend, prompts remain text and MM content is attached via `multi_modal_data`.
</details>
[](){ #performance-benchmarks }
@ -13,22 +791,22 @@ The performance benchmarks are used for development to confirm whether new chang
### Manually Trigger the benchmark
Use [vllm-ci-test-repo images](https://gallery.ecr.aws/q9t5s3a7/vllm-ci-test-repo) with the vLLM benchmark suite.
For a CPU environment, please use the image with the "-cpu" postfix.
Here is an example docker run command for CPU:
```bash
docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface -e HF_TOKEN='' --shm-size=16g --name vllm-cpu-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:1da94e673c257373280026f75ceb4effac80e892-cpu
```
Then, run the command below inside the docker instance:
```bash
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```
When run, the benchmark script generates results under the **benchmark/results** folder, along with benchmark_results.md and benchmark_results.json.
#### Runtime environment variables

View File

@ -40,6 +40,16 @@ python tools/generate_cmake_presets.py
The script will prompt you if it cannot automatically determine certain paths (e.g., `nvcc` or a specific Python executable for your vLLM development environment). Follow the on-screen prompts. If an existing `CMakeUserPresets.json` is found, the script will ask for confirmation before overwriting it.
**Force overwrite existing file:**
To automatically overwrite an existing `CMakeUserPresets.json` without prompting, use the `--force-overwrite` flag:
```console
python tools/generate_cmake_presets.py --force-overwrite
```
This is particularly useful in automated scripts or CI/CD environments where interactive prompts are not desired.
After running the script, a `CMakeUserPresets.json` file will be created in the root of your vLLM repository.
### Example `CMakeUserPresets.json`

View File

@ -3,7 +3,7 @@
!!! important
Many decoder language models can now be automatically loaded using the [Transformers backend][transformers-backend] without having to implement them in vLLM. See if `vllm serve <model>` works first!
vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features](../../features/README.md#compatibility-matrix) to optimize their performance.
The complexity of integrating a model into vLLM depends heavily on the model's architecture.
The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM.

View File

@ -0,0 +1,559 @@
# Logits Processors
!!! important
Some logits processor design changes are still in progress, and the API may
change in the near future. We hope to stabilize this part of the API soon.
This document describes how the vLLM engine interacts with logits processors, and the programming model which vLLM supports for implementing logits processors.
## Logits Processors Background
A logits processor adjusts the next-token probability distribution, usually with the intention of steering the model towards a desired type of behavior.
In vLLM, logits processors operate at batch granularity. During a given engine step, the logits processor consumes a `(num_requests) x (vocab_size)` tensor of raw logits output by the model. For all requests which enable the logits processor, the logits processor applies a transformation to the corresponding row of the logits tensor, while leaving other rows unmodified. The transformed logits tensor is then passed to softmax.
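As a toy illustration of this batch-granularity contract (standalone PyTorch, not vLLM code; the enabled rows and bias are made up for the example):
``` python
import torch

num_requests, vocab_size = 4, 8
logits = torch.randn(num_requests, vocab_size)  # raw logits from the model

# Suppose only requests 1 and 3 enable this hypothetical logits processor.
enabled_rows = torch.tensor([1, 3])

# Example transformation: bias token id 0 upward for the enabled rows only;
# all other rows pass through unmodified before softmax.
transformed = logits.clone()
transformed[enabled_rows, 0] += 5.0
probs = torch.softmax(transformed, dim=-1)
```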
## Logits Processors in the vLLM engine
The vLLM engine's persistent batch data structure maintains a list of loaded logits processors.
In order to operate on the entire batch at once, each logits processor may maintain metadata about the requests in the batch (i.e. each request's logits-processor-specific configuration settings). Therefore, logits processors are stateful.
In each engine step, the vLLM engine will (1) update each logits processor's internal state and (2) apply logits processors to the model output logits.
### Updating Logits Processor Internal State
At the beginning of each engine step, the persistent batch may add, discard and/or reorder requests in response to the scheduler output. After the persistent batch has reorganized, the vLLM engine invokes each logits processor's `update_state()` method. This is necessary to ensure that logits processors' internal states are reorganized to match the new persistent batch state at the beginning of the engine step.
The pseudocode below shows the process by which the vLLM persistent batch notifies each logits processor of changes in batch state:
??? code "Model Runner Updates Logits Processor States"
``` python
# gpu_model_runner.py
class GPUModelRunner(...):
...
def execute_model(self, scheduler_output, ...):
self._update_states(scheduler_output)
...
def _update_states(...):
...
# ...update persistent batch to reflect new/finished requests & reordering
# of requests within batch...
...
self.input_batch.refresh_metadata()
# gpu_input_batch.py
class InputBatch:
...
def refresh_metadata(self):
...
# Update each logits processor's state to reflect persistent batch state
batch_update = self.batch_update_builder.get_and_reset(self.num_reqs)
for logit_proc in self.logitsprocs.all:
logit_proc.update_state(batch_update)
...
# vllm/v1/sample/logits_processor/interface.py
@dataclass(frozen=True)
class BatchUpdate:
# Batch state-change data structure which is passed to logits processors'
# update_state() methods
batch_size: int
removed: Sequence[RemovedRequest]
added: Sequence[AddedRequest]
moved: Sequence[MovedRequest]
```
### Applying Logits Processors to the Model Output Logits
After updating persistent batch state, the vLLM model runner performs model inference to obtain logits. Then, the model runner invokes the sampler against the logits. In turn, part of the sampler's operation is to invoke the logits processors' `apply()` methods against the model output logits, yielding transformed logits (the `apply()` methods may modify the logits in-place or out-of-place, although in-place is more memory-efficient). This process is shown in the pseudocode below.
Note that the sampler will access the logits processors via `SamplingMetadata.logitsprocs`. When the vLLM engine constructs `SamplingMetadata` (not shown in the code below), the reference to the list of logits processors is passed from the persistent batch data structure to `SamplingMetadata`.
??? code "Apply logits processors to model output logits"
``` python
# gpu_model_runner.py
class GPUModelRunner(...):
...
def execute_model(self, scheduler_output, ...):
# (discussed in previous section)
self._update_states(scheduler_output)
...
# ...run model inference to obtain logits...
...
# Invoke sampler, which applies logits processors
sampler_output = self.sampler(logits=logits,
sampling_metadata=sampling_metadata)
...
# sampler.py
class Sampler(nn.Module):
...
def forward(self, logits, sampling_metadata):
...
# Apply non-argmax-invariant logits processors to model output logits
for processor in (sampling_metadata.logitsprocs.non_argmax_invariant):
logits = processor.apply(logits)
sampled = self.sample(logits, sampling_metadata)
...
# ...return sampler output data structure...
def sample(self, logits, sampling_metadata):
...
# ...exit early if all requests are greedy-sampling...
...
# Apply argmax-invariant logits processors
for processor in sampling_metadata.logitsprocs.argmax_invariant:
logits = processor.apply(logits)
...
# ...perform sampling and return sampling result...
```
At sampling time, the sampler checks whether all requests in the persistent batch employ greedy sampling. If that is the case, the sampler saves compute by skipping "argmax-invariant" logits processors. Here, "argmax" is shorthand for the token ID with the highest logit value in a given row of the logits tensor (i.e. the token which the model weighted the highest for a given request).
* An **argmax-invariant logits processor** is a logits processor (such as Min-P) which does not modify the argmax. For example, a logits processor which masks out the lowest-probability tokens will not change which token ID has the max logit. Greedy sampling always picks the highest-logit-value token ID, and so conceptually an argmax-invariant logits processor can be skipped for greedy sampling requests.
* A **non-argmax-invariant logits processor** is a logits processor which may modify the argmax. For example, a logits processor which masks all tokens except for EOS after a certain number of steps in order to force decoding to terminate might end up masking the max-logit-value token and therefore change the argmax. Conceptually, these logits processors cannot be skipped for greedy sampling requests.
The vLLM logits processor abstraction requires the engine to apply logits processors at batch granularity; therefore in practice the argmax-invariant logits processors can only be skipped when the entire batch uses greedy sampling.
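For intuition, here is a toy sketch (made-up numbers, not vLLM code) contrasting the two behaviors on a single logits row:
``` python
import torch

logits = torch.tensor([[2.0, 0.5, -1.0, 3.0]])  # one request; argmax is token id 3

# Argmax-invariant transform: mask out low-value tokens. The highest logit
# (token id 3) is untouched, so greedy sampling is unaffected.
masked = logits.masked_fill(logits < 0.0, float("-inf"))
assert masked.argmax(dim=-1).item() == logits.argmax(dim=-1).item()

# Non-argmax-invariant transform: mask everything except one token (say a
# hypothetical EOS id of 1). The argmax changes from 3 to 1, so greedy
# sampling would now pick a different token.
eos_only = torch.full_like(logits, float("-inf"))
eos_only[:, 1] = logits[:, 1]
assert eos_only.argmax(dim=-1).item() != logits.argmax(dim=-1).item()
```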
## Logits Processor Programming Model
The previous sections alluded to the interfaces which vLLM logits processors must support. This section introduces in full the programming model for implementing logits processors that are compatible with the vLLM engine, including the `LogitsProcessor` base class and its interface methods as well as the `BatchUpdate` data structure for representing persistent batch state changes, both of which are shown in the code below:
??? code "`LogitsProcessor` base class and `BatchUpdate` data structure"
``` python
from abc import ABC, abstractmethod
from collections.abc import Sequence
from dataclasses import dataclass
from enum import Enum, auto
from typing import TYPE_CHECKING, Optional
import torch
from vllm import SamplingParams
if TYPE_CHECKING:
from vllm.config import VllmConfig
class MoveDirectionality(Enum):
# One-way i1->i2 req move within batch
UNIDIRECTIONAL = auto()
# Two-way i1<->i2 req swap within batch
SWAP = auto()
# (index, params, prompt_tok_ids, output_tok_ids) tuples for new
# requests added to the batch.
AddedRequest = tuple[int, SamplingParams, list[int], list[int]]
# (index 1, index 2, directionality) tuples representing
# one-way moves or two-way swaps of requests in batch
MovedRequest = tuple[int, int, MoveDirectionality]
# Batch indices of any removed requests.
RemovedRequest = int
@dataclass(frozen=True)
class BatchUpdate:
"""Persistent batch state change info for logitsprocs"""
batch_size: int # Current num reqs in batch
# Metadata for requests added to, removed from, and moved
# within the persistent batch.
#
# Key assumption: the `output_tok_ids` list (which is an element of each
# tuple in `added`) is a reference to the request's running output tokens
# list; via this reference, the logits processors always see the latest
# list of generated output tokens
removed: Sequence[RemovedRequest]
moved: Sequence[MovedRequest]
added: Sequence[AddedRequest]
class LogitsProcessor(ABC):
@abstractmethod
def __init__(self, vllm_config: "VllmConfig", device: torch.device,
is_pin_memory: bool) -> None:
raise NotImplementedError
@abstractmethod
def apply(self, logits: torch.Tensor) -> torch.Tensor:
raise NotImplementedError
@abstractmethod
def is_argmax_invariant(self) -> bool:
"""True if logits processor has no impact on the
argmax computation in greedy sampling.
NOTE: may or may not have the same value for all
instances of a given LogitsProcessor subclass,
depending on subclass implementation.
"""
raise NotImplementedError
@abstractmethod
def update_state(
self,
batch_update: Optional["BatchUpdate"],
) -> None:
"""Called when there are new output tokens, prior
to each forward pass.
Args:
batch_update is non-None iff there have been
changes to the batch makeup.
"""
raise NotImplementedError
```
A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum) the following methods:
* `__init__(self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool)`
* `vllm_config`: engine configuration data structure
* `device`: hardware accelerator device info
* `is_pin_memory`: flag indicating whether pin memory is available to support logits processor implementation
* `apply(self, logits: torch.Tensor) -> torch.Tensor`:
* Consume a `(num_requests) x (vocab_size)` logits tensor (`logits`)
* Apply logits processor transformation at batch granularity
* Return a transformed `(num_requests) x (vocab_size)` logits tensor
* You can modify the input logits tensor in-place or out-of-place; in-place is more memory-efficient
* `is_argmax_invariant(self) -> bool`:
* Return `True` if the logits processor is argmax invariant (never changes what is the highest-logit-value token ID for a given request), `False` if the logits processor may modify argmax
* `is_argmax_invariant()` is evaluated once at startup; if `True`, vLLM will skip applying this logits processor in a given step when all requests use greedy sampling
* `update_state(self, batch_update: Optional["BatchUpdate"]) -> None`:
* Consume a `BatchUpdate` data structure representing persistent batch state changes at the beginning of the current engine step
* Use the `BatchUpdate` members to update logits processor internal state
* **Note:** batch update data structure may be `None`, signaling no change to the batch constituents. In this case, the LogitsProcessor might still want to update its state based on the updated `output_token_ids` lists that it could have retained when they were added.
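Putting the interface together, here is a minimal skeleton: a hypothetical do-nothing logits processor (not part of vLLM, shown only to illustrate the required methods):
``` python
import torch

from vllm.config import VllmConfig
from vllm.v1.sample.logits_processor import BatchUpdate, LogitsProcessor


class NoOpLogitsProcessor(LogitsProcessor):
    """Hypothetical example: satisfies the interface but leaves logits unchanged."""

    def __init__(self, vllm_config: VllmConfig, device: torch.device,
                 is_pin_memory: bool) -> None:
        # No per-request state is needed for a no-op processor.
        pass

    def is_argmax_invariant(self) -> bool:
        # Leaving logits unchanged can never change the argmax.
        return True

    def update_state(self, batch_update: BatchUpdate | None) -> None:
        # No internal state to keep in sync with the persistent batch.
        pass

    def apply(self, logits: torch.Tensor) -> torch.Tensor:
        # Return the (num_requests) x (vocab_size) tensor unmodified.
        return logits
```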
### `BatchUpdate` data structure
The `BatchUpdate` abstraction models the persistent batch as a list of requests, supporting the following operations to change batch state (note that the order in which the operations are mentioned below reflects the order in which they should be processed in `update_state()`):
* **Remove:** remove (without replacement) request at index `i`
* A Remove is represented in `BatchUpdate.removed` by an `int` (representing `i`)
* Effect of remove-at-index on batch:
``` text
Batch: [A,B,C]
Remove @ i: 1
=>
New Batch: [A,x,C] # Discard B and leave an empty slot
```
* **Add:** add (or replace existing request with) a new request at index `i`. If a request is replaced, its associated state should be discarded.
* An Add is represented in `BatchUpdate.added` as a tuple of
``` text
(index, new request SamplingParams, prompt token ids, output token ids)
```
* `prompt token ids` and `output token ids` are references to the request's prompt token ids and output token ids lists, respectively. Note that the output token ids list grows with each engine step, and this growth is visible to the logits processor because output token ids are passed by reference. **This is important for LogitsProcessors that take into account the tokens generated so far**.
* The implementation of the particular logits processor subclass determines whether or how the fields in the added request tuple are digested into an internal representation. For example, a logits processor that does not utilize prompt or output token ids may only need to utilize `index` and `SamplingParams` and discard the other tuple fields
* If index `i` currently holds a request, a replacement occurs:
``` text
Batch: [A,B,C]
New request to be added @ i: D @ 1
=>
New Batch: [A,D,C] # Add D, discard B
```
* If index `i` does not currently hold a request (because `i` is out of bounds of the current batch size):
``` text
Batch: [A,B,C]
New request to be added @ i: D @ 3
=>
New Batch: [A,B,C,D] # Add D, extending batch
```
* **Move:** move request at index `s` to index `d` OR swap requests at indices `s` and `d`
* A Move is represented in `BatchUpdate.moved` as a tuple of
``` text
(s, d, UNIDIRECTIONAL or SWAP)
```
* If the Move specifies `UNIDIRECTIONAL`:
* The request at index `s` is moved to index `d`; index `s` becomes an empty slot
``` text
Batch: [A,x,C,D]
Unidirectionally Move s -> d: 3 -> 1
=>
New Batch: [A,D,C,x] # Move D to 1, leaving empty slot at 3
```
* If another request already resided at index `d`, it is replaced and discarded
``` text
Batch: [A,B,C,D]
Unidirectionally Move s -> d: 3 -> 1
=>
New Batch: [A,D,C,x] # Move D to 1, discarding B and leaving empty slot at 3
```
* If the Move specifies `SWAP`, the requests at `s` and `d` exchange indices
``` text
Batch: [A,B,C,D]
Swap Move s <-> d: 3 <-> 1
=>
New Batch: [A,D,C,B] # Swap B and D
```
Additionally, the `BatchUpdate` data structure includes a representation (`batch_size`) of the size of the persistent batch at the beginning of the engine step.
### How the vLLM engine builds the `BatchUpdate` data structure
Logits processor `update_state()` implementations should assume the following model for how the model runner updates persistent batch state (expressed here in terms of the `BatchUpdate` abstraction):
1. Identify indices of requests which finished in the current engine step
2. Identify new requests introduced in the current step
3. Use Add operations to replace as many finished requests as possible with new requests, in order of increasing index of the replaced request, starting with the lowest index
4. Based on the relative number of new and finished requests:
1. If the numbers of new and finished requests are the same, proceed to next step
2. *If there are more new requests than finished requests:* apply Add operations to extend the batch with the remaining new requests which did not replace finished requests. Assign consecutive indices to these new requests, starting with `current_max_batch_index + 1`
3. *If there are fewer new requests than finished requests:*
* Apply Remove operations to finished requests which were not replaced with new requests. These removed request indices will necessarily be greater than the greatest index of the finished requests which were replaced in the previous step. The Removes may leave the batch in a non-contiguous state
* **"Condense" the batch to be contiguous:** starting with the lowest-index empty slot (which was caused by a Remove), apply a Unidirectional Move from the current highest non-empty slot in the batch to fill the empty slot. Proceed with additional Unidirectional Move operations in order of increasing empty slot destination index and decreasing non-empty slot source index until the batch is contiguous
* **Shrink the batch:** a side-effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots
5. Reorder the batch for improved efficiency. Depending on the attention backend implementation and the current characteristics of the batch, zero or more Swap Move operations may be applied to reorder the batch
Notes:
* A logits processor `update_state()` method must process batch update operations in the following order: removes, adds, moves
* The index argument for Add operations refers to the index *at the time the Add occurred*, i.e. before any Move operations
* Example: if a request is Added at index 5 and then swapped with index 3, the Add operation in `BatchUpdate.added` will be associated with index 5 not 3
* In other words Move operations can be assumed to be applied after Adds and Removes
* Move operations can be assumed to be applied in the order in which they appear in `BatchUpdate.moved`
* If there are no new/finished requests and there is no batch reordering, then the batch update for the logits processors will be `None`
#### Example: Batch Update with Fewer New Requests Than Finished Requests
The following example models an engine step where 1 new request is introduced and 2 finished requests are eliminated, additionally the attention backend performs a swap to optimize the batch ordering.
``` text
Batch state (beginning of engine step): [A,B,C,D]
Batch size: 4
New requests: E
Finished requests: A, C
Processing steps (using BatchUpdate abstraction):
1. Add E at index 0
[E,B,C,D] # Discard A
Batch size: 4
2. Remove at index 2
[E,B,x,D] # Discard C, empty slot at index 2
Batch size: 4
3. Condense batch with a Unidirectional Move 3 -> 2 operation and shrink batch
[E,B,D] x # Empty slot is now outside batch
Batch size: 3
4. Attention backend optimization: reorder batch with Swap 0 <-> 1
[B,E,D]
Batch size: 3
```
The resulting `BatchUpdate` data structure will look like
``` text
BatchUpdate instance
* added: [(0,E's SamplingParams,E's prompt tokens ref,E's output tokens ref)]
* removed: [2] # request C was removed without replacement
* moved: [(3,2,UNIDIRECTIONAL),(0,1,SWAP)]
```
#### Example: Batch Update with More New Requests Than Finished Requests
The following example models an engine step where 2 new requests are introduced and 1 finished request is eliminated, additionally the attention backend performs a swap to optimize the batch ordering.
``` text
Batch state (beginning of engine step): [A,B,C,D]
Batch size: 4
New requests: E,F
Finished requests: C
Processing steps (using BatchUpdate abstraction):
1. Add E at index 2
[A,B,E,D] # Discard C
Batch size: 4
2. Add F at index 4 (current max batch index + 1)
[A,B,E,D,F] # Extend batch by 1
Batch size: 5
3. Attention backend optimization: reorder batch with Swap 0 <-> 1
[B,A,E,D,F]
Batch size: 5
```
Note that batch condensation is skipped because there are no empty slots left behind by Remove operations.
The resulting `BatchUpdate` data structure will look like
``` text
BatchUpdate instance
* added: [(2,E's SamplingParams,E's prompt tokens ref,E's output tokens ref),(4,F's SamplingParams,F's prompt tokens ref,F's output tokens ref)]
* removed: [] # no requests were removed without replacement
* moved: [(0,1,SWAP)]
```
## How to Introduce a New Logits Processor to vLLM
### Best Practices for Writing Built-In Logits Processors
* Write efficient `apply()` and `update_state()` implementations in light of the fact that logits processors operate at batch granularity
* For example, you may be able to use efficient vectorized operations to implement `apply()` or update internal state vectors in `update_state()`
* However, if you think that a logits processor may be used infrequently, it may be appropriate to use a "sparse" representation of request state i.e. the class can represent request configuration using a dictionary which only stores metadata about requests that enable the logits processor
* It is up to the logits processor author to determine:
1. **The per-request attributes which configure the logits processor's behavior against that request.** For example, if you are writing a new built-in logits processor for vLLM, you may or may not need to add additional fields to `SamplingParams` and the vLLM REST API
2. **The conditions under which the logits processor is or is not enabled on a per-request basis.** Unless your intention is for the built-in logits processor to act on all requests all the time, you should write your logits processor in such a way that it is possible to disable the logits processor for a given request, i.e. by defaulting an argument to `None` or by passing in a specific do-nothing argument value i.e. `0.0`. Try to save compute and memory for requests which disable the logits processor
3. **The conditions under which the logits processor is short-circuited at the batch level.** Even if you have defined a way to disable the built-in logits processor at the request level, it may be difficult to translate this into compute savings i.e. if your `update_state()` and `apply()` implementations use efficient vectorized implementations that operate on the whole persistent batch in a single command. For example, you cannot skip an entire vectorized operation in `apply()` just because one request disabled the logits processor. To save compute in the edge-case where no running requests utilize the built-in logits processor, we recommend designing `apply()` to return the unmodified input tensor if all requests have the logits processor disabled. Similarly, consider whether steps can be skipped in `update_state()` if no requests enable the logits processor
* Additionally, an easy way to save compute in `update_state()` is to exit early when the batch_update is `None`
* Ensure that the logits processor `update_state` method discards information about finished requests (i.e. requests which are replaced by an Add or which are subject to a Remove)
* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However the argmax invariance may also be determined programmatically (i.e. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method
### Built-In Logits Processors
Built-in logits processors are always loaded when the vLLM engine starts. See the existing vLLM built-in logits processors in `vllm/v1/sample/logits_processor/builtin.py` for examples of how to write a new built-in vLLM logits processor. It makes sense to write a PR to introduce a new logits processor as a built-in if it is likely to be useful to a wide audience. vLLM currently employs the following built-in logits processors based on the programming model described above:
* Min-P
* Logit bias
* Min-tokens
Review these logits processor implementations for guidance on writing built-in logits processors.
Additionally, the following logits-processor-like functionalities are hard-coded into the sampler and do not yet utilize the programming model described above. Most of them will be refactored to use the aforementioned logits processor programming model.
* Allowed token IDs
* Bad words
* Repetition penalty
* Frequency penalty
* Presence penalty
* Temperature
* Top-K
* Top-P
### Custom Logits Processors
vLLM can be augmented with [user-provided custom logits processors](../features/custom_logitsprocs.md).

View File

@ -0,0 +1,46 @@
# Custom Arguments
You can use vLLM *custom arguments* to pass in arguments which are not part of the vLLM `SamplingParams` and REST API specifications. Adding or removing a vLLM custom argument does not require recompiling vLLM, since the custom arguments are passed in as a dictionary.
Custom arguments can be useful if, for example, you want to use a [custom logits processor](./custom_logitsprocs.md) without modifying the vLLM source code.
## Offline Custom Arguments
Custom arguments passed to `SamplingParams.extra_args` as a `dict` will be visible to any code which has access to `SamplingParams`:
``` python
SamplingParams(extra_args={"your_custom_arg_name": 67})
```
This allows arguments which are not already part of `SamplingParams` to be passed into `LLM` as part of a request.
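For example, a minimal offline sketch (the model name and custom argument value are illustrative; the argument only has an effect if something that reads `SamplingParams.extra_args`, such as a custom logits processor, consumes it):
``` python
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")
params = SamplingParams(max_tokens=32,
                        extra_args={"your_custom_arg_name": 67})

# extra_args travel with the request, so any code that can see SamplingParams
# (e.g. a custom logits processor) can read the value back.
outputs = llm.generate(["Hello, my name is"], params)
print(outputs[0].outputs[0].text)
```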
## Online Custom Arguments
The vLLM REST API allows custom arguments to be passed to the vLLM server via `vllm_xargs`. The example below integrates custom arguments into a vLLM REST API request:
``` bash
curl http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen/Qwen2.5-1.5B-Instruct",
...
"vllm_xargs": {"your_custom_arg": 67}
}'
```
Furthermore, OpenAI SDK users can access `vllm_xargs` via the `extra_body` argument:
``` python
batch = await client.completions.create(
model="Qwen/Qwen2.5-1.5B-Instruct",
...,
extra_body={
"vllm_xargs": {
"your_custom_arg": 67
}
}
)
```
!!! note
`vllm_xargs` is assigned to `SamplingParams.extra_args` under the hood, so code which uses `SamplingParams.extra_args` is compatible with both offline and online scenarios.

View File

@ -0,0 +1,445 @@
# Custom Logits Processors
!!! important
Some logits processor design changes are still in progress, and the API may
change in the near future. We hope to stabilize this part of the API soon.
A "custom" logits processor is written by a user of vLLM and is loaded into vLLM at initialization without needing to modify or recompile the vLLM source code. It is the opposite of a built-in logits processor.
This document shows how to write, load and use a custom logits processor.
## Logits Processors Background
A logits processor adjusts the next-token probability distribution, usually with the intention of steering the model towards a desired type of behavior.
In vLLM, logits processors operate at batch granularity. During a given engine step, the logits processor consumes a `(num_requests) x (vocab_size)` tensor of raw logits output by the model. For all requests which enable the logits processor, the logits processor applies a transformation to the corresponding row of the logits tensor, while leaving other rows unmodified. The transformed logits tensor is then passed to softmax.
## Creating a Custom Logits Processor
Custom logits processors must subclass `vllm.v1.sample.logits_processor.LogitsProcessor` and define (at minimum) the following methods:
* `__init__(self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool)`
* `vllm_config`: engine configuration data structure
* `device`: hardware accelerator device info
* `is_pin_memory`: flag indicating whether pin memory is available to support logits processor implementation
* `apply(self, logits: torch.Tensor) -> torch.Tensor`:
* Consume a `(num_requests) x (vocab_size)` logits tensor (`logits`)
* Apply logits processor transformation at batch granularity
* Return a transformed `(num_requests) x (vocab_size)` logits tensor
* You can modify the input logits tensor in-place or out-of-place; in-place is more memory-efficient
* `is_argmax_invariant(self) -> bool`:
* Return `True` if the logits processor is argmax invariant (never changes what is the highest-logit-value token ID for a given request), `False` if the logits processor may modify argmax
* `is_argmax_invariant()` is evaluated once at startup; if `True`, vLLM will skip applying this logits processor in a given step when all requests use greedy sampling
* `update_state(self, batch_update: Optional["BatchUpdate"]) -> None`:
* Consume a `BatchUpdate` data structure representing persistent batch state changes at the beginning of the current engine step
* Use the `BatchUpdate` members to update logits processor internal state
* **Note:** batch update data structure may be `None`, signaling no change to the batch constituents. In this case, the LogitsProcessor might still want to update its state based on the updated `output_token_ids` lists that it could have retained when they were added.
### How the vLLM engine builds the `BatchUpdate` data structure
!!! important
Some logits processors design changes are still in progress. We expect
that in the future you will not need to account for batch state changes
when implementing a logits processor, and the information in this section
will become irrelevant.
Logits processor `update_state()` implementations should assume the following model for how the model runner updates persistent batch state (expressed here in terms of the `BatchUpdate` abstraction):
1. Identify indices of requests which finished in the current engine step
2. Identify new requests introduced in the current step
3. Use Add operations to replace as many finished requests as possible with new requests, in order of increasing index of the replaced request, starting with the lowest index
4. Based on the relative number of new and finished requests:
1. If the numbers of new and finished requests are the same, proceed to next step
2. *If there are more new requests than finished requests:* apply Add operations to extend the batch with the remaining new requests which did not replace finished requests. Assign consecutive indices to these new requests, starting with `current_max_batch_index + 1`
3. *If there are fewer new requests than finished requests:*
* Apply Remove operations to finished requests which were not replaced with new requests. These removed request indices will necessarily be greater than the greatest index of the finished requests which were replaced in the previous step. The Removes may leave the batch in a non-contiguous state
* **"Condense" the batch to be contiguous:** starting with the lowest-index empty slot (which was caused by a Remove), apply a Unidirectional Move from the current highest non-empty slot in the batch to fill the empty slot. Proceed with additional Unidirectional Move operations in order of increasing empty slot destination index and decreasing non-empty slot source index until the batch is contiguous
* **Shrink the batch:** a side-effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots
5. Reorder the batch for improved efficiency. Depending on the attention backend implementation and the current characteristics of the batch, zero or more Swap Move operations may be applied to reorder the batch
Notes:
* A logits processor `update_state()` method must process batch update operations in the following order: removes, adds, moves
* The index argument for Add operations refers to the index *at the time the Add occurred*, i.e. before any Move operations
* Example: if a request is Added at index 5 and then swapped with index 3, the Add operation in `BatchUpdate.added` will be associated with index 5 not 3
* In other words, Move operations can be assumed to be applied after Adds and Removes
* Move operations can be assumed to be applied in the order in which they appear in `BatchUpdate.moved`
* If there are no new/finished requests and there is no batch reordering, then the batch update for the logits processors will be `None`
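As a concrete illustration, suppose the persistent batch holds five requests (indices 0-4), the requests at indices 1 and 2 finish in the current step, and one new request arrives. Under the model above, a logits processor would observe a batch update along the following lines (the field values are illustrative, not a literal dump of the `BatchUpdate` data structure):

``` python
# Before the step: slots 0-4 are occupied; the requests at indices 1 and 2 finish.
#
# 1. The new request replaces the finished request at the lowest index:
#        added      = [(1, <SamplingParams>, <prompt_tok_ids>, <output_tok_ids>)]
# 2. The remaining finished request is removed:
#        removed    = [2]
# 3. Condense: the highest occupied slot (4) moves into the empty slot (2):
#        moved      = [(4, 2, MoveDirectionality.UNIDIRECTIONAL)]
# 4. Shrink: four slots remain occupied:
#        batch_size = 4
#
# update_state() should process the remove of index 2 and the add at index 1
# before applying the move 4 -> 2.
```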
### Passing Custom Arguments to a Custom Logits Processor
Unlike built-in logits processors, custom logits processors may require configuration arguments that are not hard-coded into `SamplingParams` or the vLLM server REST API. To solve this problem, custom logits processors may leverage vLLM [custom arguments](./custom_arguments.md) support to receive configuration settings from the user (although you are also free to design a custom logits processor that uses the pre-existing fields in `SamplingParams`).
### Example Custom Logits Processor Implementation
The contrived example below implements a custom logits processor which consumes a `(num_requests) x (vocab_size)` logits tensor and masks out all tokens except for one (`target_token`) with `float(-inf)`. The logits processor is disabled for any request that does not specify `target_token`. To determine whether the logits processor is enabled and which token to leave unmasked, the logits processor checks `SamplingParams.extra_args` for a `target_token` custom argument associated with each request:
??? code "Example custom logits processor definition"
``` python
from typing import Optional
import torch
from vllm.config import VllmConfig
from vllm.sampling_params import SamplingParams
from vllm.v1.sample.logits_processor import (BatchUpdate,
LogitsProcessor,
MoveDirectionality)
class DummyLogitsProcessor(LogitsProcessor):
"""Fake logit processor to support unit testing and examples"""
def __init__(self, vllm_config: "VllmConfig", device: torch.device,
is_pin_memory: bool):
self.req_info: dict[int, int] = {}
def is_argmax_invariant(self) -> bool:
"""Never impacts greedy sampling"""
return False
def update_state(self, batch_update: Optional[BatchUpdate]):
if not batch_update:
return
# Process added requests.
for index, params, _, _ in batch_update.added:
assert params is not None
if params.extra_args and (target_token :=
params.extra_args.get("target_token")):
self.req_info[index] = target_token
else:
self.req_info.pop(index, None)
if self.req_info:
# Process removed requests.
for index in batch_update.removed:
self.req_info.pop(index, None)
# Process moved requests, unidirectional move (a->b) and swap
# (a<->b)
for adx, bdx, direct in batch_update.moved:
a_val = self.req_info.pop(adx, None)
b_val = self.req_info.pop(bdx, None)
if a_val is not None:
self.req_info[bdx] = a_val
if direct == MoveDirectionality.SWAP and b_val is not None:
self.req_info[adx] = b_val
def apply(self, logits: torch.Tensor) -> torch.Tensor:
if not self.req_info:
return logits
# Save target values before modification
cols = torch.tensor(
list(self.req_info.values()), dtype=torch.long, device=logits.device
)
rows = torch.tensor(
list(self.req_info.keys()), dtype=torch.long, device=logits.device
)
values_to_keep = logits[rows, cols].clone()
# Mask all but target tokens
logits[rows] = float('-inf')
logits[rows, cols] = values_to_keep
return logits
```
In the rest of this document, we will use `DummyLogitsProcessor` as an example of a custom logits processor.
The `DummyLogitsProcessor.update_state()` implementation maintains a "sparse" representation of the batched requests in the `self.req_info` dictionary: only those requests which specify a `target_token` value have a key in the dictionary. `update_state()` adjusts the stored request indices and `target_token` values (keys and values respectively in `self.req_info`) in response to Add, Remove and Move operations against the persistent batch.
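For instance, using the scenario sketched earlier (a five-slot batch in which the requests at indices 1 and 2 finish and one new request specifying `target_token=67` arrives), the sparse state would evolve as follows; the initial dictionary contents are made up for illustration, and the three steps follow the removes, adds, moves order described above:

``` python
# Index -> target_token for the requests (of five) that set a target_token.
req_info = {0: 11, 2: 23, 4: 42}

# Removes: the finished request at index 2 is dropped.
req_info.pop(2, None)

# Adds: the new request replaces the finished request at index 1.
req_info[1] = 67

# Moves: the unidirectional move 4 -> 2 condenses the batch.
req_info[2] = req_info.pop(4)

assert req_info == {0: 11, 1: 67, 2: 42}
```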
### Wrapping an Existing Request-Level Logits Processor
Although the vLLM engine applies logits processors at batch granularity, some users may want to use vLLM with a "request-level" logits processor implementation - an implementation which operates on individual requests. This will be especially true if your logits processor was developed for vLLM version 0, which required it to be a `Callable` (as described [here](https://docs.vllm.ai/en/v0.10.1.1/api/vllm/logits_process.html)) conforming to the following type annotation:
``` python
RequestLogitsProcessor = Union[
# (output token ids, logits tensor) -> logits tensor
Callable[[list[int], Tensor], Tensor],
# (prompt token ids, output token ids, logits tensor) -> logits tensor
Callable[[list[int], list[int], Tensor], Tensor],
]
```
While request-level logits processors are explicitly *not* supported in the vLLM engine, vLLM *does* provide a convenient way to wrap an existing `Callable` request-level logits processor and create a batch-level logits processor that is compatible with vLLM. The `Callable` must conform to the type annotation above; if your request-level logits processor has a different interface, you may need to modify it or add a wrapper layer that complies with the interface specification above in order to wrap it.
You can wrap the request-level logits processor by subclassing `AdapterLogitsProcessor` as shown in the example below (here, `DummyPerReqLogitsProcessor` is a stand-in for the request-level logits processor that needs to be wrapped). Override `AdapterLogitsProcessor.is_argmax_invariant(self)` to accurately reflect whether your request-level logits processor may change which token has the highest logit value. Override `AdapterLogitsProcessor.new_req_logits_processor(self, params)` to create a new request-level logits processor instance from a `SamplingParams` instance:
??? code "Example of Wrapping a Request-Level Logits Processor"
``` python
...
from vllm.v1.sample.logits_processor import (
AdapterLogitsProcessor, # Wrapper base-class
RequestLogitsProcessor, # Request-level logitsproc type annotation
)
...
# Stand-in for your request-level logits processor:
class DummyPerReqLogitsProcessor:
"""The request-level logits processor masks out all logits except the
token id identified by `target_token`"""
def __init__(self, target_token: int) -> None:
"""Specify `target_token`"""
self.target_token = target_token
def __call__(
self,
output_ids: list[int],
logits: torch.Tensor,
) -> torch.Tensor:
val_to_keep = logits[self.target_token].item()
logits[:] = float("-inf")
logits[self.target_token] = val_to_keep
return logits
...
# Example of wrapping the request-level logits processor:
class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
"""Example of wrapping a fake request-level logit processor to create a
batch-level logits processor"""
def is_argmax_invariant(self) -> bool:
return False
def new_req_logits_processor(
self,
params: SamplingParams,
) -> Optional[RequestLogitsProcessor]:
"""This method returns a new request-level logits processor, customized
to the `target_token` value associated with a particular request.
Returns None if the logits processor should not be applied to the
particular request. To use the logits processor the request must have
a "target_token" custom argument with an integer value.
Args:
params: per-request sampling params
Returns:
`Callable` request logits processor, or None
"""
target_token: Optional[Any] = params.extra_args and params.extra_args.get(
"target_token"
)
if target_token is None:
return None
if not isinstance(target_token, int):
logger.warning(
"target_token value %s is not int; not applying logits"
" processor to request.",
target_token,
)
return None
return DummyPerReqLogitsProcessor(target_token)
```
!!! note
Your `new_req_logits_processor()` override can return `None` to signal that the wrapped logits processor should not be applied to the request in question.
Once you have created a custom subclass (like `WrappedPerReqLogitsProcessor`) which wraps your request-level logits processor, you can pass the custom subclass to vLLM via any of the methods described in the following section.
## Ways to Load Your Custom Logits Processor in vLLM
Logits processors are loaded at initialization. Critically, the set of loaded logits processors cannot be modified after the vLLM engine finishes loading, and new logits processors cannot be loaded on demand for individual requests.
This section details different ways of making your logits processor visible to vLLM and triggering vLLM to load your logits processor.
### Method 1: Pass the Custom Logits Processor Fully-Qualified Class Name (FQCN) to vLLM at Initialization Time
This method is supported in both offline and online vLLM usage scenarios. The custom logits processor's FQCN (in the form of `dotted.path.to.module:ClassName`) can be passed as an argument to the `LLM` and `AsyncLLM` Python constructors, or as a CLI argument to `vllm serve` with the following syntax
``` bash
vllm serve ... --logits_processors <logits processor 1> <logits processor 2> ...
```
The only requirements on the FQCN are
1. Python's `importlib.import_module()` must be able to resolve the dotted path portion of the FQCN and load it as a module
2. The class-name portion of the FQCN must be possible to import from the loaded module
3. The object pointed to by the FQCN must be a subclass of `LogitsProcessor`
See examples below:
??? code "Passing custom logits processor FQCN to `LLM` in Python"
``` python
# Pass in FQCN
llm = LLM(
model="facebook/opt-125m",
logits_processors=["your.module.path:DummyLogitsProcessor"],
)
```
??? code "Passing custom logits processor FQCN to `AsyncLLM` in Python"
``` python
# Pass in FQCN
engine_args = AsyncEngineArgs(model="facebook/opt-125m",
logits_processors=["your.module.path:DummyLogitsProcessor"])
async_llm = AsyncLLM.from_engine_args(engine_args)
```
??? code "Passing custom logits processor FQCN to vLLM server via CLI"
```bash
vllm serve facebook/opt-125m --logits_processors your.module.path:DummyLogitsProcessor
```
### Method 2: Automatically Detect Custom Logits Processors Installed in Your Python Environment As Entry Points
[`setuptools`](https://setuptools.pypa.io/en/latest/userguide/entry_point.html) can enable installed packages to make themselves available as plugins to other Python programs, via pieces of metadata known as "entry points".
During initialization, vLLM automatically scans the `vllm.logits_processors` entry point group and loads any installed logits processors which it finds.
Suppose that you have developed a Python package that holds your custom logits processors. You can expose each logits processor to vLLM by adding a unique entry point for it to that package. The example below shows how to add an entry point to your project's `pyproject.toml` file:
??? code "Exposing a custom logits processor as a Python entrypoint"
``` toml
[project.entry-points."vllm.logits_processors"]
dummy_logits_processor = "your.module.path:DummyLogitsProcessor"
```
Once your package is installed, your custom logits processor will be loaded automatically whenever vLLM is initialized. You do *not* need to pass the custom logits processor to the `LLM` or `AsyncLLM` constructors or to the vLLM server explicitly at initialization time if your logits processor is exposed as an entry point.
!!! note
vLLM will *always* load *all* logits processors which are exposed via entrypoints under the `vllm.logits_processors` grouping.
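If vLLM does not seem to pick up your processor, a quick way to confirm that your installed package actually exposes the entry point is to inspect the group with the standard library (the `group=` selection shown below requires Python 3.10+; the entry point name and target are placeholders):

``` python
from importlib.metadata import entry_points

# List everything registered under the group that vLLM scans at startup.
for ep in entry_points(group="vllm.logits_processors"):
    # e.g. "dummy_logits_processor -> your.module.path:DummyLogitsProcessor"
    print(f"{ep.name} -> {ep.value}")
```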
### Method 3 (Offline-only): Pass a Python Class Object to the vLLM Constructor
You can pass one or more custom logits processor class objects to the `LLM` and `AsyncLLM` constructors. This option is very flexible, as the logits processor classes may either be (1) defined locally within the same Python source file where `LLM` or `AsyncLLM` is instantiated, or (2) imported from a Python package.
??? code "Passing custom logits processor class object to `LLM` or `AsyncLLM` in Python"
``` python
# Import custom logits processor
from some.module import DummyLogitsProcessor
# ...or...
# Define custom logits processor locally
from vllm.v1.sample.logits_processor import LogitsProcessor
class DummyLogitsProcessor(LogitsProcessor):
# See DummyLogitsProcessor implementation above
...
# Pass class object to LLM constructor
llm = LLM(
model="facebook/opt-125m",
logits_processors=[DummyLogitsProcessor],
)
# Pass class object to AsyncLLM constructor
engine_args = AsyncEngineArgs(model="facebook/opt-125m",
logits_processors=[DummyLogitsProcessor])
async_llm = AsyncLLM.from_engine_args(engine_args)
```
## Invoking a Custom Logits Processor Against a Request
The design of the custom logits processor determines whether and how the logits processor is enabled or disabled for a given request, and what arguments must be provided to configure its behavior.
The examples below show how a user would pass a custom argument (`target_token`) to `DummyLogitsProcessor` in order to (1) enable the logits processor for that particular request and (2) control the logits processor's behavior.
??? code "vLLM REST API: configure custom logits processor for a request"
``` bash
curl http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen/Qwen2.5-1.5B-Instruct",
...
"vllm_xargs": {"target_token": 67}
}'
```
??? code "OpenAI SDK: configure custom logits processor for a request"
``` python
batch = await client.completions.create(
model="Qwen/Qwen2.5-1.5B-Instruct",
...,
extra_body={
"vllm_xargs": {
"target_token": 67
}
}
)
```
??? code "Offline: configure custom logits processor for an `LLM` request"
``` python
outputs_logitproc = llm.generate("your prompt",
SamplingParams(...,
extra_args={"target_token": 67}))
```
??? code "Offline: configure custom logits processor for an `AsyncLLM` request"
``` python
async for out in engine.generate(request_id="your request id",
prompt="your prompt",
sampling_params=SamplingParams(...,
extra_args={"target_token": 67})):
# Process async request outputs
...
```
## Best Practices for Writing Custom Logits Processors
Once vLLM loads a logits processor during initialization, vLLM will invoke `update_state()` and `apply()` against that logits processor in every engine step. Both methods operate on all requests currently residing in the vLLM persistent batch, so it is important to implement them efficiently.
* Write efficient `apply()` and `update_state()` implementations in light of the fact that logits processors operate at batch granularity
* For example, you may be able to use efficient vectorized operations to implement `apply()` or update internal state vectors in `update_state()`
* However, if you think that a logits processor may be used infrequently, it may be appropriate to use a "sparse" representation of request state i.e. the class can represent request configuration using a dictionary which only stores metadata about requests that enable the logits processor
* **Note:** wrapped request-level logits processors do not need to implement `apply()` and `update_state()`; the default `AdapterLogitsProcessor.update_state()` implementation maintains a sparse representation of request state, wherein requests for which `new_req_logits_processor()` returns `None` are not represented in the base-class state dictionary. The default implementation of `AdapterLogitsProcessor.apply()` applies the request-level logits processor to each row of input logits sequentially and assembles the output logits tensor. If the performance of this `AdapterLogitsProcessor` default implementation is insufficient, then avoid wrapping your request-level logits processor and instead re-implement it as a `LogitsProcessor` subclass with optimized `apply()` and `update_state()` implementations that operate at batch granularity
* It is up to the logits processor author to determine:
1. **The per-request attributes which configure the logits processor's behavior against that request.** Your custom logits processor's `update_state()` override determines how `SamplingParams` fields are mapped into logits processor state
* **Note:** for wrapped request-level logits processors, `new_req_logits_processor()` determines how `SamplingParams` fields are used to initialize a request-level logits processor instance.
2. **The conditions under which the logits processor is or is not enabled on a per-request basis.** Unless your intention is for the custom logits processor to act on all requests all the time, you should write your logits processor in such a way that it can be disabled for a given request, e.g. by defaulting an argument to `None` or by treating a specific value such as `0.0` as "do nothing". Try to save compute and memory for requests which disable the logits processor
* **Note:** for wrapped per-request logits processors, the default `AdapterLogitsProcessor.update_state()` implementation ensures that the request-level logits processor is disabled when `new_req_logits_processor()` returns `None` for that request
3. **The conditions under which the logits processor is short-circuited at the batch level.** Even if you have defined a way to disable the custom logits processor at the request level, it may be difficult to translate this into compute savings, e.g. if your `update_state()` and `apply()` implementations use efficient vectorized operations that act on the whole persistent batch in a single command. For example, you cannot skip an entire vectorized operation in `apply()` just because one request disabled the logits processor. To save compute in the edge case where no running requests utilize the custom logits processor, we recommend designing `apply()` to return the unmodified input tensor if all requests have the logits processor disabled. Similarly, consider whether steps can be skipped in `update_state()` if no requests enable the logits processor
* Additionally, an easy way to save compute in `update_state()` is to exit early when the `batch_update` is `None`
* **Note:** for wrapped per-request logits processors, the `AdapterLogitsProcessor` base-class implements the above optimizations by default
* Ensure that the logits processor `update_state` method discards information about finished requests (i.e. requests which are replaced by an Add or which are subject to a Remove)
* **Note:** for wrapped per-request logits processors, the `AdapterLogitsProcessor` base-class handles this by default
* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However, argmax invariance may also be determined programmatically (e.g. if your logits processor is user-customizable in some way that affects whether it is argmax invariant); for this reason, `is_argmax_invariant()` is an instance method rather than a class method. One illustrative way to do this is sketched below.
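The sketch is hypothetical; the scaling factor stands in for any user-configurable setting, and the point is only that invariance is derived from instance state (a positive rescaling preserves the argmax, while a negative one would invert the ordering):

``` python
from typing import Optional

import torch

from vllm.config import VllmConfig
from vllm.v1.sample.logits_processor import BatchUpdate, LogitsProcessor


class ScaledLogitsProcessor(LogitsProcessor):
    """Sketch: rescales logits by a factor taken from configuration."""

    def __init__(self, vllm_config: "VllmConfig", device: torch.device,
                 is_pin_memory: bool):
        # Hypothetical knob; imagine it being read from user configuration.
        self.scale = 2.0

    def is_argmax_invariant(self) -> bool:
        # Determined from instance state rather than hard-coded.
        return self.scale > 0

    def update_state(self, batch_update: Optional[BatchUpdate]) -> None:
        pass

    def apply(self, logits: torch.Tensor) -> torch.Tensor:
        return logits / self.scale
```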

View File

@ -10,12 +10,12 @@ vLLM currently supports the following reasoning models:
| Model Series | Parser Name | Structured Output Support | Tool Calling |
|--------------|-------------|------------------|-------------|
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` | ❌ |
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` | ✅ |
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ |
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ |
| [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ |
| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `guided_json`, `guided_regex` | ✅ |
| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `guided_json`, `guided_regex` | ✅ |
| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `guided_json`, `guided_regex` | ✅ |
| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `json`, `regex` | ✅ |
| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `json`, `regex` | ✅ |
| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ |
!!! note
IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.

View File

@ -12,23 +12,23 @@ You can generate structured outputs using the OpenAI's [Completions](https://pla
The following parameters are supported, which must be added as extra parameters:
- `guided_choice`: the output will be exactly one of the choices.
- `guided_regex`: the output will follow the regex pattern.
- `guided_json`: the output will follow the JSON schema.
- `guided_grammar`: the output will follow the context free grammar.
- `choice`: the output will be exactly one of the choices.
- `regex`: the output will follow the regex pattern.
- `json`: the output will follow the JSON schema.
- `grammar`: the output will follow the context free grammar.
- `structural_tag`: Follow a JSON schema within a set of specified tags within the generated text.
You can see the complete list of supported parameters on the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) page.
Structured outputs are supported by default in the OpenAI-Compatible Server. You
may choose to specify the backend to use by setting the
`--guided-decoding-backend` flag to `vllm serve`. The default backend is `auto`,
`--structured-outputs-config.backend` flag to `vllm serve`. The default backend is `auto`,
which will try to choose an appropriate backend based on the details of the
request. You may also choose a specific backend, along with
some options. A full set of options is available in the `vllm serve --help`
text.
Now let´s see an example for each of the cases, starting with the `guided_choice`, as it´s the easiest one:
Now let´s see an example for each of the cases, starting with the `choice`, as it´s the easiest one:
??? code
@ -45,12 +45,12 @@ Now let´s see an example for each of the cases, starting with the `guided_choic
messages=[
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
],
extra_body={"guided_choice": ["positive", "negative"]},
extra_body={"structured_outputs": {"choice": ["positive", "negative"]}},
)
print(completion.choices[0].message.content)
```
The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template:
The next example shows how to use the `regex`. The idea is to generate an email address, given a simple regex template:
??? code
@ -63,18 +63,18 @@ The next example shows how to use the `guided_regex`. The idea is to generate an
"content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n",
}
],
extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
extra_body={"structured_outputs": {"regex": r"\w+@\w+\.com\n"}, "stop": ["\n"]},
)
print(completion.choices[0].message.content)
```
One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats.
For this we can use the `guided_json` parameter in two different ways:
For this we can use the `json` parameter in two different ways:
- Using directly a [JSON Schema](https://json-schema.org/)
- Defining a [Pydantic model](https://docs.pydantic.dev/latest/) and then extracting the JSON Schema from it (which is normally an easier option).
The next example shows how to use the `guided_json` parameter with a Pydantic model:
The next example shows how to use the `response_format` parameter with a Pydantic model:
??? code
@ -119,7 +119,7 @@ The next example shows how to use the `guided_json` parameter with a Pydantic mo
JSON schema and how the fields should be populated. This can improve the
results notably in most cases.
Finally we have the `guided_grammar` option, which is probably the most
Finally we have the `grammar` option, which is probably the most
difficult to use, but it´s really powerful. It allows us to define complete
languages like SQL queries. It works by using a context free EBNF grammar.
As an example, we can use to define a specific format of simplified SQL queries:
@ -149,7 +149,7 @@ As an example, we can use to define a specific format of simplified SQL queries:
"content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
}
],
extra_body={"guided_grammar": simplified_sql_grammar},
extra_body={"structured_outputs": {"grammar": simplified_sql_grammar}},
)
print(completion.choices[0].message.content)
```
@ -292,8 +292,8 @@ An example of using `structural_tag` can be found here: <gh-file:examples/online
## Offline Inference
Offline inference allows for the same types of structured outputs.
To use it, we´ll need to configure the guided decoding using the class `GuidedDecodingParams` inside `SamplingParams`.
The main available options inside `GuidedDecodingParams` are:
To use it, we´ll need to configure the structured outputs using the class `StructuredOutputsParams` inside `SamplingParams`.
The main available options inside `StructuredOutputsParams` are:
- `json`
- `regex`
@ -309,12 +309,12 @@ shown below:
```python
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams
from vllm.sampling_params import StructuredOutputsParams
llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct")
guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
structured_outputs_params = StructuredOutputsParams(choice=["Positive", "Negative"])
sampling_params = SamplingParams(structured_outputs=structured_outputs_params)
outputs = llm.generate(
prompts="Classify this sentiment: vLLM is wonderful!",
sampling_params=sampling_params,

View File

@ -71,7 +71,7 @@ This example demonstrates:
* Making a request with `tool_choice="auto"`
* Handling the structured response and executing the corresponding function
You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the guided decoding backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests.
You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the structured outputs backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests.
Remember that it's the caller's responsibility to:
@ -83,19 +83,18 @@ For more advanced usage, including parallel tool calls and different model-speci
## Named Function Calling
vLLM supports named function calling in the chat completion API by default. It does so using Outlines through guided decoding, so this is
enabled by default and will work with any supported model. You are guaranteed a validly-parsable function call - not a
vLLM supports named function calling in the chat completion API by default. This should work with most structured outputs backends supported by vLLM. You are guaranteed a validly-parsable function call - not a
high-quality one.
vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter.
For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the guided decoding backend.
vLLM will use structured outputs to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter.
For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the structured outputs backend.
To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and
specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request.
## Required Function Calling
vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The guided decoding features for `tool_choice='required'` (such as JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends are on the [roadmap](../usage/v1_guide.md#features) for the V1 engine.
vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses structured outputs, so this is enabled by default and will work with any supported model. However, support for alternative decoding backends are on the [roadmap](../usage/v1_guide.md#features) for the V1 engine.
When tool_choice='required' is set, the model is guaranteed to generate one or more tool calls based on the specified tool list in the `tools` parameter. The number of tool calls depends on the user's query. The output format strictly follows the schema defined in the `tools` parameter.

View File

@ -52,6 +52,24 @@ uv pip install -e .
1 error generated.
```
---
If the build fails with C++11/C++17 compatibility errors like the following, the issue is that the build system is defaulting to an older C++ standard:
```text
[...] error: 'constexpr' is not a type
[...] error: expected ';' before 'constexpr'
[...] error: 'constexpr' does not name a type
```
**Solution**: Your compiler might be using an older C++ standard. Edit `cmake/cpu_extension.cmake` and add `set(CMAKE_CXX_STANDARD 17)` before `set(CMAKE_CXX_STANDARD_REQUIRED ON)`.
To check your compiler's C++ standard support:
```bash
clang++ -std=c++17 -pedantic -dM -E -x c++ /dev/null | grep __cplusplus
```
On Apple Clang 16 you should see: `#define __cplusplus 201703L`
# --8<-- [end:build-wheel-from-source]
# --8<-- [start:pre-built-images]

View File

@ -1,4 +1,4 @@
It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands:
It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following commands:
```bash
uv venv --python 3.12 --seed

View File

@ -554,6 +554,17 @@ If your model is not in the above list, we will try to automatically convert the
For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
#### Token Classification
These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode) API.
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|--------------|--------|-------------------|-----------------------------|-----------------------------------------|---------------------|
| `BertForTokenClassification` | bert-based | `boltuix/NeuroBERT-NER` (see note), etc. | | | ✅︎ |
!!! note
For Named Entity Recognition (NER) usage, please refer to <gh-file:examples/offline_inference/pooling/ner.py> and <gh-file:examples/online_serving/pooling/ner.py>.
[](){ #supported-mm-models }
## List of Multimodal Language Models
@ -661,6 +672,8 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen2.5-Omni-3B`, `Qwen/Qwen2.5-Omni-7B` | ✅︎ | ✅︎ | ✅︎ |
| `Qwen3VLForConditionalGeneration` | Qwen3-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-4B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen3VLMoeForConditionalGeneration` | Qwen3-VL-MOE | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-30B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `RForConditionalGeneration` | R-VL-4B | T + I<sup>E+</sup> | `YannQi/R-4B` | | ✅︎ | ✅︎ |
| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ |
| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ |

View File

@ -10,7 +10,7 @@ Before using EP, you need to install the necessary dependencies. We are actively
1. **Install DeepEP and pplx-kernels**: Set up host environment following vLLM's guide for EP kernels [here](gh-file:tools/ep_kernels).
2. **Install DeepGEMM library**: Follow the [official instructions](https://github.com/deepseek-ai/DeepGEMM#installation).
3. **For disaggregated serving**: Install UCX and NIXL following the [script](gh-file:tools/install_nixl.sh).
3. **For disaggregated serving**: Install `gdrcopy` by running the [`install_gdrcopy.sh`](gh-file:tools/install_gdrcopy.sh) script (e.g., `install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "12.8" "x64"`). You can find available OS versions [here](https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.8/).
### Backend Selection Guide
@ -191,7 +191,7 @@ For production deployments requiring strict SLA guarantees for time-to-first-tok
### Setup Steps
1. **Install KV Connector**: Install NIXL using the [installation script](gh-file:tools/install_nixl.sh)
1. **Install gdrcopy/ucx/nixl**: For maximum performance, run the [install_gdrcopy.sh](gh-file:tools/install_gdrcopy.sh) script to install `gdrcopy` (e.g., `install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "12.8" "x64"`). You can find available OS versions [here](https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.8/). If `gdrcopy` is not installed, things will still work with a plain `pip install nixl`, just with lower performance. `nixl` and `ucx` are installed as dependencies via pip.
2. **Configure Both Instances**: Add this flag to both prefill and decode instances `--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}`

View File

@ -133,7 +133,7 @@ completion = client.chat.completions.create(
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
],
extra_body={
"guided_choice": ["positive", "negative"]
"structured_outputs": {"choice": ["positive", "negative"]}
}
)
```
@ -317,10 +317,11 @@ Full example: <gh-file:examples/online_serving/pooling/openai_chat_embedding_cli
#### Extra parameters
The following [pooling parameters][pooling-params] are supported.
The following [pooling parameters][vllm.PoolingParams] are supported.
```python
--8<-- "vllm/entrypoints/openai/protocol.py:embedding-pooling-params"
--8<-- "vllm/pooling_params.py:common-pooling-params"
--8<-- "vllm/pooling_params.py:embedding-pooling-params"
```
The following extra parameters are supported by default:
@ -374,7 +375,7 @@ The following extra parameters are supported:
```python
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
```
[](){ #translations-api }
### Translations API
@ -527,10 +528,11 @@ curl -v "http://127.0.0.1:8000/classify" \
#### Extra parameters
The following [pooling parameters][pooling-params] are supported.
The following [pooling parameters][vllm.PoolingParams] are supported.
```python
--8<-- "vllm/entrypoints/openai/protocol.py:classification-pooling-params"
--8<-- "vllm/pooling_params.py:common-pooling-params"
--8<-- "vllm/pooling_params.py:classification-pooling-params"
```
The following extra parameters are supported:
@ -733,10 +735,11 @@ Full example: <gh-file:examples/online_serving/openai_cross_encoder_score_for_mu
#### Extra parameters
The following [pooling parameters][pooling-params] are supported.
The following [pooling parameters][vllm.PoolingParams] are supported.
```python
--8<-- "vllm/entrypoints/openai/protocol.py:score-pooling-params"
--8<-- "vllm/pooling_params.py:common-pooling-params"
--8<-- "vllm/pooling_params.py:classification-pooling-params"
```
The following extra parameters are supported:
@ -815,10 +818,11 @@ Result documents will be sorted by relevance, and the `index` property can be us
#### Extra parameters
The following [pooling parameters][pooling-params] are supported.
The following [pooling parameters][vllm.PoolingParams] are supported.
```python
--8<-- "vllm/entrypoints/openai/protocol.py:rerank-pooling-params"
--8<-- "vllm/pooling_params.py:common-pooling-params"
--8<-- "vllm/pooling_params.py:classification-pooling-params"
```
The following extra parameters are supported:

View File

@ -87,6 +87,11 @@ def parse_args():
default=0.8,
help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."),
)
parser.add_argument(
"--enable-dbo",
action="store_true",
help=("Enable microbatched execution"),
)
parser.add_argument(
"--compilation-config",
type=int,
@ -113,6 +118,7 @@ def main(
max_model_len,
compilation_config,
gpu_memory_utilization,
enable_dbo,
quantization,
):
os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
@ -167,6 +173,7 @@ def main(
max_num_seqs=max_num_seqs,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
enable_dbo=enable_dbo,
quantization=quantization,
compilation_config=compilation_config,
)
@ -227,6 +234,7 @@ if __name__ == "__main__":
args.max_model_len,
args.compilation_config,
args.gpu_memory_utilization,
args.enable_dbo,
args.quantization,
),
)

View File

@ -56,7 +56,6 @@ class DummyLogitsProcessor(LogitsProcessor):
self.req_info: dict[int, int] = {}
def is_argmax_invariant(self) -> bool:
"""Never impacts greedy sampling"""
return False
def update_state(self, batch_update: Optional[BatchUpdate]):
@ -75,13 +74,12 @@ class DummyLogitsProcessor(LogitsProcessor):
return logits
# Save target values before modification
rows_list = list(self.req_info.keys())
cols = torch.tensor(
[self.req_info[i] for i in rows_list],
dtype=torch.long,
device=logits.device,
list(self.req_info.values()), dtype=torch.long, device=logits.device
)
rows = torch.tensor(
list(self.req_info.keys()), dtype=torch.long, device=logits.device
)
rows = torch.tensor(rows_list, dtype=torch.long, device=logits.device)
values_to_keep = logits[rows, cols].clone()
# Mask all but target tokens

View File

@ -26,8 +26,14 @@ python examples/offline_inference/pooling/embed_jina_embeddings_v3.py
python examples/offline_inference/pooling/embed_matryoshka_fy.py
```
## Named Entity Recognition (NER) usage
```bash
python examples/offline_inference/pooling/ner.py
```
## Qwen3 reranker usage
```bash
python qwen3_reranker.py
python examples/offline_inference/pooling/qwen3_reranker.py
```

View File

@ -0,0 +1,54 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from https://huggingface.co/boltuix/NeuroBERT-NER
from argparse import Namespace
from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
def parse_args():
parser = FlexibleArgumentParser()
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(
model="boltuix/NeuroBERT-NER",
runner="pooling",
enforce_eager=True,
trust_remote_code=True,
)
return parser.parse_args()
def main(args: Namespace):
# Sample prompts.
prompts = [
"Barack Obama visited Microsoft headquarters in Seattle on January 2025."
]
# Create an LLM.
llm = LLM(**vars(args))
tokenizer = llm.get_tokenizer()
label_map = llm.llm_engine.vllm_config.model_config.hf_config.id2label
# Run inference
outputs = llm.encode(prompts)
for prompt, output in zip(prompts, outputs):
logits = output.outputs.data
predictions = logits.argmax(dim=-1)
# Map predictions to labels
tokens = tokenizer.convert_ids_to_tokens(output.prompt_token_ids)
labels = [label_map[p.item()] for p in predictions]
# Print results
for token, label in zip(tokens, labels):
if token not in tokenizer.all_special_tokens:
print(f"{token:15}{label}")
if __name__ == "__main__":
args = parse_args()
main(args)

View File

@ -53,7 +53,6 @@ def parse_args():
"--method",
type=str,
default="eagle",
choices=["ngram", "eagle", "eagle3", "mtp"],
)
parser.add_argument("--num-spec-tokens", type=int, default=2)
parser.add_argument("--prompt-lookup-max", type=int, default=5)
@ -118,6 +117,11 @@ def main():
"prompt_lookup_max": args.prompt_lookup_max,
"prompt_lookup_min": args.prompt_lookup_min,
}
elif args.method.endswith("mtp"):
speculative_config = {
"method": args.method,
"num_speculative_tokens": args.num_spec_tokens,
}
else:
raise ValueError(f"unknown method: {args.method}")

View File

@ -1,11 +1,10 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file demonstrates the example usage of guided decoding
to generate structured outputs using vLLM. It shows how to apply
different guided decoding techniques such as Choice, Regex, JSON schema,
and Grammar to produce structured and formatted results
based on specific prompts.
This file demonstrates the example usage of structured outputs
in vLLM. It shows how to apply different constraints such as choice,
regex, json schema, and grammar to produce structured and formatted
results based on specific prompts.
"""
from enum import Enum
@ -13,19 +12,23 @@ from enum import Enum
from pydantic import BaseModel
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams
from vllm.sampling_params import StructuredOutputsParams
MAX_TOKENS = 50
# Guided decoding by Choice (list of possible options)
guided_decoding_params_choice = GuidedDecodingParams(choice=["Positive", "Negative"])
sampling_params_choice = SamplingParams(guided_decoding=guided_decoding_params_choice)
# Structured outputs by Choice (list of possible options)
structured_outputs_params_choice = StructuredOutputsParams(
choice=["Positive", "Negative"]
)
sampling_params_choice = SamplingParams(
structured_outputs=structured_outputs_params_choice
)
prompt_choice = "Classify this sentiment: vLLM is wonderful!"
# Guided decoding by Regex
guided_decoding_params_regex = GuidedDecodingParams(regex=r"\w+@\w+\.com\n")
# Structured outputs by Regex
structured_outputs_params_regex = StructuredOutputsParams(regex=r"\w+@\w+\.com\n")
sampling_params_regex = SamplingParams(
guided_decoding=guided_decoding_params_regex,
structured_outputs=structured_outputs_params_regex,
stop=["\n"],
max_tokens=MAX_TOKENS,
)
@ -36,7 +39,7 @@ prompt_regex = (
)
# Guided decoding by JSON using Pydantic schema
# Structured outputs by JSON using Pydantic schema
class CarType(str, Enum):
sedan = "sedan"
suv = "SUV"
@ -51,17 +54,16 @@ class CarDescription(BaseModel):
json_schema = CarDescription.model_json_schema()
guided_decoding_params_json = GuidedDecodingParams(json=json_schema)
structured_outputs_params_json = StructuredOutputsParams(json=json_schema)
sampling_params_json = SamplingParams(
guided_decoding=guided_decoding_params_json,
max_tokens=MAX_TOKENS,
structured_outputs=structured_outputs_params_json, max_tokens=MAX_TOKENS
)
prompt_json = (
"Generate a JSON with the brand, model and car_type of"
"Generate a JSON with the brand, model and car_type of "
"the most iconic car from the 90's"
)
# Guided decoding by Grammar
# Structured outputs by Grammar
simplified_sql_grammar = """
root ::= select_statement
select_statement ::= "SELECT " column " from " table " where " condition
@ -70,13 +72,15 @@ table ::= "table_1 " | "table_2 "
condition ::= column "= " number
number ::= "1 " | "2 "
"""
guided_decoding_params_grammar = GuidedDecodingParams(grammar=simplified_sql_grammar)
structured_outputs_params_grammar = StructuredOutputsParams(
grammar=simplified_sql_grammar
)
sampling_params_grammar = SamplingParams(
guided_decoding=guided_decoding_params_grammar,
structured_outputs=structured_outputs_params_grammar,
max_tokens=MAX_TOKENS,
)
prompt_grammar = (
"Generate an SQL query to show the 'username' and 'email'from the 'users' table."
"Generate an SQL query to show the 'username' and 'email' from the 'users' table."
)
@ -93,16 +97,16 @@ def main():
llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100)
choice_output = generate_output(prompt_choice, sampling_params_choice, llm)
format_output("Guided decoding by Choice", choice_output)
format_output("Structured outputs by Choice", choice_output)
regex_output = generate_output(prompt_regex, sampling_params_regex, llm)
format_output("Guided decoding by Regex", regex_output)
format_output("Structured outputs by Regex", regex_output)
json_output = generate_output(prompt_json, sampling_params_json, llm)
format_output("Guided decoding by JSON", json_output)
format_output("Structured outputs by JSON", json_output)
grammar_output = generate_output(prompt_grammar, sampling_params_grammar, llm)
format_output("Guided decoding by Grammar", grammar_output)
format_output("Structured outputs by Grammar", grammar_output)
if __name__ == "__main__":

View File

@ -1437,6 +1437,80 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
)
# Qwen3-VL-Dense
def run_qwen3_vl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "Qwen/Qwen3-VL-4B-Instruct"
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
max_num_seqs=5,
mm_processor_kwargs={
"min_pixels": 28 * 28,
"max_pixels": 1280 * 28 * 28,
"fps": 1,
},
limit_mm_per_prompt={modality: 1},
)
if modality == "image":
placeholder = "<|image_pad|>"
elif modality == "video":
placeholder = "<|video_pad|>"
prompts = [
(
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
for question in questions
]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Qwen3-VL-MOE
def run_qwen3_vl_moe(questions: list[str], modality: str) -> ModelRequestData:
model_name = "Qwen/Qwen3-VL-30B-A3B-Instruct"
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
max_num_seqs=5,
mm_processor_kwargs={
"min_pixels": 28 * 28,
"max_pixels": 1280 * 28 * 28,
"fps": 1,
},
limit_mm_per_prompt={modality: 1},
)
if modality == "image":
placeholder = "<|image_pad|>"
elif modality == "video":
placeholder = "<|video_pad|>"
prompts = [
(
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
for question in questions
]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# R-4B
def run_r_vl(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@ -1645,6 +1719,8 @@ model_example_map = {
"qwen2_vl": run_qwen2_vl,
"qwen2_5_vl": run_qwen2_5_vl,
"qwen2_5_omni": run_qwen2_5_omni,
"qwen3_vl": run_qwen3_vl,
"qwen3_vl_moe": run_qwen3_vl_moe,
"rvl": run_r_vl,
"skywork_chat": run_skyworkr1v,
"smolvlm": run_smolvlm,
@ -1658,6 +1734,8 @@ MODELS_NEED_VIDEO_METADATA = [
"glm4_1v",
"glm4_5v",
"glm4_5v_fp8",
"qwen3_vl",
"qwen3_vl_moe",
]

View File

@ -0,0 +1,87 @@
# Monitoring Dashboards
This directory contains monitoring dashboard configurations for vLLM, providing
comprehensive observability for your vLLM deployments.
## Dashboard Platforms
We provide dashboards for two popular observability platforms:
- **[Grafana](https://grafana.com)**
- **[Perses](https://perses.dev)**
## Dashboard Format Approach
All dashboards are provided in **native formats** that work across different
deployment methods:
### Grafana (JSON)
- ✅ Works with any Grafana instance (cloud, self-hosted, Docker)
- ✅ Direct import via Grafana UI or API
- ✅ Can be wrapped in Kubernetes operators when needed
- ✅ No vendor lock-in or deployment dependencies
### Perses (YAML)
- ✅ Works with standalone Perses instances
- ✅ Compatible with Perses API and CLI
- ✅ Supports Dashboard-as-Code workflows
- ✅ Can be wrapped in Kubernetes operators when needed
## Dashboard Contents
Both platforms provide equivalent monitoring capabilities:
| Dashboard | Description |
|-----------|-------------|
| **Performance Statistics** | Tracks latency, throughput, and performance metrics |
| **Query Statistics** | Monitors request volume, query performance, and KPIs |
## Quick Start
First, navigate to this example's directory:
```bash
cd examples/online_serving/dashboards
```
### Grafana
Import the JSON directly into the Grafana UI, or use the API:
```bash
curl -X POST http://grafana/api/dashboards/db \
-H "Content-Type: application/json" \
-d @grafana/performance_statistics.json
```
### Perses
Import via the Perses CLI:
```bash
percli apply -f perses/performance_statistics.yaml
```
## Requirements
- **Prometheus** metrics from your vLLM deployment
- **Data source** configured in your monitoring platform
- **vLLM metrics** enabled and accessible
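Before importing either dashboard, it can help to confirm that your vLLM server is exposing Prometheus metrics at all. A minimal check, assuming the default OpenAI-compatible server address (`localhost:8000`):

```python
from urllib.request import urlopen

# The vLLM server exposes Prometheus metrics at /metrics.
with urlopen("http://localhost:8000/metrics") as resp:
    body = resp.read().decode()

# vLLM metric names are prefixed with "vllm:"; print a few to confirm.
print([line for line in body.splitlines() if line.startswith("vllm:")][:5])
```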
## Platform-Specific Documentation
For detailed deployment instructions and platform-specific options, see:
- **[Grafana Documentation](./grafana)** - JSON dashboards, operator usage, manual import
- **[Perses Documentation](./perses)** - YAML specs, CLI usage, operator wrapping
## Contributing
When adding new dashboards, please:
1. Provide native formats (JSON for Grafana, YAML specs for Perses)
2. Update platform-specific README files
3. Ensure dashboards work across deployment methods
4. Test with the latest platform versions

View File

@ -0,0 +1,59 @@
# Grafana Dashboards for vLLM Monitoring
This directory contains Grafana dashboard configurations (as JSON) designed to monitor
vLLM performance and metrics.
## Requirements
- Grafana 8.0+
- Prometheus data source configured in Grafana
- vLLM deployment with Prometheus metrics enabled
## Dashboard Descriptions
- **[performance_statistics.json](./performance_statistics.json)**: Tracks performance metrics including latency and
throughput for your vLLM service.
- **[query_statistics.json](./query_statistics.json)**: Tracks query performance, request volume, and key
performance indicators for your vLLM service.
## Deployment Options
### Manual Import (Recommended)
The easiest way to use these dashboards is to manually import the JSON configurations
directly into your Grafana instance:
1. Navigate to your Grafana instance
2. Click the '+' icon in the sidebar
3. Select 'Import'
4. Copy and paste the JSON content from the dashboard files, or upload the JSON files
directly
### Grafana Operator
If you're using the [Grafana Operator](https://github.com/grafana-operator/grafana-operator)
in Kubernetes, you can wrap these JSON configurations in a `GrafanaDashboard` custom
resource:
```yaml
# Note: Adjust the instanceSelector to match your Grafana instance's labels
# You can check with: kubectl get grafana -o yaml
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
name: vllm-performance-dashboard
spec:
instanceSelector:
matchLabels:
dashboards: grafana # Adjust to match your Grafana instance labels
folder: "vLLM Monitoring"
json: |
# Replace this comment with the complete JSON content from
# performance_statistics.json - The JSON should start with { and end with }
```
Then apply to your cluster:
```bash
kubectl apply -f your-dashboard.yaml -n <namespace>
```

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,760 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "High-level overview of VLLM model deployment behavior and key performance indicators. Designed for Data Scientists and Product Managers to monitor request volume, token throughput, and latency",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 47,
"links": [],
"panels": [
{
"collapsed": true,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 20,
"panels": [],
"title": "Request Over Time",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "auto",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "req/s"
},
"overrides": []
},
"gridPos": { "h": 6, "w": 10, "x": 0, "y": 1 },
"id": 1,
"options": {
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "single", "sort": "none" }
},
"pluginVersion": "11.3.0",
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"editorMode": "code",
"expr": "sum by (model_name) (\n rate(vllm:request_success_total{model_name=~\"$Deployment_id\"}[$__rate_interval])\n)",
"interval": "1",
"legendFormat": "{{model_name}}",
"range": true,
"refId": "A"
}
],
"title": "Successful Requests Over Time",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "req/s"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 1 },
"id": 2,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["mean"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "sum(rate(vllm:request_success_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Requests Avg Rate",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "Calcultaions": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "ms"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 17, "y": 1 },
"id": 3,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "histogram_quantile(0.50, sum by(le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "p50 Latency",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "Calculation": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "ms"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 4 },
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "histogram_quantile(0.90, sum by(le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "p90 Latency",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "Calculation": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "ms"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 17, "y": 4 },
"id": 5,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by(le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "p99 Latency",
"type": "stat"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 7 },
"id": 19,
"panels": [],
"title": "Size Distribution",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"fillOpacity": 80,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"lineWidth": 1,
"stacking": { "group": "A", "mode": "none" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "cps"
},
"overrides": []
},
"gridPos": { "h": 6, "w": 10, "x": 0, "y": 8 },
"id": 6,
"options": {
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "single", "sort": "none" }
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "sum by (le, model_name) (rate(vllm:request_prompt_tokens_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
"legendFormat": "{{model_name}} le={{le}}",
"range": true,
"refId": "A"
}
],
"title": "Input Token Size Distribution",
"type": "histogram"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "calculation ": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "cps"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 8 },
"id": 9,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "histogram_quantile(0.90, sum by(le, model_name) (rate(vllm:request_prompt_tokens_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Input Token Size p90",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "Calcultion": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "cps"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 17, "y": 8 },
"id": 8,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "histogram_quantile(0.50, sum by(le, model_name) (rate(vllm:request_prompt_tokens_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Input Token Size p50",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "Calcultaion": { "index": 0, "text": "mean" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "cps"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 11 },
"id": 7,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "sum(rate(vllm:prompt_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))\n/\nsum(rate(vllm:request_success_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Input Token Size Avg",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "Calculation": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "cps"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 17, "y": 11 },
"id": 10,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by(le, model_name) (rate(vllm:request_prompt_tokens_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Input Token Size p99",
"type": "stat"
},
{
"collapsed": true,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
"id": 18,
"panels": [],
"title": "Input Token Over Time",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "auto",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "cps"
},
"overrides": []
},
"gridPos": { "h": 6, "w": 10, "x": 0, "y": 15 },
"id": 11,
"options": {
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "single", "sort": "none" }
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "sum by (model_name) (rate(vllm:prompt_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
"legendFormat": "{{model_name}}",
"range": true,
"refId": "A"
}
],
"title": "Input Tokens Over Time",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "Calculation": { "index": 0, "text": "mean" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "cps"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 15 },
"id": 12,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "sum(rate(vllm:prompt_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Input Tokens/Sec Avg",
"type": "stat"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 },
"id": 17,
"panels": [],
"title": "Output Token Over Time",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "auto",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "cps"
},
"overrides": []
},
"gridPos": { "h": 6, "w": 10, "x": 0, "y": 22 },
"id": 13,
"options": {
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "single", "sort": "none" }
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "sum by (model_name) (rate(vllm:generation_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
"legendFormat": "{{model_name}}",
"range": true,
"refId": "A"
}
],
"title": "Output Tokens Over Time",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "Calculation": { "index": 0, "text": "mean" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
},
"unit": "cps"
},
"overrides": []
},
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 22 },
"id": 14,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0",
"targets": [
{
"editorMode": "code",
"expr": "sum(rate(vllm:generation_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Output Tokens/Sec Avg",
"type": "stat"
}
],
"preload": false,
"schemaVersion": 40,
"tags": [],
"templating": {
"list": [
{
"current": { "text": "Prometheus", "value": "4184fc20-68a7-483a-8d9b-7caa59c680dd" },
"label": "datasource",
"name": "DS_PROMETHEUS",
"options": [],
"query": "prometheus",
"refresh": 1,
"type": "datasource"
},
{
"current": { "text": ["All"], "value": ["$__all"] },
"definition": "label_values(vllm:request_success_total,model_name)",
"includeAll": true,
"label": "Deployment_ID",
"multi": true,
"name": "Deployment_id",
"options": [],
"query": {
"qryType": 1,
"query": "label_values(vllm:request_success_total,model_name)",
"refId": "PrometheusVariableQueryEditor-VariableQuery"
},
"refresh": 1,
"regex": "",
"sort": 1,
"type": "query"
},
{
"current": { "text": "All hours", "value": "All hours" },
"hide": 2,
"label": "Rush Hours Only",
"name": "rush_hours",
"options": [
{ "selected": true, "text": "false", "value": "All hours" },
{ "selected": false, "text": "true", "value": "Rush hours" }
],
"query": "false : All hours, true : Rush hours",
"type": "custom"
},
{
"current": { "text": "All", "value": "All" },
"hide": 2,
"label": "Rush Hours Type",
"name": "rush_hours_type",
"options": [
{ "selected": true, "text": "^All__.*$", "value": "All" },
{ "selected": false, "text": "^Static__.*$", "value": "Static" },
{ "selected": false, "text": "^Dynamic__.*$", "value": "Dynamic" }
],
"query": "^All__.*$ : All, ^Static__.*$ : Static, ^Dynamic__.*$ : Dynamic",
"type": "custom"
},
{
"current": { "text": "", "value": "" },
"hide": 2,
"name": "query0",
"options": [],
"query": "",
"refresh": 1,
"regex": "",
"type": "query"
}
]
},
"time": { "from": "now-12h", "to": "now" },
"timepicker": {},
"timezone": "browser",
"title": "Query Statistics_New4",
"uid": "query-statistics4",
"version": 2,
"weekStart": ""
}

View File

@ -0,0 +1,48 @@
# Perses Dashboards for vLLM Monitoring
This directory contains Perses dashboard configurations designed to monitor vLLM
performance and metrics.
## Requirements
- Perses instance (standalone or via operator)
- Prometheus data source configured in Perses
- vLLM deployment with Prometheus metrics enabled
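vLLM's OpenAI-compatible server exposes Prometheus metrics at `/metrics` on its serving port. As a minimal sketch (the job name and target address below are placeholders for your deployment), a static scrape configuration could look like this:

```yaml
# prometheus.yml (snippet) -- "vllm-service:8000" is a placeholder for your vLLM endpoint
scrape_configs:
  - job_name: vllm
    metrics_path: /metrics
    static_configs:
      - targets: ["vllm-service:8000"]
```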
## Dashboard Format
We provide dashboards in the **native Perses YAML format** that works across all
deployment methods:
- **Files**: `*.yaml` (native Perses dashboard specifications)
- **Format**: Plain dashboard specifications with no deployment-specific wrapping
- **Usage**: Works with standalone Perses, API imports, CLI, and file provisioning
- **Kubernetes**: Directly compatible with Perses Operator
## Dashboard Descriptions
- **[performance_statistics.yaml](./performance_statistics.yaml)**: Performance metrics with aggregated latency
statistics
- **[query_statistics.yaml](./query_statistics.yaml)**: Query performance and deployment metrics
## Deployment Options
### Direct Import to Perses
Import the dashboard specifications via Perses API or CLI:
```bash
percli apply -f performance_statistics.yaml
```
### Perses Operator (Kubernetes)
The native YAML format works directly with the Perses Operator:
```bash
kubectl apply -f performance_statistics.yaml -n <namespace>
```
### File Provisioning
Place the YAML files in a Perses provisioning folder for automatic loading.
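As a rough sketch of what that looks like (the key names here are assumptions and may differ between Perses versions, so check your Perses server configuration reference), the server configuration points at a folder that is re-scanned on an interval:

```yaml
# Perses server configuration (snippet) -- paths below are placeholders
provisioning:
  folders:
    - /etc/perses/dashboards   # directory containing the *.yaml dashboard files
  interval: 1h                 # how often the folder is re-read
```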

View File

@ -0,0 +1,764 @@
kind: PersesDashboard
metadata:
name: performance-statistics
createdAt: 0001-01-01T00:00:00Z
updatedAt: 0001-01-01T00:00:00Z
version: 0
project: ""
spec:
display:
name: Performance Statistics
variables:
- kind: ListVariable
spec:
display:
name: Deployment_ID
hidden: false
name: Deployment_id
allowAllValue: true
allowMultiple: true
defaultValue:
- $__all
sort: alphabetical-asc
plugin:
kind: PrometheusLabelValuesVariable
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
labelName: model_name
matchers:
# Any one vllm metric that always carries model_name
- vllm:generation_tokens_total{}
panels:
"1":
kind: Panel
spec:
display:
name: E2E Latency over Time
plugin:
kind: TimeSeriesChart
spec:
legend:
mode: table
position: bottom
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
# avg latency by model = sum(rate(sum)) / sum(rate(count))
query: >
sum by (model_name) (rate(vllm:e2e_request_latency_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
/
sum by (model_name) (rate(vllm:e2e_request_latency_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
seriesNameFormat: '{{model_name}}'
"2":
kind: Panel
spec:
display:
name: E2E Latency (Avg)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
(sum by (model_name) (increase(vllm:e2e_request_latency_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
/
(sum by (model_name) (increase(vllm:e2e_request_latency_seconds_count{model_name=~"$Deployment_id"}[$__range])))
"3":
kind: Panel
spec:
display:
name: E2E Latency (P50)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.50,
sum by (le, model_name) (
rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
"4":
kind: Panel
spec:
display:
name: E2E Latency (P90)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.90,
sum by (le, model_name) (
rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
"5":
kind: Panel
spec:
display:
name: E2E Latency (P99)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.99,
sum by (le, model_name) (
rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
"6":
kind: Panel
spec:
display:
name: TTFT over Time
plugin:
kind: TimeSeriesChart
spec:
legend:
mode: table
position: bottom
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
sum by (model_name) (rate(vllm:time_to_first_token_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
/
sum by (model_name) (rate(vllm:time_to_first_token_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
seriesNameFormat: '{{model_name}}'
"7":
kind: Panel
spec:
display:
name: TTFT (Avg)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
(sum by (model_name) (increase(vllm:time_to_first_token_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
/
(sum by (model_name) (increase(vllm:time_to_first_token_seconds_count{model_name=~"$Deployment_id"}[$__range])))
"8":
kind: Panel
spec:
display:
name: TTFT (P50)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.50,
sum by (le, model_name) (
rate(vllm:time_to_first_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
"9":
kind: Panel
spec:
display:
name: TTFT (P90)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.90,
sum by (le, model_name) (
rate(vllm:time_to_first_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
"10":
kind: Panel
spec:
display:
name: TTFT (P99)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.99,
sum by (le, model_name) (
rate(vllm:time_to_first_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
"11":
kind: Panel
spec:
display:
name: ITL (Time per Output Token) over Time
plugin:
kind: TimeSeriesChart
spec:
legend:
mode: table
position: bottom
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
sum by (model_name) (rate(vllm:time_per_output_token_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
/
sum by (model_name) (rate(vllm:time_per_output_token_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
seriesNameFormat: '{{model_name}}'
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.50,
sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
seriesNameFormat: '{{model_name}} p50'
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.90,
sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
seriesNameFormat: '{{model_name}} p90'
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.99,
sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
seriesNameFormat: '{{model_name}} p99'
"12":
kind: Panel
spec:
display:
name: ITL (Avg)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
(sum by (model_name) (increase(vllm:time_per_output_token_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
/
(sum by (model_name) (increase(vllm:time_per_output_token_seconds_count{model_name=~"$Deployment_id"}[$__range])))
"13":
kind: Panel
spec:
display:
name: ITL (P50)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.50,
sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
"14":
kind: Panel
spec:
display:
name: ITL (P90)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.90,
sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
"15":
kind: Panel
spec:
display:
name: ITL (P99)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
histogram_quantile(
0.99,
sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
"16":
kind: Panel
spec:
display:
name: TPS (Tokens/sec) over Time
plugin:
kind: TimeSeriesChart
spec:
legend:
mode: table
position: bottom
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
sum by (model_name) (rate(vllm:generation_tokens_total{model_name=~"$Deployment_id"}[$__interval]))
seriesNameFormat: '{{model_name}} generation'
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
sum by (model_name) (rate(vllm:prompt_tokens_total{model_name=~"$Deployment_id"}[$__interval]))
seriesNameFormat: '{{model_name}} prompt'
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
# overall iteration tokens/sec if exposed
query: >
rate(vllm:iteration_tokens_total_count[$__interval])
seriesNameFormat: 'iteration overall'
"17":
kind: Panel
spec:
display:
name: KV Cache Usage (avg %)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
# Multiply by 100 so we can read it as a percentage without setting a unit (avoids CUE unit conflicts)
query: >
100 * avg(vllm:gpu_cache_usage_perc)
"18":
kind: Panel
spec:
display:
name: Running Requests by Pod
plugin:
kind: TimeSeriesChart
spec:
legend:
mode: table
position: bottom
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
sum by (pod) (vllm:num_requests_running)
seriesNameFormat: '{{pod}}'
"19":
kind: Panel
spec:
display:
name: Waiting Requests by Pod
plugin:
kind: TimeSeriesChart
spec:
legend:
mode: table
position: bottom
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
sum by (pod) (vllm:num_requests_waiting)
seriesNameFormat: '{{pod}}'
"20":
kind: Panel
spec:
display:
name: Running Requests (sum)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: sum(vllm:num_requests_running)
"21":
kind: Panel
spec:
display:
name: Waiting Requests (sum)
plugin:
kind: StatChart
spec:
calculation: last-number
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: sum(vllm:num_requests_waiting)
layouts:
- kind: Grid
spec:
display:
title: Overview
items:
- x: 0
y: 0
width: 6
height: 3
content: { $ref: '#/spec/panels/17' } # KV cache %
- x: 6
y: 0
width: 6
height: 3
content: { $ref: '#/spec/panels/20' } # running sum
- x: 12
y: 0
width: 6
height: 3
content: { $ref: '#/spec/panels/21' } # waiting sum
- kind: Grid
spec:
display:
title: E2E Latency
items:
- x: 0
y: 1
width: 10
height: 6
content: { $ref: '#/spec/panels/1' }
- x: 10
y: 1
width: 7
height: 3
content: { $ref: '#/spec/panels/2' }
- x: 17
y: 1
width: 7
height: 3
content: { $ref: '#/spec/panels/3' }
- x: 10
y: 4
width: 7
height: 3
content: { $ref: '#/spec/panels/4' }
- x: 17
y: 4
width: 7
height: 3
content: { $ref: '#/spec/panels/5' }
- kind: Grid
spec:
display:
title: TTFT
items:
- x: 0
y: 8
width: 10
height: 6
content: { $ref: '#/spec/panels/6' }
- x: 10
y: 8
width: 7
height: 3
content: { $ref: '#/spec/panels/7' }
- x: 17
y: 8
width: 7
height: 3
content: { $ref: '#/spec/panels/8' }
- x: 10
y: 11
width: 7
height: 3
content: { $ref: '#/spec/panels/9' }
- x: 17
y: 11
width: 7
height: 3
content: { $ref: '#/spec/panels/10' }
- kind: Grid
spec:
display:
title: ITL (Time per Output Token)
items:
- x: 0
y: 15
width: 10
height: 6
content: { $ref: '#/spec/panels/11' }
- x: 10
y: 15
width: 7
height: 3
content: { $ref: '#/spec/panels/12' }
- x: 17
y: 15
width: 7
height: 3
content: { $ref: '#/spec/panels/13' }
- x: 10
y: 18
width: 7
height: 3
content: { $ref: '#/spec/panels/14' }
- x: 17
y: 18
width: 7
height: 3
content: { $ref: '#/spec/panels/15' }
- kind: Grid
spec:
display:
title: TPS (Prompt / Generation / Iteration)
items:
- x: 0
y: 22
width: 14
height: 6
content: { $ref: '#/spec/panels/16' }
- kind: Grid
spec:
display:
title: Per-Pod Request State
items:
- x: 0
y: 28
width: 12
height: 6
content: { $ref: '#/spec/panels/18' }
- x: 12
y: 28
width: 12
height: 6
content: { $ref: '#/spec/panels/19' }

View File

@ -0,0 +1,392 @@
kind: PersesDashboard
metadata:
name: query-statistics
createdAt: 0001-01-01T00:00:00Z
updatedAt: 0001-01-01T00:00:00Z
version: 0
project: ""
spec:
display:
name: Query Statistics_New
variables:
- kind: ListVariable
spec:
name: NS
display: { name: Namespace }
allowMultiple: false
defaultValue: llm-d
plugin:
kind: PrometheusLabelValuesVariable
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
labelName: namespace
matchers:
- up{service=~".*vllm.*"}
- kind: ListVariable
spec:
name: SVC
display: { name: Service }
allowMultiple: false
defaultValue: vllm-qwen2-0-5b-sim
plugin:
kind: PrometheusLabelValuesVariable
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
labelName: service
matchers:
- up{namespace="$NS",service=~".*vllm.*"}
- kind: ListVariable
spec:
name: MODEL
display: { name: Model (real vLLM) }
allowAllValue: true
allowMultiple: true
defaultValue: ["$__all"]
plugin:
kind: PrometheusLabelValuesVariable
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
labelName: model_name
matchers:
- vllm:request_success_total{namespace="$NS",service="$SVC"}
panels:
# --- Core (works on Simulator & Real) ---
core_running_now:
kind: Panel
spec:
display: { name: Running Requests (now) }
plugin: { kind: StatChart, spec: { calculation: last-number } }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: sum(vllm:num_requests_running{namespace="$NS",service="$SVC"}) or vector(0)
minStep: "15s"
core_waiting_now:
kind: Panel
spec:
display: { name: Waiting Requests (now) }
plugin: { kind: StatChart, spec: { calculation: last-number } }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: sum(vllm:num_requests_waiting{namespace="$NS",service="$SVC"}) or vector(0)
minStep: "15s"
core_kv_usage_now:
kind: Panel
spec:
display: { name: KV Cache Usage (0-1) }
plugin: { kind: StatChart, spec: { calculation: last-number } }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: avg(vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) or vector(0)
minStep: "15s"
core_running_ts:
kind: Panel
spec:
display: { name: Running Over Time }
plugin:
kind: TimeSeriesChart
spec:
legend: { mode: table, position: bottom }
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: sum by (service) (vllm:num_requests_running{namespace="$NS",service="$SVC"}) or vector(0)
minStep: "15s"
core_waiting_ts:
kind: Panel
spec:
display: { name: Waiting Over Time }
plugin:
kind: TimeSeriesChart
spec:
legend: { mode: table, position: bottom }
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: sum by (service) (vllm:num_requests_waiting{namespace="$NS",service="$SVC"}) or vector(0)
minStep: "15s"
core_targets_up:
kind: Panel
spec:
display: { name: Scrape Targets Up }
plugin: { kind: StatChart, spec: { calculation: last-number } }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: count(up{namespace="$NS",service="$SVC"} == 1) or vector(0)
minStep: "15s"
# --- KV Cache as Percent (works on Simulator & Real) ---
core_kv_usage_pct_now:
kind: Panel
spec:
display: { name: KV Cache Usage (%) now }
plugin: { kind: StatChart, spec: { calculation: last-number } }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
# multiply by 100 to present percentage; omit format.unit to avoid schema conflicts
query: (avg(vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
minStep: "15s"
core_kv_usage_pct_ts:
kind: Panel
spec:
display: { name: KV Cache Usage (%) over time }
plugin:
kind: TimeSeriesChart
spec:
legend: { mode: table, position: bottom }
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: (avg by (service) (vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
minStep: "15s"
# --- Per-Pod breakdowns (works on Simulator & Real) ---
per_pod_running_ts:
kind: Panel
spec:
display: { name: Running by Pod }
plugin:
kind: TimeSeriesChart
spec:
legend: { mode: table, position: bottom }
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: sum by (pod) (vllm:num_requests_running{namespace="$NS",service="$SVC"}) or vector(0)
minStep: "15s"
per_pod_waiting_ts:
kind: Panel
spec:
display: { name: Waiting by Pod }
plugin:
kind: TimeSeriesChart
spec:
legend: { mode: table, position: bottom }
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: sum by (pod) (vllm:num_requests_waiting{namespace="$NS",service="$SVC"}) or vector(0)
minStep: "15s"
per_pod_kv_pct_ts:
kind: Panel
spec:
display: { name: KV Cache (%) by Pod }
plugin:
kind: TimeSeriesChart
spec:
legend: { mode: table, position: bottom }
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
# if your exporter labels kv metric with pod (the sim does), this works; otherwise it will just return empty
query: (avg by (pod) (vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
minStep: "15s"
# --- Real vLLM only (zeros on simulator) ---
real_req_rate_ts:
kind: Panel
spec:
display: { name: Request Rate (real vLLM) }
plugin:
kind: TimeSeriesChart
spec:
legend: { mode: table, position: bottom }
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: sum by (model_name) (rate(vllm:request_success_total{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval])) or vector(0)
minStep: "15s"
real_p50:
kind: Panel
spec:
display: { name: p50 Latency (real vLLM) }
plugin: { kind: StatChart, spec: { calculation: last-number } }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: histogram_quantile(0.50, sum by (le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval]))) or vector(0)
minStep: "15s"
real_p90:
kind: Panel
spec:
display: { name: p90 Latency (real vLLM) }
plugin: { kind: StatChart, spec: { calculation: last-number } }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: histogram_quantile(0.90, sum by (le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval]))) or vector(0)
minStep: "15s"
real_p99:
kind: Panel
spec:
display: { name: p99 Latency (real vLLM) }
plugin: { kind: StatChart, spec: { calculation: last-number } }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: histogram_quantile(0.99, sum by (le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval]))) or vector(0)
minStep: "15s"
real_input_tokens_ts:
kind: Panel
spec:
display: { name: Input Tokens / sec (real vLLM) }
plugin:
kind: TimeSeriesChart
spec:
legend: { mode: table, position: bottom }
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: sum by (model_name) (rate(vllm:prompt_tokens_total{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval])) or vector(0)
minStep: "15s"
real_output_tokens_ts:
kind: Panel
spec:
display: { name: Output Tokens / sec (real vLLM) }
plugin:
kind: TimeSeriesChart
spec:
legend: { mode: table, position: bottom }
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
queries:
- kind: TimeSeriesQuery
spec:
plugin:
kind: PrometheusTimeSeriesQuery
spec:
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
query: sum by (model_name) (rate(vllm:generation_tokens_total{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval])) or vector(0)
minStep: "15s"
layouts:
- kind: Grid
spec:
display: { title: Core (Sim & Real) }
items:
- { x: 0, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_running_now' } }
- { x: 6, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_waiting_now' } }
- { x: 12, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_kv_usage_now' } }
- { x: 18, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_targets_up' } }
- { x: 0, y: 3, width: 12, height: 6, content: { $ref: '#/spec/panels/core_running_ts' } }
- { x: 12, y: 3, width: 12, height: 6, content: { $ref: '#/spec/panels/core_waiting_ts' } }
- kind: Grid
spec:
display: { title: KV Cache (%) }
items:
- { x: 0, y: 9, width: 6, height: 3, content: { $ref: '#/spec/panels/core_kv_usage_pct_now' } }
- { x: 6, y: 9, width: 18, height: 6, content: { $ref: '#/spec/panels/core_kv_usage_pct_ts' } }
- kind: Grid
spec:
display: { title: Per-Pod breakdowns }
items:
- { x: 0, y: 15, width: 12, height: 6, content: { $ref: '#/spec/panels/per_pod_running_ts' } }
- { x: 12, y: 15, width: 12, height: 6, content: { $ref: '#/spec/panels/per_pod_waiting_ts' } }
- { x: 0, y: 21, width: 24, height: 6, content: { $ref: '#/spec/panels/per_pod_kv_pct_ts' } }
- kind: Grid
spec:
display: { title: Real vLLM only (shows 0 on simulator) }
items:
- { x: 0, y: 27, width: 12, height: 6, content: { $ref: '#/spec/panels/real_req_rate_ts' } }
- { x: 12, y: 27, width: 4, height: 3, content: { $ref: '#/spec/panels/real_p50' } }
- { x: 16, y: 27, width: 4, height: 3, content: { $ref: '#/spec/panels/real_p90' } }
- { x: 20, y: 27, width: 4, height: 3, content: { $ref: '#/spec/panels/real_p99' } }
- { x: 0, y: 33, width: 12, height: 6, content: { $ref: '#/spec/panels/real_input_tokens_ts' } }
- { x: 12, y: 33, width: 12, height: 6, content: { $ref: '#/spec/panels/real_output_tokens_ts' } }

View File

@ -6,7 +6,7 @@ without any specific flags:
```bash
VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
--guided-decoding-backend outlines
--structured-outputs-config.backend outlines
```
This example demonstrates how to generate chat completions

View File

@ -12,6 +12,12 @@ python examples/online_serving/pooling/cohere_rerank_client.py
python examples/online_serving/pooling/jinaai_rerank_client.py
```
## Named Entity Recognition (NER) usage
```bash
python examples/online_serving/pooling/ner.py
```
## Openai chat embedding for multimodal usage
```bash

View File

@ -0,0 +1,71 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from https://huggingface.co/boltuix/NeuroBERT-NER
"""
Example online usage of Pooling API for Named Entity Recognition (NER).
Run `vllm serve <model> --runner pooling`
to start up the server in vLLM. e.g.
vllm serve boltuix/NeuroBERT-NER
"""
import argparse
import requests
import torch
def post_http_request(prompt: dict, api_url: str) -> requests.Response:
headers = {"User-Agent": "Test Client"}
response = requests.post(api_url, headers=headers, json=prompt)
return response
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--model", type=str, default="boltuix/NeuroBERT-NER")
return parser.parse_args()
def main(args):
from transformers import AutoConfig, AutoTokenizer
api_url = f"http://{args.host}:{args.port}/pooling"
model_name = args.model
# Load tokenizer and config
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)
label_map = config.id2label
# Input text
text = "Barack Obama visited Microsoft headquarters in Seattle on January 2025."
prompt = {"model": model_name, "input": text}
pooling_response = post_http_request(prompt=prompt, api_url=api_url)
# Run inference
output = pooling_response.json()["data"][0]
logits = torch.tensor(output["data"])
predictions = logits.argmax(dim=-1)
inputs = tokenizer(text, return_tensors="pt")
# Map predictions to labels
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
labels = [label_map[p.item()] for p in predictions]
assert len(tokens) == len(predictions)
# Print results
for token, label in zip(tokens, labels):
if token not in tokenizer.all_special_tokens:
print(f"{token:15}{label}")
if __name__ == "__main__":
args = parse_args()
main(args)

View File

@ -86,7 +86,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
"content": "Classify this sentiment: vLLM is wonderful!",
}
],
"extra_body": {"guided_choice": ["positive", "negative"]},
"extra_body": {"structured_outputs": {"choice": ["positive", "negative"]}},
},
"regex": {
"messages": [
@ -96,7 +96,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
}
],
"extra_body": {
"guided_regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n",
"structured_outputs": {"regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n"},
},
},
"json": {
@ -122,7 +122,8 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
}
],
"extra_body": {
"guided_grammar": """
"structured_outputs": {
"grammar": """
root ::= select_statement
select_statement ::= "SELECT " column " from " table " where " condition
@ -135,6 +136,7 @@ condition ::= column "= " number
number ::= "1 " | "2 "
""",
}
},
},
"structural_tag": {

View File

@ -1,36 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib
import traceback
from typing import Callable
from unittest.mock import patch
def find_cuda_init(fn: Callable[[], object]) -> None:
"""
Helper function to debug CUDA re-initialization errors.
If `fn` initializes CUDA, prints the stack trace of how this happens.
"""
from torch.cuda import _lazy_init
stack = None
def wrapper():
nonlocal stack
stack = traceback.extract_stack()
return _lazy_init()
with patch("torch.cuda._lazy_init", wrapper):
fn()
if stack is not None:
print("==== CUDA Initialized ====")
print("".join(traceback.format_list(stack)).strip())
print("==========================")
if __name__ == "__main__":
find_cuda_init(
lambda: importlib.import_module("vllm.model_executor.models.llava"))

View File

@ -79,6 +79,7 @@ plugins:
- "re:vllm\\._.*" # Internal modules
- "vllm.third_party"
- "vllm.vllm_flash_attn"
- !ENV [API_AUTONAV_EXCLUDE, "re:^$"] # Match nothing by default
- mkdocstrings:
handlers:
python:

View File

@ -115,7 +115,6 @@ follow_imports = "silent"
# move the directory here and remove it from tools/mypy.sh
files = [
"vllm/*.py",
"vllm/adapter_commons",
"vllm/assets",
"vllm/entrypoints",
"vllm/core",

View File

@ -1 +1,2 @@
lmcache
lmcache
nixl >= 0.5.1 # Required for disaggregated prefill

View File

@ -14,3 +14,4 @@ setuptools-scm>=8
wheel
jinja2>=3.1.6
amdsmi==6.2.4
timm>=1.0.17

View File

@ -1,5 +1,6 @@
# Common dependencies
-r common.txt
tblib==3.1.0
# entrypoints test
# librosa==0.10.2.post1 # required by audio tests in entrypoints/openai

View File

@ -17,4 +17,5 @@ setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
runai-model-streamer==0.11.0
runai-model-streamer-s3==0.11.0
conch-triton-kernels==1.2.1
conch-triton-kernels==1.2.1
timm>=1.0.17

View File

@ -1,54 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""vllm.entrypoints.api_server with some extra logging for testing."""
from collections.abc import Iterable
from typing import Any
import uvicorn
from fastapi.responses import JSONResponse, Response
import vllm.entrypoints.api_server
import vllm.envs as envs
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.utils import FlexibleArgumentParser
app = vllm.entrypoints.api_server.app
class AsyncLLMEngineWithStats(AsyncLLMEngine):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._num_aborts = 0
async def _engine_abort(self, request_ids: Iterable[str]):
ids = list(request_ids)
self._num_aborts += len(ids)
await super()._engine_abort(ids)
def testing_stats(self) -> dict[str, Any]:
return {"num_aborted_requests": self._num_aborts}
@app.get("/stats")
def stats() -> Response:
"""Get the statistics of the engine."""
return JSONResponse(engine.testing_stats())
if __name__ == "__main__":
parser = FlexibleArgumentParser()
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000)
parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args()
engine_args = AsyncEngineArgs.from_cli_args(args)
engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
vllm.entrypoints.api_server.engine = engine
uvicorn.run(app,
host=args.host,
port=args.port,
log_level="debug",
timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE)

View File

@ -1,12 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')

View File

@ -1,139 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copyreg
import os
import subprocess
import sys
import time
from multiprocessing import Pool
from pathlib import Path
import pytest
import requests
import urllib3.exceptions
def _pickle_new_connection_error(obj):
"""Custom pickler for NewConnectionError to fix tblib compatibility."""
# Extract the original message by removing the "conn: " prefix
full_message = obj.args[0] if obj.args else ""
if ': ' in full_message:
# Split off the connection part and keep the actual message
_, actual_message = full_message.split(': ', 1)
else:
actual_message = full_message
return _unpickle_new_connection_error, (actual_message, )
def _unpickle_new_connection_error(message):
"""Custom unpickler for NewConnectionError."""
# Create with None as conn and the actual message
return urllib3.exceptions.NewConnectionError(None, message)
# Register the custom pickle/unpickle functions for tblib compatibility
copyreg.pickle(urllib3.exceptions.NewConnectionError,
_pickle_new_connection_error)
def _query_server(prompt: str, max_tokens: int = 5) -> dict:
response = requests.post("http://localhost:8000/generate",
json={
"prompt": prompt,
"max_tokens": max_tokens,
"temperature": 0,
"ignore_eos": True
})
response.raise_for_status()
return response.json()
def _query_server_long(prompt: str) -> dict:
return _query_server(prompt, max_tokens=500)
@pytest.fixture
def api_server(distributed_executor_backend: str):
script_path = Path(__file__).parent.joinpath(
"api_server_async_engine.py").absolute()
commands = [
sys.executable,
"-u",
str(script_path),
"--model",
"facebook/opt-125m",
"--host",
"127.0.0.1",
"--distributed-executor-backend",
distributed_executor_backend,
]
# API Server Test Requires V0.
my_env = os.environ.copy()
my_env["VLLM_USE_V1"] = "0"
uvicorn_process = subprocess.Popen(commands, env=my_env)
yield
uvicorn_process.terminate()
@pytest.mark.timeout(300)
@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
def test_api_server(api_server, distributed_executor_backend: str):
"""
Run the API server and test it.
We run both the server and requests in separate processes.
We test that the server can handle incoming requests, including
multiple requests at the same time, and that it can handle requests
being cancelled without crashing.
"""
with Pool(32) as pool:
# Wait until the server is ready
prompts = ["warm up"] * 1
result = None
while not result:
try:
for r in pool.map(_query_server, prompts):
result = r
break
except requests.exceptions.ConnectionError:
time.sleep(1)
# Actual tests start here
# Try with 1 prompt
for result in pool.map(_query_server, prompts):
assert result
num_aborted_requests = requests.get(
"http://localhost:8000/stats").json()["num_aborted_requests"]
assert num_aborted_requests == 0
# Try with 100 prompts
prompts = ["test prompt"] * 100
for result in pool.map(_query_server, prompts):
assert result
with Pool(32) as pool:
# Cancel requests
prompts = ["canceled requests"] * 100
pool.map_async(_query_server_long, prompts)
time.sleep(0.01)
pool.terminate()
pool.join()
# check cancellation stats
# give it some time to update the stats
time.sleep(1)
num_aborted_requests = requests.get(
"http://localhost:8000/stats").json()["num_aborted_requests"]
assert num_aborted_requests > 0
# check that server still runs after cancellations
with Pool(32) as pool:
# Try with 100 prompts
prompts = ["test prompt after canceled"] * 100
for result in pool.map(_query_server, prompts):
assert result

View File

@ -1,71 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.engine.async_llm_engine import RequestTracker
from vllm.outputs import RequestOutput
@pytest.mark.asyncio
async def test_request_tracker():
tracker = RequestTracker()
stream_1 = tracker.add_request("1")
assert tracker.new_requests_event.is_set()
await tracker.wait_for_new_requests()
new, aborted = tracker.get_new_and_aborted_requests()
assert not tracker.new_requests_event.is_set()
assert len(new) == 1
assert new[0]["request_id"] == "1"
assert not aborted
assert not stream_1.finished
stream_2 = tracker.add_request("2")
stream_3 = tracker.add_request("3")
assert tracker.new_requests_event.is_set()
await tracker.wait_for_new_requests()
new, aborted = tracker.get_new_and_aborted_requests()
assert not tracker.new_requests_event.is_set()
assert len(new) == 2
assert new[0]["request_id"] == "2"
assert new[1]["request_id"] == "3"
assert not aborted
assert not stream_2.finished
assert not stream_3.finished
# request_ids must be unique
with pytest.raises(KeyError):
tracker.add_request("1")
assert not tracker.new_requests_event.is_set()
tracker.abort_request("1")
new, aborted = tracker.get_new_and_aborted_requests()
assert len(aborted) == 1
assert "1" in aborted
assert not new
assert stream_1.finished
stream_4 = tracker.add_request("4")
tracker.abort_request("4")
assert tracker.new_requests_event.is_set()
await tracker.wait_for_new_requests()
new, aborted = tracker.get_new_and_aborted_requests()
# aborted new requests will cancel each other out -
# there's no need for them to propagate into the
# engine
assert not aborted
assert not new
assert stream_4.finished
stream_5 = tracker.add_request("5")
assert tracker.new_requests_event.is_set()
tracker.process_request_output(
RequestOutput("2", "output", [], [], [], finished=True))
await tracker.wait_for_new_requests()
new, aborted = tracker.get_new_and_aborted_requests()
assert not tracker.new_requests_event.is_set()
assert not aborted
assert len(new) == 1
assert new[0]["request_id"] == "5"
assert stream_2.finished
assert not stream_5.finished

View File

@ -1,189 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Compare the short outputs of HF and vLLM when using greedy sampling.
VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test.
Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1
pytest tests/basic_correctness/test_preemption.py`.
"""
import pytest
from prometheus_client import REGISTRY
import vllm.envs as envs
from vllm import SamplingParams
from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
ENABLE_ARTIFICIAL_PREEMPT)
from ..models.utils import check_outputs_equal
MODELS = [
"distilbert/distilgpt2",
]
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
We should enable this for V1, but VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT,
so use VLLM_USE_V1=0 for all tests in the file.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')
@pytest.fixture(scope="module", autouse=True)
def check_settings():
assert ENABLE_ARTIFICIAL_PREEMPT is True, (
"Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1."
"`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 "
"pytest tests/basic_correctness/test_preemption.py`")
@pytest.fixture
def distributed_executor_backend() -> str:
# When SPMD worker is used, use distributed_executor_backend="ray"
# to test delta input optimization works with preemption.
return "ray" if envs.VLLM_USE_RAY_SPMD_WORKER else "mp"
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [96])
@pytest.mark.parametrize("chunked_prefill_token_size", [16])
def test_chunked_prefill_recompute(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
chunked_prefill_token_size: int,
distributed_executor_backend: str,
) -> None:
"""Ensure that chunked prefill works with preemption."""
max_num_seqs = min(chunked_prefill_token_size, 256)
enable_chunked_prefill = False
max_num_batched_tokens = None
if chunked_prefill_token_size != -1:
enable_chunked_prefill = True
max_num_batched_tokens = chunked_prefill_token_size
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with vllm_runner(
model,
dtype=dtype,
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=enable_chunked_prefill,
max_num_seqs=max_num_seqs,
distributed_executor_backend=distributed_executor_backend,
disable_log_stats=False,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
vllm_output_ids, vllm_output_str = vllm_outputs[i]
assert hf_output_str == vllm_output_str, (
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
assert hf_output_ids == vllm_output_ids, (
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [96])
def test_preemption(
caplog_vllm,
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
distributed_executor_backend: str,
) -> None:
"""By default, recompute preemption is enabled"""
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with vllm_runner(
model,
dtype=dtype,
disable_log_stats=False,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
total_preemption = (
vllm_model.llm.llm_engine.scheduler[0].num_cumulative_preemption)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
assert ("is preempted by PreemptionMode.RECOMPUTE mode because there "
"is not enough KV cache space." in caplog_vllm.text)
# Ensure the count bucket of request-level histogram metrics matches
# the number of requests as a simple sanity check to ensure metrics are
# generated
preemption_metrics = None
for m in REGISTRY.collect():
if m.name == "vllm:num_preemptions":
preemption_metrics = m
assert preemption_metrics is not None
total_recorded_preemption = 0
for sample in preemption_metrics.samples:
total_recorded_preemption += sample.value
assert total_preemption == total_recorded_preemption
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [96])
def test_preemption_infeasible(
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
distributed_executor_backend: str,
) -> None:
"""Verify infeasible preemption request will be ignored."""
BLOCK_SIZE = 16
prefill_blocks = 2
decode_blocks = max_tokens // BLOCK_SIZE
with vllm_runner(
model,
dtype=dtype,
block_size=BLOCK_SIZE,
# Not enough gpu blocks to complete a single sequence.
# preemption should happen, and the sequence should be
# ignored instead of hanging forever.
num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
sampling_params = SamplingParams(max_tokens=max_tokens,
ignore_eos=True)
req_outputs = vllm_model.llm.generate(
example_prompts,
sampling_params=sampling_params,
)
assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
# Verify the request is ignored and does not hang.
for req_output in req_outputs:
outputs = req_output.outputs
assert len(outputs) == 1
assert outputs[0].finish_reason == "length"
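As a quick sanity check on the block budget used by test_preemption_infeasible above, the following standalone arithmetic sketch (plain Python, no vLLM imports; values copied from the test) shows why the request can never finish and must be capped instead of hanging.

BLOCK_SIZE = 16
max_tokens = 96

prefill_blocks = 2
decode_blocks = max_tokens // BLOCK_SIZE                       # 96 // 16 = 6

num_gpu_blocks_override = prefill_blocks + decode_blocks // 2  # 2 + 3 = 5
max_model_len = num_gpu_blocks_override * BLOCK_SIZE           # 5 * 16 = 80

# Completing a full 96-token decode would need more blocks than exist, so
# the scheduler has to preempt and eventually finish the request with
# finish_reason == "length" rather than wait forever.
blocks_needed = prefill_blocks + decode_blocks                 # 8 > 5
assert blocks_needed > num_gpu_blocks_override
print(num_gpu_blocks_override, max_model_len, blocks_needed)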

View File

@ -68,7 +68,7 @@ def test_bench_serve_chat(server):
"5",
"--endpoint",
"/v1/chat/completions",
"--endpoint-type",
"--backend",
"openai-chat",
]
result = subprocess.run(command, capture_output=True, text=True)

View File

@ -98,8 +98,9 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module):
return [FUSED_OPS[kNvfp4Quant]]
@pytest.mark.parametrize("num_tokens", [64])
@pytest.mark.parametrize("hidden_size", [128])
@pytest.mark.parametrize("num_tokens", [32, 64])
@pytest.mark.parametrize("hidden_size", [128, 256])
@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
@pytest.mark.parametrize(
"model_class",
cast(list[type], [TestSiluMulFp8QuantModel, TestSiluMulNvfp4QuantModel]
@ -110,13 +111,13 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module):
[True, False] if cutlass_fp8_supported() else [True])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"],
reason="Only test on CUDA and ROCm")
def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class,
def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, dtype, model_class,
cuda_force_torch):
if model_class == TestSiluMulNvfp4QuantModel and cuda_force_torch:
pytest.skip("Duplicate tests for NVFP4")
torch.set_default_device("cuda")
torch.set_default_dtype(torch.float16)
torch.set_default_dtype(dtype)
x = torch.rand(num_tokens, hidden_size * 2)
@ -145,8 +146,8 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class,
elif model_class == TestSiluMulNvfp4QuantModel:
atol, rtol = 1e-1, 1e-1
torch.testing.assert_close(result[0].to(dtype=torch.float16),
result2[0].to(dtype=torch.float16),
torch.testing.assert_close(result[0].to(dtype=dtype),
result2[0].to(dtype=dtype),
atol=atol,
rtol=rtol)

View File

View File

@ -1,15 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
@pytest.fixture()
def should_do_global_cleanup_after_test() -> bool:
"""Disable the global cleanup fixture for tests in this directory. This
provides a ~10x speedup for unit tests that don't load a model to GPU.
This requires that tests in this directory clean up after themselves if they
use the GPU.
"""
return False

View File

@ -1,71 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from typing import Callable, Optional
import pytest
from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.model_executor.utils import set_random_seed
@pytest.fixture
def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, seed):
return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, seed)
@pytest.fixture
def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
test_llm_kwargs, seed):
return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
test_llm_kwargs, seed)
def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
distinct_llm_kwargs, seed):
kwargs = {
**common_llm_kwargs,
**per_test_common_llm_kwargs,
**distinct_llm_kwargs,
}
def generator_inner():
llm = LLM(**kwargs)
set_random_seed(seed)
yield llm
del llm
cleanup_dist_env_and_memory()
for llm in generator_inner():
yield llm
del llm
def get_text_from_llm_generator(llm_generator: Iterable[LLM],
prompts,
sampling_params,
llm_cb: Optional[Callable[[LLM],
None]] = None):
for llm in llm_generator:
if llm_cb:
llm_cb(llm)
outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
text = [output.outputs[0].text for output in outputs]
del llm
return text
def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params):
for llm in llm_generator:
outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
token_ids = [output.outputs[0].token_ids for output in outputs]
del llm
return token_ids
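For reference, the kwargs layering used by create_llm_generator above is plain dict merging, so later dicts win on conflicts. A minimal sketch with made-up values (the enforce_eager override is hypothetical):

common_llm_kwargs = {"model": "facebook/opt-125m", "enforce_eager": True}
per_test_common_llm_kwargs = {"block_size": 16}
test_llm_kwargs = {"enforce_eager": False}  # hypothetical per-variant override

# Same merge order as create_llm_generator: common, then per-test, then
# distinct kwargs; the last occurrence of a key wins.
kwargs = {
    **common_llm_kwargs,
    **per_test_common_llm_kwargs,
    **test_llm_kwargs,
}

assert kwargs == {
    "model": "facebook/opt-125m",
    "enforce_eager": False,
    "block_size": 16,
}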

View File

@ -1,479 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from itertools import cycle
import pytest
from vllm import SamplingParams
from .conftest import get_token_ids_from_llm_generator
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# Use a small model for a fast test.
"model": "facebook/opt-125m",
# skip cuda graph creation for fast test.
"enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 16,
"num_gpu_blocks_override": 5 * (64 + 1),
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"preemption_mode": "swap"
}, {
"preemption_mode": "recompute"
}])
@pytest.mark.parametrize("batch_size", [10])
@pytest.mark.parametrize("seed", [1])
def test_block_manager_with_preemption(baseline_llm_generator,
test_llm_generator, batch_size):
"""Verify block manager produces same outputs even when there is preemption.
This constructs two LLM, each with limited number of GPU blocks. The limit
is decided such that as the sequences in the batch grow, sequences must be
preempted and removed from cache.
If the output token ids are equivalent, then we have confidence that the KV
cache is not corrupted.
NOTE: We want a significant number of generated tokens so that any incorrect
KV mapping has time to build up error.
NOTE(Kuntai): Though we have removed block manager v1, this test is still
useful as it asserts the behavior of block manager v2 (now it is called
SelfAttnBlockSpaceManager) is the same when swapping / preemption, so we
keep this test.
"""
output_len = 1024
temperature = 0.0
# We want to ensure equality even with preemption.
# We force the total block size to be 1 + cdiv(output_len, block_size)
# so that only one sequence can fit at a time (once the sequences grow).
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
sampling_params = SamplingParams(
max_tokens=output_len,
ignore_eos=True,
temperature=temperature,
)
baseline_token_ids = get_token_ids_from_llm_generator(
baseline_llm_generator, prompts, sampling_params)
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
prompts, sampling_params)
for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
test_token_ids):
assert expected_token_ids == actual_token_ids
assert baseline_token_ids == test_token_ids
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# Use a small model for a fast test.
"model": "facebook/opt-125m",
# Our prompts will generate 128 tokens; since the prompts themselves are
# small, we don't need much KV space beyond 128.
"max_model_len": 160,
# skip cuda graph creation for fast test.
"enforce_eager": True,
}])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
[
{
"block_size": 16,
# Allow only 2 sequences of ~128 tokens in worst case.
# Note 8 = 128/block_size
"num_gpu_blocks_override": 2 * (8 + 1),
},
{
"block_size": 8,
# Allow only 2 sequences of ~128 tokens in worst case.
# Note 16 = 128/block_size
"num_gpu_blocks_override": 2 * (16 + 2),
}
])
@pytest.mark.parametrize("baseline_llm_kwargs", [{
"num_lookahead_slots": 0,
}])
@pytest.mark.parametrize(
"test_llm_kwargs",
[
{
# We run one test with block_size < lookahead_slots, one test with
# block_size > lookahead_slots
"num_lookahead_slots": 10,
"preemption_mode": "swap",
},
{
"num_lookahead_slots": 10,
"preemption_mode": "recompute",
}
])
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seed", [1])
def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
test_llm_generator,
batch_size):
"""Verify vLLM produces the same output with greedy sampling, when lookahead
scheduling is used vs. not.
Lookahead scheduling is not expected to modify the output, as it simply
allocates empty slots ahead of the known token ids in a sliding fashion.
This test constrains the total number of blocks to force preemption. It also
varies the block size so that the lookahead size is less than and greater
than the block size.
"""
output_len = 128
temperature = 0.0
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
sampling_params = SamplingParams(
max_tokens=output_len,
ignore_eos=True,
temperature=temperature,
)
print('Getting token ids without lookahead scheduling')
baseline_token_ids = get_token_ids_from_llm_generator(
baseline_llm_generator, prompts, sampling_params)
print('Getting token ids with lookahead scheduling')
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
prompts, sampling_params)
for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
test_token_ids):
assert expected_token_ids == actual_token_ids
assert baseline_token_ids == test_token_ids
@pytest.mark.parametrize(
"common_llm_kwargs",
[
{
# Use a small model for a fast test.
"model": "facebook/opt-125m",
# skip cuda graph creation for fast test.
"enforce_eager": True,
"enable_chunked_prefill": True,
},
])
@pytest.mark.parametrize("per_test_common_llm_kwargs",
[{
"block_size": 16,
"max_num_batched_tokens": 2,
"max_num_seqs": 2,
}, {
"block_size": 16,
"max_num_batched_tokens": 3,
"max_num_seqs": 2,
}, {
"block_size": 16,
"max_num_batched_tokens": 256,
"max_num_seqs": 10,
}])
@pytest.mark.parametrize("baseline_llm_kwargs", [
{},
])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"num_lookahead_slots": 0,
},
{
"num_lookahead_slots": 5,
},
])
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seed", [1])
def test_chunked_prefill_block_manager(baseline_llm_generator,
test_llm_generator, batch_size):
"""Verify that chunked prefill works with SelfAttnBlockSpaceManager,
with and without lookahead scheduling.
"""
output_len = 32
temperature = 0.0
prompts = [
"Hello, my name is",
"The president of the United States is",
("1 + " * 50) + " 1 = ", # Longer prompt.
"The capital of France is",
"The future of AI is",
]
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
sampling_params = SamplingParams(
max_tokens=output_len,
ignore_eos=True,
temperature=temperature,
)
print('Getting token ids with BlockManager')
baseline_token_ids = get_token_ids_from_llm_generator(
baseline_llm_generator, prompts, sampling_params)
print('Getting token ids with BlockManager, with lookahead slots.')
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
prompts, sampling_params)
for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
test_token_ids):
assert expected_token_ids == actual_token_ids
assert baseline_token_ids == test_token_ids
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# Use a small model for a fast test.
"model": "facebook/opt-125m",
# skip cuda graph creation for fast test.
"enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 16,
"num_gpu_blocks_override": 5 * (64 + 1),
# Enable prefix caching
"enable_prefix_caching": True,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"preemption_mode": "swap"
}, {
"preemption_mode": "recompute"
}])
@pytest.mark.parametrize("batch_size", [10])
@pytest.mark.parametrize("seed", [1])
def test_block_manager_prefix_caching_enabled_with_preemption(
baseline_llm_generator, test_llm_generator, batch_size):
"""Verify block manager produces same outputs even when there is preemption.
This constructs two LLM, each with limited number of GPU blocks. The limit
is decided such that as the sequences in the batch grow, sequences must be
preempted and removed from cache.
If the output token ids are equivalent, then we have confidence that the KV
cache is not corrupted.
NOTE: We want a significant number of generated tokens so that any incorrect
KV mapping has time to build up error.
NOTE(Kuntai): Though we have removed block manager v1, this test is still
useful as it asserts the behavior of block manager v2 (now it is called
SelfAttnBlockSpaceManager) is the same when swapping / preemption, so we
keep this test.
"""
output_len = 1024
temperature = 0.0
# We want to ensure equality even with preemption.
# We force the total block size to be 1 + cdiv(output_len, block_size)
# so that only one sequence can fit at a time (once the sequences grow).
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
sampling_params = SamplingParams(
max_tokens=output_len,
ignore_eos=True,
temperature=temperature,
)
print('Getting token ids from block manager')
baseline_token_ids = get_token_ids_from_llm_generator(
baseline_llm_generator, prompts, sampling_params)
print('Getting token ids from block manager, with preemption')
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
prompts, sampling_params)
for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
test_token_ids):
assert expected_token_ids == actual_token_ids
assert baseline_token_ids == test_token_ids
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# Use a small model for a fast test.
"model": "facebook/opt-125m",
# skip cuda graph creation for fast test.
"enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 16,
"num_gpu_blocks_override": 5 * (64 + 1),
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{
"enable_prefix_caching": False
}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"enable_prefix_caching": True,
"preemption_mode": "swap"
}, {
"enable_prefix_caching": True,
"preemption_mode": "recompute"
}])
@pytest.mark.parametrize("batch_size", [10])
@pytest.mark.parametrize("seed", [1])
def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
test_llm_generator, batch_size):
"""Verify block manager v2 with auto prefix caching enabled produces same
outputs as auto prefix caching disabled, even when there is preemption.
This constructs two LLM, each with limited number of GPU blocks. The limit
is decided such that as the sequences in the batch grow, sequences must be
preempted and removed from cache.
If the output token ids are equivalent, then we have confidence that auto
prefix caching itself at least don't cause result error.
"""
output_len = 1024
temperature = 0.0
# We want to ensure equality even with preemption.
# We force the total block size to be 1 + cdiv(output_len, block_size)
# so that only one sequence can fit at a time (once the sequences grow).
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
sampling_params = SamplingParams(
max_tokens=output_len,
ignore_eos=True,
temperature=temperature,
)
print('Getting token ids with APC disabled')
baseline_token_ids = get_token_ids_from_llm_generator(
baseline_llm_generator, prompts, sampling_params)
print('Getting token ids with APC enabled')
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
prompts, sampling_params)
for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
test_token_ids):
assert expected_token_ids == actual_token_ids
assert baseline_token_ids == test_token_ids
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# Use a small model for a fast test.
"model": "facebook/opt-125m",
# skip cuda graph creation for fast test.
"enforce_eager": True,
# we keep the number of blocks small so that we hit eviction quickly
"max_model_len": 48,
"block_size": 16,
"num_gpu_blocks_override": 3,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{
"enable_prefix_caching": False
}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"enable_prefix_caching": True,
}])
@pytest.mark.parametrize("seed", [1])
def test_auto_prefix_caching_after_eviction_start(baseline_llm_generator,
test_llm_generator):
"""Verify block manager v2 with auto prefix caching could work normally
even when eviction started.
With APC enabled, all blocks are held by native block at the beginning.
Then blocks are managed by evictor instead. If cache hit at the evictor's
block, then it could be reused, or we need to recompute its kv cache.
"""
output_len = 10
temperature = 0.0
prompts = [
"You are a helpful assistant. Please answer truthfully and write "
"out your thinking step by step to be sure you get the right answer. "
"If you make a mistake, attempt to correct it. who are you?",
"You are a helpful assistant. Please answer truthfully and write out "
"your thinking step by step to be sure you get the right answer. You "
"are helpful and harmless and you follow ethical guidelines. "
"who are you?"
]
sampling_params = SamplingParams(
max_tokens=output_len,
ignore_eos=True,
temperature=temperature,
)
print('Getting token ids with APC disabled')
baseline_token_ids = get_token_ids_from_llm_generator(
baseline_llm_generator, prompts, sampling_params)
print('Getting token ids with APC enabled')
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
prompts, sampling_params)
for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
test_token_ids):
assert expected_token_ids == actual_token_ids
assert baseline_token_ids == test_token_ids
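The num_gpu_blocks_override = 5 * (64 + 1) budget used by several tests above follows directly from the output length and block size; a short arithmetic sketch of that bound (plain Python, values copied from the tests):

import math

block_size = 16
output_len = 1024
batch_size = 10

# Blocks needed to hold one sequence's generated tokens, plus one block
# for its short prompt ("Hello, my name is", etc.).
decode_blocks = math.ceil(output_len / block_size)   # 64
blocks_per_seq = 1 + decode_blocks                   # 65

# The tests cap the pool at 5 such sequences, so a batch of 10 growing
# sequences cannot all stay resident and preemption is forced.
num_gpu_blocks_override = 5 * blocks_per_seq         # 325
assert batch_size * blocks_per_seq > num_gpu_blocks_override
print(num_gpu_blocks_override)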

View File

@ -1,185 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
import pytest
from tests.kernels.utils import override_backend_env_variable
from vllm import LLM, SamplingParams
from vllm.platforms import current_platform
from .conftest import get_text_from_llm_generator
# relatively small model with 4k sliding window
MODEL = "bigcode/starcoder2-3b"
BLOCK_SIZE = 16
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model": MODEL,
# skip cuda graph creation for fast test.
"enforce_eager": True,
"block_size": BLOCK_SIZE,
# needed due to https://github.com/vllm-project/vllm/issues/1908#issuecomment-2101122008
"num_gpu_blocks_override": 100000 // BLOCK_SIZE,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{}])
@pytest.mark.parametrize("batch_size", [5])
@pytest.mark.parametrize("seed", [1])
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS"])
def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator,
batch_size, seed, backend, monkeypatch):
"""
The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
asks for the value of one of them (which is outside the sliding window).
If we tell it upfront which we are going to be looking for, then
it answers correctly (mostly).
Additionally, we compare the results of the v1 and v2 managers.
"""
if backend == "XFORMERS" and current_platform.is_rocm():
pytest.skip("Xformers does not support ROCm/HIP.")
override_backend_env_variable(monkeypatch, backend)
sampling_params = SamplingParams(
max_tokens=1024,
ignore_eos=True,
temperature=0.0,
)
prompts, answer, indices = prep_prompts(batch_size)
baseline_texts = get_text_from_llm_generator(baseline_llm_generator,
prompts,
sampling_params,
llm_cb=check_window(prompts))
check_answers(indices, answer, baseline_texts)
print('Getting token ids from block manager v2')
test_texts = get_text_from_llm_generator(test_llm_generator, prompts,
sampling_params)
check_answers(indices, answer, test_texts)
cmp = [
expected_text == actual_text
for expected_text, actual_text in zip(baseline_texts, test_texts)
]
print(cmp)
# make sure it's mostly OK; this is possibly because https://github.com/vllm-project/vllm/pull/4768
# however, https://github.com/vllm-project/vllm/issues/3385#issuecomment-1995924290
# states that xformers and flash_attn have different ideas about the window
# size anyways
assert sum(cmp) > 0.7 * len(cmp)
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model": MODEL,
# skip cuda graph creation for fast test.
"enforce_eager": True,
"block_size": BLOCK_SIZE,
"num_gpu_blocks_override": 100000 // BLOCK_SIZE,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{"enable_chunked_prefill": True}])
@pytest.mark.parametrize("batch_size", [5])
@pytest.mark.parametrize("seed", [1])
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS"])
def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed,
backend, monkeypatch):
"""
This is similar to test_sliding_window_retrieval; however, it doesn't
compare against the v1 block manager since v1 doesn't support
chunked prefill with sliding window.
The results with and without chunked prefill are not the same due to
numerical instabilities.
"""
if backend == "XFORMERS" and current_platform.is_rocm():
pytest.skip("Xformers does not support ROCm/HIP.")
override_backend_env_variable(monkeypatch, backend)
sampling_params = SamplingParams(
max_tokens=10,
ignore_eos=True,
temperature=0.0,
)
prompts, answer, indices = prep_prompts(batch_size)
# We don't compare with the baseline model here, since the results are
# slightly different due to different tailing in attention.
test_texts = get_text_from_llm_generator(test_llm_generator,
prompts,
sampling_params,
llm_cb=check_window(prompts))
check_answers(indices, answer, test_texts)
def prep_prompts(batch_size: int, ln_range: tuple[int, int] = (800, 1100)):
"""
Generate prompts that make a bunch of assignments and
then ask for the value of one of them.
The prompt is just under 10k tokens; sliding window is 4k
so the answer is outside sliding window, but should still be correct.
Args:
batch_size: number of prompts to generate
ln_range: an argument to control the length of the prompt
"""
prompts: list[str] = []
answer: list[int] = []
indices: list[int] = []
random.seed(1)
for _ in range(batch_size):
idx = random.randint(30, 90)
indices.append(idx)
prompt = "```python\n# We set a number of variables, " + \
f"x{idx} will be important later\n"
ln = random.randint(*ln_range)
for k in range(30, ln):
v = random.randint(10, 99)
if k == idx:
answer.append(v)
prompt += f"x{k} = {v}\n"
prompt += f"# Now, we check the value of x{idx}:\n"
prompt += f"assert x{idx} == "
prompts.append(prompt)
return prompts, answer, indices
def check_answers(indices: list[int],
answer: list[int],
outputs: list[str],
accept_rate: float = 0.7):
answer2 = [int(text[0:2].strip()) for text in outputs]
print(list(zip(indices, zip(answer, answer2))))
numok = 0
for a1, a2 in zip(answer, answer2):
if a1 == a2:
numok += 1
frac_ok = numok / len(answer)
print(f"Num OK: {numok}/{len(answer)} {frac_ok}")
assert frac_ok >= accept_rate
def check_window(prompts: list[str]):
def inner(llm: LLM):
sliding_window = llm.llm_engine.model_config.get_sliding_window()
assert sliding_window and sliding_window > 0
assert any(
len(llm.get_tokenizer().tokenize(prompt)) > sliding_window
for prompt in prompts)
return inner
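As a small illustration of the grading done by check_answers above, the sketch below (plain Python, toy data) parses the first two characters of each completion as the predicted value and requires the fraction of matches to clear the accept rate:

answer = [42, 17, 88, 55]                        # ground-truth values
outputs = ["42\n", "17  # x17", "90", "55"]      # hypothetical completions

# Same parsing as check_answers: the model should have completed
# "assert x{idx} == " with the two-digit value.
answer2 = [int(text[0:2].strip()) for text in outputs]
numok = sum(a1 == a2 for a1, a2 in zip(answer, answer2))
frac_ok = numok / len(answer)

print(f"Num OK: {numok}/{len(answer)} {frac_ok}")
assert frac_ok >= 0.7   # default accept_rate in check_answers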

View File

@ -1,341 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.core.block_manager import SelfAttnBlockSpaceManager
from vllm.core.interfaces import AllocStatus
from vllm.sequence import Logprob, SequenceStatus
from vllm.utils import chunk_list
from ..utils import create_dummy_prompt, create_seq_group
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("num_gpu_blocks", [8, 40, 80])
@pytest.mark.parametrize("num_seqs_per_group", [1, 4])
@pytest.mark.parametrize("watermark", [0.0, 0.5])
def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int,
num_gpu_blocks: int, watermark: float):
block_manager = SelfAttnBlockSpaceManager(
block_size=block_size,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=1024,
watermark=watermark,
)
num_watermark_blocks = int(watermark * num_gpu_blocks)
num_output_blocks_per_seq = 1
# NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but
# the current implementation assumes all seqs are new prompts / don't have
# different output lens.
num_output_blocks = num_output_blocks_per_seq
for num_prompt_blocks in range(1, num_gpu_blocks - num_output_blocks):
seq_group = create_seq_group(
seq_prompt_len=block_size * num_prompt_blocks,
seq_output_lens=[
block_size * num_output_blocks_per_seq
for _ in range(num_seqs_per_group)
],
)
assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
can_allocate_result = block_manager.can_allocate(seq_group)
num_required_blocks = num_prompt_blocks + num_output_blocks
if num_gpu_blocks - num_required_blocks < num_watermark_blocks:
assert can_allocate_result == AllocStatus.NEVER
elif num_gpu_blocks >= num_required_blocks:
assert can_allocate_result == AllocStatus.OK
else:
assert can_allocate_result == AllocStatus.LATER
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("prompt_len", [1, 7, 8])
@pytest.mark.parametrize("num_slots_to_append", [1, 8, 129])
@pytest.mark.parametrize("num_lookahead_slots", [0, 10])
def test_append_slots(block_size, prompt_len, num_slots_to_append,
num_lookahead_slots):
"""Verify append_slots consumes the correct number of blocks from the block
table.
"""
num_gpu_blocks = 1024
watermark = 0.1
block_manager = SelfAttnBlockSpaceManager(
block_size=block_size,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=0,
watermark=watermark,
)
seq_group = create_seq_group(
seq_prompt_len=prompt_len,
seq_output_lens=[0],
)
# Allocate seq
assert block_manager.can_allocate(seq_group)
block_manager.allocate(seq_group)
# Set seq to RUNNING
seq = seq_group.get_seqs()[0]
seq.status = SequenceStatus.RUNNING
# Append tokens to the sequence
for token_id in range(num_slots_to_append):
seq.append_token_id(token_id, {token_id: Logprob(0.0)})
# Append slots for new tokens and lookahead slots.
free_blocks_before_append = block_manager.get_num_free_gpu_blocks()
block_manager.append_slots(seq, num_lookahead_slots)
num_consumed_blocks = (free_blocks_before_append -
block_manager.get_num_free_gpu_blocks())
# Expect consumed blocks to be new blocks required to support the new slots.
expected_consumed_blocks = len(
list(
chunk_list(
list(
range(prompt_len + num_slots_to_append +
num_lookahead_slots)),
block_size))) - len(
list(chunk_list(list(range(prompt_len)), block_size)))
assert num_consumed_blocks == expected_consumed_blocks
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("num_cpu_blocks", [4])
@pytest.mark.parametrize("num_gpu_blocks", [4])
@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10])
@pytest.mark.parametrize("enable_caching", [False, True])
def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots,
enable_caching):
"""Verify blocks number on src/desc device is correct after swapping in/out
sequence group (not missing or extra blocks).
"""
block_manager = SelfAttnBlockSpaceManager(block_size,
num_cpu_blocks,
num_gpu_blocks,
watermark=0,
enable_caching=enable_caching)
prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1)
prompt.status = SequenceStatus.WAITING
block_manager.allocate(seq_group)
# Emulate a forward pass by appending a single token.
# The block manager then knows how many unprocessed
# tokens will be written in the next forward pass.
token_id = 0
prompt.status = SequenceStatus.RUNNING
prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
# Swap seq group from GPU -> CPU.
gpu_blocks = block_manager.get_block_table(prompt)
assert block_manager.can_swap_out(seq_group)
before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
mapping = block_manager.swap_out(seq_group)
mapping_keys = [key for key, _ in mapping]
assert mapping_keys == gpu_blocks
after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
prompt.status = SequenceStatus.SWAPPED
# Swap seq group from CPU -> GPU.
assert block_manager.can_swap_in(seq_group, num_lookahead_slots)
before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
mapping = block_manager.swap_in(seq_group)
cpu_blocks = block_manager.get_block_table(prompt)
mapping_keys = [key for key, _ in mapping]
assert mapping_keys == [cpu_blocks[0]]
after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks)
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("num_gpu_blocks", [4])
@pytest.mark.parametrize("num_lookahead_slots", [3, 8, 10])
@pytest.mark.parametrize("enable_caching", [True, False])
def test_can_swap(block_size, num_gpu_blocks, num_lookahead_slots,
enable_caching):
""" Verify the block manager can correctly determine if a sequence group
can be swapped in/out.
"""
num_cpu_blocks = num_gpu_blocks
block_manager = SelfAttnBlockSpaceManager(block_size,
num_cpu_blocks,
num_gpu_blocks,
watermark=0,
enable_caching=enable_caching)
prompt, seq_group = create_dummy_prompt(
"1", prompt_length=(num_gpu_blocks - 1) * block_size - 1)
prompt.status = SequenceStatus.WAITING
block_manager.allocate(seq_group)
prompt.status = SequenceStatus.RUNNING
# Swap seq group from GPU -> CPU.
gpu_blocks = block_manager.get_block_table(prompt)
assert block_manager.can_swap_out(seq_group)
before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
mapping = block_manager.swap_out(seq_group)
mapping_keys = [key for key, _ in mapping]
assert mapping_keys == gpu_blocks
after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
prompt.status = SequenceStatus.SWAPPED
# At this moment, we still have enough free blocks to swap in the seq group.
if num_lookahead_slots <= block_size:
assert block_manager.can_swap_in(seq_group,
num_lookahead_slots) == AllocStatus.OK
else:
assert block_manager.can_swap_in(
seq_group, num_lookahead_slots) == AllocStatus.NEVER
# During swap-out, 2 cached blocks were evicted from the GPU,
# so prompt1 can't be swapped back in
prompt2_len = 2 * block_size - 1
prompt2, seq_group2 = create_dummy_prompt(
"2",
prompt_length=prompt2_len,
prompt_tokens=[10000 + i for i in range(prompt2_len)])
prompt2.status = SequenceStatus.WAITING
block_manager.allocate(seq_group2)
# Swap seq group from CPU -> GPU.
if num_lookahead_slots <= block_size:
assert block_manager.can_swap_in(
seq_group, num_lookahead_slots) == AllocStatus.LATER
else:
assert block_manager.can_swap_in(
seq_group, num_lookahead_slots) == AllocStatus.NEVER
@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10])
@pytest.mark.parametrize("enable_caching", [False, True])
def test_swap_in_infeasible(num_lookahead_slots, enable_caching):
"""Verifies that swapping fails if there is not enough free blocks
to account for unseen tokens and lookahead_slots.
"""
block_size = 8
num_cpu_blocks = 1
num_gpu_blocks = 1
block_manager = SelfAttnBlockSpaceManager(block_size,
num_cpu_blocks,
num_gpu_blocks,
watermark=0,
enable_caching=enable_caching)
prompt_length = block_size - 3
assert prompt_length > 0
prompt, seq_group = create_dummy_prompt("1", prompt_length=prompt_length)
prompt.status = SequenceStatus.WAITING
block_manager.allocate(seq_group)
# Emulate a forward pass by appending a single token.
# The block manager then knows how many unprocessed
# tokens will be written in the next forward pass.
token_id = 0
prompt.status = SequenceStatus.RUNNING
prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
# Swap seq group from GPU -> CPU.
assert block_manager.can_swap_out(seq_group)
block_manager.swap_out(seq_group)
prompt.status = SequenceStatus.SWAPPED
# Swap seq group from CPU -> GPU.
# The number of unseen tokens is 1. If the number of existing
# tokens plus the unseen ones and the number of lookahead slots exceeds
# the capacity of the available GPU blocks, then the swap
# should fail.
num_unseen_tokens = 1
if (num_lookahead_slots + num_unseen_tokens +
prompt_length) <= (block_size * num_gpu_blocks):
assert block_manager.can_swap_in(seq_group,
num_lookahead_slots) == AllocStatus.OK
else:
assert block_manager.can_swap_in(
seq_group, num_lookahead_slots) == AllocStatus.NEVER
# TODO(cade/kaiyang): add comprehensive tests for swapping at allocator level.
@pytest.mark.parametrize("block_size", [8, 16])
@pytest.mark.parametrize("prompt_len", [10, 300, 1000])
@pytest.mark.parametrize("num_slots_to_append", [50])
@pytest.mark.parametrize("sliding_window", [20, 32, 200, 512])
def test_sliding_window(block_size, prompt_len, num_slots_to_append,
sliding_window):
"""Verify append_slots consumes the correct number of blocks from the block
table.
"""
num_gpu_blocks = 1024
watermark = 0.1
block_manager = SelfAttnBlockSpaceManager(
block_size=block_size,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=0,
watermark=watermark,
sliding_window=sliding_window,
)
def check_used(min_n, max_n=None):
if max_n is None:
max_n = min_n
used = num_gpu_blocks - block_manager.get_num_free_gpu_blocks()
assert min_n <= used
assert used <= max_n
def num_blocks(num_tokens):
return (num_tokens + block_size - 1) // block_size
check_used(0)
seq_group = create_seq_group(
seq_prompt_len=prompt_len,
seq_output_lens=[0],
)
check_used(0)
# Allocate seq
assert block_manager.can_allocate(seq_group)
block_manager.allocate(seq_group)
check_used(num_blocks(prompt_len))
# Set seq to RUNNING
seq = seq_group.get_seqs()[0]
seq.status = SequenceStatus.RUNNING
seq.data.update_num_computed_tokens(prompt_len)
check_used(num_blocks(prompt_len))
# this is how we compute it in SelfAttnBlockSpaceManager.__init__
sliding_blocks = (sliding_window // block_size) + 2
# plus one block for null block
sliding_blocks += 1
# Append tokens to the sequence
for token_id in range(num_slots_to_append):
seq.append_token_id(token_id, {token_id: Logprob(0.0)})
seq.data.update_num_computed_tokens(1)
block_manager.append_slots(seq, num_lookahead_slots=0)
if prompt_len < sliding_window + 10:
check_used(0, sliding_blocks + 1)
else:
check_used(sliding_blocks, sliding_blocks + 1)
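The block bound asserted at the end of test_sliding_window above can be reproduced with plain arithmetic; a worked sketch for one of the test's parameter combinations:

block_size = 16
sliding_window = 200
prompt_len = 1000
num_slots_to_append = 50

def num_blocks(num_tokens: int) -> int:
    return (num_tokens + block_size - 1) // block_size

# Mirrors the comment in the test: the window is sized as
# (sliding_window // block_size) + 2 blocks, plus one null block.
sliding_blocks = (sliding_window // block_size) + 2 + 1   # 12 + 2 + 1 = 15

# The prompt alone spans many more blocks than the window keeps live,
# so usage is expected to settle into [sliding_blocks, sliding_blocks + 1].
assert num_blocks(prompt_len) == 63
assert prompt_len >= sliding_window + 10
print(sliding_blocks, num_blocks(prompt_len + num_slots_to_append))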

View File

@ -1,577 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.core.block.block_table import BlockTable
from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
from vllm.utils import Device, cdiv, chunk_list
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
def test_allocate_naive(block_size: int, sequence_len: int):
"""Test the allocation of blocks using the naive allocator.
This test creates a CpuGpuBlockAllocator with the specified block size and
number of blocks. It then allocates multiple BlockTables with varying
sequence lengths and verifies that the number of free blocks decreases as
expected after each allocation.
"""
assert block_size > 1
num_gpu_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type="naive",
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=1024,
block_size=block_size,
)
token_ids = list(range(sequence_len))
num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))
block_tables: list[BlockTable] = []
for i in range(5):
assert allocator.get_num_free_blocks(
device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc
block_tables.append(
BlockTable(
block_size=block_size,
block_allocator=allocator,
))
block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU)
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
def test_allocate_prefix_caching(block_size: int, sequence_len: int):
"""Test the allocation of blocks using the prefix caching allocator.
This test creates a CpuGpuBlockAllocator with the specified block size and
number of blocks, using the prefix caching allocator. It then allocates
multiple BlockTables with varying sequence lengths and verifies that the
number of free blocks decreases as expected after each allocation.
The test expects all sequences to share allocations, except for their last
block, which may be mutable. It calculates the expected number of immutable
and mutable blocks per allocation based on the sequence length and block
size.
"""
assert block_size > 1
num_gpu_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type="prefix_caching",
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=1024,
block_size=block_size,
)
token_ids = list(range(sequence_len))
chunked_tokens = list(chunk_list(token_ids, block_size))
num_mutable_blocks_per_alloc = 0 if len(
chunked_tokens[-1]) == block_size else 1
num_immutable_blocks_per_alloc = len(
chunked_tokens) - num_mutable_blocks_per_alloc
block_tables: list[BlockTable] = []
for alloc_i in range(1, 6):
block_tables.append(
BlockTable(
block_size=block_size,
block_allocator=allocator,
))
block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU)
# Expect all sequences to share allocations, except for their last block
# (which may be mutable).
assert allocator.get_num_free_blocks(
device=Device.GPU) == num_gpu_blocks - (
num_immutable_blocks_per_alloc + num_mutable_blocks_per_alloc *
(alloc_i))
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
@pytest.mark.parametrize("device", ["cpu", "gpu"])
def test_allocate_free(block_size: int, sequence_len: int, allocator_type: str,
device: str):
"""Test the allocation and freeing of blocks using different allocators and
devices.
This test creates a CpuGpuBlockAllocator with the specified block size,
number of blocks, allocator type, and device. It then allocates a BlockTable
multiple times with the same sequence and verifies that the number of free
blocks remains consistent after each allocation and freeing.
"""
device = Device[device.upper()]
num_device_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_device_blocks,
num_cpu_blocks=num_device_blocks,
block_size=block_size,
)
token_ids = list(range(sequence_len))
num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))
block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
)
for i in range(5):
block_table.allocate(token_ids=token_ids, device=device)
assert allocator.get_num_free_blocks(
device) == num_device_blocks - num_blocks_per_alloc
assert all(block_id is not None
for block_id in block_table.physical_block_ids)
block_table.free()
assert allocator.get_num_free_blocks(device) == num_device_blocks
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_append_token_ids_allocation(block_size: int, sequence_len: int,
append_len: int, allocator_type: str):
"""Test the allocation behavior when appending token IDs to a BlockTable.
This test creates a CpuGpuBlockAllocator with the specified block size,
number of blocks, and allocator type. It then allocates a BlockTable with an
initial sequence and appends additional token IDs to it. The test verifies
that the number of allocated blocks before and after appending matches the
expected values.
"""
num_gpu_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=1024,
block_size=block_size,
)
token_ids = list(range(sequence_len))
token_ids_to_append = list(range(append_len))
block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
)
num_expected_blocks_before_append = len(
list(chunk_list(token_ids, block_size)))
num_expected_appended_blocks = len(
list(chunk_list(token_ids + token_ids_to_append,
block_size))) - num_expected_blocks_before_append
block_table.allocate(token_ids=token_ids, device=Device.GPU)
assert len(
block_table.physical_block_ids) == num_expected_blocks_before_append
block_table.append_token_ids(token_ids_to_append)
assert len(
block_table.physical_block_ids
) == num_expected_blocks_before_append + num_expected_appended_blocks
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("num_empty_slots", [1, 16, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_ensure_num_empty_slots_allocation(block_size: int, sequence_len: int,
num_empty_slots: int,
allocator_type: str):
"""Test the allocation behavior when ensuring a certain number of empty
slots in a BlockTable.
This test creates a CpuGpuBlockAllocator with the specified block size,
number of blocks, and allocator type. It then allocates a BlockTable with an
initial sequence and ensures a certain number of empty slots. The test
verifies that the number of allocated blocks before and after ensuring empty
slots matches the expected values. It also checks that filling up the empty
slots does not consume additional blocks.
"""
num_gpu_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=1024,
block_size=block_size,
)
token_ids = list(range(sequence_len))
block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
)
num_expected_blocks_before_append = len(
list(chunk_list(token_ids, block_size)))
num_expected_appended_blocks = len(
list(chunk_list(token_ids + [-1] * num_empty_slots,
block_size))) - num_expected_blocks_before_append
block_table.allocate(token_ids=token_ids, device=Device.GPU)
# Assert that the empty slots consume the expected number of additional
# blocks.
assert len(
block_table.physical_block_ids) == num_expected_blocks_before_append
block_table.ensure_num_empty_slots(num_empty_slots)
assert len(
block_table.physical_block_ids
) == num_expected_blocks_before_append + num_expected_appended_blocks
# Now, ensure no additional blocks consumed as we fill up the empty slots.
num_free_blocks = allocator.get_num_free_blocks(device=Device.GPU)
block_table.append_token_ids(token_ids=list(range(num_empty_slots)))
assert num_free_blocks == allocator.get_num_free_blocks(device=Device.GPU)
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 9])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("append_size", [1, 4, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_append_token_ids_correct_content(block_size: int, sequence_len: int,
append_len: int, allocator_type: str,
append_size: int):
"""Verify token ids are correctly appended. Appends various amounts of
token ids in various append sizes, and verifies the final sequence is
correct.
"""
num_gpu_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=1024,
block_size=block_size,
)
token_ids = list(range(sequence_len))
token_ids_to_append = list(range(append_len))
block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
)
block_table.allocate(token_ids=token_ids, device=Device.GPU)
appended_so_far: list[int] = []
for append in chunk_list(token_ids_to_append, append_size):
block_table.append_token_ids(append)
appended_so_far.extend(append)
assert block_table._get_all_token_ids() == token_ids + appended_so_far
assert block_table._get_all_token_ids() == token_ids + token_ids_to_append
@pytest.mark.parametrize("seq_len", [1, 9, 129])
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_fork(seq_len: int, block_size: int, allocator_type: str):
"""Create a sequence using the specified allocator.
1. Assert that after forking the sequence, the free block count is the
same.
2. Assert that the forked sequence has the same physical mappings.
3. Then free the original sequence; verify that the free block count is
the same.
4. Finally, free the forked sequence and verify that all blocks are free
again (the free block count returns to the total number of blocks).
"""
num_gpu_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=0,
block_size=block_size,
)
token_ids = list(range(seq_len))
block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
)
block_table.allocate(token_ids)
num_free_blocks_before_fork = allocator.get_num_free_blocks(
device=Device.GPU)
forked_block_table = block_table.fork()
# Expect physical_block_ids and token_ids to match.
assert (block_table.physical_block_ids ==
forked_block_table.physical_block_ids)
assert block_table._get_all_token_ids(
) == forked_block_table._get_all_token_ids()
# Do not expect any additional allocations.
assert allocator.get_num_free_blocks(
device=Device.GPU) == num_free_blocks_before_fork
# Free the original blocks. Assert num free blocks does not change, since
# refcount is nonzero.
block_table.free()
assert allocator.get_num_free_blocks(
device=Device.GPU) == num_free_blocks_before_fork
# Expect the forked block table to be unaffected by the free.
assert all(block_id is not None
for block_id in forked_block_table.physical_block_ids)
# Free the forked blocks. Assert num free blocks does change, since
# refcount is now zero.
forked_block_table.free()
assert allocator.get_num_free_blocks(device=Device.GPU) == num_gpu_blocks
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("appender", ["forked", "original"])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_cow(block_size: int, sequence_len: int, append_len: int,
allocator_type: str, appender: str):
"""Fork a sequence; append to the forked sequence; verify there's a CoW.
"""
num_gpu_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=0,
block_size=block_size,
)
token_ids = list(range(sequence_len))
token_ids_to_append = list(range(append_len))
original_block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
)
num_expected_non_cow_blocks = cdiv(sequence_len, block_size)
num_expected_cow_blocks = cdiv(sequence_len + append_len,
block_size) - (sequence_len // block_size)
original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
original_block_ids = original_block_table.physical_block_ids[:]
print("original_block_ids = {}".format(original_block_ids))
forked_block_table = original_block_table.fork()
# Expect no additional allocation (copy on _write_).
assert allocator.get_num_free_blocks(
Device.GPU) == (num_gpu_blocks - num_expected_non_cow_blocks)
if appender == "forked":
appender_block_table = forked_block_table
static_block_table = original_block_table
elif appender == "original":
appender_block_table = original_block_table
static_block_table = forked_block_table
else:
raise ValueError(f"unknown test config {appender=}")
# Write tokens.
appender_block_table.append_token_ids(token_ids_to_append)
# Expect the non-appending block table to have no change.
assert static_block_table.physical_block_ids == original_block_ids
assert appender_block_table.physical_block_ids != original_block_ids
# Expect the blocks changed during append to have a CoW.
assert allocator.get_num_free_blocks(
Device.GPU) == num_gpu_blocks - (num_expected_non_cow_blocks +
num_expected_cow_blocks)
cows = allocator.clear_copy_on_writes()
if sequence_len % block_size > 0:
# If the last block in the sequence is not full, then when appending we
# expect a CoW.
assert cows
cow_block_id = sequence_len // block_size
expected_src = static_block_table.physical_block_ids[cow_block_id]
expected_dst = appender_block_table.physical_block_ids[cow_block_id]
assert (expected_src, expected_dst) in cows
else:
# Otherwise, there should be no copy-on-write.
assert not cows
static_block_table.free()
appender_block_table.free()
# After free, expect all blocks to be freed.
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("lookahead_slots", [1, 16, 129])
@pytest.mark.parametrize("appender", ["forked", "original"])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_cow_lookahead_simple(block_size: int, sequence_len: int,
append_len: int, lookahead_slots: int,
allocator_type: str, appender: str):
"""Similar to test_cow, except with lookahead allocation. The assertions are
less rigorous due to the complexity of the property under test.
"""
num_gpu_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=0,
block_size=block_size,
)
token_ids = list(range(sequence_len))
token_ids_to_append = list(range(append_len))
original_block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
)
original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
# Allocate lookahead slots.
original_block_table.ensure_num_empty_slots(lookahead_slots)
original_block_ids = original_block_table.physical_block_ids[:]
forked_block_table = original_block_table.fork()
if appender == "forked":
appender_block_table = forked_block_table
static_block_table = original_block_table
elif appender == "original":
appender_block_table = original_block_table
static_block_table = forked_block_table
else:
raise ValueError(f"unknown test config {appender=}")
# Write tokens.
appender_block_table.append_token_ids(token_ids_to_append)
# Expect the non-appending block table to have no change.
assert static_block_table.physical_block_ids == original_block_ids
assert appender_block_table.physical_block_ids != original_block_ids
cows = allocator.clear_copy_on_writes()
# Always expect copy-on-write
assert cows
if sequence_len % block_size > 0:
# If the last block in the sequence is not full, then when appending we
# expect a CoW.
assert cows
cow_block_id = sequence_len // block_size
expected_src = static_block_table.physical_block_ids[cow_block_id]
expected_dst = appender_block_table.physical_block_ids[cow_block_id]
assert (expected_src, expected_dst) in cows
static_block_table.free()
appender_block_table.free()
# After free, expect all blocks to be freed.
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("num_new_tokens", [1, 16, 129])
@pytest.mark.parametrize("num_lookahead_slots", [1, 7, 8])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_num_blocks_touched_by_append_slots(block_size: int, sequence_len: int,
num_new_tokens: int,
num_lookahead_slots: int,
allocator_type: str):
"""Verify correct calculation of get_num_blocks_touched_by_append_slots.
This is done by using copy-on-write, which requires any modified block to
be copied before write if the refcount > 1. We set the refcount>1 by forking
a sequence, then measure the free blocks before and after an append. If the
number of consumed blocks equals what `get_num_blocks_touched_by_append_
slots` returns, then the calculation is correct.
"""
num_gpu_blocks = 1024
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=0,
block_size=block_size,
)
token_ids = list(range(sequence_len))
token_ids_to_append = list(range(num_new_tokens))
block_table = BlockTable(
block_size=block_size,
block_allocator=allocator,
)
block_table.allocate(token_ids=token_ids, device=Device.GPU)
# Add lookahead before fork so both sequences have the same lookahead
# blocks.
block_table.ensure_num_empty_slots(num_empty_slots=num_lookahead_slots)
# Fork sequence so that every block has refcount > 1.
_ = block_table.fork()
# Determine how many blocks should be touched.
expected_num_touched_blocks = (
block_table.get_num_blocks_touched_by_append_slots(
token_ids=token_ids_to_append,
num_lookahead_slots=num_lookahead_slots))
# Measure how many blocks are touched by measuring num_free_blocks before
# and after the append.
#
# We expect append_token_ids to CoW all mutated blocks that have refcount>1.
num_free_blocks_before_append = allocator.get_num_free_blocks(Device.GPU)
block_table.append_token_ids(token_ids_to_append, num_lookahead_slots)
num_consumed_blocks = (num_free_blocks_before_append -
allocator.get_num_free_blocks(Device.GPU))
    # TODO(cade) ensure equality when num_lookahead_slots > 0.
    # The assertion below uses <= rather than == because lookahead blocks are
    # not copied eagerly; they are copied on first write. This will cause
    # issues for beam search + speculative decoding, but is acceptable for now
    # since combining the two is a large effort. To fix this, we can ensure
    # single-sequence ownership of lookahead blocks by appending empty slots
    # to each block, which will trigger the CoW.
    #
    # Until then, we accept that the number of consumed blocks is <= the
    # expected number when appending with lookahead.
if num_lookahead_slots > 0:
assert num_consumed_blocks <= expected_num_touched_blocks
else:
assert num_consumed_blocks == expected_num_touched_blocks
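# Worked example (illustrative note, not part of the original test): with
# block_size = 8, sequence_len = 12, num_new_tokens = 1 and
# num_lookahead_slots = 1, the lookahead slot fits in the partially-full last
# block (4 of 8 slots used), so the fork leaves every block with refcount > 1.
# get_num_blocks_touched_by_append_slots reports one touched block, and the
# append triggers one CoW copy, consuming exactly one free block. Larger
# lookahead values can make the measured count smaller than the prediction,
# since lookahead blocks are only copied on their first write.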

View File

@@ -1,45 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
import pytest
from vllm.core.block.common import RefCounter
@pytest.mark.parametrize("seed", list(range(20)))
@pytest.mark.parametrize("num_incrs", [1, 100])
@pytest.mark.parametrize("num_blocks", [1024])
def test_incr(seed: int, num_incrs: int, num_blocks: int):
random.seed(seed)
all_block_indices = list(range(num_blocks))
counter = RefCounter(all_block_indices=all_block_indices)
block_id = random.randint(0, num_blocks - 1)
for i in range(num_incrs):
value = counter.incr(block_id)
assert value == i + 1
@pytest.mark.parametrize("seed", list(range(20)))
@pytest.mark.parametrize("num_incrs", [1, 100])
@pytest.mark.parametrize("num_blocks", [1024])
def test_incr_decr(seed: int, num_incrs: int, num_blocks: int):
random.seed(seed)
all_block_indices = list(range(num_blocks))
counter = RefCounter(all_block_indices=all_block_indices)
block_id = random.randint(0, num_blocks - 1)
for i in range(num_incrs):
value = counter.incr(block_id)
assert value == i + 1
for i in range(num_incrs):
value = counter.decr(block_id)
assert value == num_incrs - (i + 1)
with pytest.raises(AssertionError):
counter.decr(block_id)
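# A minimal standalone sketch (not vLLM's implementation) of the semantics
# asserted above: incr/decr return the updated count, and decrementing a
# block whose count is already zero is an error.
class _ToyRefCounter:
    def __init__(self, all_block_indices):
        self._counts = {block_id: 0 for block_id in all_block_indices}

    def incr(self, block_id):
        self._counts[block_id] += 1
        return self._counts[block_id]

    def decr(self, block_id):
        assert self._counts[block_id] > 0
        self._counts[block_id] -= 1
        return self._counts[block_id]

counter = _ToyRefCounter(all_block_indices=range(4))
assert counter.incr(0) == 1
assert counter.incr(0) == 2
assert counter.decr(0) == 1
assert counter.decr(0) == 0  # a further decr(0) would raise AssertionError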

View File

@@ -1,96 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
from vllm.utils import Device, chunk_list
@pytest.mark.parametrize("num_cpu_blocks", [0, 512])
@pytest.mark.parametrize("num_gpu_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_allocate_mutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
block_size: int, allocator_type: str):
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=num_cpu_blocks,
block_size=block_size,
)
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
cpu_blocks = [
allocator.allocate_mutable_block(prev_block=None, device=Device.CPU)
for _ in range(num_cpu_blocks)
]
assert allocator.get_num_free_blocks(Device.CPU) == 0
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
gpu_blocks = [
allocator.allocate_mutable_block(prev_block=None, device=Device.GPU)
for _ in range(num_gpu_blocks)
]
assert allocator.get_num_free_blocks(Device.CPU) == 0
assert allocator.get_num_free_blocks(Device.GPU) == 0
_ = [allocator.free(block) for block in cpu_blocks]
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == 0
_ = [allocator.free(block) for block in gpu_blocks]
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
@pytest.mark.parametrize("num_cpu_blocks", [0, 512])
@pytest.mark.parametrize("num_gpu_blocks", [1024])
@pytest.mark.parametrize("block_size", [2])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_allocate_immutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
block_size: int, allocator_type: str):
allocator = CpuGpuBlockAllocator.create(
allocator_type=allocator_type,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=num_cpu_blocks,
block_size=block_size,
)
unique_token_ids = list(
range((num_cpu_blocks + num_gpu_blocks) * block_size))
gpu_token_ids = list(
chunk_list(unique_token_ids[:num_gpu_blocks * block_size], block_size))
cpu_token_ids = list(
chunk_list(unique_token_ids[num_gpu_blocks * block_size:], block_size))
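    # Using globally unique token ids ensures every immutable block has
    # distinct content, so even the prefix_caching allocator must allocate a
    # separate physical block for each chunk.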
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
cpu_blocks = [
allocator.allocate_immutable_block(prev_block=None,
token_ids=token_ids,
device=Device.CPU)
for token_ids in cpu_token_ids
]
assert allocator.get_num_free_blocks(Device.CPU) == 0
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
gpu_blocks = [
allocator.allocate_immutable_block(prev_block=None,
token_ids=token_ids,
device=Device.GPU)
for token_ids in gpu_token_ids
]
assert allocator.get_num_free_blocks(Device.CPU) == 0
assert allocator.get_num_free_blocks(Device.GPU) == 0
_ = [allocator.free(block) for block in cpu_blocks]
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == 0
_ = [allocator.free(block) for block in gpu_blocks]
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks

View File

@@ -1,148 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional
import pytest
from vllm.core.block.interfaces import Block, BlockAllocator
from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
class TestNaiveBlockAllocator:
@staticmethod
def create_allocate_lambda(allocate_type: str,
allocator: NaiveBlockAllocator,
prev_block: Optional[Block],
token_ids: list[int]):
if allocate_type == "immutable":
allocate_block = lambda: allocator.allocate_immutable_block(
prev_block=prev_block, token_ids=token_ids)
elif allocate_type == "mutable":
allocate_block = lambda: allocator.allocate_mutable_block(
prev_block=prev_block)
else:
            raise ValueError(f"unknown allocate_type {allocate_type=}")
return allocate_block
@staticmethod
@pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
@pytest.mark.parametrize("num_blocks", [1, 1024])
@pytest.mark.parametrize("block_size", [1, 16])
def test_allocate_ooms(allocate_type: str, num_blocks: int,
block_size: int):
allocator = NaiveBlockAllocator(create_block=NaiveBlock,
num_blocks=num_blocks,
block_size=block_size)
allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
allocate_type,
allocator,
prev_block=None,
token_ids=list(range(block_size)))
[allocate_block() for _ in range(num_blocks)]
with pytest.raises(BlockAllocator.NoFreeBlocksError):
allocate_block()
@staticmethod
@pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
@pytest.mark.parametrize("num_blocks", [1, 1024])
@pytest.mark.parametrize("block_size", [1, 16])
def test_free_prevents_oom(allocate_type: str, num_blocks: int,
block_size: int):
allocator = NaiveBlockAllocator(create_block=NaiveBlock,
num_blocks=num_blocks,
block_size=block_size)
allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
allocate_type,
allocator,
prev_block=None,
token_ids=list(range(block_size)))
blocks = [allocate_block() for _ in range(num_blocks)]
with pytest.raises(BlockAllocator.NoFreeBlocksError):
allocate_block()
block_to_free = blocks.pop()
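        # Repeatedly free and re-allocate: freeing should reset block_id to
        # None and make exactly one block available, and the next allocation
        # should hand back the same physical block id.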
for _ in range(100):
block_id = block_to_free.block_id
allocator.free(block_to_free)
assert block_to_free.block_id is None
new_block = allocate_block()
assert new_block.block_id == block_id
with pytest.raises(BlockAllocator.NoFreeBlocksError):
allocate_block()
block_to_free = new_block
@staticmethod
@pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
@pytest.mark.parametrize("num_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
def test_get_num_free_blocks(allocate_type: str, num_blocks: int,
block_size: int):
allocator = NaiveBlockAllocator(create_block=NaiveBlock,
num_blocks=num_blocks,
block_size=block_size)
allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
allocate_type,
allocator,
prev_block=None,
token_ids=list(range(block_size)))
assert allocator.get_num_free_blocks() == num_blocks
blocks = [allocate_block() for _ in range(num_blocks)]
for i, block in enumerate(blocks):
assert allocator.get_num_free_blocks() == i
allocator.free(block)
@staticmethod
@pytest.mark.parametrize("num_blocks", [4])
@pytest.mark.parametrize("block_size", [8])
def test_naive_block_get_num_full_blocks_touched(num_blocks, block_size):
""" Verify the allocator can correctly return the number of
full blocks touched.
"""
allocator_src = NaiveBlockAllocator(create_block=NaiveBlock,
num_blocks=num_blocks,
block_size=block_size)
allocator_dst = NaiveBlockAllocator(create_block=NaiveBlock,
num_blocks=num_blocks,
block_size=block_size)
        # Create a chain of full blocks in the src allocator
allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
"immutable",
allocator_src,
prev_block=None,
token_ids=list(range(block_size)))
src_blocks = [allocate_block() for _ in range(num_blocks - 1)]
        # All source blocks are full, so all of them count as touched
assert allocator_dst.get_num_full_blocks_touched(
src_blocks) == num_blocks - 1
# Insert one non-full block in the src
allocate_non_full_block = \
TestNaiveBlockAllocator.create_allocate_lambda(
"mutable", allocator_src,
                prev_block=src_blocks[-1], token_ids=[]
)
src_blocks.append(allocate_non_full_block())
src_blocks[-1].append_token_ids([0])
assert allocator_dst.get_num_full_blocks_touched(
src_blocks) == num_blocks - 1
        # Fill up the last source block and then invoke
        # get_num_full_blocks_touched again
src_blocks[-1].append_token_ids([0] * (block_size - 1))
assert allocator_dst.get_num_full_blocks_touched(
src_blocks) == num_blocks
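# Worked example (illustrative note, not part of the original test): with
# num_blocks = 4 and block_size = 8, the three immutable source blocks are
# full, so get_num_full_blocks_touched reports 3. Appending a single token to
# a fourth, mutable block leaves it partial and the count stays at 3; only
# after the remaining 7 slots are filled does it rise to 4.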

Some files were not shown because too many files have changed in this diff.