mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-08 18:17:05 +08:00
merge
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
commit
c1d83f2bae
@ -167,12 +167,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
|
||||
--ignore=entrypoints/llm/test_prompt_validation.py "}
|
||||
fi
|
||||
|
||||
#Obsolete currently
|
||||
##ignore certain Entrypoints/llm tests
|
||||
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
|
||||
# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
|
||||
#fi
|
||||
|
||||
# --ignore=entrypoints/openai/test_encoder_decoder.py \
|
||||
# --ignore=entrypoints/openai/test_embedding.py \
|
||||
# --ignore=entrypoints/openai/test_oot_registration.py
|
||||
|
||||
@ -46,24 +46,18 @@ steps:
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/mq_llm_engine
|
||||
- tests/async_engine
|
||||
- tests/test_inputs.py
|
||||
- tests/test_outputs.py
|
||||
- tests/multimodal
|
||||
- tests/utils_
|
||||
- tests/worker
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
- tests/transformers_utils
|
||||
commands:
|
||||
- python3 standalone_tests/lazy_imports.py
|
||||
- pytest -v -s mq_llm_engine # MQLLMEngine
|
||||
- pytest -v -s async_engine # AsyncLLMEngine
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s multimodal
|
||||
- pytest -v -s utils_ # Utils
|
||||
- pytest -v -s worker # Worker
|
||||
- pytest -v -s transformers_utils # transformers_utils
|
||||
|
||||
- label: Python-only Installation Test # 10min
|
||||
@ -84,25 +78,12 @@ steps:
|
||||
- vllm/
|
||||
- tests/basic_correctness/test_basic_correctness
|
||||
- tests/basic_correctness/test_cpu_offload
|
||||
- tests/basic_correctness/test_preemption
|
||||
- tests/basic_correctness/test_cumem.py
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s basic_correctness/test_cumem.py
|
||||
- pytest -v -s basic_correctness/test_basic_correctness.py
|
||||
- pytest -v -s basic_correctness/test_cpu_offload.py
|
||||
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
||||
|
||||
- label: Core Test # 22min
|
||||
timeout_in_minutes: 35
|
||||
mirror_hardwares: [amdexperimental]
|
||||
fast_check: true
|
||||
source_file_dependencies:
|
||||
- vllm/core
|
||||
- vllm/distributed
|
||||
- tests/core
|
||||
commands:
|
||||
- pytest -v -s core
|
||||
|
||||
- label: Entrypoints Unit Tests # 5min
|
||||
timeout_in_minutes: 10
|
||||
@ -127,8 +108,7 @@ steps:
|
||||
- tests/entrypoints/offline_mode
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
|
||||
@ -230,16 +210,14 @@ steps:
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/metrics
|
||||
- tests/v1/tracing
|
||||
commands:
|
||||
- pytest -v -s metrics
|
||||
- "pip install \
|
||||
'opentelemetry-sdk>=1.26.0' \
|
||||
'opentelemetry-api>=1.26.0' \
|
||||
'opentelemetry-exporter-otlp>=1.26.0' \
|
||||
'opentelemetry-semantic-conventions-ai>=0.4.1'"
|
||||
- pytest -v -s tracing
|
||||
- pytest -v -s v1/tracing
|
||||
|
||||
##### fast check tests #####
|
||||
##### 1 GPU test #####
|
||||
@ -302,6 +280,7 @@ steps:
|
||||
# split the test to avoid interference
|
||||
- pytest -v -s v1/core
|
||||
- pytest -v -s v1/executor
|
||||
- pytest -v -s v1/offloading
|
||||
- pytest -v -s v1/sample
|
||||
- pytest -v -s v1/logits_processors
|
||||
- pytest -v -s v1/worker
|
||||
@ -809,7 +788,7 @@ steps:
|
||||
# Quantization
|
||||
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
|
||||
- pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
|
||||
- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
|
||||
@ -821,6 +800,20 @@ steps:
|
||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||
|
||||
- label: GPT-OSS Eval (Blackwell)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
optional: true # disable while debugging
|
||||
source_file_dependencies:
|
||||
- tests/evals/gpt_oss
|
||||
- vllm/model_executor/models/gpt_oss.py
|
||||
- vllm/model_executor/layers/quantization/mxfp4.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
commands:
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 --server-args '--tensor-parallel-size 2'
|
||||
|
||||
##### 1 GPU test #####
|
||||
##### multi gpus test #####
|
||||
|
||||
@ -946,7 +939,6 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s distributed/test_pp_cudagraph.py
|
||||
- pytest -v -s distributed/test_pipeline_parallel.py
|
||||
# - pytest -v -s distributed/test_context_parallel.py # TODO: enable it on Hopper runners or add triton MLA support
|
||||
|
||||
- label: LoRA TP Test (Distributed) # 17 min
|
||||
timeout_in_minutes: 30
|
||||
@ -1020,9 +1012,21 @@ steps:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
|
||||
- label: Qwen MoE EP Test # optional
|
||||
##### H200 test #####
|
||||
- label: Distrubted Tests (H200) # optional
|
||||
gpu: h200
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 /vllm-workspace/examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||
|
||||
##### B200 test #####
|
||||
- label: Distributed Tests (B200) # optional
|
||||
gpu: b200
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
|
||||
17
.github/CODEOWNERS
vendored
17
.github/CODEOWNERS
vendored
@ -22,7 +22,7 @@
|
||||
/vllm/reasoning @aarnphm @chaunceyjiang
|
||||
/vllm/entrypoints @aarnphm @chaunceyjiang
|
||||
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
|
||||
/vllm/distributed/kv_transfer @NickLucche
|
||||
/vllm/distributed/kv_transfer @NickLucche @ApostaC
|
||||
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
|
||||
# Any change to the VllmConfig changes can have a large user-facing impact,
|
||||
@ -35,12 +35,12 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/vllm/v1/spec_decode @benchislett @luccafong
|
||||
/vllm/v1/attention/backends/flashinfer.py @mgoin
|
||||
/vllm/v1/attention/backends/triton_attn.py @tdoublep
|
||||
/vllm/v1/core @heheda12345
|
||||
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
|
||||
/vllm/v1/kv_cache_interface.py @heheda12345
|
||||
/vllm/v1/offloading @ApostaC
|
||||
|
||||
# Test ownership
|
||||
/.buildkite/lm-eval-harness @mgoin @simon-mo
|
||||
/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
|
||||
/tests/distributed/test_multi_node_assignment.py @youkaichao
|
||||
/tests/distributed/test_pipeline_parallel.py @youkaichao
|
||||
/tests/distributed/test_same_node.py @youkaichao
|
||||
@ -49,16 +49,21 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
|
||||
/tests/models @DarkLight1337 @ywang96
|
||||
/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
|
||||
/tests/prefix_caching @comaniac @KuntaiDu
|
||||
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
|
||||
/tests/test_inputs.py @DarkLight1337 @ywang96
|
||||
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
|
||||
/tests/v1/structured_output @mgoin @russellb @aarnphm
|
||||
/tests/v1/core @heheda12345
|
||||
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
|
||||
/tests/weight_loading @mgoin @youkaichao @yewentao256
|
||||
/tests/lora @jeejeelee
|
||||
/tests/models/language/generation/test_hybrid.py @tdoublep
|
||||
/tests/v1/kv_connector/nixl_integration @NickLucche
|
||||
/tests/v1/kv_connector/nixl_integration @NickLucche
|
||||
/tests/v1/kv_connector @ApostaC
|
||||
/tests/v1/offloading @ApostaC
|
||||
|
||||
# Transformers backend
|
||||
/vllm/model_executor/models/transformers.py @hmellor
|
||||
/tests/models/test_transformers.py @hmellor
|
||||
|
||||
# Docs
|
||||
/docs @hmellor
|
||||
|
||||
19
.github/mergify.yml
vendored
19
.github/mergify.yml
vendored
@ -171,7 +171,7 @@ pull_request_rules:
|
||||
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
|
||||
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
|
||||
- files~=^tests/v1/structured_output/
|
||||
- files=tests/v1/entrypoints/llm/test_guided_generate.py
|
||||
- files=tests/v1/entrypoints/llm/test_struct_output_generate.py
|
||||
- files~=^vllm/v1/structured_output/
|
||||
actions:
|
||||
label:
|
||||
@ -302,3 +302,20 @@ pull_request_rules:
|
||||
label:
|
||||
remove:
|
||||
- needs-rebase
|
||||
|
||||
- name: label-kv-connector
|
||||
description: Automatically apply kv-connector label
|
||||
conditions:
|
||||
- or:
|
||||
- files~=^examples/online_serving/disaggregated[^/]*/.*
|
||||
- files~=^examples/offline_inference/disaggregated[^/]*/.*
|
||||
- files~=^examples/others/lmcache/
|
||||
- files~=^tests/v1/kv_connector/
|
||||
- files~=^vllm/distributed/kv_transfer/
|
||||
- title~=(?i)\bP/?D\b
|
||||
- title~=(?i)NIXL
|
||||
- title~=(?i)LMCache
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
- kv-connector
|
||||
@ -164,9 +164,7 @@ repos:
|
||||
name: Validate configuration has default values and that each field has a docstring
|
||||
entry: python tools/validate_config.py
|
||||
language: python
|
||||
types: [python]
|
||||
pass_filenames: true
|
||||
files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py
|
||||
additional_dependencies: [regex]
|
||||
# Keep `suggestion` last
|
||||
- id: suggestion
|
||||
name: Suggestion
|
||||
|
||||
@ -175,6 +175,16 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
|
||||
endif()
|
||||
|
||||
#
|
||||
# Set CUDA include flags for CXX compiler.
|
||||
#
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include")
|
||||
if(CUDA_VERSION VERSION_GREATER_EQUAL 13.0)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include/cccl")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
#
|
||||
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
|
||||
# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
|
||||
@ -298,7 +308,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
|
||||
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
|
||||
"csrc/cutlass_extensions/common.cpp"
|
||||
"csrc/attention/mla/cutlass_mla_entry.cu"
|
||||
"csrc/quantization/fp8/per_token_group_quant.cu")
|
||||
|
||||
set_gencode_flags_for_srcs(
|
||||
@ -585,7 +594,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
|
||||
set(SRCS
|
||||
"csrc/attention/mla/cutlass_mla_kernels.cu"
|
||||
"csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${SRCS}"
|
||||
|
||||
@ -1,874 +1,20 @@
|
||||
# Benchmarking vLLM
|
||||
# Benchmarks
|
||||
|
||||
This README guides you through running benchmark tests with the extensive
|
||||
datasets supported on vLLM. It’s a living document, updated as new features and datasets
|
||||
become available.
|
||||
This directory used to contain vLLM's benchmark scripts and utilities for performance testing and evaluation.
|
||||
|
||||
## Dataset Overview
|
||||
## Contents
|
||||
|
||||
<table style="width:100%; border-collapse: collapse;">
|
||||
<thead>
|
||||
<tr>
|
||||
<th style="width:15%; text-align: left;">Dataset</th>
|
||||
<th style="width:10%; text-align: center;">Online</th>
|
||||
<th style="width:10%; text-align: center;">Offline</th>
|
||||
<th style="width:65%; text-align: left;">Data Path</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td><strong>ShareGPT</strong></td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td><code>wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>ShareGPT4V (Image)</strong></td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td>
|
||||
<code>wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/blob/main/sharegpt4v_instruct_gpt4-vision_cap100k.json</code>
|
||||
<br>
|
||||
<div>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:</div>
|
||||
<code>wget http://images.cocodataset.org/zips/train2017.zip</code>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>ShareGPT4Video (Video)</strong></td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td>
|
||||
<code>git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video</code>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>BurstGPT</strong></td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td><code>wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>Sonnet (deprecated)</strong></td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td>Local file: <code>benchmarks/sonnet.txt</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>Random</strong></td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td><code>synthetic</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>RandomMultiModal (Image/Video)</strong></td>
|
||||
<td style="text-align: center;">🟡</td>
|
||||
<td style="text-align: center;">🚧</td>
|
||||
<td><code>synthetic</code> </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>Prefix Repetition</strong></td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td><code>synthetic</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>HuggingFace-VisionArena</strong></td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td><code>lmarena-ai/VisionArena-Chat</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>HuggingFace-InstructCoder</strong></td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td><code>likaixin/InstructCoder</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>HuggingFace-AIMO</strong></td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td><code>AI-MO/aimo-validation-aime</code> , <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>HuggingFace-Other</strong></td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>HuggingFace-MTBench</strong></td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td><code>philschmid/mt-bench</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>HuggingFace-Blazedit</strong></td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td><code>vdaita/edit_5k_char</code>, <code>vdaita/edit_10k_char</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>Spec Bench</strong></td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td><code>wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>Custom</strong></td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td>Local file: <code>data.jsonl</code></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
- **Serving benchmarks**: Scripts for testing online inference performance (latency, throughput)
|
||||
- **Throughput benchmarks**: Scripts for testing offline batch inference performance
|
||||
- **Specialized benchmarks**: Tools for testing specific features like structured output, prefix caching, long document QA, request prioritization, and multi-modal inference
|
||||
- **Dataset utilities**: Framework for loading and sampling from various benchmark datasets (ShareGPT, HuggingFace datasets, synthetic data, etc.)
|
||||
|
||||
✅: supported
|
||||
## Usage
|
||||
|
||||
🟡: Partial support
|
||||
For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/contributing/benchmarks.html#benchmark-cli).
|
||||
|
||||
🚧: to be supported
|
||||
For full CLI reference see:
|
||||
|
||||
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`.
|
||||
For local `dataset-path`, please set `hf-name` to its Hugging Face ID like
|
||||
|
||||
```bash
|
||||
--dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat
|
||||
```
|
||||
|
||||
## 🚀 Example - Online Benchmark
|
||||
|
||||
<details>
|
||||
<summary>Show more</summary>
|
||||
|
||||
<br/>
|
||||
|
||||
First start serving your model
|
||||
|
||||
```bash
|
||||
vllm serve NousResearch/Hermes-3-Llama-3.1-8B
|
||||
```
|
||||
|
||||
Then run the benchmarking script
|
||||
|
||||
```bash
|
||||
# download dataset
|
||||
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--endpoint /v1/completions \
|
||||
--dataset-name sharegpt \
|
||||
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||
--num-prompts 10
|
||||
```
|
||||
|
||||
If successful, you will see the following output
|
||||
|
||||
```text
|
||||
============ Serving Benchmark Result ============
|
||||
Successful requests: 10
|
||||
Benchmark duration (s): 5.78
|
||||
Total input tokens: 1369
|
||||
Total generated tokens: 2212
|
||||
Request throughput (req/s): 1.73
|
||||
Output token throughput (tok/s): 382.89
|
||||
Total Token throughput (tok/s): 619.85
|
||||
---------------Time to First Token----------------
|
||||
Mean TTFT (ms): 71.54
|
||||
Median TTFT (ms): 73.88
|
||||
P99 TTFT (ms): 79.49
|
||||
-----Time per Output Token (excl. 1st token)------
|
||||
Mean TPOT (ms): 7.91
|
||||
Median TPOT (ms): 7.96
|
||||
P99 TPOT (ms): 8.03
|
||||
---------------Inter-token Latency----------------
|
||||
Mean ITL (ms): 7.74
|
||||
Median ITL (ms): 7.70
|
||||
P99 ITL (ms): 8.39
|
||||
==================================================
|
||||
```
|
||||
|
||||
### Custom Dataset
|
||||
|
||||
If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
|
||||
|
||||
```json
|
||||
{"prompt": "What is the capital of India?"}
|
||||
{"prompt": "What is the capital of Iran?"}
|
||||
{"prompt": "What is the capital of China?"}
|
||||
```
|
||||
|
||||
```bash
|
||||
# start server
|
||||
VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct
|
||||
```
|
||||
|
||||
```bash
|
||||
# run benchmarking script
|
||||
vllm bench serve --port 9001 --save-result --save-detailed \
|
||||
--backend vllm \
|
||||
--model meta-llama/Llama-3.1-8B-Instruct \
|
||||
--endpoint /v1/completions \
|
||||
--dataset-name custom \
|
||||
--dataset-path <path-to-your-data-jsonl> \
|
||||
--custom-skip-chat-template \
|
||||
--num-prompts 80 \
|
||||
--max-concurrency 1 \
|
||||
--temperature=0.3 \
|
||||
--top-p=0.75 \
|
||||
--result-dir "./log/"
|
||||
```
|
||||
|
||||
You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
|
||||
|
||||
### VisionArena Benchmark for Vision Language Models
|
||||
|
||||
```bash
|
||||
# need a model with vision capability here
|
||||
vllm serve Qwen/Qwen2-VL-7B-Instruct
|
||||
```
|
||||
|
||||
```bash
|
||||
vllm bench serve \
|
||||
--backend openai-chat \
|
||||
--endpoint-type openai-chat \
|
||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||
--endpoint /v1/chat/completions \
|
||||
--dataset-name hf \
|
||||
--dataset-path lmarena-ai/VisionArena-Chat \
|
||||
--hf-split train \
|
||||
--num-prompts 1000
|
||||
```
|
||||
|
||||
### InstructCoder Benchmark with Speculative Decoding
|
||||
|
||||
``` bash
|
||||
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
|
||||
--speculative-config $'{"method": "ngram",
|
||||
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
|
||||
"prompt_lookup_min": 2}'
|
||||
```
|
||||
|
||||
``` bash
|
||||
vllm bench serve \
|
||||
--model meta-llama/Meta-Llama-3-8B-Instruct \
|
||||
--dataset-name hf \
|
||||
--dataset-path likaixin/InstructCoder \
|
||||
--num-prompts 2048
|
||||
```
|
||||
|
||||
### Spec Bench Benchmark with Speculative Decoding
|
||||
|
||||
``` bash
|
||||
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
|
||||
--speculative-config $'{"method": "ngram",
|
||||
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
|
||||
"prompt_lookup_min": 2}'
|
||||
```
|
||||
|
||||
[SpecBench dataset](https://github.com/hemingkx/Spec-Bench)
|
||||
|
||||
Run all categories:
|
||||
|
||||
``` bash
|
||||
# Download the dataset using:
|
||||
# wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl
|
||||
|
||||
vllm bench serve \
|
||||
--model meta-llama/Meta-Llama-3-8B-Instruct \
|
||||
--dataset-name spec_bench \
|
||||
--dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
|
||||
--num-prompts -1
|
||||
```
|
||||
|
||||
Available categories include `[writing, roleplay, reasoning, math, coding, extraction, stem, humanities, translation, summarization, qa, math_reasoning, rag]`.
|
||||
|
||||
Run only a specific category like "summarization":
|
||||
|
||||
``` bash
|
||||
vllm bench serve \
|
||||
--model meta-llama/Meta-Llama-3-8B-Instruct \
|
||||
--dataset-name spec_bench \
|
||||
--dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
|
||||
--num-prompts -1
|
||||
--spec-bench-category "summarization"
|
||||
```
|
||||
|
||||
### Other HuggingFaceDataset Examples
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/Qwen2-VL-7B-Instruct
|
||||
```
|
||||
|
||||
`lmms-lab/LLaVA-OneVision-Data`:
|
||||
|
||||
```bash
|
||||
vllm bench serve \
|
||||
--backend openai-chat \
|
||||
--endpoint-type openai-chat \
|
||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||
--endpoint /v1/chat/completions \
|
||||
--dataset-name hf \
|
||||
--dataset-path lmms-lab/LLaVA-OneVision-Data \
|
||||
--hf-split train \
|
||||
--hf-subset "chart2text(cauldron)" \
|
||||
--num-prompts 10
|
||||
```
|
||||
|
||||
`Aeala/ShareGPT_Vicuna_unfiltered`:
|
||||
|
||||
```bash
|
||||
vllm bench serve \
|
||||
--backend openai-chat \
|
||||
--endpoint-type openai-chat \
|
||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||
--endpoint /v1/chat/completions \
|
||||
--dataset-name hf \
|
||||
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
|
||||
--hf-split train \
|
||||
--num-prompts 10
|
||||
```
|
||||
|
||||
`AI-MO/aimo-validation-aime`:
|
||||
|
||||
``` bash
|
||||
vllm bench serve \
|
||||
--model Qwen/QwQ-32B \
|
||||
--dataset-name hf \
|
||||
--dataset-path AI-MO/aimo-validation-aime \
|
||||
--num-prompts 10 \
|
||||
--seed 42
|
||||
```
|
||||
|
||||
`philschmid/mt-bench`:
|
||||
|
||||
``` bash
|
||||
vllm bench serve \
|
||||
--model Qwen/QwQ-32B \
|
||||
--dataset-name hf \
|
||||
--dataset-path philschmid/mt-bench \
|
||||
--num-prompts 80
|
||||
```
|
||||
|
||||
`vdaita/edit_5k_char` or `vdaita/edit_10k_char`:
|
||||
|
||||
``` bash
|
||||
vllm bench serve \
|
||||
--model Qwen/QwQ-32B \
|
||||
--dataset-name hf \
|
||||
--dataset-path vdaita/edit_5k_char \
|
||||
--num-prompts 90 \
|
||||
--blazedit-min-distance 0.01 \
|
||||
--blazedit-max-distance 0.99
|
||||
```
|
||||
|
||||
### Running With Sampling Parameters
|
||||
|
||||
When using OpenAI-compatible backends such as `vllm`, optional sampling
|
||||
parameters can be specified. Example client command:
|
||||
|
||||
```bash
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--endpoint /v1/completions \
|
||||
--dataset-name sharegpt \
|
||||
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||
--top-k 10 \
|
||||
--top-p 0.9 \
|
||||
--temperature 0.5 \
|
||||
--num-prompts 10
|
||||
```
|
||||
|
||||
### Running With Ramp-Up Request Rate
|
||||
|
||||
The benchmark tool also supports ramping up the request rate over the
|
||||
duration of the benchmark run. This can be useful for stress testing the
|
||||
server or finding the maximum throughput that it can handle, given some latency budget.
|
||||
|
||||
Two ramp-up strategies are supported:
|
||||
|
||||
- `linear`: Increases the request rate linearly from a start value to an end value.
|
||||
- `exponential`: Increases the request rate exponentially.
|
||||
|
||||
The following arguments can be used to control the ramp-up:
|
||||
|
||||
- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
|
||||
- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
|
||||
- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
|
||||
|
||||
</details>
|
||||
|
||||
## 📈 Example - Offline Throughput Benchmark
|
||||
|
||||
<details>
|
||||
<summary>Show more</summary>
|
||||
|
||||
<br/>
|
||||
|
||||
```bash
|
||||
vllm bench throughput \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--dataset-name sonnet \
|
||||
--dataset-path vllm/benchmarks/sonnet.txt \
|
||||
--num-prompts 10
|
||||
```
|
||||
|
||||
If successful, you will see the following output
|
||||
|
||||
```text
|
||||
Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
|
||||
Total num prompt tokens: 5014
|
||||
Total num output tokens: 1500
|
||||
```
|
||||
|
||||
### VisionArena Benchmark for Vision Language Models
|
||||
|
||||
```bash
|
||||
vllm bench throughput \
|
||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||
--backend vllm-chat \
|
||||
--dataset-name hf \
|
||||
--dataset-path lmarena-ai/VisionArena-Chat \
|
||||
--num-prompts 1000 \
|
||||
--hf-split train
|
||||
```
|
||||
|
||||
The `num prompt tokens` now includes image token counts
|
||||
|
||||
```text
|
||||
Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
|
||||
Total num prompt tokens: 14527
|
||||
Total num output tokens: 1280
|
||||
```
|
||||
|
||||
### InstructCoder Benchmark with Speculative Decoding
|
||||
|
||||
``` bash
|
||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||
VLLM_USE_V1=1 \
|
||||
vllm bench throughput \
|
||||
--dataset-name=hf \
|
||||
--dataset-path=likaixin/InstructCoder \
|
||||
--model=meta-llama/Meta-Llama-3-8B-Instruct \
|
||||
--input-len=1000 \
|
||||
--output-len=100 \
|
||||
--num-prompts=2048 \
|
||||
--async-engine \
|
||||
--speculative-config $'{"method": "ngram",
|
||||
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
|
||||
"prompt_lookup_min": 2}'
|
||||
```
|
||||
|
||||
```text
|
||||
Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
|
||||
Total num prompt tokens: 261136
|
||||
Total num output tokens: 204800
|
||||
```
|
||||
|
||||
### Other HuggingFaceDataset Examples
|
||||
|
||||
`lmms-lab/LLaVA-OneVision-Data`:
|
||||
|
||||
```bash
|
||||
vllm bench throughput \
|
||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||
--backend vllm-chat \
|
||||
--dataset-name hf \
|
||||
--dataset-path lmms-lab/LLaVA-OneVision-Data \
|
||||
--hf-split train \
|
||||
--hf-subset "chart2text(cauldron)" \
|
||||
--num-prompts 10
|
||||
```
|
||||
|
||||
`Aeala/ShareGPT_Vicuna_unfiltered`:
|
||||
|
||||
```bash
|
||||
vllm bench throughput \
|
||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||
--backend vllm-chat \
|
||||
--dataset-name hf \
|
||||
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
|
||||
--hf-split train \
|
||||
--num-prompts 10
|
||||
```
|
||||
|
||||
`AI-MO/aimo-validation-aime`:
|
||||
|
||||
```bash
|
||||
vllm bench throughput \
|
||||
--model Qwen/QwQ-32B \
|
||||
--backend vllm \
|
||||
--dataset-name hf \
|
||||
--dataset-path AI-MO/aimo-validation-aime \
|
||||
--hf-split train \
|
||||
--num-prompts 10
|
||||
```
|
||||
|
||||
Benchmark with LoRA adapters:
|
||||
|
||||
``` bash
|
||||
# download dataset
|
||||
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
vllm bench throughput \
|
||||
--model meta-llama/Llama-2-7b-hf \
|
||||
--backend vllm \
|
||||
--dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||
--dataset_name sharegpt \
|
||||
--num-prompts 10 \
|
||||
--max-loras 2 \
|
||||
--max-lora-rank 8 \
|
||||
--enable-lora \
|
||||
--lora-path yard1/llama-2-7b-sql-lora-test
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
## 🛠️ Example - Structured Output Benchmark
|
||||
|
||||
<details>
|
||||
<summary>Show more</summary>
|
||||
|
||||
<br/>
|
||||
|
||||
Benchmark the performance of structured output generation (JSON, grammar, regex).
|
||||
|
||||
### Server Setup
|
||||
|
||||
```bash
|
||||
vllm serve NousResearch/Hermes-3-Llama-3.1-8B
|
||||
```
|
||||
|
||||
### JSON Schema Benchmark
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_serving_structured_output.py \
|
||||
--backend vllm \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--dataset json \
|
||||
--structured-output-ratio 1.0 \
|
||||
--request-rate 10 \
|
||||
--num-prompts 1000
|
||||
```
|
||||
|
||||
### Grammar-based Generation Benchmark
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_serving_structured_output.py \
|
||||
--backend vllm \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--dataset grammar \
|
||||
--structure-type grammar \
|
||||
--request-rate 10 \
|
||||
--num-prompts 1000
|
||||
```
|
||||
|
||||
### Regex-based Generation Benchmark
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_serving_structured_output.py \
|
||||
--backend vllm \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--dataset regex \
|
||||
--request-rate 10 \
|
||||
--num-prompts 1000
|
||||
```
|
||||
|
||||
### Choice-based Generation Benchmark
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_serving_structured_output.py \
|
||||
--backend vllm \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--dataset choice \
|
||||
--request-rate 10 \
|
||||
--num-prompts 1000
|
||||
```
|
||||
|
||||
### XGrammar Benchmark Dataset
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_serving_structured_output.py \
|
||||
--backend vllm \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--dataset xgrammar_bench \
|
||||
--request-rate 10 \
|
||||
--num-prompts 1000
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
## 📚 Example - Long Document QA Benchmark
|
||||
|
||||
<details>
|
||||
<summary>Show more</summary>
|
||||
|
||||
<br/>
|
||||
|
||||
Benchmark the performance of long document question-answering with prefix caching.
|
||||
|
||||
### Basic Long Document QA Test
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_long_document_qa_throughput.py \
|
||||
--model meta-llama/Llama-2-7b-chat-hf \
|
||||
--enable-prefix-caching \
|
||||
--num-documents 16 \
|
||||
--document-length 2000 \
|
||||
--output-len 50 \
|
||||
--repeat-count 5
|
||||
```
|
||||
|
||||
### Different Repeat Modes
|
||||
|
||||
```bash
|
||||
# Random mode (default) - shuffle prompts randomly
|
||||
python3 benchmarks/benchmark_long_document_qa_throughput.py \
|
||||
--model meta-llama/Llama-2-7b-chat-hf \
|
||||
--enable-prefix-caching \
|
||||
--num-documents 8 \
|
||||
--document-length 3000 \
|
||||
--repeat-count 3 \
|
||||
--repeat-mode random
|
||||
|
||||
# Tile mode - repeat entire prompt list in sequence
|
||||
python3 benchmarks/benchmark_long_document_qa_throughput.py \
|
||||
--model meta-llama/Llama-2-7b-chat-hf \
|
||||
--enable-prefix-caching \
|
||||
--num-documents 8 \
|
||||
--document-length 3000 \
|
||||
--repeat-count 3 \
|
||||
--repeat-mode tile
|
||||
|
||||
# Interleave mode - repeat each prompt consecutively
|
||||
python3 benchmarks/benchmark_long_document_qa_throughput.py \
|
||||
--model meta-llama/Llama-2-7b-chat-hf \
|
||||
--enable-prefix-caching \
|
||||
--num-documents 8 \
|
||||
--document-length 3000 \
|
||||
--repeat-count 3 \
|
||||
--repeat-mode interleave
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
## 🗂️ Example - Prefix Caching Benchmark
|
||||
|
||||
<details>
|
||||
<summary>Show more</summary>
|
||||
|
||||
<br/>
|
||||
|
||||
Benchmark the efficiency of automatic prefix caching.
|
||||
|
||||
### Fixed Prompt with Prefix Caching
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_prefix_caching.py \
|
||||
--model meta-llama/Llama-2-7b-chat-hf \
|
||||
--enable-prefix-caching \
|
||||
--num-prompts 1 \
|
||||
--repeat-count 100 \
|
||||
--input-length-range 128:256
|
||||
```
|
||||
|
||||
### ShareGPT Dataset with Prefix Caching
|
||||
|
||||
```bash
|
||||
# download dataset
|
||||
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
|
||||
python3 benchmarks/benchmark_prefix_caching.py \
|
||||
--model meta-llama/Llama-2-7b-chat-hf \
|
||||
--dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||
--enable-prefix-caching \
|
||||
--num-prompts 20 \
|
||||
--repeat-count 5 \
|
||||
--input-length-range 128:256
|
||||
```
|
||||
|
||||
### Prefix Repetition Dataset
|
||||
|
||||
```bash
|
||||
vllm bench serve \
|
||||
--backend openai \
|
||||
--model meta-llama/Llama-2-7b-chat-hf \
|
||||
--dataset-name prefix_repetition \
|
||||
--num-prompts 100 \
|
||||
--prefix-repetition-prefix-len 512 \
|
||||
--prefix-repetition-suffix-len 128 \
|
||||
--prefix-repetition-num-prefixes 5 \
|
||||
--prefix-repetition-output-len 128
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
## ⚡ Example - Request Prioritization Benchmark
|
||||
|
||||
<details>
|
||||
<summary>Show more</summary>
|
||||
|
||||
<br/>
|
||||
|
||||
Benchmark the performance of request prioritization in vLLM.
|
||||
|
||||
### Basic Prioritization Test
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_prioritization.py \
|
||||
--model meta-llama/Llama-2-7b-chat-hf \
|
||||
--input-len 128 \
|
||||
--output-len 64 \
|
||||
--num-prompts 100 \
|
||||
--scheduling-policy priority
|
||||
```
|
||||
|
||||
### Multiple Sequences per Prompt
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_prioritization.py \
|
||||
--model meta-llama/Llama-2-7b-chat-hf \
|
||||
--input-len 128 \
|
||||
--output-len 64 \
|
||||
--num-prompts 100 \
|
||||
--scheduling-policy priority \
|
||||
--n 2
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
## 👁️ Example - Multi-Modal Benchmark
|
||||
|
||||
<details>
|
||||
<summary>Show more</summary>
|
||||
|
||||
<br/>
|
||||
|
||||
Benchmark the performance of multi-modal requests in vLLM.
|
||||
|
||||
### Images (ShareGPT4V)
|
||||
|
||||
Start vLLM:
|
||||
|
||||
```bash
|
||||
python -m vllm.entrypoints.openai.api_server \
|
||||
--model Qwen/Qwen2.5-VL-7B-Instruct \
|
||||
--dtype bfloat16 \
|
||||
--limit-mm-per-prompt '{"image": 1}' \
|
||||
--allowed-local-media-path /path/to/sharegpt4v/images
|
||||
```
|
||||
|
||||
Send requests with images:
|
||||
|
||||
```bash
|
||||
vllm bench serve \
|
||||
--backend openai-chat \
|
||||
--model Qwen/Qwen2.5-VL-7B-Instruct \
|
||||
--dataset-name sharegpt \
|
||||
--dataset-path /path/to/ShareGPT4V/sharegpt4v_instruct_gpt4-vision_cap100k.json \
|
||||
--num-prompts 100 \
|
||||
--save-result \
|
||||
--result-dir ~/vllm_benchmark_results \
|
||||
--save-detailed \
|
||||
--endpoint /v1/chat/completion
|
||||
```
|
||||
|
||||
### Videos (ShareGPT4Video)
|
||||
|
||||
Start vLLM:
|
||||
|
||||
```bash
|
||||
python -m vllm.entrypoints.openai.api_server \
|
||||
--model Qwen/Qwen2.5-VL-7B-Instruct \
|
||||
--dtype bfloat16 \
|
||||
--limit-mm-per-prompt '{"video": 1}' \
|
||||
--allowed-local-media-path /path/to/sharegpt4video/videos
|
||||
```
|
||||
|
||||
Send requests with videos:
|
||||
|
||||
```bash
|
||||
vllm bench serve \
|
||||
--backend openai-chat \
|
||||
--model Qwen/Qwen2.5-VL-7B-Instruct \
|
||||
--dataset-name sharegpt \
|
||||
--dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \
|
||||
--num-prompts 100 \
|
||||
--save-result \
|
||||
--result-dir ~/vllm_benchmark_results \
|
||||
--save-detailed \
|
||||
--endpoint /v1/chat/completion
|
||||
```
|
||||
|
||||
### Synthetic Random Images (random-mm)
|
||||
|
||||
Generate synthetic image inputs alongside random text prompts to stress-test vision models without external datasets.
|
||||
|
||||
Notes:
|
||||
|
||||
- Works only with online benchmark via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
|
||||
- Video sampling is not yet implemented.
|
||||
|
||||
Start the server (example):
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
|
||||
--dtype bfloat16 \
|
||||
--max-model-len 16384 \
|
||||
--limit-mm-per-prompt '{"image": 3, "video": 0}' \
|
||||
--mm-processor-kwargs max_pixels=1003520
|
||||
```
|
||||
|
||||
Benchmark. It is recommended to use the flag `--ignore-eos` to simulate real responses. You can set the size of the output via the arg `random-output-len`.
|
||||
|
||||
Ex.1: Fixed number of items and a single image resolution, enforcing generation of approx 40 tokens:
|
||||
|
||||
```bash
|
||||
vllm bench serve \
|
||||
--backend openai-chat \
|
||||
--model Qwen/Qwen2.5-VL-3B-Instruct \
|
||||
--endpoint /v1/chat/completions \
|
||||
--dataset-name random-mm \
|
||||
--num-prompts 100 \
|
||||
--max-concurrency 10 \
|
||||
--random-prefix-len 25 \
|
||||
--random-input-len 300 \
|
||||
--random-output-len 40 \
|
||||
--random-range-ratio 0.2 \
|
||||
--random-mm-base-items-per-request 2 \
|
||||
--random-mm-limit-mm-per-prompt '{"image": 3, "video": 0}' \
|
||||
--random-mm-bucket-config '{(224, 224, 1): 1.0}' \
|
||||
--request-rate inf \
|
||||
--ignore-eos \
|
||||
--seed 42
|
||||
```
|
||||
|
||||
The number of items per request can be controlled by passing multiple image buckets:
|
||||
|
||||
```bash
|
||||
--random-mm-base-items-per-request 2 \
|
||||
--random-mm-num-mm-items-range-ratio 0.5 \
|
||||
--random-mm-limit-mm-per-prompt '{"image": 4, "video": 0}' \
|
||||
--random-mm-bucket-config '{(256, 256, 1): 0.7, (720, 1280, 1): 0.3}' \
|
||||
```
|
||||
|
||||
Flags specific to `random-mm`:
|
||||
|
||||
- `--random-mm-base-items-per-request`: base number of multimodal items per request.
|
||||
- `--random-mm-num-mm-items-range-ratio`: vary item count uniformly in the closed integer range [floor(n·(1−r)), ceil(n·(1+r))]. Set r=0 to keep it fixed; r=1 allows 0 items.
|
||||
- `--random-mm-limit-mm-per-prompt`: per-modality hard caps, e.g. '{"image": 3, "video": 0}'.
|
||||
- `--random-mm-bucket-config`: dict mapping (H, W, T) → probability. Entries with probability 0 are removed; remaining probabilities are renormalized to sum to 1. Use T=1 for images. Set any T>1 for videos (video sampling not yet supported).
|
||||
|
||||
Behavioral notes:
|
||||
|
||||
- If the requested base item count cannot be satisfied under the provided per-prompt limits, the tool raises an error rather than silently clamping.
|
||||
|
||||
How sampling works:
|
||||
|
||||
- Determine per-request item count k by sampling uniformly from the integer range defined by `--random-mm-base-items-per-request` and `--random-mm-num-mm-items-range-ratio`, then clamp k to at most the sum of per-modality limits.
|
||||
- For each of the k items, sample a bucket (H, W, T) according to the normalized probabilities in `--random-mm-bucket-config`, while tracking how many items of each modality have been added.
|
||||
- If a modality (e.g., image) reaches its limit from `--random-mm-limit-mm-per-prompt`, all buckets of that modality are excluded and the remaining bucket probabilities are renormalized before continuing.
|
||||
This should be seen as an edge case, and if this behavior can be avoided by setting `--random-mm-limit-mm-per-prompt` to a large number. Note that this might result in errors due to engine config `--limit-mm-per-prompt`.
|
||||
- The resulting request contains synthetic image data in `multi_modal_data` (OpenAI Chat format). When `random-mm` is used with the OpenAI Chat backend, prompts remain text and MM content is attached via `multi_modal_data`.
|
||||
|
||||
</details>
|
||||
- <https://docs.vllm.ai/en/latest/cli/bench/latency.html>
|
||||
- <https://docs.vllm.ai/en/latest/cli/bench/serve.html>
|
||||
- <https://docs.vllm.ai/en/latest/cli/bench/throughput.html>
|
||||
|
||||
@ -149,3 +149,70 @@ The script follows a systematic process to find the optimal parameters:
|
||||
4. **Track Best Result**: Throughout the process, the script tracks the parameter combination that has yielded the highest valid throughput so far.
|
||||
|
||||
5. **Profile Collection**: For the best-performing run, the script saves the vLLM profiler output, which can be used for deep-dive performance analysis with tools like TensorBoard.
|
||||
|
||||
## Batched `auto_tune`
|
||||
|
||||
The `batch_auto_tune.sh` script allows you to run multiple `auto_tune.sh` experiments sequentially from a single configuration file. It iterates through a list of parameter sets, executes `auto_tune.sh` for each, and records the results back into the input file.
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- **jq**: This script requires `jq` to parse the JSON configuration file.
|
||||
- **gcloud**: If you plan to upload results to Google Cloud Storage, the `gcloud` CLI must be installed and authenticated.
|
||||
|
||||
### How to Run
|
||||
|
||||
1. **Create a JSON configuration file**: Create a file (e.g., `runs_config.json`) containing an array of JSON objects. Each object defines the parameters for a single `auto_tune.sh` run.
|
||||
|
||||
2. **Execute the script**:
|
||||
|
||||
```bash
|
||||
bash batch_auto_tune.sh <path_to_json_file> [gcs_upload_path]
|
||||
```
|
||||
|
||||
- `<path_to_json_file>`: **Required.** Path to your JSON configuration file.
|
||||
- `[gcs_upload_path]`: **Optional.** A GCS path (e.g., `gs://my-bucket/benchmark-results`) where the detailed results and profiles for each run will be uploaded. If this is empty, the results will be available on the local filesystem (see the log for `RESULT_FILE=/path/to/results/file.txt`).
|
||||
|
||||
### Configuration File
|
||||
|
||||
The JSON configuration file should contain an array of objects. Each object's keys correspond to the configuration variables for `auto_tune.sh` (see the [Configuration table above](#configuration)). These keys will be converted to uppercase environment variables for each run.
|
||||
|
||||
Here is an example `runs_config.json` with two benchmark configurations:
|
||||
|
||||
```json
|
||||
[
|
||||
{
|
||||
"base": "/home/user",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"system": "TPU", # OR GPU
|
||||
"tp": 8,
|
||||
"input_len": 128,
|
||||
"output_len": 2048,
|
||||
"max_model_len": 2300,
|
||||
"num_seqs_list": "128 256",
|
||||
"num_batched_tokens_list": "8192 16384"
|
||||
},
|
||||
{
|
||||
"base": "/home/user",
|
||||
"model": "meta-llama/Llama-3.1-70B-Instruct",
|
||||
"system": "TPU", # OR GPU
|
||||
"tp": 8,
|
||||
"input_len": 4000,
|
||||
"output_len": 16,
|
||||
"max_model_len": 4096,
|
||||
"num_seqs_list": "64 128",
|
||||
"num_batched_tokens_list": "4096 8192",
|
||||
"max_latency_allowed_ms": 500
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
### Output
|
||||
|
||||
The script modifies the input JSON file in place, adding the results of each run to the corresponding object. The following fields are added:
|
||||
|
||||
- `run_id`: A unique identifier for the run, derived from the timestamp.
|
||||
- `status`: The outcome of the run (`SUCCESS`, `FAILURE`, or `WARNING_NO_RESULT_FILE`).
|
||||
- `results`: The content of the `result.txt` file from the `auto_tune.sh` run.
|
||||
- `gcs_results`: The GCS URL where the run's artifacts are stored (if a GCS path was provided).
|
||||
|
||||
A summary of successful and failed runs is also printed to the console upon completion.
|
||||
|
||||
128
benchmarks/auto_tune/batch_auto_tune.sh
Executable file
128
benchmarks/auto_tune/batch_auto_tune.sh
Executable file
@ -0,0 +1,128 @@
|
||||
#!/bin/bash
|
||||
|
||||
INPUT_JSON="$1"
|
||||
GCS_PATH="$2" # Optional GCS path for uploading results for each run
|
||||
|
||||
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
|
||||
AUTOTUNE_SCRIPT="$SCRIPT_DIR/auto_tune.sh"
|
||||
|
||||
if [[ -z "$INPUT_JSON" ]]; then
|
||||
echo "Error: Input JSON file not provided."
|
||||
echo "Usage: $0 <path_to_json_file> [gcs_upload_path]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ ! -f "$INPUT_JSON" ]]; then
|
||||
echo "Error: File not found at '$INPUT_JSON'"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if ! command -v jq &> /dev/null; then
|
||||
echo "Error: 'jq' command not found. Please install jq to process the JSON input."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ -n "$GCS_PATH" ]] && ! command -v gcloud &> /dev/null; then
|
||||
echo "Error: 'gcloud' command not found, but a GCS_PATH was provided."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
SUCCESS_COUNT=0
|
||||
FAILURE_COUNT=0
|
||||
FAILED_RUNS=()
|
||||
SCRIPT_START_TIME=$(date +%s)
|
||||
|
||||
json_content=$(cat "$INPUT_JSON")
|
||||
if ! num_runs=$(echo "$json_content" | jq 'length'); then
|
||||
echo "Error: Invalid JSON in $INPUT_JSON. 'jq' failed to get array length." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Found $num_runs benchmark configurations in $INPUT_JSON."
|
||||
echo "Starting benchmark runs..."
|
||||
echo "--------------------------------------------------"
|
||||
|
||||
for i in $(seq 0 $(($num_runs - 1))); do
|
||||
run_object=$(echo "$json_content" | jq ".[$i]")
|
||||
|
||||
RUN_START_TIME=$(date +%s)
|
||||
ENV_VARS_ARRAY=()
|
||||
# Dynamically create env vars from the JSON object's keys
|
||||
for key in $(echo "$run_object" | jq -r 'keys_unsorted[]'); do
|
||||
value=$(echo "$run_object" | jq -r ".$key")
|
||||
var_name=$(echo "$key" | tr '[:lower:]' '[:upper:]' | tr -cd 'A-Z0-9_')
|
||||
ENV_VARS_ARRAY+=("${var_name}=${value}")
|
||||
done
|
||||
|
||||
echo "Executing run #$((i+1))/$num_runs with parameters: ${ENV_VARS_ARRAY[*]}"
|
||||
|
||||
# Execute auto_tune.sh and capture output
|
||||
RUN_OUTPUT_FILE=$(mktemp)
|
||||
if env "${ENV_VARS_ARRAY[@]}" bash "$AUTOTUNE_SCRIPT" > >(tee -a "$RUN_OUTPUT_FILE") 2>&1; then
|
||||
STATUS="SUCCESS"
|
||||
((SUCCESS_COUNT++))
|
||||
else
|
||||
STATUS="FAILURE"
|
||||
((FAILURE_COUNT++))
|
||||
FAILED_RUNS+=("Run #$((i+1)): $(echo $run_object | jq -c .)")
|
||||
fi
|
||||
|
||||
RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE")
|
||||
rm "$RUN_OUTPUT_FILE"
|
||||
|
||||
# Parse results and optionally upload them to GCS
|
||||
RUN_ID=""
|
||||
RESULTS=""
|
||||
GCS_RESULTS_URL=""
|
||||
if [[ "$STATUS" == "SUCCESS" ]]; then
|
||||
RESULT_FILE_PATH=$(echo "$RUN_OUTPUT" | grep 'RESULT_FILE=' | tail -n 1 | cut -d'=' -f2 | tr -s '/' || true)
|
||||
|
||||
if [[ -n "$RESULT_FILE_PATH" && -f "$RESULT_FILE_PATH" ]]; then
|
||||
RUN_ID=$(basename "$(dirname "$RESULT_FILE_PATH")")
|
||||
RESULT_DIR=$(dirname "$RESULT_FILE_PATH")
|
||||
RESULTS=$(cat "$RESULT_FILE_PATH")
|
||||
|
||||
if [[ -n "$GCS_PATH" ]]; then
|
||||
GCS_RESULTS_URL="${GCS_PATH}/${RUN_ID}"
|
||||
echo "Uploading results to GCS..."
|
||||
if gcloud storage rsync --recursive "$RESULT_DIR/" "$GCS_RESULTS_URL"; then
|
||||
echo "GCS upload successful."
|
||||
else
|
||||
echo "Warning: GCS upload failed for RUN_ID $RUN_ID."
|
||||
fi
|
||||
fi
|
||||
else
|
||||
echo "Warning: Could not find result file for a successful run."
|
||||
STATUS="WARNING_NO_RESULT_FILE"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Add the results back into the JSON object for this run
|
||||
json_content=$(echo "$json_content" | jq --argjson i "$i" --arg run_id "$RUN_ID" --arg status "$STATUS" --arg results "$RESULTS" --arg gcs_results "$GCS_RESULTS_URL" \
|
||||
'.[$i] += {run_id: $run_id, status: $status, results: $results, gcs_results: $gcs_results}')
|
||||
|
||||
RUN_END_TIME=$(date +%s)
|
||||
echo "Run finished in $((RUN_END_TIME - RUN_START_TIME)) seconds. Status: $STATUS"
|
||||
echo "--------------------------------------------------"
|
||||
|
||||
# Save intermediate progress back to the file
|
||||
echo "$json_content" > "$INPUT_JSON.tmp" && mv "$INPUT_JSON.tmp" "$INPUT_JSON"
|
||||
|
||||
done
|
||||
|
||||
SCRIPT_END_TIME=$(date +%s)
|
||||
echo "All benchmark runs completed in $((SCRIPT_END_TIME - SCRIPT_START_TIME)) seconds."
|
||||
echo
|
||||
echo "====================== SUMMARY ======================"
|
||||
echo "Successful runs: $SUCCESS_COUNT"
|
||||
echo "Failed runs: $FAILURE_COUNT"
|
||||
echo "==================================================="
|
||||
|
||||
if [[ $FAILURE_COUNT -gt 0 ]]; then
|
||||
echo "Details of failed runs (see JSON file for full parameters):"
|
||||
for failed in "${FAILED_RUNS[@]}"; do
|
||||
echo " - $failed"
|
||||
done
|
||||
fi
|
||||
|
||||
echo "Updated results have been saved to '$INPUT_JSON'."
|
||||
File diff suppressed because it is too large
Load Diff
@ -696,11 +696,11 @@ def evaluate(ret, args):
|
||||
return re.match(args.regex, actual) is not None
|
||||
|
||||
def _eval_correctness(expected, actual):
|
||||
if args.structure_type == "guided_json":
|
||||
if args.structure_type == "json":
|
||||
return _eval_correctness_json(expected, actual)
|
||||
elif args.structure_type == "guided_regex":
|
||||
elif args.structure_type == "regex":
|
||||
return _eval_correctness_regex(expected, actual)
|
||||
elif args.structure_type == "guided_choice":
|
||||
elif args.structure_type == "choice":
|
||||
return _eval_correctness_choice(expected, actual)
|
||||
else:
|
||||
return None
|
||||
@ -780,18 +780,18 @@ def main(args: argparse.Namespace):
|
||||
)
|
||||
|
||||
if args.dataset == "grammar":
|
||||
args.structure_type = "guided_grammar"
|
||||
args.structure_type = "grammar"
|
||||
elif args.dataset == "regex":
|
||||
args.structure_type = "guided_regex"
|
||||
args.structure_type = "regex"
|
||||
elif args.dataset == "choice":
|
||||
args.structure_type = "guided_choice"
|
||||
args.structure_type = "choice"
|
||||
else:
|
||||
args.structure_type = "guided_json"
|
||||
args.structure_type = "json"
|
||||
|
||||
if args.no_structured_output:
|
||||
args.structured_output_ratio = 0
|
||||
if args.save_results:
|
||||
result_file_name = f"{args.structured_output_ratio}guided"
|
||||
result_file_name = f"{args.structured_output_ratio}so"
|
||||
result_file_name += f"_{backend}"
|
||||
result_file_name += f"_{args.request_rate}qps"
|
||||
result_file_name += f"_{args.model.split('/')[-1]}"
|
||||
|
||||
@ -2,14 +2,25 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import itertools
|
||||
from typing import Callable
|
||||
from unittest.mock import patch
|
||||
|
||||
import pandas as pd
|
||||
import torch
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
|
||||
|
||||
|
||||
def with_triton_mode(fn):
|
||||
"""Temporarily force the Triton fallback path"""
|
||||
|
||||
def wrapped(*args, **kwargs):
|
||||
with patch("vllm.platforms.current_platform.is_cuda", return_value=False):
|
||||
return fn(*args, **kwargs)
|
||||
|
||||
return wrapped
|
||||
|
||||
|
||||
# TODO(luka): use standalone_compile utility
|
||||
@ -21,78 +32,236 @@ def with_dyn_arg(fn: Callable, arg_index: int, dim_index: int):
|
||||
return inner
|
||||
|
||||
|
||||
torch._dynamo.config.recompile_limit = 8888
|
||||
compilation_config = CompilationConfig(custom_ops=["none"])
|
||||
with set_current_vllm_config(VllmConfig(compilation_config=compilation_config)):
|
||||
torch_per_token_quant_fp8 = torch.compile(
|
||||
QuantFP8(False, GroupShape.PER_TOKEN),
|
||||
fullgraph=True,
|
||||
dynamic=False, # recompile for different shapes
|
||||
)
|
||||
def bench_compile(fn: Callable):
|
||||
# recompile for different shapes
|
||||
fwd = torch.compile(fn, fullgraph=True, dynamic=False)
|
||||
|
||||
# First dim is explicitly dynamic to simulate vLLM usage
|
||||
torch_per_token_quant_fp8 = with_dyn_arg(torch_per_token_quant_fp8, 0, 0)
|
||||
return with_dyn_arg(fwd, 0, 0)
|
||||
|
||||
|
||||
def cuda_per_token_quant_fp8(
|
||||
input: torch.Tensor,
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
return ops.scaled_fp8_quant(input)
|
||||
torch._dynamo.config.recompile_limit = 8888
|
||||
|
||||
|
||||
def calculate_diff(batch_size: int, seq_len: int):
|
||||
"""Calculate difference between Triton and CUDA implementations."""
|
||||
def calculate_diff(
|
||||
batch_size: int,
|
||||
hidden_size: int,
|
||||
group_shape: GroupShape,
|
||||
dtype: torch.dtype,
|
||||
):
|
||||
"""Calculate the difference between Inductor and CUDA implementations."""
|
||||
device = torch.device("cuda")
|
||||
x = torch.rand((batch_size * seq_len, 4096), dtype=torch.float16, device=device)
|
||||
x = torch.rand((batch_size * hidden_size, 4096), dtype=dtype, device=device)
|
||||
|
||||
torch_out, torch_scale = torch_per_token_quant_fp8(x)
|
||||
cuda_out, cuda_scale = cuda_per_token_quant_fp8(x)
|
||||
quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False)
|
||||
|
||||
if torch.allclose(
|
||||
cuda_out.to(torch.float32), torch_out.to(torch.float32), rtol=1e-3, atol=1e-5
|
||||
) and torch.allclose(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5):
|
||||
torch_out, torch_scale = bench_compile(quant_fp8.forward_native)(x)
|
||||
torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x)
|
||||
cuda_out, cuda_scale = quant_fp8.forward_cuda(x)
|
||||
|
||||
out_allclose = lambda o1, o2: torch.allclose(
|
||||
o1.to(torch.float32),
|
||||
o2.to(torch.float32),
|
||||
rtol=1e-3,
|
||||
atol=1e-5,
|
||||
)
|
||||
scale_allclose = lambda s1, s2: torch.allclose(s1, s2, rtol=1e-3, atol=1e-5)
|
||||
|
||||
if (
|
||||
out_allclose(cuda_out, torch_out)
|
||||
and scale_allclose(cuda_scale, torch_scale)
|
||||
and out_allclose(cuda_out, torch_eager_out)
|
||||
and scale_allclose(cuda_scale, torch_eager_scale)
|
||||
):
|
||||
print("✅ All implementations match")
|
||||
else:
|
||||
print("❌ Implementations differ")
|
||||
|
||||
|
||||
batch_size_range = [1, 16, 32, 64, 128]
|
||||
seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
|
||||
|
||||
configs = list(itertools.product(batch_size_range, seq_len_range))
|
||||
configs = []
|
||||
|
||||
|
||||
@triton.testing.perf_report(
|
||||
triton.testing.Benchmark(
|
||||
x_names=["batch_size", "seq_len"],
|
||||
x_vals=configs,
|
||||
line_arg="provider",
|
||||
line_vals=["torch", "cuda"],
|
||||
line_names=["Torch", "CUDA"],
|
||||
styles=[("blue", "-"), ("green", "-")],
|
||||
ylabel="us",
|
||||
plot_name="per-token-dynamic-quant-fp8-performance",
|
||||
args={},
|
||||
)
|
||||
)
|
||||
def benchmark_quantization(batch_size, seq_len, provider):
|
||||
dtype = torch.float16
|
||||
def benchmark_quantization(
|
||||
batch_size,
|
||||
hidden_size,
|
||||
provider,
|
||||
group_shape: GroupShape,
|
||||
col_major: bool,
|
||||
dtype: torch.dtype,
|
||||
):
|
||||
device = torch.device("cuda")
|
||||
|
||||
x = torch.randn(batch_size * seq_len, 4096, device=device, dtype=dtype)
|
||||
x = torch.randn(batch_size * hidden_size, 4096, device=device, dtype=dtype)
|
||||
|
||||
quantiles = [0.5, 0.2, 0.8]
|
||||
quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major)
|
||||
|
||||
if provider == "torch":
|
||||
fn = lambda: torch_per_token_quant_fp8(x.clone())
|
||||
fn = lambda: bench_compile(quant_fp8.forward_native)(x.clone())
|
||||
elif provider == "cuda":
|
||||
fn = lambda: cuda_per_token_quant_fp8(x.clone())
|
||||
fn = lambda: quant_fp8.forward_cuda(x.clone())
|
||||
elif provider == "triton":
|
||||
if not group_shape.is_per_group():
|
||||
# Triton only supported for per-group
|
||||
return 0, 0, 0
|
||||
|
||||
fn = lambda: with_triton_mode(quant_fp8.forward_cuda)(x.clone())
|
||||
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(fn, quantiles=quantiles)
|
||||
|
||||
return 1000 * ms, 1000 * max_ms, 1000 * min_ms
|
||||
|
||||
|
||||
# TODO(luka) extract to utils
|
||||
def compute_geomean_speedups(
|
||||
df: pd.DataFrame,
|
||||
baseline_col: str,
|
||||
speedup_cols: list[str],
|
||||
groupby_cols: list[str] | None = None,
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
Compute geometric mean speedups over a baseline column.
|
||||
|
||||
Args:
|
||||
df: Input dataframe
|
||||
baseline_col: Column to use as baseline
|
||||
speedup_cols: Columns to compute speedups for
|
||||
groupby_cols: Columns to group by. If None, compute over entire df.
|
||||
|
||||
Returns:
|
||||
pd.DataFrame with geometric mean speedups
|
||||
"""
|
||||
from scipy.stats import gmean
|
||||
|
||||
def geo_speedup(group: pd.DataFrame) -> pd.Series:
|
||||
ratios = {
|
||||
col: (group[baseline_col] / group[col]).values for col in speedup_cols
|
||||
}
|
||||
return pd.Series({col: gmean(vals) for col, vals in ratios.items()})
|
||||
|
||||
if groupby_cols is None:
|
||||
result = geo_speedup(df).to_frame().T
|
||||
else:
|
||||
result = (
|
||||
df.groupby(groupby_cols)
|
||||
.apply(geo_speedup, include_groups=False)
|
||||
.reset_index()
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
calculate_diff(batch_size=4, seq_len=4096)
|
||||
benchmark_quantization.run(print_data=True)
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark the various implementations of QuantFP8 (dynamic-only)"
|
||||
)
|
||||
parser.add_argument("-c", "--check", action="store_true")
|
||||
parser.add_argument(
|
||||
"--dtype", type=str, choices=["half", "bfloat16", "float"], default="half"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--hidden-sizes",
|
||||
type=int,
|
||||
nargs="+",
|
||||
default=None,
|
||||
help="Hidden sizes to benchmark (default: 1,16,64,128,256,512,1024,2048,4096)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--batch-sizes",
|
||||
type=int,
|
||||
nargs="+",
|
||||
default=None,
|
||||
help="Batch sizes to benchmark (default: 1,16,32,64,128)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--group-sizes",
|
||||
type=int,
|
||||
nargs="+",
|
||||
default=None,
|
||||
help="Group sizes for GroupShape(1,N) to benchmark. "
|
||||
"Use 0 for PER_TENSOR, -1 for PER_TOKEN (default: 0,-1,64,128)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-column-major",
|
||||
action="store_true",
|
||||
help="Disable column-major scales testing",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
assert args
|
||||
|
||||
dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
|
||||
|
||||
hidden_sizes = args.hidden_sizes or [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
|
||||
batch_sizes = args.batch_sizes or [1, 16, 32, 64, 128]
|
||||
|
||||
if args.group_sizes is not None:
|
||||
group_shapes = []
|
||||
for size in args.group_sizes:
|
||||
if size == 0:
|
||||
group_shapes.append(GroupShape.PER_TENSOR)
|
||||
elif size == -1:
|
||||
group_shapes.append(GroupShape.PER_TOKEN)
|
||||
else:
|
||||
group_shapes.append(GroupShape(1, size))
|
||||
else:
|
||||
group_shapes = [
|
||||
GroupShape.PER_TENSOR,
|
||||
GroupShape.PER_TOKEN,
|
||||
GroupShape(1, 64),
|
||||
GroupShape(1, 128),
|
||||
]
|
||||
|
||||
column_major_scales = [False] if args.no_column_major else [True, False]
|
||||
|
||||
config_gen = itertools.product(
|
||||
group_shapes,
|
||||
column_major_scales,
|
||||
batch_sizes,
|
||||
hidden_sizes,
|
||||
)
|
||||
|
||||
# filter out column-major scales for non-group, reverse order
|
||||
configs.extend(c[::-1] for c in config_gen if (c[0].is_per_group() or not c[1]))
|
||||
|
||||
print(f"Running {len(configs)} configurations:")
|
||||
print(f" Hidden sizes: {hidden_sizes}")
|
||||
print(f" Batch sizes: {batch_sizes}")
|
||||
print(f" Group shapes: {[str(g) for g in group_shapes]}")
|
||||
print(f" Column major scales: {column_major_scales}")
|
||||
print()
|
||||
|
||||
if args.check:
|
||||
for group_shape in group_shapes:
|
||||
group_size = group_shape[1]
|
||||
print(f"{group_size=}")
|
||||
calculate_diff(
|
||||
batch_size=4, hidden_size=4096, group_shape=group_shape, dtype=dtype
|
||||
)
|
||||
|
||||
benchmark = triton.testing.perf_report(
|
||||
triton.testing.Benchmark(
|
||||
x_names=["hidden_size", "batch_size", "col_major", "group_shape"],
|
||||
x_vals=configs,
|
||||
line_arg="provider",
|
||||
line_vals=["torch", "cuda", "triton"],
|
||||
line_names=["Torch (Compiled)", "CUDA", "Triton"],
|
||||
styles=[("blue", "-"), ("green", "-"), ("black", "-")],
|
||||
ylabel="us",
|
||||
plot_name="QuantFP8 performance",
|
||||
args={},
|
||||
)
|
||||
)(benchmark_quantization)
|
||||
|
||||
df = benchmark.run(print_data=True, dtype=dtype, return_df=True)
|
||||
|
||||
# Print geomean speedups
|
||||
geo_table_grouped = compute_geomean_speedups(
|
||||
df,
|
||||
baseline_col="Torch (Compiled)",
|
||||
speedup_cols=["CUDA", "Triton"],
|
||||
groupby_cols=["col_major", "group_shape"],
|
||||
)
|
||||
|
||||
print("Speedup over Torch (Compiled)")
|
||||
print(geo_table_grouped.to_string(index=False))
|
||||
|
||||
@ -13,6 +13,10 @@ import torch.utils.benchmark as benchmark
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.model_executor.layers.fused_moe.config import (
|
||||
fp8_w8a8_moe_quant_config,
|
||||
nvfp4_moe_quant_config,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
|
||||
from vllm.scalar_type import scalar_types
|
||||
@ -140,6 +144,12 @@ def bench_run(
|
||||
a_fp8_scale: torch.Tensor,
|
||||
num_repeats: int,
|
||||
):
|
||||
quant_config = fp8_w8a8_moe_quant_config(
|
||||
w1_scale=w1_scale,
|
||||
w2_scale=w2_scale,
|
||||
a1_scale=a_fp8_scale,
|
||||
)
|
||||
|
||||
for _ in range(num_repeats):
|
||||
fused_experts(
|
||||
a,
|
||||
@ -147,10 +157,7 @@ def bench_run(
|
||||
w2,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
use_fp8_w8a8=True,
|
||||
w1_scale=w1_scale,
|
||||
w2_scale=w2_scale,
|
||||
a1_scale=a_fp8_scale,
|
||||
quant_config=quant_config,
|
||||
)
|
||||
|
||||
def run_cutlass_moe_fp4(
|
||||
@ -172,25 +179,27 @@ def bench_run(
|
||||
device: torch.device,
|
||||
num_repeats: int,
|
||||
):
|
||||
quant_config = nvfp4_moe_quant_config(
|
||||
a1_gscale=a1_gs,
|
||||
a2_gscale=a2_gs,
|
||||
w1_scale=w1_blockscale,
|
||||
w2_scale=w2_blockscale,
|
||||
g1_alphas=w1_gs,
|
||||
g2_alphas=w2_gs,
|
||||
)
|
||||
for _ in range(num_repeats):
|
||||
with nvtx.annotate("cutlass_moe_fp4", color="green"):
|
||||
cutlass_moe_fp4(
|
||||
a=a,
|
||||
a1_gscale=a1_gs,
|
||||
a2_gscale=a2_gs,
|
||||
w1_fp4=w1_fp4,
|
||||
w1_blockscale=w1_blockscale,
|
||||
w1_alphas=w1_gs,
|
||||
w2_fp4=w2_fp4,
|
||||
w2_blockscale=w2_blockscale,
|
||||
w2_alphas=w2_gs,
|
||||
topk_weights=topk_weights,
|
||||
topk_ids=topk_ids,
|
||||
m=m,
|
||||
n=n,
|
||||
k=k,
|
||||
e=num_experts,
|
||||
device=device,
|
||||
quant_config=quant_config,
|
||||
)
|
||||
|
||||
def run_cutlass_from_graph(
|
||||
@ -211,26 +220,29 @@ def bench_run(
|
||||
e: int,
|
||||
device: torch.device,
|
||||
):
|
||||
quant_config = nvfp4_moe_quant_config(
|
||||
a1_gscale=a1_gs,
|
||||
a2_gscale=a2_gs,
|
||||
w1_scale=w1_blockscale,
|
||||
w2_scale=w2_blockscale,
|
||||
g1_alphas=w1_gs,
|
||||
g2_alphas=w2_gs,
|
||||
)
|
||||
|
||||
with set_current_vllm_config(
|
||||
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
|
||||
):
|
||||
return cutlass_moe_fp4(
|
||||
a=a,
|
||||
a1_gscale=a1_gs,
|
||||
w1_fp4=w1_fp4,
|
||||
w1_blockscale=w1_blockscale,
|
||||
w1_alphas=w1_alphas,
|
||||
a2_gscale=a2_gs,
|
||||
w2_fp4=w2_fp4,
|
||||
w2_blockscale=w2_blockscale,
|
||||
w2_alphas=w2_alphas,
|
||||
topk_weights=topk_weights,
|
||||
topk_ids=topk_ids,
|
||||
m=m,
|
||||
n=n,
|
||||
k=k,
|
||||
e=num_experts,
|
||||
device=device,
|
||||
quant_config=quant_config,
|
||||
)
|
||||
|
||||
def run_triton_from_graph(
|
||||
@ -246,16 +258,18 @@ def bench_run(
|
||||
with set_current_vllm_config(
|
||||
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
|
||||
):
|
||||
quant_config = fp8_w8a8_moe_quant_config(
|
||||
w1_scale=w1_scale,
|
||||
w2_scale=w2_scale,
|
||||
a1_scale=a_fp8_scale,
|
||||
)
|
||||
return fused_experts(
|
||||
a,
|
||||
w1,
|
||||
w2,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
use_fp8_w8a8=True,
|
||||
w1_scale=w1_scale,
|
||||
w2_scale=w2_scale,
|
||||
a1_scale=a_fp8_scale,
|
||||
quant_config=quant_config,
|
||||
)
|
||||
|
||||
def replay_graph(graph, num_repeats):
|
||||
|
||||
@ -7,6 +7,7 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
|
||||
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import (
|
||||
fused_experts,
|
||||
@ -96,6 +97,11 @@ def bench_run(
|
||||
a_scale: torch.Tensor,
|
||||
num_repeats: int,
|
||||
):
|
||||
quant_config = fp8_w8a8_moe_quant_config(
|
||||
w1_scale=w1_scale,
|
||||
w2_scale=w2_scale,
|
||||
a1_scale=a_scale,
|
||||
)
|
||||
for _ in range(num_repeats):
|
||||
fused_experts(
|
||||
a,
|
||||
@ -103,10 +109,7 @@ def bench_run(
|
||||
w2,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
use_fp8_w8a8=True,
|
||||
w1_scale=w1_scale,
|
||||
w2_scale=w2_scale,
|
||||
a1_scale=a_scale,
|
||||
quant_config=quant_config,
|
||||
)
|
||||
|
||||
def run_cutlass_moe(
|
||||
@ -125,6 +128,12 @@ def bench_run(
|
||||
per_act_token: bool,
|
||||
num_repeats: int,
|
||||
):
|
||||
quant_config = fp8_w8a8_moe_quant_config(
|
||||
w1_scale=w1_scale,
|
||||
w2_scale=w2_scale,
|
||||
per_act_token_quant=per_act_token,
|
||||
)
|
||||
|
||||
for _ in range(num_repeats):
|
||||
cutlass_moe_fp8(
|
||||
a,
|
||||
@ -132,14 +141,11 @@ def bench_run(
|
||||
w2,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
w1_scale,
|
||||
w2_scale,
|
||||
ab_strides1,
|
||||
ab_strides2,
|
||||
c_strides1,
|
||||
c_strides2,
|
||||
per_act_token,
|
||||
a1_scale=None,
|
||||
quant_config=quant_config,
|
||||
)
|
||||
|
||||
def run_cutlass_from_graph(
|
||||
@ -156,6 +162,12 @@ def bench_run(
|
||||
topk_weights: torch.Tensor,
|
||||
topk_ids: torch.Tensor,
|
||||
):
|
||||
quant_config = fp8_w8a8_moe_quant_config(
|
||||
w1_scale=w1_scale,
|
||||
w2_scale=w2_scale,
|
||||
per_act_token_quant=per_act_token,
|
||||
)
|
||||
|
||||
with set_current_vllm_config(
|
||||
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
|
||||
):
|
||||
@ -165,14 +177,11 @@ def bench_run(
|
||||
w2_q,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
w1_scale,
|
||||
w2_scale,
|
||||
ab_strides1,
|
||||
ab_strides2,
|
||||
c_strides1,
|
||||
c_strides2,
|
||||
per_act_token,
|
||||
a1_scale=None,
|
||||
quant_config=quant_config,
|
||||
)
|
||||
|
||||
def run_triton_from_graph(
|
||||
@ -185,6 +194,11 @@ def bench_run(
|
||||
w2_scale: torch.Tensor,
|
||||
a_scale: torch.Tensor,
|
||||
):
|
||||
quant_config = fp8_w8a8_moe_quant_config(
|
||||
w1_scale=w1_scale,
|
||||
w2_scale=w2_scale,
|
||||
a1_scale=a_scale,
|
||||
)
|
||||
with set_current_vllm_config(
|
||||
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
|
||||
):
|
||||
@ -194,10 +208,7 @@ def bench_run(
|
||||
w2,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
use_fp8_w8a8=True,
|
||||
w1_scale=w1_scale,
|
||||
w2_scale=w2_scale,
|
||||
a1_scale=a_scale,
|
||||
quant_config=quant_config,
|
||||
)
|
||||
|
||||
def replay_graph(graph, num_repeats):
|
||||
|
||||
@ -464,7 +464,11 @@ class BenchmarkTensors:
|
||||
for field_name in LoRAKernelMeta.__dataclass_fields__:
|
||||
field = getattr(self.lora_kernel_meta, field_name)
|
||||
assert isinstance(field, torch.Tensor)
|
||||
setattr(self.lora_kernel_meta, field_name, to_device(field))
|
||||
setattr(
|
||||
self.lora_kernel_meta,
|
||||
field_name,
|
||||
to_device(field) if field_name != "no_lora_flag_cpu" else field,
|
||||
)
|
||||
|
||||
def metadata(self) -> tuple[int, int, int]:
|
||||
"""
|
||||
@ -512,6 +516,7 @@ class BenchmarkTensors:
|
||||
"lora_token_start_loc": self.lora_kernel_meta.lora_token_start_loc,
|
||||
"lora_ids": self.lora_kernel_meta.active_lora_ids,
|
||||
"scaling": 1.0,
|
||||
"no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
|
||||
}
|
||||
|
||||
def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
|
||||
@ -552,6 +557,7 @@ class BenchmarkTensors:
|
||||
"lora_ids": self.lora_kernel_meta.active_lora_ids,
|
||||
"offset_start": 0,
|
||||
"add_inputs": add_inputs,
|
||||
"no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
|
||||
}
|
||||
|
||||
def bench_fn_kwargs(
|
||||
|
||||
@ -14,6 +14,10 @@ import ray
|
||||
import torch
|
||||
from ray.experimental.tqdm_ray import tqdm
|
||||
|
||||
from vllm.model_executor.layers.fused_moe.config import (
|
||||
FusedMoEQuantConfig,
|
||||
_get_config_dtype_str,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import *
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.transformers_utils.config import get_config
|
||||
@ -134,43 +138,36 @@ def benchmark_config(
|
||||
def run():
|
||||
from vllm.model_executor.layers.fused_moe import override_config
|
||||
|
||||
if use_fp8_w8a8:
|
||||
quant_dtype = torch.float8_e4m3fn
|
||||
elif use_int8_w8a16:
|
||||
quant_dtype = torch.int8
|
||||
else:
|
||||
quant_dtype = None
|
||||
|
||||
quant_config = FusedMoEQuantConfig.make(
|
||||
quant_dtype=quant_dtype,
|
||||
w1_scale=w1_scale,
|
||||
w2_scale=w2_scale,
|
||||
a1_scale=a1_scale,
|
||||
a2_scale=a2_scale,
|
||||
block_shape=block_quant_shape,
|
||||
)
|
||||
|
||||
with override_config(config):
|
||||
if use_deep_gemm:
|
||||
topk_weights, topk_ids, token_expert_indices = fused_topk(
|
||||
x, input_gating, topk, False
|
||||
)
|
||||
return fused_experts(
|
||||
x,
|
||||
w1,
|
||||
w2,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
inplace=True,
|
||||
use_fp8_w8a8=use_fp8_w8a8,
|
||||
w1_scale=w1_scale,
|
||||
w2_scale=w2_scale,
|
||||
a1_scale=a1_scale,
|
||||
a2_scale=a2_scale,
|
||||
block_shape=block_quant_shape,
|
||||
allow_deep_gemm=True,
|
||||
)
|
||||
else:
|
||||
fused_moe(
|
||||
x,
|
||||
w1,
|
||||
w2,
|
||||
input_gating,
|
||||
topk,
|
||||
renormalize=True,
|
||||
inplace=True,
|
||||
use_fp8_w8a8=use_fp8_w8a8,
|
||||
use_int8_w8a16=use_int8_w8a16,
|
||||
w1_scale=w1_scale,
|
||||
w2_scale=w2_scale,
|
||||
a1_scale=a1_scale,
|
||||
a2_scale=a2_scale,
|
||||
block_shape=block_quant_shape,
|
||||
)
|
||||
topk_weights, topk_ids, token_expert_indices = fused_topk(
|
||||
x, input_gating, topk, renormalize=not use_deep_gemm
|
||||
)
|
||||
return fused_experts(
|
||||
x,
|
||||
w1,
|
||||
w2,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
inplace=True,
|
||||
quant_config=quant_config,
|
||||
allow_deep_gemm=use_deep_gemm,
|
||||
)
|
||||
|
||||
# JIT compilation & warmup
|
||||
run()
|
||||
@ -414,7 +411,7 @@ class BenchmarkWorker:
|
||||
use_deep_gemm: bool = False,
|
||||
) -> tuple[dict[str, int], float]:
|
||||
current_platform.seed_everything(self.seed)
|
||||
dtype_str = get_config_dtype_str(
|
||||
dtype_str = _get_config_dtype_str(
|
||||
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
|
||||
)
|
||||
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
|
||||
@ -547,7 +544,7 @@ def save_configs(
|
||||
block_quant_shape: list[int],
|
||||
save_dir: str,
|
||||
) -> None:
|
||||
dtype_str = get_config_dtype_str(
|
||||
dtype_str = _get_config_dtype_str(
|
||||
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
|
||||
)
|
||||
|
||||
|
||||
@ -55,6 +55,107 @@ output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75
|
||||
----------------------------------------------------------------------------------------------------
|
||||
```
|
||||
|
||||
### JSON configuration file for synthetic conversations generation
|
||||
|
||||
The input flag `--input-file` is used to determine the input conversations for the benchmark.<br/>
|
||||
When the input is a JSON file with the field `"filetype": "generate_conversations"` the tool will generate synthetic multi-turn (questions and answers) conversations.
|
||||
|
||||
The file `generate_multi_turn.json` is an example file.
|
||||
|
||||
The file must contain the sections `prompt_input` and `prompt_output`.
|
||||
|
||||
The `prompt_input` section must contain `num_turns`, `prefix_num_tokens` and `num_tokens`:
|
||||
|
||||
* `num_turns` - Number of total turns in the conversation (both user & assistant).<br/>
|
||||
The final value will always be rounded to an even number so each user turn has a reply.
|
||||
* `prefix_num_tokens` - Tokens added at the start of only the **first user turn** in a conversation (unique per conversation).
|
||||
* `num_tokens` - Total token length of each **user** message (one turn).
|
||||
|
||||
The `prompt_output` section must contain `num_tokens`:
|
||||
|
||||
* `num_tokens` - Total token length of each **assistant** message (one turn).
|
||||
|
||||
### Random distributions for synthetic conversations generation
|
||||
|
||||
When creating an input JSON file (such as `generate_multi_turn.json`),<br/>
|
||||
every numeric field (such as `num_turns` or `num_tokens`) requires a distribution.<br/>
|
||||
The distribution determines how to randomly sample values for the field.
|
||||
|
||||
The available distributions are listed below.
|
||||
|
||||
**Note:** The optional `max` field (for lognormal, zipf, and poisson) can be used to cap sampled values at an upper bound.</br>
|
||||
Can be used to make sure that the total number of tokens in every request does not exceed `--max-model-len`.
|
||||
|
||||
#### constant
|
||||
|
||||
```json
|
||||
{
|
||||
"distribution": "constant",
|
||||
"value": 500
|
||||
}
|
||||
```
|
||||
|
||||
* `value` - the fixed integer value (always returns the same number).
|
||||
|
||||
#### uniform
|
||||
|
||||
```json
|
||||
{
|
||||
"distribution": "uniform",
|
||||
"min": 12,
|
||||
"max": 18
|
||||
}
|
||||
```
|
||||
|
||||
* `min` - minimum value (inclusive).
|
||||
* `max` - maximum value (inclusive), should be equal or larger than min.
|
||||
|
||||
#### lognormal
|
||||
|
||||
```json
|
||||
{
|
||||
"distribution": "lognormal",
|
||||
"average": 1000,
|
||||
"max": 5000
|
||||
}
|
||||
```
|
||||
|
||||
You can parameterize the lognormal distribution in one of two ways:
|
||||
|
||||
Using the average and optional median ratio:
|
||||
|
||||
* `average` - target average value of the distribution.
|
||||
* `median_ratio` - the ratio of the median to the average; controls the skewness. Must be in the range (0, 1).
|
||||
|
||||
Using the parameters of the underlying normal distribution:
|
||||
|
||||
* `mean` - mean of the underlying normal distribution.
|
||||
* `sigma` - standard deviation of the underlying normal distribution.
|
||||
|
||||
#### zipf
|
||||
|
||||
```json
|
||||
{
|
||||
"distribution": "zipf",
|
||||
"alpha": 1.2,
|
||||
"max": 100
|
||||
}
|
||||
```
|
||||
|
||||
* `alpha` - skew parameter (> 1). Larger values produce stronger skew toward smaller integers.
|
||||
|
||||
#### poisson
|
||||
|
||||
```json
|
||||
{
|
||||
"distribution": "poisson",
|
||||
"alpha": 10,
|
||||
"max": 50
|
||||
}
|
||||
```
|
||||
|
||||
* `alpha` - expected value (λ). Also the variance of the distribution.
|
||||
|
||||
## ShareGPT Conversations
|
||||
|
||||
To run with the ShareGPT data, download the following ShareGPT dataset:
|
||||
|
||||
@ -99,21 +99,105 @@ class PoissonDistribution(Distribution):
|
||||
|
||||
class LognormalDistribution(Distribution):
|
||||
def __init__(
|
||||
self, mean: float, sigma: float, max_val: Optional[int] = None
|
||||
self,
|
||||
mean: Optional[float] = None,
|
||||
sigma: Optional[float] = None,
|
||||
average: Optional[int] = None,
|
||||
median_ratio: Optional[float] = None,
|
||||
max_val: Optional[int] = None,
|
||||
) -> None:
|
||||
self.average = average
|
||||
self.median_ratio = median_ratio
|
||||
self.max_val = max_val
|
||||
|
||||
if average is not None:
|
||||
if average < 1:
|
||||
raise ValueError("Lognormal average must be positive")
|
||||
|
||||
if mean or sigma:
|
||||
raise ValueError(
|
||||
"When using lognormal average, you can't provide mean/sigma"
|
||||
)
|
||||
|
||||
if self.median_ratio is None:
|
||||
# Default value that provides relatively wide range of values
|
||||
self.median_ratio = 0.85
|
||||
|
||||
# Calculate mean/sigma of np.random.lognormal based on the average
|
||||
mean, sigma = self._generate_lognormal_by_median(
|
||||
target_average=self.average, median_ratio=self.median_ratio
|
||||
)
|
||||
else:
|
||||
if mean is None or sigma is None:
|
||||
raise ValueError(
|
||||
"Must provide both mean and sigma if average is not used"
|
||||
)
|
||||
|
||||
if mean <= 0 or sigma < 0:
|
||||
raise ValueError(
|
||||
"Lognormal mean must be positive and sigma must be non-negative"
|
||||
)
|
||||
|
||||
# Mean and standard deviation of the underlying normal distribution
|
||||
# Based on numpy.random.lognormal
|
||||
self.mean = mean
|
||||
self.sigma = sigma
|
||||
self.max_val = max_val
|
||||
|
||||
@staticmethod
|
||||
def _generate_lognormal_by_median(
|
||||
target_average: int, median_ratio: float
|
||||
) -> tuple[float, float]:
|
||||
"""
|
||||
Compute (mu, sigma) for a lognormal distribution given:
|
||||
- a target average (mean of the distribution)
|
||||
- a ratio of median / mean (controls skewness), assume mean > median
|
||||
|
||||
Background:
|
||||
If Z ~ Normal(mu, sigma^2), then X = exp(Z) ~ LogNormal(mu, sigma).
|
||||
* mean(X) = exp(mu + sigma^2 / 2)
|
||||
* median(X) = exp(mu)
|
||||
|
||||
So:
|
||||
median / mean = exp(mu) / exp(mu + sigma^2 / 2)
|
||||
= exp(-sigma^2 / 2)
|
||||
|
||||
Rearranging:
|
||||
sigma^2 = 2 * ln(mean / median)
|
||||
mu = ln(median)
|
||||
|
||||
This gives a unique (mu, sigma) for any valid mean and median.
|
||||
"""
|
||||
# Check input validity: median must be smaller than mean
|
||||
if median_ratio <= 0 or median_ratio >= 1:
|
||||
raise ValueError("median_ratio must be in range (0, 1)")
|
||||
|
||||
target_median = target_average * median_ratio
|
||||
|
||||
# Solve sigma^2 = 2 * ln(mean / median)
|
||||
sigma = np.sqrt(2 * np.log(target_average / target_median))
|
||||
mu = np.log(target_median)
|
||||
|
||||
return mu, sigma
|
||||
|
||||
def sample(self, size: int = 1) -> np.ndarray:
|
||||
samples = np.random.lognormal(mean=self.mean, sigma=self.sigma, size=size)
|
||||
|
||||
if self.average is not None:
|
||||
# Scale to average
|
||||
samples *= self.average / samples.mean()
|
||||
|
||||
if self.max_val:
|
||||
samples = np.minimum(samples, self.max_val)
|
||||
|
||||
return np.round(samples).astype(int)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"LognormalDistribution[{self.mean}, {self.sigma}]"
|
||||
if self.average:
|
||||
return (
|
||||
f"LognormalDistribution[{self.average}, "
|
||||
f"{self.median_ratio}, {self.max_val}]"
|
||||
)
|
||||
return f"LognormalDistribution[{self.mean}, {self.sigma}, {self.max_val}]"
|
||||
|
||||
|
||||
class GenConvArgs(NamedTuple):
|
||||
@ -173,10 +257,21 @@ def get_random_distribution(
|
||||
return PoissonDistribution(conf["alpha"], max_val=max_val)
|
||||
|
||||
elif distribution == "lognormal":
|
||||
max_val = conf.get("max", None)
|
||||
|
||||
if "average" in conf:
|
||||
# Infer lognormal mean/sigma (numpy) from input average
|
||||
median_ratio = conf.get("median_ratio", None)
|
||||
return LognormalDistribution(
|
||||
average=conf["average"], median_ratio=median_ratio, max_val=max_val
|
||||
)
|
||||
|
||||
# Use mean/sigma directly (for full control over the distribution)
|
||||
verify_field_exists(conf, "mean", section, subsection)
|
||||
verify_field_exists(conf, "sigma", section, subsection)
|
||||
max_val = conf.get("max", None)
|
||||
return LognormalDistribution(conf["mean"], conf["sigma"], max_val=max_val)
|
||||
return LognormalDistribution(
|
||||
mean=conf["mean"], sigma=conf["sigma"], max_val=max_val
|
||||
)
|
||||
|
||||
elif distribution == "uniform":
|
||||
verify_field_exists(conf, "min", section, subsection)
|
||||
|
||||
@ -15,9 +15,8 @@
|
||||
},
|
||||
"prefix_num_tokens": {
|
||||
"distribution": "lognormal",
|
||||
"mean": 6,
|
||||
"sigma": 4,
|
||||
"max": 1500
|
||||
"average": 1000,
|
||||
"max": 5000
|
||||
},
|
||||
"num_tokens": {
|
||||
"distribution": "uniform",
|
||||
|
||||
@ -1,38 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <torch/all.h>
|
||||
|
||||
#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA
|
||||
void cutlass_mla_decode_sm100a(torch::Tensor const& out,
|
||||
torch::Tensor const& q_nope,
|
||||
torch::Tensor const& q_pe,
|
||||
torch::Tensor const& kv_c_and_k_pe_cache,
|
||||
torch::Tensor const& seq_lens,
|
||||
torch::Tensor const& page_table, double scale);
|
||||
#endif
|
||||
|
||||
void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope,
|
||||
torch::Tensor const& q_pe,
|
||||
torch::Tensor const& kv_c_and_k_pe_cache,
|
||||
torch::Tensor const& seq_lens,
|
||||
torch::Tensor const& page_table, double scale) {
|
||||
#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA
|
||||
return cutlass_mla_decode_sm100a(out, q_nope, q_pe, kv_c_and_k_pe_cache,
|
||||
seq_lens, page_table, scale);
|
||||
#endif
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled cutlass MLA");
|
||||
}
|
||||
@ -1,225 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <torch/all.h>
|
||||
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
|
||||
#include "cute/tensor.hpp"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cutlass/kernel_hardware_info.h"
|
||||
|
||||
#include "cutlass_extensions/common.hpp"
|
||||
|
||||
#include "device/sm100_mla.hpp"
|
||||
#include "kernel/sm100_mla_tile_scheduler.hpp"
|
||||
|
||||
using namespace cute;
|
||||
using namespace cutlass::fmha::kernel;
|
||||
|
||||
template <typename T, bool PersistenceOption = true>
|
||||
struct MlaSm100 {
|
||||
using Element = T;
|
||||
using ElementAcc = float;
|
||||
using ElementOut = T;
|
||||
|
||||
using TileShape = Shape<_128, _128, Shape<_512, _64>>;
|
||||
using TileShapeH = cute::tuple_element_t<0, TileShape>;
|
||||
using TileShapeD = cute::tuple_element_t<2, TileShape>;
|
||||
|
||||
// H K (D_latent D_rope) B
|
||||
using ProblemShape = cute::tuple<TileShapeH, int, TileShapeD, int>;
|
||||
|
||||
using StrideQ = cute::tuple<int64_t, _1, int64_t>; // H D B
|
||||
using StrideK = cute::tuple<int64_t, _1, int64_t>; // K D B
|
||||
using StrideO = StrideK; // H D B
|
||||
using StrideLSE = cute::tuple<_1, int>; // H B
|
||||
|
||||
using TileScheduler =
|
||||
std::conditional_t<PersistenceOption, Sm100MlaPersistentTileScheduler,
|
||||
Sm100MlaIndividualTileScheduler>;
|
||||
|
||||
using FmhaKernel =
|
||||
cutlass::fmha::kernel::Sm100FmhaMlaKernelTmaWarpspecialized<
|
||||
TileShape, Element, ElementAcc, ElementOut, ElementAcc, TileScheduler,
|
||||
/*kIsCpAsync=*/true>;
|
||||
using Fmha = cutlass::fmha::device::MLA<FmhaKernel>;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
typename T::Fmha::Arguments args_from_options(
|
||||
at::Tensor const& out, at::Tensor const& q_nope, at::Tensor const& q_pe,
|
||||
at::Tensor const& kv_c_and_k_pe_cache, at::Tensor const& seq_lens,
|
||||
at::Tensor const& page_table, double scale) {
|
||||
cutlass::KernelHardwareInfo hw_info;
|
||||
hw_info.device_id = q_nope.device().index();
|
||||
hw_info.sm_count =
|
||||
cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
|
||||
hw_info.device_id);
|
||||
|
||||
int batches = q_nope.sizes()[0];
|
||||
int page_count_per_seq = page_table.sizes()[1];
|
||||
int page_count_total = kv_c_and_k_pe_cache.sizes()[0];
|
||||
int page_size = kv_c_and_k_pe_cache.sizes()[1];
|
||||
int max_seq_len = page_size * page_count_per_seq;
|
||||
using TileShapeH = typename T::TileShapeH;
|
||||
using TileShapeD = typename T::TileShapeD;
|
||||
auto problem_shape =
|
||||
cute::make_tuple(TileShapeH{}, max_seq_len, TileShapeD{}, batches);
|
||||
|
||||
auto [H, K, D, B] = problem_shape;
|
||||
auto [D_latent, D_rope] = D;
|
||||
|
||||
using StrideQ = typename T::StrideQ;
|
||||
using StrideK = typename T::StrideK;
|
||||
using StrideO = typename T::StrideO;
|
||||
using StrideLSE = typename T::StrideLSE;
|
||||
|
||||
StrideQ stride_Q_latent = cute::make_tuple(
|
||||
static_cast<int64_t>(D_latent), _1{}, static_cast<int64_t>(H * D_latent));
|
||||
StrideQ stride_Q_rope = cute::make_tuple(static_cast<int64_t>(D_rope), _1{},
|
||||
static_cast<int64_t>(H * D_rope));
|
||||
StrideK stride_C =
|
||||
cute::make_tuple(static_cast<int64_t>(D_latent + D_rope), _1{},
|
||||
static_cast<int64_t>(page_size * (D_latent + D_rope)));
|
||||
StrideLSE stride_PT = cute::make_stride(_1{}, page_count_per_seq);
|
||||
StrideLSE stride_LSE = cute::make_tuple(_1{}, static_cast<int>(H));
|
||||
StrideO stride_O = cute::make_tuple(static_cast<int64_t>(D_latent), _1{},
|
||||
static_cast<int64_t>(H * D_latent));
|
||||
|
||||
using Element = typename T::Element;
|
||||
using ElementOut = typename T::ElementOut;
|
||||
using ElementAcc = typename T::ElementAcc;
|
||||
auto Q_latent_ptr = static_cast<Element*>(q_nope.data_ptr());
|
||||
auto Q_rope_ptr = static_cast<Element*>(q_pe.data_ptr());
|
||||
auto C_ptr = static_cast<Element*>(kv_c_and_k_pe_cache.data_ptr());
|
||||
auto scale_f = static_cast<float>(scale);
|
||||
typename T::Fmha::Arguments arguments{
|
||||
problem_shape,
|
||||
{scale_f, Q_latent_ptr, stride_Q_latent, Q_rope_ptr, stride_Q_rope, C_ptr,
|
||||
stride_C, C_ptr + D_latent, stride_C,
|
||||
static_cast<int*>(seq_lens.data_ptr()),
|
||||
static_cast<int*>(page_table.data_ptr()), stride_PT, page_count_total,
|
||||
page_size},
|
||||
{static_cast<ElementOut*>(out.data_ptr()), stride_O,
|
||||
static_cast<ElementAcc*>(nullptr), stride_LSE},
|
||||
hw_info,
|
||||
1, // split_kv
|
||||
nullptr, // is_var_split_kv
|
||||
};
|
||||
// TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
|
||||
// split_kv automatically based on batch size and sequence length to balance
|
||||
// workload across available SMs. Consider using var_split_kv for manual
|
||||
// control if needed.
|
||||
T::Fmha::set_split_kv(arguments);
|
||||
return arguments;
|
||||
}
|
||||
|
||||
template <typename Element>
|
||||
void runMla(at::Tensor const& out, at::Tensor const& q_nope,
|
||||
at::Tensor const& q_pe, at::Tensor const& kv_c_and_k_pe_cache,
|
||||
at::Tensor const& seq_lens, at::Tensor const& page_table,
|
||||
float scale, cudaStream_t stream) {
|
||||
using MlaSm100Type = MlaSm100<Element>;
|
||||
typename MlaSm100Type::Fmha fmha;
|
||||
auto arguments = args_from_options<MlaSm100Type>(
|
||||
out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, scale);
|
||||
size_t workspace_size = MlaSm100Type::Fmha::get_workspace_size(arguments);
|
||||
auto const workspace_options =
|
||||
torch::TensorOptions().dtype(torch::kUInt8).device(q_nope.device());
|
||||
auto workspace = torch::empty(workspace_size, workspace_options);
|
||||
|
||||
CUTLASS_CHECK(fmha.can_implement(arguments));
|
||||
|
||||
CUTLASS_CHECK(fmha.initialize(arguments, workspace.data_ptr(), stream));
|
||||
|
||||
CUTLASS_CHECK(fmha.run(arguments, workspace.data_ptr(), stream));
|
||||
}
|
||||
|
||||
void cutlass_mla_decode_sm100a(torch::Tensor const& out,
|
||||
torch::Tensor const& q_nope,
|
||||
torch::Tensor const& q_pe,
|
||||
torch::Tensor const& kv_c_and_k_pe_cache,
|
||||
torch::Tensor const& seq_lens,
|
||||
torch::Tensor const& page_table, double scale) {
|
||||
TORCH_CHECK(q_nope.device().is_cuda(), "q_nope must be on CUDA");
|
||||
TORCH_CHECK(q_nope.dim() == 3, "q_nope must be a 3D tensor");
|
||||
TORCH_CHECK(q_pe.dim() == 3, "q_pe must be a 3D tensor");
|
||||
TORCH_CHECK(kv_c_and_k_pe_cache.dim() == 3,
|
||||
"kv_c_and_k_pe_cache must be a 3D tensor");
|
||||
TORCH_CHECK(seq_lens.dim() == 1, "seq_lens must be a 1D tensor");
|
||||
TORCH_CHECK(page_table.dim() == 2, "page_table must be a 2D tensor");
|
||||
TORCH_CHECK(out.dim() == 3, "out must be a 3D tensor");
|
||||
|
||||
auto B_q_nope = q_nope.size(0);
|
||||
auto H_q_nope = q_nope.size(1);
|
||||
auto D_q_nope = q_nope.size(2);
|
||||
auto B_q_pe = q_pe.size(0);
|
||||
auto H_q_pe = q_pe.size(1);
|
||||
auto D_q_pe = q_pe.size(2);
|
||||
auto B_pt = page_table.size(0);
|
||||
auto PAGE_NUM = page_table.size(1);
|
||||
auto PAGE_SIZE = kv_c_and_k_pe_cache.size(1);
|
||||
auto D_ckv = kv_c_and_k_pe_cache.size(2);
|
||||
auto B_o = out.size(0);
|
||||
auto H_o = out.size(1);
|
||||
auto D_o = out.size(2);
|
||||
|
||||
TORCH_CHECK(D_q_nope == 512, "D_q_nope must be equal to 512");
|
||||
TORCH_CHECK(D_q_pe == 64, "D_q_pe must be equal to 64");
|
||||
TORCH_CHECK(D_ckv == 576, "D_ckv must be equal to 576");
|
||||
TORCH_CHECK(H_q_nope == H_q_pe && H_q_nope == H_o && H_o == 128,
|
||||
"H_q_nope, H_q_pe, and H_o must be equal to 128");
|
||||
TORCH_CHECK(PAGE_SIZE > 0 && (PAGE_SIZE & (PAGE_SIZE - 1)) == 0,
|
||||
"PAGE_SIZE must be a power of 2");
|
||||
TORCH_CHECK(
|
||||
B_q_nope == B_q_pe && B_q_nope == B_pt && B_q_nope == B_o,
|
||||
"Batch dims must be same for page_table, q_nope and q_pe, and out");
|
||||
TORCH_CHECK(PAGE_NUM % (128 / PAGE_SIZE) == 0,
|
||||
"PAGE_NUM must be divisible by 128 / PAGE_SIZE");
|
||||
TORCH_CHECK(D_o == 512, "D_o must be equal to 512");
|
||||
|
||||
TORCH_CHECK(q_nope.dtype() == at::ScalarType::Half ||
|
||||
q_nope.dtype() == at::ScalarType::BFloat16 ||
|
||||
q_nope.dtype() == at::ScalarType::Float8_e4m3fn,
|
||||
"q_nope must be a half, bfloat16, or float8_e4m3fn tensor");
|
||||
TORCH_CHECK(kv_c_and_k_pe_cache.dtype() == q_nope.dtype() &&
|
||||
q_nope.dtype() == q_pe.dtype(),
|
||||
"kv_c_and_k_pe_cache, q_nope, and q_pe must be the same type");
|
||||
TORCH_CHECK(seq_lens.dtype() == torch::kInt32,
|
||||
"seq_lens must be a 32-bit integer tensor");
|
||||
TORCH_CHECK(page_table.dtype() == torch::kInt32,
|
||||
"page_table must be a 32-bit integer tensor");
|
||||
|
||||
auto in_dtype = q_nope.dtype();
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(q_nope));
|
||||
const cudaStream_t stream =
|
||||
at::cuda::getCurrentCUDAStream(q_nope.get_device());
|
||||
if (in_dtype == at::ScalarType::Half) {
|
||||
runMla<cutlass::half_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens,
|
||||
page_table, scale, stream);
|
||||
} else if (in_dtype == at::ScalarType::BFloat16) {
|
||||
runMla<cutlass::bfloat16_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache,
|
||||
seq_lens, page_table, scale, stream);
|
||||
} else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
|
||||
runMla<cutlass::float_e4m3_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache,
|
||||
seq_lens, page_table, scale, stream);
|
||||
} else {
|
||||
TORCH_CHECK(false, "Unsupported input data type of MLA");
|
||||
}
|
||||
}
|
||||
@ -133,6 +133,14 @@ public:
|
||||
// printf(" sm_count = %d\n", sm_count);
|
||||
int max_splits = ceil_div(K, 128);
|
||||
max_splits = min(16, max_splits);
|
||||
|
||||
// TODO: This avoids a hang when the batch size larger than 1 and
|
||||
// there is more than 4 kv_splits.
|
||||
// Discuss with NVIDIA how this can be fixed.
|
||||
if (B > 1) {
|
||||
max_splits = min(2, max_splits);
|
||||
}
|
||||
|
||||
// printf(" max_splits = %d\n", max_splits);
|
||||
int sms_per_batch = max(1, sm_count / B);
|
||||
// printf(" sms_per_batch = %d\n", sms_per_batch);
|
||||
|
||||
@ -17,4 +17,8 @@
|
||||
#warning "unsupported vLLM cpu implementation"
|
||||
#endif
|
||||
|
||||
#ifdef _OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@ -523,7 +523,7 @@ void onednn_mm(torch::Tensor& c, // [M, OC], row-major
|
||||
CPU_KERNEL_GUARD_IN(onednn_mm)
|
||||
TORCH_CHECK(a.dim() == 2);
|
||||
TORCH_CHECK(a.stride(-1) == 1);
|
||||
TORCH_CHECK(c.is_contiguous());
|
||||
TORCH_CHECK(c.stride(-1) == 1);
|
||||
MatMulPrimitiveHandler* ptr =
|
||||
reinterpret_cast<MatMulPrimitiveHandler*>(handler);
|
||||
|
||||
|
||||
17
csrc/cub_helpers.h
Normal file
17
csrc/cub_helpers.h
Normal file
@ -0,0 +1,17 @@
|
||||
#pragma once
|
||||
|
||||
#ifndef USE_ROCM
|
||||
#include <cub/cub.cuh>
|
||||
#if CUB_VERSION >= 200800
|
||||
#include <cuda/std/functional>
|
||||
using CubAddOp = cuda::std::plus<>;
|
||||
using CubMaxOp = cuda::maximum<>;
|
||||
#else // if CUB_VERSION < 200800
|
||||
using CubAddOp = cub::Sum;
|
||||
using CubMaxOp = cub::Max;
|
||||
#endif // CUB_VERSION
|
||||
#else
|
||||
#include <hipcub/hipcub.hpp>
|
||||
using CubAddOp = cub::Sum;
|
||||
using CubMaxOp = cub::Max;
|
||||
#endif // USE_ROCM
|
||||
@ -1,15 +1,10 @@
|
||||
#include "type_convert.cuh"
|
||||
#include "dispatch_utils.h"
|
||||
#include "cub_helpers.h"
|
||||
|
||||
#include <torch/cuda.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
|
||||
#ifndef USE_ROCM
|
||||
#include <cub/cub.cuh>
|
||||
#else
|
||||
#include <hipcub/hipcub.hpp>
|
||||
#endif
|
||||
|
||||
namespace vllm {
|
||||
|
||||
// TODO(woosuk): Further optimize this kernel.
|
||||
@ -30,7 +25,7 @@ __global__ void rms_norm_kernel(
|
||||
|
||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
|
||||
variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
s_variance = rsqrtf(variance / hidden_size + epsilon);
|
||||
@ -85,7 +80,7 @@ fused_add_rms_norm_kernel(
|
||||
|
||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
|
||||
variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
s_variance = rsqrtf(variance / hidden_size + epsilon);
|
||||
@ -126,7 +121,7 @@ fused_add_rms_norm_kernel(
|
||||
|
||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
|
||||
variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
s_variance = rsqrtf(variance / hidden_size + epsilon);
|
||||
|
||||
@ -8,16 +8,11 @@
|
||||
#include "type_convert.cuh"
|
||||
#include "quantization/fp8/common.cuh"
|
||||
#include "dispatch_utils.h"
|
||||
#include "cub_helpers.h"
|
||||
|
||||
#include <torch/cuda.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
|
||||
#ifndef USE_ROCM
|
||||
#include <cub/cub.cuh>
|
||||
#else
|
||||
#include <hipcub/hipcub.hpp>
|
||||
#endif
|
||||
|
||||
namespace vllm {
|
||||
|
||||
// TODO(woosuk): Further optimize this kernel.
|
||||
@ -39,7 +34,7 @@ __global__ void rms_norm_static_fp8_quant_kernel(
|
||||
|
||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
|
||||
variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
s_variance = rsqrtf(variance / hidden_size + epsilon);
|
||||
@ -100,7 +95,7 @@ fused_add_rms_norm_static_fp8_quant_kernel(
|
||||
|
||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
|
||||
variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
s_variance = rsqrtf(variance / hidden_size + epsilon);
|
||||
@ -149,7 +144,7 @@ fused_add_rms_norm_static_fp8_quant_kernel(
|
||||
|
||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
|
||||
variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x);
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
s_variance = rsqrtf(variance / hidden_size + epsilon);
|
||||
|
||||
@ -21,6 +21,7 @@
|
||||
#include <torch/all.h>
|
||||
#include <cuda_fp16.h>
|
||||
#include <cuda_bf16.h>
|
||||
#include <cuda/std/limits>
|
||||
#include <cooperative_groups.h>
|
||||
#include <cooperative_groups/reduce.h>
|
||||
namespace cg = cooperative_groups;
|
||||
@ -28,7 +29,6 @@ namespace cg = cooperative_groups;
|
||||
namespace vllm {
|
||||
namespace moe {
|
||||
|
||||
constexpr float kNegInfinity = INFINITY * -1;
|
||||
constexpr unsigned FULL_WARP_MASK = 0xffffffff;
|
||||
constexpr int32_t WARP_SIZE = 32;
|
||||
constexpr int32_t BLOCK_SIZE = 512;
|
||||
@ -411,14 +411,21 @@ __device__ inline float cuda_cast<float, __nv_bfloat16>(__nv_bfloat16 val) {
|
||||
return __bfloat162float(val);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ inline T neg_inf() {
|
||||
// cuda::std::numeric_limits<T>::infinity() returns `0` for [T=bf16 or fp16]
|
||||
// so we need to cast from fp32
|
||||
return cuda_cast<T, float>(-cuda::std::numeric_limits<float>::infinity());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void topk_with_k2(T* output, T const* input,
|
||||
cg::thread_block_tile<32> const& tile,
|
||||
int32_t const lane_id,
|
||||
int const num_experts_per_group) {
|
||||
// Get the top2 per thread
|
||||
T largest = -INFINITY;
|
||||
T second_largest = -INFINITY;
|
||||
T largest = neg_inf<T>();
|
||||
T second_largest = neg_inf<T>();
|
||||
|
||||
if (num_experts_per_group > WARP_SIZE) {
|
||||
for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
|
||||
@ -513,8 +520,8 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
warp_id * topk;
|
||||
s_topk_idx += warp_id * topk;
|
||||
|
||||
T value = kNegInfinity;
|
||||
T topk_group_value = kNegInfinity;
|
||||
T value = neg_inf<T>();
|
||||
T topk_group_value = neg_inf<T>();
|
||||
int32_t num_equalto_topkth_group;
|
||||
|
||||
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
||||
@ -525,11 +532,8 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
if (case_id < num_tokens) {
|
||||
// calculate group_idx
|
||||
int32_t target_num_min = WARP_SIZE - n_group + topk_group;
|
||||
if (lane_id < n_group &&
|
||||
(isfinite(cuda_cast<float, T>(
|
||||
group_scores[lane_id])))) // The check is necessary to avoid
|
||||
// abnormal input
|
||||
{
|
||||
// The check is necessary to avoid abnormal input
|
||||
if (lane_id < n_group && cuda::std::isfinite(group_scores[lane_id])) {
|
||||
value = group_scores[lane_id];
|
||||
}
|
||||
|
||||
@ -540,11 +544,11 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
__syncwarp(); // Ensure all threads have valid data before reduction
|
||||
topk_group_value = cg::reduce(tile, value, cg::greater<T>());
|
||||
if (value == topk_group_value) {
|
||||
value = kNegInfinity;
|
||||
value = neg_inf<T>();
|
||||
}
|
||||
pre_count_equal_to_top_value = count_equal_to_top_value;
|
||||
count_equal_to_top_value = __popc(__ballot_sync(
|
||||
FULL_WARP_MASK, (value == cuda_cast<T, float>(kNegInfinity))));
|
||||
count_equal_to_top_value =
|
||||
__popc(__ballot_sync(FULL_WARP_MASK, (value == neg_inf<T>())));
|
||||
}
|
||||
num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value;
|
||||
}
|
||||
@ -552,11 +556,10 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
|
||||
warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t,
|
||||
/* is_stable */ true>
|
||||
queue((int32_t)topk, -INFINITY);
|
||||
queue((int32_t)topk, neg_inf<T>());
|
||||
|
||||
int count_equalto_topkth_group = 0;
|
||||
bool if_proceed_next_topk =
|
||||
(topk_group_value != cuda_cast<T, float>(kNegInfinity));
|
||||
bool if_proceed_next_topk = topk_group_value != neg_inf<T>();
|
||||
if (case_id < num_tokens && if_proceed_next_topk) {
|
||||
for (int i_group = 0; i_group < n_group; i_group++) {
|
||||
if ((group_scores[i_group] > topk_group_value) ||
|
||||
@ -566,10 +569,10 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
for (int32_t i = lane_id; i < align_num_experts_per_group;
|
||||
i += WARP_SIZE) {
|
||||
T candidates =
|
||||
(i < num_experts_per_group) && isfinite(cuda_cast<float, T>(
|
||||
scores_with_bias[offset + i]))
|
||||
(i < num_experts_per_group) &&
|
||||
cuda::std::isfinite(scores_with_bias[offset + i])
|
||||
? scores_with_bias[offset + i]
|
||||
: cuda_cast<T, float>(kNegInfinity);
|
||||
: neg_inf<T>();
|
||||
queue.add(candidates, offset + i);
|
||||
}
|
||||
if (group_scores[i_group] == topk_group_value) {
|
||||
@ -598,7 +601,8 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
if (i < topk) {
|
||||
s_topk_value[i] = value;
|
||||
}
|
||||
topk_sum += reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
|
||||
topk_sum +=
|
||||
cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -20,17 +20,7 @@
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include "../cuda_compat.h"
|
||||
|
||||
#ifndef USE_ROCM
|
||||
#include <cub/util_type.cuh>
|
||||
#include <cub/cub.cuh>
|
||||
#include <cuda/std/functional>
|
||||
using AddOp = cuda::std::plus<float>;
|
||||
#else
|
||||
#include <hipcub/util_type.hpp>
|
||||
#include <hipcub/hipcub.hpp>
|
||||
using AddOp = cub::Sum;
|
||||
#endif
|
||||
#include "../cub_helpers.h"
|
||||
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
@ -79,7 +69,7 @@ __launch_bounds__(TPB) __global__
|
||||
threadData = max(static_cast<float>(input[idx]), threadData);
|
||||
}
|
||||
|
||||
const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, cub::Max());
|
||||
const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, CubMaxOp());
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
float_max = maxElem;
|
||||
@ -94,7 +84,7 @@ __launch_bounds__(TPB) __global__
|
||||
threadData += exp((static_cast<float>(input[idx]) - float_max));
|
||||
}
|
||||
|
||||
const auto Z = BlockReduce(tmpStorage).Reduce(threadData, AddOp());
|
||||
const auto Z = BlockReduce(tmpStorage).Reduce(threadData, CubAddOp());
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
|
||||
@ -365,7 +365,6 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
|
||||
int32_t compute_pipeline_offset_64 = 0;
|
||||
|
||||
for (int32_t t = n_tokens_lower; t < n_tokens_upper; ++t) {
|
||||
__nv_bfloat16 y_max_bf16 = EPS;
|
||||
__nv_bfloat162 results_bf162[2];
|
||||
|
||||
cp_async_wait<NUM_STAGES - 2>();
|
||||
@ -405,7 +404,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
|
||||
auto _y_max2 =
|
||||
__hmax2(__habs2(results_bf162[0]), __habs2(results_bf162[1]));
|
||||
|
||||
y_max_bf16 = __hmax(_y_max2.x, _y_max2.y);
|
||||
__nv_bfloat16 y_max_bf16 = __hmax(EPS, __hmax(_y_max2.x, _y_max2.y));
|
||||
|
||||
// An entire group is assigned to a single warp, so a simple warp reduce
|
||||
// is used.
|
||||
|
||||
@ -7,17 +7,10 @@
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#include "../../cub_helpers.h"
|
||||
#include "../../dispatch_utils.h"
|
||||
#include "../vectorization_utils.cuh"
|
||||
|
||||
#ifndef USE_ROCM
|
||||
#include <cub/cub.cuh>
|
||||
#include <cub/util_type.cuh>
|
||||
#else
|
||||
#include <hipcub/hipcub.hpp>
|
||||
#include <hipcub/util_type.hpp>
|
||||
#endif
|
||||
|
||||
static inline __device__ int8_t float_to_int8_rn(float x) {
|
||||
#ifdef USE_ROCM
|
||||
static constexpr auto i8_min =
|
||||
@ -173,7 +166,7 @@ __global__ void dynamic_scaled_int8_quant_kernel(
|
||||
});
|
||||
using BlockReduce = cub::BlockReduce<float, 256>;
|
||||
__shared__ typename BlockReduce::TempStorage tmp;
|
||||
float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x);
|
||||
float block_max = BlockReduce(tmp).Reduce(thread_max, CubMaxOp{}, blockDim.x);
|
||||
__shared__ float absmax;
|
||||
if (tid == 0) {
|
||||
absmax = block_max;
|
||||
|
||||
@ -25,6 +25,8 @@
|
||||
#include "cutlass_extensions/common.hpp"
|
||||
#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
namespace vllm::cutlass_w4a8 {
|
||||
|
||||
using namespace cute;
|
||||
@ -393,6 +395,71 @@ torch::Tensor pack_scale_fp8(torch::Tensor const& scales) {
|
||||
return packed_scales;
|
||||
}
|
||||
|
||||
/*
|
||||
GPU-accelerated implementation of cutlass::unified_encode_int4b.
|
||||
Constructs a lookup table in constant memory to map 8 bits
|
||||
(two 4-bit values) at a time. Assumes memory is contiguous
|
||||
and pointers are 16-byte aligned.
|
||||
*/
|
||||
__constant__ uint8_t kNibbleLUT[256];
|
||||
|
||||
__global__ void unified_encode_int4b_device(const uint8_t* in, uint8_t* out,
|
||||
size_t nbytes) {
|
||||
constexpr size_t V = sizeof(uint4); // 16 bytes
|
||||
const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const size_t nthreads = size_t(gridDim.x) * blockDim.x;
|
||||
const size_t nvec = nbytes / V;
|
||||
|
||||
// 1-D grid-stride loop over 16-byte chunks
|
||||
for (size_t vec = tid; vec < nvec; vec += nthreads) {
|
||||
uint4 v = reinterpret_cast<const uint4*>(in)[vec];
|
||||
uint8_t* b = reinterpret_cast<uint8_t*>(&v);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < int(V); ++i) b[i] = kNibbleLUT[b[i]];
|
||||
reinterpret_cast<uint4*>(out)[vec] = v;
|
||||
}
|
||||
}
|
||||
|
||||
static bool upload_lut() {
|
||||
std::array<uint8_t, 256> lut{};
|
||||
auto map_nib = [](uint8_t v) -> uint8_t {
|
||||
// 1..7 -> (8 - v); keep 0 and 8..15
|
||||
return (v == 0 || (v & 0x8)) ? v : uint8_t(8 - v);
|
||||
};
|
||||
for (int b = 0; b < 256; ++b) {
|
||||
uint8_t lo = b & 0xF;
|
||||
uint8_t hi = (b >> 4) & 0xF;
|
||||
lut[b] = uint8_t((map_nib(hi) << 4) | map_nib(lo));
|
||||
}
|
||||
cudaError_t e = cudaMemcpyToSymbol(kNibbleLUT, lut.data(), lut.size(),
|
||||
/*offset=*/0, cudaMemcpyHostToDevice);
|
||||
|
||||
return (e == cudaSuccess);
|
||||
}
|
||||
|
||||
static bool unified_encode_int4b(cutlass::int4b_t const* in,
|
||||
cutlass::int4b_t* out, size_t num_int4_elems) {
|
||||
// Build/upload LUT
|
||||
if (!upload_lut()) return false;
|
||||
|
||||
static_assert(sizeof(typename cutlass::int4b_t::Storage) == 1,
|
||||
"int4 storage must be 1 byte");
|
||||
const size_t nbytes = num_int4_elems >> 1;
|
||||
|
||||
auto* in_bytes = reinterpret_cast<uint8_t const*>(in);
|
||||
auto* out_bytes = reinterpret_cast<uint8_t*>(out);
|
||||
|
||||
// kernel launch params
|
||||
constexpr int block = 256;
|
||||
const size_t nvec = nbytes / sizeof(uint4); // # of 16B vectors
|
||||
int grid = int((nvec + block - 1) / block);
|
||||
if (grid == 0) grid = 1; // ensure we still cover the tail in the kernel
|
||||
|
||||
unified_encode_int4b_device<<<grid, block>>>(in_bytes, out_bytes, nbytes);
|
||||
cudaError_t err = cudaGetLastError();
|
||||
return (err == cudaSuccess);
|
||||
}
|
||||
|
||||
torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) {
|
||||
TORCH_CHECK(B.dtype() == torch::kInt32);
|
||||
TORCH_CHECK(B.dim() == 2);
|
||||
@ -401,6 +468,7 @@ torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) {
|
||||
|
||||
int k = B.size(0) * PackFactor; // logical k
|
||||
int n = B.size(1);
|
||||
TORCH_CHECK((n * k) % 32 == 0, "need multiples of 32 int4s for 16B chunks");
|
||||
|
||||
auto B_ptr = static_cast<QuantType const*>(B.const_data_ptr());
|
||||
auto B_packed_ptr = static_cast<QuantType*>(B_packed.data_ptr());
|
||||
@ -409,7 +477,9 @@ torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) {
|
||||
LayoutB_Reordered layout_B_reordered =
|
||||
cute::tile_to_shape(LayoutAtomQuant{}, shape_B);
|
||||
|
||||
cutlass::unified_encode_int4b(B_ptr, B_packed_ptr, n * k);
|
||||
bool ok =
|
||||
vllm::cutlass_w4a8::unified_encode_int4b(B_ptr, B_packed_ptr, n * k);
|
||||
TORCH_CHECK(ok, "unified_encode_int4b failed");
|
||||
cutlass::reorder_tensor(B_packed_ptr, layout_B, layout_B_reordered);
|
||||
|
||||
return B_packed;
|
||||
|
||||
@ -30,109 +30,41 @@
|
||||
|
||||
namespace vllm {
|
||||
|
||||
// silu in float32
|
||||
__device__ __forceinline__ float silu(float x) {
|
||||
return __fdividef(x, (1.f + __expf(-x)));
|
||||
}
|
||||
|
||||
__device__ __forceinline__ float2 silu2(float2 x) {
|
||||
return make_float2(silu(x.x), silu(x.y));
|
||||
}
|
||||
|
||||
template <class Type>
|
||||
__inline__ __device__ PackedVec<Type> compute_silu(PackedVec<Type>& vec,
|
||||
PackedVec<Type>& vec2) {
|
||||
__inline__ __device__ PackedVec<Type> compute_silu_mul(PackedVec<Type>& vec,
|
||||
PackedVec<Type>& vec2) {
|
||||
PackedVec<Type> result;
|
||||
using packed_type = typename TypeConverter<Type>::Type;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; ++i) {
|
||||
// silu_mul in float32
|
||||
if constexpr (std::is_same_v<Type, half>) {
|
||||
half2 val(0.5f, 0.5f);
|
||||
half2 t0 = __hmul2(vec.elts[i], val);
|
||||
half2 t1 = __hfma2(h2tanh(t0), val, val);
|
||||
half2 t2 = __hmul2(vec.elts[i], t1);
|
||||
result.elts[i] = __hmul2(t2, vec2.elts[i]);
|
||||
float2 silu_vec = silu2(__half22float2(vec.elts[i]));
|
||||
result.elts[i] =
|
||||
__float22half2_rn(__fmul2_rn(silu_vec, __half22float2(vec2.elts[i])));
|
||||
} else {
|
||||
__nv_bfloat162 val(0.5f, 0.5f);
|
||||
__nv_bfloat162 t0 = __hmul2(vec.elts[i], val);
|
||||
__nv_bfloat162 t1 = __hfma2(h2tanh(t0), val, val);
|
||||
__nv_bfloat162 t2 = __hmul2(vec.elts[i], t1);
|
||||
result.elts[i] = __hmul2(t2, vec2.elts[i]);
|
||||
float2 silu_vec = silu2(__bfloat1622float2(vec.elts[i]));
|
||||
result.elts[i] = __float22bfloat162_rn(
|
||||
__fmul2_rn(silu_vec, __bfloat1622float2(vec2.elts[i])));
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Quantizes the provided PackedVec into the uint32_t output
|
||||
template <class Type, bool UE8M0_SF = false>
|
||||
__device__ uint32_t silu_and_cvt_warp_fp16_to_fp4(PackedVec<Type>& vec,
|
||||
PackedVec<Type>& vec2,
|
||||
float SFScaleVal,
|
||||
uint8_t* SFout) {
|
||||
PackedVec<Type> out_silu = compute_silu(vec, vec2);
|
||||
// Get absolute maximum values among the local 8 values.
|
||||
auto localMax = __habs2(out_silu.elts[0]);
|
||||
|
||||
// Local maximum value.
|
||||
#pragma unroll
|
||||
for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
|
||||
localMax = __hmax2(localMax, __habs2(out_silu.elts[i]));
|
||||
}
|
||||
|
||||
// Get the absolute maximum among all 16 values (two threads).
|
||||
localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
|
||||
// Get the final absolute maximum values.
|
||||
float vecMax = float(__hmax(localMax.x, localMax.y));
|
||||
|
||||
// Get the SF (max value of the vector / max value of e2m1).
|
||||
// maximum value of e2m1 = 6.0.
|
||||
// TODO: use half as compute data type.
|
||||
float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f));
|
||||
// 8 bits representation of the SF.
|
||||
uint8_t fp8SFVal;
|
||||
// Write the SF to global memory (STG.8).
|
||||
if constexpr (UE8M0_SF) {
|
||||
// Extract the 8 exponent bits from float32.
|
||||
// float 32bits = 1 sign bit + 8 exponent bits + 23 mantissa bits.
|
||||
uint32_t tmp = reinterpret_cast<uint32_t&>(SFValue) >> 23;
|
||||
fp8SFVal = tmp & 0xff;
|
||||
// Convert back to fp32.
|
||||
reinterpret_cast<uint32_t&>(SFValue) = tmp << 23;
|
||||
} else {
|
||||
// Here SFValue is always positive, so E4M3 is the same as UE4M3.
|
||||
__nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue);
|
||||
reinterpret_cast<__nv_fp8_e4m3&>(fp8SFVal) = tmp;
|
||||
// Convert back to fp32.
|
||||
SFValue = float(tmp);
|
||||
}
|
||||
// Get the output scale.
|
||||
// Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) *
|
||||
// reciprocal(SFScaleVal))
|
||||
float outputScale =
|
||||
SFValue != 0 ? reciprocal_approximate_ftz(
|
||||
SFValue * reciprocal_approximate_ftz(SFScaleVal))
|
||||
: 0.0f;
|
||||
|
||||
if (SFout) {
|
||||
// Write the SF to global memory (STG.8).
|
||||
*SFout = fp8SFVal;
|
||||
}
|
||||
|
||||
// Convert the input to float.
|
||||
float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2];
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
|
||||
if constexpr (std::is_same_v<Type, half>) {
|
||||
fp2Vals[i] = __half22float2(out_silu.elts[i]);
|
||||
} else {
|
||||
fp2Vals[i] = __bfloat1622float2(out_silu.elts[i]);
|
||||
}
|
||||
fp2Vals[i].x *= outputScale;
|
||||
fp2Vals[i].y *= outputScale;
|
||||
}
|
||||
|
||||
// Convert to e2m1 values.
|
||||
uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals);
|
||||
|
||||
// Write the e2m1 values to global memory.
|
||||
return e2m1Vec;
|
||||
}
|
||||
|
||||
// Use UE4M3 by default.
|
||||
template <class Type, bool UE8M0_SF = false>
|
||||
__global__ void __launch_bounds__(1024, 4)
|
||||
silu_and_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
||||
silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
||||
float const* SFScale, uint32_t* out,
|
||||
uint32_t* SFout) {
|
||||
using PackedVec = PackedVec<Type>;
|
||||
@ -160,16 +92,18 @@ __global__ void __launch_bounds__(1024, 4)
|
||||
// Get the output tensor offset.
|
||||
// Same as inOffset because 8 elements are packed into one uint32_t.
|
||||
int64_t outOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
|
||||
;
|
||||
auto& out_pos = out[outOffset];
|
||||
|
||||
// Compute silu and mul
|
||||
PackedVec out_silu_mul = compute_silu_mul(in_vec, in_vec2);
|
||||
|
||||
auto sf_out =
|
||||
cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
|
||||
CVT_FP4_NUM_THREADS_PER_SF>(
|
||||
rowIdx, colIdx, numCols, SFout);
|
||||
|
||||
out_pos = silu_and_cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(
|
||||
in_vec, in_vec2, SFScaleVal, sf_out);
|
||||
out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(out_silu_mul, SFScaleVal,
|
||||
sf_out);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -204,7 +138,7 @@ void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output, // [..., d]
|
||||
input.scalar_type(), "silu_and_mul_nvfp4_quant_kernel", [&] {
|
||||
using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
|
||||
auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
|
||||
vllm::silu_and_cvt_fp16_to_fp4<cuda_type><<<grid, block, 0, stream>>>(
|
||||
vllm::silu_mul_cvt_fp16_to_fp4<cuda_type><<<grid, block, 0, stream>>>(
|
||||
m, n, input_ptr, input_sf_ptr,
|
||||
reinterpret_cast<uint32_t*>(output_ptr),
|
||||
reinterpret_cast<uint32_t*>(sf_out));
|
||||
|
||||
@ -1,15 +1,10 @@
|
||||
#include "common.cuh"
|
||||
#include "dispatch_utils.h"
|
||||
#include "../../cub_helpers.h"
|
||||
#include "../vectorization_utils.cuh"
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <ATen/cuda/Exceptions.h>
|
||||
|
||||
#ifndef USE_ROCM
|
||||
#include <cub/cub.cuh>
|
||||
#else
|
||||
#include <hipcub/hipcub.hpp>
|
||||
#endif
|
||||
|
||||
namespace vllm {
|
||||
|
||||
template <typename scalar_t, typename fp8_type>
|
||||
@ -116,7 +111,7 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel_strided(
|
||||
using BlockReduce = cub::BlockReduce<float, 256>;
|
||||
__shared__ typename BlockReduce::TempStorage tmp;
|
||||
const float block_max =
|
||||
BlockReduce(tmp).Reduce(absmax_val, cub::Max{}, blockDim.x);
|
||||
BlockReduce(tmp).Reduce(absmax_val, CubMaxOp{}, blockDim.x);
|
||||
|
||||
__shared__ float token_scale;
|
||||
if (tid == 0) {
|
||||
|
||||
@ -8,11 +8,7 @@
|
||||
#include "quantization/utils.cuh"
|
||||
#include "quant_conversions.cuh"
|
||||
|
||||
#ifndef USE_ROCM
|
||||
#include <cub/cub.cuh>
|
||||
#else
|
||||
#include <hipcub/hipcub.hpp>
|
||||
#endif
|
||||
#include "../../cub_helpers.h"
|
||||
|
||||
namespace vllm {
|
||||
|
||||
@ -36,7 +32,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
|
||||
|
||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x);
|
||||
ss = BlockReduce(reduceStore).Reduce(ss, CubAddOp{}, blockDim.x);
|
||||
|
||||
__shared__ float s_rms;
|
||||
if (threadIdx.x == 0) {
|
||||
@ -73,7 +69,7 @@ __device__ void compute_dynamic_per_token_scales(
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
block_absmax_val_maybe =
|
||||
BlockReduce(reduceStore)
|
||||
.Reduce(block_absmax_val_maybe, cub::Max{}, blockDim.x);
|
||||
.Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x);
|
||||
|
||||
__shared__ float s_token_scale;
|
||||
if (threadIdx.x == 0) {
|
||||
@ -169,7 +165,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
|
||||
|
||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x);
|
||||
ss = BlockReduce(reduceStore).Reduce(ss, CubAddOp{}, blockDim.x);
|
||||
|
||||
__shared__ float s_rms;
|
||||
if (threadIdx.x == 0) {
|
||||
@ -240,7 +236,7 @@ __device__ void compute_dynamic_per_token_scales(
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
block_absmax_val_maybe =
|
||||
BlockReduce(reduceStore)
|
||||
.Reduce(block_absmax_val_maybe, cub::Max{}, blockDim.x);
|
||||
.Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x);
|
||||
|
||||
__shared__ float s_token_scale;
|
||||
if (threadIdx.x == 0) {
|
||||
|
||||
@ -510,13 +510,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
ops.def("cutlass_sparse_compress(Tensor a) -> Tensor[]");
|
||||
ops.impl("cutlass_sparse_compress", &cutlass_sparse_compress);
|
||||
|
||||
// CUTLASS MLA decode
|
||||
ops.def(
|
||||
"cutlass_mla_decode(Tensor! out, Tensor q_nope, Tensor q_pe,"
|
||||
" Tensor kv_c_and_k_pe_cache, Tensor seq_lens,"
|
||||
" Tensor page_table, float scale) -> ()");
|
||||
ops.impl("cutlass_mla_decode", torch::kCUDA, &cutlass_mla_decode);
|
||||
|
||||
// SM100 CUTLASS MLA decode
|
||||
ops.def(
|
||||
"sm100_cutlass_mla_decode(Tensor! out, Tensor! lse, Tensor q_nope,"
|
||||
|
||||
@ -283,6 +283,10 @@ WORKDIR /vllm-workspace
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ARG TARGETPLATFORM
|
||||
|
||||
ARG GDRCOPY_CUDA_VERSION=12.8
|
||||
# Keep in line with FINAL_BASE_IMAGE
|
||||
ARG GDRCOPY_OS_VERSION=Ubuntu22_04
|
||||
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
|
||||
ARG DEADSNAKES_MIRROR_URL
|
||||
@ -441,13 +445,21 @@ COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
VLLM_DOCKER_BUILD_CONTEXT=1 /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"}
|
||||
|
||||
# Install EP kernels(pplx-kernels and DeepEP), NixL
|
||||
COPY tools/install_gdrcopy.sh install_gdrcopy.sh
|
||||
RUN set -eux; \
|
||||
case "${TARGETPLATFORM}" in \
|
||||
linux/arm64) UUARCH="aarch64" ;; \
|
||||
linux/amd64) UUARCH="x64" ;; \
|
||||
*) echo "Unsupported TARGETPLATFORM: ${TARGETPLATFORM}" >&2; exit 1 ;; \
|
||||
esac; \
|
||||
./install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}"; \
|
||||
rm ./install_gdrcopy.sh
|
||||
|
||||
# Install EP kernels(pplx-kernels and DeepEP)
|
||||
COPY tools/ep_kernels/install_python_libraries.sh install_python_libraries.sh
|
||||
COPY tools/install_nixl.sh install_nixl.sh
|
||||
ENV CUDA_HOME=/usr/local/cuda
|
||||
RUN export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0a+PTX}" \
|
||||
&& bash install_python_libraries.sh \
|
||||
&& bash install_nixl.sh --force
|
||||
&& bash install_python_libraries.sh
|
||||
|
||||
#################### vLLM installation IMAGE ####################
|
||||
|
||||
|
||||
@ -29,7 +29,10 @@ ARG VLLM_BRANCH="main"
|
||||
ONBUILD RUN git clone ${VLLM_REPO} \
|
||||
&& cd vllm \
|
||||
&& git fetch -v --prune -- origin ${VLLM_BRANCH} \
|
||||
&& git checkout FETCH_HEAD
|
||||
&& git checkout FETCH_HEAD \
|
||||
&& if [ ${VLLM_REPO} != "https://github.com/vllm-project/vllm.git" ] ; then \
|
||||
git remote add upstream "https://github.com/vllm-project/vllm.git" \
|
||||
&& git fetch upstream ; fi
|
||||
FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
|
||||
|
||||
# -----------------------
|
||||
|
||||
@ -1,25 +1,23 @@
|
||||
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.4.1-complete
|
||||
ARG HIPBLASLT_BRANCH="aa0bda7b"
|
||||
ARG HIPBLAS_COMMON_BRANCH="9b80ba8e"
|
||||
ARG LEGACY_HIPBLASLT_OPTION=
|
||||
ARG TRITON_BRANCH="e5be006"
|
||||
ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
|
||||
ARG PYTORCH_BRANCH="f717b2af"
|
||||
ARG PYTORCH_VISION_BRANCH="v0.21.0"
|
||||
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
|
||||
ARG TRITON_BRANCH="f9e5bf54"
|
||||
ARG TRITON_REPO="https://github.com/ROCm/triton.git"
|
||||
ARG PYTORCH_BRANCH="b2fb6885"
|
||||
ARG PYTORCH_VISION_BRANCH="v0.23.0"
|
||||
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
|
||||
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
|
||||
ARG FA_BRANCH="1a7f4dfa"
|
||||
ARG FA_BRANCH="0e60e394"
|
||||
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
|
||||
ARG AITER_BRANCH="4822e675"
|
||||
ARG AITER_BRANCH="2ab9f4cd"
|
||||
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
|
||||
|
||||
FROM ${BASE_IMAGE} AS base
|
||||
|
||||
ENV PATH=/opt/rocm/llvm/bin:$PATH
|
||||
ENV PATH=/opt/rocm/llvm/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
|
||||
ENV ROCM_PATH=/opt/rocm
|
||||
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
|
||||
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx1100;gfx1101;gfx1200;gfx1201
|
||||
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201
|
||||
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
|
||||
ENV AITER_ROCM_ARCH=gfx942;gfx950
|
||||
|
||||
ARG PYTHON_VERSION=3.12
|
||||
|
||||
@ -45,29 +43,6 @@ RUN apt-get update -y \
|
||||
|
||||
RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython
|
||||
|
||||
FROM base AS build_hipblaslt
|
||||
ARG HIPBLASLT_BRANCH
|
||||
ARG HIPBLAS_COMMON_BRANCH
|
||||
# Set to "--legacy_hipblas_direct" for ROCm<=6.2
|
||||
ARG LEGACY_HIPBLASLT_OPTION
|
||||
RUN git clone https://github.com/ROCm/hipBLAS-common.git
|
||||
RUN apt-get remove -y hipblaslt && apt-get autoremove -y && apt-get autoclean -y
|
||||
RUN cd hipBLAS-common \
|
||||
&& git checkout ${HIPBLAS_COMMON_BRANCH} \
|
||||
&& mkdir build \
|
||||
&& cd build \
|
||||
&& cmake .. \
|
||||
&& make package \
|
||||
&& dpkg -i ./*.deb
|
||||
RUN git clone https://github.com/ROCm/hipBLASLt
|
||||
RUN cd hipBLASLt \
|
||||
&& git checkout ${HIPBLASLT_BRANCH} \
|
||||
&& apt-get install -y llvm-dev \
|
||||
&& ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
|
||||
&& cd build/release \
|
||||
&& make package
|
||||
RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
|
||||
|
||||
FROM base AS build_triton
|
||||
ARG TRITON_BRANCH
|
||||
ARG TRITON_REPO
|
||||
@ -121,13 +96,11 @@ RUN cd aiter \
|
||||
&& git checkout ${AITER_BRANCH} \
|
||||
&& git submodule update --init --recursive \
|
||||
&& pip install -r requirements.txt
|
||||
RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
|
||||
RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
|
||||
RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
|
||||
|
||||
FROM base AS debs
|
||||
RUN mkdir /app/debs
|
||||
RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
|
||||
cp /install/*.deb /app/debs
|
||||
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
|
||||
cp /install/*.whl /app/debs
|
||||
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
|
||||
@ -138,11 +111,6 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
|
||||
cp /install/*.whl /app/debs
|
||||
|
||||
FROM base AS final
|
||||
RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
|
||||
dpkg -i /install/*deb \
|
||||
&& perl -p -i -e 's/, hipblas-common-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \
|
||||
&& perl -p -i -e 's/, hipblaslt-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \
|
||||
&& perl -p -i -e 's/, hipblaslt \([^)]*?\), /, /g' /var/lib/dpkg/status
|
||||
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
|
||||
pip install /install/*.whl
|
||||
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
|
||||
@ -153,9 +121,6 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
|
||||
pip install /install/*.whl
|
||||
|
||||
ARG BASE_IMAGE
|
||||
ARG HIPBLAS_COMMON_BRANCH
|
||||
ARG HIPBLASLT_BRANCH
|
||||
ARG LEGACY_HIPBLASLT_OPTION
|
||||
ARG TRITON_BRANCH
|
||||
ARG TRITON_REPO
|
||||
ARG PYTORCH_BRANCH
|
||||
@ -167,9 +132,6 @@ ARG FA_REPO
|
||||
ARG AITER_BRANCH
|
||||
ARG AITER_REPO
|
||||
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
|
||||
&& echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \
|
||||
&& echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \
|
||||
&& echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \
|
||||
@ -177,5 +139,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
|
||||
&& echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
|
||||
&& echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
|
||||
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
|
||||
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
|
||||
@ -14,7 +14,7 @@ API documentation for vLLM's configuration classes.
|
||||
- [vllm.config.LoRAConfig][]
|
||||
- [vllm.config.MultiModalConfig][]
|
||||
- [vllm.config.PoolerConfig][]
|
||||
- [vllm.config.DecodingConfig][]
|
||||
- [vllm.config.StructuredOutputsConfig][]
|
||||
- [vllm.config.ObservabilityConfig][]
|
||||
- [vllm.config.KVTransferConfig][]
|
||||
- [vllm.config.CompilationConfig][]
|
||||
@ -46,7 +46,6 @@ Engine classes for offline and online inference.
|
||||
Inference parameters for vLLM APIs.
|
||||
|
||||
[](){ #sampling-params }
|
||||
[](){ #pooling-params }
|
||||
|
||||
- [vllm.SamplingParams][]
|
||||
- [vllm.PoolingParams][]
|
||||
|
||||
@ -175,6 +175,7 @@ Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to u
|
||||
Known supported models:
|
||||
|
||||
- GLM-4.5V GLM-4.1V (<gh-pr:23168>)
|
||||
- InternVL (<gh-pr:23909>)
|
||||
- Kimi-VL (<gh-pr:23817>)
|
||||
- Llama4 (<gh-pr:18368>)
|
||||
- MiniCPM-V-2.5 or above (<gh-pr:23327>, <gh-pr:23948>)
|
||||
|
||||
@ -26,113 +26,123 @@ See <gh-file:LICENSE>.
|
||||
|
||||
## Developing
|
||||
|
||||
--8<-- "docs/getting_started/installation/python_env_setup.inc.md"
|
||||
|
||||
Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation.
|
||||
Check out the [building from source][build-from-source] documentation for details.
|
||||
|
||||
For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
|
||||
|
||||
### Building the docs with MkDocs
|
||||
|
||||
#### Introduction to MkDocs
|
||||
|
||||
[MkDocs](https://github.com/mkdocs/mkdocs) is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file.
|
||||
|
||||
#### Install MkDocs and Plugins
|
||||
|
||||
Install MkDocs along with the [plugins](https://github.com/vllm-project/vllm/blob/main/mkdocs.yaml) used in the vLLM documentation, as well as required dependencies:
|
||||
|
||||
```bash
|
||||
uv pip install -r requirements/docs.txt
|
||||
```
|
||||
|
||||
!!! note
|
||||
Ensure that your Python version is compatible with the plugins (e.g., `mkdocs-awesome-nav` requires Python 3.10+)
|
||||
|
||||
#### Verify Installation
|
||||
|
||||
Confirm that MkDocs is correctly installed:
|
||||
|
||||
```bash
|
||||
mkdocs --version
|
||||
```
|
||||
|
||||
Example output:
|
||||
|
||||
```console
|
||||
mkdocs, version 1.6.1 from /opt/miniconda3/envs/mkdoc/lib/python3.10/site-packages/mkdocs (Python 3.10)
|
||||
```
|
||||
|
||||
#### Clone the `vLLM` repository
|
||||
The first step of contributing to vLLM is to clone the GitHub repository:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/vllm-project/vllm.git
|
||||
cd vllm
|
||||
```
|
||||
|
||||
#### Start the Development Server
|
||||
Then, configure your Python virtual environment.
|
||||
|
||||
MkDocs comes with a built-in dev-server that lets you preview your documentation as you work on it. Make sure you're in the same directory as the `mkdocs.yml` configuration file, and then start the server by running the `mkdocs serve` command:
|
||||
--8<-- "docs/getting_started/installation/python_env_setup.inc.md"
|
||||
|
||||
If you are only developing vLLM's Python code, install vLLM using:
|
||||
|
||||
```bash
|
||||
mkdocs serve
|
||||
VLLM_USE_PRECOMPILED=1 uv pip install -e .
|
||||
```
|
||||
|
||||
Example output:
|
||||
If you are developing vLLM's Python and CUDA/C++ code, install vLLM using:
|
||||
|
||||
```console
|
||||
INFO - Documentation built in 106.83 seconds
|
||||
INFO - [22:02:02] Watching paths for changes: 'docs', 'mkdocs.yaml'
|
||||
INFO - [22:02:02] Serving on http://127.0.0.1:8000/
|
||||
```bash
|
||||
uv pip install -e .
|
||||
```
|
||||
|
||||
#### View in Your Browser
|
||||
For more details about installing from source and installing for other hardware, check out the [installation instructions](../getting_started/installation/README.md) for your hardware and head to the "Build wheel from source" section.
|
||||
|
||||
Open up [http://127.0.0.1:8000/](http://127.0.0.1:8000/) in your browser to see a live preview:.
|
||||
|
||||
#### Learn More
|
||||
|
||||
For additional features and advanced configurations, refer to the official [MkDocs Documentation](https://www.mkdocs.org/).
|
||||
|
||||
## Testing
|
||||
|
||||
??? console "Commands"
|
||||
|
||||
```bash
|
||||
# These commands are only for Nvidia CUDA platforms.
|
||||
uv pip install -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
|
||||
|
||||
# Linting, formatting and static type checking
|
||||
pre-commit install
|
||||
|
||||
# You can manually run pre-commit with
|
||||
pre-commit run --all-files --show-diff-on-failure
|
||||
|
||||
# To manually run something from CI that does not run
|
||||
# locally by default, you can run:
|
||||
pre-commit run mypy-3.9 --hook-stage manual --all-files
|
||||
|
||||
# Unit tests
|
||||
pytest tests/
|
||||
|
||||
# Run tests for a single test file with detailed output
|
||||
pytest -s -v tests/test_logger.py
|
||||
```
|
||||
For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
|
||||
|
||||
!!! tip
|
||||
Since the <gh-file:docker/Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12.
|
||||
vLLM is compatible with Python versions 3.9 to 3.12. However, vLLM's default [Dockerfile](gh-file:docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12.
|
||||
|
||||
Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.
|
||||
|
||||
!!! note "Install python3-dev if Python.h is missing"
|
||||
### Linting
|
||||
|
||||
vLLM uses `pre-commit` to lint and format the codebase. See <https://pre-commit.com/#usage> if `pre-commit` is new to you. Setting up `pre-commit` is as easy as:
|
||||
|
||||
```bash
|
||||
uv pip install pre-commit
|
||||
pre-commit install
|
||||
```
|
||||
|
||||
vLLM's `pre-commit` hooks will now run automatically every time you commit.
|
||||
|
||||
!!! tip "Tips"
|
||||
You can manually run the `pre-commit` hooks using:
|
||||
|
||||
```bash
|
||||
pre-commit run # runs on staged files
|
||||
pre-commit run -a # runs on all files (short for --all-files)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
Some `pre-commit` hooks only run in CI. If you need to, you can run them locally with:
|
||||
|
||||
```bash
|
||||
pre-commit run --hook-stage manual markdownlint
|
||||
pre-commit run --hook-stage manual mypy-3.9
|
||||
```
|
||||
|
||||
### Documentation
|
||||
|
||||
MkDocs is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file, <gh-file:mkdocs.yaml>.
|
||||
|
||||
Get started with:
|
||||
|
||||
```bash
|
||||
uv pip install -r requirements/docs.txt
|
||||
```
|
||||
|
||||
!!! tip
|
||||
Ensure that your Python version is compatible with the plugins
|
||||
(e.g., `mkdocs-awesome-nav` requires Python 3.10+)
|
||||
|
||||
MkDocs comes with a built-in dev-server that lets you preview your documentation as you work on it.
|
||||
From the root of the repository, run:
|
||||
|
||||
```bash
|
||||
mkdocs serve # with API ref (~10 minutes)
|
||||
API_AUTONAV_EXCLUDE=vllm mkdocs serve # API ref off (~15 seconds)
|
||||
```
|
||||
|
||||
Once you see `Serving on http://127.0.0.1:8000/` in the logs, the live preview is ready!
|
||||
Open <http://127.0.0.1:8000/> in your browser to see it.
|
||||
|
||||
For additional features and advanced configurations, refer to the:
|
||||
|
||||
- [MkDocs documentation](https://www.mkdocs.org/)
|
||||
- [Material for MkDocs documentation](https://squidfunk.github.io/mkdocs-material/) (the MkDocs theme we use)
|
||||
|
||||
### Testing
|
||||
|
||||
vLLM uses `pytest` to test the codebase.
|
||||
|
||||
```bash
|
||||
# Install the test dependencies used in CI (CUDA only)
|
||||
uv pip install -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
|
||||
|
||||
# Install some common test dependencies (hardware agnostic)
|
||||
uv pip install pytest pytest-asyncio
|
||||
|
||||
# Run all tests
|
||||
pytest tests/
|
||||
|
||||
# Run tests for a single test file with detailed output
|
||||
pytest -s -v tests/test_logger.py
|
||||
```
|
||||
|
||||
!!! tip "Install python3-dev if Python.h is missing"
|
||||
If any of the above commands fails with `Python.h: No such file or directory`, install
|
||||
`python3-dev` with `sudo apt install python3-dev`.
|
||||
|
||||
!!! note
|
||||
!!! warning "Warnings"
|
||||
Currently, the repository is not fully checked by `mypy`.
|
||||
|
||||
!!! note
|
||||
---
|
||||
|
||||
Currently, not all unit tests pass when run on CPU platforms. If you don't have access to a GPU
|
||||
platform to run unit tests locally, rely on the continuous integration system to run the tests for
|
||||
now.
|
||||
@ -194,8 +204,7 @@ appropriately to indicate the type of change. Please use one of the following:
|
||||
The PR needs to meet the following code quality standards:
|
||||
|
||||
- We adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html).
|
||||
- Pass all linter checks. Please use `pre-commit` to format your code. See
|
||||
<https://pre-commit.com/#usage> if `pre-commit` is new to you.
|
||||
- Pass all linter checks.
|
||||
- The code needs to be well-documented to ensure future contributors can easily
|
||||
understand the code.
|
||||
- Include sufficient tests to ensure the project stays correct and robust. This
|
||||
|
||||
@ -1,9 +1,787 @@
|
||||
---
|
||||
toc_depth: 4
|
||||
---
|
||||
|
||||
# Benchmark Suites
|
||||
|
||||
vLLM contains two sets of benchmarks:
|
||||
vLLM provides comprehensive benchmarking tools for performance testing and evaluation:
|
||||
|
||||
- [Performance benchmarks][performance-benchmarks]
|
||||
- [Nightly benchmarks][nightly-benchmarks]
|
||||
- **[Benchmark CLI]**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing
|
||||
- **[Performance benchmarks][performance-benchmarks]**: Automated CI benchmarks for development
|
||||
- **[Nightly benchmarks][nightly-benchmarks]**: Comparative benchmarks against alternatives
|
||||
|
||||
[Benchmark CLI]: #benchmark-cli
|
||||
|
||||
## Benchmark CLI
|
||||
|
||||
This section guides you through running benchmark tests with the extensive
|
||||
datasets supported on vLLM. It's a living document, updated as new features and datasets
|
||||
become available.
|
||||
|
||||
### Dataset Overview
|
||||
|
||||
<style>
|
||||
th {
|
||||
min-width: 0 !important;
|
||||
}
|
||||
</style>
|
||||
|
||||
| Dataset | Online | Offline | Data Path |
|
||||
|---------|--------|---------|-----------|
|
||||
| ShareGPT | ✅ | ✅ | `wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json` |
|
||||
| ShareGPT4V (Image) | ✅ | ✅ | `wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/blob/main/sharegpt4v_instruct_gpt4-vision_cap100k.json`<br>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:<br>`wget http://images.cocodataset.org/zips/train2017.zip` |
|
||||
| ShareGPT4Video (Video) | ✅ | ✅ | `git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video` |
|
||||
| BurstGPT | ✅ | ✅ | `wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv` |
|
||||
| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
|
||||
| Random | ✅ | ✅ | `synthetic` |
|
||||
| RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` |
|
||||
| Prefix Repetition | ✅ | ✅ | `synthetic` |
|
||||
| HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
|
||||
| HuggingFace-MMVU | ✅ | ✅ | `yale-nlp/MMVU` |
|
||||
| HuggingFace-InstructCoder | ✅ | ✅ | `likaixin/InstructCoder` |
|
||||
| HuggingFace-AIMO | ✅ | ✅ | `AI-MO/aimo-validation-aime`, `AI-MO/NuminaMath-1.5`, `AI-MO/NuminaMath-CoT` |
|
||||
| HuggingFace-Other | ✅ | ✅ | `lmms-lab/LLaVA-OneVision-Data`, `Aeala/ShareGPT_Vicuna_unfiltered` |
|
||||
| HuggingFace-MTBench | ✅ | ✅ | `philschmid/mt-bench` |
|
||||
| HuggingFace-Blazedit | ✅ | ✅ | `vdaita/edit_5k_char`, `vdaita/edit_10k_char` |
|
||||
| Spec Bench | ✅ | ✅ | `wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl` |
|
||||
| Custom | ✅ | ✅ | Local file: `data.jsonl` |
|
||||
|
||||
Legend:
|
||||
|
||||
- ✅ - supported
|
||||
- 🟡 - Partial support
|
||||
- 🚧 - to be supported
|
||||
|
||||
!!! note
|
||||
HuggingFace dataset's `dataset-name` should be set to `hf`.
|
||||
For local `dataset-path`, please set `hf-name` to its Hugging Face ID like
|
||||
|
||||
```bash
|
||||
--dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat
|
||||
```
|
||||
|
||||
### Examples
|
||||
|
||||
#### 🚀 Online Benchmark
|
||||
|
||||
<details class="admonition abstract" markdown="1">
|
||||
<summary>Show more</summary>
|
||||
|
||||
First start serving your model
|
||||
|
||||
```bash
|
||||
vllm serve NousResearch/Hermes-3-Llama-3.1-8B
|
||||
```
|
||||
|
||||
Then run the benchmarking script
|
||||
|
||||
```bash
|
||||
# download dataset
|
||||
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--endpoint /v1/completions \
|
||||
--dataset-name sharegpt \
|
||||
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||
--num-prompts 10
|
||||
```
|
||||
|
||||
If successful, you will see the following output
|
||||
|
||||
```text
|
||||
============ Serving Benchmark Result ============
|
||||
Successful requests: 10
|
||||
Benchmark duration (s): 5.78
|
||||
Total input tokens: 1369
|
||||
Total generated tokens: 2212
|
||||
Request throughput (req/s): 1.73
|
||||
Output token throughput (tok/s): 382.89
|
||||
Total Token throughput (tok/s): 619.85
|
||||
---------------Time to First Token----------------
|
||||
Mean TTFT (ms): 71.54
|
||||
Median TTFT (ms): 73.88
|
||||
P99 TTFT (ms): 79.49
|
||||
-----Time per Output Token (excl. 1st token)------
|
||||
Mean TPOT (ms): 7.91
|
||||
Median TPOT (ms): 7.96
|
||||
P99 TPOT (ms): 8.03
|
||||
---------------Inter-token Latency----------------
|
||||
Mean ITL (ms): 7.74
|
||||
Median ITL (ms): 7.70
|
||||
P99 ITL (ms): 8.39
|
||||
==================================================
|
||||
```
|
||||
|
||||
##### Custom Dataset
|
||||
|
||||
If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
|
||||
|
||||
```json
|
||||
{"prompt": "What is the capital of India?"}
|
||||
{"prompt": "What is the capital of Iran?"}
|
||||
{"prompt": "What is the capital of China?"}
|
||||
```
|
||||
|
||||
```bash
|
||||
# start server
|
||||
VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct
|
||||
```
|
||||
|
||||
```bash
|
||||
# run benchmarking script
|
||||
vllm bench serve --port 9001 --save-result --save-detailed \
|
||||
--backend vllm \
|
||||
--model meta-llama/Llama-3.1-8B-Instruct \
|
||||
--endpoint /v1/completions \
|
||||
--dataset-name custom \
|
||||
--dataset-path <path-to-your-data-jsonl> \
|
||||
--custom-skip-chat-template \
|
||||
--num-prompts 80 \
|
||||
--max-concurrency 1 \
|
||||
--temperature=0.3 \
|
||||
--top-p=0.75 \
|
||||
--result-dir "./log/"
|
||||
```
|
||||
|
||||
You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
|
||||
|
||||
##### VisionArena Benchmark for Vision Language Models
|
||||
|
||||
```bash
|
||||
# need a model with vision capability here
|
||||
vllm serve Qwen/Qwen2-VL-7B-Instruct
|
||||
```
|
||||
|
||||
```bash
|
||||
vllm bench serve \
|
||||
--backend openai-chat \
|
||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||
--endpoint /v1/chat/completions \
|
||||
--dataset-name hf \
|
||||
--dataset-path lmarena-ai/VisionArena-Chat \
|
||||
--hf-split train \
|
||||
--num-prompts 1000
|
||||
```
|
||||
|
||||
##### InstructCoder Benchmark with Speculative Decoding
|
||||
|
||||
``` bash
|
||||
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
|
||||
--speculative-config $'{"method": "ngram",
|
||||
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
|
||||
"prompt_lookup_min": 2}'
|
||||
```
|
||||
|
||||
``` bash
|
||||
vllm bench serve \
|
||||
--model meta-llama/Meta-Llama-3-8B-Instruct \
|
||||
--dataset-name hf \
|
||||
--dataset-path likaixin/InstructCoder \
|
||||
--num-prompts 2048
|
||||
```
|
||||
|
||||
##### Spec Bench Benchmark with Speculative Decoding
|
||||
|
||||
``` bash
|
||||
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
|
||||
--speculative-config $'{"method": "ngram",
|
||||
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
|
||||
"prompt_lookup_min": 2}'
|
||||
```
|
||||
|
||||
[SpecBench dataset](https://github.com/hemingkx/Spec-Bench)
|
||||
|
||||
Run all categories:
|
||||
|
||||
``` bash
|
||||
# Download the dataset using:
|
||||
# wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl
|
||||
|
||||
vllm bench serve \
|
||||
--model meta-llama/Meta-Llama-3-8B-Instruct \
|
||||
--dataset-name spec_bench \
|
||||
--dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
|
||||
--num-prompts -1
|
||||
```
|
||||
|
||||
Available categories include `[writing, roleplay, reasoning, math, coding, extraction, stem, humanities, translation, summarization, qa, math_reasoning, rag]`.
|
||||
|
||||
Run only a specific category like "summarization":
|
||||
|
||||
``` bash
|
||||
vllm bench serve \
|
||||
--model meta-llama/Meta-Llama-3-8B-Instruct \
|
||||
--dataset-name spec_bench \
|
||||
--dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
|
||||
--num-prompts -1
|
||||
--spec-bench-category "summarization"
|
||||
```
|
||||
|
||||
##### Other HuggingFaceDataset Examples
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/Qwen2-VL-7B-Instruct
|
||||
```
|
||||
|
||||
`lmms-lab/LLaVA-OneVision-Data`:
|
||||
|
||||
```bash
|
||||
vllm bench serve \
|
||||
--backend openai-chat \
|
||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||
--endpoint /v1/chat/completions \
|
||||
--dataset-name hf \
|
||||
--dataset-path lmms-lab/LLaVA-OneVision-Data \
|
||||
--hf-split train \
|
||||
--hf-subset "chart2text(cauldron)" \
|
||||
--num-prompts 10
|
||||
```
|
||||
|
||||
`Aeala/ShareGPT_Vicuna_unfiltered`:
|
||||
|
||||
```bash
|
||||
vllm bench serve \
|
||||
--backend openai-chat \
|
||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||
--endpoint /v1/chat/completions \
|
||||
--dataset-name hf \
|
||||
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
|
||||
--hf-split train \
|
||||
--num-prompts 10
|
||||
```
|
||||
|
||||
`AI-MO/aimo-validation-aime`:
|
||||
|
||||
``` bash
|
||||
vllm bench serve \
|
||||
--model Qwen/QwQ-32B \
|
||||
--dataset-name hf \
|
||||
--dataset-path AI-MO/aimo-validation-aime \
|
||||
--num-prompts 10 \
|
||||
--seed 42
|
||||
```
|
||||
|
||||
`philschmid/mt-bench`:
|
||||
|
||||
``` bash
|
||||
vllm bench serve \
|
||||
--model Qwen/QwQ-32B \
|
||||
--dataset-name hf \
|
||||
--dataset-path philschmid/mt-bench \
|
||||
--num-prompts 80
|
||||
```
|
||||
|
||||
`vdaita/edit_5k_char` or `vdaita/edit_10k_char`:
|
||||
|
||||
``` bash
|
||||
vllm bench serve \
|
||||
--model Qwen/QwQ-32B \
|
||||
--dataset-name hf \
|
||||
--dataset-path vdaita/edit_5k_char \
|
||||
--num-prompts 90 \
|
||||
--blazedit-min-distance 0.01 \
|
||||
--blazedit-max-distance 0.99
|
||||
```
|
||||
|
||||
##### Running With Sampling Parameters
|
||||
|
||||
When using OpenAI-compatible backends such as `vllm`, optional sampling
|
||||
parameters can be specified. Example client command:
|
||||
|
||||
```bash
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--endpoint /v1/completions \
|
||||
--dataset-name sharegpt \
|
||||
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||
--top-k 10 \
|
||||
--top-p 0.9 \
|
||||
--temperature 0.5 \
|
||||
--num-prompts 10
|
||||
```
|
||||
|
||||
##### Running With Ramp-Up Request Rate
|
||||
|
||||
The benchmark tool also supports ramping up the request rate over the
|
||||
duration of the benchmark run. This can be useful for stress testing the
|
||||
server or finding the maximum throughput that it can handle, given some latency budget.
|
||||
|
||||
Two ramp-up strategies are supported:
|
||||
|
||||
- `linear`: Increases the request rate linearly from a start value to an end value.
|
||||
- `exponential`: Increases the request rate exponentially.
|
||||
|
||||
The following arguments can be used to control the ramp-up:
|
||||
|
||||
- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
|
||||
- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
|
||||
- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
|
||||
|
||||
</details>
|
||||
|
||||
#### 📈 Offline Throughput Benchmark
|
||||
|
||||
<details class="admonition abstract" markdown="1">
|
||||
<summary>Show more</summary>
|
||||
|
||||
```bash
|
||||
vllm bench throughput \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--dataset-name sonnet \
|
||||
--dataset-path vllm/benchmarks/sonnet.txt \
|
||||
--num-prompts 10
|
||||
```
|
||||
|
||||
If successful, you will see the following output
|
||||
|
||||
```text
|
||||
Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
|
||||
Total num prompt tokens: 5014
|
||||
Total num output tokens: 1500
|
||||
```
|
||||
|
||||
##### VisionArena Benchmark for Vision Language Models
|
||||
|
||||
```bash
|
||||
vllm bench throughput \
|
||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||
--backend vllm-chat \
|
||||
--dataset-name hf \
|
||||
--dataset-path lmarena-ai/VisionArena-Chat \
|
||||
--num-prompts 1000 \
|
||||
--hf-split train
|
||||
```
|
||||
|
||||
The `num prompt tokens` now includes image token counts
|
||||
|
||||
```text
|
||||
Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
|
||||
Total num prompt tokens: 14527
|
||||
Total num output tokens: 1280
|
||||
```
|
||||
|
||||
##### InstructCoder Benchmark with Speculative Decoding
|
||||
|
||||
``` bash
|
||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||
VLLM_USE_V1=1 \
|
||||
vllm bench throughput \
|
||||
--dataset-name=hf \
|
||||
--dataset-path=likaixin/InstructCoder \
|
||||
--model=meta-llama/Meta-Llama-3-8B-Instruct \
|
||||
--input-len=1000 \
|
||||
--output-len=100 \
|
||||
--num-prompts=2048 \
|
||||
--async-engine \
|
||||
--speculative-config $'{"method": "ngram",
|
||||
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
|
||||
"prompt_lookup_min": 2}'
|
||||
```
|
||||
|
||||
```text
|
||||
Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
|
||||
Total num prompt tokens: 261136
|
||||
Total num output tokens: 204800
|
||||
```
|
||||
|
||||
##### Other HuggingFaceDataset Examples
|
||||
|
||||
`lmms-lab/LLaVA-OneVision-Data`:
|
||||
|
||||
```bash
|
||||
vllm bench throughput \
|
||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||
--backend vllm-chat \
|
||||
--dataset-name hf \
|
||||
--dataset-path lmms-lab/LLaVA-OneVision-Data \
|
||||
--hf-split train \
|
||||
--hf-subset "chart2text(cauldron)" \
|
||||
--num-prompts 10
|
||||
```
|
||||
|
||||
`Aeala/ShareGPT_Vicuna_unfiltered`:
|
||||
|
||||
```bash
|
||||
vllm bench throughput \
|
||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||
--backend vllm-chat \
|
||||
--dataset-name hf \
|
||||
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
|
||||
--hf-split train \
|
||||
--num-prompts 10
|
||||
```
|
||||
|
||||
`AI-MO/aimo-validation-aime`:
|
||||
|
||||
```bash
|
||||
vllm bench throughput \
|
||||
--model Qwen/QwQ-32B \
|
||||
--backend vllm \
|
||||
--dataset-name hf \
|
||||
--dataset-path AI-MO/aimo-validation-aime \
|
||||
--hf-split train \
|
||||
--num-prompts 10
|
||||
```
|
||||
|
||||
Benchmark with LoRA adapters:
|
||||
|
||||
``` bash
|
||||
# download dataset
|
||||
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
vllm bench throughput \
|
||||
--model meta-llama/Llama-2-7b-hf \
|
||||
--backend vllm \
|
||||
--dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||
--dataset_name sharegpt \
|
||||
--num-prompts 10 \
|
||||
--max-loras 2 \
|
||||
--max-lora-rank 8 \
|
||||
--enable-lora \
|
||||
--lora-path yard1/llama-2-7b-sql-lora-test
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
#### 🛠️ Structured Output Benchmark
|
||||
|
||||
<details class="admonition abstract" markdown="1">
|
||||
<summary>Show more</summary>
|
||||
|
||||
Benchmark the performance of structured output generation (JSON, grammar, regex).
|
||||
|
||||
##### Server Setup
|
||||
|
||||
```bash
|
||||
vllm serve NousResearch/Hermes-3-Llama-3.1-8B
|
||||
```
|
||||
|
||||
##### JSON Schema Benchmark
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_serving_structured_output.py \
|
||||
--backend vllm \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--dataset json \
|
||||
--structured-output-ratio 1.0 \
|
||||
--request-rate 10 \
|
||||
--num-prompts 1000
|
||||
```
|
||||
|
||||
##### Grammar-based Generation Benchmark
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_serving_structured_output.py \
|
||||
--backend vllm \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--dataset grammar \
|
||||
--structure-type grammar \
|
||||
--request-rate 10 \
|
||||
--num-prompts 1000
|
||||
```
|
||||
|
||||
##### Regex-based Generation Benchmark
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_serving_structured_output.py \
|
||||
--backend vllm \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--dataset regex \
|
||||
--request-rate 10 \
|
||||
--num-prompts 1000
|
||||
```
|
||||
|
||||
##### Choice-based Generation Benchmark
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_serving_structured_output.py \
|
||||
--backend vllm \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--dataset choice \
|
||||
--request-rate 10 \
|
||||
--num-prompts 1000
|
||||
```
|
||||
|
||||
##### XGrammar Benchmark Dataset
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_serving_structured_output.py \
|
||||
--backend vllm \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--dataset xgrammar_bench \
|
||||
--request-rate 10 \
|
||||
--num-prompts 1000
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
#### 📚 Long Document QA Benchmark
|
||||
|
||||
<details class="admonition abstract" markdown="1">
|
||||
<summary>Show more</summary>
|
||||
|
||||
Benchmark the performance of long document question-answering with prefix caching.
|
||||
|
||||
##### Basic Long Document QA Test
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_long_document_qa_throughput.py \
|
||||
--model meta-llama/Llama-2-7b-chat-hf \
|
||||
--enable-prefix-caching \
|
||||
--num-documents 16 \
|
||||
--document-length 2000 \
|
||||
--output-len 50 \
|
||||
--repeat-count 5
|
||||
```
|
||||
|
||||
##### Different Repeat Modes
|
||||
|
||||
```bash
|
||||
# Random mode (default) - shuffle prompts randomly
|
||||
python3 benchmarks/benchmark_long_document_qa_throughput.py \
|
||||
--model meta-llama/Llama-2-7b-chat-hf \
|
||||
--enable-prefix-caching \
|
||||
--num-documents 8 \
|
||||
--document-length 3000 \
|
||||
--repeat-count 3 \
|
||||
--repeat-mode random
|
||||
|
||||
# Tile mode - repeat entire prompt list in sequence
|
||||
python3 benchmarks/benchmark_long_document_qa_throughput.py \
|
||||
--model meta-llama/Llama-2-7b-chat-hf \
|
||||
--enable-prefix-caching \
|
||||
--num-documents 8 \
|
||||
--document-length 3000 \
|
||||
--repeat-count 3 \
|
||||
--repeat-mode tile
|
||||
|
||||
# Interleave mode - repeat each prompt consecutively
|
||||
python3 benchmarks/benchmark_long_document_qa_throughput.py \
|
||||
--model meta-llama/Llama-2-7b-chat-hf \
|
||||
--enable-prefix-caching \
|
||||
--num-documents 8 \
|
||||
--document-length 3000 \
|
||||
--repeat-count 3 \
|
||||
--repeat-mode interleave
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
#### 🗂️ Prefix Caching Benchmark
|
||||
|
||||
<details class="admonition abstract" markdown="1">
|
||||
<summary>Show more</summary>
|
||||
|
||||
Benchmark the efficiency of automatic prefix caching.
|
||||
|
||||
##### Fixed Prompt with Prefix Caching
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_prefix_caching.py \
|
||||
--model meta-llama/Llama-2-7b-chat-hf \
|
||||
--enable-prefix-caching \
|
||||
--num-prompts 1 \
|
||||
--repeat-count 100 \
|
||||
--input-length-range 128:256
|
||||
```
|
||||
|
||||
##### ShareGPT Dataset with Prefix Caching
|
||||
|
||||
```bash
|
||||
# download dataset
|
||||
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
|
||||
python3 benchmarks/benchmark_prefix_caching.py \
|
||||
--model meta-llama/Llama-2-7b-chat-hf \
|
||||
--dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||
--enable-prefix-caching \
|
||||
--num-prompts 20 \
|
||||
--repeat-count 5 \
|
||||
--input-length-range 128:256
|
||||
```
|
||||
|
||||
##### Prefix Repetition Dataset
|
||||
|
||||
```bash
|
||||
vllm bench serve \
|
||||
--backend openai \
|
||||
--model meta-llama/Llama-2-7b-chat-hf \
|
||||
--dataset-name prefix_repetition \
|
||||
--num-prompts 100 \
|
||||
--prefix-repetition-prefix-len 512 \
|
||||
--prefix-repetition-suffix-len 128 \
|
||||
--prefix-repetition-num-prefixes 5 \
|
||||
--prefix-repetition-output-len 128
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
#### ⚡ Request Prioritization Benchmark
|
||||
|
||||
<details class="admonition abstract" markdown="1">
|
||||
<summary>Show more</summary>
|
||||
|
||||
Benchmark the performance of request prioritization in vLLM.
|
||||
|
||||
##### Basic Prioritization Test
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_prioritization.py \
|
||||
--model meta-llama/Llama-2-7b-chat-hf \
|
||||
--input-len 128 \
|
||||
--output-len 64 \
|
||||
--num-prompts 100 \
|
||||
--scheduling-policy priority
|
||||
```
|
||||
|
||||
##### Multiple Sequences per Prompt
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_prioritization.py \
|
||||
--model meta-llama/Llama-2-7b-chat-hf \
|
||||
--input-len 128 \
|
||||
--output-len 64 \
|
||||
--num-prompts 100 \
|
||||
--scheduling-policy priority \
|
||||
--n 2
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
#### 👁️ Multi-Modal Benchmark
|
||||
|
||||
<details class="admonition abstract" markdown="1">
|
||||
<summary>Show more</summary>
|
||||
|
||||
Benchmark the performance of multi-modal requests in vLLM.
|
||||
|
||||
##### Images (ShareGPT4V)
|
||||
|
||||
Start vLLM:
|
||||
|
||||
```bash
|
||||
python -m vllm.entrypoints.openai.api_server \
|
||||
--model Qwen/Qwen2.5-VL-7B-Instruct \
|
||||
--dtype bfloat16 \
|
||||
--limit-mm-per-prompt '{"image": 1}' \
|
||||
--allowed-local-media-path /path/to/sharegpt4v/images
|
||||
```
|
||||
|
||||
Send requests with images:
|
||||
|
||||
```bash
|
||||
vllm bench serve \
|
||||
--backend openai-chat \
|
||||
--model Qwen/Qwen2.5-VL-7B-Instruct \
|
||||
--dataset-name sharegpt \
|
||||
--dataset-path /path/to/ShareGPT4V/sharegpt4v_instruct_gpt4-vision_cap100k.json \
|
||||
--num-prompts 100 \
|
||||
--save-result \
|
||||
--result-dir ~/vllm_benchmark_results \
|
||||
--save-detailed \
|
||||
--endpoint /v1/chat/completion
|
||||
```
|
||||
|
||||
##### Videos (ShareGPT4Video)
|
||||
|
||||
Start vLLM:
|
||||
|
||||
```bash
|
||||
python -m vllm.entrypoints.openai.api_server \
|
||||
--model Qwen/Qwen2.5-VL-7B-Instruct \
|
||||
--dtype bfloat16 \
|
||||
--limit-mm-per-prompt '{"video": 1}' \
|
||||
--allowed-local-media-path /path/to/sharegpt4video/videos
|
||||
```
|
||||
|
||||
Send requests with videos:
|
||||
|
||||
```bash
|
||||
vllm bench serve \
|
||||
--backend openai-chat \
|
||||
--model Qwen/Qwen2.5-VL-7B-Instruct \
|
||||
--dataset-name sharegpt \
|
||||
--dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \
|
||||
--num-prompts 100 \
|
||||
--save-result \
|
||||
--result-dir ~/vllm_benchmark_results \
|
||||
--save-detailed \
|
||||
--endpoint /v1/chat/completion
|
||||
```
|
||||
|
||||
##### Synthetic Random Images (random-mm)
|
||||
|
||||
Generate synthetic image inputs alongside random text prompts to stress-test vision models without external datasets.
|
||||
|
||||
Notes:
|
||||
|
||||
- Works only with online benchmark via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
|
||||
- Video sampling is not yet implemented.
|
||||
|
||||
Start the server (example):
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
|
||||
--dtype bfloat16 \
|
||||
--max-model-len 16384 \
|
||||
--limit-mm-per-prompt '{"image": 3, "video": 0}' \
|
||||
--mm-processor-kwargs max_pixels=1003520
|
||||
```
|
||||
|
||||
Benchmark. It is recommended to use the flag `--ignore-eos` to simulate real responses. You can set the size of the output via the arg `random-output-len`.
|
||||
|
||||
Ex.1: Fixed number of items and a single image resolution, enforcing generation of approx 40 tokens:
|
||||
|
||||
```bash
|
||||
vllm bench serve \
|
||||
--backend openai-chat \
|
||||
--model Qwen/Qwen2.5-VL-3B-Instruct \
|
||||
--endpoint /v1/chat/completions \
|
||||
--dataset-name random-mm \
|
||||
--num-prompts 100 \
|
||||
--max-concurrency 10 \
|
||||
--random-prefix-len 25 \
|
||||
--random-input-len 300 \
|
||||
--random-output-len 40 \
|
||||
--random-range-ratio 0.2 \
|
||||
--random-mm-base-items-per-request 2 \
|
||||
--random-mm-limit-mm-per-prompt '{"image": 3, "video": 0}' \
|
||||
--random-mm-bucket-config '{(224, 224, 1): 1.0}' \
|
||||
--request-rate inf \
|
||||
--ignore-eos \
|
||||
--seed 42
|
||||
```
|
||||
|
||||
The number of items per request can be controlled by passing multiple image buckets:
|
||||
|
||||
```bash
|
||||
--random-mm-base-items-per-request 2 \
|
||||
--random-mm-num-mm-items-range-ratio 0.5 \
|
||||
--random-mm-limit-mm-per-prompt '{"image": 4, "video": 0}' \
|
||||
--random-mm-bucket-config '{(256, 256, 1): 0.7, (720, 1280, 1): 0.3}' \
|
||||
```
|
||||
|
||||
Flags specific to `random-mm`:
|
||||
|
||||
- `--random-mm-base-items-per-request`: base number of multimodal items per request.
|
||||
- `--random-mm-num-mm-items-range-ratio`: vary item count uniformly in the closed integer range [floor(n·(1−r)), ceil(n·(1+r))]. Set r=0 to keep it fixed; r=1 allows 0 items.
|
||||
- `--random-mm-limit-mm-per-prompt`: per-modality hard caps, e.g. '{"image": 3, "video": 0}'.
|
||||
- `--random-mm-bucket-config`: dict mapping (H, W, T) → probability. Entries with probability 0 are removed; remaining probabilities are renormalized to sum to 1. Use T=1 for images. Set any T>1 for videos (video sampling not yet supported).
|
||||
|
||||
Behavioral notes:
|
||||
|
||||
- If the requested base item count cannot be satisfied under the provided per-prompt limits, the tool raises an error rather than silently clamping.
|
||||
|
||||
How sampling works:
|
||||
|
||||
- Determine per-request item count k by sampling uniformly from the integer range defined by `--random-mm-base-items-per-request` and `--random-mm-num-mm-items-range-ratio`, then clamp k to at most the sum of per-modality limits.
|
||||
- For each of the k items, sample a bucket (H, W, T) according to the normalized probabilities in `--random-mm-bucket-config`, while tracking how many items of each modality have been added.
|
||||
- If a modality (e.g., image) reaches its limit from `--random-mm-limit-mm-per-prompt`, all buckets of that modality are excluded and the remaining bucket probabilities are renormalized before continuing.
|
||||
This should be seen as an edge case, and if this behavior can be avoided by setting `--random-mm-limit-mm-per-prompt` to a large number. Note that this might result in errors due to engine config `--limit-mm-per-prompt`.
|
||||
- The resulting request contains synthetic image data in `multi_modal_data` (OpenAI Chat format). When `random-mm` is used with the OpenAI Chat backend, prompts remain text and MM content is attached via `multi_modal_data`.
|
||||
|
||||
</details>
|
||||
|
||||
[](){ #performance-benchmarks }
|
||||
|
||||
@ -13,22 +791,22 @@ The performance benchmarks are used for development to confirm whether new chang
|
||||
|
||||
### Manually Trigger the benchmark
|
||||
|
||||
Use [vllm-ci-test-repo images](https://gallery.ecr.aws/q9t5s3a7/vllm-ci-test-repo) with vLLM benchmark suite.
|
||||
Use [vllm-ci-test-repo images](https://gallery.ecr.aws/q9t5s3a7/vllm-ci-test-repo) with vLLM benchmark suite.
|
||||
For CPU environment, please use the image with "-cpu" postfix.
|
||||
|
||||
Here is an example for docker run command for CPU.
|
||||
Here is an example for docker run command for CPU.
|
||||
|
||||
```bash
|
||||
docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface -e HF_TOKEN='' --shm-size=16g --name vllm-cpu-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:1da94e673c257373280026f75ceb4effac80e892-cpu
|
||||
```
|
||||
|
||||
Then, run below command inside the docker instance.
|
||||
Then, run below command inside the docker instance.
|
||||
|
||||
```bash
|
||||
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
||||
```
|
||||
|
||||
When run, benchmark script generates results under **benchmark/results** folder, along with the benchmark_results.md and benchmark_results.json.
|
||||
When run, benchmark script generates results under **benchmark/results** folder, along with the benchmark_results.md and benchmark_results.json.
|
||||
|
||||
#### Runtime environment variables
|
||||
|
||||
|
||||
@ -40,6 +40,16 @@ python tools/generate_cmake_presets.py
|
||||
|
||||
The script will prompt you if it cannot automatically determine certain paths (e.g., `nvcc` or a specific Python executable for your vLLM development environment). Follow the on-screen prompts. If an existing `CMakeUserPresets.json` is found, the script will ask for confirmation before overwriting it.
|
||||
|
||||
**Force overwrite existing file:**
|
||||
|
||||
To automatically overwrite an existing `CMakeUserPresets.json` without prompting, use the `--force-overwrite` flag:
|
||||
|
||||
```console
|
||||
python tools/generate_cmake_presets.py --force-overwrite
|
||||
```
|
||||
|
||||
This is particularly useful in automated scripts or CI/CD environments where interactive prompts are not desired.
|
||||
|
||||
After running the script, a `CMakeUserPresets.json` file will be created in the root of your vLLM repository.
|
||||
|
||||
### Example `CMakeUserPresets.json`
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
!!! important
|
||||
Many decoder language models can now be automatically loaded using the [Transformers backend][transformers-backend] without having to implement them in vLLM. See if `vllm serve <model>` works first!
|
||||
|
||||
vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features](../../features/compatibility_matrix.md) to optimize their performance.
|
||||
vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features](../../features/README.md#compatibility-matrix) to optimize their performance.
|
||||
|
||||
The complexity of integrating a model into vLLM depends heavily on the model's architecture.
|
||||
The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM.
|
||||
|
||||
559
docs/design/logits_processors.md
Normal file
559
docs/design/logits_processors.md
Normal file
@ -0,0 +1,559 @@
|
||||
# Logits Processors
|
||||
|
||||
!!! important
|
||||
Some logits processors design changes are still in progress and the API may
|
||||
change in the near future. We hope to stabilize this part of the API soon
|
||||
|
||||
This document describes how the vLLM engine interacts with logits processors, and the programming model which vLLM supports for implementing logits processors.
|
||||
|
||||
## Logits Processors Background
|
||||
|
||||
A logits processor adjusts the next-token probability distribution, usually with the intention of steering the model towards a desired type of behavior.
|
||||
|
||||
In vLLM, logits processors operate at batch granularity. During a given engine step, the logits processor consumes a `(num_requests) x (vocab_size)` tensor of raw logits output by the model. For all requests which enable the logits processor, the logits processor applies a transformation to the corresponding row of the logits tensor, while leaving other rows unmodified. The transformed logits tensor is then passed to softmax.
|
||||
|
||||
## Logits Processors in the vLLM engine
|
||||
|
||||
The vLLM engine's persistent batch data structure maintains a list of loaded logits processors.
|
||||
|
||||
In order to operate on the entire batch at once, each logits processor may maintain metadata about the requests in the batch (i.e. each request's logits-processor-specific configuration settings). Therefore, logits processors are stateful.
|
||||
|
||||
In each engine step, the vLLM engine will (1) update each logits processor's internal state and (2) apply logits processors to the model output logits.
|
||||
|
||||
### Updating Logits Processor Internal State
|
||||
|
||||
At the beginning of each engine step, the persistent batch may add, discard and/or reorder requests in response to the scheduler output. After the persistent batch has reorganized, the vLLM engine invokes each logits processor's `update_state()` method. This is necessary to ensure that logits processors' internal states are reorganized to match the new persistent batch state at the beginning of the engine step.
|
||||
|
||||
The pseudocode below shows the process by which the vLLM persistent batch notifies each logits processor of changes in batch state:
|
||||
|
||||
??? code "Model Runner Updates Logits Processor States"
|
||||
|
||||
``` python
|
||||
# gpu_model_runner.py
|
||||
|
||||
class GPUModelRunner(...):
|
||||
|
||||
...
|
||||
|
||||
def execute_model(self, scheduler_output, ...):
|
||||
self._update_states(scheduler_output)
|
||||
|
||||
...
|
||||
|
||||
def _update_states(...):
|
||||
|
||||
...
|
||||
|
||||
# ...update persistent batch to reflect new/finished requests & reordering
|
||||
# of requests within batch...
|
||||
|
||||
...
|
||||
|
||||
self.input_batch.refresh_metadata()
|
||||
|
||||
|
||||
# gpu_input_batch.py
|
||||
|
||||
class InputBatch:
|
||||
|
||||
...
|
||||
|
||||
def refresh_metadata(self):
|
||||
|
||||
...
|
||||
|
||||
# Update each logits processor's state to reflect persistent batch state
|
||||
batch_update = self.batch_update_builder.get_and_reset(self.num_reqs)
|
||||
for logit_proc in self.logitsprocs.all:
|
||||
logit_proc.update_state(batch_update)
|
||||
|
||||
...
|
||||
|
||||
|
||||
# vllm/v1/sample/logits_processor/interface.py
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class BatchUpdate:
|
||||
# Batch state-change data structure which is passed to logits processors'
|
||||
# update_state() methods
|
||||
|
||||
batch_size: int
|
||||
|
||||
removed: Sequence[RemovedRequest]
|
||||
added: Sequence[AddedRequest]
|
||||
moved: Sequence[MovedRequest]
|
||||
|
||||
```
|
||||
|
||||
### Applying Logits Processors to the Model Output Logits
|
||||
|
||||
After updating persistent batch state, the vLLM model runner performs model inference to obtain logits. Then, the model runner invokes the sampler against the logits. In turn, part of the sampler's operation is to invoke the logits processors' `apply()` methods against the model output logit processors, yielding transformed logits (the `apply()` methods may modify the logits in-place or out-of-place, although in-place is more memory-efficient). This process is shown in the pseudocode below.
|
||||
|
||||
Note that the sampler will access the logits processors via `SamplingMetadata.logitsprocs`. When the vLLM engine constructs `SamplingMetadata` (not shown in the code below), the reference to the list of logits processors is passed from the persistent batch data structure to `SamplingMetadata`.
|
||||
|
||||
??? code "Apply logits processors to model output logits"
|
||||
|
||||
``` python
|
||||
# gpu_model_runner.py
|
||||
|
||||
class GPUModelRunner(...):
|
||||
|
||||
...
|
||||
|
||||
def execute_model(self, scheduler_output, ...):
|
||||
# (discussed in previous section)
|
||||
self._update_states(scheduler_output)
|
||||
|
||||
...
|
||||
|
||||
# ...run model inference to obtain logits...
|
||||
|
||||
...
|
||||
|
||||
# Invoke sampler, which applies logits processors
|
||||
sampler_output = self.sampler(logits=logits,
|
||||
sampling_metadata=sampling_metadata)
|
||||
|
||||
...
|
||||
|
||||
|
||||
# sampler.py
|
||||
|
||||
class Sampler(nn.Module):
|
||||
|
||||
...
|
||||
|
||||
def forward(self, logits, sampling_metadata):
|
||||
|
||||
...
|
||||
|
||||
# Apply non-argmax-invariant logits processors to model output logits
|
||||
for processor in (sampling_metadata.logitsprocs.non_argmax_invariant):
|
||||
logits = processor.apply(logits)
|
||||
|
||||
sampled = self.sample(logits, sampling_metadata)
|
||||
|
||||
...
|
||||
|
||||
# ...return sampler output data structure...
|
||||
|
||||
|
||||
def sample(self, logits, sampling_metadta)
|
||||
|
||||
...
|
||||
|
||||
# ...exit early if all requests are greedy-sampling...
|
||||
|
||||
...
|
||||
|
||||
# Apply argmax-invariant logits processors
|
||||
for processor in sampling_metadata.logitsprocs.argmax_invariant:
|
||||
logits = processor.apply(logits)
|
||||
|
||||
...
|
||||
|
||||
# ...perform sampling and return sampling result...
|
||||
```
|
||||
|
||||
At sampling time, the sampler checks whether all requests in the persistent batch employ greedy sampling. If that is the case, the sampler saves compute by skipping "argmax-invariant" logits processors. Here, "argmax" is shorthand for the token ID with the highest logit value in a given row of the logits tensor (i.e. the token which the model weighted the highest for a given request).
|
||||
|
||||
* An **argmax-invariant logits processor** is a logits processor (such as Min-P) which does not modify the argmax. For example, a logits processor which masks out the lowest-probability tokens will not change which token ID has the max logit. Greedy sampling always picks the highest-logit-value token ID, and so conceptually an argmax-invariant logits processor can be skipped for greedy sampling requests.
|
||||
|
||||
* A **non-argmax-invariant logits processor** is a logits processor which may modify the argmax. For example, a logits processor which masks all tokens except for EOS after a certain number of steps in order to force decoding to terminate might end up masking the max-logit-value token and therefore change the argmax. Conceptually, these logits processors cannot be skipped for greedy sampling requests.
|
||||
|
||||
The vLLM logits processor abstraction requires the engine to apply logits processors at batch granularity; therefore in practice the argmax-invariant logits processors can only be skipped when the entire batch uses greedy sampling.
|
||||
|
||||
## Logits Processor Programming Model
|
||||
|
||||
The previous sections alluded to the interfaces which vLLM logits processors must support. This section introduces in full the programming model for implementing logits processors that are compatible with the vLLM engine, including the `LogitsProcessor` base class and its interface methods as well as the `BatchUpdate` data structure for representing persistent batch state changes, both of which are shown in the code below:
|
||||
|
||||
??? code "`LogitsProcessor` base class and `BatchUpdate` data structure"
|
||||
|
||||
``` python
|
||||
from abc import ABC, abstractmethod
|
||||
from collections.abc import Sequence
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum, auto
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
import torch
|
||||
|
||||
from vllm import SamplingParams
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import VllmConfig
|
||||
|
||||
|
||||
class MoveDirectionality(Enum):
|
||||
# One-way i1->i2 req move within batch
|
||||
UNIDIRECTIONAL = auto()
|
||||
# Two-way i1<->i2 req swap within batch
|
||||
SWAP = auto()
|
||||
|
||||
|
||||
# (index, params, prompt_tok_ids, output_tok_ids) tuples for new
|
||||
# requests added to the batch.
|
||||
AddedRequest = tuple[int, SamplingParams, list[int], list[int]]
|
||||
|
||||
# (index 1, index 2, directionality) tuples representing
|
||||
# one-way moves or two-way swaps of requests in batch
|
||||
MovedRequest = tuple[int, int, MoveDirectionality]
|
||||
|
||||
# Batch indices of any removed requests.
|
||||
RemovedRequest = int
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class BatchUpdate:
|
||||
"""Persistent batch state change info for logitsprocs"""
|
||||
batch_size: int # Current num reqs in batch
|
||||
|
||||
# Metadata for requests added to, removed from, and moved
|
||||
# within the persistent batch.
|
||||
#
|
||||
# Key assumption: the `output_tok_ids` list (which is an element of each
|
||||
# tuple in `added`) is a reference to the request's running output tokens
|
||||
# list; via this reference, the logits processors always see the latest
|
||||
# list of generated output tokens
|
||||
removed: Sequence[RemovedRequest]
|
||||
moved: Sequence[MovedRequest]
|
||||
added: Sequence[AddedRequest]
|
||||
|
||||
|
||||
class LogitsProcessor(ABC):
|
||||
|
||||
@abstractmethod
|
||||
def __init__(self, vllm_config: "VllmConfig", device: torch.device,
|
||||
is_pin_memory: bool) -> None:
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def apply(self, logits: torch.Tensor) -> torch.Tensor:
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def is_argmax_invariant(self) -> bool:
|
||||
"""True if logits processor has no impact on the
|
||||
argmax computation in greedy sampling.
|
||||
NOTE: may or may not have the same value for all
|
||||
instances of a given LogitsProcessor subclass,
|
||||
depending on subclass implementation.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def update_state(
|
||||
self,
|
||||
batch_update: Optional["BatchUpdate"],
|
||||
) -> None:
|
||||
"""Called when there are new output tokens, prior
|
||||
to each forward pass.
|
||||
|
||||
Args:
|
||||
batch_update is non-None iff there have been
|
||||
changes to the batch makeup.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
```
|
||||
|
||||
A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum) the following methods:
|
||||
|
||||
* `__init__(self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool)`
|
||||
* `vllm_config`: engine configuration data structure
|
||||
* `device`: hardware accelerator device info
|
||||
* `is_pin_memory`: flag indicating whether pin memory is available to support logits processor implementation
|
||||
|
||||
* `apply(self, logits: torch.Tensor) -> torch.Tensor`:
|
||||
* Consume a `(num_requests) x (vocab_size)` logits tensor (`logits`)
|
||||
* Apply logits processor transformation at batch granularity
|
||||
* Return a transformed `(num_requests) x (vocab_size)` logits tensor
|
||||
* You can modify the input logits processors in-place or out-of-place; in-place is more memory-efficient
|
||||
|
||||
* `is_argmax_invariant(self) -> bool`:
|
||||
* Return `True` if the logits processor is argmax invariant (never changes what is the highest-logit-value token ID for a given request), `False` if the logits processor may modify argmax
|
||||
* `is_argmax_invariant()` is evaluated once at startup; if `True`, vLLM will skip applying this logits processor in a given step when all requests use greedy sampling
|
||||
|
||||
* `update_state(self, batch_update: Optional["BatchUpdate"]) -> None`:
|
||||
* Consume a `BatchUpdate` data structure representing persistent batch state changes at the beginning of the current engine step
|
||||
* Use the `BatchUpdate` members to update logits processor internal state
|
||||
* **Note:** batch update data structure may be `None`, signaling no change to the batch constituents. In this case, the LogitsProcessor might still want to update its state based on the updated `output_token_ids` lists that it could have retained when they were added.
|
||||
|
||||
### `BatchUpdate` data structure
|
||||
|
||||
The `BatchUpdate` abstraction models the persistent batch as a list of requests, supporting the following operations to change batch state (note that the order in which the operations are mentioned below reflects the order in which they should be processed in `update_state()`):
|
||||
|
||||
* **Remove:** remove (without replacement) request at index `i`
|
||||
|
||||
* A Remove is represented in `Batchupdate.removed` by an `int` (representing `i`)
|
||||
|
||||
* Effect of remove-at-index on batch:
|
||||
|
||||
``` text
|
||||
Batch: [A,B,C]
|
||||
Remove @ i: 1
|
||||
|
||||
=>
|
||||
|
||||
New Batch: [A,x,C] # Discard B and leave an empty slot
|
||||
```
|
||||
|
||||
* **Add:** add (or replace existing request with) a new request at index `i`. If a request is replaced, its associated state should be discarded.
|
||||
|
||||
* An Add is represented in `Batchupdate.added` as a tuple of
|
||||
|
||||
``` text
|
||||
(index, new request SamplingParams, prompt token ids, output token ids)
|
||||
```
|
||||
|
||||
* `prompt token ids` and `output token ids` are references to the request's prompt token ids and output token ids lists, respectively. Note that the output token ids list grows with each engine step, and this growth is visible to the logits processor because output token ids are passed by reference. **This is important for LogitsProcessors that take into account the tokens generated so far**.
|
||||
|
||||
* The implementation of the particular logits processor subclass determines whether or how the fields in the added request tuple are digested into an internal representation. For example, a logits processor that does not utilize prompt or output token ids may only need to utilize `index` and `SamplingParams` and discard the other tuple fields
|
||||
|
||||
* If index `i` currently holds a request, a replacement occurs:
|
||||
|
||||
``` text
|
||||
Batch: [A,B,C]
|
||||
New request to be added @ i: D @ 1
|
||||
|
||||
=>
|
||||
|
||||
New Batch: [A,D,C] # Add D, discard B
|
||||
```
|
||||
|
||||
* If index `i` does not currently hold a request (because `i` is out of bounds of the current batch size):
|
||||
|
||||
``` text
|
||||
Batch: [A,B,C]
|
||||
New request to be added @ i: D @ 3
|
||||
|
||||
=>
|
||||
|
||||
New Batch: [A,B,C,D] # Add D, extending batch
|
||||
```
|
||||
|
||||
* **Move:** move request at index `s` to index `d` OR swap requests at indices `s` and `d`
|
||||
|
||||
* A Move is represented in `Batchupdate.moved` as a tuple of
|
||||
|
||||
``` text
|
||||
(s, d, UNIDIRECTIONAL or SWAP)
|
||||
```
|
||||
|
||||
* If the Move specifies `UNIDRECTIONAL`:
|
||||
|
||||
* The request at index `s` is moved to index `d`; index `s` becomes an empty slot
|
||||
|
||||
``` text
|
||||
Batch: [A,x,C,D]
|
||||
Unidirectionally Move s -> d: 3 -> 1
|
||||
|
||||
=>
|
||||
|
||||
New Batch: [A,D,C,x] # Move D to 1, leaving empty slot at 3
|
||||
```
|
||||
|
||||
* If another request already resided at index `d`, it is replaced and discarded
|
||||
|
||||
``` text
|
||||
Batch: [A,B,C,D]
|
||||
Unidirectionally Move s -> d: 3 -> 1
|
||||
|
||||
=>
|
||||
|
||||
New Batch: [A,D,C,x] # Move D to 1, discarding B and leaving empty slot at 3
|
||||
```
|
||||
|
||||
* If the Move specifies `SWAP`, the requests at `s` and `d` exchange indices
|
||||
|
||||
``` text
|
||||
Batch: [A,B,C,D]
|
||||
Swap Move s <-> d: 3 <-> 1
|
||||
|
||||
=>
|
||||
|
||||
New Batch: [A,D,C,B] # Swap B and D
|
||||
```
|
||||
|
||||
Additionally, the `BatchUpdate` data structure includes a representation (`batch_size`) of the size of the persistent batch at the beginning of the engine step.
|
||||
|
||||
### How the vLLM engine builds the `BatchUpdate` data structure
|
||||
|
||||
Logits processor `update_state()` implementations should assume the following model for how the model runner updates persistent batch state (expressed here in terms of the `BatchUpdate` abstraction):
|
||||
|
||||
1. Identify indices of requests which finished in the current engine step
|
||||
|
||||
2. Identify new requests introduced in the current step
|
||||
|
||||
3. Use Add operations to replace as many finished requests with new requests, in order of increasing index of the replaced request starting with the lowest index
|
||||
|
||||
4. Based on the relative number of new and finished requests:
|
||||
|
||||
1. If the numbers of new and finished requests are the same, proceed to next step
|
||||
|
||||
2. *If there are more new requests than finished requests:* apply Add operations to extend the batch with the remaining new requests which did not replace finished requests. Assign consecutive indices to these new requests, starting with `current_max_batch_index + 1`
|
||||
|
||||
3. *If there are fewer new requests than finished requests:*
|
||||
|
||||
* Apply Remove operations to finished requests which were not replaced with new requests. These removed request indices will necessarily be greater than the greatest index of the finished requests which were replaced in the previous step. The Removes may leave the batch in a non-contiguous state
|
||||
|
||||
* **"Condense" the batch to be contiguous:** starting with the lowest-index empty slot (which was caused by a Remove), apply a Unidirectional Move from the current highest non-empty slot in the batch to fill the empty slot. Proceed with additional Unidirectional Move operations in order of increasing empty slot destination index and decreasing non-empty slot source index until the batch is contiguous
|
||||
|
||||
* **Shrink the batch:** a side-effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots
|
||||
|
||||
5. Reorder the batch for improved efficiency. Depending on the attention backend implementation and the current characteristics of the batch, zero or more Swap Move operations may be applied to reorder the batch
|
||||
|
||||
Notes:
|
||||
|
||||
* A logits processor `update_state()` method must process batch update operations in the following order: removes, adds, moves
|
||||
|
||||
* The index argument for Add operations refers to the index *at the time the Add occurred*, i.e. before any Move operations
|
||||
* Example: if a request is Added at index 5 and then swapped with index 3, the Add operation in `BatchUpdate.added` will be associated with index 5 not 3
|
||||
* In other words Move operations can be assumed to be applied after Adds and Removes
|
||||
|
||||
* Move operations can be assumed to be applied in the order in which they appear in `BatchUpdate.moved`
|
||||
|
||||
* If there are no new/finished requests and there is no batch reordering, then the batch update for the logits processors will be `None`
|
||||
|
||||
#### Example: Batch Update with Fewer New Requests Than Finished Requests
|
||||
|
||||
The following example models an engine step where 1 new request is introduced and 2 finished requests are eliminated, additionally the attention backend performs a swap to optimize the batch ordering.
|
||||
|
||||
``` text
|
||||
Batch state (beginning of engine step): [A,B,C,D]
|
||||
Batch size: 4
|
||||
|
||||
New requests: E
|
||||
|
||||
Finished requests: A, C
|
||||
|
||||
Processing steps (using BatchUpdate abstraction):
|
||||
|
||||
1. Add E at index 0
|
||||
|
||||
[E,B,C,D] # Discard A
|
||||
Batch size: 4
|
||||
|
||||
2. Remove at index 2
|
||||
|
||||
[E,B,x,D] # Discard C, empty slot at index 2
|
||||
Batch size: 4
|
||||
|
||||
3. Condense batch with a Unidirectional Move 3 -> 2 operation and shrink batch
|
||||
|
||||
[E,B,D] x # Empty slot is now outside batch
|
||||
Batch size: 3
|
||||
|
||||
4. Attention backend optimization: reorder batch with Swap 0 <-> 1
|
||||
|
||||
[B,E,D]
|
||||
Batch size: 3
|
||||
|
||||
```
|
||||
|
||||
The resulting `BatchUpdate` data structure will look like
|
||||
|
||||
``` text
|
||||
BatchUpdate instance
|
||||
* added: [(0,E's SamplingParams,E's prompt tokens ref,E's output tokens ref)]
|
||||
* removed: [2] # request C was removed without replacement
|
||||
* moved: [(3,2,UNIDIRECTIONAL),(0,1,SWAP)]
|
||||
```
|
||||
|
||||
#### Example: Batch Update with More New Requests Than Finished Requests
|
||||
|
||||
The following example models an engine step where 2 new requests are introduced and 1 finished request is eliminated, additionally the attention backend performs a swap to optimize the batch ordering.
|
||||
|
||||
``` text
|
||||
Batch state (beginning of engine step): [A,B,C,D]
|
||||
Batch size: 4
|
||||
|
||||
New requests: E,F
|
||||
|
||||
Finished requests: C
|
||||
|
||||
Processing steps (using BatchUpdate abstraction):
|
||||
|
||||
1. Add E at index 2
|
||||
|
||||
[A,B,E,D] # Discard C
|
||||
Batch size: 4
|
||||
|
||||
2. Add F at index 4 (current max batch index + 1)
|
||||
|
||||
[A,B,E,D,F] # Extend batch by 1
|
||||
Batch size: 5
|
||||
|
||||
4. Attention backend optimization: reorder batch with Swap 0 <-> 1
|
||||
|
||||
[B,A,E,D,F]
|
||||
Batch size: 5
|
||||
|
||||
```
|
||||
|
||||
Note that batch condensation is skipped because there are no empty slots left behind by Remove operations.
|
||||
|
||||
The resulting `BatchUpdate` data structure will look like
|
||||
|
||||
``` text
|
||||
BatchUpdate instance
|
||||
* added: [(2,E's SamplingParams,E's prompt tokens ref,E's output tokens ref),(4,F's SamplingParams,F's prompt tokens ref,F's output tokens ref)]
|
||||
* removed: [] # no requests were removed without replacement
|
||||
* moved: [(0,1,SWAP)]
|
||||
```
|
||||
|
||||
## How to Introduce a New Logits Processor to vLLM
|
||||
|
||||
### Best Practices for Writing Built-In Logits Processors
|
||||
|
||||
* Write efficient `apply()` and `update_state()` implementations in light of the fact that logits processors operate at batch granularity
|
||||
* For example, you may be able to use efficient vectorized operations to implement `apply()` or update internal state vectors in `update_state()`
|
||||
* However, if you think that a logits processor may be used infrequently, it may be appropriate to use a "sparse" representation of request state i.e. the class can represent request configuration using a dictionary which only stores metadata about requests that enable the logits processor
|
||||
|
||||
* It is up to the logits processor author to determine:
|
||||
|
||||
1. **The per-request attributes which configure the logits processor's behavior against that request.** For example, if you are writing a new built-in logits processor for vLLM, you may or may not need to add additional fields to `SamplingParams` and the vLLM REST API
|
||||
|
||||
2. **The conditions under which the logits processor is or is not enabled on a per-request basis.** Unless your intention is for the built-in logits processor to act on all requests all the time, you should write your logits processor in such a way that it is possible to disable the logits processor for a given request, i.e. by defaulting an argument to `None` or by passing in a specific do-nothing argument value i.e. `0.0`. Try to save compute and memory for requests which disable the logits processor
|
||||
|
||||
3. **The conditions under which the logits processor is short-circuited at the batch level.** Even if you have defined a way to disable the built-in logits processor at the request level, it may be difficult to translate this into compute savings i.e. if your `update_state()` and `apply()` implementations use efficient vectorized implementations that operate on the whole persistent batch in a single command. For example, you cannot skip an entire vectorized operation in `apply()` just because one request disabled the logits processor. To save compute in the edge-case where no running requests utilize the built-in logits processor, we recommend designing `apply()` to return the unmodified input tensor if all requests have the logits processor disabled. Similarly, consider whether steps can be skipped in `update_state()` if no requests enable the logits processor
|
||||
|
||||
* Additionally, an easy way to save compute in `update_state()` is to exit early when the batch_update is `None`
|
||||
|
||||
* Ensure that the logits processor `update_state` method discards information about finished requests (i.e. requests which are replaced by an Add or which are subject to a Remove)
|
||||
|
||||
* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However the argmax invariance may also be determined programmatically (i.e. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method
|
||||
|
||||
### Built-In Logits Processors
|
||||
|
||||
Built-in logits processors are always loaded when the vLLM engine starts. See the existing vLLM built-in logits processors in `vllm/v1/sample/logits_processor/builtin.py` for examples of how to write a new built-in vLLM logits processor. It makes sense to write a PR to introduce a new logits processor as a built-in if it is likely to be useful to a wide audience. vLLM currently employs the following built-in logits processors based on the programming model described above:
|
||||
|
||||
* Min-P
|
||||
|
||||
* Logit bias
|
||||
|
||||
* Min-tokens
|
||||
|
||||
Review these logits processor implementations for guidance on writing built-in logits processors.
|
||||
|
||||
Additionally, the following logits-processor-like functionalities are hard-coded into the sampler and do not yet utilize the programming model described above. Most of them will be refactored to use the aforemented logits processor programming model.
|
||||
|
||||
* Allowed token IDs
|
||||
|
||||
* Bad words
|
||||
|
||||
* Repetition penalty
|
||||
|
||||
* Frequency penalty
|
||||
|
||||
* Presence penalty
|
||||
|
||||
* Temperature
|
||||
|
||||
* Top-K
|
||||
|
||||
* Top-P
|
||||
|
||||
### Custom Logits Processors
|
||||
|
||||
vLLM can be augmented with [user-provided custom logits processors](../features/custom_logitsprocs.md).
|
||||
46
docs/features/custom_arguments.md
Normal file
46
docs/features/custom_arguments.md
Normal file
@ -0,0 +1,46 @@
|
||||
# Custom Arguments
|
||||
|
||||
You can use vLLM *custom arguments* to pass in arguments which are not part of the vLLM `SamplingParams` and REST API specifications. Adding or removing a vLLM custom argument does not require recompiling vLLM, since the custom arguments are passed in as a dictionary.
|
||||
|
||||
Custom arguments can be useful if, for example, you want to use a [custom logits processor](./custom_logitsprocs.md) without modifying the vLLM source code.
|
||||
|
||||
## Offline Custom Arguments
|
||||
|
||||
Custom arguments passed to `SamplingParams.extra_args` as a `dict` will be visible to any code which has access to `SamplingParams`:
|
||||
|
||||
``` python
|
||||
SamplingParams(extra_args={"your_custom_arg_name": 67})
|
||||
```
|
||||
|
||||
This allows arguments which are not already part of `SamplingParams` to be passed into `LLM` as part of a request.
|
||||
|
||||
## Online Custom Arguments
|
||||
|
||||
The vLLM REST API allows custom arguments to be passed to the vLLM server via `vllm_xargs`. The example below integrates custom arguments into a vLLM REST API request:
|
||||
|
||||
``` bash
|
||||
curl http://localhost:8000/v1/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "Qwen/Qwen2.5-1.5B-Instruct",
|
||||
...
|
||||
"vllm_xargs": {"your_custom_arg": 67}
|
||||
}'
|
||||
```
|
||||
|
||||
Furthermore, OpenAI SDK users can access `vllm_xargs` via the `extra_body` argument:
|
||||
|
||||
``` python
|
||||
batch = await client.completions.create(
|
||||
model="Qwen/Qwen2.5-1.5B-Instruct",
|
||||
...,
|
||||
extra_body={
|
||||
"vllm_xargs": {
|
||||
"your_custom_arg": 67
|
||||
}
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
!!! note
|
||||
`vllm_xargs` is assigned to `SamplingParams.extra_args` under the hood, so code which uses `SamplingParams.extra_args` is compatible with both offline and online scenarios.
|
||||
445
docs/features/custom_logitsprocs.md
Normal file
445
docs/features/custom_logitsprocs.md
Normal file
@ -0,0 +1,445 @@
|
||||
# Custom Logits Processors
|
||||
|
||||
!!! important
|
||||
Some logits processors design changes are still in progress and the API may
|
||||
change in the near future. We hope to stabilize this part of the API soon
|
||||
|
||||
A "custom" logits processor is written by a user of vLLM and is loaded into vLLM at initialization without needing to modify or recompile the vLLM source code. It is the opposite of a built-in logits processor.
|
||||
|
||||
This document shows how to write, load and use a custom logits processor.
|
||||
|
||||
## Logits Processors Background
|
||||
|
||||
A logits processor adjusts the next-token probability distribution, usually with the intention of steering the model towards a desired type of behavior.
|
||||
|
||||
In vLLM, logits processors operate at batch granularity. During a given engine step, the logits processor consumes a `(num_requests) x (vocab_size)` tensor of raw logits output by the model. For all requests which enable the logits processor, the logits processor applies a transformation to the corresponding row of the logits tensor, while leaving other rows unmodified. The transformed logits tensor is then passed to softmax.
|
||||
|
||||
## Creating a Custom Logits Processor
|
||||
|
||||
Custom logits processors must subclass `vllm.v1.sample.logits_processor.LogitsProcessor` and define (at minimum) the following methods:
|
||||
|
||||
* `__init__(self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool)`
|
||||
* `vllm_config`: engine configuration data structure
|
||||
* `device`: hardware accelerator device info
|
||||
* `is_pin_memory`: flag indicating whether pin memory is available to support logits processor implementation
|
||||
|
||||
* `apply(self, logits: torch.Tensor) -> torch.Tensor`:
|
||||
* Consume a `(num_requests) x (vocab_size)` logits tensor (`logits`)
|
||||
* Apply logits processor transformation at batch granularity
|
||||
* Return a transformed `(num_requests) x (vocab_size)` logits tensor
|
||||
* You can modify the input logits processors in-place or out-of-place; in-place is more memory-efficient
|
||||
|
||||
* `is_argmax_invariant(self) -> bool`:
|
||||
* Return `True` if the logits processor is argmax invariant (never changes what is the highest-logit-value token ID for a given request), `False` if the logits processor may modify argmax
|
||||
* `is_argmax_invariant()` is evaluated once at startup; if `True`, vLLM will skip applying this logits processor in a given step when all requests use greedy sampling
|
||||
|
||||
* `update_state(self, batch_update: Optional["BatchUpdate"]) -> None`:
|
||||
* Consume a `BatchUpdate` data structure representing persistent batch state changes at the beginning of the current engine step
|
||||
* Use the `BatchUpdate` members to update logits processor internal state
|
||||
* **Note:** batch update data structure may be `None`, signaling no change to the batch constituents. In this case, the LogitsProcessor might still want to update its state based on the updated `output_token_ids` lists that it could have retained when they were added.
|
||||
|
||||
### How the vLLM engine builds the `BatchUpdate` data structure
|
||||
|
||||
!!! important
|
||||
Some logits processors design changes are still in progress. We expect
|
||||
that in the future you will not need to account for batch state changes
|
||||
when implementing a logits processor, and the information in this section
|
||||
will become irrelevant.
|
||||
|
||||
Logits processor `update_state()` implementations should assume the following model for how the model runner updates persistent batch state (expressed here in terms of the `BatchUpdate` abstraction):
|
||||
|
||||
1. Identify indices of requests which finished in the current engine step
|
||||
|
||||
2. Identify new requests introduced in the current step
|
||||
|
||||
3. Use Add operations to replace as many finished requests with new requests, in order of increasing index of the replaced request starting with the lowest index
|
||||
|
||||
4. Based on the relative number of new and finished requests:
|
||||
|
||||
1. If the numbers of new and finished requests are the same, proceed to next step
|
||||
|
||||
2. *If there are more new requests than finished requests:* apply Add operations to extend the batch with the remaining new requests which did not replace finished requests. Assign consecutive indices to these new requests, starting with `current_max_batch_index + 1`
|
||||
|
||||
3. *If there are fewer new requests than finished requests:*
|
||||
|
||||
* Apply Remove operations to finished requests which were not replaced with new requests. These removed request indices will necessarily be greater than the greatest index of the finished requests which were replaced in the previous step. The Removes may leave the batch in a non-contiguous state
|
||||
|
||||
* **"Condense" the batch to be contiguous:** starting with the lowest-index empty slot (which was caused by a Remove), apply a Unidirectional Move from the current highest non-empty slot in the batch to fill the empty slot. Proceed with additional Unidirectional Move operations in order of increasing empty slot destination index and decreasing non-empty slot source index until the batch is contiguous
|
||||
|
||||
* **Shrink the batch:** a side-effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots
|
||||
|
||||
5. Reorder the batch for improved efficiency. Depending on the attention backend implementation and the current characteristics of the batch, zero or more Swap Move operations may be applied to reorder the batch
|
||||
|
||||
Notes:
|
||||
|
||||
* A logits processor `update_state()` method must process batch update operations in the following order: removes, adds, moves
|
||||
|
||||
* The index argument for Add operations refers to the index *at the time the Add occurred*, i.e. before any Move operations
|
||||
* Example: if a request is Added at index 5 and then swapped with index 3, the Add operation in `BatchUpdate.added` will be associated with index 5 not 3
|
||||
* In other words Move operations can be assumed to be applied after Adds and Removes
|
||||
|
||||
* Move operations can be assumed to be applied in the order in which they appear in `BatchUpdate.moved`
|
||||
|
||||
* If there are no new/finished requests and there is no batch reordering, then the batch update for the logits processors will be `None`
|
||||
|
||||
### Passing Custom Argument to a Custom Logits Processor
|
||||
|
||||
Unlike built-in logits processors, custom logits processors may require configuration arguments that are not hard-coded into `SamplingParams` or the vLLM server REST API. To solve this problem, custom logits processors may leverage vLLM [custom arguments](./custom_arguments.md) support to receive configuration settings from the user (although you are also free to design a custom logits processor which utilizes the pre-existing fields in `SamplingParams`.)
|
||||
|
||||
### Example Custom Logits Processor Implementation
|
||||
|
||||
The contrived example below implements a custom logits processor which consumes a `(num\_requests) \times (vocab\_size)` logits tensor and masks out all tokens except for one (`target_token`) with `float(-inf)`. The logits processor is disabled for any request that does not specify `target_token`. To determine whether the logits processor is enabled and which token to leave unmasked, the logits processor checks `SamplingParams.extra_args` for a `target_token` custom argument associated with each request:
|
||||
|
||||
??? code "Example custom logits processor definition"
|
||||
|
||||
``` python
|
||||
from typing import Optional
|
||||
import torch
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.sample.logits_processor import (BatchUpdate,
|
||||
LogitsProcessor,
|
||||
MoveDirectionality)
|
||||
|
||||
class DummyLogitsProcessor(LogitsProcessor):
|
||||
"""Fake logit processor to support unit testing and examples"""
|
||||
|
||||
def __init__(self, vllm_config: "VllmConfig", device: torch.device,
|
||||
is_pin_memory: bool):
|
||||
self.req_info: dict[int, int] = {}
|
||||
|
||||
def is_argmax_invariant(self) -> bool:
|
||||
"""Never impacts greedy sampling"""
|
||||
return False
|
||||
|
||||
def update_state(self, batch_update: Optional[BatchUpdate]):
|
||||
if not batch_update:
|
||||
return
|
||||
|
||||
# Process added requests.
|
||||
for index, params, _, _ in batch_update.added:
|
||||
assert params is not None
|
||||
if params.extra_args and (target_token :=
|
||||
params.extra_args.get("target_token")):
|
||||
self.req_info[index] = target_token
|
||||
else:
|
||||
self.req_info.pop(index, None)
|
||||
|
||||
if self.req_info:
|
||||
# Process removed requests.
|
||||
for index in batch_update.removed:
|
||||
self.req_info.pop(index, None)
|
||||
|
||||
# Process moved requests, unidirectional move (a->b) and swap
|
||||
# (a<->b)
|
||||
for adx, bdx, direct in batch_update.moved:
|
||||
a_val = self.req_info.pop(adx, None)
|
||||
b_val = self.req_info.pop(bdx, None)
|
||||
if a_val is not None:
|
||||
self.req_info[bdx] = a_val
|
||||
if direct == MoveDirectionality.SWAP and b_val is not None:
|
||||
self.req_info[adx] = b_val
|
||||
|
||||
def apply(self, logits: torch.Tensor) -> torch.Tensor:
|
||||
if not self.req_info:
|
||||
return logits
|
||||
|
||||
# Save target values before modification
|
||||
cols = torch.tensor(
|
||||
list(self.req_info.values()), dtype=torch.long, device=logits.device
|
||||
)
|
||||
rows = torch.tensor(
|
||||
list(self.req_info.keys()), dtype=torch.long, device=logits.device
|
||||
)
|
||||
values_to_keep = logits[rows, cols].clone()
|
||||
|
||||
# Mask all but target tokens
|
||||
logits[rows] = float('-inf')
|
||||
logits[rows, cols] = values_to_keep
|
||||
|
||||
return logits
|
||||
```
|
||||
|
||||
In the rest of this document, we will use `DummyLogitsProcessor` as an example of a custom logits processor.
|
||||
|
||||
The `DummyLogitsProcessor.update_state()` implementation maintains a "sparse" representation of the batched requests in the `self.req_info` dictionary: only those requests which specify a `target_token` value have a key in the dictionary. `update_state()` adjusts the stored request indices and `target_token` values (keys and values respectively in `self.req_info`) in response to Add, Remove and Move operations against the persistent batch.
|
||||
|
||||
### Wrapping an Existing Request-Level Logits Processor
|
||||
|
||||
Although the vLLM engine applies logits processors at batch granularity, some users may want to use vLLM with a "request-level" logits processor implementation - an implementation which operates on individual requests. This will be especially true if your logits processor was developed for vLLM version 0, which required it to be a `Callable` (as described [here](https://docs.vllm.ai/en/v0.10.1.1/api/vllm/logits_process.html)) conforming to the following type annotation:
|
||||
|
||||
``` python
|
||||
RequestLogitsProcessor = Union[
|
||||
|
||||
# (output token ids, logits tensor) -> logits tensor
|
||||
Callable[[list[int], Tensor], Tensor],
|
||||
|
||||
# (prompt token ids, output token ids, logits tensor) -> logits tensor
|
||||
Callable[[list[int], list[int], Tensor], Tensor],
|
||||
]
|
||||
```
|
||||
|
||||
While request-level logits processors are explicitly *not* supported in the vLLM engine, vLLM *does* provide a convenient process to wrap an existing `Callable` request-level logits processor and create a batch-level logits processor that is compatible with vLLM. The `Callable` must conform to the type annotation above; if your request-level logits processor has a different interface, then in order to wrap it, you may need to modify it or implement an additional wrapper layer to comply with the interface specification above.
|
||||
|
||||
You can wrap the request-level logits processor by subclassing `AdapterLogitsProcessor` as shown in the example below (in this example, `DummyPerReqLogitsProcessor` is a stand-in for your request-level logits processor which needs to be wrapped.) Override `AdapterLogitsProcessor.is_argmax_invariant(self)` to accurately reflect whether your request-level logits processor may impact which token has the highest-value logit. Override `AdapterLogitsProcessor.new_req_logits_processor(self,params)` to create a new request-level logits processor instance from a `SamplingParams` instance:
|
||||
|
||||
??? code "Example of Wrapping a Request-Level Logits Processor"
|
||||
|
||||
``` python
|
||||
...
|
||||
|
||||
from vllm.v1.sample.logits_processor import (
|
||||
AdapterLogitsProcessor, # Wrapper base-class
|
||||
RequestLogitsProcessor, # Request-level logitsproc type annotation
|
||||
)
|
||||
|
||||
...
|
||||
|
||||
# Stand-in for your request-level logits processor:
|
||||
class DummyPerReqLogitsProcessor:
|
||||
"""The request-level logits processor masks out all logits except the
|
||||
token id identified by `target_token`"""
|
||||
|
||||
def __init__(self, target_token: int) -> None:
|
||||
"""Specify `target_token`"""
|
||||
self.target_token = target_token
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
output_ids: list[int],
|
||||
logits: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
val_to_keep = logits[self.target_token].item()
|
||||
logits[:] = float("-inf")
|
||||
logits[self.target_token] = val_to_keep
|
||||
return logits
|
||||
|
||||
...
|
||||
|
||||
# Example of wrapping the request-level logits processor:
|
||||
class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
|
||||
"""Example of wrapping a fake request-level logit processor to create a
|
||||
batch-level logits processor"""
|
||||
|
||||
def is_argmax_invariant(self) -> bool:
|
||||
return False
|
||||
|
||||
def new_req_logits_processor(
|
||||
self,
|
||||
params: SamplingParams,
|
||||
) -> Optional[RequestLogitsProcessor]:
|
||||
"""This method returns a new request-level logits processor, customized
|
||||
to the `target_token` value associated with a particular request.
|
||||
|
||||
Returns None if the logits processor should not be applied to the
|
||||
particular request. To use the logits processor the request must have
|
||||
a "target_token" custom argument with an integer value.
|
||||
|
||||
Args:
|
||||
params: per-request sampling params
|
||||
|
||||
Returns:
|
||||
`Callable` request logits processor, or None
|
||||
"""
|
||||
target_token: Optional[Any] = params.extra_args and params.extra_args.get(
|
||||
"target_token"
|
||||
)
|
||||
if target_token is None:
|
||||
return None
|
||||
if not isinstance(target_token, int):
|
||||
logger.warning(
|
||||
"target_token value %s is not int; not applying logits"
|
||||
" processor to request.",
|
||||
target_token,
|
||||
)
|
||||
return None
|
||||
return DummyPerReqLogitsProcessor(target_token)
|
||||
```
|
||||
|
||||
!!! note
|
||||
Your `new_req_logits_processor()` override can return `None` to signal that the wrapped logits processor should not be applied to the request in question.
|
||||
|
||||
Once you have created a custom subclass (like `WrappedPerReqLogitsProcessor`) which wraps your request level logits processor, you can pass the custom subclass to vLLM via any of the methods described in the following section.
|
||||
|
||||
## Ways to Load Your Custom Logits Processor in vLLM
|
||||
|
||||
Logits processors are loaded at initialization. Critically, the set of loaded logits processors cannot be modified after the vLLM engine finishes loading, and new logits logits processors cannot be loaded on-demand for individual requests.
|
||||
|
||||
This section details different ways of making your logits processor visible to vLLM and triggering vLLM to load your logits processor.
|
||||
|
||||
### Method 1: Pass the Custom Logits Processor Fully-Qualified Class Name (FQCN) to vLLM at Initialization Time
|
||||
|
||||
This method is supported in both offline and online vLLM usage scenarios. The custom logits processor's FQCN (in the form of `dotted.path.to.module:ClassName`) can be passed as an argument to the `LLM` and `AsyncLLM` Python constructors, or as a CLI argument to `vllm serve` with the following syntax
|
||||
|
||||
``` bash
|
||||
vllm serve ... --logits_processors <logits processor 1> <logits processor 2> ...
|
||||
```
|
||||
|
||||
The only requirements on the FQCN are
|
||||
|
||||
1. Python's `importlib.import_module()` must be able to resolve the dotted path portion of the FQCN and load it as a module
|
||||
|
||||
2. The class-name portion of the FQCN must be possible to import from the loaded module
|
||||
|
||||
3. The object pointed to by the FQCN must be a subclass of `LogitsProcessor`
|
||||
|
||||
See examples below:
|
||||
|
||||
??? code "Passing custom logits processor FQCN to `LLM` in Python"
|
||||
|
||||
``` python
|
||||
# Pass in FQCN
|
||||
llm = LLM(
|
||||
model="facebook/opt-125m",
|
||||
logits_processors=["your.module.path:DummyLogitsProcessor"],
|
||||
)
|
||||
```
|
||||
|
||||
??? code "Passing custom logits processor FQCN to `AsyncLLM` in Python"
|
||||
|
||||
``` python
|
||||
# Pass in FQCN
|
||||
engine_args = AsyncEngineArgs(model="facebook/opt-125m",
|
||||
logits_processors=["your.module.path:DummyLogitsProcessor"])
|
||||
async_llm = AsyncLLM.from_engine_args(engine_args)
|
||||
```
|
||||
|
||||
??? code "Passing custom logits processor FQCN to vLLM server via CLI"
|
||||
|
||||
```bash
|
||||
vllm serve facebook/opt-125m --logits_processors your.module.path:DummyLogitsProcessor
|
||||
```
|
||||
|
||||
### Method 2: Automatically Detect Custom Logits Processors Installed in Your Python Environment As Entry Points
|
||||
|
||||
[`setuptools`](https://setuptools.pypa.io/en/latest/userguide/entry_point.html) can enable installed packages to make themselves available as plugins to other Python programs, via pieces of metadata known as "entry points".
|
||||
|
||||
During initialization, vLLM automatically scans the `vllm.logits_processors` entry point group and loads any installed logits processors which it finds.
|
||||
|
||||
Suppose that you have developed a Python package that holds your custom logits processors. You can expose each logits processor to vLLM by adding a unique entrypoint for each logits processor to your logits processor Python package. The example below shows how to add an entrypoint to your project's `pyproject.toml` file:
|
||||
|
||||
??? code "Exposing a custom logits processor as a Python entrypoint"
|
||||
|
||||
``` toml
|
||||
[project.entry-points."vllm.logits_processors"]
|
||||
dummy_logits_processor = "your.module.path:DummyLogitsProcessor"
|
||||
```
|
||||
|
||||
Once your package is installed, your custom logits processor will be loaded automatically whenever vLLM is initialized. You do *not* need to pass the custom logits processor to the `LLM` or `AsyncLLM` constructors or to the vLLM server explicitly at initialization time if your logits processor is exposed as an entry point.
|
||||
|
||||
!!! note
|
||||
vLLM will *always* load *all* logits processors which are exposed via entrypoints under the `vllm.logits_processors` grouping.
|
||||
|
||||
### Method 3 (Offline-only): Pass a Python Class Object to the vLLM Constructor
|
||||
|
||||
You can pass one or more custom logits processor class objects to the `LLM` and `AsyncLLM` constructors. This option is very flexible, as the logits processor classes may either be (1) defined locally within the same Python source file where `LLM` or `AsyncLLM` is instantiated, or (2) imported from a Python package.
|
||||
|
||||
??? code "Passing custom logits processor class object to `LLM` or `AsyncLLM` in Python"
|
||||
|
||||
``` python
|
||||
# Import custom logits processor
|
||||
from some.module import DummyLogitsProcessor
|
||||
|
||||
# ...or...
|
||||
|
||||
# Define custom logits processor locally
|
||||
from vllm.v1.sample.logits_processor import LogitsProcessor
|
||||
|
||||
class DummyLogitsProcessor(LogitsProcessor):
|
||||
# See DummyLogitsProcessor implementation above
|
||||
...
|
||||
|
||||
# Pass class object to LLM constructor
|
||||
llm = LLM(
|
||||
model="facebook/opt-125m",
|
||||
logits_processors=[DummyLogitsProcessor],
|
||||
)
|
||||
|
||||
# Pass class object to AsyncLLM constructor
|
||||
engine_args = AsyncEngineArgs(model="facebook/opt-125m",
|
||||
logits_processors=[DummyLogitsProcessor])
|
||||
async_llm = AsyncLLM.from_engine_args(engine_args)
|
||||
```
|
||||
|
||||
## Invoking a Custom Logits Processor Against a Request
|
||||
|
||||
The design of the custom logits processor determines whether the logits processor must be enabled/disabled for a given request, and what arguments must be provided to configure the logits processor.
|
||||
|
||||
The examples below show how a user would pass a custom argument (`target_token`) to `DummyLogitsProcessor` in order to (1) enable the logits processor for that particular request and (2) control the logits processor's behavior.
|
||||
|
||||
??? code "vLLM REST API: configure custom logits processor for a request"
|
||||
|
||||
``` bash
|
||||
curl http://localhost:8000/v1/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "Qwen/Qwen2.5-1.5B-Instruct",
|
||||
...
|
||||
"vllm_xargs": {"target_token": 67}
|
||||
}'
|
||||
```
|
||||
|
||||
??? code "OpenAI SDK: configure custom logits processor for a request"
|
||||
|
||||
``` python
|
||||
batch = await client.completions.create(
|
||||
model="Qwen/Qwen2.5-1.5B-Instruct",
|
||||
...,
|
||||
extra_body={
|
||||
"vllm_xargs": {
|
||||
"target_token": 67
|
||||
}
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
??? code "Offline: configure custom logits processor for an `LLM` request"
|
||||
|
||||
``` python
|
||||
outputs_logitproc = llm.generate("your prompt",
|
||||
SamplingParams(...,
|
||||
extra_args={"target_token": 67}))
|
||||
```
|
||||
|
||||
??? code "Offline: configure custom logits processor for an `AsyncLLM` request"
|
||||
|
||||
``` python
|
||||
async for out in engine.generate(request_id="your request id",
|
||||
prompt="your prompt",
|
||||
sampling_params=SamplingParams(...,
|
||||
extra_args={"target_token": 67})):
|
||||
|
||||
# Process async request outputs
|
||||
...
|
||||
```
|
||||
|
||||
## Best Practices for Writing Custom Logits Processors
|
||||
|
||||
Once vLLM loads a logits processor during initialization, then vLLM will invoke `update_state()` and `apply()` against that logits processor in every engine step. Both methods operate on all requests which currently reside in the vLLM persistent batch. Thus it is important to implement these methods efficiently.
|
||||
|
||||
* Write efficient `apply()` and `update_state()` implementations in light of the fact that logits processors operate at batch granularity
|
||||
* For example, you may be able to use efficient vectorized operations to implement `apply()` or update internal state vectors in `update_state()`
|
||||
* However, if you think that a logits processor may be used infrequently, it may be appropriate to use a "sparse" representation of request state i.e. the class can represent request configuration using a dictionary which only stores metadata about requests that enable the logits processor
|
||||
* **Note:** wrapped request-level logits processors do not need to implement `apply()` and `update_state()`; the default `AdapterLogitsProcessor.update_state()` implementation maintains a sparse representation of request state, wherein requests for which `new_req_logits_processor()` returns `None` are not represented in the base-class state dictionary. The default implementation of `AdapterLogitsProcessor.apply()` applies the request-level logits processor to each row of input logits sequentially and assembles the output logits tensor. If the performance of this `AdapterLogitsProcessor` default implementation is insufficient, then avoid wrapping your request-level logits processor and instead re-implement it as a `LogitsProcessor` subclass with optimized `apply()` and `update_state()` implementations that operate at batch granularity
|
||||
|
||||
* It is up to the logits processor author to determine:
|
||||
|
||||
1. **The per-request attributes which configure the logits processor's behavior against that request.** Your custom logits processor's `update_state()` override determines how `SamplingParams` fields are mapped into logits processor state
|
||||
|
||||
* **Note:** for wrapped request-level logits processors, `new_req_logits_processor()` determines how `SamplingParams` fields are used to initialize a request-level logits processor instance.
|
||||
|
||||
2. **The conditions under which the logits processor is or is not enabled on a per-request basis.** Unless your intention is for the custom logits processor to act on all requests all the time, you should write your logits processor in such a way that it is possible to disable the logits processor for a given request, i.e. by defaulting an argument to `None` or by passing in a specific do-nothing argument value i.e. `0.0`. Try to save compute and memory for requests which disable the logits processor
|
||||
|
||||
* **Note:** for wrapped per-request logits processors, the default `AdapterLogitsProcessor.update_state()` implementation ensures that the request-level logits processor is disabled when `new_req_logits_processor()` returns `None` for that request
|
||||
|
||||
3. **The conditions under which the logits processor is short-circuited at the batch level.** Even if you have defined a way to disable the custom logits processor at the request level, it may be difficult to translate this into compute savings i.e. if your `update_state()` and `apply()` implementations use efficient vectorized implementations that operate on the whole persistent batch in a single command. For example, you cannot skip an entire vectorized operation in `apply()` just because one request disabled the logits processor. To save compute in the edge-case where no running requests utilize the custom logits processor, we recommend designing `apply()` to return the unmodified input tensor if all requests have the logits processor disabled. Similarly, consider whether steps can be skipped in `update_state()` if no requests enable the logits processor
|
||||
|
||||
* Additionally, an easy way to save compute in `update_state()` is to exit early when the `batch_update` is `None`
|
||||
|
||||
* **Note:** for wrapped per-request logits processors, the `AdapterLogitsProcessor` base-class implements the above optimizations by default
|
||||
|
||||
* Ensure that the logits processor `update_state` method discards information about finished requests (i.e. requests which are replaced by an Add or which are subject to a Remove)
|
||||
|
||||
* **Note:** for wrapped per-request logits processors, the `AdapterLogitsProcessor` base-class handles this by default
|
||||
|
||||
* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However the argmax invariance may also be determined programmatically (i.e. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method
|
||||
@ -10,12 +10,12 @@ vLLM currently supports the following reasoning models:
|
||||
|
||||
| Model Series | Parser Name | Structured Output Support | Tool Calling |
|
||||
|--------------|-------------|------------------|-------------|
|
||||
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` | ❌ |
|
||||
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` | ✅ |
|
||||
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ |
|
||||
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ |
|
||||
| [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ |
|
||||
| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `guided_json`, `guided_regex` | ✅ |
|
||||
| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `guided_json`, `guided_regex` | ✅ |
|
||||
| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `guided_json`, `guided_regex` | ✅ |
|
||||
| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `json`, `regex` | ✅ |
|
||||
| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `json`, `regex` | ✅ |
|
||||
| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ |
|
||||
|
||||
!!! note
|
||||
IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
|
||||
|
||||
@ -12,23 +12,23 @@ You can generate structured outputs using the OpenAI's [Completions](https://pla
|
||||
|
||||
The following parameters are supported, which must be added as extra parameters:
|
||||
|
||||
- `guided_choice`: the output will be exactly one of the choices.
|
||||
- `guided_regex`: the output will follow the regex pattern.
|
||||
- `guided_json`: the output will follow the JSON schema.
|
||||
- `guided_grammar`: the output will follow the context free grammar.
|
||||
- `choice`: the output will be exactly one of the choices.
|
||||
- `regex`: the output will follow the regex pattern.
|
||||
- `json`: the output will follow the JSON schema.
|
||||
- `grammar`: the output will follow the context free grammar.
|
||||
- `structural_tag`: Follow a JSON schema within a set of specified tags within the generated text.
|
||||
|
||||
You can see the complete list of supported parameters on the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) page.
|
||||
|
||||
Structured outputs are supported by default in the OpenAI-Compatible Server. You
|
||||
may choose to specify the backend to use by setting the
|
||||
`--guided-decoding-backend` flag to `vllm serve`. The default backend is `auto`,
|
||||
`--structured-outputs-config.backend` flag to `vllm serve`. The default backend is `auto`,
|
||||
which will try to choose an appropriate backend based on the details of the
|
||||
request. You may also choose a specific backend, along with
|
||||
some options. A full set of options is available in the `vllm serve --help`
|
||||
text.
|
||||
|
||||
Now let´s see an example for each of the cases, starting with the `guided_choice`, as it´s the easiest one:
|
||||
Now let´s see an example for each of the cases, starting with the `choice`, as it´s the easiest one:
|
||||
|
||||
??? code
|
||||
|
||||
@ -45,12 +45,12 @@ Now let´s see an example for each of the cases, starting with the `guided_choic
|
||||
messages=[
|
||||
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
|
||||
],
|
||||
extra_body={"guided_choice": ["positive", "negative"]},
|
||||
extra_body={"structured_outputs": {"choice": ["positive", "negative"]}},
|
||||
)
|
||||
print(completion.choices[0].message.content)
|
||||
```
|
||||
|
||||
The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template:
|
||||
The next example shows how to use the `regex`. The idea is to generate an email address, given a simple regex template:
|
||||
|
||||
??? code
|
||||
|
||||
@ -63,18 +63,18 @@ The next example shows how to use the `guided_regex`. The idea is to generate an
|
||||
"content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n",
|
||||
}
|
||||
],
|
||||
extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
|
||||
extra_body={"structured_outputs": {"regex": r"\w+@\w+\.com\n"}, "stop": ["\n"]},
|
||||
)
|
||||
print(completion.choices[0].message.content)
|
||||
```
|
||||
|
||||
One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats.
|
||||
For this we can use the `guided_json` parameter in two different ways:
|
||||
For this we can use the `json` parameter in two different ways:
|
||||
|
||||
- Using directly a [JSON Schema](https://json-schema.org/)
|
||||
- Defining a [Pydantic model](https://docs.pydantic.dev/latest/) and then extracting the JSON Schema from it (which is normally an easier option).
|
||||
|
||||
The next example shows how to use the `guided_json` parameter with a Pydantic model:
|
||||
The next example shows how to use the `response_format` parameter with a Pydantic model:
|
||||
|
||||
??? code
|
||||
|
||||
@ -119,7 +119,7 @@ The next example shows how to use the `guided_json` parameter with a Pydantic mo
|
||||
JSON schema and how the fields should be populated. This can improve the
|
||||
results notably in most cases.
|
||||
|
||||
Finally we have the `guided_grammar` option, which is probably the most
|
||||
Finally we have the `grammar` option, which is probably the most
|
||||
difficult to use, but it´s really powerful. It allows us to define complete
|
||||
languages like SQL queries. It works by using a context free EBNF grammar.
|
||||
As an example, we can use to define a specific format of simplified SQL queries:
|
||||
@ -149,7 +149,7 @@ As an example, we can use to define a specific format of simplified SQL queries:
|
||||
"content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
|
||||
}
|
||||
],
|
||||
extra_body={"guided_grammar": simplified_sql_grammar},
|
||||
extra_body={"structured_outputs": {"grammar": simplified_sql_grammar}},
|
||||
)
|
||||
print(completion.choices[0].message.content)
|
||||
```
|
||||
@ -292,8 +292,8 @@ An example of using `structural_tag` can be found here: <gh-file:examples/online
|
||||
## Offline Inference
|
||||
|
||||
Offline inference allows for the same types of structured outputs.
|
||||
To use it, we´ll need to configure the guided decoding using the class `GuidedDecodingParams` inside `SamplingParams`.
|
||||
The main available options inside `GuidedDecodingParams` are:
|
||||
To use it, we´ll need to configure the structured outputs using the class `StructuredOutputsParams` inside `SamplingParams`.
|
||||
The main available options inside `StructuredOutputsParams` are:
|
||||
|
||||
- `json`
|
||||
- `regex`
|
||||
@ -309,12 +309,12 @@ shown below:
|
||||
|
||||
```python
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.sampling_params import GuidedDecodingParams
|
||||
from vllm.sampling_params import StructuredOutputsParams
|
||||
|
||||
llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct")
|
||||
|
||||
guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
|
||||
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
|
||||
structured_outputs_params = StructuredOutputsParams(choice=["Positive", "Negative"])
|
||||
sampling_params = SamplingParams(structured_outputs=structured_outputs_params)
|
||||
outputs = llm.generate(
|
||||
prompts="Classify this sentiment: vLLM is wonderful!",
|
||||
sampling_params=sampling_params,
|
||||
|
||||
@ -71,7 +71,7 @@ This example demonstrates:
|
||||
* Making a request with `tool_choice="auto"`
|
||||
* Handling the structured response and executing the corresponding function
|
||||
|
||||
You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the guided decoding backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests.
|
||||
You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the structured outputs backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests.
|
||||
|
||||
Remember that it's the caller's responsibility to:
|
||||
|
||||
@ -83,19 +83,18 @@ For more advanced usage, including parallel tool calls and different model-speci
|
||||
|
||||
## Named Function Calling
|
||||
|
||||
vLLM supports named function calling in the chat completion API by default. It does so using Outlines through guided decoding, so this is
|
||||
enabled by default and will work with any supported model. You are guaranteed a validly-parsable function call - not a
|
||||
vLLM supports named function calling in the chat completion API by default. This should work with most structured outputs backends supported by vLLM. You are guaranteed a validly-parsable function call - not a
|
||||
high-quality one.
|
||||
|
||||
vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter.
|
||||
For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the guided decoding backend.
|
||||
vLLM will use structured outputs to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter.
|
||||
For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the structured outputs backend.
|
||||
|
||||
To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and
|
||||
specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request.
|
||||
|
||||
## Required Function Calling
|
||||
|
||||
vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The guided decoding features for `tool_choice='required'` (such as JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends are on the [roadmap](../usage/v1_guide.md#features) for the V1 engine.
|
||||
vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses structured outputs, so this is enabled by default and will work with any supported model. However, support for alternative decoding backends are on the [roadmap](../usage/v1_guide.md#features) for the V1 engine.
|
||||
|
||||
When tool_choice='required' is set, the model is guaranteed to generate one or more tool calls based on the specified tool list in the `tools` parameter. The number of tool calls depends on the user's query. The output format strictly follows the schema defined in the `tools` parameter.
|
||||
|
||||
|
||||
@ -52,6 +52,24 @@ uv pip install -e .
|
||||
1 error generated.
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
If the build fails with C++11/C++17 compatibility errors like the following, the issue is that the build system is defaulting to an older C++ standard:
|
||||
|
||||
```text
|
||||
[...] error: 'constexpr' is not a type
|
||||
[...] error: expected ';' before 'constexpr'
|
||||
[...] error: 'constexpr' does not name a type
|
||||
```
|
||||
|
||||
**Solution**: Your compiler might be using an older C++ standard. Edit `cmake/cpu_extension.cmake` and add `set(CMAKE_CXX_STANDARD 17)` before `set(CMAKE_CXX_STANDARD_REQUIRED ON)`.
|
||||
|
||||
To check your compiler's C++ standard support:
|
||||
```bash
|
||||
clang++ -std=c++17 -pedantic -dM -E -x c++ /dev/null | grep __cplusplus
|
||||
```
|
||||
On Apple Clang 16 you should see: `#define __cplusplus 201703L`
|
||||
|
||||
# --8<-- [end:build-wheel-from-source]
|
||||
# --8<-- [start:pre-built-images]
|
||||
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands:
|
||||
It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following commands:
|
||||
|
||||
```bash
|
||||
uv venv --python 3.12 --seed
|
||||
|
||||
@ -554,6 +554,17 @@ If your model is not in the above list, we will try to automatically convert the
|
||||
For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
|
||||
e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
|
||||
|
||||
#### Token Classification
|
||||
|
||||
These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode) API.
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|
||||
|--------------|--------|-------------------|-----------------------------|-----------------------------------------|---------------------|
|
||||
| `BertForTokenClassification` | bert-based | `boltuix/NeuroBERT-NER` (see note), etc. | | | ✅︎ |
|
||||
|
||||
!!! note
|
||||
Named Entity Recognition (NER) usage, please refer to <gh-file:examples/offline_inference/pooling/ner.py>, <gh-file:examples/online_serving/pooling/ner.py>.
|
||||
|
||||
[](){ #supported-mm-models }
|
||||
|
||||
## List of Multimodal Language Models
|
||||
@ -661,6 +672,8 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen2.5-Omni-3B`, `Qwen/Qwen2.5-Omni-7B` | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen3VLForConditionalGeneration` | Qwen3-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-4B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen3VLMoeForConditionalGeneration` | Qwen3-VL-MOE | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-30B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `RForConditionalGeneration` | R-VL-4B | T + I<sup>E+</sup> | `YannQi/R-4B` | | ✅︎ | ✅︎ |
|
||||
| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ |
|
||||
| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ |
|
||||
|
||||
@ -10,7 +10,7 @@ Before using EP, you need to install the necessary dependencies. We are actively
|
||||
|
||||
1. **Install DeepEP and pplx-kernels**: Set up host environment following vLLM's guide for EP kernels [here](gh-file:tools/ep_kernels).
|
||||
2. **Install DeepGEMM library**: Follow the [official instructions](https://github.com/deepseek-ai/DeepGEMM#installation).
|
||||
3. **For disaggregated serving**: Install UCX and NIXL following the [script](gh-file:tools/install_nixl.sh).
|
||||
3. **For disaggregated serving**: Install `gdrcopy` by running the [`install_gdrcopy.sh`](gh-file:tools/install_gdrcopy.sh) script (e.g., `install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "12.8" "x64"`). You can find available OS versions [here](https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.8/).
|
||||
|
||||
### Backend Selection Guide
|
||||
|
||||
@ -191,7 +191,7 @@ For production deployments requiring strict SLA guarantees for time-to-first-tok
|
||||
|
||||
### Setup Steps
|
||||
|
||||
1. **Install KV Connector**: Install NIXL using the [installation script](gh-file:tools/install_nixl.sh)
|
||||
1. **Install gdrcopy/ucx/nixl**: For maximum performance, run the [install_gdrcopy.sh](gh-file:tools/install_gdrcopy.sh) script to install `gdrcopy` (e.g., `install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "12.8" "x64"`). You can find available OS versions [here](https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.8/). If `gdrcopy` is not installed, things will still work with a plain `pip install nixl`, just with lower performance. `nixl` and `ucx` are installed as dependencies via pip.
|
||||
|
||||
2. **Configure Both Instances**: Add this flag to both prefill and decode instances `--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}`
|
||||
|
||||
|
||||
@ -133,7 +133,7 @@ completion = client.chat.completions.create(
|
||||
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
|
||||
],
|
||||
extra_body={
|
||||
"guided_choice": ["positive", "negative"]
|
||||
"structured_outputs": {"choice": ["positive", "negative"]}
|
||||
}
|
||||
)
|
||||
```
|
||||
@ -317,10 +317,11 @@ Full example: <gh-file:examples/online_serving/pooling/openai_chat_embedding_cli
|
||||
|
||||
#### Extra parameters
|
||||
|
||||
The following [pooling parameters][pooling-params] are supported.
|
||||
The following [pooling parameters][vllm.PoolingParams] are supported.
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:embedding-pooling-params"
|
||||
--8<-- "vllm/pooling_params.py:common-pooling-params"
|
||||
--8<-- "vllm/pooling_params.py:embedding-pooling-params"
|
||||
```
|
||||
|
||||
The following extra parameters are supported by default:
|
||||
@ -374,7 +375,7 @@ The following extra parameters are supported:
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
|
||||
```
|
||||
|
||||
|
||||
[](){ #translations-api }
|
||||
|
||||
### Translations API
|
||||
@ -527,10 +528,11 @@ curl -v "http://127.0.0.1:8000/classify" \
|
||||
|
||||
#### Extra parameters
|
||||
|
||||
The following [pooling parameters][pooling-params] are supported.
|
||||
The following [pooling parameters][vllm.PoolingParams] are supported.
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:classification-pooling-params"
|
||||
--8<-- "vllm/pooling_params.py:common-pooling-params"
|
||||
--8<-- "vllm/pooling_params.py:classification-pooling-params"
|
||||
```
|
||||
|
||||
The following extra parameters are supported:
|
||||
@ -733,10 +735,11 @@ Full example: <gh-file:examples/online_serving/openai_cross_encoder_score_for_mu
|
||||
|
||||
#### Extra parameters
|
||||
|
||||
The following [pooling parameters][pooling-params] are supported.
|
||||
The following [pooling parameters][vllm.PoolingParams] are supported.
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:score-pooling-params"
|
||||
--8<-- "vllm/pooling_params.py:common-pooling-params"
|
||||
--8<-- "vllm/pooling_params.py:classification-pooling-params"
|
||||
```
|
||||
|
||||
The following extra parameters are supported:
|
||||
@ -815,10 +818,11 @@ Result documents will be sorted by relevance, and the `index` property can be us
|
||||
|
||||
#### Extra parameters
|
||||
|
||||
The following [pooling parameters][pooling-params] are supported.
|
||||
The following [pooling parameters][vllm.PoolingParams] are supported.
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:rerank-pooling-params"
|
||||
--8<-- "vllm/pooling_params.py:common-pooling-params"
|
||||
--8<-- "vllm/pooling_params.py:classification-pooling-params"
|
||||
```
|
||||
|
||||
The following extra parameters are supported:
|
||||
|
||||
@ -87,6 +87,11 @@ def parse_args():
|
||||
default=0.8,
|
||||
help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--enable-dbo",
|
||||
action="store_true",
|
||||
help=("Enable microbatched execution"),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--compilation-config",
|
||||
type=int,
|
||||
@ -113,6 +118,7 @@ def main(
|
||||
max_model_len,
|
||||
compilation_config,
|
||||
gpu_memory_utilization,
|
||||
enable_dbo,
|
||||
quantization,
|
||||
):
|
||||
os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
|
||||
@ -167,6 +173,7 @@ def main(
|
||||
max_num_seqs=max_num_seqs,
|
||||
max_model_len=max_model_len,
|
||||
gpu_memory_utilization=gpu_memory_utilization,
|
||||
enable_dbo=enable_dbo,
|
||||
quantization=quantization,
|
||||
compilation_config=compilation_config,
|
||||
)
|
||||
@ -227,6 +234,7 @@ if __name__ == "__main__":
|
||||
args.max_model_len,
|
||||
args.compilation_config,
|
||||
args.gpu_memory_utilization,
|
||||
args.enable_dbo,
|
||||
args.quantization,
|
||||
),
|
||||
)
|
||||
|
||||
@ -56,7 +56,6 @@ class DummyLogitsProcessor(LogitsProcessor):
|
||||
self.req_info: dict[int, int] = {}
|
||||
|
||||
def is_argmax_invariant(self) -> bool:
|
||||
"""Never impacts greedy sampling"""
|
||||
return False
|
||||
|
||||
def update_state(self, batch_update: Optional[BatchUpdate]):
|
||||
@ -75,13 +74,12 @@ class DummyLogitsProcessor(LogitsProcessor):
|
||||
return logits
|
||||
|
||||
# Save target values before modification
|
||||
rows_list = list(self.req_info.keys())
|
||||
cols = torch.tensor(
|
||||
[self.req_info[i] for i in rows_list],
|
||||
dtype=torch.long,
|
||||
device=logits.device,
|
||||
list(self.req_info.values()), dtype=torch.long, device=logits.device
|
||||
)
|
||||
rows = torch.tensor(
|
||||
list(self.req_info.keys()), dtype=torch.long, device=logits.device
|
||||
)
|
||||
rows = torch.tensor(rows_list, dtype=torch.long, device=logits.device)
|
||||
values_to_keep = logits[rows, cols].clone()
|
||||
|
||||
# Mask all but target tokens
|
||||
|
||||
@ -26,8 +26,14 @@ python examples/offline_inference/pooling/embed_jina_embeddings_v3.py
|
||||
python examples/offline_inference/pooling/embed_matryoshka_fy.py
|
||||
```
|
||||
|
||||
## Named Entity Recognition (NER) usage
|
||||
|
||||
```bash
|
||||
python examples/offline_inference/pooling/ner.py
|
||||
```
|
||||
|
||||
## Qwen3 reranker usage
|
||||
|
||||
```bash
|
||||
python qwen3_reranker.py
|
||||
python examples/offline_inference/pooling/qwen3_reranker.py
|
||||
```
|
||||
|
||||
54
examples/offline_inference/pooling/ner.py
Normal file
54
examples/offline_inference/pooling/ner.py
Normal file
@ -0,0 +1,54 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Adapted from https://huggingface.co/boltuix/NeuroBERT-NER
|
||||
|
||||
from argparse import Namespace
|
||||
|
||||
from vllm import LLM, EngineArgs
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = FlexibleArgumentParser()
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
# Set example specific arguments
|
||||
parser.set_defaults(
|
||||
model="boltuix/NeuroBERT-NER",
|
||||
runner="pooling",
|
||||
enforce_eager=True,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main(args: Namespace):
|
||||
# Sample prompts.
|
||||
prompts = [
|
||||
"Barack Obama visited Microsoft headquarters in Seattle on January 2025."
|
||||
]
|
||||
|
||||
# Create an LLM.
|
||||
llm = LLM(**vars(args))
|
||||
tokenizer = llm.get_tokenizer()
|
||||
label_map = llm.llm_engine.vllm_config.model_config.hf_config.id2label
|
||||
|
||||
# Run inference
|
||||
outputs = llm.encode(prompts)
|
||||
|
||||
for prompt, output in zip(prompts, outputs):
|
||||
logits = output.outputs.data
|
||||
predictions = logits.argmax(dim=-1)
|
||||
|
||||
# Map predictions to labels
|
||||
tokens = tokenizer.convert_ids_to_tokens(output.prompt_token_ids)
|
||||
labels = [label_map[p.item()] for p in predictions]
|
||||
|
||||
# Print results
|
||||
for token, label in zip(tokens, labels):
|
||||
if token not in tokenizer.all_special_tokens:
|
||||
print(f"{token:15} → {label}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
main(args)
|
||||
@ -53,7 +53,6 @@ def parse_args():
|
||||
"--method",
|
||||
type=str,
|
||||
default="eagle",
|
||||
choices=["ngram", "eagle", "eagle3", "mtp"],
|
||||
)
|
||||
parser.add_argument("--num-spec-tokens", type=int, default=2)
|
||||
parser.add_argument("--prompt-lookup-max", type=int, default=5)
|
||||
@ -118,6 +117,11 @@ def main():
|
||||
"prompt_lookup_max": args.prompt_lookup_max,
|
||||
"prompt_lookup_min": args.prompt_lookup_min,
|
||||
}
|
||||
elif args.method.endswith("mtp"):
|
||||
speculative_config = {
|
||||
"method": args.method,
|
||||
"num_speculative_tokens": args.num_spec_tokens,
|
||||
}
|
||||
else:
|
||||
raise ValueError(f"unknown method: {args.method}")
|
||||
|
||||
|
||||
@ -1,11 +1,10 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This file demonstrates the example usage of guided decoding
|
||||
to generate structured outputs using vLLM. It shows how to apply
|
||||
different guided decoding techniques such as Choice, Regex, JSON schema,
|
||||
and Grammar to produce structured and formatted results
|
||||
based on specific prompts.
|
||||
This file demonstrates the example usage of structured outputs
|
||||
in vLLM. It shows how to apply different constraints such as choice,
|
||||
regex, json schema, and grammar to produce structured and formatted
|
||||
results based on specific prompts.
|
||||
"""
|
||||
|
||||
from enum import Enum
|
||||
@ -13,19 +12,23 @@ from enum import Enum
|
||||
from pydantic import BaseModel
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.sampling_params import GuidedDecodingParams
|
||||
from vllm.sampling_params import StructuredOutputsParams
|
||||
|
||||
MAX_TOKENS = 50
|
||||
|
||||
# Guided decoding by Choice (list of possible options)
|
||||
guided_decoding_params_choice = GuidedDecodingParams(choice=["Positive", "Negative"])
|
||||
sampling_params_choice = SamplingParams(guided_decoding=guided_decoding_params_choice)
|
||||
# Structured outputs by Choice (list of possible options)
|
||||
structured_outputs_params_choice = StructuredOutputsParams(
|
||||
choice=["Positive", "Negative"]
|
||||
)
|
||||
sampling_params_choice = SamplingParams(
|
||||
structured_outputs=structured_outputs_params_choice
|
||||
)
|
||||
prompt_choice = "Classify this sentiment: vLLM is wonderful!"
|
||||
|
||||
# Guided decoding by Regex
|
||||
guided_decoding_params_regex = GuidedDecodingParams(regex=r"\w+@\w+\.com\n")
|
||||
# Structured outputs by Regex
|
||||
structured_outputs_params_regex = StructuredOutputsParams(regex=r"\w+@\w+\.com\n")
|
||||
sampling_params_regex = SamplingParams(
|
||||
guided_decoding=guided_decoding_params_regex,
|
||||
structured_outputs=structured_outputs_params_regex,
|
||||
stop=["\n"],
|
||||
max_tokens=MAX_TOKENS,
|
||||
)
|
||||
@ -36,7 +39,7 @@ prompt_regex = (
|
||||
)
|
||||
|
||||
|
||||
# Guided decoding by JSON using Pydantic schema
|
||||
# Structured outputs by JSON using Pydantic schema
|
||||
class CarType(str, Enum):
|
||||
sedan = "sedan"
|
||||
suv = "SUV"
|
||||
@ -51,17 +54,16 @@ class CarDescription(BaseModel):
|
||||
|
||||
|
||||
json_schema = CarDescription.model_json_schema()
|
||||
guided_decoding_params_json = GuidedDecodingParams(json=json_schema)
|
||||
structured_outputs_params_json = StructuredOutputsParams(json=json_schema)
|
||||
sampling_params_json = SamplingParams(
|
||||
guided_decoding=guided_decoding_params_json,
|
||||
max_tokens=MAX_TOKENS,
|
||||
structured_outputs=structured_outputs_params_json, max_tokens=MAX_TOKENS
|
||||
)
|
||||
prompt_json = (
|
||||
"Generate a JSON with the brand, model and car_type of"
|
||||
"Generate a JSON with the brand, model and car_type of "
|
||||
"the most iconic car from the 90's"
|
||||
)
|
||||
|
||||
# Guided decoding by Grammar
|
||||
# Structured outputs by Grammar
|
||||
simplified_sql_grammar = """
|
||||
root ::= select_statement
|
||||
select_statement ::= "SELECT " column " from " table " where " condition
|
||||
@ -70,13 +72,15 @@ table ::= "table_1 " | "table_2 "
|
||||
condition ::= column "= " number
|
||||
number ::= "1 " | "2 "
|
||||
"""
|
||||
guided_decoding_params_grammar = GuidedDecodingParams(grammar=simplified_sql_grammar)
|
||||
structured_outputs_params_grammar = StructuredOutputsParams(
|
||||
grammar=simplified_sql_grammar
|
||||
)
|
||||
sampling_params_grammar = SamplingParams(
|
||||
guided_decoding=guided_decoding_params_grammar,
|
||||
structured_outputs=structured_outputs_params_grammar,
|
||||
max_tokens=MAX_TOKENS,
|
||||
)
|
||||
prompt_grammar = (
|
||||
"Generate an SQL query to show the 'username' and 'email'from the 'users' table."
|
||||
"Generate an SQL query to show the 'username' and 'email' from the 'users' table."
|
||||
)
|
||||
|
||||
|
||||
@ -93,16 +97,16 @@ def main():
|
||||
llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100)
|
||||
|
||||
choice_output = generate_output(prompt_choice, sampling_params_choice, llm)
|
||||
format_output("Guided decoding by Choice", choice_output)
|
||||
format_output("Structured outputs by Choice", choice_output)
|
||||
|
||||
regex_output = generate_output(prompt_regex, sampling_params_regex, llm)
|
||||
format_output("Guided decoding by Regex", regex_output)
|
||||
format_output("Structured outputs by Regex", regex_output)
|
||||
|
||||
json_output = generate_output(prompt_json, sampling_params_json, llm)
|
||||
format_output("Guided decoding by JSON", json_output)
|
||||
format_output("Structured outputs by JSON", json_output)
|
||||
|
||||
grammar_output = generate_output(prompt_grammar, sampling_params_grammar, llm)
|
||||
format_output("Guided decoding by Grammar", grammar_output)
|
||||
format_output("Structured outputs by Grammar", grammar_output)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -1437,6 +1437,80 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
|
||||
)
|
||||
|
||||
|
||||
# Qwen3-VL-Dense
|
||||
def run_qwen3_vl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
model_name = "Qwen/Qwen3-VL-4B-Instruct"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=5,
|
||||
mm_processor_kwargs={
|
||||
"min_pixels": 28 * 28,
|
||||
"max_pixels": 1280 * 28 * 28,
|
||||
"fps": 1,
|
||||
},
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
)
|
||||
|
||||
if modality == "image":
|
||||
placeholder = "<|image_pad|>"
|
||||
elif modality == "video":
|
||||
placeholder = "<|video_pad|>"
|
||||
|
||||
prompts = [
|
||||
(
|
||||
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
|
||||
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
|
||||
f"{question}<|im_end|>\n"
|
||||
"<|im_start|>assistant\n"
|
||||
)
|
||||
for question in questions
|
||||
]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Qwen3-VL-MOE
|
||||
def run_qwen3_vl_moe(questions: list[str], modality: str) -> ModelRequestData:
|
||||
model_name = "Qwen/Qwen3-VL-30B-A3B-Instruct"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=5,
|
||||
mm_processor_kwargs={
|
||||
"min_pixels": 28 * 28,
|
||||
"max_pixels": 1280 * 28 * 28,
|
||||
"fps": 1,
|
||||
},
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
)
|
||||
|
||||
if modality == "image":
|
||||
placeholder = "<|image_pad|>"
|
||||
elif modality == "video":
|
||||
placeholder = "<|video_pad|>"
|
||||
|
||||
prompts = [
|
||||
(
|
||||
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
|
||||
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
|
||||
f"{question}<|im_end|>\n"
|
||||
"<|im_start|>assistant\n"
|
||||
)
|
||||
for question in questions
|
||||
]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# R-4B
|
||||
def run_r_vl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
@ -1645,6 +1719,8 @@ model_example_map = {
|
||||
"qwen2_vl": run_qwen2_vl,
|
||||
"qwen2_5_vl": run_qwen2_5_vl,
|
||||
"qwen2_5_omni": run_qwen2_5_omni,
|
||||
"qwen3_vl": run_qwen3_vl,
|
||||
"qwen3_vl_moe": run_qwen3_vl_moe,
|
||||
"rvl": run_r_vl,
|
||||
"skywork_chat": run_skyworkr1v,
|
||||
"smolvlm": run_smolvlm,
|
||||
@ -1658,6 +1734,8 @@ MODELS_NEED_VIDEO_METADATA = [
|
||||
"glm4_1v",
|
||||
"glm4_5v",
|
||||
"glm4_5v_fp8",
|
||||
"qwen3_vl",
|
||||
"qwen3_vl_moe",
|
||||
]
|
||||
|
||||
|
||||
|
||||
87
examples/online_serving/dashboards/README.md
Normal file
87
examples/online_serving/dashboards/README.md
Normal file
@ -0,0 +1,87 @@
|
||||
# Monitoring Dashboards
|
||||
|
||||
This directory contains monitoring dashboard configurations for vLLM, providing
|
||||
comprehensive observability for your vLLM deployments.
|
||||
|
||||
## Dashboard Platforms
|
||||
|
||||
We provide dashboards for two popular observability platforms:
|
||||
|
||||
- **[Grafana](https://grafana.com)**
|
||||
- **[Perses](https://perses.dev)**
|
||||
|
||||
## Dashboard Format Approach
|
||||
|
||||
All dashboards are provided in **native formats** that work across different
|
||||
deployment methods:
|
||||
|
||||
### Grafana (JSON)
|
||||
|
||||
- ✅ Works with any Grafana instance (cloud, self-hosted, Docker)
|
||||
- ✅ Direct import via Grafana UI or API
|
||||
- ✅ Can be wrapped in Kubernetes operators when needed
|
||||
- ✅ No vendor lock-in or deployment dependencies
|
||||
|
||||
### Perses (YAML)
|
||||
|
||||
- ✅ Works with standalone Perses instances
|
||||
- ✅ Compatible with Perses API and CLI
|
||||
- ✅ Supports Dashboard-as-Code workflows
|
||||
- ✅ Can be wrapped in Kubernetes operators when needed
|
||||
|
||||
## Dashboard Contents
|
||||
|
||||
Both platforms provide equivalent monitoring capabilities:
|
||||
|
||||
| Dashboard | Description |
|
||||
|-----------|-------------|
|
||||
| **Performance Statistics** | Tracks latency, throughput, and performance metrics |
|
||||
| **Query Statistics** | Monitors request volume, query performance, and KPIs |
|
||||
|
||||
## Quick Start
|
||||
|
||||
First, navigate to this example's directory:
|
||||
|
||||
```bash
|
||||
cd examples/online_serving/dashboards
|
||||
```
|
||||
|
||||
### Grafana
|
||||
|
||||
Import the JSON directly into the Grafana UI, or use the API:
|
||||
|
||||
```bash
|
||||
curl -X POST http://grafana/api/dashboards/db \
|
||||
-H "Content-Type: application/json" \
|
||||
-d @grafana/performance_statistics.json
|
||||
```
|
||||
|
||||
### Perses
|
||||
|
||||
Import via the Perses CLI:
|
||||
|
||||
```bash
|
||||
percli apply -f perses/performance_statistics.yaml
|
||||
```
|
||||
|
||||
## Requirements
|
||||
|
||||
- **Prometheus** metrics from your vLLM deployment
|
||||
- **Data source** configured in your monitoring platform
|
||||
- **vLLM metrics** enabled and accessible
|
||||
|
||||
## Platform-Specific Documentation
|
||||
|
||||
For detailed deployment instructions and platform-specific options, see:
|
||||
|
||||
- **[Grafana Documentation](./grafana)** - JSON dashboards, operator usage, manual import
|
||||
- **[Perses Documentation](./perses)** - YAML specs, CLI usage, operator wrapping
|
||||
|
||||
## Contributing
|
||||
|
||||
When adding new dashboards, please:
|
||||
|
||||
1. Provide native formats (JSON for Grafana, YAML specs for Perses)
|
||||
2. Update platform-specific README files
|
||||
3. Ensure dashboards work across deployment methods
|
||||
4. Test with the latest platform versions
|
||||
59
examples/online_serving/dashboards/grafana/README.md
Normal file
59
examples/online_serving/dashboards/grafana/README.md
Normal file
@ -0,0 +1,59 @@
|
||||
# Grafana Dashboards for vLLM Monitoring
|
||||
|
||||
This directory contains Grafana dashboard configurations (as JSON) designed to monitor
|
||||
vLLM performance and metrics.
|
||||
|
||||
## Requirements
|
||||
|
||||
- Grafana 8.0+
|
||||
- Prometheus data source configured in Grafana
|
||||
- vLLM deployment with Prometheus metrics enabled
|
||||
|
||||
## Dashboard Descriptions
|
||||
|
||||
- **[performance_statistics.json](./performance_statistics.json)**: Tracks performance metrics including latency and
|
||||
throughput for your vLLM service.
|
||||
- **[query_statistics.json](./query_statistics.json)**: Tracks query performance, request volume, and key
|
||||
performance indicators for your vLLM service.
|
||||
|
||||
## Deployment Options
|
||||
|
||||
### Manual Import (Recommended)
|
||||
|
||||
The easiest way to use these dashboards is to manually import the JSON configurations
|
||||
directly into your Grafana instance:
|
||||
|
||||
1. Navigate to your Grafana instance
|
||||
2. Click the '+' icon in the sidebar
|
||||
3. Select 'Import'
|
||||
4. Copy and paste the JSON content from the dashboard files, or upload the JSON files
|
||||
directly
|
||||
|
||||
### Grafana Operator
|
||||
|
||||
If you're using the [Grafana Operator](https://github.com/grafana-operator/grafana-operator)
|
||||
in Kubernetes, you can wrap these JSON configurations in a `GrafanaDashboard` custom
|
||||
resource:
|
||||
|
||||
```yaml
|
||||
# Note: Adjust the instanceSelector to match your Grafana instance's labels
|
||||
# You can check with: kubectl get grafana -o yaml
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
name: vllm-performance-dashboard
|
||||
spec:
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: grafana # Adjust to match your Grafana instance labels
|
||||
folder: "vLLM Monitoring"
|
||||
json: |
|
||||
# Replace this comment with the complete JSON content from
|
||||
# performance_statistics.json - The JSON should start with { and end with }
|
||||
```
|
||||
|
||||
Then apply to your cluster:
|
||||
|
||||
```bash
|
||||
kubectl apply -f your-dashboard.yaml -n <namespace>
|
||||
```
|
||||
File diff suppressed because it is too large
Load Diff
760
examples/online_serving/dashboards/grafana/query_statistics.json
Normal file
760
examples/online_serving/dashboards/grafana/query_statistics.json
Normal file
@ -0,0 +1,760 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": {
|
||||
"type": "grafana",
|
||||
"uid": "-- Grafana --"
|
||||
},
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": "High-level overview of VLLM model deployment behavior and key performance indicators. Designed for Data Scientists and Product Managers to monitor request volume, token throughput, and latency",
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": 47,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": true,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
|
||||
"id": 20,
|
||||
"panels": [],
|
||||
"title": "Request Over Time",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"barWidthFactor": 0.6,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "req/s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 10, "x": 0, "y": 1 },
|
||||
"id": 1,
|
||||
"options": {
|
||||
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "single", "sort": "none" }
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"editorMode": "code",
|
||||
"expr": "sum by (model_name) (\n rate(vllm:request_success_total{model_name=~\"$Deployment_id\"}[$__rate_interval])\n)",
|
||||
"interval": "1",
|
||||
"legendFormat": "{{model_name}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Successful Requests Over Time",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "req/s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 1 },
|
||||
"id": 2,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"percentChangeColorMode": "standard",
|
||||
"reduceOptions": { "calcs": ["mean"], "fields": "", "values": false },
|
||||
"showPercentChange": false,
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "sum(rate(vllm:request_success_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Requests Avg Rate",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [
|
||||
{ "options": { "Calcultaions": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "ms"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 7, "x": 17, "y": 1 },
|
||||
"id": 3,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"percentChangeColorMode": "standard",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showPercentChange": false,
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.50, sum by(le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "p50 Latency",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [
|
||||
{ "options": { "Calculation": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "ms"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 4 },
|
||||
"id": 4,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"percentChangeColorMode": "standard",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showPercentChange": false,
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.90, sum by(le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "p90 Latency",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [
|
||||
{ "options": { "Calculation": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "ms"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 7, "x": 17, "y": 4 },
|
||||
"id": 5,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"percentChangeColorMode": "standard",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showPercentChange": false,
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.99, sum by(le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "p99 Latency",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 7 },
|
||||
"id": 19,
|
||||
"panels": [],
|
||||
"title": "Size Distribution",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"fillOpacity": 80,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"lineWidth": 1,
|
||||
"stacking": { "group": "A", "mode": "none" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "cps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 10, "x": 0, "y": 8 },
|
||||
"id": 6,
|
||||
"options": {
|
||||
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "single", "sort": "none" }
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "sum by (le, model_name) (rate(vllm:request_prompt_tokens_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
|
||||
"legendFormat": "{{model_name}} le={{le}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Input Token Size Distribution",
|
||||
"type": "histogram"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [
|
||||
{ "options": { "calculation ": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "cps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 8 },
|
||||
"id": 9,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"percentChangeColorMode": "standard",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showPercentChange": false,
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.90, sum by(le, model_name) (rate(vllm:request_prompt_tokens_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Input Token Size p90",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [
|
||||
{ "options": { "Calcultion": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "cps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 7, "x": 17, "y": 8 },
|
||||
"id": 8,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"percentChangeColorMode": "standard",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showPercentChange": false,
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.50, sum by(le, model_name) (rate(vllm:request_prompt_tokens_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Input Token Size p50",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [
|
||||
{ "options": { "Calcultaion": { "index": 0, "text": "mean" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "cps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 11 },
|
||||
"id": 7,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"percentChangeColorMode": "standard",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showPercentChange": false,
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "sum(rate(vllm:prompt_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))\n/\nsum(rate(vllm:request_success_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Input Token Size Avg",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [
|
||||
{ "options": { "Calculation": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "cps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 7, "x": 17, "y": 11 },
|
||||
"id": 10,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"percentChangeColorMode": "standard",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showPercentChange": false,
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.99, sum by(le, model_name) (rate(vllm:request_prompt_tokens_bucket{model_name=~\"$Deployment_id\"}[$__rate_interval])))",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Input Token Size p99",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"collapsed": true,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
|
||||
"id": 18,
|
||||
"panels": [],
|
||||
"title": "Input Token Over Time",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"barWidthFactor": 0.6,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "cps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 10, "x": 0, "y": 15 },
|
||||
"id": 11,
|
||||
"options": {
|
||||
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "single", "sort": "none" }
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "sum by (model_name) (rate(vllm:prompt_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
|
||||
"legendFormat": "{{model_name}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Input Tokens Over Time",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [
|
||||
{ "options": { "Calculation": { "index": 0, "text": "mean" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "cps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 15 },
|
||||
"id": 12,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"percentChangeColorMode": "standard",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showPercentChange": false,
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "sum(rate(vllm:prompt_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Input Tokens/Sec Avg",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 },
|
||||
"id": 17,
|
||||
"panels": [],
|
||||
"title": "Output Token Over Time",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"barWidthFactor": 0.6,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "cps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 10, "x": 0, "y": 22 },
|
||||
"id": 13,
|
||||
"options": {
|
||||
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "single", "sort": "none" }
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "sum by (model_name) (rate(vllm:generation_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
|
||||
"legendFormat": "{{model_name}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Output Tokens Over Time",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [
|
||||
{ "options": { "Calculation": { "index": 0, "text": "mean" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{ "color": "green", "value": null }, { "color": "red", "value": 80 }]
|
||||
},
|
||||
"unit": "cps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 7, "x": 10, "y": 22 },
|
||||
"id": 14,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"percentChangeColorMode": "standard",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"showPercentChange": false,
|
||||
"textMode": "auto",
|
||||
"wideLayout": true
|
||||
},
|
||||
"pluginVersion": "11.3.0",
|
||||
"targets": [
|
||||
{
|
||||
"editorMode": "code",
|
||||
"expr": "sum(rate(vllm:generation_tokens_total{model_name=~\"$Deployment_id\"}[$__rate_interval]))",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Output Tokens/Sec Avg",
|
||||
"type": "stat"
|
||||
}
|
||||
],
|
||||
"preload": false,
|
||||
"schemaVersion": 40,
|
||||
"tags": [],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": { "text": "Prometheus", "value": "4184fc20-68a7-483a-8d9b-7caa59c680dd" },
|
||||
"label": "datasource",
|
||||
"name": "DS_PROMETHEUS",
|
||||
"options": [],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"current": { "text": ["All"], "value": ["$__all"] },
|
||||
"definition": "label_values(vllm:request_success_total,model_name)",
|
||||
"includeAll": true,
|
||||
"label": "Deployment_ID",
|
||||
"multi": true,
|
||||
"name": "Deployment_id",
|
||||
"options": [],
|
||||
"query": {
|
||||
"qryType": 1,
|
||||
"query": "label_values(vllm:request_success_total,model_name)",
|
||||
"refId": "PrometheusVariableQueryEditor-VariableQuery"
|
||||
},
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"current": { "text": "All hours", "value": "All hours" },
|
||||
"hide": 2,
|
||||
"label": "Rush Hours Only",
|
||||
"name": "rush_hours",
|
||||
"options": [
|
||||
{ "selected": true, "text": "false", "value": "All hours" },
|
||||
{ "selected": false, "text": "true", "value": "Rush hours" }
|
||||
],
|
||||
"query": "false : All hours, true : Rush hours",
|
||||
"type": "custom"
|
||||
},
|
||||
{
|
||||
"current": { "text": "All", "value": "All" },
|
||||
"hide": 2,
|
||||
"label": "Rush Hours Type",
|
||||
"name": "rush_hours_type",
|
||||
"options": [
|
||||
{ "selected": true, "text": "^All__.*$", "value": "All" },
|
||||
{ "selected": false, "text": "^Static__.*$", "value": "Static" },
|
||||
{ "selected": false, "text": "^Dynamic__.*$", "value": "Dynamic" }
|
||||
],
|
||||
"query": "^All__.*$ : All, ^Static__.*$ : Static, ^Dynamic__.*$ : Dynamic",
|
||||
"type": "custom"
|
||||
},
|
||||
{
|
||||
"current": { "text": "", "value": "" },
|
||||
"hide": 2,
|
||||
"name": "query0",
|
||||
"options": [],
|
||||
"query": "",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": { "from": "now-12h", "to": "now" },
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"title": "Query Statistics_New4",
|
||||
"uid": "query-statistics4",
|
||||
"version": 2,
|
||||
"weekStart": ""
|
||||
}
|
||||
|
||||
48
examples/online_serving/dashboards/perses/README.md
Normal file
48
examples/online_serving/dashboards/perses/README.md
Normal file
@ -0,0 +1,48 @@
|
||||
# Perses Dashboards for vLLM Monitoring
|
||||
|
||||
This directory contains Perses dashboard configurations designed to monitor vLLM
|
||||
performance and metrics.
|
||||
|
||||
## Requirements
|
||||
|
||||
- Perses instance (standalone or via operator)
|
||||
- Prometheus data source configured in Perses
|
||||
- vLLM deployment with Prometheus metrics enabled
|
||||
|
||||
## Dashboard Format
|
||||
|
||||
We provide dashboards in the **native Perses YAML format** that works across all
|
||||
deployment methods:
|
||||
|
||||
- **Files**: `*.yaml` (native Perses dashboard specifications)
|
||||
- **Format**: Pure dashboard specifications that work everywhere
|
||||
- **Usage**: Works with standalone Perses, API imports, CLI, and file provisioning
|
||||
- **Kubernetes**: Directly compatible with Perses Operator
|
||||
|
||||
## Dashboard Descriptions
|
||||
|
||||
- **[performance_statistics.yaml](./performance_statistics.yaml)**: Performance metrics with aggregated latency
|
||||
statistics
|
||||
- **[query_statistics.yaml](./query_statistics.yaml)**: Query performance and deployment metrics
|
||||
|
||||
## Deployment Options
|
||||
|
||||
### Direct Import to Perses
|
||||
|
||||
Import the dashboard specifications via Perses API or CLI:
|
||||
|
||||
```bash
|
||||
percli apply -f performance_statistics.yaml
|
||||
```
|
||||
|
||||
### Perses Operator (Kubernetes)
|
||||
|
||||
The native YAML format works directly with the Perses Operator:
|
||||
|
||||
```bash
|
||||
kubectl apply -f performance_statistics.yaml -n <namespace>
|
||||
```
|
||||
|
||||
### File Provisioning
|
||||
|
||||
Place the YAML files in a Perses provisioning folder for automatic loading.
|
||||
@ -0,0 +1,764 @@
|
||||
kind: PersesDashboard
|
||||
metadata:
|
||||
name: performance-statistics
|
||||
createdAt: 0001-01-01T00:00:00Z
|
||||
updatedAt: 0001-01-01T00:00:00Z
|
||||
version: 0
|
||||
project: ""
|
||||
spec:
|
||||
display:
|
||||
name: Performance Statistics
|
||||
|
||||
variables:
|
||||
- kind: ListVariable
|
||||
spec:
|
||||
display:
|
||||
name: Deployment_ID
|
||||
hidden: false
|
||||
name: Deployment_id
|
||||
allowAllValue: true
|
||||
allowMultiple: true
|
||||
defaultValue:
|
||||
- $__all
|
||||
sort: alphabetical-asc
|
||||
plugin:
|
||||
kind: PrometheusLabelValuesVariable
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
labelName: model_name
|
||||
matchers:
|
||||
# Any one vllm metric that always carries model_name
|
||||
- vllm:generation_tokens_total{}
|
||||
|
||||
panels:
|
||||
"1":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: E2E Latency over Time
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend:
|
||||
mode: table
|
||||
position: bottom
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
# avg latency by model = sum(rate(sum)) / sum(rate(count))
|
||||
query: >
|
||||
sum by (model_name) (rate(vllm:e2e_request_latency_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
|
||||
/
|
||||
sum by (model_name) (rate(vllm:e2e_request_latency_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
|
||||
seriesNameFormat: '{{model_name}}'
|
||||
|
||||
"2":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: E2E Latency (Avg)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
(sum by (model_name) (increase(vllm:e2e_request_latency_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
|
||||
/
|
||||
(sum by (model_name) (increase(vllm:e2e_request_latency_seconds_count{model_name=~"$Deployment_id"}[$__range])))
|
||||
|
||||
"3":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: E2E Latency (P50)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.50,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
|
||||
"4":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: E2E Latency (P90)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.90,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
|
||||
"5":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: E2E Latency (P99)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.99,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:e2e_request_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
|
||||
"6":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: TTFT over Time
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend:
|
||||
mode: table
|
||||
position: bottom
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
sum by (model_name) (rate(vllm:time_to_first_token_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
|
||||
/
|
||||
sum by (model_name) (rate(vllm:time_to_first_token_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
|
||||
seriesNameFormat: '{{model_name}}'
|
||||
|
||||
"7":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: TTFT (Avg)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
(sum by (model_name) (increase(vllm:time_to_first_token_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
|
||||
/
|
||||
(sum by (model_name) (increase(vllm:time_to_first_token_seconds_count{model_name=~"$Deployment_id"}[$__range])))
|
||||
|
||||
"8":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: TTFT (P50)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.50,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:time_to_first_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
|
||||
"9":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: TTFT (P90)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.90,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:time_to_first_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
|
||||
"10":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: TTFT (P99)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.99,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:time_to_first_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
|
||||
"11":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: ITL (Time per Output Token) over Time
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend:
|
||||
mode: table
|
||||
position: bottom
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
sum by (model_name) (rate(vllm:time_per_output_token_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
|
||||
/
|
||||
sum by (model_name) (rate(vllm:time_per_output_token_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
|
||||
seriesNameFormat: '{{model_name}}'
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.50,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
seriesNameFormat: '{{model_name}} p50'
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.90,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
seriesNameFormat: '{{model_name}} p90'
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.99,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
seriesNameFormat: '{{model_name}} p99'
|
||||
|
||||
"12":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: ITL (Avg)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
(sum by (model_name) (increase(vllm:time_per_output_token_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
|
||||
/
|
||||
(sum by (model_name) (increase(vllm:time_per_output_token_seconds_count{model_name=~"$Deployment_id"}[$__range])))
|
||||
|
||||
"13":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: ITL (P50)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.50,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
|
||||
"14":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: ITL (P90)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.90,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
|
||||
"15":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: ITL (P99)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
histogram_quantile(
|
||||
0.99,
|
||||
sum by (le, model_name) (
|
||||
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
|
||||
)
|
||||
)
|
||||
|
||||
"16":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: TPS (Tokens/sec) over Time
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend:
|
||||
mode: table
|
||||
position: bottom
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
sum by (model_name) (rate(vllm:generation_tokens_total{model_name=~"$Deployment_id"}[$__interval]))
|
||||
seriesNameFormat: '{{model_name}} generation'
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
sum by (model_name) (rate(vllm:prompt_tokens_total{model_name=~"$Deployment_id"}[$__interval]))
|
||||
seriesNameFormat: '{{model_name}} prompt'
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
# overall iteration tokens/sec if exposed
|
||||
query: >
|
||||
rate(vllm:iteration_tokens_total_count[$__interval])
|
||||
seriesNameFormat: 'iteration overall'
|
||||
|
||||
"17":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: KV Cache Usage (avg %)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
# Multiply by 100 so we can read it as a percentage without setting a unit (avoids CUE unit conflicts)
|
||||
query: >
|
||||
100 * avg(vllm:gpu_cache_usage_perc)
|
||||
|
||||
"18":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: Running Requests by Pod
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend:
|
||||
mode: table
|
||||
position: bottom
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
sum by (pod) (vllm:num_requests_running)
|
||||
seriesNameFormat: '{{pod}}'
|
||||
|
||||
"19":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: Waiting Requests by Pod
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend:
|
||||
mode: table
|
||||
position: bottom
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: >
|
||||
sum by (pod) (vllm:num_requests_waiting)
|
||||
seriesNameFormat: '{{pod}}'
|
||||
|
||||
"20":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: Running Requests (sum)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: sum(vllm:num_requests_running)
|
||||
|
||||
"21":
|
||||
kind: Panel
|
||||
spec:
|
||||
display:
|
||||
name: Waiting Requests (sum)
|
||||
plugin:
|
||||
kind: StatChart
|
||||
spec:
|
||||
calculation: last-number
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource:
|
||||
kind: PrometheusDatasource
|
||||
name: accelerators-thanos-querier-datasource
|
||||
query: sum(vllm:num_requests_waiting)
|
||||
|
||||
layouts:
|
||||
- kind: Grid
|
||||
spec:
|
||||
display:
|
||||
title: Overview
|
||||
items:
|
||||
- x: 0
|
||||
y: 0
|
||||
width: 6
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/17' } # KV cache %
|
||||
- x: 6
|
||||
y: 0
|
||||
width: 6
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/20' } # running sum
|
||||
- x: 12
|
||||
y: 0
|
||||
width: 6
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/21' } # waiting sum
|
||||
|
||||
- kind: Grid
|
||||
spec:
|
||||
display:
|
||||
title: E2E Latency
|
||||
items:
|
||||
- x: 0
|
||||
y: 1
|
||||
width: 10
|
||||
height: 6
|
||||
content: { $ref: '#/spec/panels/1' }
|
||||
- x: 10
|
||||
y: 1
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/2' }
|
||||
- x: 17
|
||||
y: 1
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/3' }
|
||||
- x: 10
|
||||
y: 4
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/4' }
|
||||
- x: 17
|
||||
y: 4
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/5' }
|
||||
|
||||
- kind: Grid
|
||||
spec:
|
||||
display:
|
||||
title: TTFT
|
||||
items:
|
||||
- x: 0
|
||||
y: 8
|
||||
width: 10
|
||||
height: 6
|
||||
content: { $ref: '#/spec/panels/6' }
|
||||
- x: 10
|
||||
y: 8
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/7' }
|
||||
- x: 17
|
||||
y: 8
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/8' }
|
||||
- x: 10
|
||||
y: 11
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/9' }
|
||||
- x: 17
|
||||
y: 11
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/10' }
|
||||
|
||||
- kind: Grid
|
||||
spec:
|
||||
display:
|
||||
title: ITL (Time per Output Token)
|
||||
items:
|
||||
- x: 0
|
||||
y: 15
|
||||
width: 10
|
||||
height: 6
|
||||
content: { $ref: '#/spec/panels/11' }
|
||||
- x: 10
|
||||
y: 15
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/12' }
|
||||
- x: 17
|
||||
y: 15
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/13' }
|
||||
- x: 10
|
||||
y: 18
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/14' }
|
||||
- x: 17
|
||||
y: 18
|
||||
width: 7
|
||||
height: 3
|
||||
content: { $ref: '#/spec/panels/15' }
|
||||
|
||||
- kind: Grid
|
||||
spec:
|
||||
display:
|
||||
title: TPS (Prompt / Generation / Iteration)
|
||||
items:
|
||||
- x: 0
|
||||
y: 22
|
||||
width: 14
|
||||
height: 6
|
||||
content: { $ref: '#/spec/panels/16' }
|
||||
|
||||
- kind: Grid
|
||||
spec:
|
||||
display:
|
||||
title: Per-Pod Request State
|
||||
items:
|
||||
- x: 0
|
||||
y: 28
|
||||
width: 12
|
||||
height: 6
|
||||
content: { $ref: '#/spec/panels/18' }
|
||||
- x: 12
|
||||
y: 28
|
||||
width: 12
|
||||
height: 6
|
||||
content: { $ref: '#/spec/panels/19' }
|
||||
|
||||
392
examples/online_serving/dashboards/perses/query_statistics.yaml
Normal file
392
examples/online_serving/dashboards/perses/query_statistics.yaml
Normal file
@ -0,0 +1,392 @@
|
||||
kind: PersesDashboard
|
||||
metadata:
|
||||
name: query-statistics
|
||||
createdAt: 0001-01-01T00:00:00Z
|
||||
updatedAt: 0001-01-01T00:00:00Z
|
||||
version: 0
|
||||
project: ""
|
||||
spec:
|
||||
display:
|
||||
name: Query Statistics_New
|
||||
|
||||
variables:
|
||||
- kind: ListVariable
|
||||
spec:
|
||||
name: NS
|
||||
display: { name: Namespace }
|
||||
allowMultiple: false
|
||||
defaultValue: llm-d
|
||||
plugin:
|
||||
kind: PrometheusLabelValuesVariable
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
labelName: namespace
|
||||
matchers:
|
||||
- up{service=~".*vllm.*"}
|
||||
|
||||
- kind: ListVariable
|
||||
spec:
|
||||
name: SVC
|
||||
display: { name: Service }
|
||||
allowMultiple: false
|
||||
defaultValue: vllm-qwen2-0-5b-sim
|
||||
plugin:
|
||||
kind: PrometheusLabelValuesVariable
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
labelName: service
|
||||
matchers:
|
||||
- up{namespace="$NS",service=~".*vllm.*"}
|
||||
|
||||
- kind: ListVariable
|
||||
spec:
|
||||
name: MODEL
|
||||
display: { name: Model (real vLLM) }
|
||||
allowAllValue: true
|
||||
allowMultiple: true
|
||||
defaultValue: ["$__all"]
|
||||
plugin:
|
||||
kind: PrometheusLabelValuesVariable
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
labelName: model_name
|
||||
matchers:
|
||||
- vllm:request_success_total{namespace="$NS",service="$SVC"}
|
||||
|
||||
panels:
|
||||
|
||||
# --- Core (works on Simulator & Real) ---
|
||||
core_running_now:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Running Requests (now) }
|
||||
plugin: { kind: StatChart, spec: { calculation: last-number } }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: sum(vllm:num_requests_running{namespace="$NS",service="$SVC"}) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
core_waiting_now:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Waiting Requests (now) }
|
||||
plugin: { kind: StatChart, spec: { calculation: last-number } }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: sum(vllm:num_requests_waiting{namespace="$NS",service="$SVC"}) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
core_kv_usage_now:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: KV Cache Usage (0–1) }
|
||||
plugin: { kind: StatChart, spec: { calculation: last-number } }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: avg(vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
core_running_ts:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Running Over Time }
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend: { mode: table, position: bottom }
|
||||
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: sum by (service) (vllm:num_requests_running{namespace="$NS",service="$SVC"}) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
core_waiting_ts:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Waiting Over Time }
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend: { mode: table, position: bottom }
|
||||
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: sum by (service) (vllm:num_requests_waiting{namespace="$NS",service="$SVC"}) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
core_targets_up:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Scrape Targets Up }
|
||||
plugin: { kind: StatChart, spec: { calculation: last-number } }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: count(up{namespace="$NS",service="$SVC"} == 1) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
# --- KV Cache as Percent (works on Simulator & Real) ---
|
||||
core_kv_usage_pct_now:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: KV Cache Usage (%) – now }
|
||||
plugin: { kind: StatChart, spec: { calculation: last-number } }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
# multiply by 100 to present percentage; omit format.unit to avoid schema conflicts
|
||||
query: (avg(vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
core_kv_usage_pct_ts:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: KV Cache Usage (%) – over time }
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend: { mode: table, position: bottom }
|
||||
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: (avg by (service) (vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
# --- Per-Pod breakdowns (works on Simulator & Real) ---
|
||||
per_pod_running_ts:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Running by Pod }
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend: { mode: table, position: bottom }
|
||||
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: sum by (pod) (vllm:num_requests_running{namespace="$NS",service="$SVC"}) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
per_pod_waiting_ts:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Waiting by Pod }
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend: { mode: table, position: bottom }
|
||||
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: sum by (pod) (vllm:num_requests_waiting{namespace="$NS",service="$SVC"}) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
per_pod_kv_pct_ts:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: KV Cache (%) by Pod }
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend: { mode: table, position: bottom }
|
||||
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
# if your exporter labels kv metric with pod (the sim does), this works; otherwise it will just return empty
|
||||
query: (avg by (pod) (vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
# --- Real vLLM only (zeros on simulator) ---
|
||||
real_req_rate_ts:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Request Rate (real vLLM) }
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend: { mode: table, position: bottom }
|
||||
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: sum by (model_name) (rate(vllm:request_success_total{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval])) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
real_p50:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: p50 Latency (real vLLM) }
|
||||
plugin: { kind: StatChart, spec: { calculation: last-number } }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: histogram_quantile(0.50, sum by (le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval]))) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
real_p90:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: p90 Latency (real vLLM) }
|
||||
plugin: { kind: StatChart, spec: { calculation: last-number } }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: histogram_quantile(0.90, sum by (le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval]))) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
real_p99:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: p99 Latency (real vLLM) }
|
||||
plugin: { kind: StatChart, spec: { calculation: last-number } }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: histogram_quantile(0.99, sum by (le, model_name) (rate(vllm:e2e_request_latency_seconds_bucket{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval]))) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
real_input_tokens_ts:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Input Tokens / sec (real vLLM) }
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend: { mode: table, position: bottom }
|
||||
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: sum by (model_name) (rate(vllm:prompt_tokens_total{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval])) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
real_output_tokens_ts:
|
||||
kind: Panel
|
||||
spec:
|
||||
display: { name: Output Tokens / sec (real vLLM) }
|
||||
plugin:
|
||||
kind: TimeSeriesChart
|
||||
spec:
|
||||
legend: { mode: table, position: bottom }
|
||||
visual: { display: line, lineWidth: 1, areaOpacity: 0.3 }
|
||||
queries:
|
||||
- kind: TimeSeriesQuery
|
||||
spec:
|
||||
plugin:
|
||||
kind: PrometheusTimeSeriesQuery
|
||||
spec:
|
||||
datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource }
|
||||
query: sum by (model_name) (rate(vllm:generation_tokens_total{namespace="$NS",service="$SVC",model_name=~"$MODEL"}[$__interval])) or vector(0)
|
||||
minStep: "15s"
|
||||
|
||||
layouts:
|
||||
- kind: Grid
|
||||
spec:
|
||||
display: { title: Core (Sim & Real) }
|
||||
items:
|
||||
- { x: 0, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_running_now' } }
|
||||
- { x: 6, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_waiting_now' } }
|
||||
- { x: 12, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_kv_usage_now' } }
|
||||
- { x: 18, y: 0, width: 6, height: 3, content: { $ref: '#/spec/panels/core_targets_up' } }
|
||||
- { x: 0, y: 3, width: 12, height: 6, content: { $ref: '#/spec/panels/core_running_ts' } }
|
||||
- { x: 12, y: 3, width: 12, height: 6, content: { $ref: '#/spec/panels/core_waiting_ts' } }
|
||||
|
||||
- kind: Grid
|
||||
spec:
|
||||
display: { title: KV Cache (%) }
|
||||
items:
|
||||
- { x: 0, y: 9, width: 6, height: 3, content: { $ref: '#/spec/panels/core_kv_usage_pct_now' } }
|
||||
- { x: 6, y: 9, width: 18, height: 6, content: { $ref: '#/spec/panels/core_kv_usage_pct_ts' } }
|
||||
|
||||
- kind: Grid
|
||||
spec:
|
||||
display: { title: Per-Pod breakdowns }
|
||||
items:
|
||||
- { x: 0, y: 15, width: 12, height: 6, content: { $ref: '#/spec/panels/per_pod_running_ts' } }
|
||||
- { x: 12, y: 15, width: 12, height: 6, content: { $ref: '#/spec/panels/per_pod_waiting_ts' } }
|
||||
- { x: 0, y: 21, width: 24, height: 6, content: { $ref: '#/spec/panels/per_pod_kv_pct_ts' } }
|
||||
|
||||
- kind: Grid
|
||||
spec:
|
||||
display: { title: Real vLLM only (shows 0 on simulator) }
|
||||
items:
|
||||
- { x: 0, y: 27, width: 12, height: 6, content: { $ref: '#/spec/panels/real_req_rate_ts' } }
|
||||
- { x: 12, y: 27, width: 4, height: 3, content: { $ref: '#/spec/panels/real_p50' } }
|
||||
- { x: 16, y: 27, width: 4, height: 3, content: { $ref: '#/spec/panels/real_p90' } }
|
||||
- { x: 20, y: 27, width: 4, height: 3, content: { $ref: '#/spec/panels/real_p99' } }
|
||||
- { x: 0, y: 33, width: 12, height: 6, content: { $ref: '#/spec/panels/real_input_tokens_ts' } }
|
||||
- { x: 12, y: 33, width: 12, height: 6, content: { $ref: '#/spec/panels/real_output_tokens_ts' } }
|
||||
|
||||
@ -6,7 +6,7 @@ without any specific flags:
|
||||
|
||||
```bash
|
||||
VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
|
||||
--guided-decoding-backend outlines
|
||||
--structured-outputs-config.backend outlines
|
||||
```
|
||||
|
||||
This example demonstrates how to generate chat completions
|
||||
|
||||
@ -12,6 +12,12 @@ python examples/online_serving/pooling/cohere_rerank_client.py
|
||||
python examples/online_serving/pooling/jinaai_rerank_client.py
|
||||
```
|
||||
|
||||
## Named Entity Recognition (NER) usage
|
||||
|
||||
```bash
|
||||
python examples/online_serving/pooling/ner.py
|
||||
```
|
||||
|
||||
## Openai chat embedding for multimodal usage
|
||||
|
||||
```bash
|
||||
|
||||
71
examples/online_serving/pooling/ner.py
Normal file
71
examples/online_serving/pooling/ner.py
Normal file
@ -0,0 +1,71 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Adapted from https://huggingface.co/boltuix/NeuroBERT-NER
|
||||
|
||||
"""
|
||||
Example online usage of Pooling API for Named Entity Recognition (NER).
|
||||
|
||||
Run `vllm serve <model> --runner pooling`
|
||||
to start up the server in vLLM. e.g.
|
||||
|
||||
vllm serve boltuix/NeuroBERT-NER
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
||||
import requests
|
||||
import torch
|
||||
|
||||
|
||||
def post_http_request(prompt: dict, api_url: str) -> requests.Response:
|
||||
headers = {"User-Agent": "Test Client"}
|
||||
response = requests.post(api_url, headers=headers, json=prompt)
|
||||
return response
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--host", type=str, default="localhost")
|
||||
parser.add_argument("--port", type=int, default=8000)
|
||||
parser.add_argument("--model", type=str, default="boltuix/NeuroBERT-NER")
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main(args):
|
||||
from transformers import AutoConfig, AutoTokenizer
|
||||
|
||||
api_url = f"http://{args.host}:{args.port}/pooling"
|
||||
model_name = args.model
|
||||
|
||||
# Load tokenizer and config
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
config = AutoConfig.from_pretrained(model_name)
|
||||
label_map = config.id2label
|
||||
|
||||
# Input text
|
||||
text = "Barack Obama visited Microsoft headquarters in Seattle on January 2025."
|
||||
prompt = {"model": model_name, "input": text}
|
||||
|
||||
pooling_response = post_http_request(prompt=prompt, api_url=api_url)
|
||||
|
||||
# Run inference
|
||||
output = pooling_response.json()["data"][0]
|
||||
logits = torch.tensor(output["data"])
|
||||
predictions = logits.argmax(dim=-1)
|
||||
inputs = tokenizer(text, return_tensors="pt")
|
||||
|
||||
# Map predictions to labels
|
||||
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
|
||||
labels = [label_map[p.item()] for p in predictions]
|
||||
assert len(tokens) == len(predictions)
|
||||
|
||||
# Print results
|
||||
for token, label in zip(tokens, labels):
|
||||
if token not in tokenizer.all_special_tokens:
|
||||
print(f"{token:15} → {label}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
main(args)
|
||||
@ -86,7 +86,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
|
||||
"content": "Classify this sentiment: vLLM is wonderful!",
|
||||
}
|
||||
],
|
||||
"extra_body": {"guided_choice": ["positive", "negative"]},
|
||||
"extra_body": {"structured_outputs": {"choice": ["positive", "negative"]}},
|
||||
},
|
||||
"regex": {
|
||||
"messages": [
|
||||
@ -96,7 +96,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
|
||||
}
|
||||
],
|
||||
"extra_body": {
|
||||
"guided_regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n",
|
||||
"structured_outputs": {"regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n"},
|
||||
},
|
||||
},
|
||||
"json": {
|
||||
@ -122,7 +122,8 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
|
||||
}
|
||||
],
|
||||
"extra_body": {
|
||||
"guided_grammar": """
|
||||
"structured_outputs": {
|
||||
"grammar": """
|
||||
root ::= select_statement
|
||||
|
||||
select_statement ::= "SELECT " column " from " table " where " condition
|
||||
@ -135,6 +136,7 @@ condition ::= column "= " number
|
||||
|
||||
number ::= "1 " | "2 "
|
||||
""",
|
||||
}
|
||||
},
|
||||
},
|
||||
"structural_tag": {
|
||||
|
||||
@ -1,36 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import importlib
|
||||
import traceback
|
||||
from typing import Callable
|
||||
from unittest.mock import patch
|
||||
|
||||
|
||||
def find_cuda_init(fn: Callable[[], object]) -> None:
|
||||
"""
|
||||
Helper function to debug CUDA re-initialization errors.
|
||||
|
||||
If `fn` initializes CUDA, prints the stack trace of how this happens.
|
||||
"""
|
||||
from torch.cuda import _lazy_init
|
||||
|
||||
stack = None
|
||||
|
||||
def wrapper():
|
||||
nonlocal stack
|
||||
stack = traceback.extract_stack()
|
||||
return _lazy_init()
|
||||
|
||||
with patch("torch.cuda._lazy_init", wrapper):
|
||||
fn()
|
||||
|
||||
if stack is not None:
|
||||
print("==== CUDA Initialized ====")
|
||||
print("".join(traceback.format_list(stack)).strip())
|
||||
print("==========================")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
find_cuda_init(
|
||||
lambda: importlib.import_module("vllm.model_executor.models.llava"))
|
||||
@ -79,6 +79,7 @@ plugins:
|
||||
- "re:vllm\\._.*" # Internal modules
|
||||
- "vllm.third_party"
|
||||
- "vllm.vllm_flash_attn"
|
||||
- !ENV [API_AUTONAV_EXCLUDE, "re:^$"] # Match nothing by default
|
||||
- mkdocstrings:
|
||||
handlers:
|
||||
python:
|
||||
|
||||
@ -115,7 +115,6 @@ follow_imports = "silent"
|
||||
# move the directory here and remove it from tools/mypy.sh
|
||||
files = [
|
||||
"vllm/*.py",
|
||||
"vllm/adapter_commons",
|
||||
"vllm/assets",
|
||||
"vllm/entrypoints",
|
||||
"vllm/core",
|
||||
|
||||
@ -1 +1,2 @@
|
||||
lmcache
|
||||
lmcache
|
||||
nixl >= 0.5.1 # Required for disaggregated prefill
|
||||
|
||||
@ -14,3 +14,4 @@ setuptools-scm>=8
|
||||
wheel
|
||||
jinja2>=3.1.6
|
||||
amdsmi==6.2.4
|
||||
timm>=1.0.17
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
# Common dependencies
|
||||
-r common.txt
|
||||
tblib==3.1.0
|
||||
|
||||
# entrypoints test
|
||||
# librosa==0.10.2.post1 # required by audio tests in entrypoints/openai
|
||||
|
||||
@ -17,4 +17,5 @@ setuptools>=77.0.3,<80.0.0
|
||||
setuptools-scm>=8
|
||||
runai-model-streamer==0.11.0
|
||||
runai-model-streamer-s3==0.11.0
|
||||
conch-triton-kernels==1.2.1
|
||||
conch-triton-kernels==1.2.1
|
||||
timm>=1.0.17
|
||||
@ -1,54 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""vllm.entrypoints.api_server with some extra logging for testing."""
|
||||
from collections.abc import Iterable
|
||||
from typing import Any
|
||||
|
||||
import uvicorn
|
||||
from fastapi.responses import JSONResponse, Response
|
||||
|
||||
import vllm.entrypoints.api_server
|
||||
import vllm.envs as envs
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
app = vllm.entrypoints.api_server.app
|
||||
|
||||
|
||||
class AsyncLLMEngineWithStats(AsyncLLMEngine):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self._num_aborts = 0
|
||||
|
||||
async def _engine_abort(self, request_ids: Iterable[str]):
|
||||
ids = list(request_ids)
|
||||
self._num_aborts += len(ids)
|
||||
await super()._engine_abort(ids)
|
||||
|
||||
def testing_stats(self) -> dict[str, Any]:
|
||||
return {"num_aborted_requests": self._num_aborts}
|
||||
|
||||
|
||||
@app.get("/stats")
|
||||
def stats() -> Response:
|
||||
"""Get the statistics of the engine."""
|
||||
return JSONResponse(engine.testing_stats())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser()
|
||||
parser.add_argument("--host", type=str, default="localhost")
|
||||
parser.add_argument("--port", type=int, default=8000)
|
||||
parser = AsyncEngineArgs.add_cli_args(parser)
|
||||
args = parser.parse_args()
|
||||
|
||||
engine_args = AsyncEngineArgs.from_cli_args(args)
|
||||
engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
|
||||
vllm.entrypoints.api_server.engine = engine
|
||||
uvicorn.run(app,
|
||||
host=args.host,
|
||||
port=args.port,
|
||||
log_level="debug",
|
||||
timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE)
|
||||
@ -1,12 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def use_v0_only(monkeypatch):
|
||||
"""
|
||||
Since this module is V0 only, set VLLM_USE_V1=0 for
|
||||
all tests in the module.
|
||||
"""
|
||||
monkeypatch.setenv('VLLM_USE_V1', '0')
|
||||
@ -1,139 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import copyreg
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from multiprocessing import Pool
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
import urllib3.exceptions
|
||||
|
||||
|
||||
def _pickle_new_connection_error(obj):
|
||||
"""Custom pickler for NewConnectionError to fix tblib compatibility."""
|
||||
# Extract the original message by removing the "conn: " prefix
|
||||
full_message = obj.args[0] if obj.args else ""
|
||||
if ': ' in full_message:
|
||||
# Split off the connection part and keep the actual message
|
||||
_, actual_message = full_message.split(': ', 1)
|
||||
else:
|
||||
actual_message = full_message
|
||||
return _unpickle_new_connection_error, (actual_message, )
|
||||
|
||||
|
||||
def _unpickle_new_connection_error(message):
|
||||
"""Custom unpickler for NewConnectionError."""
|
||||
# Create with None as conn and the actual message
|
||||
return urllib3.exceptions.NewConnectionError(None, message)
|
||||
|
||||
|
||||
# Register the custom pickle/unpickle functions for tblib compatibility
|
||||
copyreg.pickle(urllib3.exceptions.NewConnectionError,
|
||||
_pickle_new_connection_error)
|
||||
|
||||
|
||||
def _query_server(prompt: str, max_tokens: int = 5) -> dict:
|
||||
response = requests.post("http://localhost:8000/generate",
|
||||
json={
|
||||
"prompt": prompt,
|
||||
"max_tokens": max_tokens,
|
||||
"temperature": 0,
|
||||
"ignore_eos": True
|
||||
})
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
|
||||
def _query_server_long(prompt: str) -> dict:
|
||||
return _query_server(prompt, max_tokens=500)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def api_server(distributed_executor_backend: str):
|
||||
script_path = Path(__file__).parent.joinpath(
|
||||
"api_server_async_engine.py").absolute()
|
||||
commands = [
|
||||
sys.executable,
|
||||
"-u",
|
||||
str(script_path),
|
||||
"--model",
|
||||
"facebook/opt-125m",
|
||||
"--host",
|
||||
"127.0.0.1",
|
||||
"--distributed-executor-backend",
|
||||
distributed_executor_backend,
|
||||
]
|
||||
|
||||
# API Server Test Requires V0.
|
||||
my_env = os.environ.copy()
|
||||
my_env["VLLM_USE_V1"] = "0"
|
||||
uvicorn_process = subprocess.Popen(commands, env=my_env)
|
||||
yield
|
||||
uvicorn_process.terminate()
|
||||
|
||||
|
||||
@pytest.mark.timeout(300)
|
||||
@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
|
||||
def test_api_server(api_server, distributed_executor_backend: str):
|
||||
"""
|
||||
Run the API server and test it.
|
||||
|
||||
We run both the server and requests in separate processes.
|
||||
|
||||
We test that the server can handle incoming requests, including
|
||||
multiple requests at the same time, and that it can handle requests
|
||||
being cancelled without crashing.
|
||||
"""
|
||||
with Pool(32) as pool:
|
||||
# Wait until the server is ready
|
||||
prompts = ["warm up"] * 1
|
||||
result = None
|
||||
while not result:
|
||||
try:
|
||||
for r in pool.map(_query_server, prompts):
|
||||
result = r
|
||||
break
|
||||
except requests.exceptions.ConnectionError:
|
||||
time.sleep(1)
|
||||
|
||||
# Actual tests start here
|
||||
# Try with 1 prompt
|
||||
for result in pool.map(_query_server, prompts):
|
||||
assert result
|
||||
|
||||
num_aborted_requests = requests.get(
|
||||
"http://localhost:8000/stats").json()["num_aborted_requests"]
|
||||
assert num_aborted_requests == 0
|
||||
|
||||
# Try with 100 prompts
|
||||
prompts = ["test prompt"] * 100
|
||||
for result in pool.map(_query_server, prompts):
|
||||
assert result
|
||||
|
||||
with Pool(32) as pool:
|
||||
# Cancel requests
|
||||
prompts = ["canceled requests"] * 100
|
||||
pool.map_async(_query_server_long, prompts)
|
||||
time.sleep(0.01)
|
||||
pool.terminate()
|
||||
pool.join()
|
||||
|
||||
# check cancellation stats
|
||||
# give it some time to update the stats
|
||||
time.sleep(1)
|
||||
|
||||
num_aborted_requests = requests.get(
|
||||
"http://localhost:8000/stats").json()["num_aborted_requests"]
|
||||
assert num_aborted_requests > 0
|
||||
|
||||
# check that server still runs after cancellations
|
||||
with Pool(32) as pool:
|
||||
# Try with 100 prompts
|
||||
prompts = ["test prompt after canceled"] * 100
|
||||
for result in pool.map(_query_server, prompts):
|
||||
assert result
|
||||
@ -1,71 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.engine.async_llm_engine import RequestTracker
|
||||
from vllm.outputs import RequestOutput
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_request_tracker():
|
||||
tracker = RequestTracker()
|
||||
stream_1 = tracker.add_request("1")
|
||||
assert tracker.new_requests_event.is_set()
|
||||
await tracker.wait_for_new_requests()
|
||||
new, aborted = tracker.get_new_and_aborted_requests()
|
||||
assert not tracker.new_requests_event.is_set()
|
||||
assert len(new) == 1
|
||||
assert new[0]["request_id"] == "1"
|
||||
assert not aborted
|
||||
assert not stream_1.finished
|
||||
|
||||
stream_2 = tracker.add_request("2")
|
||||
stream_3 = tracker.add_request("3")
|
||||
assert tracker.new_requests_event.is_set()
|
||||
await tracker.wait_for_new_requests()
|
||||
new, aborted = tracker.get_new_and_aborted_requests()
|
||||
assert not tracker.new_requests_event.is_set()
|
||||
assert len(new) == 2
|
||||
assert new[0]["request_id"] == "2"
|
||||
assert new[1]["request_id"] == "3"
|
||||
assert not aborted
|
||||
assert not stream_2.finished
|
||||
assert not stream_3.finished
|
||||
|
||||
# request_ids must be unique
|
||||
with pytest.raises(KeyError):
|
||||
tracker.add_request("1")
|
||||
assert not tracker.new_requests_event.is_set()
|
||||
|
||||
tracker.abort_request("1")
|
||||
new, aborted = tracker.get_new_and_aborted_requests()
|
||||
assert len(aborted) == 1
|
||||
assert "1" in aborted
|
||||
assert not new
|
||||
assert stream_1.finished
|
||||
|
||||
stream_4 = tracker.add_request("4")
|
||||
tracker.abort_request("4")
|
||||
assert tracker.new_requests_event.is_set()
|
||||
await tracker.wait_for_new_requests()
|
||||
new, aborted = tracker.get_new_and_aborted_requests()
|
||||
# aborted new requests will cancel each other out -
|
||||
# there's no need for them to propagate into the
|
||||
# engine
|
||||
assert not aborted
|
||||
assert not new
|
||||
assert stream_4.finished
|
||||
|
||||
stream_5 = tracker.add_request("5")
|
||||
assert tracker.new_requests_event.is_set()
|
||||
tracker.process_request_output(
|
||||
RequestOutput("2", "output", [], [], [], finished=True))
|
||||
await tracker.wait_for_new_requests()
|
||||
new, aborted = tracker.get_new_and_aborted_requests()
|
||||
assert not tracker.new_requests_event.is_set()
|
||||
assert not aborted
|
||||
assert len(new) == 1
|
||||
assert new[0]["request_id"] == "5"
|
||||
assert stream_2.finished
|
||||
assert not stream_5.finished
|
||||
@ -1,189 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Compare the short outputs of HF and vLLM when using greedy sampling.
|
||||
|
||||
VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test.
|
||||
|
||||
Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1
|
||||
pytest tests/basic_correctness/test_preemption.py`.
|
||||
"""
|
||||
import pytest
|
||||
from prometheus_client import REGISTRY
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm import SamplingParams
|
||||
from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
|
||||
ENABLE_ARTIFICIAL_PREEMPT)
|
||||
|
||||
from ..models.utils import check_outputs_equal
|
||||
|
||||
MODELS = [
|
||||
"distilbert/distilgpt2",
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def use_v0_only(monkeypatch):
|
||||
"""
|
||||
We should enable this for V1, but VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT,
|
||||
so use VLLM_USE_V1=0 for all tests in the file.
|
||||
"""
|
||||
monkeypatch.setenv('VLLM_USE_V1', '0')
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", autouse=True)
|
||||
def check_settings():
|
||||
assert ENABLE_ARTIFICIAL_PREEMPT is True, (
|
||||
"Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1."
|
||||
"`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 "
|
||||
"pytest tests/basic_correctness/test_preemption.py`")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def distributed_executor_backend() -> str:
|
||||
# When SPMD worker is used, use distributed_executor_backend="ray"
|
||||
# to test delta input optimization works with preemption.
|
||||
return "ray" if envs.VLLM_USE_RAY_SPMD_WORKER else "mp"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [96])
|
||||
@pytest.mark.parametrize("chunked_prefill_token_size", [16])
|
||||
def test_chunked_prefill_recompute(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
chunked_prefill_token_size: int,
|
||||
distributed_executor_backend: str,
|
||||
) -> None:
|
||||
"""Ensure that chunked prefill works with preemption."""
|
||||
max_num_seqs = min(chunked_prefill_token_size, 256)
|
||||
enable_chunked_prefill = False
|
||||
max_num_batched_tokens = None
|
||||
if chunked_prefill_token_size != -1:
|
||||
enable_chunked_prefill = True
|
||||
max_num_batched_tokens = chunked_prefill_token_size
|
||||
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
max_num_seqs=max_num_seqs,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
disable_log_stats=False,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
|
||||
< ARTIFICIAL_PREEMPTION_MAX_CNT)
|
||||
|
||||
for i in range(len(example_prompts)):
|
||||
hf_output_ids, hf_output_str = hf_outputs[i]
|
||||
vllm_output_ids, vllm_output_str = vllm_outputs[i]
|
||||
assert hf_output_str == vllm_output_str, (
|
||||
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
|
||||
assert hf_output_ids == vllm_output_ids, (
|
||||
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
@pytest.mark.parametrize("max_tokens", [96])
|
||||
def test_preemption(
|
||||
caplog_vllm,
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
distributed_executor_backend: str,
|
||||
) -> None:
|
||||
"""By default, recompute preemption is enabled"""
|
||||
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
disable_log_stats=False,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
|
||||
< ARTIFICIAL_PREEMPTION_MAX_CNT)
|
||||
total_preemption = (
|
||||
vllm_model.llm.llm_engine.scheduler[0].num_cumulative_preemption)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
assert ("is preempted by PreemptionMode.RECOMPUTE mode because there "
|
||||
"is not enough KV cache space." in caplog_vllm.text)
|
||||
# Ensure the count bucket of request-level histogram metrics matches
|
||||
# the number of requests as a simple sanity check to ensure metrics are
|
||||
# generated
|
||||
preemption_metrics = None
|
||||
for m in REGISTRY.collect():
|
||||
if m.name == "vllm:num_preemptions":
|
||||
preemption_metrics = m
|
||||
assert preemption_metrics is not None
|
||||
total_recorded_preemption = 0
|
||||
for sample in preemption_metrics.samples:
|
||||
total_recorded_preemption += sample.value
|
||||
assert total_preemption == total_recorded_preemption
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
@pytest.mark.parametrize("max_tokens", [96])
|
||||
def test_preemption_infeasible(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
distributed_executor_backend: str,
|
||||
) -> None:
|
||||
"""Verify infeasible preemption request will be ignored."""
|
||||
BLOCK_SIZE = 16
|
||||
prefill_blocks = 2
|
||||
decode_blocks = max_tokens // BLOCK_SIZE
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
block_size=BLOCK_SIZE,
|
||||
# Not enough gpu blocks to complete a single sequence.
|
||||
# preemption should happen, and the sequence should be
|
||||
# ignored instead of hanging forever.
|
||||
num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
|
||||
max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
) as vllm_model:
|
||||
sampling_params = SamplingParams(max_tokens=max_tokens,
|
||||
ignore_eos=True)
|
||||
req_outputs = vllm_model.llm.generate(
|
||||
example_prompts,
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
|
||||
assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
|
||||
< ARTIFICIAL_PREEMPTION_MAX_CNT)
|
||||
|
||||
# Verify the request is ignored and not hang.
|
||||
for req_output in req_outputs:
|
||||
outputs = req_output.outputs
|
||||
assert len(outputs) == 1
|
||||
assert outputs[0].finish_reason == "length"
|
||||
@ -68,7 +68,7 @@ def test_bench_serve_chat(server):
|
||||
"5",
|
||||
"--endpoint",
|
||||
"/v1/chat/completions",
|
||||
"--endpoint-type",
|
||||
"--backend",
|
||||
"openai-chat",
|
||||
]
|
||||
result = subprocess.run(command, capture_output=True, text=True)
|
||||
|
||||
@ -98,8 +98,9 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module):
|
||||
return [FUSED_OPS[kNvfp4Quant]]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_tokens", [64])
|
||||
@pytest.mark.parametrize("hidden_size", [128])
|
||||
@pytest.mark.parametrize("num_tokens", [32, 64])
|
||||
@pytest.mark.parametrize("hidden_size", [128, 256])
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
|
||||
@pytest.mark.parametrize(
|
||||
"model_class",
|
||||
cast(list[type], [TestSiluMulFp8QuantModel, TestSiluMulNvfp4QuantModel]
|
||||
@ -110,13 +111,13 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module):
|
||||
[True, False] if cutlass_fp8_supported() else [True])
|
||||
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"],
|
||||
reason="Only test on CUDA and ROCm")
|
||||
def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class,
|
||||
def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, dtype, model_class,
|
||||
cuda_force_torch):
|
||||
if model_class == TestSiluMulNvfp4QuantModel and cuda_force_torch:
|
||||
pytest.skip("Duplicate tests for NVFP4")
|
||||
|
||||
torch.set_default_device("cuda")
|
||||
torch.set_default_dtype(torch.float16)
|
||||
torch.set_default_dtype(dtype)
|
||||
|
||||
x = torch.rand(num_tokens, hidden_size * 2)
|
||||
|
||||
@ -145,8 +146,8 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class,
|
||||
elif model_class == TestSiluMulNvfp4QuantModel:
|
||||
atol, rtol = 1e-1, 1e-1
|
||||
|
||||
torch.testing.assert_close(result[0].to(dtype=torch.float16),
|
||||
result2[0].to(dtype=torch.float16),
|
||||
torch.testing.assert_close(result[0].to(dtype=dtype),
|
||||
result2[0].to(dtype=dtype),
|
||||
atol=atol,
|
||||
rtol=rtol)
|
||||
|
||||
|
||||
@ -1,15 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def should_do_global_cleanup_after_test() -> bool:
|
||||
"""Disable the global cleanup fixture for tests in this directory. This
|
||||
provides a ~10x speedup for unit tests that don't load a model to GPU.
|
||||
|
||||
This requires that tests in this directory clean up after themselves if they
|
||||
use the GPU.
|
||||
"""
|
||||
return False
|
||||
@ -1,71 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Iterable
|
||||
from typing import Callable, Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs, seed):
|
||||
return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs, seed)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
test_llm_kwargs, seed):
|
||||
return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
test_llm_kwargs, seed)
|
||||
|
||||
|
||||
def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
distinct_llm_kwargs, seed):
|
||||
kwargs = {
|
||||
**common_llm_kwargs,
|
||||
**per_test_common_llm_kwargs,
|
||||
**distinct_llm_kwargs,
|
||||
}
|
||||
|
||||
def generator_inner():
|
||||
llm = LLM(**kwargs)
|
||||
|
||||
set_random_seed(seed)
|
||||
|
||||
yield llm
|
||||
del llm
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
for llm in generator_inner():
|
||||
yield llm
|
||||
del llm
|
||||
|
||||
|
||||
def get_text_from_llm_generator(llm_generator: Iterable[LLM],
|
||||
prompts,
|
||||
sampling_params,
|
||||
llm_cb: Optional[Callable[[LLM],
|
||||
None]] = None):
|
||||
for llm in llm_generator:
|
||||
if llm_cb:
|
||||
llm_cb(llm)
|
||||
outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
|
||||
text = [output.outputs[0].text for output in outputs]
|
||||
del llm
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params):
|
||||
for llm in llm_generator:
|
||||
outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
|
||||
token_ids = [output.outputs[0].token_ids for output in outputs]
|
||||
del llm
|
||||
|
||||
return token_ids
|
||||
@ -1,479 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from itertools import cycle
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import SamplingParams
|
||||
|
||||
from .conftest import get_token_ids_from_llm_generator
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"common_llm_kwargs",
|
||||
[{
|
||||
# Use a small model for a fast test.
|
||||
"model": "facebook/opt-125m",
|
||||
|
||||
# skip cuda graph creation for fast test.
|
||||
"enforce_eager": True,
|
||||
|
||||
# Allow only 5 sequences of ~1024 tokens in worst case.
|
||||
"block_size": 16,
|
||||
"num_gpu_blocks_override": 5 * (64 + 1),
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"preemption_mode": "swap"
|
||||
}, {
|
||||
"preemption_mode": "recompute"
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [10])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
def test_block_manager_with_preemption(baseline_llm_generator,
|
||||
test_llm_generator, batch_size):
|
||||
"""Verify block manager produces same outputs even when there is preemption.
|
||||
|
||||
This constructs two LLM, each with limited number of GPU blocks. The limit
|
||||
is decided such that as the sequences in the batch grow, sequences must be
|
||||
preempted and removed from cache.
|
||||
|
||||
If the output token ids are equivalent, then we have confidence that the KV
|
||||
cache is not corrupted.
|
||||
|
||||
NOTE: We want a significant number of generated tokens so that any incorrect
|
||||
KV mapping has time to build up error.
|
||||
|
||||
NOTE(Kuntai): Though we have removed block manager v1, this test is still
|
||||
useful as it asserts the behavior of block manager v2 (now it is called
|
||||
SelfAttnBlockSpaceManager) is the same when swapping / preemption, so we
|
||||
keep this test.
|
||||
"""
|
||||
output_len = 1024
|
||||
temperature = 0.0
|
||||
|
||||
# We want to ensure equality even with preemption.
|
||||
# We force the total block size to be 1 + cdiv(output_len, block_size)
|
||||
# so that only one sequence can fit at a time (once the sequences grow).
|
||||
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=output_len,
|
||||
ignore_eos=True,
|
||||
temperature=temperature,
|
||||
)
|
||||
|
||||
baseline_token_ids = get_token_ids_from_llm_generator(
|
||||
baseline_llm_generator, prompts, sampling_params)
|
||||
|
||||
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
|
||||
prompts, sampling_params)
|
||||
|
||||
for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
|
||||
test_token_ids):
|
||||
assert expected_token_ids == actual_token_ids
|
||||
|
||||
assert baseline_token_ids == test_token_ids
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"common_llm_kwargs",
|
||||
[{
|
||||
# Use a small model for a fast test.
|
||||
"model": "facebook/opt-125m",
|
||||
|
||||
# Our prompts will generate 128 tokens; since the prompts themselves are
|
||||
# small, we don't need much KV space beyond 128.
|
||||
"max_model_len": 160,
|
||||
|
||||
# skip cuda graph creation for fast test.
|
||||
"enforce_eager": True,
|
||||
}])
|
||||
@pytest.mark.parametrize(
|
||||
"per_test_common_llm_kwargs",
|
||||
[
|
||||
{
|
||||
"block_size": 16,
|
||||
|
||||
# Allow only 2 sequences of ~128 tokens in worst case.
|
||||
# Note 8 = 128/block_size
|
||||
"num_gpu_blocks_override": 2 * (8 + 1),
|
||||
},
|
||||
{
|
||||
"block_size": 8,
|
||||
|
||||
# Allow only 2 sequences of ~128 tokens in worst case.
|
||||
# Note 16 = 128/block_size
|
||||
"num_gpu_blocks_override": 2 * (16 + 2),
|
||||
}
|
||||
])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{
|
||||
"num_lookahead_slots": 0,
|
||||
}])
|
||||
@pytest.mark.parametrize(
|
||||
"test_llm_kwargs",
|
||||
[
|
||||
{
|
||||
# We run one test with block_size < lookahead_slots, one test with
|
||||
# block_size > lookahead_slots
|
||||
"num_lookahead_slots": 10,
|
||||
"preemption_mode": "swap",
|
||||
},
|
||||
{
|
||||
"num_lookahead_slots": 10,
|
||||
"preemption_mode": "recompute",
|
||||
}
|
||||
])
|
||||
@pytest.mark.parametrize("batch_size", [4])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
|
||||
test_llm_generator,
|
||||
batch_size):
|
||||
"""Verify vLLM produces the same output with greedy sampling, when lookahead
|
||||
scheduling is used vs. not.
|
||||
|
||||
Lookahead scheduling is not expected to modify the output, as it simply
|
||||
allocates empty slots ahead of the known token ids in a sliding fashion.
|
||||
|
||||
This test constrains the total number of blocks to force preemption. It also
|
||||
varies the block size so that the lookahead size is less than and greater
|
||||
than the block size.
|
||||
"""
|
||||
output_len = 128
|
||||
temperature = 0.0
|
||||
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=output_len,
|
||||
ignore_eos=True,
|
||||
temperature=temperature,
|
||||
)
|
||||
|
||||
print('Getting token ids without lookahead scheduling')
|
||||
baseline_token_ids = get_token_ids_from_llm_generator(
|
||||
baseline_llm_generator, prompts, sampling_params)
|
||||
|
||||
print('Getting token ids with lookahead scheduling')
|
||||
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
|
||||
prompts, sampling_params)
|
||||
|
||||
for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
|
||||
test_token_ids):
|
||||
assert expected_token_ids == actual_token_ids
|
||||
|
||||
assert baseline_token_ids == test_token_ids
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"common_llm_kwargs",
|
||||
[
|
||||
{
|
||||
# Use a small model for a fast test.
|
||||
"model": "facebook/opt-125m",
|
||||
|
||||
# skip cuda graph creation for fast test.
|
||||
"enforce_eager": True,
|
||||
"enable_chunked_prefill": True,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs",
|
||||
[{
|
||||
"block_size": 16,
|
||||
"max_num_batched_tokens": 2,
|
||||
"max_num_seqs": 2,
|
||||
}, {
|
||||
"block_size": 16,
|
||||
"max_num_batched_tokens": 3,
|
||||
"max_num_seqs": 2,
|
||||
}, {
|
||||
"block_size": 16,
|
||||
"max_num_batched_tokens": 256,
|
||||
"max_num_seqs": 10,
|
||||
}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [
|
||||
{},
|
||||
])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"num_lookahead_slots": 0,
|
||||
},
|
||||
{
|
||||
"num_lookahead_slots": 5,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("batch_size", [4])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
def test_chunked_prefill_block_manager(baseline_llm_generator,
|
||||
test_llm_generator, batch_size):
|
||||
"""Verify that chunked prefill works with SelfAttnBlockSpaceManager,
|
||||
with and without lookahead scheduling.
|
||||
"""
|
||||
output_len = 32
|
||||
temperature = 0.0
|
||||
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
("1 + " * 50) + " 1 = ", # Longer prompt.
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=output_len,
|
||||
ignore_eos=True,
|
||||
temperature=temperature,
|
||||
)
|
||||
|
||||
print('Getting token ids with BlockManager')
|
||||
baseline_token_ids = get_token_ids_from_llm_generator(
|
||||
baseline_llm_generator, prompts, sampling_params)
|
||||
|
||||
print('Getting token ids with BlockManager, with lookahead slots.')
|
||||
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
|
||||
prompts, sampling_params)
|
||||
|
||||
for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
|
||||
test_token_ids):
|
||||
assert expected_token_ids == actual_token_ids
|
||||
|
||||
assert baseline_token_ids == test_token_ids
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"common_llm_kwargs",
|
||||
[{
|
||||
# Use a small model for a fast test.
|
||||
"model": "facebook/opt-125m",
|
||||
|
||||
# skip cuda graph creation for fast test.
|
||||
"enforce_eager": True,
|
||||
|
||||
# Allow only 5 sequences of ~1024 tokens in worst case.
|
||||
"block_size": 16,
|
||||
"num_gpu_blocks_override": 5 * (64 + 1),
|
||||
|
||||
# Enable prefill cache
|
||||
"enable_prefix_caching": True,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"preemption_mode": "swap"
|
||||
}, {
|
||||
"preemption_mode": "recompute"
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [10])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
def test_block_manager_prefix_caching_enabled_with_preemption(
|
||||
baseline_llm_generator, test_llm_generator, batch_size):
|
||||
"""Verify block manager produces same outputs even when there is preemption.
|
||||
|
||||
This constructs two LLM, each with limited number of GPU blocks. The limit
|
||||
is decided such that as the sequences in the batch grow, sequences must be
|
||||
preempted and removed from cache.
|
||||
|
||||
If the output token ids are equivalent, then we have confidence that the KV
|
||||
cache is not corrupted.
|
||||
|
||||
NOTE: We want a significant number of generated tokens so that any incorrect
|
||||
KV mapping has time to build up error.
|
||||
|
||||
NOTE(Kuntai): Though we have removed block manager v1, this test is still
|
||||
useful as it asserts the behavior of block manager v2 (now it is called
|
||||
SelfAttnBlockSpaceManager) is the same when swapping / preemption, so we
|
||||
keep this test.
|
||||
"""
|
||||
output_len = 1024
|
||||
temperature = 0.0
|
||||
|
||||
# We want to ensure equality even with preemption.
|
||||
# We force the total block size to be 1 + cdiv(output_len, block_size)
|
||||
# so that only one sequence can fit at a time (once the sequences grow).
|
||||
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=output_len,
|
||||
ignore_eos=True,
|
||||
temperature=temperature,
|
||||
)
|
||||
|
||||
print('Getting token ids from block manager')
|
||||
baseline_token_ids = get_token_ids_from_llm_generator(
|
||||
baseline_llm_generator, prompts, sampling_params)
|
||||
|
||||
print('Getting token ids from block manager, with preemption')
|
||||
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
|
||||
prompts, sampling_params)
|
||||
|
||||
for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
|
||||
test_token_ids):
|
||||
assert expected_token_ids == actual_token_ids
|
||||
|
||||
assert baseline_token_ids == test_token_ids
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"common_llm_kwargs",
|
||||
[{
|
||||
# Use a small model for a fast test.
|
||||
"model": "facebook/opt-125m",
|
||||
|
||||
# skip cuda graph creation for fast test.
|
||||
"enforce_eager": True,
|
||||
|
||||
# Allow only 5 sequences of ~1024 tokens in worst case.
|
||||
"block_size": 16,
|
||||
"num_gpu_blocks_override": 5 * (64 + 1),
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{
|
||||
"enable_prefix_caching": False
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"enable_prefix_caching": True,
|
||||
"preemption_mode": "swap"
|
||||
}, {
|
||||
"enable_prefix_caching": True,
|
||||
"preemption_mode": "recompute"
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [10])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
|
||||
test_llm_generator, batch_size):
|
||||
"""Verify block manager v2 with auto prefix caching enabled produces same
|
||||
outputs as auto prefix caching disabled, even when there is preemption.
|
||||
|
||||
This constructs two LLM, each with limited number of GPU blocks. The limit
|
||||
is decided such that as the sequences in the batch grow, sequences must be
|
||||
preempted and removed from cache.
|
||||
|
||||
If the output token ids are equivalent, then we have confidence that auto
|
||||
prefix caching itself at least don't cause result error.
|
||||
"""
|
||||
output_len = 1024
|
||||
temperature = 0.0
|
||||
|
||||
# We want to ensure equality even with preemption.
|
||||
# We force the total block size to be 1 + cdiv(output_len, block_size)
|
||||
# so that only one sequence can fit at a time (once the sequences grow).
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=output_len,
|
||||
ignore_eos=True,
|
||||
temperature=temperature,
|
||||
)
|
||||
|
||||
print('Getting token ids with APC disabled')
|
||||
baseline_token_ids = get_token_ids_from_llm_generator(
|
||||
baseline_llm_generator, prompts, sampling_params)
|
||||
|
||||
print('Getting token ids with APC enabled')
|
||||
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
|
||||
prompts, sampling_params)
|
||||
|
||||
for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
|
||||
test_token_ids):
|
||||
assert expected_token_ids == actual_token_ids
|
||||
|
||||
assert baseline_token_ids == test_token_ids
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"common_llm_kwargs",
|
||||
[{
|
||||
# Use a small model for a fast test.
|
||||
"model": "facebook/opt-125m",
|
||||
|
||||
# skip cuda graph creation for fast test.
|
||||
"enforce_eager": True,
|
||||
|
||||
# we keep the blocks small, so that hit eviction quickly
|
||||
"max_model_len": 48,
|
||||
"block_size": 16,
|
||||
"num_gpu_blocks_override": 3,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{
|
||||
"enable_prefix_caching": False
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"enable_prefix_caching": True,
|
||||
}])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
def test_auto_prefix_caching_after_eviction_start(baseline_llm_generator,
|
||||
test_llm_generator):
|
||||
"""Verify block manager v2 with auto prefix caching could work normally
|
||||
even when eviction started.
|
||||
With APC enabled, all blocks are held by native block at the beginning.
|
||||
Then blocks are managed by evictor instead. If cache hit at the evictor's
|
||||
block, then it could be reused, or we need to recompute its kv cache.
|
||||
"""
|
||||
output_len = 10
|
||||
temperature = 0.0
|
||||
|
||||
prompts = [
|
||||
"You are a helpful assistant. Please answer truthfully and write "
|
||||
"out your thinking step by step to be sure you get the right answer. "
|
||||
"If you make a mistake, attempt to correct it. who are you?",
|
||||
"You are a helpful assistant. Please answer truthfully and write out "
|
||||
"your thinking step by step to be sure you get the right answer. You "
|
||||
"are helpful and harmless and you follow ethical guidelines. "
|
||||
"who are you?"
|
||||
]
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=output_len,
|
||||
ignore_eos=True,
|
||||
temperature=temperature,
|
||||
)
|
||||
|
||||
print('Getting token ids with APC disabled')
|
||||
baseline_token_ids = get_token_ids_from_llm_generator(
|
||||
baseline_llm_generator, prompts, sampling_params)
|
||||
|
||||
print('Getting token ids with APC enabled')
|
||||
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
|
||||
prompts, sampling_params)
|
||||
|
||||
for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
|
||||
test_token_ids):
|
||||
assert expected_token_ids == actual_token_ids
|
||||
|
||||
assert baseline_token_ids == test_token_ids
|
||||
@ -1,185 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import random
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.kernels.utils import override_backend_env_variable
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from .conftest import get_text_from_llm_generator
|
||||
|
||||
# relatively small model with 4k sliding window
|
||||
MODEL = "bigcode/starcoder2-3b"
|
||||
BLOCK_SIZE = 16
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"common_llm_kwargs",
|
||||
[{
|
||||
"model": MODEL,
|
||||
|
||||
# skip cuda graph creation for fast test.
|
||||
"enforce_eager": True,
|
||||
"block_size": BLOCK_SIZE,
|
||||
# needed due to https://github.com/vllm-project/vllm/issues/1908#issuecomment-2101122008
|
||||
"num_gpu_blocks_override": 100000 // BLOCK_SIZE,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("batch_size", [5])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS"])
|
||||
def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator,
|
||||
batch_size, seed, backend, monkeypatch):
|
||||
"""
|
||||
The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
|
||||
asks for value of one of them (which is outside the sliding window).
|
||||
If we tell it upfront which we are going to be looking for, then
|
||||
it answers correctly (mostly).
|
||||
|
||||
Additionally, we compare the results of the v1 and v2 managers.
|
||||
"""
|
||||
if backend == "XFORMERS" and current_platform.is_rocm():
|
||||
pytest.skip("Xformers does not support ROCm/HIP.")
|
||||
|
||||
override_backend_env_variable(monkeypatch, backend)
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=1024,
|
||||
ignore_eos=True,
|
||||
temperature=0.0,
|
||||
)
|
||||
|
||||
prompts, answer, indices = prep_prompts(batch_size)
|
||||
|
||||
baseline_texts = get_text_from_llm_generator(baseline_llm_generator,
|
||||
prompts,
|
||||
sampling_params,
|
||||
llm_cb=check_window(prompts))
|
||||
|
||||
check_answers(indices, answer, baseline_texts)
|
||||
|
||||
print('Getting token ids from block manager v2')
|
||||
test_texts = get_text_from_llm_generator(test_llm_generator, prompts,
|
||||
sampling_params)
|
||||
check_answers(indices, answer, test_texts)
|
||||
|
||||
cmp = [
|
||||
expected_text == actual_text
|
||||
for expected_text, actual_text in zip(baseline_texts, test_texts)
|
||||
]
|
||||
print(cmp)
|
||||
# make sure it's mostly OK; this is possibly because https://github.com/vllm-project/vllm/pull/4768
|
||||
# however, https://github.com/vllm-project/vllm/issues/3385#issuecomment-1995924290
|
||||
# states that xformers and flash_attn have different ideas about the window
|
||||
# size anyways
|
||||
assert sum(cmp) > 0.7 * len(cmp)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"common_llm_kwargs",
|
||||
[{
|
||||
"model": MODEL,
|
||||
|
||||
# skip cuda graph creation for fast test.
|
||||
"enforce_eager": True,
|
||||
"block_size": BLOCK_SIZE,
|
||||
"num_gpu_blocks_override": 100000 // BLOCK_SIZE,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{"enable_chunked_prefill": True}])
|
||||
@pytest.mark.parametrize("batch_size", [5])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS"])
|
||||
def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed,
|
||||
backend, monkeypatch):
|
||||
"""
|
||||
This is similar to test_sliding_window_retrieval, however, it doesn't
|
||||
compare against the v1 block manager since v1 doesn't support
|
||||
chunked prefill with sliding window.
|
||||
|
||||
The results with and without chunked prefill are not the same due to
|
||||
numerical instabilities.
|
||||
"""
|
||||
if backend == "XFORMERS" and current_platform.is_rocm():
|
||||
pytest.skip("Xformers does not support ROCm/HIP.")
|
||||
override_backend_env_variable(monkeypatch, backend)
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=10,
|
||||
ignore_eos=True,
|
||||
temperature=0.0,
|
||||
)
|
||||
|
||||
prompts, answer, indices = prep_prompts(batch_size)
|
||||
|
||||
# We don't compare with the baseline model here, since the results
|
||||
# slightly different due to different tailing in attention.
|
||||
test_texts = get_text_from_llm_generator(test_llm_generator,
|
||||
prompts,
|
||||
sampling_params,
|
||||
llm_cb=check_window(prompts))
|
||||
check_answers(indices, answer, test_texts)
|
||||
|
||||
|
||||
def prep_prompts(batch_size: int, ln_range: tuple[int, int] = (800, 1100)):
|
||||
"""
|
||||
Generate prompts which a bunch of assignments,
|
||||
then asking for the value of one of them.
|
||||
The prompt is just under 10k tokens; sliding window is 4k
|
||||
so the answer is outside sliding window, but should still be correct.
|
||||
|
||||
Args:
|
||||
batch_size: number of prompts to generate
|
||||
ln_range: an argument to control the length of the prompt
|
||||
"""
|
||||
prompts: list[str] = []
|
||||
answer: list[int] = []
|
||||
indices: list[int] = []
|
||||
random.seed(1)
|
||||
for _ in range(batch_size):
|
||||
idx = random.randint(30, 90)
|
||||
indices.append(idx)
|
||||
prompt = "```python\n# We set a number of variables, " + \
|
||||
f"x{idx} will be important later\n"
|
||||
ln = random.randint(*ln_range)
|
||||
for k in range(30, ln):
|
||||
v = random.randint(10, 99)
|
||||
if k == idx:
|
||||
answer.append(v)
|
||||
prompt += f"x{k} = {v}\n"
|
||||
prompt += f"# Now, we check the value of x{idx}:\n"
|
||||
prompt += f"assert x{idx} == "
|
||||
prompts.append(prompt)
|
||||
return prompts, answer, indices
|
||||
|
||||
|
||||
def check_answers(indices: list[int],
|
||||
answer: list[int],
|
||||
outputs: list[str],
|
||||
accept_rate: float = 0.7):
|
||||
answer2 = [int(text[0:2].strip()) for text in outputs]
|
||||
print(list(zip(indices, zip(answer, answer2))))
|
||||
numok = 0
|
||||
for a1, a2 in zip(answer, answer2):
|
||||
if a1 == a2:
|
||||
numok += 1
|
||||
frac_ok = numok / len(answer)
|
||||
print(f"Num OK: {numok}/{len(answer)} {frac_ok}")
|
||||
assert frac_ok >= accept_rate
|
||||
|
||||
|
||||
def check_window(prompts: list[str]):
|
||||
|
||||
def inner(llm: LLM):
|
||||
sliding_window = llm.llm_engine.model_config.get_sliding_window()
|
||||
assert sliding_window and sliding_window > 0
|
||||
assert any(
|
||||
len(llm.get_tokenizer().tokenize(prompt)) > sliding_window
|
||||
for prompt in prompts)
|
||||
|
||||
return inner
|
||||
@ -1,341 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.core.block_manager import SelfAttnBlockSpaceManager
|
||||
from vllm.core.interfaces import AllocStatus
|
||||
from vllm.sequence import Logprob, SequenceStatus
|
||||
from vllm.utils import chunk_list
|
||||
|
||||
from ..utils import create_dummy_prompt, create_seq_group
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("num_gpu_blocks", [8, 40, 80])
|
||||
@pytest.mark.parametrize("num_seqs_per_group", [1, 4])
|
||||
@pytest.mark.parametrize("watermark", [0.0, 0.5])
|
||||
def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int,
|
||||
num_gpu_blocks: int, watermark: float):
|
||||
block_manager = SelfAttnBlockSpaceManager(
|
||||
block_size=block_size,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=1024,
|
||||
watermark=watermark,
|
||||
)
|
||||
num_watermark_blocks = int(watermark * num_gpu_blocks)
|
||||
|
||||
num_output_blocks_per_seq = 1
|
||||
|
||||
# NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but
|
||||
# the current implementation assumes all seqs are new prompts / don't have
|
||||
# different output lens.
|
||||
num_output_blocks = num_output_blocks_per_seq
|
||||
|
||||
for num_prompt_blocks in range(1, num_gpu_blocks - num_output_blocks):
|
||||
seq_group = create_seq_group(
|
||||
seq_prompt_len=block_size * num_prompt_blocks,
|
||||
seq_output_lens=[
|
||||
block_size * num_output_blocks_per_seq
|
||||
for _ in range(num_seqs_per_group)
|
||||
],
|
||||
)
|
||||
|
||||
assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
|
||||
|
||||
can_allocate_result = block_manager.can_allocate(seq_group)
|
||||
|
||||
num_required_blocks = num_prompt_blocks + num_output_blocks
|
||||
|
||||
if num_gpu_blocks - num_required_blocks < num_watermark_blocks:
|
||||
assert can_allocate_result == AllocStatus.NEVER
|
||||
elif num_gpu_blocks >= num_required_blocks:
|
||||
assert can_allocate_result == AllocStatus.OK
|
||||
else:
|
||||
assert can_allocate_result == AllocStatus.LATER
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [1, 8])
|
||||
@pytest.mark.parametrize("prompt_len", [1, 7, 8])
|
||||
@pytest.mark.parametrize("num_slots_to_append", [1, 8, 129])
|
||||
@pytest.mark.parametrize("num_lookahead_slots", [0, 10])
|
||||
def test_append_slots(block_size, prompt_len, num_slots_to_append,
|
||||
num_lookahead_slots):
|
||||
"""Verify append_slots consumes the correct number of blocks from the block
|
||||
table.
|
||||
"""
|
||||
|
||||
num_gpu_blocks = 1024
|
||||
watermark = 0.1
|
||||
block_manager = SelfAttnBlockSpaceManager(
|
||||
block_size=block_size,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=0,
|
||||
watermark=watermark,
|
||||
)
|
||||
|
||||
seq_group = create_seq_group(
|
||||
seq_prompt_len=prompt_len,
|
||||
seq_output_lens=[0],
|
||||
)
|
||||
|
||||
# Allocate seq
|
||||
assert block_manager.can_allocate(seq_group)
|
||||
block_manager.allocate(seq_group)
|
||||
|
||||
# Seq seq to RUNNING
|
||||
seq = seq_group.get_seqs()[0]
|
||||
seq.status = SequenceStatus.RUNNING
|
||||
|
||||
# Append tokens to the sequeqnce
|
||||
for token_id in range(num_slots_to_append):
|
||||
seq.append_token_id(token_id, {token_id: Logprob(0.0)})
|
||||
|
||||
# Append slots for new tokens and lookahead slots.
|
||||
free_blocks_before_append = block_manager.get_num_free_gpu_blocks()
|
||||
block_manager.append_slots(seq, num_lookahead_slots)
|
||||
num_consumed_blocks = (free_blocks_before_append -
|
||||
block_manager.get_num_free_gpu_blocks())
|
||||
|
||||
# Expect consumed blocks to be new blocks required to support the new slots.
|
||||
expected_consumed_blocks = len(
|
||||
list(
|
||||
chunk_list(
|
||||
list(
|
||||
range(prompt_len + num_slots_to_append +
|
||||
num_lookahead_slots)),
|
||||
block_size))) - len(
|
||||
list(chunk_list(list(range(prompt_len)), block_size)))
|
||||
assert num_consumed_blocks == expected_consumed_blocks
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [8])
|
||||
@pytest.mark.parametrize("num_cpu_blocks", [4])
|
||||
@pytest.mark.parametrize("num_gpu_blocks", [4])
|
||||
@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10])
|
||||
@pytest.mark.parametrize("enable_caching", [False, True])
|
||||
def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots,
|
||||
enable_caching):
|
||||
"""Verify blocks number on src/desc device is correct after swapping in/out
|
||||
sequence group (not missing or extra blocks).
|
||||
"""
|
||||
block_manager = SelfAttnBlockSpaceManager(block_size,
|
||||
num_cpu_blocks,
|
||||
num_gpu_blocks,
|
||||
watermark=0,
|
||||
enable_caching=enable_caching)
|
||||
prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1)
|
||||
prompt.status = SequenceStatus.WAITING
|
||||
block_manager.allocate(seq_group)
|
||||
|
||||
# Emulate a forward pass by appending a single token.
|
||||
# The block manager then knows how many unprocessed
|
||||
# tokens will be written in the next forward pass.
|
||||
token_id = 0
|
||||
prompt.status = SequenceStatus.RUNNING
|
||||
prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
|
||||
|
||||
# Swap seq group from GPU -> CPU.
|
||||
gpu_blocks = block_manager.get_block_table(prompt)
|
||||
assert block_manager.can_swap_out(seq_group)
|
||||
before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
|
||||
before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
|
||||
mapping = block_manager.swap_out(seq_group)
|
||||
mapping_keys = [key for key, _ in mapping]
|
||||
assert mapping_keys == gpu_blocks
|
||||
after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
|
||||
after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
|
||||
assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
|
||||
assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
|
||||
prompt.status = SequenceStatus.SWAPPED
|
||||
|
||||
# Swap seq group from CPU -> GPU.
|
||||
assert block_manager.can_swap_in(seq_group, num_lookahead_slots)
|
||||
before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
|
||||
before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
|
||||
mapping = block_manager.swap_in(seq_group)
|
||||
cpu_blocks = block_manager.get_block_table(prompt)
|
||||
mapping_keys = [key for key, _ in mapping]
|
||||
assert mapping_keys == [cpu_blocks[0]]
|
||||
after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
|
||||
after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
|
||||
assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [8])
|
||||
@pytest.mark.parametrize("num_gpu_blocks", [4])
|
||||
@pytest.mark.parametrize("num_lookahead_slots", [3, 8, 10])
|
||||
@pytest.mark.parametrize("enable_caching", [True, False])
|
||||
def test_can_swap(block_size, num_gpu_blocks, num_lookahead_slots,
|
||||
enable_caching):
|
||||
""" Verify the block manager can correctly determine if a sequence group
|
||||
can be swapped in/out.
|
||||
"""
|
||||
num_cpu_blocks = num_gpu_blocks
|
||||
block_manager = SelfAttnBlockSpaceManager(block_size,
|
||||
num_cpu_blocks,
|
||||
num_gpu_blocks,
|
||||
watermark=0,
|
||||
enable_caching=enable_caching)
|
||||
prompt, seq_group = create_dummy_prompt(
|
||||
"1", prompt_length=(num_gpu_blocks - 1) * block_size - 1)
|
||||
prompt.status = SequenceStatus.WAITING
|
||||
block_manager.allocate(seq_group)
|
||||
prompt.status = SequenceStatus.RUNNING
|
||||
|
||||
# Swap seq group from GPU -> CPU.
|
||||
gpu_blocks = block_manager.get_block_table(prompt)
|
||||
assert block_manager.can_swap_out(seq_group)
|
||||
before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
|
||||
before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
|
||||
mapping = block_manager.swap_out(seq_group)
|
||||
mapping_keys = [key for key, _ in mapping]
|
||||
assert mapping_keys == gpu_blocks
|
||||
after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
|
||||
after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
|
||||
assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
|
||||
assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
|
||||
prompt.status = SequenceStatus.SWAPPED
|
||||
|
||||
# At this moment, we still have enough free blocks to swap in the seq group.
|
||||
if num_lookahead_slots <= block_size:
|
||||
assert block_manager.can_swap_in(seq_group,
|
||||
num_lookahead_slots) == AllocStatus.OK
|
||||
else:
|
||||
assert block_manager.can_swap_in(
|
||||
seq_group, num_lookahead_slots) == AllocStatus.NEVER
|
||||
|
||||
# During Swapped out, 2 cached blocks were evicted from the GPU,
|
||||
# so the prompt1 can't be swapped in
|
||||
prompt2_len = 2 * block_size - 1
|
||||
prompt2, seq_group2 = create_dummy_prompt(
|
||||
"2",
|
||||
prompt_length=prompt2_len,
|
||||
prompt_tokens=[10000 + i for i in range(prompt2_len)])
|
||||
prompt2.status = SequenceStatus.WAITING
|
||||
block_manager.allocate(seq_group2)
|
||||
|
||||
# Swap seq group from CPU -> GPU.
|
||||
if num_lookahead_slots <= block_size:
|
||||
assert block_manager.can_swap_in(
|
||||
seq_group, num_lookahead_slots) == AllocStatus.LATER
|
||||
else:
|
||||
assert block_manager.can_swap_in(
|
||||
seq_group, num_lookahead_slots) == AllocStatus.NEVER
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10])
|
||||
@pytest.mark.parametrize("enable_caching", [False, True])
|
||||
def test_swap_in_infeasible(num_lookahead_slots, enable_caching):
|
||||
"""Verifies that swapping fails if there is not enough free blocks
|
||||
to account for unseen tokens and lookahead_slots.
|
||||
"""
|
||||
block_size = 8
|
||||
num_cpu_blocks = 1
|
||||
num_gpu_blocks = 1
|
||||
block_manager = SelfAttnBlockSpaceManager(block_size,
|
||||
num_cpu_blocks,
|
||||
num_gpu_blocks,
|
||||
watermark=0,
|
||||
enable_caching=enable_caching)
|
||||
prompt_length = block_size - 3
|
||||
assert prompt_length > 0
|
||||
prompt, seq_group = create_dummy_prompt("1", prompt_length=prompt_length)
|
||||
prompt.status = SequenceStatus.WAITING
|
||||
block_manager.allocate(seq_group)
|
||||
# Emulate a forward pass by appending a single token.
|
||||
# The block manager then knows how many unprocessed
|
||||
# tokens will be written in the next forward pass.
|
||||
token_id = 0
|
||||
prompt.status = SequenceStatus.RUNNING
|
||||
prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
|
||||
|
||||
# Swap seq group from GPU -> CPU.
|
||||
assert block_manager.can_swap_out(seq_group)
|
||||
block_manager.swap_out(seq_group)
|
||||
prompt.status = SequenceStatus.SWAPPED
|
||||
|
||||
# Swap seq group from CPU -> GPU.
|
||||
# The number of unseen tokens is 1. If the number of existing
|
||||
# tokens plus the unseen ones and number of lookahead slots exceeds
|
||||
# the total number of available GPU blocks then the swap
|
||||
# should fail.
|
||||
num_unseen_tokens = 1
|
||||
if (num_lookahead_slots + num_unseen_tokens +
|
||||
prompt_length) <= (block_size * num_gpu_blocks):
|
||||
assert block_manager.can_swap_in(seq_group,
|
||||
num_lookahead_slots) == AllocStatus.OK
|
||||
else:
|
||||
assert block_manager.can_swap_in(
|
||||
seq_group, num_lookahead_slots) == AllocStatus.NEVER
|
||||
|
||||
|
||||
# TODO(cade/kaiyang): add comprehensive tests for swapping at allocator level.
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [8, 16])
|
||||
@pytest.mark.parametrize("prompt_len", [10, 300, 1000])
|
||||
@pytest.mark.parametrize("num_slots_to_append", [50])
|
||||
@pytest.mark.parametrize("sliding_window", [20, 32, 200, 512])
|
||||
def test_sliding_window(block_size, prompt_len, num_slots_to_append,
|
||||
sliding_window):
|
||||
"""Verify append_slots consumes the correct number of blocks from the block
|
||||
table.
|
||||
"""
|
||||
|
||||
num_gpu_blocks = 1024
|
||||
watermark = 0.1
|
||||
block_manager = SelfAttnBlockSpaceManager(
|
||||
block_size=block_size,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=0,
|
||||
watermark=watermark,
|
||||
sliding_window=sliding_window,
|
||||
)
|
||||
|
||||
def check_used(min_n, max_n=None):
|
||||
if max_n is None:
|
||||
max_n = min_n
|
||||
used = num_gpu_blocks - block_manager.get_num_free_gpu_blocks()
|
||||
assert min_n <= used
|
||||
assert used <= max_n
|
||||
|
||||
def num_blocks(num_tokens):
|
||||
return (num_tokens + block_size - 1) // block_size
|
||||
|
||||
check_used(0)
|
||||
|
||||
seq_group = create_seq_group(
|
||||
seq_prompt_len=prompt_len,
|
||||
seq_output_lens=[0],
|
||||
)
|
||||
|
||||
check_used(0)
|
||||
|
||||
# Allocate seq
|
||||
assert block_manager.can_allocate(seq_group)
|
||||
block_manager.allocate(seq_group)
|
||||
|
||||
check_used(num_blocks(prompt_len))
|
||||
|
||||
# Seq seq to RUNNING
|
||||
seq = seq_group.get_seqs()[0]
|
||||
seq.status = SequenceStatus.RUNNING
|
||||
|
||||
seq.data.update_num_computed_tokens(prompt_len)
|
||||
check_used(num_blocks(prompt_len))
|
||||
|
||||
# this is how we compute it in SelfAttnBlockSpaceManager.__init__
|
||||
sliding_blocks = (sliding_window // block_size) + 2
|
||||
# plus one block for null block
|
||||
sliding_blocks += 1
|
||||
|
||||
# Append tokens to the sequeqnce
|
||||
for token_id in range(num_slots_to_append):
|
||||
seq.append_token_id(token_id, {token_id: Logprob(0.0)})
|
||||
seq.data.update_num_computed_tokens(1)
|
||||
block_manager.append_slots(seq, num_lookahead_slots=0)
|
||||
if prompt_len < sliding_window + 10:
|
||||
check_used(0, sliding_blocks + 1)
|
||||
else:
|
||||
check_used(sliding_blocks, sliding_blocks + 1)
|
||||
@ -1,577 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.core.block.block_table import BlockTable
|
||||
from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
|
||||
from vllm.utils import Device, cdiv, chunk_list
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
|
||||
def test_allocate_naive(block_size: int, sequence_len: int):
|
||||
"""Test the allocation of blocks using the naive allocator.
|
||||
|
||||
This test creates a CpuGpuBlockAllocator with the specified block size and
|
||||
number of blocks. It then allocates multiple BlockTables with varying
|
||||
sequence lengths and verifies that the number of free blocks decreases as
|
||||
expected after each allocation.
|
||||
"""
|
||||
assert block_size > 1
|
||||
num_gpu_blocks = 1024
|
||||
|
||||
allocator = CpuGpuBlockAllocator.create(
|
||||
allocator_type="naive",
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=1024,
|
||||
block_size=block_size,
|
||||
)
|
||||
|
||||
token_ids = list(range(sequence_len))
|
||||
num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))
|
||||
|
||||
block_tables: list[BlockTable] = []
|
||||
for i in range(5):
|
||||
assert allocator.get_num_free_blocks(
|
||||
device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc
|
||||
|
||||
block_tables.append(
|
||||
BlockTable(
|
||||
block_size=block_size,
|
||||
block_allocator=allocator,
|
||||
))
|
||||
block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
|
||||
def test_allocate_prefix_caching(block_size: int, sequence_len: int):
|
||||
"""Test the allocation of blocks using the prefix caching allocator.
|
||||
|
||||
This test creates a CpuGpuBlockAllocator with the specified block size and
|
||||
number of blocks, using the prefix caching allocator. It then allocates
|
||||
multiple BlockTables with varying sequence lengths and verifies that the
|
||||
number of free blocks decreases as expected after each allocation.
|
||||
|
||||
The test expects all sequences to share allocations, except for their last
|
||||
block, which may be mutable. It calculates the expected number of immutable
|
||||
and mutable blocks per allocation based on the sequence length and block
|
||||
size.
|
||||
"""
|
||||
assert block_size > 1
|
||||
num_gpu_blocks = 1024
|
||||
|
||||
allocator = CpuGpuBlockAllocator.create(
|
||||
allocator_type="prefix_caching",
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=1024,
|
||||
block_size=block_size,
|
||||
)
|
||||
|
||||
token_ids = list(range(sequence_len))
|
||||
chunked_tokens = list(chunk_list(token_ids, block_size))
|
||||
num_mutable_blocks_per_alloc = 0 if len(
|
||||
chunked_tokens[-1]) == block_size else 1
|
||||
num_immutable_blocks_per_alloc = len(
|
||||
chunked_tokens) - num_mutable_blocks_per_alloc
|
||||
|
||||
block_tables: list[BlockTable] = []
|
||||
for alloc_i in range(1, 6):
|
||||
|
||||
block_tables.append(
|
||||
BlockTable(
|
||||
block_size=block_size,
|
||||
block_allocator=allocator,
|
||||
))
|
||||
block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU)
|
||||
|
||||
# Expect all sequences to share allocations, except for their last block
|
||||
# (which may be mutable).
|
||||
assert allocator.get_num_free_blocks(
|
||||
device=Device.GPU) == num_gpu_blocks - (
|
||||
num_immutable_blocks_per_alloc + num_mutable_blocks_per_alloc *
|
||||
(alloc_i))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
|
||||
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
|
||||
@pytest.mark.parametrize("device", ["cpu", "gpu"])
|
||||
def test_allocate_free(block_size: int, sequence_len: int, allocator_type: str,
|
||||
device: str):
|
||||
"""Test the allocation and freeing of blocks using different allocators and
|
||||
devices.
|
||||
|
||||
This test creates a CpuGpuBlockAllocator with the specified block size,
|
||||
number of blocks, allocator type, and device. It then allocates a BlockTable
|
||||
multiple times with the same sequence and verifies that the number of free
|
||||
blocks remains consistent after each allocation and freeing.
|
||||
"""
|
||||
device = Device[device.upper()]
|
||||
|
||||
num_device_blocks = 1024
|
||||
allocator = CpuGpuBlockAllocator.create(
|
||||
allocator_type=allocator_type,
|
||||
num_gpu_blocks=num_device_blocks,
|
||||
num_cpu_blocks=num_device_blocks,
|
||||
block_size=block_size,
|
||||
)
|
||||
|
||||
token_ids = list(range(sequence_len))
|
||||
num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))
|
||||
|
||||
block_table = BlockTable(
|
||||
block_size=block_size,
|
||||
block_allocator=allocator,
|
||||
)
|
||||
|
||||
for i in range(5):
|
||||
block_table.allocate(token_ids=token_ids, device=device)
|
||||
assert allocator.get_num_free_blocks(
|
||||
device) == num_device_blocks - num_blocks_per_alloc
|
||||
assert all(block_id is not None
|
||||
for block_id in block_table.physical_block_ids)
|
||||
|
||||
block_table.free()
|
||||
assert allocator.get_num_free_blocks(device) == num_device_blocks
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [1, 8])
|
||||
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
|
||||
@pytest.mark.parametrize("append_len", [1, 16, 129])
|
||||
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
|
||||
def test_append_token_ids_allocation(block_size: int, sequence_len: int,
|
||||
append_len: int, allocator_type: str):
|
||||
"""Test the allocation behavior when appending token IDs to a BlockTable.
|
||||
|
||||
This test creates a CpuGpuBlockAllocator with the specified block size,
|
||||
number of blocks, and allocator type. It then allocates a BlockTable with an
|
||||
initial sequence and appends additional token IDs to it. The test verifies
|
||||
that the number of allocated blocks before and after appending matches the
|
||||
expected values.
|
||||
"""
|
||||
|
||||
num_gpu_blocks = 1024
|
||||
|
||||
allocator = CpuGpuBlockAllocator.create(
|
||||
allocator_type=allocator_type,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=1024,
|
||||
block_size=block_size,
|
||||
)
|
||||
|
||||
token_ids = list(range(sequence_len))
|
||||
token_ids_to_append = list(range(append_len))
|
||||
|
||||
block_table = BlockTable(
|
||||
block_size=block_size,
|
||||
block_allocator=allocator,
|
||||
)
|
||||
|
||||
num_expected_blocks_before_append = len(
|
||||
list(chunk_list(token_ids, block_size)))
|
||||
num_expected_appended_blocks = len(
|
||||
list(chunk_list(token_ids + token_ids_to_append,
|
||||
block_size))) - num_expected_blocks_before_append
|
||||
|
||||
block_table.allocate(token_ids=token_ids, device=Device.GPU)
|
||||
|
||||
assert len(
|
||||
block_table.physical_block_ids) == num_expected_blocks_before_append
|
||||
block_table.append_token_ids(token_ids_to_append)
|
||||
assert len(
|
||||
block_table.physical_block_ids
|
||||
) == num_expected_blocks_before_append + num_expected_appended_blocks
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [1, 8])
|
||||
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
|
||||
@pytest.mark.parametrize("num_empty_slots", [1, 16, 129])
|
||||
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
|
||||
def test_ensure_num_empty_slots_allocation(block_size: int, sequence_len: int,
|
||||
num_empty_slots: int,
|
||||
allocator_type: str):
|
||||
"""Test the allocation behavior when ensuring a certain number of empty
|
||||
slots in a BlockTable.
|
||||
|
||||
This test creates a CpuGpuBlockAllocator with the specified block size,
|
||||
number of blocks, and allocator type. It then allocates a BlockTable with an
|
||||
initial sequence and ensures a certain number of empty slots. The test
|
||||
verifies that the number of allocated blocks before and after ensuring empty
|
||||
slots matches the expected values. It also checks that filling up the empty
|
||||
slots does not consume additional blocks.
|
||||
"""
|
||||
num_gpu_blocks = 1024
|
||||
|
||||
allocator = CpuGpuBlockAllocator.create(
|
||||
allocator_type=allocator_type,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=1024,
|
||||
block_size=block_size,
|
||||
)
|
||||
|
||||
token_ids = list(range(sequence_len))
|
||||
|
||||
block_table = BlockTable(
|
||||
block_size=block_size,
|
||||
block_allocator=allocator,
|
||||
)
|
||||
|
||||
num_expected_blocks_before_append = len(
|
||||
list(chunk_list(token_ids, block_size)))
|
||||
num_expected_appended_blocks = len(
|
||||
list(chunk_list(token_ids + [-1] * num_empty_slots,
|
||||
block_size))) - num_expected_blocks_before_append
|
||||
|
||||
block_table.allocate(token_ids=token_ids, device=Device.GPU)
|
||||
|
||||
# Assert that the empty slots consume the expected number of additional
|
||||
# blocks.
|
||||
assert len(
|
||||
block_table.physical_block_ids) == num_expected_blocks_before_append
|
||||
block_table.ensure_num_empty_slots(num_empty_slots)
|
||||
assert len(
|
||||
block_table.physical_block_ids
|
||||
) == num_expected_blocks_before_append + num_expected_appended_blocks
|
||||
|
||||
# Now, ensure no additional blocks consumed as we fill up the empty slots.
|
||||
num_free_blocks = allocator.get_num_free_blocks(device=Device.GPU)
|
||||
block_table.append_token_ids(token_ids=list(range(num_empty_slots)))
|
||||
assert num_free_blocks == allocator.get_num_free_blocks(device=Device.GPU)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [1, 8])
|
||||
@pytest.mark.parametrize("sequence_len", [1, 9])
|
||||
@pytest.mark.parametrize("append_len", [1, 16, 129])
|
||||
@pytest.mark.parametrize("append_size", [1, 4, 129])
|
||||
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
|
||||
def test_append_token_ids_correct_content(block_size: int, sequence_len: int,
|
||||
append_len: int, allocator_type: str,
|
||||
append_size: int):
|
||||
"""Verify token ids are correctly appended. Appends various amounts of
|
||||
token ids in various append sizes, and verifies the final sequence is
|
||||
correct.
|
||||
"""
|
||||
num_gpu_blocks = 1024
|
||||
|
||||
allocator = CpuGpuBlockAllocator.create(
|
||||
allocator_type=allocator_type,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=1024,
|
||||
block_size=block_size,
|
||||
)
|
||||
|
||||
token_ids = list(range(sequence_len))
|
||||
token_ids_to_append = list(range(append_len))
|
||||
|
||||
block_table = BlockTable(
|
||||
block_size=block_size,
|
||||
block_allocator=allocator,
|
||||
)
|
||||
block_table.allocate(token_ids=token_ids, device=Device.GPU)
|
||||
|
||||
appended_so_far: list[int] = []
|
||||
for append in chunk_list(token_ids_to_append, append_size):
|
||||
block_table.append_token_ids(append)
|
||||
appended_so_far.extend(append)
|
||||
|
||||
assert block_table._get_all_token_ids() == token_ids + appended_so_far
|
||||
|
||||
assert block_table._get_all_token_ids() == token_ids + token_ids_to_append
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seq_len", [1, 9, 129])
|
||||
@pytest.mark.parametrize("block_size", [1, 8])
|
||||
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
|
||||
def test_fork(seq_len: int, block_size: int, allocator_type: str):
|
||||
"""Create a sequence using the specified allocator.
|
||||
1. Assert that after forking the sequence, the free block count is the
|
||||
same.
|
||||
2. Assert that the forked sequence has the same physical mappings.
|
||||
3. Then free the original sequence; verify that the free block count is
|
||||
the same.
|
||||
4. Finally, free the forked sequence and verify that the free block
|
||||
count drops to zero.
|
||||
"""
|
||||
num_gpu_blocks = 1024
|
||||
|
||||
allocator = CpuGpuBlockAllocator.create(
|
||||
allocator_type=allocator_type,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=0,
|
||||
block_size=block_size,
|
||||
)
|
||||
|
||||
token_ids = list(range(seq_len))
|
||||
|
||||
block_table = BlockTable(
|
||||
block_size=block_size,
|
||||
block_allocator=allocator,
|
||||
)
|
||||
|
||||
block_table.allocate(token_ids)
|
||||
|
||||
num_free_blocks_before_fork = allocator.get_num_free_blocks(
|
||||
device=Device.GPU)
|
||||
|
||||
forked_block_table = block_table.fork()
|
||||
|
||||
# Expect physical_block_ids and token_ids to match.
|
||||
assert (block_table.physical_block_ids ==
|
||||
forked_block_table.physical_block_ids)
|
||||
assert block_table._get_all_token_ids(
|
||||
) == forked_block_table._get_all_token_ids()
|
||||
|
||||
# Do not expect any additional allocations.
|
||||
assert allocator.get_num_free_blocks(
|
||||
device=Device.GPU) == num_free_blocks_before_fork
|
||||
|
||||
# Free the original blocks. Assert num free blocks does not change, since
|
||||
# refcount is nonzero.
|
||||
block_table.free()
|
||||
assert allocator.get_num_free_blocks(
|
||||
device=Device.GPU) == num_free_blocks_before_fork
|
||||
|
||||
# Expect the forked block table to be unaffected by the free.
|
||||
assert all(block_id is not None
|
||||
for block_id in forked_block_table.physical_block_ids)
|
||||
|
||||
# Free the forked blocks. Assert num free blocks does change, since
|
||||
# refcount is now zero.
|
||||
forked_block_table.free()
|
||||
assert allocator.get_num_free_blocks(device=Device.GPU) == num_gpu_blocks
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [8])
|
||||
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
|
||||
@pytest.mark.parametrize("append_len", [1, 16, 129])
|
||||
@pytest.mark.parametrize("appender", ["forked", "original"])
|
||||
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
|
||||
def test_cow(block_size: int, sequence_len: int, append_len: int,
|
||||
allocator_type: str, appender: str):
|
||||
"""Fork a sequence; append to the forked sequence; verify there's a CoW.
|
||||
"""
|
||||
num_gpu_blocks = 1024
|
||||
|
||||
allocator = CpuGpuBlockAllocator.create(
|
||||
allocator_type=allocator_type,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=0,
|
||||
block_size=block_size,
|
||||
)
|
||||
|
||||
token_ids = list(range(sequence_len))
|
||||
token_ids_to_append = list(range(append_len))
|
||||
|
||||
original_block_table = BlockTable(
|
||||
block_size=block_size,
|
||||
block_allocator=allocator,
|
||||
)
|
||||
|
||||
num_expected_non_cow_blocks = cdiv(sequence_len, block_size)
|
||||
num_expected_cow_blocks = cdiv(sequence_len + append_len,
|
||||
block_size) - (sequence_len // block_size)
|
||||
|
||||
original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
|
||||
original_block_ids = original_block_table.physical_block_ids[:]
|
||||
|
||||
print("original_block_ids = {}".format(original_block_ids))
|
||||
forked_block_table = original_block_table.fork()
|
||||
|
||||
# Expect no additional allocation (copy on _write_).
|
||||
assert allocator.get_num_free_blocks(
|
||||
Device.GPU) == (num_gpu_blocks - num_expected_non_cow_blocks)
|
||||
|
||||
if appender == "forked":
|
||||
appender_block_table = forked_block_table
|
||||
static_block_table = original_block_table
|
||||
elif appender == "original":
|
||||
appender_block_table = original_block_table
|
||||
static_block_table = forked_block_table
|
||||
else:
|
||||
raise ValueError(f"unknown test config {appender=}")
|
||||
|
||||
# Write tokens.
|
||||
appender_block_table.append_token_ids(token_ids_to_append)
|
||||
|
||||
# Expect the non-appending block table to have no change.
|
||||
assert static_block_table.physical_block_ids == original_block_ids
|
||||
assert appender_block_table.physical_block_ids != original_block_ids
|
||||
|
||||
# Expect the blocks changed during append to have a CoW.
|
||||
assert allocator.get_num_free_blocks(
|
||||
Device.GPU) == num_gpu_blocks - (num_expected_non_cow_blocks +
|
||||
num_expected_cow_blocks)
|
||||
|
||||
cows = allocator.clear_copy_on_writes()
|
||||
if sequence_len % block_size > 0:
|
||||
# If the last block in the sequence is not full, then when appending we
|
||||
# expect a CoW.
|
||||
assert cows
|
||||
|
||||
cow_block_id = sequence_len // block_size
|
||||
expected_src = static_block_table.physical_block_ids[cow_block_id]
|
||||
expected_dst = appender_block_table.physical_block_ids[cow_block_id]
|
||||
|
||||
assert (expected_src, expected_dst) in cows
|
||||
else:
|
||||
# Otherwise, there should be no copy-on-write.
|
||||
assert not cows
|
||||
|
||||
static_block_table.free()
|
||||
appender_block_table.free()
|
||||
|
||||
# After free, expect all blocks to be freed.
|
||||
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [8])
|
||||
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
|
||||
@pytest.mark.parametrize("append_len", [1, 16, 129])
|
||||
@pytest.mark.parametrize("lookahead_slots", [1, 16, 129])
|
||||
@pytest.mark.parametrize("appender", ["forked", "original"])
|
||||
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
|
||||
def test_cow_lookahead_simple(block_size: int, sequence_len: int,
|
||||
append_len: int, lookahead_slots: int,
|
||||
allocator_type: str, appender: str):
|
||||
"""Similar to test_cow, except with lookahead allocation. The assertions are
|
||||
less rigorous due to the complexity of the property under test.
|
||||
"""
|
||||
num_gpu_blocks = 1024
|
||||
|
||||
allocator = CpuGpuBlockAllocator.create(
|
||||
allocator_type=allocator_type,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=0,
|
||||
block_size=block_size,
|
||||
)
|
||||
|
||||
token_ids = list(range(sequence_len))
|
||||
token_ids_to_append = list(range(append_len))
|
||||
|
||||
original_block_table = BlockTable(
|
||||
block_size=block_size,
|
||||
block_allocator=allocator,
|
||||
)
|
||||
|
||||
original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
|
||||
|
||||
# Allocate lookahead slots.
|
||||
original_block_table.ensure_num_empty_slots(lookahead_slots)
|
||||
original_block_ids = original_block_table.physical_block_ids[:]
|
||||
|
||||
forked_block_table = original_block_table.fork()
|
||||
|
||||
if appender == "forked":
|
||||
appender_block_table = forked_block_table
|
||||
static_block_table = original_block_table
|
||||
elif appender == "original":
|
||||
appender_block_table = original_block_table
|
||||
static_block_table = forked_block_table
|
||||
else:
|
||||
raise ValueError(f"unknown test config {appender=}")
|
||||
|
||||
# Write tokens.
|
||||
appender_block_table.append_token_ids(token_ids_to_append)
|
||||
|
||||
# Expect the non-appending block table to have no change.
|
||||
assert static_block_table.physical_block_ids == original_block_ids
|
||||
assert appender_block_table.physical_block_ids != original_block_ids
|
||||
|
||||
cows = allocator.clear_copy_on_writes()
|
||||
|
||||
# Always expect copy-on-write
|
||||
assert cows
|
||||
|
||||
if sequence_len % block_size > 0:
|
||||
# If the last block in the sequence is not full, then when appending we
|
||||
# expect a CoW.
|
||||
assert cows
|
||||
|
||||
cow_block_id = sequence_len // block_size
|
||||
expected_src = static_block_table.physical_block_ids[cow_block_id]
|
||||
expected_dst = appender_block_table.physical_block_ids[cow_block_id]
|
||||
|
||||
assert (expected_src, expected_dst) in cows
|
||||
|
||||
static_block_table.free()
|
||||
appender_block_table.free()
|
||||
|
||||
# After free, expect all blocks to be freed.
|
||||
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [1, 8])
|
||||
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
|
||||
@pytest.mark.parametrize("num_new_tokens", [1, 16, 129])
|
||||
@pytest.mark.parametrize("num_lookahead_slots", [1, 7, 8])
|
||||
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
|
||||
def test_num_blocks_touched_by_append_slots(block_size: int, sequence_len: int,
|
||||
num_new_tokens: int,
|
||||
num_lookahead_slots: int,
|
||||
allocator_type: str):
|
||||
"""Verify correct calculation of get_num_blocks_touched_by_append_slots.
|
||||
|
||||
This is done by using copy-on-write, which requires any modified block to
|
||||
be copied before write if the refcount > 1. We set the refcount>1 by forking
|
||||
a sequence, then measure the free blocks before and after an append. If the
|
||||
number of consumed blocks equals what `get_num_blocks_touched_by_append_
|
||||
slots` returns, then the calculation is correct.
|
||||
"""
|
||||
|
||||
num_gpu_blocks = 1024
|
||||
|
||||
allocator = CpuGpuBlockAllocator.create(
|
||||
allocator_type=allocator_type,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=0,
|
||||
block_size=block_size,
|
||||
)
|
||||
|
||||
token_ids = list(range(sequence_len))
|
||||
token_ids_to_append = list(range(num_new_tokens))
|
||||
|
||||
block_table = BlockTable(
|
||||
block_size=block_size,
|
||||
block_allocator=allocator,
|
||||
)
|
||||
|
||||
block_table.allocate(token_ids=token_ids, device=Device.GPU)
|
||||
|
||||
# Add lookahead before fork so both sequences have the same lookahead
|
||||
# blocks.
|
||||
block_table.ensure_num_empty_slots(num_empty_slots=num_lookahead_slots)
|
||||
|
||||
# Fork sequence so that every block has refcount > 1.
|
||||
_ = block_table.fork()
|
||||
|
||||
# Determine how many blocks should be touched.
|
||||
expected_num_touched_blocks = (
|
||||
block_table.get_num_blocks_touched_by_append_slots(
|
||||
token_ids=token_ids_to_append,
|
||||
num_lookahead_slots=num_lookahead_slots))
|
||||
|
||||
# Measure how many blocks are touched by measuring num_free_blocks before
|
||||
# and after the append.
|
||||
#
|
||||
# We expect append_token_ids to CoW all mutated blocks that have refcount>1.
|
||||
num_free_blocks_before_append = allocator.get_num_free_blocks(Device.GPU)
|
||||
block_table.append_token_ids(token_ids_to_append, num_lookahead_slots)
|
||||
num_consumed_blocks = (num_free_blocks_before_append -
|
||||
allocator.get_num_free_blocks(Device.GPU))
|
||||
|
||||
# TODO(cade) ensure equality when num_lookahead_slots > 0.
|
||||
# The reason we have < is because lookahead blocks are not copied eagerly;
|
||||
# they are copied on first write. This will cause issues for beam search +
|
||||
# speculative decoding. This is acceptable for now as it is a large effort
|
||||
# to combine the two. To fix this, we can ensure single sequence ownership
|
||||
# of lookahead blocks by appending empty slots to each block, which will
|
||||
# trigger the CoW.
|
||||
#
|
||||
# Until then, we can accept that the consumed tokens are <= the expected
|
||||
# tokens when appending with lookahead.
|
||||
if num_lookahead_slots > 0:
|
||||
assert num_consumed_blocks <= expected_num_touched_blocks
|
||||
else:
|
||||
assert num_consumed_blocks == expected_num_touched_blocks
|
||||
@ -1,45 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import random
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.core.block.common import RefCounter
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seed", list(range(20)))
|
||||
@pytest.mark.parametrize("num_incrs", [1, 100])
|
||||
@pytest.mark.parametrize("num_blocks", [1024])
|
||||
def test_incr(seed: int, num_incrs: int, num_blocks: int):
|
||||
random.seed(seed)
|
||||
|
||||
all_block_indices = list(range(num_blocks))
|
||||
counter = RefCounter(all_block_indices=all_block_indices)
|
||||
|
||||
block_id = random.randint(0, num_blocks - 1)
|
||||
for i in range(num_incrs):
|
||||
value = counter.incr(block_id)
|
||||
assert value == i + 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seed", list(range(20)))
|
||||
@pytest.mark.parametrize("num_incrs", [1, 100])
|
||||
@pytest.mark.parametrize("num_blocks", [1024])
|
||||
def test_incr_decr(seed: int, num_incrs: int, num_blocks: int):
|
||||
random.seed(seed)
|
||||
|
||||
all_block_indices = list(range(num_blocks))
|
||||
counter = RefCounter(all_block_indices=all_block_indices)
|
||||
|
||||
block_id = random.randint(0, num_blocks - 1)
|
||||
for i in range(num_incrs):
|
||||
value = counter.incr(block_id)
|
||||
assert value == i + 1
|
||||
|
||||
for i in range(num_incrs):
|
||||
value = counter.decr(block_id)
|
||||
assert value == num_incrs - (i + 1)
|
||||
|
||||
with pytest.raises(AssertionError):
|
||||
counter.decr(block_id)
|
||||
@ -1,96 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
|
||||
from vllm.utils import Device, chunk_list
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_cpu_blocks", [0, 512])
|
||||
@pytest.mark.parametrize("num_gpu_blocks", [1024])
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
|
||||
def test_allocate_mutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
|
||||
block_size: int, allocator_type: str):
|
||||
allocator = CpuGpuBlockAllocator.create(
|
||||
allocator_type=allocator_type,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=num_cpu_blocks,
|
||||
block_size=block_size,
|
||||
)
|
||||
|
||||
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
|
||||
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
|
||||
|
||||
cpu_blocks = [
|
||||
allocator.allocate_mutable_block(prev_block=None, device=Device.CPU)
|
||||
for _ in range(num_cpu_blocks)
|
||||
]
|
||||
assert allocator.get_num_free_blocks(Device.CPU) == 0
|
||||
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
|
||||
|
||||
gpu_blocks = [
|
||||
allocator.allocate_mutable_block(prev_block=None, device=Device.GPU)
|
||||
for _ in range(num_gpu_blocks)
|
||||
]
|
||||
assert allocator.get_num_free_blocks(Device.CPU) == 0
|
||||
assert allocator.get_num_free_blocks(Device.GPU) == 0
|
||||
|
||||
_ = [allocator.free(block) for block in cpu_blocks]
|
||||
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
|
||||
assert allocator.get_num_free_blocks(Device.GPU) == 0
|
||||
|
||||
_ = [allocator.free(block) for block in gpu_blocks]
|
||||
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
|
||||
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_cpu_blocks", [0, 512])
|
||||
@pytest.mark.parametrize("num_gpu_blocks", [1024])
|
||||
@pytest.mark.parametrize("block_size", [2])
|
||||
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
|
||||
def test_allocate_immutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
|
||||
block_size: int, allocator_type: str):
|
||||
allocator = CpuGpuBlockAllocator.create(
|
||||
allocator_type=allocator_type,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=num_cpu_blocks,
|
||||
block_size=block_size,
|
||||
)
|
||||
|
||||
unique_token_ids = list(
|
||||
range((num_cpu_blocks + num_gpu_blocks) * block_size))
|
||||
gpu_token_ids = list(
|
||||
chunk_list(unique_token_ids[:num_gpu_blocks * block_size], block_size))
|
||||
cpu_token_ids = list(
|
||||
chunk_list(unique_token_ids[num_gpu_blocks * block_size:], block_size))
|
||||
|
||||
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
|
||||
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
|
||||
|
||||
cpu_blocks = [
|
||||
allocator.allocate_immutable_block(prev_block=None,
|
||||
token_ids=token_ids,
|
||||
device=Device.CPU)
|
||||
for token_ids in cpu_token_ids
|
||||
]
|
||||
assert allocator.get_num_free_blocks(Device.CPU) == 0
|
||||
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
|
||||
|
||||
gpu_blocks = [
|
||||
allocator.allocate_immutable_block(prev_block=None,
|
||||
token_ids=token_ids,
|
||||
device=Device.GPU)
|
||||
for token_ids in gpu_token_ids
|
||||
]
|
||||
assert allocator.get_num_free_blocks(Device.CPU) == 0
|
||||
assert allocator.get_num_free_blocks(Device.GPU) == 0
|
||||
|
||||
_ = [allocator.free(block) for block in cpu_blocks]
|
||||
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
|
||||
assert allocator.get_num_free_blocks(Device.GPU) == 0
|
||||
|
||||
_ = [allocator.free(block) for block in gpu_blocks]
|
||||
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
|
||||
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
|
||||
@ -1,148 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.core.block.interfaces import Block, BlockAllocator
|
||||
from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
|
||||
|
||||
|
||||
class TestNaiveBlockAllocator:
|
||||
|
||||
@staticmethod
|
||||
def create_allocate_lambda(allocate_type: str,
|
||||
allocator: NaiveBlockAllocator,
|
||||
prev_block: Optional[Block],
|
||||
token_ids: list[int]):
|
||||
if allocate_type == "immutable":
|
||||
allocate_block = lambda: allocator.allocate_immutable_block(
|
||||
prev_block=prev_block, token_ids=token_ids)
|
||||
elif allocate_type == "mutable":
|
||||
allocate_block = lambda: allocator.allocate_mutable_block(
|
||||
prev_block=prev_block)
|
||||
else:
|
||||
raise ValueError()
|
||||
|
||||
return allocate_block
|
||||
|
||||
@staticmethod
|
||||
@pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
|
||||
@pytest.mark.parametrize("num_blocks", [1, 1024])
|
||||
@pytest.mark.parametrize("block_size", [1, 16])
|
||||
def test_allocate_ooms(allocate_type: str, num_blocks: int,
|
||||
block_size: int):
|
||||
allocator = NaiveBlockAllocator(create_block=NaiveBlock,
|
||||
num_blocks=num_blocks,
|
||||
block_size=block_size)
|
||||
allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
|
||||
allocate_type,
|
||||
allocator,
|
||||
prev_block=None,
|
||||
token_ids=list(range(block_size)))
|
||||
|
||||
[allocate_block() for _ in range(num_blocks)]
|
||||
with pytest.raises(BlockAllocator.NoFreeBlocksError):
|
||||
allocate_block()
|
||||
|
||||
@staticmethod
|
||||
@pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
|
||||
@pytest.mark.parametrize("num_blocks", [1, 1024])
|
||||
@pytest.mark.parametrize("block_size", [1, 16])
|
||||
def test_free_prevents_oom(allocate_type: str, num_blocks: int,
|
||||
block_size: int):
|
||||
allocator = NaiveBlockAllocator(create_block=NaiveBlock,
|
||||
num_blocks=num_blocks,
|
||||
block_size=block_size)
|
||||
allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
|
||||
allocate_type,
|
||||
allocator,
|
||||
prev_block=None,
|
||||
token_ids=list(range(block_size)))
|
||||
|
||||
blocks = [allocate_block() for _ in range(num_blocks)]
|
||||
|
||||
with pytest.raises(BlockAllocator.NoFreeBlocksError):
|
||||
allocate_block()
|
||||
|
||||
block_to_free = blocks.pop()
|
||||
|
||||
for _ in range(100):
|
||||
block_id = block_to_free.block_id
|
||||
allocator.free(block_to_free)
|
||||
assert block_to_free.block_id is None
|
||||
|
||||
new_block = allocate_block()
|
||||
assert new_block.block_id == block_id
|
||||
|
||||
with pytest.raises(BlockAllocator.NoFreeBlocksError):
|
||||
allocate_block()
|
||||
|
||||
block_to_free = new_block
|
||||
|
||||
@staticmethod
|
||||
@pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
|
||||
@pytest.mark.parametrize("num_blocks", [1024])
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
def test_get_num_free_blocks(allocate_type: str, num_blocks: int,
|
||||
block_size: int):
|
||||
allocator = NaiveBlockAllocator(create_block=NaiveBlock,
|
||||
num_blocks=num_blocks,
|
||||
block_size=block_size)
|
||||
allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
|
||||
allocate_type,
|
||||
allocator,
|
||||
prev_block=None,
|
||||
token_ids=list(range(block_size)))
|
||||
|
||||
assert allocator.get_num_free_blocks() == num_blocks
|
||||
|
||||
blocks = [allocate_block() for _ in range(num_blocks)]
|
||||
|
||||
for i, block in enumerate(blocks):
|
||||
assert allocator.get_num_free_blocks() == i
|
||||
allocator.free(block)
|
||||
|
||||
@staticmethod
|
||||
@pytest.mark.parametrize("num_blocks", [4])
|
||||
@pytest.mark.parametrize("block_size", [8])
|
||||
def test_naive_block_get_num_full_blocks_touched(num_blocks, block_size):
|
||||
""" Verify the allocator can correctly return the number of
|
||||
full blocks touched.
|
||||
"""
|
||||
allocator_src = NaiveBlockAllocator(create_block=NaiveBlock,
|
||||
num_blocks=num_blocks,
|
||||
block_size=block_size)
|
||||
allocator_dst = NaiveBlockAllocator(create_block=NaiveBlock,
|
||||
num_blocks=num_blocks,
|
||||
block_size=block_size)
|
||||
|
||||
# Create a chain of cacheable blocks in the dst
|
||||
allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
|
||||
"immutable",
|
||||
allocator_src,
|
||||
prev_block=None,
|
||||
token_ids=list(range(block_size)))
|
||||
src_blocks = [allocate_block() for _ in range(num_blocks - 1)]
|
||||
|
||||
# All blocks are cached
|
||||
assert allocator_dst.get_num_full_blocks_touched(
|
||||
src_blocks) == num_blocks - 1
|
||||
|
||||
# Insert one non-full block in the src
|
||||
allocate_non_full_block = \
|
||||
TestNaiveBlockAllocator.create_allocate_lambda(
|
||||
"mutable", allocator_src,
|
||||
prev_block=src_blocks[-1],token_ids=[]
|
||||
)
|
||||
src_blocks.append(allocate_non_full_block())
|
||||
src_blocks[-1].append_token_ids([0])
|
||||
|
||||
assert allocator_dst.get_num_full_blocks_touched(
|
||||
src_blocks) == num_blocks - 1
|
||||
# Fill up the last source block and then invoke
|
||||
# get_num_blocks_touched
|
||||
src_blocks[-1].append_token_ids([0] * (block_size - 1))
|
||||
assert allocator_dst.get_num_full_blocks_touched(
|
||||
src_blocks) == num_blocks
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user