[CI/Build][Doc] Move existing benchmark scripts in CI/document/example to vllm bench CLI (#21355)

Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
Ye (Charlotte) Qi 2025-07-26 07:10:14 -07:00 committed by GitHub
parent 9094d11c5d
commit e7c4f9ee86
14 changed files with 101 additions and 86 deletions
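The diff below swaps direct invocations of the standalone benchmark scripts for the equivalent `vllm bench` subcommands across CI scripts, docs, and examples. A minimal before/after sketch of the mapping applied throughout (flags shown are illustrative, carried over unchanged wherever they appear in this diff):

```bash
# Before (standalone scripts, now marked deprecated):
python3 benchmarks/benchmark_latency.py --output-json latency_results.json
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256
python3 benchmarks/benchmark_serving.py --backend vllm --model <model>

# After (vllm bench CLI, same flags):
vllm bench latency --output-json latency_results.json
vllm bench throughput --input-len 256 --output-len 256
vllm bench serve --backend vllm --model <model>
```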

View File

@@ -73,7 +73,7 @@ get_current_llm_serving_engine() {
echo "Container: vllm"
# move to a completely irrelevant directory, to avoid import vllm from current folder
export CURRENT_LLM_SERVING_ENGINE=vllm
return
fi
}
@@ -227,7 +227,7 @@ run_serving_tests() {
if [[ "$dataset_name" = "sharegpt" ]]; then
-client_command="python3 benchmark_serving.py \
+client_command="vllm bench serve \
--backend $backend \
--tokenizer /tokenizer_cache \
--model $model \
@@ -248,7 +248,7 @@ run_serving_tests() {
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
-client_command="python3 benchmark_serving.py \
+client_command="vllm bench serve \
--backend $backend \
--tokenizer /tokenizer_cache \
--model $model \
@@ -267,13 +267,13 @@ run_serving_tests() {
$client_args"
else
echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
exit 1
fi
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
@@ -304,7 +304,7 @@ run_serving_tests() {
}
run_genai_perf_tests() {
# run genai-perf tests
# $1: a json file specifying genai-perf test cases
local genai_perf_test_file
@@ -313,14 +313,14 @@ run_genai_perf_tests() {
# Iterate over genai-perf tests
jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# prepend the current serving engine to the test name
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
@@ -371,10 +371,10 @@ run_genai_perf_tests() {
qps=$num_prompts
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
backend=$CURRENT_LLM_SERVING_ENGINE
if [[ "$backend" == *"vllm"* ]]; then
backend="vllm"
fi
@@ -415,7 +415,7 @@ prepare_dataset() {
do
cat sonnet.txt >> sonnet_4x.txt
done
}
main() {
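For context, the `TEST_SELECTOR` check in the genai-perf hunk above is a plain bash regex match against each `test_name`. A hedged usage sketch (the selector value below is illustrative, not part of this commit):

```bash
# Illustrative only: restrict the nightly run to test cases whose test_name
# matches this regex; non-matching cases are skipped with "Skip test case <name>."
export TEST_SELECTOR="llama8B"
```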

View File

@@ -206,7 +206,7 @@ run_latency_tests() {
fi
fi
-latency_command=" $latency_envs python3 benchmark_latency.py \
+latency_command=" $latency_envs vllm bench latency \
--output-json $RESULTS_FOLDER/${test_name}.json \
$latency_args"
@@ -273,7 +273,7 @@ run_throughput_tests() {
fi
fi
-throughput_command=" $throughput_envs python3 benchmark_throughput.py \
+throughput_command=" $throughput_envs vllm bench throughput \
--output-json $RESULTS_FOLDER/${test_name}.json \
$throughput_args"
@@ -394,7 +394,7 @@ run_serving_tests() {
# pass the tensor parallel size to the client so that it can be displayed
# on the benchmark dashboard
-client_command="python3 benchmark_serving.py \
+client_command="vllm bench serve \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \

View File

@@ -13,9 +13,9 @@ NUMA_NODE=${NUMA_NODE:-1}
export CMAKE_BUILD_PARALLEL_LEVEL=32
# Setup cleanup
remove_docker_container() {
set -e;
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
}
trap remove_docker_container EXIT
remove_docker_container
@@ -69,7 +69,7 @@ function cpu_tests() {
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
# Note: disable it until supports V1
# Run AWQ test
@@ -83,7 +83,7 @@ function cpu_tests() {
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
-python3 benchmarks/benchmark_serving.py \
+vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \

View File

@@ -11,10 +11,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.."
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
# run python-based benchmarks and upload the result to buildkite
-python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
+vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
bench_latency_exit_code=$?
-python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
+vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
bench_throughput_exit_code=$?
# run server-based benchmarks and upload the result to buildkite
@@ -24,7 +24,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
# wait for server to start, timeout after 600 seconds
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
-python3 benchmarks/benchmark_serving.py \
+vllm bench serve \
--backend vllm \
--dataset-name sharegpt \
--dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \

View File

@@ -77,7 +77,7 @@ done
echo "run benchmark test..."
echo "logging to $BM_LOG"
echo
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
--backend vllm \
--model $MODEL \
--dataset-name sonnet \

View File

@@ -98,7 +98,7 @@ Then run the benchmarking script
```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \
@@ -111,25 +111,25 @@ If successful, you will see the following output
```
============ Serving Benchmark Result ============
Successful requests: 10
Benchmark duration (s): 5.78
Total input tokens: 1369
Total generated tokens: 2212
Request throughput (req/s): 1.73
Output token throughput (tok/s): 382.89
Total Token throughput (tok/s): 619.85
---------------Time to First Token----------------
Mean TTFT (ms): 71.54
Median TTFT (ms): 73.88
P99 TTFT (ms): 79.49
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 7.91
Median TPOT (ms): 7.96
P99 TPOT (ms): 8.03
---------------Inter-token Latency----------------
Mean ITL (ms): 7.74
Median ITL (ms): 7.70
P99 ITL (ms): 8.39
==================================================
```
@@ -141,7 +141,7 @@ If the dataset you want to benchmark is not supported yet in vLLM, even then you
{"prompt": "What is the capital of India?"}
{"prompt": "What is the capital of Iran?"}
{"prompt": "What is the capital of China?"}
```
```bash
# start server
@@ -150,7 +150,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
```bash
# run benchmarking script
-python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
+vllm bench serve --port 9001 --save-result --save-detailed \
--backend vllm \
--model meta-llama/Llama-3.1-8B-Instruct \
--endpoint /v1/completions \
@@ -174,7 +174,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
```
```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
--backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
@@ -194,7 +194,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
```
``` bash
-python3 benchmarks/benchmark_serving.py \
+vllm bench serve \
--model meta-llama/Meta-Llama-3-8B-Instruct \
--dataset-name hf \
--dataset-path likaixin/InstructCoder \
@@ -210,7 +210,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
**`lmms-lab/LLaVA-OneVision-Data`**
```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
--backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
@@ -224,7 +224,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
**`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
--backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
@@ -237,7 +237,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
**`AI-MO/aimo-validation-aime`**
``` bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path AI-MO/aimo-validation-aime \
@@ -248,7 +248,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
**`philschmid/mt-bench`**
``` bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path philschmid/mt-bench \
@@ -261,7 +261,7 @@ When using OpenAI-compatible backends such as `vllm`, optional sampling
parameters can be specified. Example client command:
```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \
@@ -296,7 +296,7 @@ The following arguments can be used to control the ramp-up:
<br/>
```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset-name sonnet \
--dataset-path vllm/benchmarks/sonnet.txt \
@@ -314,7 +314,7 @@ Total num output tokens: 1500
**VisionArena Benchmark for Vision Language Models**
``` bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
@@ -336,7 +336,7 @@ Total num output tokens: 1280
``` bash
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_USE_V1=1 \
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
--dataset-name=hf \
--dataset-path=likaixin/InstructCoder \
--model=meta-llama/Meta-Llama-3-8B-Instruct \
@@ -360,7 +360,7 @@ Total num output tokens: 204800
**`lmms-lab/LLaVA-OneVision-Data`**
```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
@@ -373,7 +373,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \
**`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
@@ -385,7 +385,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \
**`AI-MO/aimo-validation-aime`**
```bash
-python3 benchmarks/benchmark_throughput.py \
+vllm bench throughput \
--model Qwen/QwQ-32B \
--backend vllm \
--dataset-name hf \
@@ -399,7 +399,7 @@ python3 benchmarks/benchmark_throughput.py \
``` bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
--model meta-llama/Llama-2-7b-hf \
--backend vllm \
--dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \

View File

@@ -1,6 +1,6 @@
#!/bin/bash
# This script aims to tune the best server parameter combinations to maximize throughput for given requirement.
# See details in README (benchmarks/auto_tune/README.md).
TAG=$(date +"%Y_%m_%d_%H_%M")
@@ -56,7 +56,7 @@ start_server() {
local max_num_batched_tokens=$3
local vllm_log=$4
local profile_dir=$5
pkill -f vllm
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
@@ -73,9 +73,9 @@ start_server() {
# wait for 10 minutes...
server_started=0
for i in {1..60}; do
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
if [[ "$STATUS_CODE" -eq 200 ]]; then
server_started=1
break
@@ -98,10 +98,10 @@ update_best_profile() {
selected_profile_file=
if [[ "$SYSTEM" == "TPU" ]]; then
selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
fi
if [[ "$SYSTEM" == "GPU" ]]; then
selected_profile_file="${sorted_paths[$profile_index]}"
fi
rm -f $PROFILE_PATH/*
cp $selected_profile_file $PROFILE_PATH
}
@@ -129,14 +129,14 @@ run_benchmark() {
echo "server started."
fi
echo
echo "run benchmark test..."
meet_latency_requirement=0
# get a basic qps by using request-rate inf
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
-python3 benchmarks/benchmark_serving.py \
+vllm bench serve \
--backend vllm \
--model $MODEL \
--dataset-name random \
@@ -169,7 +169,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
sleep 5
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
-python3 benchmarks/benchmark_serving.py \
+vllm bench serve \
--backend vllm \
--model $MODEL \
--dataset-name random \
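As a quick check of the prefix-cache arithmetic in `run_benchmark` above, with input values assumed purely for illustration (not taken from this diff):

```bash
# Assumed inputs: INPUT_LEN=2048, MIN_CACHE_HIT_PCT=60
# prefix_len         = 2048 * 60 / 100 = 1228   (bash integer arithmetic)
# adjusted_input_len = 2048 - 1228     = 820
echo $(( 2048 * 60 / 100 ))   # prints 1228
echo $(( 2048 - 1228 ))       # prints 820
```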

View File

@@ -11,6 +11,7 @@ from typing import Any, Optional
import numpy as np
from tqdm import tqdm
+from typing_extensions import deprecated
import vllm.envs as envs
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
@@ -34,6 +35,10 @@ def save_to_pytorch_benchmark_format(
write_to_json(pt_file, pt_records)
+@deprecated(
+    "benchmark_latency.py is deprecated and will be removed in a "
+    "future version. Please use 'vllm bench latency' instead.",
+)
def main(args: argparse.Namespace):
print(args)

View File

@@ -38,6 +38,7 @@ from typing import Any, Literal, Optional
import numpy as np
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
+from typing_extensions import deprecated
from backend_request_func import (
ASYNC_REQUEST_FUNCS,
@@ -593,6 +594,10 @@ def save_to_pytorch_benchmark_format(
write_to_json(pt_file, pt_records)
+@deprecated(
+    "benchmark_serving.py is deprecated and will be removed in a future "
+    "version. Please use 'vllm bench serve' instead.",
+)
def main(args: argparse.Namespace):
print(args)
random.seed(args.seed)

View File

@@ -15,6 +15,7 @@ import torch
import uvloop
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
+from typing_extensions import deprecated
from benchmark_dataset import (
AIMODataset,
@@ -382,6 +383,10 @@ def get_requests(args, tokenizer):
return dataset_cls(**common_kwargs).sample(**sample_kwargs)
+@deprecated(
+    "benchmark_throughput.py is deprecated and will be removed in a "
+    "future version. Please use 'vllm bench throughput' instead.",
+)
def main(args: argparse.Namespace):
if args.seed is None:
args.seed = 0
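The three legacy scripts above keep working, but their `main()` entrypoints are now wrapped in `typing_extensions.deprecated` (PEP 702), which emits a `DeprecationWarning` when the decorated function is called. A hedged sketch of surfacing that warning while migrating; the paths and flags are illustrative (reused from examples elsewhere in this diff) and assume the command is run from the repo root:

```bash
# Illustrative only: run a legacy script with DeprecationWarnings forced on,
# so the message added in this commit is printed regardless of warning filters.
python3 -W "always::DeprecationWarning" benchmarks/benchmark_latency.py \
    --model meta-llama/Llama-3.1-8B-Instruct --num-iters-warmup 5 --num-iters 1

# Preferred replacement going forward:
vllm bench latency \
    --model meta-llama/Llama-3.1-8B-Instruct --num-iters-warmup 5 --num-iters 1
```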

View File

@@ -38,7 +38,7 @@ VLLM_TORCH_PROFILER_DIR=./vllm_profile \
benchmark_serving.py:
```bash
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
--backend vllm \
--model meta-llama/Meta-Llama-3-70B \
--dataset-name sharegpt \
@@ -75,7 +75,7 @@ The following is an example using the `benchmarks/benchmark_latency.py` script:
nsys profile -o report.nsys-rep \
--trace-fork-before-exec=true \
--cuda-graph-trace=node \
-python benchmarks/benchmark_latency.py \
+vllm bench latency \
--model meta-llama/Llama-3.1-8B-Instruct \
--num-iters-warmup 5 \
--num-iters 1 \
@@ -98,7 +98,7 @@ nsys profile -o report.nsys-rep \
vllm serve meta-llama/Llama-3.1-8B-Instruct
# client
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
--backend vllm \
--model meta-llama/Llama-3.1-8B-Instruct \
--num-prompts 1 \
@@ -132,7 +132,7 @@ You can view these profiles either as summaries in the CLI, using `nsys stats [p
...
** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):
Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- ----------- ----------- -------- --------- ----------- ----------------------------------------------------------------------------------------------------
46.3 10,327,352,338 17,505 589,965.9 144,383.0 27,040 3,126,460 944,263.8 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize128x128x64_warpgroupsize1x1x1_execute_segment_k_of…
14.8 3,305,114,764 5,152 641,520.7 293,408.0 287,296 2,822,716 867,124.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize256x128x64_warpgroupsize2x1x1_execute_segment_k_of…
@@ -143,7 +143,7 @@ You can view these profiles either as summaries in the CLI, using `nsys stats [p
2.6 587,283,113 37,824 15,526.7 3,008.0 2,719 2,517,756 139,091.1 std::enable_if<T2>(int)0&&vllm::_typeConvert<T1>::exists, void>::type vllm::fused_add_rms_norm_kern…
1.9 418,362,605 18,912 22,121.5 3,871.0 3,328 2,523,870 175,248.2 void vllm::rotary_embedding_kernel<c10::BFloat16, (bool)1>(const long *, T1 *, T1 *, const T1 *, in…
0.7 167,083,069 18,880 8,849.7 2,240.0 1,471 2,499,996 101,436.1 void vllm::reshape_and_cache_flash_kernel<__nv_bfloat16, __nv_bfloat16, (vllm::Fp8KVCacheDataType)0
...
```
GUI example:

View File

@@ -3,14 +3,14 @@ An implementation of xPyD with dynamic scaling based on point-to-point communica
# Detailed Design
## Overall Process
As shown in Figure 1, the overall process of this **PD disaggregation** solution is described through a request flow:
1. The client sends an HTTP request to the Proxy/Router's `/v1/completions` interface.
2. The Proxy/Router selects a **1P1D (1 Prefill instance + 1 Decode instance)** through either round-robin or random selection, generates a `request_id` (rules to be introduced later), modifies the `max_tokens` in the HTTP request message to **1**, and then forwards the request to the **P instance**.
3. Immediately afterward, the Proxy/Router forwards the **original HTTP request** to the **D instance**.
4. The **P instance** performs **Prefill** and then **actively sends the generated KV cache** to the D instance (using **PUT_ASYNC** mode). The D instance's `zmq_addr` can be resolved through the `request_id`.
5. The **D instance** has a **dedicated thread** for receiving the KV cache (to avoid blocking the main process). The received KV cache is saved into the **GPU memory buffer**, the size of which is determined by the vLLM startup parameter `kv_buffer_size`. When the GPU buffer is full, the KV cache is stored in the **local Tensor memory pool**.
6. During the **Decode**, the D instance's main process retrieves the KV cache (transmitted by the P instance) from either the **GPU buffer** or the **memory pool**, thereby **skipping Prefill**.
7. After completing **Decode**, the D instance returns the result to the **Proxy/Router**, which then forwards it to the **client**.
![image1](https://github.com/user-attachments/assets/fb01bde6-755b-49f7-ad45-48a94b1e10a7)
@@ -291,7 +291,7 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \
??? console "Command"
```shell
-python3 benchmark_serving.py \
+vllm bench serve \
--backend vllm \
--model base_model \
--tokenizer meta-llama/Llama-3.1-8B-Instruct \

View File

@@ -29,7 +29,7 @@ PROXY_PORT=${PROXY_PORT:-30001}
PREFILL_GPUS=${PREFILL_GPUS:-0}
DECODE_GPUS=${DECODE_GPUS:-1,2,3}
PREFILL_PORTS=${PREFILL_PORTS:-20003}
DECODE_PORTS=${DECODE_PORTS:-20005,20007,20009}
echo "Warning: P2P NCCL disaggregated prefill XpYd support for vLLM v1 is experimental and subject to change."
echo ""
@@ -164,7 +164,7 @@ main() {
local gpu_id=${PREFILL_GPU_ARRAY[$i]}
local port=${PREFILL_PORT_ARRAY[$i]}
local kv_port=$((21001 + i))
echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
CUDA_VISIBLE_DEVICES=$gpu_id VLLM_USE_V1=1 vllm serve $MODEL \
--enforce-eager \
@@ -193,7 +193,7 @@ main() {
local gpu_id=${DECODE_GPU_ARRAY[$i]}
local port=${DECODE_PORT_ARRAY[$i]}
local kv_port=$((22001 + i))
echo " Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
--enforce-eager \
@@ -233,7 +233,7 @@ main() {
# Run Benchmark
# =============================================================================
cd ../../../benchmarks/
-python3 benchmark_serving.py --port 10001 --seed $(date +%s) \
+vllm bench serve --port 10001 --seed $(date +%s) \
--model $MODEL \
--dataset-name random --random-input-len 7500 --random-output-len 200 \
--num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log
@@ -243,4 +243,4 @@ main() {
cleanup
}
main

View File

@@ -122,7 +122,7 @@ main() {
# begin benchmark
cd ../../../../benchmarks/
-python3 benchmark_serving.py --port 9000 --seed $(date +%s) \
+vllm bench serve --port 9000 --seed $(date +%s) \
--model meta-llama/Llama-3.1-8B-Instruct \
--dataset-name random --random-input-len 7500 --random-output-len 200 \
--num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log
@@ -133,4 +133,4 @@ main() {
}
main