[CI/Build][Doc] Move existing benchmark scripts in CI/document/example to vllm bench CLI (#21355)

Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
This commit is contained in:
Ye (Charlotte) Qi 2025-07-26 07:10:14 -07:00 committed by GitHub
parent 9094d11c5d
commit e7c4f9ee86
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 101 additions and 86 deletions

View File

@ -227,7 +227,7 @@ run_serving_tests() {
if [[ "$dataset_name" = "sharegpt" ]]; then if [[ "$dataset_name" = "sharegpt" ]]; then
client_command="python3 benchmark_serving.py \ client_command="vllm bench serve \
--backend $backend \ --backend $backend \
--tokenizer /tokenizer_cache \ --tokenizer /tokenizer_cache \
--model $model \ --model $model \
@ -248,7 +248,7 @@ run_serving_tests() {
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len') sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len') sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
client_command="python3 benchmark_serving.py \ client_command="vllm bench serve \
--backend $backend \ --backend $backend \
--tokenizer /tokenizer_cache \ --tokenizer /tokenizer_cache \
--model $model \ --model $model \

View File

@ -206,7 +206,7 @@ run_latency_tests() {
fi fi
fi fi
latency_command=" $latency_envs python3 benchmark_latency.py \ latency_command=" $latency_envs vllm bench latency \
--output-json $RESULTS_FOLDER/${test_name}.json \ --output-json $RESULTS_FOLDER/${test_name}.json \
$latency_args" $latency_args"
@ -273,7 +273,7 @@ run_throughput_tests() {
fi fi
fi fi
throughput_command=" $throughput_envs python3 benchmark_throughput.py \ throughput_command=" $throughput_envs vllm bench throughput \
--output-json $RESULTS_FOLDER/${test_name}.json \ --output-json $RESULTS_FOLDER/${test_name}.json \
$throughput_args" $throughput_args"
@ -394,7 +394,7 @@ run_serving_tests() {
# pass the tensor parallel size to the client so that it can be displayed # pass the tensor parallel size to the client so that it can be displayed
# on the benchmark dashboard # on the benchmark dashboard
client_command="python3 benchmark_serving.py \ client_command="vllm bench serve \
--save-result \ --save-result \
--result-dir $RESULTS_FOLDER \ --result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \ --result-filename ${new_test_name}.json \

View File

@ -83,7 +83,7 @@ function cpu_tests() {
set -e set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 & VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
python3 benchmarks/benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--dataset-name random \ --dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \ --model meta-llama/Llama-3.2-3B-Instruct \

View File

@ -11,10 +11,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.."
(which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
# run python-based benchmarks and upload the result to buildkite # run python-based benchmarks and upload the result to buildkite
python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
bench_latency_exit_code=$? bench_latency_exit_code=$?
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
bench_throughput_exit_code=$? bench_throughput_exit_code=$?
# run server-based benchmarks and upload the result to buildkite # run server-based benchmarks and upload the result to buildkite
@ -24,7 +24,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
# wait for server to start, timeout after 600 seconds # wait for server to start, timeout after 600 seconds
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--dataset-name sharegpt \ --dataset-name sharegpt \
--dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \ --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \

View File

@ -77,7 +77,7 @@ done
echo "run benchmark test..." echo "run benchmark test..."
echo "logging to $BM_LOG" echo "logging to $BM_LOG"
echo echo
python benchmarks/benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--model $MODEL \ --model $MODEL \
--dataset-name sonnet \ --dataset-name sonnet \

View File

@ -98,7 +98,7 @@ Then run the benchmarking script
```bash ```bash
# download dataset # download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 vllm/benchmarks/benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \ --model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \ --endpoint /v1/completions \
@ -150,7 +150,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
```bash ```bash
# run benchmarking script # run benchmarking script
python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \ vllm bench serve --port 9001 --save-result --save-detailed \
--backend vllm \ --backend vllm \
--model meta-llama/Llama-3.1-8B-Instruct \ --model meta-llama/Llama-3.1-8B-Instruct \
--endpoint /v1/completions \ --endpoint /v1/completions \
@ -174,7 +174,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
``` ```
```bash ```bash
python3 vllm/benchmarks/benchmark_serving.py \ vllm bench serve \
--backend openai-chat \ --backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \ --model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \ --endpoint /v1/chat/completions \
@ -194,7 +194,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
``` ```
``` bash ``` bash
python3 benchmarks/benchmark_serving.py \ vllm bench serve \
--model meta-llama/Meta-Llama-3-8B-Instruct \ --model meta-llama/Meta-Llama-3-8B-Instruct \
--dataset-name hf \ --dataset-name hf \
--dataset-path likaixin/InstructCoder \ --dataset-path likaixin/InstructCoder \
@ -210,7 +210,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
**`lmms-lab/LLaVA-OneVision-Data`** **`lmms-lab/LLaVA-OneVision-Data`**
```bash ```bash
python3 vllm/benchmarks/benchmark_serving.py \ vllm bench serve \
--backend openai-chat \ --backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \ --model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \ --endpoint /v1/chat/completions \
@ -224,7 +224,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
**`Aeala/ShareGPT_Vicuna_unfiltered`** **`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash ```bash
python3 vllm/benchmarks/benchmark_serving.py \ vllm bench serve \
--backend openai-chat \ --backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \ --model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \ --endpoint /v1/chat/completions \
@ -237,7 +237,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
**`AI-MO/aimo-validation-aime`** **`AI-MO/aimo-validation-aime`**
``` bash ``` bash
python3 vllm/benchmarks/benchmark_serving.py \ vllm bench serve \
--model Qwen/QwQ-32B \ --model Qwen/QwQ-32B \
--dataset-name hf \ --dataset-name hf \
--dataset-path AI-MO/aimo-validation-aime \ --dataset-path AI-MO/aimo-validation-aime \
@ -248,7 +248,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
**`philschmid/mt-bench`** **`philschmid/mt-bench`**
``` bash ``` bash
python3 vllm/benchmarks/benchmark_serving.py \ vllm bench serve \
--model Qwen/QwQ-32B \ --model Qwen/QwQ-32B \
--dataset-name hf \ --dataset-name hf \
--dataset-path philschmid/mt-bench \ --dataset-path philschmid/mt-bench \
@ -261,7 +261,7 @@ When using OpenAI-compatible backends such as `vllm`, optional sampling
parameters can be specified. Example client command: parameters can be specified. Example client command:
```bash ```bash
python3 vllm/benchmarks/benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \ --model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \ --endpoint /v1/completions \
@ -296,7 +296,7 @@ The following arguments can be used to control the ramp-up:
<br/> <br/>
```bash ```bash
python3 vllm/benchmarks/benchmark_throughput.py \ vllm bench throughput \
--model NousResearch/Hermes-3-Llama-3.1-8B \ --model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset-name sonnet \ --dataset-name sonnet \
--dataset-path vllm/benchmarks/sonnet.txt \ --dataset-path vllm/benchmarks/sonnet.txt \
@ -314,7 +314,7 @@ Total num output tokens: 1500
**VisionArena Benchmark for Vision Language Models** **VisionArena Benchmark for Vision Language Models**
``` bash ``` bash
python3 vllm/benchmarks/benchmark_throughput.py \ vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \ --model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \ --backend vllm-chat \
--dataset-name hf \ --dataset-name hf \
@ -336,7 +336,7 @@ Total num output tokens: 1280
``` bash ``` bash
VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_USE_V1=1 \ VLLM_USE_V1=1 \
python3 vllm/benchmarks/benchmark_throughput.py \ vllm bench throughput \
--dataset-name=hf \ --dataset-name=hf \
--dataset-path=likaixin/InstructCoder \ --dataset-path=likaixin/InstructCoder \
--model=meta-llama/Meta-Llama-3-8B-Instruct \ --model=meta-llama/Meta-Llama-3-8B-Instruct \
@ -360,7 +360,7 @@ Total num output tokens: 204800
**`lmms-lab/LLaVA-OneVision-Data`** **`lmms-lab/LLaVA-OneVision-Data`**
```bash ```bash
python3 vllm/benchmarks/benchmark_throughput.py \ vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \ --model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \ --backend vllm-chat \
--dataset-name hf \ --dataset-name hf \
@ -373,7 +373,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \
**`Aeala/ShareGPT_Vicuna_unfiltered`** **`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash ```bash
python3 vllm/benchmarks/benchmark_throughput.py \ vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \ --model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \ --backend vllm-chat \
--dataset-name hf \ --dataset-name hf \
@ -385,7 +385,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \
**`AI-MO/aimo-validation-aime`** **`AI-MO/aimo-validation-aime`**
```bash ```bash
python3 benchmarks/benchmark_throughput.py \ vllm bench throughput \
--model Qwen/QwQ-32B \ --model Qwen/QwQ-32B \
--backend vllm \ --backend vllm \
--dataset-name hf \ --dataset-name hf \
@ -399,7 +399,7 @@ python3 benchmarks/benchmark_throughput.py \
``` bash ``` bash
# download dataset # download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 vllm/benchmarks/benchmark_throughput.py \ vllm bench throughput \
--model meta-llama/Llama-2-7b-hf \ --model meta-llama/Llama-2-7b-hf \
--backend vllm \ --backend vllm \
--dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \ --dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \

View File

@ -136,7 +136,7 @@ run_benchmark() {
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt" bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 )) prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
adjusted_input_len=$(( INPUT_LEN - prefix_len )) adjusted_input_len=$(( INPUT_LEN - prefix_len ))
python3 benchmarks/benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--model $MODEL \ --model $MODEL \
--dataset-name random \ --dataset-name random \
@ -169,7 +169,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
curl -X POST http://0.0.0.0:8004/reset_prefix_cache curl -X POST http://0.0.0.0:8004/reset_prefix_cache
sleep 5 sleep 5
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt" bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
python3 benchmarks/benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--model $MODEL \ --model $MODEL \
--dataset-name random \ --dataset-name random \

View File

@ -11,6 +11,7 @@ from typing import Any, Optional
import numpy as np import numpy as np
from tqdm import tqdm from tqdm import tqdm
from typing_extensions import deprecated
import vllm.envs as envs import vllm.envs as envs
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
@ -34,6 +35,10 @@ def save_to_pytorch_benchmark_format(
write_to_json(pt_file, pt_records) write_to_json(pt_file, pt_records)
@deprecated(
"benchmark_latency.py is deprecated and will be removed in a "
"future version. Please use 'vllm bench latency' instead.",
)
def main(args: argparse.Namespace): def main(args: argparse.Namespace):
print(args) print(args)

View File

@ -38,6 +38,7 @@ from typing import Any, Literal, Optional
import numpy as np import numpy as np
from tqdm.asyncio import tqdm from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase from transformers import PreTrainedTokenizerBase
from typing_extensions import deprecated
from backend_request_func import ( from backend_request_func import (
ASYNC_REQUEST_FUNCS, ASYNC_REQUEST_FUNCS,
@ -593,6 +594,10 @@ def save_to_pytorch_benchmark_format(
write_to_json(pt_file, pt_records) write_to_json(pt_file, pt_records)
@deprecated(
"benchmark_serving.py is deprecated and will be removed in a future "
"version. Please use 'vllm bench serve' instead.",
)
def main(args: argparse.Namespace): def main(args: argparse.Namespace):
print(args) print(args)
random.seed(args.seed) random.seed(args.seed)

View File

@ -15,6 +15,7 @@ import torch
import uvloop import uvloop
from tqdm import tqdm from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
from typing_extensions import deprecated
from benchmark_dataset import ( from benchmark_dataset import (
AIMODataset, AIMODataset,
@ -382,6 +383,10 @@ def get_requests(args, tokenizer):
return dataset_cls(**common_kwargs).sample(**sample_kwargs) return dataset_cls(**common_kwargs).sample(**sample_kwargs)
@deprecated(
"benchmark_throughput.py is deprecated and will be removed in a "
"future version. Please use 'vllm bench throughput' instead.",
)
def main(args: argparse.Namespace): def main(args: argparse.Namespace):
if args.seed is None: if args.seed is None:
args.seed = 0 args.seed = 0

View File

@ -38,7 +38,7 @@ VLLM_TORCH_PROFILER_DIR=./vllm_profile \
benchmark_serving.py: benchmark_serving.py:
```bash ```bash
python benchmarks/benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--model meta-llama/Meta-Llama-3-70B \ --model meta-llama/Meta-Llama-3-70B \
--dataset-name sharegpt \ --dataset-name sharegpt \
@ -75,7 +75,7 @@ The following is an example using the `benchmarks/benchmark_latency.py` script:
nsys profile -o report.nsys-rep \ nsys profile -o report.nsys-rep \
--trace-fork-before-exec=true \ --trace-fork-before-exec=true \
--cuda-graph-trace=node \ --cuda-graph-trace=node \
python benchmarks/benchmark_latency.py \ vllm bench latency \
--model meta-llama/Llama-3.1-8B-Instruct \ --model meta-llama/Llama-3.1-8B-Instruct \
--num-iters-warmup 5 \ --num-iters-warmup 5 \
--num-iters 1 \ --num-iters 1 \
@ -98,7 +98,7 @@ nsys profile -o report.nsys-rep \
vllm serve meta-llama/Llama-3.1-8B-Instruct vllm serve meta-llama/Llama-3.1-8B-Instruct
# client # client
python benchmarks/benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--model meta-llama/Llama-3.1-8B-Instruct \ --model meta-llama/Llama-3.1-8B-Instruct \
--num-prompts 1 \ --num-prompts 1 \

View File

@ -291,7 +291,7 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \
??? console "Command" ??? console "Command"
```shell ```shell
python3 benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--model base_model \ --model base_model \
--tokenizer meta-llama/Llama-3.1-8B-Instruct \ --tokenizer meta-llama/Llama-3.1-8B-Instruct \

View File

@ -233,7 +233,7 @@ main() {
# Run Benchmark # Run Benchmark
# ============================================================================= # =============================================================================
cd ../../../benchmarks/ cd ../../../benchmarks/
python3 benchmark_serving.py --port 10001 --seed $(date +%s) \ vllm bench serve --port 10001 --seed $(date +%s) \
--model $MODEL \ --model $MODEL \
--dataset-name random --random-input-len 7500 --random-output-len 200 \ --dataset-name random --random-input-len 7500 --random-output-len 200 \
--num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log --num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log

View File

@ -122,7 +122,7 @@ main() {
# begin benchmark # begin benchmark
cd ../../../../benchmarks/ cd ../../../../benchmarks/
python3 benchmark_serving.py --port 9000 --seed $(date +%s) \ vllm bench serve --port 9000 --seed $(date +%s) \
--model meta-llama/Llama-3.1-8B-Instruct \ --model meta-llama/Llama-3.1-8B-Instruct \
--dataset-name random --random-input-len 7500 --random-output-len 200 \ --dataset-name random --random-input-len 7500 --random-output-len 200 \
--num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log --num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log