[CI/Build][Doc] Move existing benchmark scripts in CI/document/example to vllm bench CLI (#21355)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
This commit is contained in:
parent 9094d11c5d
commit e7c4f9ee86
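The pattern is the same across every file touched below: the standalone scripts `benchmark_latency.py`, `benchmark_throughput.py`, and `benchmark_serving.py` are replaced by the `vllm bench latency`, `vllm bench throughput`, and `vllm bench serve` subcommands, with the remaining flags left unchanged. A condensed before/after sketch, assembled from the hunks in this diff (`<model>` is a placeholder, not a value from any one script):

```bash
# Before: invoke the standalone scripts under benchmarks/.
python3 benchmarks/benchmark_latency.py --output-json latency_results.json
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 \
  --output-json throughput_results.json
python3 benchmarks/benchmark_serving.py --backend vllm --model <model> \
  --dataset-name sharegpt --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json

# After: the equivalent vllm bench subcommands; every other flag carries over as-is.
vllm bench latency --output-json latency_results.json
vllm bench throughput --input-len 256 --output-len 256 \
  --output-json throughput_results.json
vllm bench serve --backend vllm --model <model> \
  --dataset-name sharegpt --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json
```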
@@ -227,7 +227,7 @@ run_serving_tests() {

     if [[ "$dataset_name" = "sharegpt" ]]; then
-      client_command="python3 benchmark_serving.py \
+      client_command="vllm bench serve \
         --backend $backend \
         --tokenizer /tokenizer_cache \
         --model $model \

@@ -248,7 +248,7 @@ run_serving_tests() {
     sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
     sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')

-    client_command="python3 benchmark_serving.py \
+    client_command="vllm bench serve \
       --backend $backend \
       --tokenizer /tokenizer_cache \
       --model $model \

@@ -206,7 +206,7 @@ run_latency_tests() {
     fi
   fi

-  latency_command=" $latency_envs python3 benchmark_latency.py \
+  latency_command=" $latency_envs vllm bench latency \
     --output-json $RESULTS_FOLDER/${test_name}.json \
     $latency_args"

@@ -273,7 +273,7 @@ run_throughput_tests() {
     fi
   fi

-  throughput_command=" $throughput_envs python3 benchmark_throughput.py \
+  throughput_command=" $throughput_envs vllm bench throughput \
    --output-json $RESULTS_FOLDER/${test_name}.json \
    $throughput_args"

@@ -394,7 +394,7 @@ run_serving_tests() {

   # pass the tensor parallel size to the client so that it can be displayed
   # on the benchmark dashboard
-  client_command="python3 benchmark_serving.py \
+  client_command="vllm bench serve \
     --save-result \
     --result-dir $RESULTS_FOLDER \
     --result-filename ${new_test_name}.json \
@@ -83,7 +83,7 @@ function cpu_tests() {
     set -e
     VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
     timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
-    python3 benchmarks/benchmark_serving.py \
+    vllm bench serve \
       --backend vllm \
       --dataset-name random \
       --model meta-llama/Llama-3.2-3B-Instruct \
@@ -11,10 +11,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.."
 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)

 # run python-based benchmarks and upload the result to buildkite
-python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
+vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
 bench_latency_exit_code=$?

-python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
+vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
 bench_throughput_exit_code=$?

 # run server-based benchmarks and upload the result to buildkite
@@ -24,7 +24,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r

 # wait for server to start, timeout after 600 seconds
 timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
-python3 benchmarks/benchmark_serving.py \
+vllm bench serve \
   --backend vllm \
   --dataset-name sharegpt \
   --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \

@@ -77,7 +77,7 @@ done
 echo "run benchmark test..."
 echo "logging to $BM_LOG"
 echo
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
   --backend vllm \
   --model $MODEL \
   --dataset-name sonnet \
@@ -98,7 +98,7 @@ Then run the benchmarking script
 ```bash
 # download dataset
 # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
   --backend vllm \
   --model NousResearch/Hermes-3-Llama-3.1-8B \
   --endpoint /v1/completions \

@@ -150,7 +150,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests

 ```bash
 # run benchmarking script
-python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
+vllm bench serve --port 9001 --save-result --save-detailed \
   --backend vllm \
   --model meta-llama/Llama-3.1-8B-Instruct \
   --endpoint /v1/completions \

@@ -174,7 +174,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
 ```

 ```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
   --backend openai-chat \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --endpoint /v1/chat/completions \

@@ -194,7 +194,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
 ```

 ``` bash
-python3 benchmarks/benchmark_serving.py \
+vllm bench serve \
   --model meta-llama/Meta-Llama-3-8B-Instruct \
   --dataset-name hf \
   --dataset-path likaixin/InstructCoder \

@@ -210,7 +210,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
 **`lmms-lab/LLaVA-OneVision-Data`**

 ```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
   --backend openai-chat \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --endpoint /v1/chat/completions \

@@ -224,7 +224,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
 **`Aeala/ShareGPT_Vicuna_unfiltered`**

 ```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
   --backend openai-chat \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --endpoint /v1/chat/completions \

@@ -237,7 +237,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
 **`AI-MO/aimo-validation-aime`**

 ``` bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
   --model Qwen/QwQ-32B \
   --dataset-name hf \
   --dataset-path AI-MO/aimo-validation-aime \

@@ -248,7 +248,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
 **`philschmid/mt-bench`**

 ``` bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
   --model Qwen/QwQ-32B \
   --dataset-name hf \
   --dataset-path philschmid/mt-bench \

@@ -261,7 +261,7 @@ When using OpenAI-compatible backends such as `vllm`, optional sampling
 parameters can be specified. Example client command:

 ```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
   --backend vllm \
   --model NousResearch/Hermes-3-Llama-3.1-8B \
   --endpoint /v1/completions \
@@ -296,7 +296,7 @@ The following arguments can be used to control the ramp-up:
 <br/>

 ```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
   --model NousResearch/Hermes-3-Llama-3.1-8B \
   --dataset-name sonnet \
   --dataset-path vllm/benchmarks/sonnet.txt \

@@ -314,7 +314,7 @@ Total num output tokens: 1500
 **VisionArena Benchmark for Vision Language Models**

 ``` bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --backend vllm-chat \
   --dataset-name hf \

@@ -336,7 +336,7 @@ Total num output tokens: 1280
 ``` bash
 VLLM_WORKER_MULTIPROC_METHOD=spawn \
 VLLM_USE_V1=1 \
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
   --dataset-name=hf \
   --dataset-path=likaixin/InstructCoder \
   --model=meta-llama/Meta-Llama-3-8B-Instruct \

@@ -360,7 +360,7 @@ Total num output tokens: 204800
 **`lmms-lab/LLaVA-OneVision-Data`**

 ```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --backend vllm-chat \
   --dataset-name hf \

@@ -373,7 +373,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \
 **`Aeala/ShareGPT_Vicuna_unfiltered`**

 ```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --backend vllm-chat \
   --dataset-name hf \

@@ -385,7 +385,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \
 **`AI-MO/aimo-validation-aime`**

 ```bash
-python3 benchmarks/benchmark_throughput.py \
+vllm bench throughput \
   --model Qwen/QwQ-32B \
   --backend vllm \
   --dataset-name hf \

@@ -399,7 +399,7 @@ python3 benchmarks/benchmark_throughput.py \
 ``` bash
 # download dataset
 # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
   --model meta-llama/Llama-2-7b-hf \
   --backend vllm \
   --dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
@@ -136,7 +136,7 @@ run_benchmark() {
   bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
   prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
   adjusted_input_len=$(( INPUT_LEN - prefix_len ))
-  python3 benchmarks/benchmark_serving.py \
+  vllm bench serve \
     --backend vllm \
     --model $MODEL \
     --dataset-name random \

@@ -169,7 +169,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
   curl -X POST http://0.0.0.0:8004/reset_prefix_cache
   sleep 5
   bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
-  python3 benchmarks/benchmark_serving.py \
+  vllm bench serve \
     --backend vllm \
     --model $MODEL \
     --dataset-name random \
@@ -11,6 +11,7 @@ from typing import Any, Optional

 import numpy as np
 from tqdm import tqdm
+from typing_extensions import deprecated

 import vllm.envs as envs
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json

@@ -34,6 +35,10 @@ def save_to_pytorch_benchmark_format(
     write_to_json(pt_file, pt_records)


+@deprecated(
+    "benchmark_latency.py is deprecated and will be removed in a "
+    "future version. Please use 'vllm bench latency' instead.",
+)
 def main(args: argparse.Namespace):
     print(args)
@@ -38,6 +38,7 @@ from typing import Any, Literal, Optional
 import numpy as np
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
+from typing_extensions import deprecated

 from backend_request_func import (
     ASYNC_REQUEST_FUNCS,

@@ -593,6 +594,10 @@ def save_to_pytorch_benchmark_format(
     write_to_json(pt_file, pt_records)


+@deprecated(
+    "benchmark_serving.py is deprecated and will be removed in a future "
+    "version. Please use 'vllm bench serve' instead.",
+)
 def main(args: argparse.Namespace):
     print(args)
     random.seed(args.seed)
@@ -15,6 +15,7 @@ import torch
 import uvloop
 from tqdm import tqdm
 from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
+from typing_extensions import deprecated

 from benchmark_dataset import (
     AIMODataset,

@@ -382,6 +383,10 @@ def get_requests(args, tokenizer):
     return dataset_cls(**common_kwargs).sample(**sample_kwargs)


+@deprecated(
+    "benchmark_throughput.py is deprecated and will be removed in a "
+    "future version. Please use 'vllm bench throughput' instead.",
+)
 def main(args: argparse.Namespace):
     if args.seed is None:
         args.seed = 0
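The legacy scripts are not deleted here; each `main()` gains the `@deprecated` decorator from `typing_extensions`, which issues a `DeprecationWarning` when the function is called. A rough sketch of the resulting behavior, assuming default warning filters (exact rendering depends on the interpreter and filter configuration):

```bash
# The old entry point still runs, but now warns before benchmarking starts;
# the message text comes from the decorator added in the hunks above.
python3 benchmarks/benchmark_latency.py --output-json latency_results.json
# DeprecationWarning: benchmark_latency.py is deprecated and will be removed
# in a future version. Please use 'vllm bench latency' instead.
```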
@@ -38,7 +38,7 @@ VLLM_TORCH_PROFILER_DIR=./vllm_profile \
 benchmark_serving.py:

 ```bash
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
   --backend vllm \
   --model meta-llama/Meta-Llama-3-70B \
   --dataset-name sharegpt \

@@ -75,7 +75,7 @@ The following is an example using the `benchmarks/benchmark_latency.py` script:
 nsys profile -o report.nsys-rep \
   --trace-fork-before-exec=true \
   --cuda-graph-trace=node \
-  python benchmarks/benchmark_latency.py \
+  vllm bench latency \
   --model meta-llama/Llama-3.1-8B-Instruct \
   --num-iters-warmup 5 \
   --num-iters 1 \

@@ -98,7 +98,7 @@ nsys profile -o report.nsys-rep \
 vllm serve meta-llama/Llama-3.1-8B-Instruct

 # client
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
   --backend vllm \
   --model meta-llama/Llama-3.1-8B-Instruct \
   --num-prompts 1 \

@@ -291,7 +291,7 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \
 ??? console "Command"

     ```shell
-    python3 benchmark_serving.py \
+    vllm bench serve \
       --backend vllm \
       --model base_model \
       --tokenizer meta-llama/Llama-3.1-8B-Instruct \

@@ -233,7 +233,7 @@ main() {
   # Run Benchmark
   # =============================================================================
   cd ../../../benchmarks/
-  python3 benchmark_serving.py --port 10001 --seed $(date +%s) \
+  vllm bench serve --port 10001 --seed $(date +%s) \
     --model $MODEL \
     --dataset-name random --random-input-len 7500 --random-output-len 200 \
     --num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log

@@ -122,7 +122,7 @@ main() {

   # begin benchmark
   cd ../../../../benchmarks/
-  python3 benchmark_serving.py --port 9000 --seed $(date +%s) \
+  vllm bench serve --port 9000 --seed $(date +%s) \
     --model meta-llama/Llama-3.1-8B-Instruct \
     --dataset-name random --random-input-len 7500 --random-output-len 200 \
     --num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log