[CI/Build][Doc] Move existing benchmark scripts in CI/document/example to vllm bench CLI (#21355)

Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>

Parent: 9094d11c5d
Commit: e7c4f9ee86
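The change is a mechanical one-for-one swap of the entry point: each standalone script under `benchmarks/` is replaced by the matching `vllm bench` subcommand with the same flags. A minimal before/after sketch, using a model and dataset taken from the examples touched below (names are illustrative only):

```bash
# before: run the standalone script from a vLLM checkout
python3 benchmarks/benchmark_serving.py \
    --backend vllm \
    --model meta-llama/Llama-3.1-8B-Instruct \
    --dataset-name sharegpt \
    --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json

# after: use the subcommand that ships with the installed vllm package
vllm bench serve \
    --backend vllm \
    --model meta-llama/Llama-3.1-8B-Instruct \
    --dataset-name sharegpt \
    --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json

# likewise:
#   benchmark_latency.py    -> vllm bench latency
#   benchmark_throughput.py -> vllm bench throughput
```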
@@ -73,7 +73,7 @@ get_current_llm_serving_engine() {
 echo "Container: vllm"
 # move to a completely irrelevant directory, to avoid import vllm from current folder
 export CURRENT_LLM_SERVING_ENGINE=vllm

 return
 fi
 }
@@ -227,7 +227,7 @@ run_serving_tests() {

 if [[ "$dataset_name" = "sharegpt" ]]; then

-client_command="python3 benchmark_serving.py \
+client_command="vllm bench serve \
 --backend $backend \
 --tokenizer /tokenizer_cache \
 --model $model \
@@ -248,7 +248,7 @@ run_serving_tests() {
 sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
 sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')

-client_command="python3 benchmark_serving.py \
+client_command="vllm bench serve \
 --backend $backend \
 --tokenizer /tokenizer_cache \
 --model $model \
@@ -267,13 +267,13 @@ run_serving_tests() {
 $client_args"

 else

 echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
 exit 1

 fi



 echo "Running test case $test_name with qps $qps"
 echo "Client command: $client_command"
@@ -304,7 +304,7 @@ run_serving_tests() {
 }

 run_genai_perf_tests() {
 # run genai-perf tests

 # $1: a json file specifying genai-perf test cases
 local genai_perf_test_file
@@ -313,14 +313,14 @@ run_genai_perf_tests() {
 # Iterate over genai-perf tests
 jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
 # get the test name, and append the GPU type back to it.
 test_name=$(echo "$params" | jq -r '.test_name')

 # if TEST_SELECTOR is set, only run the test cases that match the selector
 if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
 echo "Skip test case $test_name."
 continue
 fi

 # prepend the current serving engine to the test name
 test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}

@@ -371,10 +371,10 @@ run_genai_perf_tests() {
 qps=$num_prompts
 echo "now qps is $qps"
 fi

 new_test_name=$test_name"_qps_"$qps
 backend=$CURRENT_LLM_SERVING_ENGINE

 if [[ "$backend" == *"vllm"* ]]; then
 backend="vllm"
 fi
@@ -415,7 +415,7 @@ prepare_dataset() {
 do
 cat sonnet.txt >> sonnet_4x.txt
 done

 }

 main() {
@@ -206,7 +206,7 @@ run_latency_tests() {
 fi
 fi

-latency_command=" $latency_envs python3 benchmark_latency.py \
+latency_command=" $latency_envs vllm bench latency \
 --output-json $RESULTS_FOLDER/${test_name}.json \
 $latency_args"

@@ -273,7 +273,7 @@ run_throughput_tests() {
 fi
 fi

-throughput_command=" $throughput_envs python3 benchmark_throughput.py \
+throughput_command=" $throughput_envs vllm bench throughput \
 --output-json $RESULTS_FOLDER/${test_name}.json \
 $throughput_args"

@@ -394,7 +394,7 @@ run_serving_tests() {

 # pass the tensor parallel size to the client so that it can be displayed
 # on the benchmark dashboard
-client_command="python3 benchmark_serving.py \
+client_command="vllm bench serve \
 --save-result \
 --result-dir $RESULTS_FOLDER \
 --result-filename ${new_test_name}.json \
@@ -13,9 +13,9 @@ NUMA_NODE=${NUMA_NODE:-1}
 export CMAKE_BUILD_PARALLEL_LEVEL=32

 # Setup cleanup
 remove_docker_container() {
 set -e;
 docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
 }
 trap remove_docker_container EXIT
 remove_docker_container
@@ -69,7 +69,7 @@ function cpu_tests() {
 docker exec cpu-test-"$NUMA_NODE" bash -c "
 set -e
 pytest -s -v \
 tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"

 # Note: disable it until supports V1
 # Run AWQ test
@@ -83,7 +83,7 @@ function cpu_tests() {
 set -e
 VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
 timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
-python3 benchmarks/benchmark_serving.py \
+vllm bench serve \
 --backend vllm \
 --dataset-name random \
 --model meta-llama/Llama-3.2-3B-Instruct \
@@ -11,10 +11,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.."
 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)

 # run python-based benchmarks and upload the result to buildkite
-python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
+vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
 bench_latency_exit_code=$?

-python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
+vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
 bench_throughput_exit_code=$?

 # run server-based benchmarks and upload the result to buildkite
@@ -24,7 +24,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r

 # wait for server to start, timeout after 600 seconds
 timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
-python3 benchmarks/benchmark_serving.py \
+vllm bench serve \
 --backend vllm \
 --dataset-name sharegpt \
 --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
@@ -77,7 +77,7 @@ done
 echo "run benchmark test..."
 echo "logging to $BM_LOG"
 echo
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
 --backend vllm \
 --model $MODEL \
 --dataset-name sonnet \
@@ -98,7 +98,7 @@ Then run the benchmarking script
 ```bash
 # download dataset
 # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
 --backend vllm \
 --model NousResearch/Hermes-3-Llama-3.1-8B \
 --endpoint /v1/completions \
@@ -111,25 +111,25 @@ If successful, you will see the following output

 ```
 ============ Serving Benchmark Result ============
 Successful requests: 10
 Benchmark duration (s): 5.78
 Total input tokens: 1369
 Total generated tokens: 2212
 Request throughput (req/s): 1.73
 Output token throughput (tok/s): 382.89
 Total Token throughput (tok/s): 619.85
 ---------------Time to First Token----------------
 Mean TTFT (ms): 71.54
 Median TTFT (ms): 73.88
 P99 TTFT (ms): 79.49
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms): 7.91
 Median TPOT (ms): 7.96
 P99 TPOT (ms): 8.03
 ---------------Inter-token Latency----------------
 Mean ITL (ms): 7.74
 Median ITL (ms): 7.70
 P99 ITL (ms): 8.39
 ==================================================
 ```

@@ -141,7 +141,7 @@ If the dataset you want to benchmark is not supported yet in vLLM, even then you
 {"prompt": "What is the capital of India?"}
 {"prompt": "What is the capital of Iran?"}
 {"prompt": "What is the capital of China?"}
 ```

 ```bash
 # start server
@@ -150,7 +150,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests

 ```bash
 # run benchmarking script
-python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
+vllm bench serve --port 9001 --save-result --save-detailed \
 --backend vllm \
 --model meta-llama/Llama-3.1-8B-Instruct \
 --endpoint /v1/completions \
@@ -174,7 +174,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
 ```

 ```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
 --backend openai-chat \
 --model Qwen/Qwen2-VL-7B-Instruct \
 --endpoint /v1/chat/completions \
@@ -194,7 +194,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
 ```

 ``` bash
-python3 benchmarks/benchmark_serving.py \
+vllm bench serve \
 --model meta-llama/Meta-Llama-3-8B-Instruct \
 --dataset-name hf \
 --dataset-path likaixin/InstructCoder \
@@ -210,7 +210,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
 **`lmms-lab/LLaVA-OneVision-Data`**

 ```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
 --backend openai-chat \
 --model Qwen/Qwen2-VL-7B-Instruct \
 --endpoint /v1/chat/completions \
@@ -224,7 +224,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
 **`Aeala/ShareGPT_Vicuna_unfiltered`**

 ```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
 --backend openai-chat \
 --model Qwen/Qwen2-VL-7B-Instruct \
 --endpoint /v1/chat/completions \
@@ -237,7 +237,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
 **`AI-MO/aimo-validation-aime`**

 ``` bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
 --model Qwen/QwQ-32B \
 --dataset-name hf \
 --dataset-path AI-MO/aimo-validation-aime \
@@ -248,7 +248,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
 **`philschmid/mt-bench`**

 ``` bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
 --model Qwen/QwQ-32B \
 --dataset-name hf \
 --dataset-path philschmid/mt-bench \
@@ -261,7 +261,7 @@ When using OpenAI-compatible backends such as `vllm`, optional sampling
 parameters can be specified. Example client command:

 ```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
 --backend vllm \
 --model NousResearch/Hermes-3-Llama-3.1-8B \
 --endpoint /v1/completions \
@@ -296,7 +296,7 @@ The following arguments can be used to control the ramp-up:
 <br/>

 ```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
 --model NousResearch/Hermes-3-Llama-3.1-8B \
 --dataset-name sonnet \
 --dataset-path vllm/benchmarks/sonnet.txt \
@@ -314,7 +314,7 @@ Total num output tokens: 1500
 **VisionArena Benchmark for Vision Language Models**

 ``` bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
 --model Qwen/Qwen2-VL-7B-Instruct \
 --backend vllm-chat \
 --dataset-name hf \
@@ -336,7 +336,7 @@ Total num output tokens: 1280
 ``` bash
 VLLM_WORKER_MULTIPROC_METHOD=spawn \
 VLLM_USE_V1=1 \
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
 --dataset-name=hf \
 --dataset-path=likaixin/InstructCoder \
 --model=meta-llama/Meta-Llama-3-8B-Instruct \
@@ -360,7 +360,7 @@ Total num output tokens: 204800
 **`lmms-lab/LLaVA-OneVision-Data`**

 ```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
 --model Qwen/Qwen2-VL-7B-Instruct \
 --backend vllm-chat \
 --dataset-name hf \
@@ -373,7 +373,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \
 **`Aeala/ShareGPT_Vicuna_unfiltered`**

 ```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
 --model Qwen/Qwen2-VL-7B-Instruct \
 --backend vllm-chat \
 --dataset-name hf \
@@ -385,7 +385,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \
 **`AI-MO/aimo-validation-aime`**

 ```bash
-python3 benchmarks/benchmark_throughput.py \
+vllm bench throughput \
 --model Qwen/QwQ-32B \
 --backend vllm \
 --dataset-name hf \
@@ -399,7 +399,7 @@ python3 benchmarks/benchmark_throughput.py \
 ``` bash
 # download dataset
 # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
 --model meta-llama/Llama-2-7b-hf \
 --backend vllm \
 --dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
@@ -1,6 +1,6 @@
 #!/bin/bash

 # This script aims to tune the best server parameter combinations to maximize throughput for given requirement.
 # See details in README (benchmarks/auto_tune/README.md).

 TAG=$(date +"%Y_%m_%d_%H_%M")
@@ -56,7 +56,7 @@ start_server() {
 local max_num_batched_tokens=$3
 local vllm_log=$4
 local profile_dir=$5

 pkill -f vllm

 VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
@@ -73,9 +73,9 @@ start_server() {

 # wait for 10 minutes...
 server_started=0
 for i in {1..60}; do
 RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
 STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
 if [[ "$STATUS_CODE" -eq 200 ]]; then
 server_started=1
 break
@@ -98,10 +98,10 @@ update_best_profile() {
 selected_profile_file=
 if [[ "$SYSTEM" == "TPU" ]]; then
 selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
 fi
 if [[ "$SYSTEM" == "GPU" ]]; then
 selected_profile_file="${sorted_paths[$profile_index]}"
 fi
 rm -f $PROFILE_PATH/*
 cp $selected_profile_file $PROFILE_PATH
 }
@@ -129,14 +129,14 @@ run_benchmark() {
 echo "server started."
 fi
 echo

 echo "run benchmark test..."
 meet_latency_requirement=0
 # get a basic qps by using request-rate inf
 bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
 prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
 adjusted_input_len=$(( INPUT_LEN - prefix_len ))
-python3 benchmarks/benchmark_serving.py \
+vllm bench serve \
 --backend vllm \
 --model $MODEL \
 --dataset-name random \
@@ -169,7 +169,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
 curl -X POST http://0.0.0.0:8004/reset_prefix_cache
 sleep 5
 bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
-python3 benchmarks/benchmark_serving.py \
+vllm bench serve \
 --backend vllm \
 --model $MODEL \
 --dataset-name random \
@@ -11,6 +11,7 @@ from typing import Any, Optional

 import numpy as np
 from tqdm import tqdm
+from typing_extensions import deprecated

 import vllm.envs as envs
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
@@ -34,6 +35,10 @@ def save_to_pytorch_benchmark_format(
     write_to_json(pt_file, pt_records)


+@deprecated(
+    "benchmark_latency.py is deprecated and will be removed in a "
+    "future version. Please use 'vllm bench latency' instead.",
+)
 def main(args: argparse.Namespace):
     print(args)

@@ -38,6 +38,7 @@ from typing import Any, Literal, Optional
 import numpy as np
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
+from typing_extensions import deprecated

 from backend_request_func import (
     ASYNC_REQUEST_FUNCS,
@@ -593,6 +594,10 @@ def save_to_pytorch_benchmark_format(
     write_to_json(pt_file, pt_records)


+@deprecated(
+    "benchmark_serving.py is deprecated and will be removed in a future "
+    "version. Please use 'vllm bench serve' instead.",
+)
 def main(args: argparse.Namespace):
     print(args)
     random.seed(args.seed)
@@ -15,6 +15,7 @@ import torch
 import uvloop
 from tqdm import tqdm
 from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
+from typing_extensions import deprecated

 from benchmark_dataset import (
     AIMODataset,
@@ -382,6 +383,10 @@ def get_requests(args, tokenizer):
     return dataset_cls(**common_kwargs).sample(**sample_kwargs)


+@deprecated(
+    "benchmark_throughput.py is deprecated and will be removed in a "
+    "future version. Please use 'vllm bench throughput' instead.",
+)
 def main(args: argparse.Namespace):
     if args.seed is None:
         args.seed = 0
@@ -38,7 +38,7 @@ VLLM_TORCH_PROFILER_DIR=./vllm_profile \
 benchmark_serving.py:

 ```bash
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
 --backend vllm \
 --model meta-llama/Meta-Llama-3-70B \
 --dataset-name sharegpt \
@@ -75,7 +75,7 @@ The following is an example using the `benchmarks/benchmark_latency.py` script:
 nsys profile -o report.nsys-rep \
 --trace-fork-before-exec=true \
 --cuda-graph-trace=node \
-python benchmarks/benchmark_latency.py \
+vllm bench latency \
 --model meta-llama/Llama-3.1-8B-Instruct \
 --num-iters-warmup 5 \
 --num-iters 1 \
@@ -98,7 +98,7 @@ nsys profile -o report.nsys-rep \
 vllm serve meta-llama/Llama-3.1-8B-Instruct

 # client
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
 --backend vllm \
 --model meta-llama/Llama-3.1-8B-Instruct \
 --num-prompts 1 \
@@ -132,7 +132,7 @@ You can view these profiles either as summaries in the CLI, using `nsys stats [p
 ...
 ** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):

 Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
 -------- --------------- --------- ----------- ----------- -------- --------- ----------- ----------------------------------------------------------------------------------------------------
 46.3 10,327,352,338 17,505 589,965.9 144,383.0 27,040 3,126,460 944,263.8 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize128x128x64_warpgroupsize1x1x1_execute_segment_k_of…
 14.8 3,305,114,764 5,152 641,520.7 293,408.0 287,296 2,822,716 867,124.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize256x128x64_warpgroupsize2x1x1_execute_segment_k_of…
@@ -143,7 +143,7 @@ You can view these profiles either as summaries in the CLI, using `nsys stats [p
 2.6 587,283,113 37,824 15,526.7 3,008.0 2,719 2,517,756 139,091.1 std::enable_if<T2>(int)0&&vllm::_typeConvert<T1>::exists, void>::type vllm::fused_add_rms_norm_kern…
 1.9 418,362,605 18,912 22,121.5 3,871.0 3,328 2,523,870 175,248.2 void vllm::rotary_embedding_kernel<c10::BFloat16, (bool)1>(const long *, T1 *, T1 *, const T1 *, in…
 0.7 167,083,069 18,880 8,849.7 2,240.0 1,471 2,499,996 101,436.1 void vllm::reshape_and_cache_flash_kernel<__nv_bfloat16, __nv_bfloat16, (vllm::Fp8KVCacheDataType)0…
 ...
 ```

 GUI example:
@@ -3,14 +3,14 @@ An implementation of xPyD with dynamic scaling based on point-to-point communica
 # Detailed Design

 ## Overall Process
 As shown in Figure 1, the overall process of this **PD disaggregation** solution is described through a request flow:

 1. The client sends an HTTP request to the Proxy/Router's `/v1/completions` interface.
 2. The Proxy/Router selects a **1P1D (1 Prefill instance + 1 Decode instance)** through either through round-robin or random selection, generates a `request_id` (rules to be introduced later), modifies the `max_tokens` in the HTTP request message to **1**, and then forwards the request to the **P instance**.
 3. Immediately afterward, the Proxy/Router forwards the **original HTTP request** to the **D instance**.
 4. The **P instance** performs **Prefill** and then **actively sends the generated KV cache** to the D instance (using **PUT_ASYNC** mode). The D instance's `zmq_addr` can be resolved through the `request_id`.
 5. The **D instance** has a **dedicated thread** for receiving the KV cache (to avoid blocking the main process). The received KV cache is saved into the **GPU memory buffer**, the size of which is determined by the vLLM startup parameter `kv_buffer_size`. When the GPU buffer is full, the KV cache is stored in the **local Tensor memory pool**.
 6. During the **Decode**, the D instance's main process retrieves the KV cache (transmitted by the P instance) from either the **GPU buffer** or the **memory pool**, thereby **skipping Prefill**.
 7. After completing **Decode**, the D instance returns the result to the **Proxy/Router**, which then forwards it to the **client**.

 
@@ -291,7 +291,7 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \
 ??? console "Command"

 ```shell
-python3 benchmark_serving.py \
+vllm bench serve \
 --backend vllm \
 --model base_model \
 --tokenizer meta-llama/Llama-3.1-8B-Instruct \
@@ -29,7 +29,7 @@ PROXY_PORT=${PROXY_PORT:-30001}
 PREFILL_GPUS=${PREFILL_GPUS:-0}
 DECODE_GPUS=${DECODE_GPUS:-1,2,3}
 PREFILL_PORTS=${PREFILL_PORTS:-20003}
 DECODE_PORTS=${DECODE_PORTS:-20005,20007,20009}

 echo "Warning: P2P NCCL disaggregated prefill XpYd support for vLLM v1 is experimental and subject to change."
 echo ""
@@ -164,7 +164,7 @@ main() {
 local gpu_id=${PREFILL_GPU_ARRAY[$i]}
 local port=${PREFILL_PORT_ARRAY[$i]}
 local kv_port=$((21001 + i))

 echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
 CUDA_VISIBLE_DEVICES=$gpu_id VLLM_USE_V1=1 vllm serve $MODEL \
 --enforce-eager \
@@ -193,7 +193,7 @@ main() {
 local gpu_id=${DECODE_GPU_ARRAY[$i]}
 local port=${DECODE_PORT_ARRAY[$i]}
 local kv_port=$((22001 + i))

 echo " Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
 VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
 --enforce-eager \
@@ -233,7 +233,7 @@ main() {
 # Run Benchmark
 # =============================================================================
 cd ../../../benchmarks/
-python3 benchmark_serving.py --port 10001 --seed $(date +%s) \
+vllm bench serve --port 10001 --seed $(date +%s) \
 --model $MODEL \
 --dataset-name random --random-input-len 7500 --random-output-len 200 \
 --num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log
@@ -243,4 +243,4 @@ main() {
 cleanup
 }

 main
@@ -122,7 +122,7 @@ main() {

 # begin benchmark
 cd ../../../../benchmarks/
-python3 benchmark_serving.py --port 9000 --seed $(date +%s) \
+vllm bench serve --port 9000 --seed $(date +%s) \
 --model meta-llama/Llama-3.1-8B-Instruct \
 --dataset-name random --random-input-len 7500 --random-output-len 200 \
 --num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log
@@ -133,4 +133,4 @@ main() {

 }

 main