mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-08 01:27:57 +08:00
[vLLM Benchmark Suite] Add default parameters section and update CPU benchmark cases (#29381)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com> Signed-off-by: Louie Tsai <louie.tsai@intel.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Li, Jiang <bigpyj64@gmail.com>
This commit is contained in:
parent
70fb77b4dc
commit
8bbcf8b6e7
@ -108,6 +108,65 @@ The number of this test is less stable compared to the delay and latency benchma
|
|||||||
|
|
||||||
WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
|
WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
|
||||||
|
|
||||||
|
#### Default Parameters Field
|
||||||
|
|
||||||
|
Default parameters can be specified under a top-level `defaults` key in the JSON file. Parameters defined there are applied to every serving test, and any of them can be overridden by the corresponding field of an individual test case (as `tensor_parallel_size` is in the example below). Here is an example:
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>An example of the default parameters field</summary>
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"defaults": {
|
||||||
|
"qps_list": [
|
||||||
|
"inf"
|
||||||
|
],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"block_size": 128,
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128,
|
||||||
|
"num_prompts": 200,
|
||||||
|
"ignore-eos": ""
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tests": [
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama3B_tp2_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
||||||
|
"tensor_parallel_size": 2
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.2-3B-Instruct"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_qwen3_tp4_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "Qwen/Qwen3-14B",
|
||||||
|
"tensor_parallel_size": 4
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "Qwen/Qwen3-14B"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
### Visualizing the results
|
### Visualizing the results
|
||||||
|
|
||||||
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
|
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
|
||||||
|
|||||||
@ -110,7 +110,8 @@ json2envs() {
|
|||||||
wait_for_server() {
|
wait_for_server() {
|
||||||
# wait for vllm server to start
|
# wait for vllm server to start
|
||||||
# return 1 if vllm server crashes
|
# return 1 if vllm server crashes
|
||||||
timeout 1200 bash -c '
|
local timeout_val="1200"
|
||||||
|
timeout "$timeout_val" bash -c '
|
||||||
until curl -X POST localhost:8000/v1/completions; do
|
until curl -X POST localhost:8000/v1/completions; do
|
||||||
sleep 1
|
sleep 1
|
||||||
done' && return 0 || return 1
|
done' && return 0 || return 1
|
||||||
@ -316,12 +317,44 @@ run_throughput_tests() {
|
|||||||
run_serving_tests() {
|
run_serving_tests() {
|
||||||
# run serving tests using `vllm bench serve` command
|
# run serving tests using `vllm bench serve` command
|
||||||
# $1: a json file specifying serving test cases
|
# $1: a json file specifying serving test cases
|
||||||
|
#
|
||||||
|
# Supported JSON formats:
|
||||||
|
# 1) Plain format: top-level array
|
||||||
|
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
|
||||||
|
#
|
||||||
|
# 2) Default parameters field + plain format tests
|
||||||
|
# {
|
||||||
|
# "defaults": { ... },
|
||||||
|
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
|
||||||
|
# }
|
||||||
|
|
||||||
local serving_test_file
|
local serving_test_file
|
||||||
serving_test_file=$1
|
serving_test_file=$1
|
||||||
|
|
||||||
# Iterate over serving tests
|
# Iterate over serving tests
|
||||||
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
jq -c '
|
||||||
|
if type == "array" then
|
||||||
|
# Plain format: test cases array
|
||||||
|
.[]
|
||||||
|
elif (type == "object" and has("tests")) then
|
||||||
|
# merge the default parameters into each test case
|
||||||
|
. as $root
|
||||||
|
| ($root.defaults // {}) as $d
|
||||||
|
| ($root.tests // [])[]
|
||||||
|
# default qps / max_concurrency from defaults if missing
|
||||||
|
| .qps_list = (.qps_list // $d.qps_list)
|
||||||
|
| .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list)
|
||||||
|
# merge envs / params: test overrides defaults
|
||||||
|
| .server_environment_variables =
|
||||||
|
(($d.server_environment_variables // {}) + (.server_environment_variables // {}))
|
||||||
|
| .server_parameters =
|
||||||
|
(($d.server_parameters // {}) + (.server_parameters // {}))
|
||||||
|
| .client_parameters =
|
||||||
|
(($d.client_parameters // {}) + (.client_parameters // {}))
|
||||||
|
else
|
||||||
|
error("Unsupported serving test file format: must be array or object with .tests")
|
||||||
|
end
|
||||||
|
' "$serving_test_file" | while read -r params; do
|
||||||
# get the test name, and append the GPU type back to it.
|
# get the test name, and append the GPU type back to it.
|
||||||
test_name=$(echo "$params" | jq -r '.test_name')
|
test_name=$(echo "$params" | jq -r '.test_name')
|
||||||
if [[ ! "$test_name" =~ ^serving_ ]]; then
|
if [[ ! "$test_name" =~ ^serving_ ]]; then
|
||||||
@ -335,20 +368,25 @@ run_serving_tests() {
|
|||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# get client and server arguments
|
# get client and server arguments (after merging in the default parameters)
|
||||||
server_params=$(echo "$params" | jq -r '.server_parameters')
|
server_params=$(echo "$params" | jq -r '.server_parameters')
|
||||||
server_envs=$(echo "$params" | jq -r '.server_environment_variables')
|
server_envs=$(echo "$params" | jq -r '.server_environment_variables')
|
||||||
client_params=$(echo "$params" | jq -r '.client_parameters')
|
client_params=$(echo "$params" | jq -r '.client_parameters')
|
||||||
|
|
||||||
server_args=$(json2args "$server_params")
|
server_args=$(json2args "$server_params")
|
||||||
server_envs=$(json2envs "$server_envs")
|
server_envs=$(json2envs "$server_envs")
|
||||||
client_args=$(json2args "$client_params")
|
client_args=$(json2args "$client_params")
|
||||||
|
|
||||||
|
# qps_list
|
||||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||||
echo "Running over qps list $qps_list"
|
echo "Running over qps list $qps_list"
|
||||||
|
|
||||||
|
# max_concurrency_list (fallback to num_prompts if missing)
|
||||||
max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
|
max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
|
||||||
if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
|
if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
|
||||||
num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
|
num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
|
||||||
max_concurrency_list="[$num_prompts]"
|
max_concurrency_list="[$num_prompts]"
|
||||||
fi
|
fi
|
||||||
max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
|
max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
|
||||||
echo "Running over max concurrency list $max_concurrency_list"
|
echo "Running over max concurrency list $max_concurrency_list"
|
||||||
|
|||||||
@ -1,610 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp1_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp2_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp4_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp1_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp2_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp4_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp1_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp2_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp4_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp1_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp2_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp4_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp1_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp2_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp4_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp1_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp2_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp4_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
File diff suppressed because it is too large
Load Diff
@ -1,276 +1,246 @@
|
|||||||
[
|
{
|
||||||
{
|
"defaults": {
|
||||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
"qps_list": [
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
"inf"
|
||||||
"max_concurrency_list": [32],
|
],
|
||||||
"server_environment_variables": {
|
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"server_environment_variables": {
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
},
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 32
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
{
|
"server_parameters": {
|
||||||
"test_name": "serving_llama8B_tp2_sharegpt",
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
"tensor_parallel_size": 1,
|
||||||
"max_concurrency_list": [32],
|
"dtype": "bfloat16",
|
||||||
"server_environment_variables": {
|
"distributed_executor_backend": "mp",
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"block_size": 128,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"trust_remote_code": "",
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
"disable_log_stats": "",
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
"enforce_eager": "",
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
"max_num_batched_tokens": 2048,
|
||||||
},
|
"max_num_seqs": 256,
|
||||||
"server_parameters": {
|
"load_format": "dummy"
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 32
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
{
|
"client_parameters": {
|
||||||
"test_name": "serving_llama8B_tp1_random_128_128",
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
"backend": "vllm",
|
||||||
"max_concurrency_list": [32],
|
"ignore-eos": "",
|
||||||
"server_environment_variables": {
|
"num_prompts": 200
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 32
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_random_128_128",
|
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
|
||||||
"max_concurrency_list": [32],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 32
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_random_128_2048",
|
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
|
||||||
"max_concurrency_list": [32],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 2048,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 32
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_random_128_2048",
|
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
|
||||||
"max_concurrency_list": [32],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 2048,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 32
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_random_2048_128",
|
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
|
||||||
"max_concurrency_list": [32],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 2048,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 32
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_random_2048_128",
|
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
|
||||||
"max_concurrency_list": [32],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 2048,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 32
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
]
|
},
|
||||||
|
"tests": [
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp2_sharegpt",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 2
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp1_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp2_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 2
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp4_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 4
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp1_random_128_2048",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 2048
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp2_random_128_2048",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 2
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 2048
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp4_random_128_2048",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 4
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 2048
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp1_random_2048_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 2048,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp2_random_2048_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 2
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 2048,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp4_random_2048_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 4
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 2048,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama3B_tp1_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_granite2B_tp1_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "ibm-granite/granite-3.2-2b-instruct",
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "ibm-granite/granite-3.2-2b-instruct",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_qwen1.7B_tp1_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "Qwen/Qwen3-1.7B",
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "Qwen/Qwen3-1.7B",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_qwen4B_tp1_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "Qwen/Qwen3-4B",
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "Qwen/Qwen3-4B",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_qwen8B_tp1_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "Qwen/Qwen3-8B",
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "Qwen/Qwen3-8B",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_glm9B_tp1_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "zai-org/glm-4-9b-hf",
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "zai-org/glm-4-9b-hf",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_gemma7B_tp1_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "google/gemma-7b",
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "google/gemma-7b",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|||||||
@ -138,6 +138,35 @@ vllm serve facebook/opt-125m --dtype=bfloat16
|
|||||||
|
|
||||||
Note: it is recommended to manually reserve 1 CPU for the vLLM front-end process when `world_size == 1`.
|
Note: it is recommended to manually reserve 1 CPU for the vLLM front-end process when `world_size == 1`.
|
||||||
|
|
||||||
|
### What are supported models on CPU?
|
||||||
|
|
||||||
|
For the full and up-to-date list of models validated on CPU platforms, please see the official documentation: [Supported Models on CPU](https://docs.vllm.ai/en/latest/models/hardware_supported_models/cpu)
|
||||||
|
|
||||||
|
### How to find benchmark configuration examples for supported CPU models?
|
||||||
|
|
||||||
|
For any model listed under [Supported Models on CPU](https://docs.vllm.ai/en/latest/models/hardware_supported_models/cpu), optimized runtime configurations are provided in the vLLM Benchmark Suite's [CPU test cases](https://github.com/vllm-project/vllm/blob/main/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json).
|
||||||
|
For details on how these optimized configurations are determined, see: [performance-benchmark-details](https://github.com/vllm-project/vllm/tree/main/.buildkite/performance-benchmarks#performance-benchmark-details).
|
||||||
|
To benchmark the supported models using these optimized settings, follow the steps in [running vLLM Benchmark Suite manually](https://docs.vllm.ai/en/latest/contributing/benchmarks/#manually-trigger-the-benchmark) and run the Benchmark Suite on a CPU environment.
|
||||||
|
|
||||||
|
Below is an example command to benchmark all CPU-supported models using optimized configurations.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ON_CPU=1 bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
The benchmark results will be saved in `./benchmark/results/`.
|
||||||
|
In the directory, the generated `.commands` files contain all example commands for the benchmark.
|
||||||
|
|
||||||
|
We recommend setting `--tensor-parallel-size` to match the number of NUMA nodes on your system. Note that the current release does not support `--tensor-parallel-size=6`.
|
||||||
|
To determine the number of NUMA nodes available, use the following command:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
lscpu | grep "NUMA node(s):" | awk '{print $3}'
|
||||||
|
```
|
||||||
|
|
||||||
|
For performance reference, users may also consult the [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm&deviceName=cpu), which publishes default-model CPU results produced using the same Benchmark Suite.
|
||||||
|
|
||||||
### How to decide `VLLM_CPU_OMP_THREADS_BIND`?
|
### How to decide `VLLM_CPU_OMP_THREADS_BIND`?
|
||||||
|
|
||||||
- Default `auto` thread-binding is recommended for most cases. Ideally, each OpenMP thread will be bound to a dedicated physical core respectively, threads of each rank will be bound to the same NUMA node respectively, and 1 CPU per rank will be reserved for other vLLM components when `world_size > 1`. If you have any performance problems or unexpected binding behaviours, please try to bind threads as follows.
|
- Default `auto` thread-binding is recommended for most cases. Ideally, each OpenMP thread will be bound to a dedicated physical core respectively, threads of each rank will be bound to the same NUMA node respectively, and 1 CPU per rank will be reserved for other vLLM components when `world_size > 1`. If you have any performance problems or unexpected binding behaviours, please try to bind threads as follows.
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user