From 6f8d26188200385fa994526b10b5858a3da1ede7 Mon Sep 17 00:00:00 2001
From: Louie Tsai
Date: Tue, 29 Jul 2025 22:57:03 -0700
Subject: [PATCH] Update vLLM Benchmark Suite for Xeon based on 0.9.2 release
 (#21486)

Signed-off-by: Tsai, Louie
---
 .../convert-results-json-to-markdown.py       |   1 +
 .../scripts/run-performance-benchmarks.sh     |   2 +-
 .../tests/serving-tests-cpu-snc2.json         | 209 +++++++++++++++++
 .../tests/serving-tests-cpu-snc3.json         | 211 ++++++++++++++++++
 .../tests/serving-tests-cpu.json              |  15 ++
 5 files changed, 437 insertions(+), 1 deletion(-)
 create mode 100644 .buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
 create mode 100644 .buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json

diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
index 05623879c0c2c..554256b4bdb8b 100644
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -44,6 +44,7 @@ serving_column_mapping = {
     "test_name": "Test name",
     "gpu_type": "GPU",
     "completed": "# of req.",
+    "max_concurrency": "# of max concurrency.",
     "request_throughput": "Tput (req/s)",
     "total_token_throughput": "Total Token Tput (tok/s)",
     "output_throughput": "Output Tput (tok/s)",
diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
index b515ee43934d1..2c57666a81aa3 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -33,7 +33,7 @@ check_gpus() {
 
 check_cpus() {
   # check the number of CPUs and NUMA Node and GPU type.
-  declare -g numa_count=$(python3 -c "from numa import info;numa_size = info.get_num_configured_nodes(); print(numa_size)")
+  declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
   if [[ $numa_count -gt 0 ]]; then
     echo "NUMA found."
     echo $numa_count
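The NUMA check above now parses lscpu output instead of requiring the optional py-libnuma bindings. For reference, a rough standalone Python equivalent of the new pipeline (a hypothetical helper, not part of this patch; it assumes lscpu emits an English "NUMA node(s):" line, so run under LC_ALL=C if your locale differs):

import subprocess

def numa_node_count() -> int:
    # Mirrors `lscpu | grep "NUMA node(s):" | awk '{print $3}'`.
    out = subprocess.run(["lscpu"], capture_output=True, text=True,
                         check=True).stdout
    for line in out.splitlines():
        if line.startswith("NUMA node(s):"):
            return int(line.split(":", 1)[1].strip())
    return 0  # corresponds to the script's "no NUMA found" branch

if __name__ == "__main__":
    print(numa_node_count())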
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
new file mode 100644
index 0000000000000..a144b4420fbf1
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
@@ -0,0 +1,209 @@
+[
+    {
+        "test_name": "serving_llama8B_tp1_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "max_concurrency": 60,
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_tp2_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 2,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "max_concurrency": 60,
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_tp4_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 4,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "max_concurrency": 60,
+            "num_prompts": 200
+        }
+    },
"distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 1000, + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_tp2_random_128_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 1000, + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_tp4_random_128_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 1000, + "num_prompts": 1000 + } + } +] diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json new file mode 100644 index 0000000000000..e6e69b63b74df --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json @@ -0,0 +1,211 @@ +[ + { + "test_name": "serving_llama8B_pp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "pipeline_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": 
"meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "max_concurrency": 60, + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_pp3_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "max_concurrency": 60, + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp2pp6_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "max_concurrency": 60, + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_pp1_random_128_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "pipeline_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 1000, + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_pp3_random_128_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL:": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + 
"block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 1000, + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_tp2pp3_random_128_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 1000, + "num_prompts": 1000 + } + } +] diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json index 22f71c993ff33..ce1f924de387f 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json @@ -6,6 +6,7 @@ "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { @@ -18,6 +19,8 @@ "disable_log_stats": "", "disable_log_requests": "", "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { @@ -36,6 +39,7 @@ "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { @@ -48,6 +52,8 @@ "disable_log_stats": "", "disable_log_requests": "", "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { @@ -66,6 +72,7 @@ "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { @@ -78,6 +85,8 @@ "disable_log_stats": "", "disable_log_requests": "", "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { @@ -96,6 +105,7 @@ "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { @@ -109,6 +119,8 @@ "disable_log_stats": "", "disable_log_requests": "", "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { @@ -129,6 +141,7 @@ "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 
@@ -129,6 +141,7 @@
             "VLLM_RPC_TIMEOUT": 100000,
             "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
             "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
             "VLLM_CPU_KVCACHE_SPACE": 40
         },
         "server_parameters": {
@@ -142,6 +155,8 @@
             "disable_log_stats": "",
             "disable_log_requests": "",
             "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
             "load_format": "dummy"
         },
         "client_parameters": {
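To reproduce one of these serving tests by hand, the server_environment_variables go to the server process and the server_parameters become vllm serve flags. A hedged end-to-end sketch using values from the updated serving-tests-cpu.json (the launch details here are illustrative; the suite's own scripts remain the source of truth):

import os
import subprocess

env = dict(os.environ)
env.update({
    "VLLM_CPU_SGL_KERNEL": "1",      # newly enabled across these tests
    "VLLM_CPU_KVCACHE_SPACE": "40",  # GiB reserved for the KV cache
})
subprocess.run(
    [
        "vllm", "serve", "meta-llama/Meta-Llama-3.1-8B-Instruct",
        "--dtype", "bfloat16",
        "--max-num-batched-tokens", "2048",
        "--max-num-seqs", "256",
        "--load-format", "dummy",  # dummy weights: throughput, not quality
    ],
    env=env,
    check=True,
)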