diff --git a/.buildkite/performance-benchmarks/README.md b/.buildkite/performance-benchmarks/README.md index 015f48c2520d6..64a262c6cb401 100644 --- a/.buildkite/performance-benchmarks/README.md +++ b/.buildkite/performance-benchmarks/README.md @@ -7,7 +7,7 @@ vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](http ## Performance benchmark quick overview -**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models. +**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors, Intel® Gaudi® 3 Accelerators and Arm® Neoverse™ with different models. **Benchmarking Duration**: about 1hr. @@ -23,7 +23,7 @@ bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh Runtime environment variables: -- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0. +- `ON_CPU`: set the value to '1' on Intel® Xeon® and Arm® Neoverse™ Processors. Default value is 0. - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file). - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file). - `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file). @@ -34,8 +34,9 @@ Runtime environment variables: See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases. > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead. -For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead. 
-> +> For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead. +> For Arm® Neoverse™, use `tests/latency-tests-arm64-cpu.json`, `tests/throughput-tests-arm64-cpu.json`, `tests/serving-tests-arm64-cpu.json` instead. + ### Latency test Here is an example of one test inside `latency-tests.json`: diff --git a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh old mode 100644 new mode 100755 index 34ceefe0996f2..6b6a7e472b9c8 --- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh @@ -49,7 +49,11 @@ check_cpus() { echo "Need at least 1 NUMA to run benchmarking." exit 1 fi - declare -g gpu_type="cpu" + if [[ "$(uname -m)" == "aarch64" ]] || [[ "$(uname -m)" == "arm64" ]]; then + declare -g gpu_type="arm64-cpu" + else + declare -g gpu_type="cpu" + fi echo "GPU type is $gpu_type" } @@ -207,8 +211,8 @@ run_latency_tests() { # check if there is enough GPU to run the test tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size') - if [ "$ON_CPU" == "1" ]; then - pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size') + if [[ "$ON_CPU" == "1" ]]; then + pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1') world_size=$(($tp*$pp)) if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." 
@@ -276,8 +280,8 @@ run_throughput_tests() { # check if there is enough GPU to run the test tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size') - if [ "$ON_CPU" == "1" ]; then - pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size') + if [[ "$ON_CPU" == "1" ]]; then + pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1') world_size=$(($tp*$pp)) if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." @@ -393,8 +397,8 @@ run_serving_tests() { # check if there is enough resources to run the test tp=$(echo "$server_params" | jq -r '.tensor_parallel_size') - if [ "$ON_CPU" == "1" ]; then - pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size') + if [[ "$ON_CPU" == "1" ]]; then + pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size // 1') world_size=$(($tp*$pp)) if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." 
@@ -496,9 +500,9 @@ run_serving_tests() { main() { local ARCH ARCH='' - if [ "$ON_CPU" == "1" ];then - check_cpus - ARCH='-cpu' + if [[ "$ON_CPU" == "1" ]]; then + check_cpus + ARCH="-$gpu_type" else check_gpus ARCH="$arch_suffix" diff --git a/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json b/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json new file mode 100644 index 0000000000000..fba695041e3ee --- /dev/null +++ b/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json @@ -0,0 +1,26 @@ +[ + { + "test_name": "latency_llama8B_tp1", + "environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "num_iters_warmup": 5, + "num_iters": 15 + } + } +] diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json new file mode 100644 index 0000000000000..63f1f8ab887b3 --- /dev/null +++ b/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json @@ -0,0 +1,130 @@ +{ + "defaults": { + "qps_list": [ + "inf" + ], + "max_concurrency_list": [ + 12, + 16, + 24, + 32, + 64, + 128, + 200 + ], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + 
"trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "ignore-eos": "", + "num_prompts": 200 + } + }, + "tests": [ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json" + } + }, + { + "test_name": "serving_llama8B_tp2_sharegpt", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json" + } + }, + { + "test_name": "serving_llama8B_tp1_random_128_128", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp2_random_128_128", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp1_random_128_2048", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048 + } + }, + { + "test_name": "serving_llama8B_tp2_random_128_2048", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048 + } + }, + { + "test_name": "serving_llama8B_tp1_random_2048_128", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128 + } + }, + { + "test_name": 
"serving_llama8B_tp2_random_2048_128", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128 + } + } + ] +} \ No newline at end of file diff --git a/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json b/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json new file mode 100644 index 0000000000000..da84dd4d0c67a --- /dev/null +++ b/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json @@ -0,0 +1,27 @@ +[ + { + "test_name": "throughput_llama8B_tp1", + "environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + } +] diff --git a/docs/benchmarking/dashboard.md b/docs/benchmarking/dashboard.md index 7cc4d23250df9..4cbc1a6a0a4fb 100644 --- a/docs/benchmarking/dashboard.md +++ b/docs/benchmarking/dashboard.md @@ -8,12 +8,19 @@ The results are automatically published to the public [vLLM Performance Dashboar ## Manually Trigger the benchmark Use [vllm-ci-test-repo images](https://gallery.ecr.aws/q9t5s3a7/vllm-ci-test-repo) with vLLM benchmark suite. -For CPU environment, please use the image with "-cpu" postfix. +For x86 CPU environment, please use the image with "-cpu" postfix. For AArch64 CPU environment, please use the image with "-arm64-cpu" postfix. -Here is an example for docker run command for CPU. +Here is an example for docker run command for CPU. 
For GPUs skip setting the `ON_CPU` env var. ```bash -docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface -e HF_TOKEN='' --shm-size=16g --name vllm-cpu-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:1da94e673c257373280026f75ceb4effac80e892-cpu +export VLLM_COMMIT=1da94e673c257373280026f75ceb4effac80e892 # use full commit hash from the main branch +export HF_TOKEN= +if [[ "$(uname -m)" == aarch64 || "$(uname -m)" == arm64 ]]; then + IMG_SUFFIX="arm64-cpu" +else + IMG_SUFFIX="cpu" +fi +docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface -e HF_TOKEN=$HF_TOKEN -e ON_CPU=1 --shm-size=16g --name vllm-cpu-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${VLLM_COMMIT}-${IMG_SUFFIX} ``` Then, run below command inside the docker instance. @@ -26,7 +33,7 @@ When run, benchmark script generates results under **benchmark/results** folder, ### Runtime environment variables -- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0. +- `ON_CPU`: set the value to '1' on Intel® Xeon® and Arm® Neoverse™ Processors. Default value is 0. - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file). - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file). - `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file).