diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md
index 72c52d5bb5e9b..cdf6a645147e5 100644
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@@ -11,7 +11,7 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performanc
 
 ## Performance benchmark quick overview
 
-**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!), with different models.
+**Benchmarking Coverage**: latency, throughput and fixed-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models.
 
 **Benchmarking Duration**: about 1hr.
 
@@ -31,13 +31,27 @@ Performance benchmark will be triggered when:
 - A PR being merged into vllm.
 - Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
 
+To manually trigger the benchmark, run:
+
+```bash
+bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+```
+
+Runtime environment variables:
+- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
+- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
+- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
+- `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file).
+- `REMOTE_HOST`: IP of the remote vLLM service to benchmark. Default value is empty string.
+- `REMOTE_PORT`: Port of the remote vLLM service to benchmark. Default value is empty string.
+
 Nightly benchmark will be triggered when:
 - Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
 
 ## Performance benchmark details
 
 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
-
+> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
 ### Latency test
 
 Here is an example of one test inside `latency-tests.json`:
@@ -119,6 +133,30 @@ If you do not see the table, please wait till the benchmark finish running.
 The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
 The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
 
+The `compare-json-results.py` script compares benchmark results JSON files that were converted with `convert-results-json-to-markdown.py`.
+When run, the benchmark script generates results under the `benchmark/results` folder, along with `benchmark_results.md` and `benchmark_results.json`.
+`compare-json-results.py` takes two `benchmark_results.json` files and reports the performance ratio for metrics such as Output Tput, Median TTFT and Median TPOT.
+
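For reference, here is a minimal sketch (an illustration, not the actual script) of the comparison it performs, assuming hypothetical folders `results_a/` and `results_b/` that each contain a converted `benchmark_results.json` with the serving columns (`Test name`, `P99`, `Output Tput (tok/s)`):

```python
# Minimal sketch of the comparison logic (illustration only; paths are hypothetical).
import pandas as pd

a = pd.read_json("results_a/benchmark_results.json")
b = pd.read_json("results_b/benchmark_results.json")

# Keep only serving rows, mirroring the script's dropna on the "P99" column.
a = a.dropna(subset=["P99"], ignore_index=True)
b = b.dropna(subset=["P99"], ignore_index=True)

metric = "Output Tput (tok/s)"
ratio = (b[metric] / a[metric]).rename("perf_ratio")  # > 1.0 means the second run is faster
print(pd.concat([a["Test name"], a[metric], b[metric], ratio], axis=1))
```

The actual script repeats this for several metrics and writes the comparison tables to `perf_comparison.html`.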
+Here is an example using the script to compare results_a and results_b without the test name columns (`--ignore_test_name`).
+`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json --ignore_test_name`
+
+|    | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
+|----|----------------------------------------|----------------------------------------|----------|
+| 0  | 142.633982 | 156.526018 | 1.097396 |
+| 1  | 241.620334 | 294.018783 | 1.216863 |
+| 2  | 218.298905 | 262.664916 | 1.203235 |
+| 3  | 242.743860 | 299.816190 | 1.235113 |
+
+Here is an example using the script to compare results_a and results_b with the test name columns.
+`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
+|   | results_a/benchmark_results.json_name | results_a/benchmark_results.json | results_b/benchmark_results.json_name | results_b/benchmark_results.json | perf_ratio |
+|---|---------------------------------------------|----------------------------------------|---------------------------------------------|----------------------------------------|----------|
+| 0 | serving_llama8B_tp1_sharegpt_qps_1 | 142.633982 | serving_llama8B_tp1_sharegpt_qps_1 | 156.526018 | 1.097396 |
+| 1 | serving_llama8B_tp1_sharegpt_qps_16 | 241.620334 | serving_llama8B_tp1_sharegpt_qps_16 | 294.018783 | 1.216863 |
+| 2 | serving_llama8B_tp1_sharegpt_qps_4 | 218.298905 | serving_llama8B_tp1_sharegpt_qps_4 | 262.664916 | 1.203235 |
+| 3 | serving_llama8B_tp1_sharegpt_qps_inf | 242.743860 | serving_llama8B_tp1_sharegpt_qps_inf | 299.816190 | 1.235113 |
+| 4 | serving_llama8B_tp2_random_1024_128_qps_1 | 96.613390 | serving_llama8B_tp4_random_1024_128_qps_1 | 108.404853 | 1.122048 |
+
 ## Nightly test details
 
 See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
diff --git a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
index cacaef986c658..a1f8441ccdac8 100644
--- a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
+++ b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
@@ -4,7 +4,8 @@
 - Input length: 32 tokens.
 - Output length: 128 tokens.
 - Batch size: fixed (8).
-- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- CPU Models: llama-3.1 8B.
 - Evaluation metrics: end-to-end latency (mean, median, p99).
 
 {latency_tests_markdown_table}
@@ -14,7 +15,8 @@
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm to achieve maximum throughput.
-- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- CPU Models: llama-3.1 8B.
 - Evaluation metrics: throughput.
 
 {throughput_tests_markdown_table}
@@ -25,12 +27,18 @@
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
 - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
-- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
-- We also added a speculative decoding test for llama-3 70B, under QPS 2 +- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. +- We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2 +- CPU Models: llama-3.1 8B. - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99). +- For CPU, we added random dataset tests to benchmark fixed input/output length with 100 prompts. {serving_tests_markdown_table} +## Platform Information + +{platform_markdown_table} + ## json version of the benchmarking tables This section contains the data of the markdown tables above in JSON format. diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py new file mode 100644 index 0000000000000..20c106234935c --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -0,0 +1,66 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse + +import pandas as pd + + +def compare_data_columns( + files, name_column, data_column, drop_column, ignore_test_name=False +): + print("\ncompare_data_column: " + data_column) + frames = [] + compare_frames = [] + for file in files: + data_df = pd.read_json(file) + serving_df = data_df.dropna(subset=[drop_column], ignore_index=True) + if ignore_test_name is False: + serving_df = serving_df.rename(columns={name_column: file + "_name"}) + frames.append(serving_df[file + "_name"]) + serving_df = serving_df.rename(columns={data_column: file}) + frames.append(serving_df[file]) + compare_frames.append(serving_df[file]) + if len(compare_frames) >= 2: + # Compare numbers among two files + ratio_df = compare_frames[1] / compare_frames[0] + frames.append(ratio_df) + compare_frames.pop(1) + + concat_df = pd.concat(frames, axis=1) + return concat_df + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-f", "--file", action="append", type=str, help="input file name" + ) + parser.add_argument( + "--ignore_test_name", action="store_true", help="ignore_test_name or not" + ) + args = parser.parse_args() + files = args.file + print("comparing : " + ", ".join(files)) + + drop_column = "P99" + name_column = "Test name" + data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"] + html_msgs_for_data_cols = [ + "Compare Output Tokens /n", + "Median TTFT /n", + "Median TPOT /n", + ] + ignore_test_name = args.ignore_test_name + with open("perf_comparison.html", "w") as text_file: + for i in range(len(data_cols_to_compare)): + output_df = compare_data_columns( + files, + name_column, + data_cols_to_compare[i], + drop_column, + ignore_test_name=ignore_test_name, + ) + print(output_df) + html = output_df.to_html() + text_file.write(html_msgs_for_data_cols[i]) + text_file.write(html) diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index a4f1638c1adb8..724b53056ca8f 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -3,9 +3,11 @@ import json import os +from importlib import util from pathlib import Path import pandas as pd +import psutil from tabulate import tabulate results_folder = Path("results/") @@ -29,11 +31,11 @@ 
throughput_results = [] throughput_results_column_mapping = { "test_name": "Test name", "gpu_type": "GPU", - # "num_requests": "# of req.", - # "total_num_tokens": "Total # of tokens", - # "elapsed_time": "Elapsed time (s)", + "num_requests": "# of req.", + "total_num_tokens": "Total # of tokens", + "elapsed_time": "Elapsed time (s)", "requests_per_second": "Tput (req/s)", - # "tokens_per_second": "Tput (tok/s)", + "tokens_per_second": "Tput (tok/s)", } # serving results and the keys that will be printed into markdown @@ -41,16 +43,18 @@ serving_results = [] serving_column_mapping = { "test_name": "Test name", "gpu_type": "GPU", - # "completed": "# of req.", + "completed": "# of req.", "request_throughput": "Tput (req/s)", - # "input_throughput": "Input Tput (tok/s)", - # "output_throughput": "Output Tput (tok/s)", + "total_token_throughput": "Total Token Tput (tok/s)", + "output_throughput": "Output Tput (tok/s)", + "total_input_tokens": "Total input tokens", + "total_output_tokens": "Total output tokens", "mean_ttft_ms": "Mean TTFT (ms)", "median_ttft_ms": "Median TTFT (ms)", "p99_ttft_ms": "P99 TTFT (ms)", - # "mean_tpot_ms": "Mean TPOT (ms)", - # "median_tpot_ms": "Median", - # "p99_tpot_ms": "P99", + "mean_tpot_ms": "Mean TPOT (ms)", + "median_tpot_ms": "Median", + "p99_tpot_ms": "P99", "mean_itl_ms": "Mean ITL (ms)", "median_itl_ms": "Median ITL (ms)", "p99_itl_ms": "P99 ITL (ms)", @@ -75,6 +79,20 @@ def results_to_json(latency, throughput, serving): ) +def get_size_with_unit(bytes, suffix="B"): + """ + Scale bytes to its proper format + e.g: + 1253656 => '1.20MB' + 1253656678 => '1.17GB' + """ + factor = 1024 + for unit in ["", "K", "M", "G", "T", "P"]: + if bytes < factor: + return f"{bytes:.2f}{unit}{suffix}" + bytes /= factor + + if __name__ == "__main__": # collect results for test_file in results_folder.glob("*.json"): @@ -155,6 +173,27 @@ if __name__ == "__main__": serving_results = pd.DataFrame.from_dict(serving_results) throughput_results = pd.DataFrame.from_dict(throughput_results) + svmem = psutil.virtual_memory() + platform_data = { + "Physical cores": [psutil.cpu_count(logical=False)], + "Total cores": [psutil.cpu_count(logical=True)], + "Total Memory": [get_size_with_unit(svmem.total)], + } + + if util.find_spec("numa") is not None: + from numa import info + + platform_data["Total NUMA nodes"] = [info.get_num_configured_nodes()] + + if util.find_spec("cpuinfo") is not None: + from cpuinfo import get_cpu_info + + platform_data["CPU Brand"] = [get_cpu_info()["brand_raw"]] + + platform_results = pd.DataFrame.from_dict( + platform_data, orient="index", columns=["Platform Info"] + ) + raw_results_json = results_to_json( latency_results, throughput_results, serving_results ) @@ -200,6 +239,9 @@ if __name__ == "__main__": throughput_md_table = tabulate( throughput_results, headers="keys", tablefmt="pipe", showindex=False ) + platform_md_table = tabulate( + platform_results, headers="keys", tablefmt="pipe", showindex=True + ) # document the result with open(results_folder / "benchmark_results.md", "w") as f: @@ -211,6 +253,7 @@ if __name__ == "__main__": latency_tests_markdown_table=latency_md_table, throughput_tests_markdown_table=throughput_md_table, serving_tests_markdown_table=serving_md_table, + platform_markdown_table=platform_md_table, benchmarking_results_in_json_string=processed_results_json, ) f.write(results) diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh index 
80ebb370ad461..f05040618981c 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -31,6 +31,20 @@ check_gpus() { echo "GPU type is $gpu_type" } +check_cpus() { + # check the number of CPUs and NUMA Node and GPU type. + declare -g numa_count=$(python3 -c "from numa import info;numa_size = info.get_num_configured_nodes(); print(numa_size)") + if [[ $numa_count -gt 0 ]]; then + echo "NUMA found." + echo $numa_count + else + echo "Need at least 1 NUMA to run benchmarking." + exit 1 + fi + declare -g gpu_type="cpu" + echo "GPU type is $gpu_type" +} + check_hf_token() { # check if HF_TOKEN is available and valid if [[ -z "$HF_TOKEN" ]]; then @@ -69,6 +83,22 @@ json2args() { echo "$args" } +json2envs() { + # transforms the JSON string to environment variables. + # example: + # input: { "VLLM_CPU_KVCACHE_SPACE": 5 } + # output: VLLM_CPU_KVCACHE_SPACE=5 + local json_string=$1 + local args=$( + echo "$json_string" | jq -r ' + to_entries | + map((.key ) + "=" + (.value | tostring)) | + join(" ") + ' + ) + echo "$args" +} + wait_for_server() { # wait for vllm server to start # return 1 if vllm server crashes @@ -158,15 +188,24 @@ run_latency_tests() { # get arguments latency_params=$(echo "$params" | jq -r '.parameters') latency_args=$(json2args "$latency_params") + latency_environment_variables=$(echo "$params" | jq -r '.environment_variables') + latency_envs=$(json2envs "$latency_environment_variables") # check if there is enough GPU to run the test tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size') - if [[ $gpu_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." - continue + if [ "$ON_CPU" == "1" ];then + if [[ $numa_count -lt $tp ]]; then + echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name." + continue + fi + else + if [[ $gpu_count -lt $tp ]]; then + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." + continue + fi fi - latency_command="python3 benchmark_latency.py \ + latency_command=" $latency_envs python3 benchmark_latency.py \ --output-json $RESULTS_FOLDER/${test_name}.json \ $latency_args" @@ -216,15 +255,24 @@ run_throughput_tests() { # get arguments throughput_params=$(echo "$params" | jq -r '.parameters') throughput_args=$(json2args "$throughput_params") + throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables') + throughput_envs=$(json2envs "$throughput_environment_variables") # check if there is enough GPU to run the test tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size') - if [[ $gpu_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." - continue + if [ "$ON_CPU" == "1" ];then + if [[ $numa_count -lt $tp ]]; then + echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name." + continue + fi + else + if [[ $gpu_count -lt $tp ]]; then + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." 
+ continue + fi fi - throughput_command="python3 benchmark_throughput.py \ + throughput_command=" $throughput_envs python3 benchmark_throughput.py \ --output-json $RESULTS_FOLDER/${test_name}.json \ $throughput_args" @@ -272,18 +320,27 @@ run_serving_tests() { # get client and server arguments server_params=$(echo "$params" | jq -r '.server_parameters') + server_envs=$(echo "$params" | jq -r '.server_environment_variables') client_params=$(echo "$params" | jq -r '.client_parameters') server_args=$(json2args "$server_params") + server_envs=$(json2envs "$server_envs") client_args=$(json2args "$client_params") qps_list=$(echo "$params" | jq -r '.qps_list') qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') echo "Running over qps list $qps_list" - # check if there is enough GPU to run the test + # check if there is enough resources to run the test tp=$(echo "$server_params" | jq -r '.tensor_parallel_size') - if [[ $gpu_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." - continue + if [ "$ON_CPU" == "1" ];then + if [[ $numa_count -lt $tp ]]; then + echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name." + continue + fi + else + if [[ $gpu_count -lt $tp ]]; then + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." + continue + fi fi # check if server model and client model is aligned @@ -294,23 +351,33 @@ run_serving_tests() { continue fi - server_command="python3 \ + server_command="$server_envs python3 \ -m vllm.entrypoints.openai.api_server \ $server_args" # run the server echo "Running test case $test_name" echo "Server command: $server_command" - bash -c "$server_command" & - server_pid=$! - - # wait until the server is alive - if wait_for_server; then - echo "" - echo "vllm server is up and running." + # support remote vllm server + client_remote_args="" + if [[ -z "${REMOTE_HOST}" ]]; then + bash -c "$server_command" & + server_pid=$! + # wait until the server is alive + if wait_for_server; then + echo "" + echo "vLLM server is up and running." + else + echo "" + echo "vLLM failed to start within the timeout period." + fi else - echo "" - echo "vllm failed to start within the timeout period." 
+ server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT" + if [[ ${REMOTE_PORT} ]]; then + client_remote_args=" --host=$REMOTE_HOST --port=$REMOTE_PORT " + else + client_remote_args=" --host=$REMOTE_HOST " + fi fi # iterate over different QPS @@ -332,7 +399,7 @@ run_serving_tests() { --result-filename ${new_test_name}.json \ --request-rate $qps \ --metadata "tensor_parallel_size=$tp" \ - $client_args" + $client_args $client_remote_args " echo "Running test case $test_name with qps $qps" echo "Client command: $client_command" @@ -360,7 +427,14 @@ run_serving_tests() { } main() { - check_gpus + local ARCH + ARCH='' + if [ "$ON_CPU" == "1" ];then + check_cpus + ARCH='-cpu' + else + check_gpus + fi check_hf_token # Set to v1 to run v1 benchmark @@ -386,9 +460,9 @@ main() { QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ # benchmarking - run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json - run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json - run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json + run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" + run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}" + run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}" # postprocess benchmarking results pip install tabulate pandas diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json new file mode 100644 index 0000000000000..da93fdd1dbac1 --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json @@ -0,0 +1,30 @@ +[ + { + "test_name": "latency_llama8B_tp1", + "environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } + }, + { + "test_name": "latency_llama8B_tp4", + "environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } + } +] diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json new file mode 100644 index 0000000000000..22f71c993ff33 --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json @@ -0,0 +1,158 @@ +[ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "max_concurrency": 60, + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp2_sharegpt", + "qps_list": [1, 4, 16, 
"inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "max_concurrency": 60, + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "max_concurrency": 60, + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp4_random_1024_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 1024, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 100, + "num_prompts": 100 + } + }, + { + "test_name": "serving_llama8B_pp6_random_1024_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "pipeline_parallel_size": 6, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 1024, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 100, + "num_prompts": 100 + } + } +] diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json new file mode 100644 
index 0000000000000..f159c30637d34 --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json @@ -0,0 +1,32 @@ +[ + { + "test_name": "throughput_llama8B_tp1", + "environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_llama8B_tp4", + "environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + } +] diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 13bd03c5696ab..0a756ea7298c0 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -25,7 +25,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ --mount=type=cache,target=/var/lib/apt,sharing=locked \ apt-get update -y \ && apt-get install -y --no-install-recommends ccache git curl wget ca-certificates \ - gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 \ + gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof \ && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \ && curl -LsSf https://astral.sh/uv/install.sh | sh @@ -134,6 +134,7 @@ ADD ./tests/ ./tests/ ADD ./examples/ ./examples/ ADD ./benchmarks/ ./benchmarks/ ADD ./vllm/collect_env.py . +ADD ./.buildkite/ ./.buildkite/ # install development dependencies (for testing) RUN --mount=type=cache,target=/root/.cache/uv \