From 8bff831f0aa239006f34b721e63e1340e3472067 Mon Sep 17 00:00:00 2001
From: Kuntai Du
Date: Wed, 29 Oct 2025 21:43:37 -0700
Subject: [PATCH] [Benchmark] Cleanup deprecated nightly benchmark and adjust
 the docstring for performance benchmark (#25786)

Signed-off-by: KuntaiDu
---
 .../benchmark-pipeline.yaml                  | 184 -------
 .../nightly-benchmarks/nightly-annotation.md |  28 --
 .../nightly-descriptions.md                  |  39 --
 .../nightly-benchmarks/nightly-pipeline.yaml | 196 --------
 .../scripts/download-tokenizer.py            |  26 -
 .../scripts/generate-nightly-markdown.py     |  97 ----
 .../scripts/get-lmdeploy-modelname.py        |   9 -
 .../scripts/nightly-annotate.sh              |  78 ---
 .../scripts/run-nightly-benchmarks.sh        | 464 ------------------
 .../scripts/summary-nightly-results.py       |  82 ----
 .../scripts/wait-for-image.sh                |  23 -
 .../README.md                                |  54 +-
 .../performance-benchmarks-descriptions.md   |   0
 .../scripts/compare-json-results.py          |   0
 .../convert-results-json-to-markdown.py      |   2 +-
 .../scripts/launch-server.sh                 |   0
 .../scripts/run-performance-benchmarks.sh    |   2 +-
 .../tests/genai-perf-tests.json              |   0
 .../tests/latency-tests-cpu.json             |   0
 .../tests/latency-tests.json                 |   0
 .../tests/nightly-tests.json                 |   0
 .../tests/serving-tests-cpu-snc2.json        |   0
 .../tests/serving-tests-cpu-snc3.json        |   0
 .../tests/serving-tests-cpu.json             |   0
 .../tests/serving-tests.json                 |   0
 .../tests/throughput-tests-cpu.json          |   0
 .../tests/throughput-tests.json              |   0
 .github/mergify.yml                          |   2 +-
 docs/contributing/benchmarks.md              |  13 +-
 29 files changed, 10 insertions(+), 1289 deletions(-)
 delete mode 100644 .buildkite/nightly-benchmarks/benchmark-pipeline.yaml
 delete mode 100644 .buildkite/nightly-benchmarks/nightly-annotation.md
 delete mode 100644 .buildkite/nightly-benchmarks/nightly-descriptions.md
 delete mode 100644 .buildkite/nightly-benchmarks/nightly-pipeline.yaml
 delete mode 100644 .buildkite/nightly-benchmarks/scripts/download-tokenizer.py
 delete mode 100644 .buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
 delete mode 100644 .buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
 delete mode 100644 .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
 delete mode 100644 .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
 delete mode 100644 .buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
 delete mode 100644 .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
 rename .buildkite/{nightly-benchmarks => performance-benchmarks}/README.md (69%)
 rename .buildkite/{nightly-benchmarks => performance-benchmarks}/performance-benchmarks-descriptions.md (100%)
 rename .buildkite/{nightly-benchmarks => performance-benchmarks}/scripts/compare-json-results.py (100%)
 rename .buildkite/{nightly-benchmarks => performance-benchmarks}/scripts/convert-results-json-to-markdown.py (99%)
 rename .buildkite/{nightly-benchmarks => performance-benchmarks}/scripts/launch-server.sh (100%)
 rename .buildkite/{nightly-benchmarks => performance-benchmarks}/scripts/run-performance-benchmarks.sh (99%)
 rename .buildkite/{nightly-benchmarks => performance-benchmarks}/tests/genai-perf-tests.json (100%)
 rename .buildkite/{nightly-benchmarks => performance-benchmarks}/tests/latency-tests-cpu.json (100%)
 rename .buildkite/{nightly-benchmarks => performance-benchmarks}/tests/latency-tests.json (100%)
 rename .buildkite/{nightly-benchmarks => performance-benchmarks}/tests/nightly-tests.json (100%)
 rename .buildkite/{nightly-benchmarks => performance-benchmarks}/tests/serving-tests-cpu-snc2.json (100%)
 rename .buildkite/{nightly-benchmarks =>
performance-benchmarks}/tests/serving-tests-cpu-snc3.json (100%) rename .buildkite/{nightly-benchmarks => performance-benchmarks}/tests/serving-tests-cpu.json (100%) rename .buildkite/{nightly-benchmarks => performance-benchmarks}/tests/serving-tests.json (100%) rename .buildkite/{nightly-benchmarks => performance-benchmarks}/tests/throughput-tests-cpu.json (100%) rename .buildkite/{nightly-benchmarks => performance-benchmarks}/tests/throughput-tests.json (100%) diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml deleted file mode 100644 index 4259514940d3f..0000000000000 --- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml +++ /dev/null @@ -1,184 +0,0 @@ -steps: - - label: "Wait for container to be ready" - key: wait-for-container-image - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - containers: - - image: badouralix/curl-jq - command: - - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh - - label: "Cleanup H100" - agents: - queue: H100 - depends_on: ~ - command: docker system prune -a --volumes --force - - - label: "A100" - # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" - agents: - queue: A100 - depends_on: wait-for-container-image - if: build.branch == "main" - plugins: - - kubernetes: - podSpec: - priorityClassName: perf-benchmark - containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT - command: - - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - - label: "H200" - # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" - agents: - queue: H200 - depends_on: wait-for-container-image - if: build.branch == "main" - plugins: - - docker#v5.12.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT - command: - - bash - - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh - mount-buildkite-agent: true - propagate-environment: true - ipc: host - gpus: 4,5,6,7 - volumes: - - /data/benchmark-hf-cache:/root/.cache/huggingface - environment: - - VLLM_USAGE_SOURCE - - HF_TOKEN - - #- block: "Run H100 Benchmark" - #key: block-h100 - #depends_on: ~ - - - label: "H100" - # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" - agents: - queue: H100 - depends_on: wait-for-container-image - if: build.branch == "main" - plugins: - - docker#v5.12.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT - command: - - bash - - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh - mount-buildkite-agent: true - propagate-environment: true - ipc: host - gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used - volumes: - - /data/benchmark-hf-cache:/root/.cache/huggingface - environment: - - VLLM_USAGE_SOURCE - - HF_TOKEN - - # Premerge benchmark - - label: "A100" - # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" - agents: - queue: A100 - depends_on: wait-for-container-image - if: build.branch != "main" - plugins: - - kubernetes: - podSpec: - priorityClassName: perf-benchmark - 
-            containers:
-              - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-                command:
-                  - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-                resources:
-                  limits:
-                    nvidia.com/gpu: 8
-                volumeMounts:
-                  - name: devshm
-                    mountPath: /dev/shm
-                env:
-                  - name: VLLM_USAGE_SOURCE
-                    value: ci-test
-                  - name: HF_TOKEN
-                    valueFrom:
-                      secretKeyRef:
-                        name: hf-token-secret
-                        key: token
-            nodeSelector:
-              nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-            volumes:
-              - name: devshm
-                emptyDir:
-                  medium: Memory
-
-  - label: "H200"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H200
-    depends_on: wait-for-container-image
-    if: build.branch != "main"
-    plugins:
-      - docker#v5.12.0:
-          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-          command:
-            - bash
-            - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-          mount-buildkite-agent: true
-          propagate-environment: true
-          ipc: host
-          gpus: 4,5,6,7
-          volumes:
-            - /data/benchmark-hf-cache:/root/.cache/huggingface
-          environment:
-            - VLLM_USAGE_SOURCE
-            - HF_TOKEN
-
-  #- block: "Run H100 Benchmark"
-  #key: block-h100
-  #depends_on: ~
-
-  - label: "H100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H100
-    depends_on: wait-for-container-image
-    if: build.branch != "main"
-    plugins:
-      - docker#v5.12.0:
-          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-          command:
-            - bash
-            - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-          mount-buildkite-agent: true
-          propagate-environment: true
-          ipc: host
-          gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
-          volumes:
-            - /data/benchmark-hf-cache:/root/.cache/huggingface
-          environment:
-            - VLLM_USAGE_SOURCE
-            - HF_TOKEN
diff --git a/.buildkite/nightly-benchmarks/nightly-annotation.md b/.buildkite/nightly-benchmarks/nightly-annotation.md
deleted file mode 100644
index 466def07b6f1f..0000000000000
--- a/.buildkite/nightly-benchmarks/nightly-annotation.md
+++ /dev/null
@@ -1,28 +0,0 @@
-# Nightly benchmark annotation
-
-## Description
-
-This file contains the downloading link for benchmarking results.
-
-- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
-- [benchmarking results](artifact://results.zip)
-- [benchmarking code](artifact://nightly-benchmarks.zip)
-
-Please download the visualization scripts in the post
-
-## Results reproduction
-
-- Find the docker we use in `benchmarking pipeline`
-- Deploy the docker, and inside the docker:
-  - Download `nightly-benchmarks.zip`.
-  - In the same folder, run the following code:
-
-  ```bash
-  export HF_TOKEN=
-  apt update
-  apt install -y git
-  unzip nightly-benchmarks.zip
-  VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
-  ```
-
-And the results will be inside `./benchmarks/results`.
diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
deleted file mode 100644
index 2ef36089b6afb..0000000000000
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ /dev/null
@@ -1,39 +0,0 @@
-
-# Nightly benchmark
-
-This benchmark aims to:
-
-- Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
-- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
-
-Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
-
-Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
-
-## Setup
-
-- Docker images:
-  - vLLM: `vllm/vllm-openai:v0.6.2`
-  - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
-  - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
-  - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
-    - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
-  - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
-- Hardware
-  - 8x Nvidia A100 GPUs
-- Workload:
-  - Dataset
-    - ShareGPT dataset
-    - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
-    - Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
-    - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
-  - Models: llama-3 8B, llama-3 70B.
-    - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
-  - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
-    - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
-  - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
-
-## Known issues
-
-- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
-- TGI does not support `ignore-eos` flag.
diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
deleted file mode 100644
index 199517e8b067c..0000000000000
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ /dev/null
@@ -1,196 +0,0 @@
-common_pod_spec: &common_pod_spec
-  priorityClassName: perf-benchmark
-  nodeSelector:
-    nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-  volumes:
-    - name: devshm
-      emptyDir:
-        medium: Memory
-    - name: hf-cache
-      hostPath:
-        path: /root/.cache/huggingface
-        type: Directory
-
-common_container_settings: &common_container_settings
-  command:
-    - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
-  resources:
-    limits:
-      nvidia.com/gpu: 8
-  volumeMounts:
-    - name: devshm
-      mountPath: /dev/shm
-    - name: hf-cache
-      mountPath: /root/.cache/huggingface
-  env:
-    - name: VLLM_USAGE_SOURCE
-      value: ci-test
-    - name: HF_HOME
-      value: /root/.cache/huggingface
-    - name: VLLM_SOURCE_CODE_LOC
-      value: /workspace/build/buildkite/vllm/performance-benchmark
-    - name: HF_TOKEN
-      valueFrom:
-        secretKeyRef:
-          name: hf-token-secret
-          key: token
-
-steps:
-  - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
- - - - - label: "A100 vllm step 10" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - <<: *common_pod_spec - containers: - - image: vllm/vllm-openai:v0.6.2 - <<: *common_container_settings - - - - - label: "A100 sglang benchmark" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - <<: *common_pod_spec - containers: - - image: lmsysorg/sglang:v0.3.2-cu121 - <<: *common_container_settings - - - label: "A100 lmdeploy benchmark" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - <<: *common_pod_spec - containers: - - image: openmmlab/lmdeploy:v0.6.1-cu12 - <<: *common_container_settings - - - - - - label: "A100 trt llama-8B" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - <<: *common_pod_spec - containers: - - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 - <<: *common_container_settings - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - - name: TEST_SELECTOR - value: "llama8B" - - - - label: "A100 trt llama-70B" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - <<: *common_pod_spec - containers: - - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 - <<: *common_container_settings - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - - name: TEST_SELECTOR - value: "llama70B" - - - # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image - # - label: "A100 trt benchmark" - # priority: 100 - # agents: - # queue: A100 - # plugins: - # - kubernetes: - # podSpec: - # <<: *common_pod_spec - # containers: - # - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 - # <<: *common_container_settings - - - # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`. - # - label: "A100 tgi benchmark" - # priority: 100 - # agents: - # queue: A100 - # plugins: - # - kubernetes: - # podSpec: - # <<: *common_pod_spec - # containers: - # - image: ghcr.io/huggingface/text-generation-inference:2.2.0 - # <<: *common_container_settings - - - wait - - - label: "Collect the results" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - <<: *common_pod_spec - containers: - - image: vllm/vllm-openai:v0.5.0.post1 - command: - - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - - - block: ":rocket: check the results!" 
\ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py deleted file mode 100644 index 8532ff7ef798c..0000000000000 --- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py +++ /dev/null @@ -1,26 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import argparse - -from transformers import AutoTokenizer - - -def main(model, cachedir): - # Load the tokenizer and save it to the specified directory - tokenizer = AutoTokenizer.from_pretrained(model) - tokenizer.save_pretrained(cachedir) - print(f"Tokenizer saved to {cachedir}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Download and save Hugging Face tokenizer" - ) - parser.add_argument("--model", type=str, required=True, help="Name of the model") - parser.add_argument( - "--cachedir", type=str, required=True, help="Directory to save the tokenizer" - ) - - args = parser.parse_args() - main(args.model, args.cachedir) diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py deleted file mode 100644 index 053fd52c35ae9..0000000000000 --- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py +++ /dev/null @@ -1,97 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import argparse -import json -from pathlib import Path - -import numpy as np -import pandas as pd -from tabulate import tabulate - - -def parse_arguments(): - parser = argparse.ArgumentParser( - description="Parse command line arguments for summary-nightly-results script." - ) - parser.add_argument( - "--results-folder", - type=str, - required=True, - help="The folder where the results are stored.", - ) - parser.add_argument( - "--description", type=str, required=True, help="Description of the results." 
- ) - - args = parser.parse_args() - return args - - -def get_perf(df, method, model, metric): - means = [] - - for qps in [2, 4, 8, 16, "inf"]: - target = df["Test name"].str.contains(model) - target = target & df["Engine"].str.contains(method) - target = target & df["Test name"].str.contains("qps_" + str(qps)) - filtered_df = df[target] - - if filtered_df.empty: - means.append(0.0) - else: - means.append(filtered_df[metric].values[0]) - - return np.array(means) - - -def get_perf_w_std(df, method, model, metric): - if metric in ["TTFT", "ITL"]: - mean = get_perf(df, method, model, "Mean " + metric + " (ms)") - mean = mean.tolist() - std = get_perf(df, method, model, "Std " + metric + " (ms)") - if std.mean() == 0: - std = None - success = get_perf(df, method, model, "Successful req.") - if std is not None: - std = std / np.sqrt(success) - std = std.tolist() - - else: - assert metric == "Tput" - mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf( - df, method, model, "Output Tput (tok/s)" - ) - mean = mean.tolist() - std = None - - return mean, std - - -def main(args): - results_folder = Path(args.results_folder) - - results = [] - - # collect results - for test_file in results_folder.glob("*_nightly_results.json"): - with open(test_file) as f: - results = results + json.loads(f.read()) - - # generate markdown table - df = pd.DataFrame.from_dict(results) - - md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False) - - with open(args.description) as f: - description = f.read() - - description = description.format(nightly_results_benchmarking_table=md_table) - - with open("nightly_results.md", "w") as f: - f.write(description) - - -if __name__ == "__main__": - args = parse_arguments() - main(args) diff --git a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py deleted file mode 100644 index ddea1d2b1b1ed..0000000000000 --- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py +++ /dev/null @@ -1,9 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from lmdeploy.serve.openai.api_client import APIClient - -api_client = APIClient("http://localhost:8000") -model_name = api_client.available_models[0] - -print(model_name) diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh deleted file mode 100644 index 69b6b146b3549..0000000000000 --- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/bash - -set -ex -set -o pipefail - - -main() { - - (which wget && which curl) || (apt-get update && apt-get install -y wget curl) - (which jq) || (apt-get update && apt-get -y install jq) - (which zip) || (apt-get install -y zip) - - if [ ! -f /workspace/buildkite-agent ]; then - echo "buildkite-agent binary not found. Skip plotting the results." 
- exit 0 - fi - - # initial annotation - #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md" - - # download results - cd "$VLLM_SOURCE_CODE_LOC/benchmarks" - mkdir -p results/ - /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/ - ls - ls results/ - - # upload benchmark results - zip -r results.zip results/ - /workspace/buildkite-agent artifact upload "results.zip" - - # upload benchmarking scripts - cd "$VLLM_SOURCE_CODE_LOC/" - zip -r nightly-benchmarks.zip .buildkite/ benchmarks/ - /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip" - - cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" - # upload benchmarking pipeline - /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml" - - cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" - /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md - - - - # The figures should be generated by a separate process outside the CI/CD pipeline - - # # generate figures - # python3 -m pip install tabulate pandas matplotlib - - # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \ - # --description $description \ - # --results-folder results/ - - - # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ - # --description $description \ - # --results-folder results/ \ - # --dataset sharegpt - - # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ - # --description $description \ - # --results-folder results/ \ - # --dataset sonnet_2048_128 - - # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ - # --description $description \ - # --results-folder results/ \ - # --dataset sonnet_128_2048 - - # # upload results and figures - # /workspace/buildkite-agent artifact upload "nightly_results*.png" - # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml - # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json - # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md -} - -main "$@" diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh deleted file mode 100644 index a00de940cbbb8..0000000000000 --- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh +++ /dev/null @@ -1,464 +0,0 @@ -#!/bin/bash - -set -o pipefail -set -x - -check_gpus() { - # check the number of GPUs and GPU type. - declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) - if [[ $gpu_count -gt 0 ]]; then - echo "GPU found." - else - echo "Need at least 1 GPU to run benchmarking." - exit 1 - fi - declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')" - echo "GPU type is $gpu_type" -} - -check_hf_token() { - # check if HF_TOKEN is available and valid - if [[ -z "$HF_TOKEN" ]]; then - echo "Error: HF_TOKEN is not set." - exit 1 - elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then - echo "Error: HF_TOKEN does not start with 'hf_'." - exit 1 - else - echo "HF_TOKEN is set and valid." 
- fi -} - - -upload_to_buildkite() { - # upload the benchmarking results to buildkite - - # if the agent binary is not found, skip uploading the results, exit 0 - if [ ! -f /workspace/buildkite-agent ]; then - echo "buildkite-agent binary not found. Skip uploading the results." - return 0 - fi - # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md - /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" -} - - -get_current_llm_serving_engine() { - - if which lmdeploy >/dev/null; then - echo "Container: lmdeploy" - export CURRENT_LLM_SERVING_ENGINE=lmdeploy - return - fi - - if [ -e /tgi-entrypoint.sh ]; then - echo "Container: tgi" - export CURRENT_LLM_SERVING_ENGINE=tgi - return - fi - - if which trtllm-build >/dev/null; then - echo "Container: tensorrt-llm" - export CURRENT_LLM_SERVING_ENGINE=trt - return - fi - - if [ -e /sgl-workspace ]; then - echo "Container: sglang" - export CURRENT_LLM_SERVING_ENGINE=sglang - return - fi - - if [ -e /vllm-workspace ]; then - echo "Container: vllm" - # move to a completely irrelevant directory, to avoid import vllm from current folder - export CURRENT_LLM_SERVING_ENGINE=vllm - - return - fi -} - -json2args() { - # transforms the JSON string to command line args, and '_' is replaced to '-' - # example: - # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } - # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 - local json_string=$1 - local args=$( - echo "$json_string" | jq -r ' - to_entries | - map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | - join(" ") - ' - ) - echo "$args" -} - -kill_gpu_processes() { - pkill -f '[p]ython' - pkill -f '[p]ython3' - pkill -f '[t]ritonserver' - pkill -f '[p]t_main_thread' - pkill -f '[t]ext-generation' - pkill -f '[l]mdeploy' - # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445 - pkill -f '[V]LLM' - - while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do - sleep 1 - done -} - -wait_for_server() { - # wait for vllm server to start - # return 1 if vllm server crashes - timeout 1200 bash -c ' - until curl -s localhost:8000/v1/completions > /dev/null; do - sleep 1 - done' && return 0 || return 1 -} - -ensure_installed() { - # Ensure that the given command is installed by apt-get - local cmd=$1 - if ! which "$cmd" >/dev/null; then - apt-get update && apt-get install -y "$cmd" - fi -} - -run_serving_tests() { - # run serving tests using `vllm bench serve` command - # $1: a json file specifying serving test cases - - local serving_test_file - serving_test_file=$1 - - # Iterate over serving tests - jq -c '.[]' "$serving_test_file" | while read -r params; do - # get the test name, and append the GPU type back to it. - test_name=$(echo "$params" | jq -r '.test_name') - - # if TEST_SELECTOR is set, only run the test cases that match the selector - if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then - echo "Skip test case $test_name." 
- continue - fi - - # prepend the current serving engine to the test name - test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name} - - # get common parameters - common_params=$(echo "$params" | jq -r '.common_parameters') - model=$(echo "$common_params" | jq -r '.model') - tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') - port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') - reuse_server=$(echo "$common_params" | jq -r '.reuse_server') - - # get client and server arguments - server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters") - client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters") - client_args=$(json2args "$client_params") - qps_list=$(echo "$params" | jq -r '.qps_list') - qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') - echo "Running over qps list $qps_list" - - # check if there is enough GPU to run the test - if [[ $gpu_count -lt $tp ]]; then - echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name." - continue - fi - - if [[ $reuse_server == "true" ]]; then - echo "Reuse previous server for test case $test_name" - else - kill_gpu_processes - bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \ - "$server_params" "$common_params" - fi - - if wait_for_server; then - echo "" - echo "$CURRENT_LLM_SERVING_ENGINE server is up and running." - else - echo "" - echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period." - break - fi - - # prepare tokenizer - # this is required for lmdeploy. - cd "$VLLM_SOURCE_CODE_LOC/benchmarks" - rm -rf /tokenizer_cache - mkdir /tokenizer_cache - python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ - --model "$model" \ - --cachedir /tokenizer_cache - cd "$VLLM_SOURCE_CODE_LOC/benchmarks" - - - # change model name for lmdeploy (it will not follow standard hf name) - if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then - model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py) - fi - - # iterate over different QPS - for qps in $qps_list; do - # remove the surrounding single quote from qps - if [[ "$qps" == *"inf"* ]]; then - echo "qps was $qps" - qps="inf" - echo "now qps is $qps" - fi - - new_test_name=$test_name"_qps_"$qps - - backend=$CURRENT_LLM_SERVING_ENGINE - - if [[ $backend = "trt" ]]; then - backend="tensorrt-llm" - fi - - if [[ "$backend" == *"vllm"* ]]; then - backend="vllm" - fi - - if [[ "$dataset_name" = "sharegpt" ]]; then - - client_command="vllm bench serve \ - --backend $backend \ - --tokenizer /tokenizer_cache \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --num-prompts $num_prompts \ - --port $port \ - --save-result \ - --result-dir $RESULTS_FOLDER \ - --result-filename ${new_test_name}.json \ - --request-rate $qps \ - --ignore-eos \ - $client_args" - - elif [[ "$dataset_name" = "sonnet" ]]; then - - sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len') - sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len') - sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len') - - client_command="vllm bench serve \ - --backend $backend \ - --tokenizer /tokenizer_cache \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --num-prompts $num_prompts 
\ - --sonnet-input-len $sonnet_input_len \ - --sonnet-output-len $sonnet_output_len \ - --sonnet-prefix-len $sonnet_prefix_len \ - --port $port \ - --save-result \ - --result-dir $RESULTS_FOLDER \ - --result-filename ${new_test_name}.json \ - --request-rate $qps \ - --ignore-eos \ - $client_args" - - else - - echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name." - exit 1 - - fi - - - - echo "Running test case $test_name with qps $qps" - echo "Client command: $client_command" - - eval "$client_command" - - server_command="None" - - # record the benchmarking commands - jq_output=$(jq -n \ - --arg server "$server_command" \ - --arg client "$client_command" \ - --arg gpu "$gpu_type" \ - --arg engine "$CURRENT_LLM_SERVING_ENGINE" \ - '{ - server_command: $server, - client_command: $client, - gpu_type: $gpu, - engine: $engine - }') - echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" - - done - - done - - kill_gpu_processes -} - -run_genai_perf_tests() { - # run genai-perf tests - - # $1: a json file specifying genai-perf test cases - local genai_perf_test_file - genai_perf_test_file=$1 - - # Iterate over genai-perf tests - jq -c '.[]' "$genai_perf_test_file" | while read -r params; do - # get the test name, and append the GPU type back to it. - test_name=$(echo "$params" | jq -r '.test_name') - - # if TEST_SELECTOR is set, only run the test cases that match the selector - if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then - echo "Skip test case $test_name." - continue - fi - - # prepend the current serving engine to the test name - test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name} - - # get common parameters - common_params=$(echo "$params" | jq -r '.common_parameters') - model=$(echo "$common_params" | jq -r '.model') - tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') - port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') - reuse_server=$(echo "$common_params" | jq -r '.reuse_server') - - # get client and server arguments - server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters") - qps_list=$(echo "$params" | jq -r '.qps_list') - qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') - echo "Running over qps list $qps_list" - - # check if there is enough GPU to run the test - if [[ $gpu_count -lt $tp ]]; then - echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name." - continue - fi - - if [[ $reuse_server == "true" ]]; then - echo "Reuse previous server for test case $test_name" - else - kill_gpu_processes - bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \ - "$server_params" "$common_params" - fi - - if wait_for_server; then - echo "" - echo "$CURRENT_LLM_SERVING_ENGINE server is up and running." - else - echo "" - echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period." - break - fi - - # iterate over different QPS - for qps in $qps_list; do - # remove the surrounding single quote from qps - if [[ "$qps" == *"inf"* ]]; then - echo "qps was $qps" - qps=$num_prompts - echo "now qps is $qps" - fi - - new_test_name=$test_name"_qps_"$qps - backend=$CURRENT_LLM_SERVING_ENGINE - - if [[ "$backend" == *"vllm"* ]]; then - backend="vllm" - fi - #TODO: add output dir. 
- client_command="genai-perf profile \ - -m $model \ - --service-kind openai \ - --backend "$backend" \ - --endpoint-type chat \ - --streaming \ - --url localhost:$port \ - --request-rate $qps \ - --num-prompts $num_prompts \ - " - - echo "Client command: $client_command" - - eval "$client_command" - - #TODO: process/record outputs - done - done - - kill_gpu_processes - -} - -prepare_dataset() { - - # download sharegpt dataset - cd "$VLLM_SOURCE_CODE_LOC/benchmarks" - wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json - - # duplicate sonnet by 4x, to allow benchmarking with input length 2048 - cd "$VLLM_SOURCE_CODE_LOC/benchmarks" - echo "" > sonnet_4x.txt - for _ in {1..4} - do - cat sonnet.txt >> sonnet_4x.txt - done - -} - -main() { - - # check if the environment variable is successfully injected from yaml - - check_gpus - check_hf_token - get_current_llm_serving_engine - - pip install -U transformers - - pip install -r requirements/dev.txt - which genai-perf - - # check storage - df -h - - ensure_installed wget - ensure_installed curl - ensure_installed jq - # genai-perf dependency - ensure_installed libb64-0d - - prepare_dataset - - cd "$VLLM_SOURCE_CODE_LOC/benchmarks" - declare -g RESULTS_FOLDER=results/ - mkdir -p $RESULTS_FOLDER - BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" - - # run the test - run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json" - - # run genai-perf tests - run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json" - mv artifacts/ $RESULTS_FOLDER/ - - # upload benchmark results to buildkite - python3 -m pip install tabulate pandas - python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py" - upload_to_buildkite - -} - -main "$@" diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py deleted file mode 100644 index fb3b9d5e34e03..0000000000000 --- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +++ /dev/null @@ -1,82 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import datetime -import json -import os -from pathlib import Path - -import pandas as pd -from tabulate import tabulate - -results_folder = Path("results/") - -# serving results and the keys that will be printed into markdown -serving_results = [] -serving_column_mapping = { - "test_name": "Test name", - "gpu_type": "GPU", - "completed": "Successful req.", - "request_throughput": "Tput (req/s)", - "mean_ttft_ms": "Mean TTFT (ms)", - "std_ttft_ms": "Std TTFT (ms)", - "median_ttft_ms": "Median TTFT (ms)", - "mean_itl_ms": "Mean ITL (ms)", - "std_itl_ms": "Std ITL (ms)", - "median_itl_ms": "Median ITL (ms)", - "mean_tpot_ms": "Mean TPOT (ms)", - "std_tpot_ms": "Std TPOT (ms)", - "median_tpot_ms": "Median TPOT (ms)", - "total_token_throughput": "Total Token Tput (tok/s)", - "output_throughput": "Output Tput (tok/s)", - "total_input_tokens": "Total input tokens", - "total_output_tokens": "Total output tokens", - "engine": "Engine", -} - -if __name__ == "__main__": - # collect results - for test_file in results_folder.glob("*.json"): - with open(test_file) as f: - raw_result = json.loads(f.read()) - - # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands")) as f: - command = json.loads(f.read()) - raw_result.update(command) - - # update the test name of this result - 
-        raw_result.update({"test_name": test_file.stem})
-
-        # add the result to raw_result
-        serving_results.append(raw_result)
-        continue
-
-    serving_results = pd.DataFrame.from_dict(serving_results)
-
-    if not serving_results.empty:
-        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
-            columns=serving_column_mapping
-        )
-
-    serving_md_table_with_headers = tabulate(
-        serving_results, headers="keys", tablefmt="pipe", showindex=False
-    )
-    # remove the first line of header
-    serving_md_table_lines = serving_md_table_with_headers.split("\n")
-    serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])
-
-    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
-    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
-
-    # document benchmarking results in markdown
-    with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
-        # document results with header.
-        # for those who wants to reproduce our benchmark.
-        f.write(serving_md_table_with_headers)
-        f.write("\n")
-
-    # document benchmarking results in json
-    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
-        results = serving_results.to_dict(orient="records")
-        f.write(json.dumps(results))
diff --git a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
deleted file mode 100644
index 50e1ab0242202..0000000000000
--- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/sh
-TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
-if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
-  URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
-else
-  URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
-fi
-
-TIMEOUT_SECONDS=10
-
-retries=0
-while [ $retries -lt 1000 ]; do
-  if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
-    exit 0
-  fi
-
-  echo "Waiting for image to be available..."
-
-  retries=$((retries + 1))
-  sleep 5
-done
-
-exit 1
diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/performance-benchmarks/README.md
similarity index 69%
rename from .buildkite/nightly-benchmarks/README.md
rename to .buildkite/performance-benchmarks/README.md
index e6f5c8b60f459..332142ba5d170 100644
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/performance-benchmarks/README.md
@@ -2,40 +2,23 @@
 
 ## Introduction
 
-This directory contains two sets of benchmark for vllm.
-
-- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
-- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
-
-See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
+This directory contains a benchmarking suite that **developers** can run locally to gain clarity on whether their PR improves or degrades vLLM's performance.
+vLLM also maintains a continuous performance benchmark at [perf.vllm.ai](https://perf.vllm.ai/), hosted on the PyTorch CI HUD.
 
 ## Performance benchmark quick overview
 
-**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models.
+**Benchmarking Coverage**: latency, throughput, and fixed-QPS serving on B200, A100, H100, and Intel® Xeon® Processors, with different models.
 
 **Benchmarking Duration**: about 1hr.
 
 **For benchmarking developers**: please try your best to constrain the duration of benchmarking to about 1 hr so that it won't take forever to run.
 
-## Nightly benchmark quick overview
-
-**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
-
-**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
-
-**Benchmarking Duration**: about 3.5hrs.
-
 ## Trigger the benchmark
 
-Performance benchmark will be triggered when:
-
-- A PR being merged into vllm.
-- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
-
-Manually Trigger the benchmark
+The benchmark needs to be triggered manually:
 
 ```bash
-bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
 ```
 
 Runtime environment variables:
@@ -47,10 +30,6 @@ Runtime environment variables:
 - `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
 - `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
 
-Nightly benchmark will be triggered when:
-
-- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
-
 ## Performance benchmark details
 
 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
@@ -152,26 +131,3 @@ Here is an example using the script to compare result_a and result_b with Model,
 A comparison diagram will be generated below the table. Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
 image
-
-## Nightly test details
-
-See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
-
-### Workflow
-
-- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
-- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container.
-- The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`.
-- At last, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
-
-### Nightly tests
-
-In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.
-
-### Docker containers
-
-The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
-
-WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`.
-
-WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
diff --git a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md b/.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md
similarity index 100%
rename from .buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
rename to .buildkite/performance-benchmarks/performance-benchmarks-descriptions.md
diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/performance-benchmarks/scripts/compare-json-results.py
similarity index 100%
rename from .buildkite/nightly-benchmarks/scripts/compare-json-results.py
rename to .buildkite/performance-benchmarks/scripts/compare-json-results.py
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
similarity index 99%
rename from .buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
rename to .buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
index a7544aeef4c74..80bb4d846a226 100644
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -392,7 +392,7 @@ if __name__ == "__main__":
     json_file = "benchmark_results.json"
     with open(results_folder / md_file, "w") as f:
         results = read_markdown(
-            "../.buildkite/nightly-benchmarks/"
+            "../.buildkite/performance-benchmarks/"
             + "performance-benchmarks-descriptions.md"
         )
         results = results.format(
diff --git a/.buildkite/nightly-benchmarks/scripts/launch-server.sh b/.buildkite/performance-benchmarks/scripts/launch-server.sh
similarity index 100%
rename from .buildkite/nightly-benchmarks/scripts/launch-server.sh
rename to .buildkite/performance-benchmarks/scripts/launch-server.sh
diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
similarity index 99%
rename from .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
rename to .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
index 5a47576483bbf..9447ceffd7e22 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -469,7 +469,7 @@ main() {
     ensure_sharegpt_downloaded
     declare -g RESULTS_FOLDER=results/
     mkdir -p $RESULTS_FOLDER
-    QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+    QUICK_BENCHMARK_ROOT=../.buildkite/performance-benchmarks/
 
     # dump vllm info via vllm collect-env
     env_output=$(vllm collect-env)
diff --git a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json b/.buildkite/performance-benchmarks/tests/genai-perf-tests.json
similarity index 100%
rename from .buildkite/nightly-benchmarks/tests/genai-perf-tests.json
rename to .buildkite/performance-benchmarks/tests/genai-perf-tests.json
diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json b/.buildkite/performance-benchmarks/tests/latency-tests-cpu.json
similarity index 100%
rename from .buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
rename to .buildkite/performance-benchmarks/tests/latency-tests-cpu.json
diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests.json b/.buildkite/performance-benchmarks/tests/latency-tests.json
similarity index 100%
rename from .buildkite/nightly-benchmarks/tests/latency-tests.json
rename to .buildkite/performance-benchmarks/tests/latency-tests.json
diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/performance-benchmarks/tests/nightly-tests.json
similarity index 100%
rename from .buildkite/nightly-benchmarks/tests/nightly-tests.json
rename to .buildkite/performance-benchmarks/tests/nightly-tests.json
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json
similarity index 100%
rename from .buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
rename to .buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json
similarity index 100%
rename from .buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
rename to .buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
similarity index 100%
rename from .buildkite/nightly-benchmarks/tests/serving-tests-cpu.json
rename to .buildkite/performance-benchmarks/tests/serving-tests-cpu.json
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests.json b/.buildkite/performance-benchmarks/tests/serving-tests.json
similarity index 100%
rename from .buildkite/nightly-benchmarks/tests/serving-tests.json
rename to .buildkite/performance-benchmarks/tests/serving-tests.json
diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json b/.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json
similarity index 100%
rename from .buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json
rename to .buildkite/performance-benchmarks/tests/throughput-tests-cpu.json
diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests.json b/.buildkite/performance-benchmarks/tests/throughput-tests.json
similarity index 100%
rename from .buildkite/nightly-benchmarks/tests/throughput-tests.json
rename to .buildkite/performance-benchmarks/tests/throughput-tests.json
diff --git a/.github/mergify.yml b/.github/mergify.yml
index de1a8314a4ecd..18d4a2e83144b 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -108,7 +108,7 @@ pull_request_rules:
       - files~=^benchmarks/
       - files~=^vllm/benchmarks/
       - files~=^tests/benchmarks/
-      - files~=^\.buildkite/nightly-benchmarks/
+      - files~=^\.buildkite/performance-benchmarks/
   actions:
     label:
       add:
diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md
index be3e32a73a332..dca01eab5b426 100644
--- a/docs/contributing/benchmarks.md
+++ b/docs/contributing/benchmarks.md
@@ -9,7 +9,6 @@ vLLM provides comprehensive benchmarking tools for performance testing and evalu
 - **[Benchmark CLI](#benchmark-cli)**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing
 - **[Parameter sweeps](#parameter-sweeps)**: Automate `vllm bench` runs for multiple configurations
 - **[Performance benchmarks](#performance-benchmarks)**: Automated CI benchmarks for development
-- **[Nightly benchmarks](#nightly-benchmarks)**: Comparative benchmarks against alternatives
 
 [Benchmark CLI]: #benchmark-cli
 
@@ -1167,7 +1166,7 @@ docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingf
 Then, run below command inside the docker instance.
 
 ```bash
-bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
 ```
 
 When run, benchmark script generates results under **benchmark/results** folder, along with the benchmark_results.md and benchmark_results.json.
@@ -1185,7 +1184,7 @@ For more results visualization, check the [visualizing the results](https://gith
 
 The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
 
-More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
+More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md).
 
 ### Continuous Benchmarking
 
@@ -1210,11 +1209,3 @@ The benchmarking currently runs on a predefined set of models configured in the
 
 #### Viewing Results
 
 All continuous benchmarking results are automatically published to the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
-
-## Nightly Benchmarks
-
-These compare vLLM's performance against alternatives (`tgi`, `trt-llm`, and `lmdeploy`) when there are major updates of vLLM (e.g., bumping up to a new version). They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the `perf-benchmarks` and `nightly-benchmarks` labels.
-
-The latest nightly benchmark results are shared in major release blog posts such as [vLLM v0.6.0](https://blog.vllm.ai/2024/09/05/perf-update.html).
-
-More information on the nightly benchmarks and their parameters can be found [here](../../.buildkite/nightly-benchmarks/nightly-descriptions.md).
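
For reference, a minimal sketch of the manual trigger documented in the renamed README above. The checkout path, the `HF_TOKEN` export, and the explicit empty `REMOTE_*` values are assumptions for a local run against a locally launched server, not something this patch itself prescribes:

```bash
# Minimal sketch: run the performance benchmark suite locally from a vLLM
# checkout. HF_TOKEN is assumed to be needed only for gated models; the
# path and token value below are placeholders.
cd vllm                    # assumed repository checkout location
export HF_TOKEN=hf_xxx     # hypothetical placeholder token

# Leaving REMOTE_HOST/REMOTE_PORT empty (the documented default) benchmarks
# a locally launched server rather than a remote vLLM service.
REMOTE_HOST="" REMOTE_PORT="" \
  bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh

# Per docs/contributing/benchmarks.md, results are generated under the
# benchmark/results folder, including benchmark_results.md and
# benchmark_results.json.
```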