diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md
index 3f2e2da39797..b39f9899a8f2 100644
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@@ -7,7 +7,7 @@ This directory contains two sets of benchmark for vllm.
- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
-See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
+See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
## Performance benchmark quick overview
@@ -138,28 +138,20 @@ The raw benchmarking results (in the format of json files) are in the `Artifacts
The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
-`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
+`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
+If only one `benchmark_results.json` is passed, `compare-json-results.py` instead compares the different TP and PP configurations found in that file.
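+For example, to compare the TP/PP configurations contained in a single results file:
+`python3 compare-json-results.py -f results_a/benchmark_results.json`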
-Here is an example using the script to compare result_a and result_b without detail test name.
-`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json --ignore_test_name`
-
-| | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
-|----|----------------------------------------|----------------------------------------|----------|
-| 0 | 142.633982 | 156.526018 | 1.097396 |
-| 1 | 241.620334 | 294.018783 | 1.216863 |
-| 2 | 218.298905 | 262.664916 | 1.203235 |
-| 3 | 242.743860 | 299.816190 | 1.235113 |
-
-Here is an example using the script to compare result_a and result_b with detail test name.
+Here is an example using the script to compare result_a and result_b with Model, Dataset Name, input/output length, max concurrency and qps.
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
-| | results_a/benchmark_results.json_name | results_a/benchmark_results.json | results_b/benchmark_results.json_name | results_b/benchmark_results.json | perf_ratio |
-|---|---------------------------------------------|----------------------------------------|---------------------------------------------|----------------------------------------|----------|
-| 0 | serving_llama8B_tp1_sharegpt_qps_1 | 142.633982 | serving_llama8B_tp1_sharegpt_qps_1 | 156.526018 | 1.097396 |
-| 1 | serving_llama8B_tp1_sharegpt_qps_16 | 241.620334 | serving_llama8B_tp1_sharegpt_qps_16 | 294.018783 | 1.216863 |
-| 2 | serving_llama8B_tp1_sharegpt_qps_4 | 218.298905 | serving_llama8B_tp1_sharegpt_qps_4 | 262.664916 | 1.203235 |
-| 3 | serving_llama8B_tp1_sharegpt_qps_inf | 242.743860 | serving_llama8B_tp1_sharegpt_qps_inf | 299.816190 | 1.235113 |
-| 4 | serving_llama8B_tp2_random_1024_128_qps_1 | 96.613390 | serving_llama8B_tp4_random_1024_128_qps_1 | 108.404853 | 1.122048 |
+| | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
+|----|---------------------------------------|--------|-----|-----|------|-----|-----------|----------|----------|
+| 0 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982 | 156.526018 | 1.097396 |
+| 1 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf| 241.620334 | 294.018783 | 1.216863 |
+
+A comparison diagram will be generated below the table.
+Here is an example comparing `96c/results_gnr_96c_091_tp2pp3` and `128c/results_gnr_128c_091_tp2pp3` (assuming each folder contains a `benchmark_results.json`):
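+`python3 compare-json-results.py -f 96c/results_gnr_96c_091_tp2pp3/benchmark_results.json -f 128c/results_gnr_128c_091_tp2pp3/benchmark_results.json`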
+
## Nightly test details
diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
index 20c106234935..12c4ba6aa69a 100644
--- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
@@ -1,24 +1,38 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
+import json
+import os
import pandas as pd
def compare_data_columns(
- files, name_column, data_column, drop_column, ignore_test_name=False
+ files, name_column, data_column, info_cols, drop_column, debug=False
):
print("\ncompare_data_column: " + data_column)
frames = []
+ raw_data_cols = []
compare_frames = []
for file in files:
data_df = pd.read_json(file)
serving_df = data_df.dropna(subset=[drop_column], ignore_index=True)
- if ignore_test_name is False:
+ # Show all info columns in the first couple columns
+ if not frames:
+ for col in info_cols:
+ if col not in serving_df.columns:
+ print(f"Skipping missing column: {col}")
+ continue
+ frames.append(serving_df[col])
+ # only show test name under debug mode
+ if debug is True:
serving_df = serving_df.rename(columns={name_column: file + "_name"})
frames.append(serving_df[file + "_name"])
+
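+ # use the parent folder of each result file as the column label in the comparison table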
+ file = "/".join(file.split("/")[:-1])
serving_df = serving_df.rename(columns={data_column: file})
frames.append(serving_df[file])
+ raw_data_cols.append(file)
compare_frames.append(serving_df[file])
if len(compare_frames) >= 2:
# Compare numbers among two files
@@ -27,7 +41,68 @@ def compare_data_columns(
compare_frames.pop(1)
concat_df = pd.concat(frames, axis=1)
- return concat_df
+ print(raw_data_cols)
+ return concat_df, raw_data_cols
+
+
+def split_json_by_tp_pp(
+ input_file: str = "benchmark_results.json", output_root: str = "."
+) -> list[str]:
+ """
+ Split a benchmark JSON into separate folders by (TP Size, PP Size).
+
+ Creates: {output_root}/tp{TP}_pp{PP}/benchmark_results.json
+ Returns: list of file paths written.
+ """
+ # Load JSON data into DataFrame
+ with open(input_file, encoding="utf-8") as f:
+ data = json.load(f)
+
+ # If the JSON is a dict with a list under common keys, use that list
+ if isinstance(data, dict):
+ for key in ("results", "serving_results", "benchmarks", "data"):
+ if isinstance(data.get(key), list):
+ data = data[key]
+ break
+
+ df = pd.DataFrame(data)
+
+ # Handle alias column names
+ rename_map = {
+ "tp_size": "TP Size",
+ "tensor_parallel_size": "TP Size",
+ "pp_size": "PP Size",
+ "pipeline_parallel_size": "PP Size",
+ }
+ df.rename(
+ columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True
+ )
+
+ # Ensure TP/PP columns exist (default to 1 if missing)
+ if "TP Size" not in df.columns:
+ df["TP Size"] = 1
+ if "PP Size" not in df.columns:
+ df["PP Size"] = 1
+
+ # make sure TP/PP are numeric ints with no NaN
+ df["TP Size"] = (
+ pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int)
+ )
+ df["PP Size"] = (
+ pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int)
+ )
+
+ # Split into separate folders
+ saved_paths: list[str] = []
+ for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False):
+ folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
+ os.makedirs(folder_name, exist_ok=True)
+ filepath = os.path.join(folder_name, "benchmark_results.json")
+ group_df.to_json(filepath, orient="records", indent=2, force_ascii=False)
+ print(f"Saved: {filepath}")
+ saved_paths.append(filepath)
+
+ return saved_paths
if __name__ == "__main__":
@@ -36,31 +111,105 @@ if __name__ == "__main__":
"-f", "--file", action="append", type=str, help="input file name"
)
parser.add_argument(
- "--ignore_test_name", action="store_true", help="ignore_test_name or not"
+ "--debug", action="store_true", help="show all information for debugging"
+ )
+ parser.add_argument(
+ "--plot",
+ action=argparse.BooleanOptionalAction,
+ default=True,
+ help="plot perf diagrams or not --no-plot --plot",
+ )
+ parser.add_argument(
+ "-x",
+ "--xaxis",
+ type=str,
+ default="# of max concurrency.",
+ help="column name to use as X Axis in comparision graph",
)
args = parser.parse_args()
- files = args.file
- print("comparing : " + ", ".join(files))
drop_column = "P99"
name_column = "Test name"
+ info_cols = [
+ "Model",
+ "Dataset Name",
+ "Input Len",
+ "Output Len",
+ "TP Size",
+ "PP Size",
+ "# of max concurrency.",
+ "qps",
+ ]
data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
html_msgs_for_data_cols = [
"Compare Output Tokens /n",
"Median TTFT /n",
"Median TPOT /n",
]
- ignore_test_name = args.ignore_test_name
+
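+ # with a single input file, split it by TP/PP so the parallelism configs can be compared against each other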
+ if len(args.file) == 1:
+ files = split_json_by_tp_pp(args.file[0], output_root="splits")
+ info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
+ else:
+ files = args.file
+ print("comparing : " + ", ".join(files))
+ debug = args.debug
+ plot = args.plot
+ # For the plot feature, pick the info column used as the chart's x-axis
+ y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6
with open("perf_comparison.html", "w") as text_file:
for i in range(len(data_cols_to_compare)):
- output_df = compare_data_columns(
+ output_df, raw_data_cols = compare_data_columns(
files,
name_column,
data_cols_to_compare[i],
+ info_cols,
drop_column,
- ignore_test_name=ignore_test_name,
+ debug=debug,
)
- print(output_df)
- html = output_df.to_html()
- text_file.write(html_msgs_for_data_cols[i])
- text_file.write(html)
+
+ # For the plot feature, prepend the chosen x-axis column to the raw data columns
+ raw_data_cols.insert(0, info_cols[y_axis_index])
+
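+ # group by every info column except the last two (max concurrency and qps)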
+ filtered_info_cols = info_cols[:-2]
+ existing_group_cols = [
+ c for c in filtered_info_cols if c in output_df.columns
+ ]
+ if not existing_group_cols:
+ raise ValueError(
+ f"No valid group-by columns "
+ f"Expected subset: {filtered_info_cols}, "
+ f"but DataFrame has: {list(output_df.columns)}"
+ )
+
+ output_df_sorted = output_df.sort_values(by=existing_group_cols)
+ output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
+ for name, group in output_groups:
+ html = group.to_html()
+ text_file.write(html_msgs_for_data_cols[i])
+ text_file.write(html)
+
+ if plot is True:
+ import plotly.express as px
+
+ df = group[raw_data_cols]
+ df_sorted = df.sort_values(by=info_cols[y_axis_index])
+ # Melt DataFrame for plotting
+ df_melted = df_sorted.melt(
+ id_vars=info_cols[y_axis_index],
+ var_name="Configuration",
+ value_name=data_cols_to_compare[i],
+ )
+ title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
+ # Create Plotly line chart
+ fig = px.line(
+ df_melted,
+ x=info_cols[y_axis_index],
+ y=data_cols_to_compare[i],
+ color="Configuration",
+ title=title,
+ markers=True,
+ )
+ # Export to HTML
+ text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
index 554256b4bdb8..496ee6083abd 100644
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -1,17 +1,19 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
import json
import os
+import re
+import shlex
from importlib import util
from pathlib import Path
+from typing import Any
import pandas as pd
import psutil
from tabulate import tabulate
-results_folder = Path("results/")
-
# latency results and the keys that will be printed into markdown
latency_results = []
latency_column_mapping = {
@@ -42,14 +44,22 @@ throughput_results_column_mapping = {
serving_results = []
serving_column_mapping = {
"test_name": "Test name",
+ "model_id": "Model",
+ "dataset_name": "Dataset Name",
+ "input_len": "Input Len",
+ "output_len": "Output Len",
+ "tp_size": "TP Size",
+ "pp_size": "PP Size",
+ "dtype": "dtype",
"gpu_type": "GPU",
"completed": "# of req.",
+ "qps": "qps",
"max_concurrency": "# of max concurrency.",
"request_throughput": "Tput (req/s)",
"total_token_throughput": "Total Token Tput (tok/s)",
"output_throughput": "Output Tput (tok/s)",
- "total_input_tokens": "Total input tokens",
- "total_output_tokens": "Total output tokens",
+ # "total_input_tokens": "Total input tokens",
+ # "total_output_tokens": "Total output tokens",
"mean_ttft_ms": "Mean TTFT (ms)",
"median_ttft_ms": "Median TTFT (ms)",
"p99_ttft_ms": "P99 TTFT (ms)",
@@ -94,7 +104,104 @@ def get_size_with_unit(bytes, suffix="B"):
bytes /= factor
+def _coerce(val: str) -> Any:
+ """Best-effort type coercion from string to Python types."""
+ low = val.lower()
+ if low == "null":
+ return None
+ if low == "true":
+ return True
+ if low == "false":
+ return False
+ # integers
+ if re.fullmatch(r"[+-]?\d+", val):
+ try:
+ return int(val)
+ except ValueError:
+ pass
+ # floats (keep 'inf'/'-inf'/'nan' as strings)
+ if re.fullmatch(r"[+-]?\d*\.\d+", val):
+ try:
+ return float(val)
+ except ValueError:
+ pass
+ return val
+
+
+def parse_client_command(cmd: str) -> dict[str, Any]:
+ """Parse the client_command shell string into {executable, script, args}."""
+ toks = shlex.split(cmd)
+ if len(toks) < 2:
+ raise ValueError("client_command must include an executable and a script")
+ executable, script = toks[0], toks[1]
+ args: dict[str, Any] = {}
+
+ i = 2
+ while i < len(toks):
+ t = toks[i]
+ if t.startswith("--"):
+ # --key=value or --key (value) or boolean flag
+ if "=" in t:
+ key, val = t.split("=", 1)
+ if key == "--metadata":
+ md = {}
+ if val:
+ if "=" in val:
+ k, v = val.split("=", 1)
+ md[k] = _coerce(v)
+ else:
+ md[val] = True
+ args[key] = md
+ else:
+ args[key] = _coerce(val)
+ i += 1
+ continue
+
+ key = t
+
+ # Special: consume metadata k=v pairs until next --flag
+ if key == "--metadata":
+ i += 1
+ md = {}
+ while i < len(toks) and not toks[i].startswith("--"):
+ pair = toks[i]
+ if "=" in pair:
+ k, v = pair.split("=", 1)
+ md[k] = _coerce(v)
+ else:
+ md[pair] = True
+ i += 1
+ args[key] = md
+ continue
+
+ # Standard: check if next token is a value (not a flag)
+ if i + 1 < len(toks) and not toks[i + 1].startswith("--"):
+ args[key] = _coerce(toks[i + 1])
+ i += 2
+ else:
+ # lone flag -> True
+ args[key] = True
+ i += 1
+ else:
+ # unexpected positional; skip
+ i += 1
+
+ return {"executable": executable, "script": script, "args": args}
+
+
if __name__ == "__main__":
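+ # Example: python3 convert-results-json-to-markdown.py -r results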
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "-r",
+ "--result",
+ type=str,
+ default="results",
+ help="Folder name for benchmark output results.",
+ )
+ args = parser.parse_args()
+ results_folder = Path(args.result)
+ if not results_folder.exists():
+ raise FileNotFoundError(f"results folder does not exist: {results_folder}")
# collect results
for test_file in results_folder.glob("*.json"):
with open(test_file) as f:
@@ -102,7 +209,6 @@ if __name__ == "__main__":
if "serving" in str(test_file):
# this result is generated via `vllm bench serve` command
-
# attach the benchmarking command to raw_result
try:
with open(test_file.with_suffix(".commands")) as f:
@@ -110,12 +216,44 @@ if __name__ == "__main__":
except OSError as e:
print(e)
continue
+ # Parse Server Command Arg
+ out: dict[str, Any] = {
+ "server_command": parse_client_command(command["server_command"])
+ }
+ parse_args = [
+ "--tensor-parallel-size",
+ "--pipeline-parallel-size",
+ "--dtype",
+ ]
+ col_mapping = ["tp_size", "pp_size", "dtype"]
+ for index, arg in enumerate(parse_args):
+ if arg in out["server_command"]["args"]:
+ raw_result.update(
+ {col_mapping[index]: out["server_command"]["args"][arg]}
+ )
+ # Parse Client Command Arg
+ out: dict[str, Any] = {
+ "client_command": parse_client_command(command["client_command"])
+ }
+ parse_args = [
+ "--dataset-name",
+ "--random-input-len",
+ "--random-output-len",
+ "--request-rate",
+ ]
+ col_mapping = ["dataset_name", "input_len", "output_len", "qps"]
+
+ for index, arg in enumerate(parse_args):
+ if arg in out["client_command"]["args"]:
+ raw_result.update(
+ {col_mapping[index]: out["client_command"]["args"][arg]}
+ )
+ # Add Server, Client command
raw_result.update(command)
# update the test name of this result
raw_result.update({"test_name": test_file.stem})
-
# add the result to raw_result
serving_results.append(raw_result)
continue
@@ -205,7 +343,10 @@ if __name__ == "__main__":
columns=latency_column_mapping
)
if not serving_results.empty:
- serving_results = serving_results[list(serving_column_mapping.keys())].rename(
+ valid_columns = [
+ col for col in serving_column_mapping if col in serving_results.columns
+ ]
+ serving_results = serving_results[valid_columns].rename(
columns=serving_column_mapping
)
if not throughput_results.empty:
@@ -245,7 +386,9 @@ if __name__ == "__main__":
)
# document the result
- with open(results_folder / "benchmark_results.md", "w") as f:
+ md_file = "benchmark_results.md"
+ json_file = "benchmark_results.json"
+ with open(results_folder / md_file, "w") as f:
results = read_markdown(
"../.buildkite/nightly-benchmarks/"
+ "performance-benchmarks-descriptions.md"
@@ -260,7 +403,7 @@ if __name__ == "__main__":
f.write(results)
# document benchmarking results in json
- with open(results_folder / "benchmark_results.json", "w") as f:
+ with open(results_folder / json_file, "w") as f:
results = (
latency_results.to_dict(orient="records")
+ throughput_results.to_dict(orient="records")
diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
index 2c57666a81aa..b1b7d2d77a44 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -194,9 +194,11 @@ run_latency_tests() {
# check if there is enough GPU to run the test
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
- if [ "$ON_CPU" == "1" ];then
- if [[ $numa_count -lt $tp ]]; then
- echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
+ if [ "$ON_CPU" == "1" ]; then
+ pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
+ world_size=$(($tp*$pp))
+ if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
+ echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue
fi
else
@@ -261,9 +263,11 @@ run_throughput_tests() {
# check if there is enough GPU to run the test
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
- if [ "$ON_CPU" == "1" ];then
- if [[ $numa_count -lt $tp ]]; then
- echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
+ if [ "$ON_CPU" == "1" ]; then
+ pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
+ world_size=$(($tp*$pp))
+ if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
+ echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue
fi
else
@@ -329,12 +333,21 @@ run_serving_tests() {
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
+ max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
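+ # if max_concurrency_list is not specified, default to a single entry equal to num_prompts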
+ if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
+ num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
+ max_concurrency_list="[$num_prompts]"
+ fi
+ max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
+ echo "Running over max concurrency list $max_concurrency_list"
# check if there is enough resources to run the test
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
- if [ "$ON_CPU" == "1" ];then
- if [[ $numa_count -lt $tp ]]; then
- echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
+ if [ "$ON_CPU" == "1" ]; then
+ pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
+ world_size=$(($tp*$pp))
+ if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
+ echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue
fi
else
@@ -390,35 +403,39 @@ run_serving_tests() {
echo "now qps is $qps"
fi
- new_test_name=$test_name"_qps_"$qps
+ # iterate over different max_concurrency
+ for max_concurrency in $max_concurrency_list; do
+ new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
+ echo " new test name $new_test_name"
+ # pass the tensor parallel size to the client so that it can be displayed
+ # on the benchmark dashboard
+ client_command="vllm bench serve \
+ --save-result \
+ --result-dir $RESULTS_FOLDER \
+ --result-filename ${new_test_name}.json \
+ --request-rate $qps \
+ --max-concurrency $max_concurrency \
+ --metadata "tensor_parallel_size=$tp" \
+ $client_args $client_remote_args "
- # pass the tensor parallel size to the client so that it can be displayed
- # on the benchmark dashboard
- client_command="vllm bench serve \
- --save-result \
- --result-dir $RESULTS_FOLDER \
- --result-filename ${new_test_name}.json \
- --request-rate $qps \
- --metadata "tensor_parallel_size=$tp" \
- $client_args $client_remote_args "
+ echo "Running test case $test_name with qps $qps"
+ echo "Client command: $client_command"
- echo "Running test case $test_name with qps $qps"
- echo "Client command: $client_command"
+ bash -c "$client_command"
- bash -c "$client_command"
-
- # record the benchmarking commands
- jq_output=$(jq -n \
- --arg server "$server_command" \
- --arg client "$client_command" \
- --arg gpu "$gpu_type" \
- '{
- server_command: $server,
- client_command: $client,
- gpu_type: $gpu
- }')
- echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+ # record the benchmarking commands
+ jq_output=$(jq -n \
+ --arg server "$server_command" \
+ --arg client "$client_command" \
+ --arg gpu "$gpu_type" \
+ '{
+ server_command: $server,
+ client_command: $client,
+ gpu_type: $gpu
+ }')
+ echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+ done
done
# clean up
diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
index da93fdd1dbac..569117aae852 100644
--- a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
@@ -6,7 +6,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"num_iters_warmup": 5,
@@ -20,7 +20,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"num_iters_warmup": 5,
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
index dd0e24edff98..2d88a0b30c4f 100644
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
@@ -1,7 +1,8 @@
[
{
"test_name": "serving_llama8B_tp1_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -10,7 +11,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -23,17 +24,17 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -42,7 +43,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -55,17 +56,17 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp4_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -74,7 +75,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -87,17 +88,17 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp1_random_128_128",
- "qps_list": [1, 4, 16, "inf"],
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -106,7 +107,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -120,19 +121,19 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
- "max_concurrency": 1000,
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_tp2_random_128_128",
- "qps_list": [1, 4, 16, "inf"],
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -141,7 +142,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -155,19 +156,19 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
- "max_concurrency": 1000,
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_tp4_random_128_128",
- "qps_list": [1, 4, 16, "inf"],
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -176,7 +177,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -190,13 +191,11 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
- "ignore-eos": "",
- "max_concurrency": 1000,
"num_prompts": 1000
}
}
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
index f1bda65a7590..823abbaa99f8 100644
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
@@ -1,7 +1,8 @@
[
{
"test_name": "serving_llama8B_pp1_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -10,7 +11,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -23,17 +24,17 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_pp3_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -42,7 +43,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -55,17 +56,17 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "max_concurrency": 60,
"num_prompts": 200
}
},
{
- "test_name": "serving_llama8B_tp2pp6_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
+ "test_name": "serving_llama8B_tp2pp3_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -74,7 +75,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
@@ -88,17 +89,17 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_pp1_random_128_128",
- "qps_list": [1, 4, 16, "inf"],
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -107,7 +108,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -121,28 +122,28 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
- "max_concurrency": 1000,
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_pp3_random_128_128",
- "qps_list": [1, 4, 16, "inf"],
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL:": 1,
+ "VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -156,19 +157,19 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
- "max_concurrency": 1000,
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_tp2pp3_random_128_128",
- "qps_list": [1, 4, 16, "inf"],
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -177,7 +178,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
@@ -192,13 +193,12 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
- "max_concurrency": 1000,
"num_prompts": 1000
}
}
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json
index f150b9abeea4..e21c8df0a9fe 100644
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json
@@ -2,6 +2,7 @@
{
"test_name": "serving_llama8B_tp1_sharegpt",
"qps_list": [1, 4, 16, "inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -10,7 +11,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -23,17 +24,17 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"qps_list": [1, 4, 16, "inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -42,7 +43,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -55,17 +56,17 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp4_sharegpt",
"qps_list": [1, 4, 16, "inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -74,7 +75,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -87,17 +88,17 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp4_random_1024_128",
"qps_list": [1, 4, 16, "inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -106,7 +107,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -120,19 +121,19 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 1024,
"random-output-len": 128,
"ignore-eos": "",
- "max_concurrency": 100,
"num_prompts": 100
}
},
{
"test_name": "serving_llama8B_pp6_random_1024_128",
"qps_list": [1, 4, 16, "inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -141,7 +142,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 6,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -155,13 +156,12 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 1024,
"random-output-len": 128,
"ignore-eos": "",
- "max_concurrency": 100,
"num_prompts": 100
}
}
diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json
index f159c30637d3..48c015aa8403 100644
--- a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json
@@ -6,7 +6,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@@ -21,7 +21,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md
index 0ebd99ba5ae1..2bbed778f3c6 100644
--- a/docs/contributing/benchmarks.md
+++ b/docs/contributing/benchmarks.md
@@ -11,7 +11,7 @@ vLLM contains two sets of benchmarks:
The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM.
-The latest performance results are hosted on the public [vLLM Performance Dashboard](https://perf.vllm.ai).
+The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
More information on the performance benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).