mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-23 03:19:12 +08:00
Merge e41c10d5cffe898e83a47fdce9032aa33969bda3 into 254f6b986720c92ddf97fbb1a6a6465da8e87e29
This commit is contained in:
commit
5011eb74da
@ -176,19 +176,6 @@ If you do not see the table, please wait till the benchmark finish running.
|
||||
The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
|
||||
The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
|
||||
|
||||
The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
|
||||
When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
|
||||
`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
|
||||
If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
|
||||
#### Performance Results Comparison
|
||||
|
||||
Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps.
|
||||
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
|
||||
|
||||
| | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|
||||
|----|---------------------------------------|--------|-----|-----|------|-----|-----------|----------|----------|
|
||||
| 0 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982 | 156.526018 | 1.097396 |
|
||||
| 1 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf| 241.620334 | 294.018783 | 1.216863 |
|
||||
|
||||
A comparison diagram will be generated below the table.
|
||||
Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
|
||||
<img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
|
||||
Follow the instructions in [performance results comparison](https://docs.vllm.ai/en/latest/benchmarking/dashboard/#performance-results-comparison) to analyze performance results and the sizing guide.
|
||||
|
||||
@ -1,8 +1,13 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import html as _html
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from importlib import util
|
||||
|
||||
import pandas as pd
|
||||
@ -10,27 +15,49 @@ import pandas as pd
|
||||
pd.options.display.float_format = "{:.2f}".format
|
||||
plotly_found = util.find_spec("plotly.express") is not None
|
||||
|
||||
DEFAULT_INFO_COLS = [
|
||||
"Model",
|
||||
"Dataset Name",
|
||||
"Input Len",
|
||||
"Output Len",
|
||||
# "TP Size",
|
||||
# "PP Size",
|
||||
"# of max concurrency.",
|
||||
"qps",
|
||||
]
|
||||
|
||||
# Safety net: if any DataFrame leaks into to_html(), keep precision at 2.
|
||||
pd.set_option("display.precision", 2)
|
||||
pd.set_option("display.float_format", lambda x: f"{x:.2f}")
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# Core data compare
|
||||
# -----------------------------
|
||||
def compare_data_columns(
|
||||
files, name_column, data_column, info_cols, drop_column, debug=False
|
||||
files: list[str],
|
||||
name_column: str,
|
||||
data_column: str,
|
||||
info_cols: list[str],
|
||||
drop_column: str,
|
||||
debug: bool = False,
|
||||
):
|
||||
"""
|
||||
Align concatenation by keys derived from info_cols instead of row order.
|
||||
- Pick one canonical key list: subset of info_cols present in ALL files.
|
||||
- For each file: set index to those keys, aggregate duplicates
|
||||
- (mean for metric, first for names).
|
||||
(mean for metric, first for names).
|
||||
- Concat along axis=1 (indexes align), then reset_index so callers can
|
||||
- group by columns.
|
||||
group by columns.
|
||||
- If --debug, add a <file_label>_name column per file.
|
||||
"""
|
||||
print("\ncompare_data_column:", data_column)
|
||||
|
||||
frames = []
|
||||
raw_data_cols = []
|
||||
raw_data_cols: list[str] = []
|
||||
compare_frames = []
|
||||
|
||||
# 1) choose a canonical key list from info_cols that exists in ALL files
|
||||
cols_per_file = []
|
||||
cols_per_file: list[set] = []
|
||||
for f in files:
|
||||
try:
|
||||
df_tmp = pd.read_json(f, orient="records")
|
||||
@ -40,24 +67,20 @@ def compare_data_columns(
|
||||
|
||||
key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
|
||||
if not key_cols:
|
||||
# soft fallback: use any info_cols present in the first file
|
||||
key_cols = [c for c in info_cols if c in list(cols_per_file[0])]
|
||||
if not key_cols:
|
||||
raise ValueError(
|
||||
"No common key columns found from info_cols across the input files."
|
||||
)
|
||||
|
||||
# 2) build a single "meta" block (keys as columns) once, aligned by the key index
|
||||
meta_added = False
|
||||
|
||||
for file in files:
|
||||
df = pd.read_json(file, orient="records")
|
||||
|
||||
# Keep rows that actually have the compared metric (same as original behavior)
|
||||
if drop_column in df.columns:
|
||||
df = df.dropna(subset=[drop_column], ignore_index=True)
|
||||
|
||||
# Stabilize numeric key columns (harmless if missing)
|
||||
for c in (
|
||||
"Input Len",
|
||||
"Output Len",
|
||||
@ -69,32 +92,26 @@ def compare_data_columns(
|
||||
if c in df.columns:
|
||||
df[c] = pd.to_numeric(df[c], errors="coerce")
|
||||
|
||||
# Ensure all key columns exist
|
||||
for c in key_cols:
|
||||
if c not in df.columns:
|
||||
df[c] = pd.NA
|
||||
|
||||
# Set index = key_cols and aggregate duplicates → unique MultiIndex
|
||||
df_idx = df.set_index(key_cols, drop=False)
|
||||
|
||||
# meta (key columns), unique per key
|
||||
meta = df_idx[key_cols]
|
||||
if not meta.index.is_unique:
|
||||
meta = meta.groupby(level=key_cols, dropna=False).first()
|
||||
|
||||
# metric series for this file, aggregated to one row per key
|
||||
file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
|
||||
s = df_idx[data_column]
|
||||
if not s.index.is_unique:
|
||||
s = s.groupby(level=key_cols, dropna=False).mean()
|
||||
s.name = file_label # column label like original
|
||||
s.name = file_label
|
||||
|
||||
# add meta once (from first file) so keys are the leftmost columns
|
||||
if not meta_added:
|
||||
frames.append(meta)
|
||||
meta_added = True
|
||||
|
||||
# (NEW) debug: aligned test-name column per file
|
||||
if debug and name_column in df_idx.columns:
|
||||
name_s = df_idx[name_column]
|
||||
if not name_s.index.is_unique:
|
||||
@ -106,26 +123,19 @@ def compare_data_columns(
|
||||
raw_data_cols.append(file_label)
|
||||
compare_frames.append(s)
|
||||
|
||||
# Generalize ratio: for any file N>=2, add ratio (fileN / file1)
|
||||
if len(compare_frames) >= 2:
|
||||
base = compare_frames[0]
|
||||
current = compare_frames[-1]
|
||||
if "P99" in data_column or "Median" in data_column:
|
||||
ratio = base / current # for latency
|
||||
ratio = base / current
|
||||
else:
|
||||
ratio = current / base
|
||||
ratio = ratio.mask(base == 0) # avoid inf when baseline is 0
|
||||
ratio = ratio.mask(base == 0)
|
||||
ratio.name = f"Ratio 1 vs {len(compare_frames)}"
|
||||
frames.append(ratio)
|
||||
|
||||
# 4) concat on columns with aligned MultiIndex;
|
||||
# then reset_index to return keys as columns
|
||||
concat_df = pd.concat(frames, axis=1)
|
||||
concat_df = concat_df.reset_index(drop=True).reset_index()
|
||||
if "index" in concat_df.columns:
|
||||
concat_df = concat_df.drop(columns=["index"])
|
||||
concat_df = pd.concat(frames, axis=1).reset_index(drop=True)
|
||||
|
||||
# Ensure key/info columns appear first (in your info_cols order)
|
||||
front = [c for c in info_cols if c in concat_df.columns]
|
||||
rest = [c for c in concat_df.columns if c not in front]
|
||||
concat_df = concat_df[front + rest]
|
||||
@ -134,20 +144,15 @@ def compare_data_columns(
|
||||
return concat_df, raw_data_cols
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# Split helper
|
||||
# -----------------------------
|
||||
def split_json_by_tp_pp(
|
||||
input_file: str = "benchmark_results.json", output_root: str = "."
|
||||
) -> list[str]:
|
||||
"""
|
||||
Split a benchmark JSON into separate folders by (TP Size, PP Size).
|
||||
|
||||
Creates: <output_root>/tp{TP}_pp{PP}/benchmark_results.json
|
||||
Returns: list of file paths written.
|
||||
"""
|
||||
# Load JSON data into DataFrame
|
||||
with open(input_file, encoding="utf-8") as f:
|
||||
data = json.load(f)
|
||||
|
||||
# If the JSON is a dict with a list under common keys, use that list
|
||||
if isinstance(data, dict):
|
||||
for key in ("results", "serving_results", "benchmarks", "data"):
|
||||
if isinstance(data.get(key), list):
|
||||
@ -156,7 +161,6 @@ def split_json_by_tp_pp(
|
||||
|
||||
df = pd.DataFrame(data)
|
||||
|
||||
# Keep only "serving" tests
|
||||
name_col = next(
|
||||
(c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
|
||||
)
|
||||
@ -165,7 +169,6 @@ def split_json_by_tp_pp(
|
||||
df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
|
||||
].copy()
|
||||
|
||||
# Handle alias column names
|
||||
rename_map = {
|
||||
"tp_size": "TP Size",
|
||||
"tensor_parallel_size": "TP Size",
|
||||
@ -176,21 +179,14 @@ def split_json_by_tp_pp(
|
||||
columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True
|
||||
)
|
||||
|
||||
# Ensure TP/PP columns exist (default to 1 if missing)
|
||||
if "TP Size" not in df.columns:
|
||||
df["TP Size"] = 1
|
||||
if "PP Size" not in df.columns:
|
||||
df["PP Size"] = 1
|
||||
|
||||
# make sure TP/PP are numeric ints with no NaN
|
||||
df["TP Size"] = (
|
||||
pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int)
|
||||
)
|
||||
df["PP Size"] = (
|
||||
pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int)
|
||||
)
|
||||
df["TP Size"] = pd.to_numeric(df["TP Size"], errors="coerce").fillna(1).astype(int)
|
||||
df["PP Size"] = pd.to_numeric(df["PP Size"], errors="coerce").fillna(1).astype(int)
|
||||
|
||||
# Split into separate folders
|
||||
saved_paths: list[str] = []
|
||||
for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False):
|
||||
folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
|
||||
@ -203,32 +199,9 @@ def split_json_by_tp_pp(
|
||||
return saved_paths
|
||||
|
||||
|
||||
def _add_limit_line(fig, y_value, label):
|
||||
# Visible dashed line + annotation
|
||||
fig.add_hline(
|
||||
y=y_value,
|
||||
line_dash="dash",
|
||||
line_color="red" if "ttft" in label.lower() else "blue",
|
||||
annotation_text=f"{label}: {y_value} ms",
|
||||
annotation_position="top left",
|
||||
)
|
||||
# Optional: add a legend item (as a transparent helper trace)
|
||||
if plot and plotly_found:
|
||||
import plotly.graph_objects as go
|
||||
|
||||
fig.add_trace(
|
||||
go.Scatter(
|
||||
x=[None],
|
||||
y=[None],
|
||||
mode="lines",
|
||||
line=dict(
|
||||
dash="dash", color="red" if "ttft" in label.lower() else "blue"
|
||||
),
|
||||
name=f"{label}",
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# Styling helpers
|
||||
# -----------------------------
|
||||
def _find_concurrency_col(df: pd.DataFrame) -> str:
|
||||
for c in [
|
||||
"# of max concurrency.",
|
||||
@ -239,7 +212,6 @@ def _find_concurrency_col(df: pd.DataFrame) -> str:
|
||||
]:
|
||||
if c in df.columns:
|
||||
return c
|
||||
# Fallback: guess an integer-like column (harmless if unused)
|
||||
for c in df.columns:
|
||||
if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
|
||||
return c
|
||||
@ -248,8 +220,7 @@ def _find_concurrency_col(df: pd.DataFrame) -> str:
|
||||
|
||||
def _highlight_threshold(
|
||||
df: pd.DataFrame, threshold: float
|
||||
) -> "pd.io.formats.style.Styler":
|
||||
"""Highlight numeric per-configuration columns with value <= threshold."""
|
||||
) -> pd.io.formats.style.Styler:
|
||||
conc_col = _find_concurrency_col(df)
|
||||
key_cols = [
|
||||
c
|
||||
@ -260,6 +231,7 @@ def _highlight_threshold(
|
||||
c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")
|
||||
]
|
||||
conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
|
||||
|
||||
return df.style.map(
|
||||
lambda v: "background-color:#e6ffe6;font-weight:bold;"
|
||||
if pd.notna(v) and v <= threshold
|
||||
@ -268,7 +240,264 @@ def _highlight_threshold(
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
def highlight_ratio_columns(styler: pd.io.formats.style.Styler):
|
||||
ratio_cols = [c for c in styler.data.columns if "ratio" in str(c).lower()]
|
||||
if not ratio_cols:
|
||||
return styler
|
||||
|
||||
styler = styler.apply(
|
||||
lambda _: ["background-color: #fff3b0"] * len(styler.data),
|
||||
subset=ratio_cols,
|
||||
axis=0,
|
||||
)
|
||||
|
||||
styler = styler.set_table_styles(
|
||||
[
|
||||
{
|
||||
"selector": f"th.col_heading.level0.col{i}",
|
||||
"props": [("background-color", "#fff3b0")],
|
||||
}
|
||||
for i, col in enumerate(styler.data.columns)
|
||||
if col in ratio_cols
|
||||
],
|
||||
overwrite=False,
|
||||
)
|
||||
return styler
|
||||
|
||||
|
||||
def _apply_two_decimals(
|
||||
styler: pd.io.formats.style.Styler,
|
||||
) -> pd.io.formats.style.Styler:
|
||||
df = styler.data
|
||||
num_cols = df.select_dtypes("number").columns
|
||||
if len(num_cols) == 0:
|
||||
return styler
|
||||
return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# Valid max concurrency summary helpers
|
||||
# -----------------------------
|
||||
def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:
|
||||
key_cols = [
|
||||
c
|
||||
for c in ["Model", "Dataset Name", "Input Len", "Output Len"]
|
||||
if c in df.columns
|
||||
]
|
||||
exclude = set(key_cols + [conc_col, "qps", "QPS"])
|
||||
|
||||
cols: list[str] = []
|
||||
for c in df.columns:
|
||||
if c in exclude:
|
||||
continue
|
||||
lc = str(c).lower()
|
||||
if lc.startswith("ratio"):
|
||||
continue
|
||||
if lc.endswith("_name") or lc == "test name" or lc == "test_name":
|
||||
continue
|
||||
if pd.api.types.is_numeric_dtype(df[c]):
|
||||
cols.append(c)
|
||||
return cols
|
||||
|
||||
|
||||
def _max_concurrency_ok(
|
||||
df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float
|
||||
):
|
||||
if df is None or conc_col not in df.columns or cfg_col not in df.columns:
|
||||
return pd.NA
|
||||
|
||||
d = df[[conc_col, cfg_col]].copy()
|
||||
d[conc_col] = pd.to_numeric(d[conc_col], errors="coerce")
|
||||
d[cfg_col] = pd.to_numeric(d[cfg_col], errors="coerce")
|
||||
d = d.dropna(subset=[conc_col, cfg_col])
|
||||
|
||||
if d.empty:
|
||||
return pd.NA
|
||||
|
||||
ok = d[d[cfg_col] <= threshold]
|
||||
if ok.empty:
|
||||
return pd.NA
|
||||
|
||||
return ok[conc_col].max()
|
||||
|
||||
|
||||
def _value_at_concurrency(df: pd.DataFrame, conc_col: str, cfg_col: str, conc_value):
|
||||
if (
|
||||
df is None
|
||||
or conc_col not in df.columns
|
||||
or cfg_col not in df.columns
|
||||
or pd.isna(conc_value)
|
||||
):
|
||||
return pd.NA
|
||||
|
||||
d = df[[conc_col, cfg_col]].copy()
|
||||
d[conc_col] = pd.to_numeric(d[conc_col], errors="coerce")
|
||||
d[cfg_col] = pd.to_numeric(d[cfg_col], errors="coerce")
|
||||
|
||||
conc_value = pd.to_numeric(conc_value, errors="coerce")
|
||||
if pd.isna(conc_value):
|
||||
return pd.NA
|
||||
|
||||
hit = d[d[conc_col] == conc_value]
|
||||
if hit.empty:
|
||||
return pd.NA
|
||||
return hit[cfg_col].iloc[0]
|
||||
|
||||
|
||||
def build_valid_max_concurrency_summary_html(
|
||||
tput_group_df: pd.DataFrame | None,
|
||||
ttft_group_df: pd.DataFrame | None,
|
||||
tpot_group_df: pd.DataFrame | None,
|
||||
conc_col: str,
|
||||
args,
|
||||
) -> str:
|
||||
if ttft_group_df is None and tpot_group_df is None:
|
||||
return ""
|
||||
|
||||
ttft_cols = (
|
||||
_config_value_columns(ttft_group_df, conc_col)
|
||||
if ttft_group_df is not None
|
||||
else []
|
||||
)
|
||||
tpot_cols = (
|
||||
_config_value_columns(tpot_group_df, conc_col)
|
||||
if tpot_group_df is not None
|
||||
else []
|
||||
)
|
||||
tput_cols = (
|
||||
_config_value_columns(tput_group_df, conc_col)
|
||||
if tput_group_df is not None
|
||||
else []
|
||||
)
|
||||
|
||||
if ttft_group_df is not None and tpot_group_df is not None:
|
||||
cfg_cols = [c for c in ttft_cols if c in tpot_cols]
|
||||
if tput_group_df is not None:
|
||||
cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols
|
||||
else:
|
||||
cfg_cols = ttft_cols or tpot_cols
|
||||
|
||||
if not cfg_cols:
|
||||
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
|
||||
|
||||
rows = []
|
||||
for cfg in cfg_cols:
|
||||
ttft_max = (
|
||||
_max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
|
||||
if ttft_group_df is not None
|
||||
else pd.NA
|
||||
)
|
||||
tpot_max = (
|
||||
_max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
|
||||
if tpot_group_df is not None
|
||||
else pd.NA
|
||||
)
|
||||
both = (
|
||||
pd.NA
|
||||
if (pd.isna(ttft_max) or pd.isna(tpot_max))
|
||||
else min(ttft_max, tpot_max)
|
||||
)
|
||||
|
||||
tput_at_both = (
|
||||
_value_at_concurrency(tput_group_df, conc_col, cfg, both)
|
||||
if tput_group_df is not None
|
||||
else pd.NA
|
||||
)
|
||||
ttft_at_both = (
|
||||
_value_at_concurrency(ttft_group_df, conc_col, cfg, both)
|
||||
if ttft_group_df is not None
|
||||
else pd.NA
|
||||
)
|
||||
tpot_at_both = (
|
||||
_value_at_concurrency(tpot_group_df, conc_col, cfg, both)
|
||||
if tpot_group_df is not None
|
||||
else pd.NA
|
||||
)
|
||||
|
||||
rows.append(
|
||||
{
|
||||
"Configuration": cfg,
|
||||
f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
|
||||
f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
|
||||
f"Max {conc_col} (Both)": both,
|
||||
"Output Tput @ Both (tok/s)": tput_at_both,
|
||||
"TTFT @ Both (ms)": ttft_at_both,
|
||||
"TPOT @ Both (ms)": tpot_at_both,
|
||||
}
|
||||
)
|
||||
|
||||
summary_df = pd.DataFrame(rows)
|
||||
|
||||
# --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
|
||||
for c in summary_df.columns:
|
||||
if c == "Configuration":
|
||||
continue
|
||||
summary_df[c] = pd.to_numeric(summary_df[c], errors="coerce")
|
||||
|
||||
both_col = f"Max {conc_col} (Both)"
|
||||
|
||||
# --- Strict 2-decimal formatting for ALL non-Configuration columns ---
|
||||
formatters = {}
|
||||
for c in summary_df.columns:
|
||||
if c == "Configuration":
|
||||
continue
|
||||
# default argument binds per-column formatter correctly
|
||||
formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"
|
||||
|
||||
styler = summary_df.style.format(formatters)
|
||||
|
||||
def _green(v):
|
||||
return "background-color:#e6ffe6;font-weight:bold;" if pd.notna(v) else ""
|
||||
|
||||
if both_col in summary_df.columns:
|
||||
styler = styler.map(_green, subset=[both_col])
|
||||
|
||||
title = (
|
||||
'<div style="font-size: 1.15em; font-weight: 700; margin: 12px 0 6px 0;">'
|
||||
"Valid Max Concurrency Summary"
|
||||
"</div>\n"
|
||||
)
|
||||
return title + styler.to_html(table_attributes='border="1" class="dataframe"')
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# Plot helper
|
||||
# -----------------------------
|
||||
def _add_limit_line(fig, y_value: float, label: str):
|
||||
fig.add_hline(
|
||||
y=y_value,
|
||||
line_dash="dash",
|
||||
line_color="red" if "ttft" in label.lower() else "blue",
|
||||
annotation_text=f"{label}: {y_value} ms",
|
||||
annotation_position="top left",
|
||||
)
|
||||
if plotly_found:
|
||||
import plotly.graph_objects as go
|
||||
|
||||
fig.add_trace(
|
||||
go.Scatter(
|
||||
x=[None],
|
||||
y=[None],
|
||||
mode="lines",
|
||||
line=dict(
|
||||
dash="dash",
|
||||
color="red" if "ttft" in label.lower() else "blue",
|
||||
),
|
||||
name=label,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# Refactored main + group-first report
|
||||
# -----------------------------
|
||||
@dataclass(frozen=True)
|
||||
class MetricPlan:
|
||||
data_cols: list[str]
|
||||
drop_column: str
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"-f", "--file", action="append", type=str, help="input file name"
|
||||
@ -308,149 +537,289 @@ if __name__ == "__main__":
|
||||
default=100.0,
|
||||
help="Reference limit for TPOT plots (ms)",
|
||||
)
|
||||
return parser
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
def choose_metrics(latency: str) -> MetricPlan:
|
||||
latency = (latency or "").lower()
|
||||
drop_column = "P99"
|
||||
name_column = "Test name"
|
||||
info_cols = [
|
||||
"Model",
|
||||
"Dataset Name",
|
||||
"Input Len",
|
||||
"Output Len",
|
||||
"TP Size",
|
||||
"PP Size",
|
||||
"# of max concurrency.",
|
||||
"qps",
|
||||
]
|
||||
|
||||
if "median" in args.latency:
|
||||
data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
|
||||
html_msgs_for_data_cols = [
|
||||
"Compare Output Tokens /n",
|
||||
"Median TTFT /n",
|
||||
"Median TPOT /n",
|
||||
]
|
||||
drop_column = "P99"
|
||||
elif "p99" in args.latency:
|
||||
data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"]
|
||||
html_msgs_for_data_cols = [
|
||||
"Compare Output Tokens /n",
|
||||
"P99 TTFT /n",
|
||||
"P99 TPOT /n",
|
||||
]
|
||||
if "median" in latency:
|
||||
return MetricPlan(
|
||||
data_cols=["Output Tput (tok/s)", "Median TTFT (ms)", "Median"],
|
||||
drop_column=drop_column,
|
||||
)
|
||||
|
||||
return MetricPlan(
|
||||
data_cols=["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"],
|
||||
drop_column=drop_column,
|
||||
)
|
||||
|
||||
|
||||
def prepare_input_files(args, info_cols: list[str]) -> tuple[list[str], list[str]]:
|
||||
if not args.file:
|
||||
raise ValueError("No input files provided. Use -f/--file.")
|
||||
|
||||
if len(args.file) == 1:
|
||||
files = split_json_by_tp_pp(args.file[0], output_root="splits")
|
||||
info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
|
||||
else:
|
||||
files = args.file
|
||||
|
||||
return files, info_cols
|
||||
|
||||
|
||||
def get_y_axis_col(info_cols: list[str], xaxis: str) -> str:
|
||||
y_axis_index = info_cols.index(xaxis) if xaxis in info_cols else 6
|
||||
return info_cols[y_axis_index]
|
||||
|
||||
|
||||
def get_group_cols(output_df: pd.DataFrame, info_cols: list[str]) -> list[str]:
|
||||
filtered_info_cols = info_cols[:4]
|
||||
group_cols = [c for c in filtered_info_cols if c in output_df.columns]
|
||||
if not group_cols:
|
||||
raise ValueError(
|
||||
f"No valid group-by columns. Expected subset: {filtered_info_cols}, "
|
||||
f"but DataFrame has: {list(output_df.columns)}"
|
||||
)
|
||||
return group_cols
|
||||
|
||||
|
||||
def normalize_group_key(name):
|
||||
return name if isinstance(name, tuple) else (name,)
|
||||
|
||||
|
||||
def group_filename(name, prefix: str = "perf_comparison_") -> str:
|
||||
name_vals = normalize_group_key(name)
|
||||
safe = ",".join(map(str, name_vals)).replace(",", "_").replace("/", "-")
|
||||
return f"{prefix}{safe}.html"
|
||||
|
||||
|
||||
def build_group_suffix(group_cols: list[str], name) -> str:
|
||||
name_vals = normalize_group_key(name)
|
||||
return " , ".join(f"{col} : [ {val} ] " for col, val in zip(group_cols, name_vals))
|
||||
|
||||
|
||||
def render_metric_table_html(
|
||||
display_group: pd.DataFrame,
|
||||
metric_label: str,
|
||||
group_suffix: str,
|
||||
args,
|
||||
) -> str:
|
||||
title = (
|
||||
f'<div style="font-size: 1.25em; font-weight: 600; margin: 12px 0;">'
|
||||
f"{_html.escape(metric_label)}"
|
||||
f" — {_html.escape(group_suffix)}"
|
||||
f"</div>\n"
|
||||
)
|
||||
|
||||
metric_name = metric_label.lower()
|
||||
if "ttft" in metric_name:
|
||||
styler = _highlight_threshold(display_group, args.ttft_max_ms)
|
||||
elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
|
||||
styler = _highlight_threshold(display_group, args.tpot_max_ms)
|
||||
else:
|
||||
styler = display_group.style
|
||||
|
||||
styler = _apply_two_decimals(styler)
|
||||
styler = highlight_ratio_columns(styler)
|
||||
|
||||
return title + styler.to_html(table_attributes='border="1" class="dataframe"')
|
||||
|
||||
|
||||
def maybe_write_plot(
|
||||
main_fh,
|
||||
sub_fh,
|
||||
group_df: pd.DataFrame,
|
||||
raw_data_cols: list[str],
|
||||
metric_label: str,
|
||||
y_axis_col: str,
|
||||
args,
|
||||
):
|
||||
if not (args.plot and plotly_found):
|
||||
return
|
||||
|
||||
import plotly.express as px
|
||||
|
||||
df = group_df[raw_data_cols].sort_values(by=y_axis_col)
|
||||
df_melted = df.melt(
|
||||
id_vars=y_axis_col,
|
||||
var_name="Configuration",
|
||||
value_name=metric_label,
|
||||
)
|
||||
|
||||
fig = px.line(
|
||||
df_melted,
|
||||
x=y_axis_col,
|
||||
y=metric_label,
|
||||
color="Configuration",
|
||||
title=f"{metric_label} vs {y_axis_col}",
|
||||
markers=True,
|
||||
)
|
||||
|
||||
# Ensure plot hover + y tick labels are also 2 decimals.
|
||||
fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>")
|
||||
fig.update_yaxes(tickformat=".2f")
|
||||
|
||||
metric_name = metric_label.lower()
|
||||
if "ttft" in metric_name:
|
||||
_add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
|
||||
elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
|
||||
_add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
|
||||
|
||||
html = fig.to_html(full_html=True, include_plotlyjs="cdn")
|
||||
main_fh.write(html)
|
||||
sub_fh.write(html)
|
||||
|
||||
|
||||
def build_group_keys(
|
||||
df: pd.DataFrame, group_cols: list[str], sort_cols: list[str] | None = None
|
||||
):
|
||||
if sort_cols:
|
||||
df = df.sort_values(by=sort_cols)
|
||||
gb = df.groupby(group_cols, dropna=False)
|
||||
return [k for k, _ in gb]
|
||||
|
||||
|
||||
def write_report_group_first(
|
||||
files: list[str], info_cols: list[str], plan: MetricPlan, args
|
||||
):
|
||||
name_column = "Test name"
|
||||
y_axis_col = get_y_axis_col(info_cols, args.xaxis)
|
||||
|
||||
print("comparing : " + ", ".join(files))
|
||||
debug = args.debug
|
||||
plot = args.plot
|
||||
# For Plot feature, assign y axis from one of info_cols
|
||||
y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6
|
||||
with open("perf_comparison.html", "w") as text_file:
|
||||
for i in range(len(data_cols_to_compare)):
|
||||
output_df, raw_data_cols = compare_data_columns(
|
||||
files,
|
||||
name_column,
|
||||
data_cols_to_compare[i],
|
||||
info_cols,
|
||||
drop_column,
|
||||
debug=debug,
|
||||
|
||||
metric_cache: dict[str, tuple[pd.DataFrame, list[str]]] = {}
|
||||
group_cols_canonical: list[str] | None = None
|
||||
|
||||
for metric_label in plan.data_cols:
|
||||
output_df, raw_data_cols = compare_data_columns(
|
||||
files,
|
||||
name_column,
|
||||
metric_label,
|
||||
info_cols,
|
||||
plan.drop_column,
|
||||
debug=args.debug,
|
||||
)
|
||||
|
||||
raw_data_cols = list(raw_data_cols)
|
||||
raw_data_cols.insert(0, y_axis_col)
|
||||
|
||||
group_cols = get_group_cols(output_df, info_cols)
|
||||
if group_cols_canonical is None:
|
||||
group_cols_canonical = group_cols
|
||||
else:
|
||||
group_cols_canonical = [c for c in group_cols_canonical if c in group_cols]
|
||||
|
||||
metric_cache[metric_label] = (
|
||||
output_df.sort_values(by=args.xaxis),
|
||||
raw_data_cols,
|
||||
)
|
||||
|
||||
if not group_cols_canonical:
|
||||
raise ValueError("No canonical group columns found across metrics.")
|
||||
|
||||
first_metric = plan.data_cols[0]
|
||||
first_df_sorted, _ = metric_cache[first_metric]
|
||||
group_keys = build_group_keys(
|
||||
first_df_sorted, group_cols_canonical, sort_cols=[args.xaxis]
|
||||
)
|
||||
|
||||
metric_groupbys = {
|
||||
metric_label: df.groupby(group_cols_canonical, dropna=False)
|
||||
for metric_label, (df, _) in metric_cache.items()
|
||||
}
|
||||
|
||||
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
|
||||
main_fh.write('<meta charset="utf-8">\n')
|
||||
for gkey in group_keys:
|
||||
gkey_tuple = normalize_group_key(gkey)
|
||||
suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
|
||||
sub_path = group_filename(gkey_tuple)
|
||||
group_header = (
|
||||
'<div style="font-size: 1.4em; font-weight: 700; '
|
||||
'margin: 18px 0 10px 0;">'
|
||||
f"{_html.escape(suffix)}"
|
||||
"</div>\n"
|
||||
)
|
||||
|
||||
# For Plot feature, insert y axis from one of info_cols
|
||||
raw_data_cols.insert(0, info_cols[y_axis_index])
|
||||
main_fh.write(group_header)
|
||||
with open(sub_path, "w", encoding="utf-8") as sub_fh:
|
||||
sub_fh.write('<meta charset="utf-8">\n')
|
||||
sub_fh.write(group_header)
|
||||
tput_group_df = None
|
||||
ttft_group_df = None
|
||||
tpot_group_df = None
|
||||
conc_col = args.xaxis
|
||||
|
||||
filtered_info_cols = info_cols[:-2]
|
||||
existing_group_cols = [
|
||||
c for c in filtered_info_cols if c in output_df.columns
|
||||
]
|
||||
if not existing_group_cols:
|
||||
raise ValueError(
|
||||
f"No valid group-by columns "
|
||||
f"Expected subset: {filtered_info_cols}, "
|
||||
f"but DataFrame has: {list(output_df.columns)}"
|
||||
for metric_label in plan.data_cols:
|
||||
gb = metric_groupbys[metric_label]
|
||||
df_sorted, raw_data_cols = metric_cache[metric_label]
|
||||
|
||||
try:
|
||||
group_df = gb.get_group(gkey)
|
||||
except KeyError:
|
||||
missing = (
|
||||
'<div style="font-size: 1.1em; font-weight: 600; '
|
||||
'margin: 10px 0;">'
|
||||
f"{_html.escape(metric_label)} — missing for this group"
|
||||
"</div>\n"
|
||||
)
|
||||
|
||||
main_fh.write(missing)
|
||||
sub_fh.write(missing)
|
||||
continue
|
||||
|
||||
if conc_col not in group_df.columns:
|
||||
conc_col = _find_concurrency_col(group_df)
|
||||
|
||||
mn = metric_label.lower().strip()
|
||||
if "tok/s" in mn:
|
||||
tput_group_df = group_df
|
||||
elif "ttft" in mn:
|
||||
ttft_group_df = group_df
|
||||
elif mn in ("p99", "median") or "tpot" in mn:
|
||||
tpot_group_df = group_df
|
||||
|
||||
display_group = group_df.drop(
|
||||
columns=group_cols_canonical, errors="ignore"
|
||||
)
|
||||
|
||||
html = render_metric_table_html(
|
||||
display_group, metric_label, suffix, args
|
||||
)
|
||||
main_fh.write(html)
|
||||
sub_fh.write(html)
|
||||
|
||||
maybe_write_plot(
|
||||
main_fh,
|
||||
sub_fh,
|
||||
group_df=group_df,
|
||||
raw_data_cols=raw_data_cols,
|
||||
metric_label=metric_label,
|
||||
y_axis_col=y_axis_col,
|
||||
args=args,
|
||||
)
|
||||
|
||||
summary_html = build_valid_max_concurrency_summary_html(
|
||||
tput_group_df=tput_group_df,
|
||||
ttft_group_df=ttft_group_df,
|
||||
tpot_group_df=tpot_group_df,
|
||||
conc_col=conc_col,
|
||||
args=args,
|
||||
)
|
||||
# output_df_sorted = output_df.sort_values(by=existing_group_cols)
|
||||
output_df_sorted = output_df.sort_values(by=args.xaxis)
|
||||
output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
|
||||
for name, group in output_groups:
|
||||
group_name = (
|
||||
",".join(map(str, name)).replace(",", "_").replace("/", "-")
|
||||
)
|
||||
group_html_name = "perf_comparison_" + group_name + ".html"
|
||||
if summary_html:
|
||||
main_fh.write(summary_html)
|
||||
sub_fh.write(summary_html)
|
||||
|
||||
metric_name = str(data_cols_to_compare[i]).lower()
|
||||
if "tok/s" in metric_name:
|
||||
html = group.to_html()
|
||||
elif "ttft" in metric_name:
|
||||
styler = _highlight_threshold(group, args.ttft_max_ms).format(
|
||||
{c: "{:.2f}" for c in group.select_dtypes("number").columns},
|
||||
na_rep="—",
|
||||
)
|
||||
html = styler.to_html(
|
||||
table_attributes='border="1" class="dataframe"'
|
||||
)
|
||||
elif (
|
||||
"tpot" in metric_name
|
||||
or "median" in metric_name
|
||||
or "p99" in metric_name
|
||||
):
|
||||
styler = _highlight_threshold(group, args.tpot_max_ms).format(
|
||||
{c: "{:.2f}" for c in group.select_dtypes("number").columns},
|
||||
na_rep="—",
|
||||
)
|
||||
html = styler.to_html(
|
||||
table_attributes='border="1" class="dataframe"'
|
||||
)
|
||||
|
||||
text_file.write(html_msgs_for_data_cols[i])
|
||||
text_file.write(html)
|
||||
with open(group_html_name, "a+") as sub_text_file:
|
||||
sub_text_file.write(html_msgs_for_data_cols[i])
|
||||
sub_text_file.write(html)
|
||||
def main():
|
||||
args = build_parser().parse_args()
|
||||
info_cols = list(DEFAULT_INFO_COLS)
|
||||
plan = choose_metrics(args.latency)
|
||||
files, info_cols = prepare_input_files(args, info_cols)
|
||||
write_report_group_first(files, info_cols, plan, args)
|
||||
|
||||
if plot and plotly_found:
|
||||
import plotly.express as px
|
||||
|
||||
df = group[raw_data_cols]
|
||||
df_sorted = df.sort_values(by=info_cols[y_axis_index])
|
||||
# Melt DataFrame for plotting
|
||||
df_melted = df_sorted.melt(
|
||||
id_vars=info_cols[y_axis_index],
|
||||
var_name="Configuration",
|
||||
value_name=data_cols_to_compare[i],
|
||||
)
|
||||
title = (
|
||||
data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
|
||||
)
|
||||
# Create Plotly line chart
|
||||
fig = px.line(
|
||||
df_melted,
|
||||
x=info_cols[y_axis_index],
|
||||
y=data_cols_to_compare[i],
|
||||
color="Configuration",
|
||||
title=title,
|
||||
markers=True,
|
||||
)
|
||||
|
||||
# ---- Add threshold lines based on metric name ----
|
||||
if "ttft" in metric_name:
|
||||
_add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
|
||||
elif (
|
||||
"tpot" in metric_name
|
||||
or "median" in metric_name
|
||||
or "p99" in metric_name
|
||||
):
|
||||
_add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
|
||||
|
||||
# Export to HTML
|
||||
text_file.write(
|
||||
fig.to_html(full_html=True, include_plotlyjs="cdn")
|
||||
)
|
||||
sub_text_file.write(
|
||||
fig.to_html(full_html=True, include_plotlyjs="cdn")
|
||||
)
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@ -19,10 +19,8 @@
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
"max_num_seqs": 256
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
@ -151,6 +149,45 @@
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp2_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp4_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama3B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
|
||||
@ -40,7 +40,58 @@ When run, benchmark script generates results under **benchmark/results** folder,
|
||||
- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
|
||||
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
|
||||
|
||||
For more results visualization, check the [visualizing the results](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md#visualizing-the-results).
|
||||
### Visualization
|
||||
|
||||
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table with real benchmarking results.
|
||||
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
|
||||
If you do not see the table, please wait till the benchmark finish running.
|
||||
The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
|
||||
The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
|
||||
|
||||
#### Performance Results Comparison
|
||||
|
||||
The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
|
||||
When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
|
||||
`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
|
||||
If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
|
||||
|
||||
Here is an example using the script to compare result_a and result_b with max concurrency and qps for same Model, Dataset name, input/output length.
|
||||
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
|
||||
|
||||
***Output Tput (tok/s) — Model : [ meta-llama/Llama-3.1-8B-Instruct ] , Dataset Name : [ random ] , Input Len : [ 2048.0 ] , Output Len : [ 2048.0 ]***
|
||||
|
||||
| | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|
||||
|----|------|-----|-----------|----------|----------|
|
||||
| 0 | 12 | inf | 24.98 | 186.03 | 7.45 |
|
||||
| 1 | 16 | inf| 25.49 | 246.92 | 9.69 |
|
||||
| 2 | 24 | inf| 27.74 | 293.34 | 10.57 |
|
||||
| 3 | 32 | inf| 28.61 |306.69 | 10.72 |
|
||||
|
||||
***compare-json-results.py – Command-Line Parameters***
|
||||
|
||||
compare-json-results.py provides configurable parameters to compare one or more benchmark_results.json files and generate summary tables and plots.
|
||||
In most cases, users only need to specify --file to parse the desired benchmark results.
|
||||
|
||||
| Parameter | Type | Default Value | Description |
|
||||
| ---------------------- | ------------------ | ----------------------- | ----------------------------------------------------------------------------------------------------- |
|
||||
| `--file` | `str` (appendable) | *None* | Input JSON result file(s). Can be specified multiple times to compare multiple benchmark outputs. |
|
||||
| `--debug` | `bool` | `False` | Enables debug mode. When set, prints all available information to aid troubleshooting and validation. |
|
||||
| `--plot` / `--no-plot` | `bool` | `True` | Controls whether performance plots are generated. Use `--no-plot` to disable graph generation. |
|
||||
| `--xaxis` | `str` | `# of max concurrency.` | Column name used as the X-axis in comparison plots (for example, concurrency or batch size). |
|
||||
| `--latency` | `str` | `p99` | Latency aggregation method used for TTFT/TPOT. Supported values: `median` or `p99`. |
|
||||
| `--ttft-max-ms` | `float` | `3000.0` | Reference upper bound (milliseconds) for TTFT plots, typically used to visualize SLA thresholds. |
|
||||
| `--tpot-max-ms` | `float` | `100.0` | Reference upper bound (milliseconds) for TPOT plots, typically used to visualize SLA thresholds. |
|
||||
|
||||
***Valid Max Concurrency Summary***
|
||||
|
||||
Based on the configured TTFT and TPOT SLA thresholds, compare-json-results.py computes the maximum valid concurrency for each benchmark result.
|
||||
The “Max # of max concurrency. (Both)” column represents the highest concurrency level that satisfies both TTFT and TPOT constraints simultaneously.
|
||||
This value is typically used in capacity planning and sizing guides.
|
||||
|
||||
| # | Configuration | Max # of max concurrency. (TTFT ≤ 10000 ms) | Max # of max concurrency. (TPOT ≤ 100 ms) | Max # of max concurrency. (Both) | Output Tput @ Both (tok/s) | TTFT @ Both (ms) | TPOT @ Both (ms) |
|
||||
| - | -------------- | ------------------------------------------- | ----------------------------------------- | -------------------------------- | -------------------------- | ---------------- | ---------------- |
|
||||
| 0 | results-a | 128.00 | 12.00 | 12.00 | 127.76 | 3000.82 | 93.24 |
|
||||
| 1 | results-b | 128.00 | 32.00 | 32.00 | 371.42 | 2261.53 | 81.74 |
|
||||
|
||||
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md).
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user