/tp{TP}_pp{PP}/benchmark_results.json
- Returns: list of file paths written.
- """
- # Load JSON data into DataFrame
with open(input_file, encoding="utf-8") as f:
data = json.load(f)
- # If the JSON is a dict with a list under common keys, use that list
if isinstance(data, dict):
for key in ("results", "serving_results", "benchmarks", "data"):
if isinstance(data.get(key), list):
@@ -156,7 +161,6 @@ def split_json_by_tp_pp(
df = pd.DataFrame(data)
- # Keep only "serving" tests
name_col = next(
(c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
)
@@ -165,7 +169,6 @@ def split_json_by_tp_pp(
df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
].copy()
- # Handle alias column names
rename_map = {
"tp_size": "TP Size",
"tensor_parallel_size": "TP Size",
@@ -176,21 +179,14 @@ def split_json_by_tp_pp(
columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True
)
- # Ensure TP/PP columns exist (default to 1 if missing)
if "TP Size" not in df.columns:
df["TP Size"] = 1
if "PP Size" not in df.columns:
df["PP Size"] = 1
- # make sure TP/PP are numeric ints with no NaN
- df["TP Size"] = (
- pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int)
- )
- df["PP Size"] = (
- pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int)
- )
+ df["TP Size"] = pd.to_numeric(df["TP Size"], errors="coerce").fillna(1).astype(int)
+ df["PP Size"] = pd.to_numeric(df["PP Size"], errors="coerce").fillna(1).astype(int)
- # Split into separate folders
saved_paths: list[str] = []
for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False):
folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
@@ -203,32 +199,9 @@ def split_json_by_tp_pp(
return saved_paths
-def _add_limit_line(fig, y_value, label):
- # Visible dashed line + annotation
- fig.add_hline(
- y=y_value,
- line_dash="dash",
- line_color="red" if "ttft" in label.lower() else "blue",
- annotation_text=f"{label}: {y_value} ms",
- annotation_position="top left",
- )
- # Optional: add a legend item (as a transparent helper trace)
- if plot and plotly_found:
- import plotly.graph_objects as go
-
- fig.add_trace(
- go.Scatter(
- x=[None],
- y=[None],
- mode="lines",
- line=dict(
- dash="dash", color="red" if "ttft" in label.lower() else "blue"
- ),
- name=f"{label}",
- )
- )
-
-
+# -----------------------------
+# Styling helpers
+# -----------------------------
def _find_concurrency_col(df: pd.DataFrame) -> str:
for c in [
"# of max concurrency.",
@@ -239,7 +212,6 @@ def _find_concurrency_col(df: pd.DataFrame) -> str:
]:
if c in df.columns:
return c
- # Fallback: guess an integer-like column (harmless if unused)
for c in df.columns:
if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
return c
@@ -248,8 +220,7 @@ def _find_concurrency_col(df: pd.DataFrame) -> str:
def _highlight_threshold(
df: pd.DataFrame, threshold: float
-) -> "pd.io.formats.style.Styler":
- """Highlight numeric per-configuration columns with value <= threshold."""
+) -> pd.io.formats.style.Styler:
conc_col = _find_concurrency_col(df)
key_cols = [
c
@@ -260,6 +231,7 @@ def _highlight_threshold(
c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")
]
conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
+
return df.style.map(
lambda v: "background-color:#e6ffe6;font-weight:bold;"
if pd.notna(v) and v <= threshold
@@ -268,7 +240,264 @@ def _highlight_threshold(
)
-if __name__ == "__main__":
+def highlight_ratio_columns(styler: pd.io.formats.style.Styler):
+ ratio_cols = [c for c in styler.data.columns if "ratio" in str(c).lower()]
+ if not ratio_cols:
+ return styler
+
+ styler = styler.apply(
+ lambda _: ["background-color: #fff3b0"] * len(styler.data),
+ subset=ratio_cols,
+ axis=0,
+ )
+
+ styler = styler.set_table_styles(
+ [
+ {
+ "selector": f"th.col_heading.level0.col{i}",
+ "props": [("background-color", "#fff3b0")],
+ }
+ for i, col in enumerate(styler.data.columns)
+ if col in ratio_cols
+ ],
+ overwrite=False,
+ )
+ return styler
+
+
+def _apply_two_decimals(
+ styler: pd.io.formats.style.Styler,
+) -> pd.io.formats.style.Styler:
+ df = styler.data
+ num_cols = df.select_dtypes("number").columns
+ if len(num_cols) == 0:
+ return styler
+ return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")
+
+
+# -----------------------------
+# Valid max concurrency summary helpers
+# -----------------------------
+def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:
+ key_cols = [
+ c
+ for c in ["Model", "Dataset Name", "Input Len", "Output Len"]
+ if c in df.columns
+ ]
+ exclude = set(key_cols + [conc_col, "qps", "QPS"])
+
+ cols: list[str] = []
+ for c in df.columns:
+ if c in exclude:
+ continue
+ lc = str(c).lower()
+ if lc.startswith("ratio"):
+ continue
+ if lc.endswith("_name") or lc == "test name" or lc == "test_name":
+ continue
+ if pd.api.types.is_numeric_dtype(df[c]):
+ cols.append(c)
+ return cols
+
+
+def _max_concurrency_ok(
+ df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float
+):
+ if df is None or conc_col not in df.columns or cfg_col not in df.columns:
+ return pd.NA
+
+ d = df[[conc_col, cfg_col]].copy()
+ d[conc_col] = pd.to_numeric(d[conc_col], errors="coerce")
+ d[cfg_col] = pd.to_numeric(d[cfg_col], errors="coerce")
+ d = d.dropna(subset=[conc_col, cfg_col])
+
+ if d.empty:
+ return pd.NA
+
+ ok = d[d[cfg_col] <= threshold]
+ if ok.empty:
+ return pd.NA
+
+ return ok[conc_col].max()
+
+
+def _value_at_concurrency(df: pd.DataFrame, conc_col: str, cfg_col: str, conc_value):
+ if (
+ df is None
+ or conc_col not in df.columns
+ or cfg_col not in df.columns
+ or pd.isna(conc_value)
+ ):
+ return pd.NA
+
+ d = df[[conc_col, cfg_col]].copy()
+ d[conc_col] = pd.to_numeric(d[conc_col], errors="coerce")
+ d[cfg_col] = pd.to_numeric(d[cfg_col], errors="coerce")
+
+ conc_value = pd.to_numeric(conc_value, errors="coerce")
+ if pd.isna(conc_value):
+ return pd.NA
+
+ hit = d[d[conc_col] == conc_value]
+ if hit.empty:
+ return pd.NA
+ return hit[cfg_col].iloc[0]
+
+
+def build_valid_max_concurrency_summary_html(
+ tput_group_df: pd.DataFrame | None,
+ ttft_group_df: pd.DataFrame | None,
+ tpot_group_df: pd.DataFrame | None,
+ conc_col: str,
+ args,
+) -> str:
+ if ttft_group_df is None and tpot_group_df is None:
+ return ""
+
+ ttft_cols = (
+ _config_value_columns(ttft_group_df, conc_col)
+ if ttft_group_df is not None
+ else []
+ )
+ tpot_cols = (
+ _config_value_columns(tpot_group_df, conc_col)
+ if tpot_group_df is not None
+ else []
+ )
+ tput_cols = (
+ _config_value_columns(tput_group_df, conc_col)
+ if tput_group_df is not None
+ else []
+ )
+
+ if ttft_group_df is not None and tpot_group_df is not None:
+ cfg_cols = [c for c in ttft_cols if c in tpot_cols]
+ if tput_group_df is not None:
+ cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols
+ else:
+ cfg_cols = ttft_cols or tpot_cols
+
+ if not cfg_cols:
+ cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
+
+ rows = []
+ for cfg in cfg_cols:
+ ttft_max = (
+ _max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
+ if ttft_group_df is not None
+ else pd.NA
+ )
+ tpot_max = (
+ _max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
+ if tpot_group_df is not None
+ else pd.NA
+ )
+ both = (
+ pd.NA
+ if (pd.isna(ttft_max) or pd.isna(tpot_max))
+ else min(ttft_max, tpot_max)
+ )
+
+ tput_at_both = (
+ _value_at_concurrency(tput_group_df, conc_col, cfg, both)
+ if tput_group_df is not None
+ else pd.NA
+ )
+ ttft_at_both = (
+ _value_at_concurrency(ttft_group_df, conc_col, cfg, both)
+ if ttft_group_df is not None
+ else pd.NA
+ )
+ tpot_at_both = (
+ _value_at_concurrency(tpot_group_df, conc_col, cfg, both)
+ if tpot_group_df is not None
+ else pd.NA
+ )
+
+ rows.append(
+ {
+ "Configuration": cfg,
+ f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
+ f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
+ f"Max {conc_col} (Both)": both,
+ "Output Tput @ Both (tok/s)": tput_at_both,
+ "TTFT @ Both (ms)": ttft_at_both,
+ "TPOT @ Both (ms)": tpot_at_both,
+ }
+ )
+
+ summary_df = pd.DataFrame(rows)
+
+ # --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
+ for c in summary_df.columns:
+ if c == "Configuration":
+ continue
+ summary_df[c] = pd.to_numeric(summary_df[c], errors="coerce")
+
+ both_col = f"Max {conc_col} (Both)"
+
+ # --- Strict 2-decimal formatting for ALL non-Configuration columns ---
+ formatters = {}
+ for c in summary_df.columns:
+ if c == "Configuration":
+ continue
+        # NOTE: the lambda ignores the column, so every column shares the same
+        # formatter; no per-column default-argument binding is needed here.
+ formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"
+
+ styler = summary_df.style.format(formatters)
+
+ def _green(v):
+ return "background-color:#e6ffe6;font-weight:bold;" if pd.notna(v) else ""
+
+ if both_col in summary_df.columns:
+ styler = styler.map(_green, subset=[both_col])
+
+ title = (
+ ''
+ "Valid Max Concurrency Summary"
+ "
\n"
+ )
+ return title + styler.to_html(table_attributes='border="1" class="dataframe"')
+
+
+# -----------------------------
+# Plot helper
+# -----------------------------
+def _add_limit_line(fig, y_value: float, label: str):
+ fig.add_hline(
+ y=y_value,
+ line_dash="dash",
+ line_color="red" if "ttft" in label.lower() else "blue",
+ annotation_text=f"{label}: {y_value} ms",
+ annotation_position="top left",
+ )
+ if plotly_found:
+ import plotly.graph_objects as go
+
+ fig.add_trace(
+ go.Scatter(
+ x=[None],
+ y=[None],
+ mode="lines",
+ line=dict(
+ dash="dash",
+ color="red" if "ttft" in label.lower() else "blue",
+ ),
+ name=label,
+ )
+ )
+
+
+# -----------------------------
+# Refactored main + group-first report
+# -----------------------------
+@dataclass(frozen=True)
+class MetricPlan:
+ data_cols: list[str]
+ drop_column: str
+
+
+def build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser()
parser.add_argument(
"-f", "--file", action="append", type=str, help="input file name"
@@ -308,149 +537,289 @@ if __name__ == "__main__":
default=100.0,
help="Reference limit for TPOT plots (ms)",
)
+ return parser
- args = parser.parse_args()
+def choose_metrics(latency: str) -> MetricPlan:
+ latency = (latency or "").lower()
drop_column = "P99"
- name_column = "Test name"
- info_cols = [
- "Model",
- "Dataset Name",
- "Input Len",
- "Output Len",
- "TP Size",
- "PP Size",
- "# of max concurrency.",
- "qps",
- ]
- if "median" in args.latency:
- data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
- html_msgs_for_data_cols = [
- "Compare Output Tokens /n",
- "Median TTFT /n",
- "Median TPOT /n",
- ]
- drop_column = "P99"
- elif "p99" in args.latency:
- data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"]
- html_msgs_for_data_cols = [
- "Compare Output Tokens /n",
- "P99 TTFT /n",
- "P99 TPOT /n",
- ]
+ if "median" in latency:
+ return MetricPlan(
+ data_cols=["Output Tput (tok/s)", "Median TTFT (ms)", "Median"],
+ drop_column=drop_column,
+ )
+
+ return MetricPlan(
+ data_cols=["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"],
+ drop_column=drop_column,
+ )
+
+
+def prepare_input_files(args, info_cols: list[str]) -> tuple[list[str], list[str]]:
+ if not args.file:
+ raise ValueError("No input files provided. Use -f/--file.")
if len(args.file) == 1:
files = split_json_by_tp_pp(args.file[0], output_root="splits")
info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
else:
files = args.file
+
+ return files, info_cols
+
+
+def get_y_axis_col(info_cols: list[str], xaxis: str) -> str:
+ y_axis_index = info_cols.index(xaxis) if xaxis in info_cols else 6
+ return info_cols[y_axis_index]
+
+
+def get_group_cols(output_df: pd.DataFrame, info_cols: list[str]) -> list[str]:
+ filtered_info_cols = info_cols[:4]
+ group_cols = [c for c in filtered_info_cols if c in output_df.columns]
+ if not group_cols:
+ raise ValueError(
+ f"No valid group-by columns. Expected subset: {filtered_info_cols}, "
+ f"but DataFrame has: {list(output_df.columns)}"
+ )
+ return group_cols
+
+
+def normalize_group_key(name):
+ return name if isinstance(name, tuple) else (name,)
+
+
+def group_filename(name, prefix: str = "perf_comparison_") -> str:
+ name_vals = normalize_group_key(name)
+ safe = ",".join(map(str, name_vals)).replace(",", "_").replace("/", "-")
+ return f"{prefix}{safe}.html"
+
+
+def build_group_suffix(group_cols: list[str], name) -> str:
+ name_vals = normalize_group_key(name)
+ return " , ".join(f"{col} : [ {val} ] " for col, val in zip(group_cols, name_vals))
+
+
+def render_metric_table_html(
+ display_group: pd.DataFrame,
+ metric_label: str,
+ group_suffix: str,
+ args,
+) -> str:
+ title = (
+ f''
+ f"{_html.escape(metric_label)}"
+ f" — {_html.escape(group_suffix)}"
+ f"
\n"
+ )
+
+ metric_name = metric_label.lower()
+ if "ttft" in metric_name:
+ styler = _highlight_threshold(display_group, args.ttft_max_ms)
+ elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
+ styler = _highlight_threshold(display_group, args.tpot_max_ms)
+ else:
+ styler = display_group.style
+
+ styler = _apply_two_decimals(styler)
+ styler = highlight_ratio_columns(styler)
+
+ return title + styler.to_html(table_attributes='border="1" class="dataframe"')
+
+
+def maybe_write_plot(
+ main_fh,
+ sub_fh,
+ group_df: pd.DataFrame,
+ raw_data_cols: list[str],
+ metric_label: str,
+ y_axis_col: str,
+ args,
+):
+ if not (args.plot and plotly_found):
+ return
+
+ import plotly.express as px
+
+ df = group_df[raw_data_cols].sort_values(by=y_axis_col)
+ df_melted = df.melt(
+ id_vars=y_axis_col,
+ var_name="Configuration",
+ value_name=metric_label,
+ )
+
+ fig = px.line(
+ df_melted,
+ x=y_axis_col,
+ y=metric_label,
+ color="Configuration",
+ title=f"{metric_label} vs {y_axis_col}",
+ markers=True,
+ )
+
+ # Ensure plot hover + y tick labels are also 2 decimals.
+ fig.update_traces(hovertemplate="%{y:.2f}")
+ fig.update_yaxes(tickformat=".2f")
+
+ metric_name = metric_label.lower()
+ if "ttft" in metric_name:
+ _add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
+ elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
+ _add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
+
+ html = fig.to_html(full_html=True, include_plotlyjs="cdn")
+ main_fh.write(html)
+ sub_fh.write(html)
+
+
+def build_group_keys(
+ df: pd.DataFrame, group_cols: list[str], sort_cols: list[str] | None = None
+):
+ if sort_cols:
+ df = df.sort_values(by=sort_cols)
+ gb = df.groupby(group_cols, dropna=False)
+ return [k for k, _ in gb]
+
+
+def write_report_group_first(
+ files: list[str], info_cols: list[str], plan: MetricPlan, args
+):
+ name_column = "Test name"
+ y_axis_col = get_y_axis_col(info_cols, args.xaxis)
+
print("comparing : " + ", ".join(files))
- debug = args.debug
- plot = args.plot
- # For Plot feature, assign y axis from one of info_cols
- y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6
- with open("perf_comparison.html", "w") as text_file:
- for i in range(len(data_cols_to_compare)):
- output_df, raw_data_cols = compare_data_columns(
- files,
- name_column,
- data_cols_to_compare[i],
- info_cols,
- drop_column,
- debug=debug,
+
+ metric_cache: dict[str, tuple[pd.DataFrame, list[str]]] = {}
+ group_cols_canonical: list[str] | None = None
+
+ for metric_label in plan.data_cols:
+ output_df, raw_data_cols = compare_data_columns(
+ files,
+ name_column,
+ metric_label,
+ info_cols,
+ plan.drop_column,
+ debug=args.debug,
+ )
+
+ raw_data_cols = list(raw_data_cols)
+ raw_data_cols.insert(0, y_axis_col)
+
+ group_cols = get_group_cols(output_df, info_cols)
+ if group_cols_canonical is None:
+ group_cols_canonical = group_cols
+ else:
+ group_cols_canonical = [c for c in group_cols_canonical if c in group_cols]
+
+ metric_cache[metric_label] = (
+ output_df.sort_values(by=args.xaxis),
+ raw_data_cols,
+ )
+
+ if not group_cols_canonical:
+ raise ValueError("No canonical group columns found across metrics.")
+
+ first_metric = plan.data_cols[0]
+ first_df_sorted, _ = metric_cache[first_metric]
+ group_keys = build_group_keys(
+ first_df_sorted, group_cols_canonical, sort_cols=[args.xaxis]
+ )
+
+ metric_groupbys = {
+ metric_label: df.groupby(group_cols_canonical, dropna=False)
+ for metric_label, (df, _) in metric_cache.items()
+ }
+
+ with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
+ main_fh.write('\n')
+ for gkey in group_keys:
+ gkey_tuple = normalize_group_key(gkey)
+ suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
+ sub_path = group_filename(gkey_tuple)
+ group_header = (
+ ''
+ f"{_html.escape(suffix)}"
+ "
\n"
)
- # For Plot feature, insert y axis from one of info_cols
- raw_data_cols.insert(0, info_cols[y_axis_index])
+ main_fh.write(group_header)
+ with open(sub_path, "w", encoding="utf-8") as sub_fh:
+ sub_fh.write('\n')
+ sub_fh.write(group_header)
+ tput_group_df = None
+ ttft_group_df = None
+ tpot_group_df = None
+ conc_col = args.xaxis
- filtered_info_cols = info_cols[:-2]
- existing_group_cols = [
- c for c in filtered_info_cols if c in output_df.columns
- ]
- if not existing_group_cols:
- raise ValueError(
- f"No valid group-by columns "
- f"Expected subset: {filtered_info_cols}, "
- f"but DataFrame has: {list(output_df.columns)}"
+ for metric_label in plan.data_cols:
+ gb = metric_groupbys[metric_label]
+ df_sorted, raw_data_cols = metric_cache[metric_label]
+
+ try:
+ group_df = gb.get_group(gkey)
+ except KeyError:
+ missing = (
+ ''
+ f"{_html.escape(metric_label)} — missing for this group"
+ "
\n"
+ )
+
+ main_fh.write(missing)
+ sub_fh.write(missing)
+ continue
+
+ if conc_col not in group_df.columns:
+ conc_col = _find_concurrency_col(group_df)
+
+ mn = metric_label.lower().strip()
+ if "tok/s" in mn:
+ tput_group_df = group_df
+ elif "ttft" in mn:
+ ttft_group_df = group_df
+ elif mn in ("p99", "median") or "tpot" in mn:
+ tpot_group_df = group_df
+
+ display_group = group_df.drop(
+ columns=group_cols_canonical, errors="ignore"
+ )
+
+ html = render_metric_table_html(
+ display_group, metric_label, suffix, args
+ )
+ main_fh.write(html)
+ sub_fh.write(html)
+
+ maybe_write_plot(
+ main_fh,
+ sub_fh,
+ group_df=group_df,
+ raw_data_cols=raw_data_cols,
+ metric_label=metric_label,
+ y_axis_col=y_axis_col,
+ args=args,
+ )
+
+ summary_html = build_valid_max_concurrency_summary_html(
+ tput_group_df=tput_group_df,
+ ttft_group_df=ttft_group_df,
+ tpot_group_df=tpot_group_df,
+ conc_col=conc_col,
+ args=args,
)
- # output_df_sorted = output_df.sort_values(by=existing_group_cols)
- output_df_sorted = output_df.sort_values(by=args.xaxis)
- output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
- for name, group in output_groups:
- group_name = (
- ",".join(map(str, name)).replace(",", "_").replace("/", "-")
- )
- group_html_name = "perf_comparison_" + group_name + ".html"
+ if summary_html:
+ main_fh.write(summary_html)
+ sub_fh.write(summary_html)
- metric_name = str(data_cols_to_compare[i]).lower()
- if "tok/s" in metric_name:
- html = group.to_html()
- elif "ttft" in metric_name:
- styler = _highlight_threshold(group, args.ttft_max_ms).format(
- {c: "{:.2f}" for c in group.select_dtypes("number").columns},
- na_rep="—",
- )
- html = styler.to_html(
- table_attributes='border="1" class="dataframe"'
- )
- elif (
- "tpot" in metric_name
- or "median" in metric_name
- or "p99" in metric_name
- ):
- styler = _highlight_threshold(group, args.tpot_max_ms).format(
- {c: "{:.2f}" for c in group.select_dtypes("number").columns},
- na_rep="—",
- )
- html = styler.to_html(
- table_attributes='border="1" class="dataframe"'
- )
- text_file.write(html_msgs_for_data_cols[i])
- text_file.write(html)
- with open(group_html_name, "a+") as sub_text_file:
- sub_text_file.write(html_msgs_for_data_cols[i])
- sub_text_file.write(html)
+def main():
+ args = build_parser().parse_args()
+ info_cols = list(DEFAULT_INFO_COLS)
+ plan = choose_metrics(args.latency)
+ files, info_cols = prepare_input_files(args, info_cols)
+ write_report_group_first(files, info_cols, plan, args)
- if plot and plotly_found:
- import plotly.express as px
- df = group[raw_data_cols]
- df_sorted = df.sort_values(by=info_cols[y_axis_index])
- # Melt DataFrame for plotting
- df_melted = df_sorted.melt(
- id_vars=info_cols[y_axis_index],
- var_name="Configuration",
- value_name=data_cols_to_compare[i],
- )
- title = (
- data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
- )
- # Create Plotly line chart
- fig = px.line(
- df_melted,
- x=info_cols[y_axis_index],
- y=data_cols_to_compare[i],
- color="Configuration",
- title=title,
- markers=True,
- )
-
- # ---- Add threshold lines based on metric name ----
- if "ttft" in metric_name:
- _add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
- elif (
- "tpot" in metric_name
- or "median" in metric_name
- or "p99" in metric_name
- ):
- _add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
-
- # Export to HTML
- text_file.write(
- fig.to_html(full_html=True, include_plotlyjs="cdn")
- )
- sub_text_file.write(
- fig.to_html(full_html=True, include_plotlyjs="cdn")
- )
+if __name__ == "__main__":
+ main()
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
index 8f7200862d20c..25ed7415ec0e4 100644
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
@@ -19,10 +19,8 @@
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
- "enforce_eager": "",
"max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
+ "max_num_seqs": 256
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
@@ -151,6 +149,45 @@
"random-output-len": 128
}
},
+ {
+ "test_name": "serving_llama8B_int4_tp1_random_128_128",
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp2_random_128_128",
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp4_random_128_128",
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "tensor_parallel_size": 4
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
diff --git a/docs/benchmarking/dashboard.md b/docs/benchmarking/dashboard.md
index 4cbc1a6a0a4fb..701fb16ae2cf1 100644
--- a/docs/benchmarking/dashboard.md
+++ b/docs/benchmarking/dashboard.md
@@ -40,7 +40,58 @@ When run, benchmark script generates results under **benchmark/results** folder,
- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
-For more results visualization, check the [visualizing the results](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md#visualizing-the-results).
+### Visualization
+
+The `convert-results-json-to-markdown.py` script collects the benchmarking results into a markdown table.
+You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
+If you do not see the table, please wait until the benchmark finishes running.
+The JSON version of the table (together with the JSON version of the benchmark) will also be attached to the markdown file.
+The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
+
+#### Performance Results Comparison
+
+The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
+When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
+`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
+If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
+
+Here is an example using the script to compare results_a and results_b with max concurrency and qps for the same Model, Dataset name, and input/output length.
+`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
+
+***Output Tput (tok/s) — Model : [ meta-llama/Llama-3.1-8B-Instruct ] , Dataset Name : [ random ] , Input Len : [ 2048.0 ] , Output Len : [ 2048.0 ]***
+
+| | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
+|----|------|-----|-----------|----------|----------|
+| 0 | 12 | inf | 24.98 | 186.03 | 7.45 |
+| 1 | 16 | inf| 25.49 | 246.92 | 9.69 |
+| 2 | 24 | inf| 27.74 | 293.34 | 10.57 |
+| 3 | 32 | inf| 28.61 |306.69 | 10.72 |
+
+***compare-json-results.py – Command-Line Parameters***
+
+compare-json-results.py provides configurable parameters to compare one or more benchmark_results.json files and generate summary tables and plots.
+In most cases, users only need to specify --file to parse the desired benchmark results.
+
+| Parameter | Type | Default Value | Description |
+| ---------------------- | ------------------ | ----------------------- | ----------------------------------------------------------------------------------------------------- |
+| `--file` | `str` (appendable) | *None* | Input JSON result file(s). Can be specified multiple times to compare multiple benchmark outputs. |
+| `--debug` | `bool` | `False` | Enables debug mode. When set, prints all available information to aid troubleshooting and validation. |
+| `--plot` / `--no-plot` | `bool` | `True` | Controls whether performance plots are generated. Use `--no-plot` to disable graph generation. |
+| `--xaxis` | `str` | `# of max concurrency.` | Column name used as the X-axis in comparison plots (for example, concurrency or batch size). |
+| `--latency` | `str` | `p99` | Latency aggregation method used for TTFT/TPOT. Supported values: `median` or `p99`. |
+| `--ttft-max-ms` | `float` | `3000.0` | Reference upper bound (milliseconds) for TTFT plots, typically used to visualize SLA thresholds. |
+| `--tpot-max-ms` | `float` | `100.0` | Reference upper bound (milliseconds) for TPOT plots, typically used to visualize SLA thresholds. |
+
+***Valid Max Concurrency Summary***
+
+Based on the configured TTFT and TPOT SLA thresholds, compare-json-results.py computes the maximum valid concurrency for each benchmark result.
+The “Max # of max concurrency. (Both)” column represents the highest concurrency level that satisfies both TTFT and TPOT constraints simultaneously.
+This value is typically used in capacity planning and sizing guides.
+
+| # | Configuration | Max # of max concurrency. (TTFT ≤ 10000 ms) | Max # of max concurrency. (TPOT ≤ 100 ms) | Max # of max concurrency. (Both) | Output Tput @ Both (tok/s) | TTFT @ Both (ms) | TPOT @ Both (ms) |
+| - | -------------- | ------------------------------------------- | ----------------------------------------- | -------------------------------- | -------------------------- | ---------------- | ---------------- |
+| 0 | results-a | 128.00 | 12.00 | 12.00 | 127.76 | 3000.82 | 93.24 |
+| 1 | results-b | 128.00 | 32.00 | 32.00 | 371.42 | 2261.53 | 81.74 |
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md).