diff --git a/.buildkite/performance-benchmarks/scripts/compare-json-results.py b/.buildkite/performance-benchmarks/scripts/compare-json-results.py
index 6cb05879cdc17..ece004107b669 100644
--- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py
@@ -9,7 +9,7 @@
 import json
 import os
 from dataclasses import dataclass
 from importlib import util
-from typing import List, Tuple
+from typing import Dict, List, Tuple
 
 import pandas as pd
@@ -51,11 +51,11 @@ def compare_data_columns(
     print("\ncompare_data_column:", data_column)
 
     frames = []
-    raw_data_cols = []
+    raw_data_cols: List[str] = []
     compare_frames = []
 
     # 1) choose a canonical key list from info_cols that exists in ALL files
-    cols_per_file = []
+    cols_per_file: List[set] = []
     for f in files:
         try:
             df_tmp = pd.read_json(f, orient="records")
@@ -143,10 +143,7 @@ def compare_data_columns(
         ratio.name = f"Ratio 1 vs {len(compare_frames)}"
         frames.append(ratio)
 
-    concat_df = pd.concat(frames, axis=1)
-
-    # NOTE: meta already contains key columns as normal columns, so we can drop the index cleanly.
-    concat_df = concat_df.reset_index(drop=True)
+    concat_df = pd.concat(frames, axis=1).reset_index(drop=True)
 
     # Ensure key/info columns appear first (in your info_cols order)
     front = [c for c in info_cols if c in concat_df.columns]
@@ -158,7 +155,7 @@
 
 
 # -----------------------------
-# Split helper (restored)
+# Split helper
 # -----------------------------
 def split_json_by_tp_pp(
     input_file: str = "benchmark_results.json", output_root: str = "."
@@ -231,6 +228,7 @@ def _find_concurrency_col(df: pd.DataFrame) -> str:
     ]:
         if c in df.columns:
             return c
+    # Fallback: guess an integer-like column (harmless if unused)
    for c in df.columns:
         if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
             return c
@@ -240,9 +238,16 @@ def _find_concurrency_col(df: pd.DataFrame) -> str:
 def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.style.Styler":
     """Highlight numeric per-configuration columns with value <= threshold."""
     conc_col = _find_concurrency_col(df)
-    key_cols = [c for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col] if c in df.columns]
-    conf_cols = [c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")]
+    key_cols = [
+        c
+        for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col]
+        if c in df.columns
+    ]
+    conf_cols = [
+        c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")
+    ]
     conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
+
     return df.style.map(
         lambda v: "background-color:#e6ffe6;font-weight:bold;"
         if pd.notna(v) and v <= threshold
@@ -257,17 +262,20 @@ def highlight_ratio_columns(styler: "pd.io.formats.style.Styler"):
     if not ratio_cols:
         return styler
 
-    # highlight cells
+    # Highlight entire column (cells)
     styler = styler.apply(
         lambda _: ["background-color: #fff3b0"] * len(styler.data),
         subset=ratio_cols,
         axis=0,
     )
 
-    # highlight headers
+    # Highlight column headers
     styler = styler.set_table_styles(
         [
-            {"selector": f"th.col_heading.level0.col{i}", "props": [("background-color", "#fff3b0")]}
+            {
+                "selector": f"th.col_heading.level0.col{i}",
+                "props": [("background-color", "#fff3b0")],
+            }
             for i, col in enumerate(styler.data.columns)
             if col in ratio_cols
         ],
@@ -296,14 +304,17 @@ def _add_limit_line(fig, y_value: float, label: str):
             x=[None],
             y=[None],
             mode="lines",
-            line=dict(dash="dash", color="red" if "ttft" in label.lower() else "blue"),
+            line=dict(
+                dash="dash",
+                color="red" if "ttft" in label.lower() else "blue",
+            ),
             name=label,
         )
     )
 
 
 # -----------------------------
-# Refactored "main"
+# Refactored main + group-first report
 # -----------------------------
 @dataclass(frozen=True)
 class MetricPlan:
@@ -343,11 +354,14 @@ def build_parser() -> argparse.ArgumentParser:
 def choose_metrics(latency: str) -> MetricPlan:
     latency = (latency or "").lower()
     drop_column = "P99"
+
     if "median" in latency:
         return MetricPlan(
             data_cols=["Output Tput (tok/s)", "Median TTFT (ms)", "Median"],
             drop_column=drop_column,
         )
+
+    # default: p99
     return MetricPlan(
         data_cols=["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"],
         drop_column=drop_column,
@@ -357,11 +371,13 @@ def choose_metrics(latency: str) -> MetricPlan:
 def prepare_input_files(args, info_cols: List[str]) -> Tuple[List[str], List[str]]:
     if not args.file:
         raise ValueError("No input files provided. Use -f/--file.")
+
     if len(args.file) == 1:
         files = split_json_by_tp_pp(args.file[0], output_root="splits")
         info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
     else:
         files = args.file
+
     return files, info_cols
 
 
@@ -371,6 +387,7 @@ def get_y_axis_col(info_cols: List[str], xaxis: str) -> str:
 
 
 def get_group_cols(output_df: pd.DataFrame, info_cols: List[str]) -> List[str]:
+    # Your current grouping rule: first 4 info columns
     filtered_info_cols = info_cols[:4]
     group_cols = [c for c in filtered_info_cols if c in output_df.columns]
     if not group_cols:
@@ -381,27 +398,38 @@ def get_group_cols(output_df: pd.DataFrame, info_cols: List[str]) -> List[str]:
     return group_cols
 
 
-def group_suffix(group_cols: List[str], name) -> str:
-    name_vals = name if isinstance(name, tuple) else (name,)
-    return " , ".join(f"{col} : [ {val} ] " for col, val in zip(group_cols, name_vals))
+def normalize_group_key(name):
+    """Pandas group key can be scalar (1 col) or tuple (N cols). Normalize to tuple."""
+    return name if isinstance(name, tuple) else (name,)
 
 
 def group_filename(name, prefix: str = "perf_comparison_") -> str:
-    name_vals = name if isinstance(name, tuple) else (name,)
+    name_vals = normalize_group_key(name)
     safe = ",".join(map(str, name_vals)).replace(",", "_").replace("/", "-")
     return f"{prefix}{safe}.html"
 
 
-def render_metric_table_html(display_group: pd.DataFrame, metric_label: str, suffix: str, args) -> str:
+def build_group_suffix(group_cols: List[str], name) -> str:
+    name_vals = normalize_group_key(name)
+    return " , ".join(
+        f"{col} : [ {val} ] " for col, val in zip(group_cols, name_vals)
+    )
+
+
+def render_metric_table_html(
+    display_group: pd.DataFrame,
+    metric_label: str,
+    group_suffix: str,
+    args,
+) -> str:
     title = (
         f'<h2>'
         f'{_html.escape(metric_label)}'
-        f' — {_html.escape(suffix)}'
+        f' — {_html.escape(group_suffix)}'
         f"</h2>\n"
     )
 
     metric_name = metric_label.lower()
-
     if "ttft" in metric_name:
         styler = _highlight_threshold(display_group, args.ttft_max_ms)
     elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
@@ -409,7 +437,6 @@ def render_metric_table_html(display_group: pd.DataFrame, metric_label: str, suf
     else:
         styler = display_group.style
 
-    # format numbers + highlight ratios
     styler = styler.format(
         {c: "{:.2f}" for c in display_group.select_dtypes("number").columns},
         na_rep="—",
@@ -460,41 +487,106 @@ def maybe_write_plot(
             sub_fh.write(html)
 
 
-def write_report(files: List[str], info_cols: List[str], plan: MetricPlan, args):
+def build_group_keys(df: pd.DataFrame, group_cols: List[str], sort_cols: List[str] | None = None):
+    """Return a stable list of group keys from df."""
+    if sort_cols:
+        df = df.sort_values(by=sort_cols)
+    gb = df.groupby(group_cols, dropna=False)
+    return [k for k, _ in gb]
+
+
+def write_report_group_first(files: List[str], info_cols: List[str], plan: MetricPlan, args):
+    """
+    Group-first layout:
+      For each group, emit tok/s then TTFT then TPOT (or Median variants) together.
+    """
     name_column = "Test name"
     y_axis_col = get_y_axis_col(info_cols, args.xaxis)
 
     print("comparing : " + ", ".join(files))
 
+    # Precompute per-metric dataframes once
+    metric_cache: Dict[str, Tuple[pd.DataFrame, List[str]]] = {}
+    group_cols_canonical: List[str] | None = None
+
+    for metric_label in plan.data_cols:
+        output_df, raw_data_cols = compare_data_columns(
+            files,
+            name_column,
+            metric_label,
+            info_cols,
+            plan.drop_column,
+            debug=args.debug,
+        )
+
+        # plot expects y-axis column at the front
+        raw_data_cols = list(raw_data_cols)
+        raw_data_cols.insert(0, y_axis_col)
+
+        group_cols = get_group_cols(output_df, info_cols)
+        if group_cols_canonical is None:
+            group_cols_canonical = group_cols
+        else:
+            # keep intersection (stable order)
+            group_cols_canonical = [c for c in group_cols_canonical if c in group_cols]
+
+        metric_cache[metric_label] = (output_df.sort_values(by=args.xaxis), raw_data_cols)
+
+    if not group_cols_canonical:
+        raise ValueError("No canonical group columns found across metrics.")
+
+    # Canonical group keys from first metric (typically tok/s)
+    first_metric = plan.data_cols[0]
+    first_df_sorted, _ = metric_cache[first_metric]
+    group_keys = build_group_keys(first_df_sorted, group_cols_canonical, sort_cols=[args.xaxis])
+
+    # Pre-build groupby objects per metric
+    metric_groupbys = {
+        metric_label: df.groupby(group_cols_canonical, dropna=False)
+        for metric_label, (df, _) in metric_cache.items()
+    }
+
     with open("perf_comparison.html", "w") as main_fh:
-        for metric_label in plan.data_cols:
-            output_df, raw_data_cols = compare_data_columns(
-                files,
-                name_column,
-                metric_label,
-                info_cols,
-                plan.drop_column,
-                debug=args.debug,
+        for gkey in group_keys:
+            gkey_tuple = normalize_group_key(gkey)
+            suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
+            sub_path = group_filename(gkey_tuple)
+
+            # Optional group header (separates each group visually)
+            group_header = (
+                f'<h3>'
+                f'{_html.escape(suffix)}'
+                f"</h3>\n"
             )
-            raw_data_cols = list(raw_data_cols)
-            raw_data_cols.insert(0, y_axis_col)
+            main_fh.write(group_header)
+            with open(sub_path, "w") as sub_fh:
+                sub_fh.write(group_header)
 
-            group_cols = get_group_cols(output_df, info_cols)
+                for metric_label in plan.data_cols:
+                    gb = metric_groupbys[metric_label]
+                    df_sorted, raw_data_cols = metric_cache[metric_label]
 
-            output_df_sorted = output_df.sort_values(by=args.xaxis)
-            for name, group_df in output_df_sorted.groupby(group_cols, dropna=False):
-                suffix = group_suffix(group_cols, name)
-                sub_path = group_filename(name)
+                    try:
+                        group_df = gb.get_group(gkey)
+                    except KeyError:
+                        missing = (
+                            f'<p>'
+                            f'{_html.escape(metric_label)} — missing for this group'
+                            f"</p>\n"
+                        )
+                        main_fh.write(missing)
+                        sub_fh.write(missing)
+                        continue
 
-                # drop group columns from display only
-                display_group = group_df.drop(columns=group_cols, errors="ignore")
+                    # Display-only: drop group columns
+                    display_group = group_df.drop(columns=group_cols_canonical, errors="ignore")
 
-                html = render_metric_table_html(display_group, metric_label, suffix, args)
+                    html = render_metric_table_html(display_group, metric_label, suffix, args)
 
-                main_fh.write(html)
-                with open(sub_path, "a+") as sub_fh:
+                    main_fh.write(html)
                     sub_fh.write(html)
+
 
                     maybe_write_plot(
                         main_fh,
                         sub_fh,
@@ -513,7 +605,9 @@ def main():
     plan = choose_metrics(args.latency)
 
     files, info_cols = prepare_input_files(args, info_cols)
-    write_report(files, info_cols, plan, args)
+
+    # Group-first report layout
+    write_report_group_first(files, info_cols, plan, args)
 
 
 if __name__ == "__main__":
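For review context, the core of the new group-first layout is: precompute one DataFrame per metric, derive canonical group keys from the first metric, then fetch each metric's slice per key with `get_group`, falling back on `KeyError` when a metric lacks that group. A minimal standalone sketch of that pattern (the toy frames, column names, and values below are illustrative, not the script's real data):

```python
import pandas as pd

# One frame per metric, as metric_cache holds in the patch.
metric_frames = {
    "Output Tput (tok/s)": pd.DataFrame({
        "Model": ["m1", "m1", "m2"],
        "Dataset Name": ["ds", "ds", "ds"],
        "value": [100.0, 180.0, 90.0],
    }),
    "P99 TTFT (ms)": pd.DataFrame({
        "Model": ["m1", "m1"],  # "m2" intentionally absent from this metric
        "Dataset Name": ["ds", "ds"],
        "value": [80.0, 95.0],
    }),
}

group_cols = ["Model", "Dataset Name"]
groupbys = {m: df.groupby(group_cols, dropna=False) for m, df in metric_frames.items()}

# Canonical keys come from the first metric; later metrics may lack a group.
first = next(iter(metric_frames))
group_keys = [k for k, _ in groupbys[first]]

for gkey in group_keys:
    for metric, gb in groupbys.items():
        try:
            group_df = gb.get_group(gkey)  # gkey is a tuple for multi-column groupby
        except KeyError:
            print(f"{metric} — missing for group {gkey}")
            continue
        print(metric, gkey, "->", len(group_df), "rows")
```

Because grouping uses two columns, every key is a tuple, which is exactly the case `normalize_group_key` papers over when only one group column survives.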
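The highlighting helpers lean on the pandas `Styler` API: `Styler.map` for per-cell threshold CSS, `Styler.apply` plus `set_table_styles` for tinting the Ratio columns and their headers. A compact sketch under stated assumptions: `TTFT_MAX_MS` stands in for `args.ttft_max_ms`, and `overwrite=False` is a guess at the argument list the hunk cuts off after `],`:

```python
import pandas as pd

df = pd.DataFrame({
    "Model": ["m1", "m1", "m2"],
    "P99 TTFT (ms)": [85.0, 210.0, 95.0],
    "Ratio 1 vs 2": [1.02, 0.87, 1.10],
})
TTFT_MAX_MS = 100.0  # assumed stand-in for args.ttft_max_ms

# Threshold highlight: per-cell CSS via Styler.map, as in _highlight_threshold.
styler = df.style.map(
    lambda v: "background-color:#e6ffe6;font-weight:bold;"
    if pd.notna(v) and v <= TTFT_MAX_MS
    else "",
    subset=["P99 TTFT (ms)"],
)

# Ratio tint: whole-column CSS via Styler.apply, as in highlight_ratio_columns.
ratio_cols = [c for c in df.columns if str(c).startswith("Ratio")]
styler = styler.apply(
    lambda _: ["background-color: #fff3b0"] * len(df),
    subset=ratio_cols,
    axis=0,
)

# Header tint for the same columns; overwrite=False keeps the earlier styles.
styler = styler.set_table_styles(
    [
        {
            "selector": f"th.col_heading.level0.col{i}",
            "props": [("background-color", "#fff3b0")],
        }
        for i, col in enumerate(df.columns)
        if col in ratio_cols
    ],
    overwrite=False,
)

html = styler.format(
    {c: "{:.2f}" for c in df.select_dtypes("number").columns}, na_rep="—"
).to_html()
print(html[:200])
```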
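On the simplified `pd.concat(frames, axis=1).reset_index(drop=True)`: column-wise concat aligns frames on their index, and since the removed NOTE says the key columns already exist as normal columns, the index carries no information and can be dropped in the same expression. A tiny sketch, with made-up series names:

```python
import pandas as pd

# Per-file metric columns share a positional index (0..n-1 in file order).
base = pd.Series([100.0, 180.0], name="file1 Output Tput (tok/s)")
cand = pd.Series([110.0, 171.0], name="file2 Output Tput (tok/s)")
ratio = (cand / base).rename("Ratio 1 vs 2")

# Aligns on the shared index, then drops it, leaving a clean 0..n-1 table;
# behaviorally equivalent to the two-step version the diff removes.
concat_df = pd.concat([base, cand, ratio], axis=1).reset_index(drop=True)
print(concat_df)
```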