diff --git a/.buildkite/performance-benchmarks/scripts/compare-json-results.py b/.buildkite/performance-benchmarks/scripts/compare-json-results.py index 6cb05879cdc17..ece004107b669 100644 --- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py @@ -9,7 +9,7 @@ import json import os from dataclasses import dataclass from importlib import util -from typing import List, Tuple +from typing import Dict, List, Tuple import pandas as pd @@ -51,11 +51,11 @@ def compare_data_columns( print("\ncompare_data_column:", data_column) frames = [] - raw_data_cols = [] + raw_data_cols: List[str] = [] compare_frames = [] # 1) choose a canonical key list from info_cols that exists in ALL files - cols_per_file = [] + cols_per_file: List[set] = [] for f in files: try: df_tmp = pd.read_json(f, orient="records") @@ -143,10 +143,7 @@ def compare_data_columns( ratio.name = f"Ratio 1 vs {len(compare_frames)}" frames.append(ratio) - concat_df = pd.concat(frames, axis=1) - - # NOTE: meta already contains key columns as normal columns, so we can drop the index cleanly. - concat_df = concat_df.reset_index(drop=True) + concat_df = pd.concat(frames, axis=1).reset_index(drop=True) # Ensure key/info columns appear first (in your info_cols order) front = [c for c in info_cols if c in concat_df.columns] @@ -158,7 +155,7 @@ def compare_data_columns( # ----------------------------- -# Split helper (restored) +# Split helper # ----------------------------- def split_json_by_tp_pp( input_file: str = "benchmark_results.json", output_root: str = "." @@ -231,6 +228,7 @@ def _find_concurrency_col(df: pd.DataFrame) -> str: ]: if c in df.columns: return c + # Fallback: guess an integer-like column (harmless if unused) for c in df.columns: if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1: return c @@ -240,9 +238,16 @@ def _find_concurrency_col(df: pd.DataFrame) -> str: def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.style.Styler": """Highlight numeric per-configuration columns with value <= threshold.""" conc_col = _find_concurrency_col(df) - key_cols = [c for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col] if c in df.columns] - conf_cols = [c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")] + key_cols = [ + c + for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col] + if c in df.columns + ] + conf_cols = [ + c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio") + ] conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])] + return df.style.map( lambda v: "background-color:#e6ffe6;font-weight:bold;" if pd.notna(v) and v <= threshold @@ -257,17 +262,20 @@ def highlight_ratio_columns(styler: "pd.io.formats.style.Styler"): if not ratio_cols: return styler - # highlight cells + # Highlight entire column (cells) styler = styler.apply( lambda _: ["background-color: #fff3b0"] * len(styler.data), subset=ratio_cols, axis=0, ) - # highlight headers + # Highlight column headers styler = styler.set_table_styles( [ - {"selector": f"th.col_heading.level0.col{i}", "props": [("background-color", "#fff3b0")]} + { + "selector": f"th.col_heading.level0.col{i}", + "props": [("background-color", "#fff3b0")], + } for i, col in enumerate(styler.data.columns) if col in ratio_cols ], @@ -296,14 +304,17 @@ def _add_limit_line(fig, y_value: float, label: str): x=[None], y=[None], mode="lines", - line=dict(dash="dash", color="red" if "ttft" in label.lower() else "blue"), + line=dict( + dash="dash", + color="red" if "ttft" in label.lower() else "blue", + ), name=label, ) ) # ----------------------------- -# Refactored "main" +# Refactored main + group-first report # ----------------------------- @dataclass(frozen=True) class MetricPlan: @@ -343,11 +354,14 @@ def build_parser() -> argparse.ArgumentParser: def choose_metrics(latency: str) -> MetricPlan: latency = (latency or "").lower() drop_column = "P99" + if "median" in latency: return MetricPlan( data_cols=["Output Tput (tok/s)", "Median TTFT (ms)", "Median"], drop_column=drop_column, ) + + # default: p99 return MetricPlan( data_cols=["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"], drop_column=drop_column, @@ -357,11 +371,13 @@ def choose_metrics(latency: str) -> MetricPlan: def prepare_input_files(args, info_cols: List[str]) -> Tuple[List[str], List[str]]: if not args.file: raise ValueError("No input files provided. Use -f/--file.") + if len(args.file) == 1: files = split_json_by_tp_pp(args.file[0], output_root="splits") info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")] else: files = args.file + return files, info_cols @@ -371,6 +387,7 @@ def get_y_axis_col(info_cols: List[str], xaxis: str) -> str: def get_group_cols(output_df: pd.DataFrame, info_cols: List[str]) -> List[str]: + # Your current grouping rule: first 4 info columns filtered_info_cols = info_cols[:4] group_cols = [c for c in filtered_info_cols if c in output_df.columns] if not group_cols: @@ -381,27 +398,38 @@ def get_group_cols(output_df: pd.DataFrame, info_cols: List[str]) -> List[str]: return group_cols -def group_suffix(group_cols: List[str], name) -> str: - name_vals = name if isinstance(name, tuple) else (name,) - return " , ".join(f"{col} : [ {val} ] " for col, val in zip(group_cols, name_vals)) +def normalize_group_key(name): + """Pandas group key can be scalar (1 col) or tuple (N cols). Normalize to tuple.""" + return name if isinstance(name, tuple) else (name,) def group_filename(name, prefix: str = "perf_comparison_") -> str: - name_vals = name if isinstance(name, tuple) else (name,) + name_vals = normalize_group_key(name) safe = ",".join(map(str, name_vals)).replace(",", "_").replace("/", "-") return f"{prefix}{safe}.html" -def render_metric_table_html(display_group: pd.DataFrame, metric_label: str, suffix: str, args) -> str: +def build_group_suffix(group_cols: List[str], name) -> str: + name_vals = normalize_group_key(name) + return " , ".join( + f"{col} : [ {val} ] " for col, val in zip(group_cols, name_vals) + ) + + +def render_metric_table_html( + display_group: pd.DataFrame, + metric_label: str, + group_suffix: str, + args, +) -> str: title = ( f'