diff --git a/.buildkite/performance-benchmarks/scripts/compare-json-results.py b/.buildkite/performance-benchmarks/scripts/compare-json-results.py index 24d9155e64b41..6cb05879cdc17 100644 --- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py @@ -1,26 +1,51 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from __future__ import annotations + import argparse +import html as _html import json import os +from dataclasses import dataclass from importlib import util +from typing import List, Tuple import pandas as pd pd.options.display.float_format = "{:.2f}".format plotly_found = util.find_spec("plotly.express") is not None +DEFAULT_INFO_COLS = [ + "Model", + "Dataset Name", + "Input Len", + "Output Len", + "TP Size", + "PP Size", + "# of max concurrency.", + "qps", +] + +# ----------------------------- +# Core data compare +# ----------------------------- def compare_data_columns( - files, name_column, data_column, info_cols, drop_column, debug=False + files: List[str], + name_column: str, + data_column: str, + info_cols: List[str], + drop_column: str, + debug: bool = False, ): """ Align concatenation by keys derived from info_cols instead of row order. - Pick one canonical key list: subset of info_cols present in ALL files. - For each file: set index to those keys, aggregate duplicates - - (mean for metric, first for names). + (mean for metric, first for names). - Concat along axis=1 (indexes align), then reset_index so callers can - - group by columns. + group by columns. - If --debug, add a _name column per file. """ print("\ncompare_data_column:", data_column) @@ -94,7 +119,7 @@ def compare_data_columns( frames.append(meta) meta_added = True - # (NEW) debug: aligned test-name column per file + # debug: aligned test-name column per file if debug and name_column in df_idx.columns: name_s = df_idx[name_column] if not name_s.index.is_unique: @@ -106,24 +131,22 @@ def compare_data_columns( raw_data_cols.append(file_label) compare_frames.append(s) - # Generalize ratio: for any file N>=2, add ratio (fileN / file1) + # ratio columns: fileN / file1 (throughput) or file1 / fileN (latency) if len(compare_frames) >= 2: base = compare_frames[0] current = compare_frames[-1] if "P99" in data_column or "Median" in data_column: - ratio = base / current # for latency + ratio = base / current # for latency: larger means better else: - ratio = current / base - ratio = ratio.mask(base == 0) # avoid inf when baseline is 0 + ratio = current / base # for throughput: larger means better + ratio = ratio.mask(base == 0) ratio.name = f"Ratio 1 vs {len(compare_frames)}" frames.append(ratio) - # 4) concat on columns with aligned MultiIndex; - # then reset_index to return keys as columns concat_df = pd.concat(frames, axis=1) - concat_df = concat_df.reset_index(drop=True).reset_index() - if "index" in concat_df.columns: - concat_df = concat_df.drop(columns=["index"]) + + # NOTE: meta already contains key columns as normal columns, so we can drop the index cleanly. + concat_df = concat_df.reset_index(drop=True) # Ensure key/info columns appear first (in your info_cols order) front = [c for c in info_cols if c in concat_df.columns] @@ -134,16 +157,18 @@ def compare_data_columns( return concat_df, raw_data_cols +# ----------------------------- +# Split helper (restored) +# ----------------------------- def split_json_by_tp_pp( input_file: str = "benchmark_results.json", output_root: str = "." -) -> list[str]: +) -> List[str]: """ Split a benchmark JSON into separate folders by (TP Size, PP Size). Creates: /tp{TP}_pp{PP}/benchmark_results.json Returns: list of file paths written. """ - # Load JSON data into DataFrame with open(input_file, encoding="utf-8") as f: data = json.load(f) @@ -161,9 +186,7 @@ def split_json_by_tp_pp( (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None ) if name_col: - df = df[ - df[name_col].astype(str).str.contains(r"serving", case=False, na=False) - ].copy() + df = df[df[name_col].astype(str).str.contains(r"serving", case=False, na=False)].copy() # Handle alias column names rename_map = { @@ -172,9 +195,7 @@ def split_json_by_tp_pp( "pp_size": "PP Size", "pipeline_parallel_size": "PP Size", } - df.rename( - columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True - ) + df.rename(columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True) # Ensure TP/PP columns exist (default to 1 if missing) if "TP Size" not in df.columns: @@ -182,16 +203,10 @@ def split_json_by_tp_pp( if "PP Size" not in df.columns: df["PP Size"] = 1 - # make sure TP/PP are numeric ints with no NaN - df["TP Size"] = ( - pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int) - ) - df["PP Size"] = ( - pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int) - ) + df["TP Size"] = pd.to_numeric(df["TP Size"], errors="coerce").fillna(1).astype(int) + df["PP Size"] = pd.to_numeric(df["PP Size"], errors="coerce").fillna(1).astype(int) - # Split into separate folders - saved_paths: list[str] = [] + saved_paths: List[str] = [] for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False): folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}") os.makedirs(folder_name, exist_ok=True) @@ -203,32 +218,9 @@ def split_json_by_tp_pp( return saved_paths -def _add_limit_line(fig, y_value, label): - # Visible dashed line + annotation - fig.add_hline( - y=y_value, - line_dash="dash", - line_color="red" if "ttft" in label.lower() else "blue", - annotation_text=f"{label}: {y_value} ms", - annotation_position="top left", - ) - # Optional: add a legend item (as a transparent helper trace) - if plot and plotly_found: - import plotly.graph_objects as go - - fig.add_trace( - go.Scatter( - x=[None], - y=[None], - mode="lines", - line=dict( - dash="dash", color="red" if "ttft" in label.lower() else "blue" - ), - name=f"{label}", - ) - ) - - +# ----------------------------- +# Styling helpers +# ----------------------------- def _find_concurrency_col(df: pd.DataFrame) -> str: for c in [ "# of max concurrency.", @@ -239,26 +231,17 @@ def _find_concurrency_col(df: pd.DataFrame) -> str: ]: if c in df.columns: return c - # Fallback: guess an integer-like column (harmless if unused) for c in df.columns: if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1: return c return "# of max concurrency." -def _highlight_threshold( - df: pd.DataFrame, threshold: float -) -> "pd.io.formats.style.Styler": +def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.style.Styler": """Highlight numeric per-configuration columns with value <= threshold.""" conc_col = _find_concurrency_col(df) - key_cols = [ - c - for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col] - if c in df.columns - ] - conf_cols = [ - c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio") - ] + key_cols = [c for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col] if c in df.columns] + conf_cols = [c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")] conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])] return df.style.map( lambda v: "background-color:#e6ffe6;font-weight:bold;" @@ -267,45 +250,71 @@ def _highlight_threshold( subset=conf_cols, ) -def highlight_ratio_columns(styler): - ratio_cols = [ - c for c in styler.data.columns - if "ratio" in str(c).lower() - ] +def highlight_ratio_columns(styler: "pd.io.formats.style.Styler"): + """Highlight entire columns whose header contains 'Ratio'.""" + ratio_cols = [c for c in styler.data.columns if "ratio" in str(c).lower()] if not ratio_cols: return styler - # Highlight entire column (cells) + # highlight cells styler = styler.apply( lambda _: ["background-color: #fff3b0"] * len(styler.data), subset=ratio_cols, axis=0, ) - # Highlight column headers + # highlight headers styler = styler.set_table_styles( [ - { - "selector": f"th.col_heading.level0.col{i}", - "props": [("background-color", "#fff3b0")], - } + {"selector": f"th.col_heading.level0.col{i}", "props": [("background-color", "#fff3b0")]} for i, col in enumerate(styler.data.columns) if col in ratio_cols ], overwrite=False, ) - return styler -if __name__ == "__main__": + +# ----------------------------- +# Plot helper +# ----------------------------- +def _add_limit_line(fig, y_value: float, label: str): + fig.add_hline( + y=y_value, + line_dash="dash", + line_color="red" if "ttft" in label.lower() else "blue", + annotation_text=f"{label}: {y_value} ms", + annotation_position="top left", + ) + # If plotly is available, add a legend entry + if plotly_found: + import plotly.graph_objects as go + + fig.add_trace( + go.Scatter( + x=[None], + y=[None], + mode="lines", + line=dict(dash="dash", color="red" if "ttft" in label.lower() else "blue"), + name=label, + ) + ) + + +# ----------------------------- +# Refactored "main" +# ----------------------------- +@dataclass(frozen=True) +class MetricPlan: + data_cols: List[str] + drop_column: str + + +def build_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser() - parser.add_argument( - "-f", "--file", action="append", type=str, help="input file name" - ) - parser.add_argument( - "--debug", action="store_true", help="show all information for debugging" - ) + parser.add_argument("-f", "--file", action="append", type=str, help="input file name") + parser.add_argument("--debug", action="store_true", help="show all information for debugging") parser.add_argument( "--plot", action=argparse.BooleanOptionalAction, @@ -326,188 +335,187 @@ if __name__ == "__main__": default="p99", help="take median|p99 for latency like TTFT/TPOT", ) - parser.add_argument( - "--ttft-max-ms", - type=float, - default=3000.0, - help="Reference limit for TTFT plots (ms)", - ) - parser.add_argument( - "--tpot-max-ms", - type=float, - default=100.0, - help="Reference limit for TPOT plots (ms)", - ) + parser.add_argument("--ttft-max-ms", type=float, default=3000.0, help="Reference limit for TTFT plots (ms)") + parser.add_argument("--tpot-max-ms", type=float, default=100.0, help="Reference limit for TPOT plots (ms)") + return parser - args = parser.parse_args() +def choose_metrics(latency: str) -> MetricPlan: + latency = (latency or "").lower() drop_column = "P99" - name_column = "Test name" - info_cols = [ - "Model", - "Dataset Name", - "Input Len", - "Output Len", - "TP Size", - "PP Size", - "# of max concurrency.", - "qps", - ] + if "median" in latency: + return MetricPlan( + data_cols=["Output Tput (tok/s)", "Median TTFT (ms)", "Median"], + drop_column=drop_column, + ) + return MetricPlan( + data_cols=["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"], + drop_column=drop_column, + ) - if "median" in args.latency: - data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"] - html_msgs_for_data_cols = [ - "Compare Output Tokens /n", - "Median TTFT /n", - "Median TPOT /n", - ] - drop_column = "P99" - elif "p99" in args.latency: - data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"] - html_msgs_for_data_cols = [ - "Compare Output Tokens /n", - "P99 TTFT /n", - "P99 TPOT /n", - ] +def prepare_input_files(args, info_cols: List[str]) -> Tuple[List[str], List[str]]: + if not args.file: + raise ValueError("No input files provided. Use -f/--file.") if len(args.file) == 1: files = split_json_by_tp_pp(args.file[0], output_root="splits") info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")] else: files = args.file + return files, info_cols + + +def get_y_axis_col(info_cols: List[str], xaxis: str) -> str: + y_axis_index = info_cols.index(xaxis) if xaxis in info_cols else 6 + return info_cols[y_axis_index] + + +def get_group_cols(output_df: pd.DataFrame, info_cols: List[str]) -> List[str]: + filtered_info_cols = info_cols[:4] + group_cols = [c for c in filtered_info_cols if c in output_df.columns] + if not group_cols: + raise ValueError( + f"No valid group-by columns. Expected subset: {filtered_info_cols}, " + f"but DataFrame has: {list(output_df.columns)}" + ) + return group_cols + + +def group_suffix(group_cols: List[str], name) -> str: + name_vals = name if isinstance(name, tuple) else (name,) + return " , ".join(f"{col} : [ {val} ] " for col, val in zip(group_cols, name_vals)) + + +def group_filename(name, prefix: str = "perf_comparison_") -> str: + name_vals = name if isinstance(name, tuple) else (name,) + safe = ",".join(map(str, name_vals)).replace(",", "_").replace("/", "-") + return f"{prefix}{safe}.html" + + +def render_metric_table_html(display_group: pd.DataFrame, metric_label: str, suffix: str, args) -> str: + title = ( + f'
' + f'{_html.escape(metric_label)}' + f' — {_html.escape(suffix)}' + f"
\n" + ) + + metric_name = metric_label.lower() + + if "ttft" in metric_name: + styler = _highlight_threshold(display_group, args.ttft_max_ms) + elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name): + styler = _highlight_threshold(display_group, args.tpot_max_ms) + else: + styler = display_group.style + + # format numbers + highlight ratios + styler = styler.format( + {c: "{:.2f}" for c in display_group.select_dtypes("number").columns}, + na_rep="—", + ) + styler = highlight_ratio_columns(styler) + + return title + styler.to_html(table_attributes='border="1" class="dataframe"') + + +def maybe_write_plot( + main_fh, + sub_fh, + group_df: pd.DataFrame, + raw_data_cols: List[str], + metric_label: str, + y_axis_col: str, + args, +): + if not (args.plot and plotly_found): + return + + import plotly.express as px + + df = group_df[raw_data_cols].sort_values(by=y_axis_col) + df_melted = df.melt( + id_vars=y_axis_col, + var_name="Configuration", + value_name=metric_label, + ) + + fig = px.line( + df_melted, + x=y_axis_col, + y=metric_label, + color="Configuration", + title=f"{metric_label} vs {y_axis_col}", + markers=True, + ) + + metric_name = metric_label.lower() + if "ttft" in metric_name: + _add_limit_line(fig, args.ttft_max_ms, "TTFT limit") + elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name): + _add_limit_line(fig, args.tpot_max_ms, "TPOT limit") + + html = fig.to_html(full_html=True, include_plotlyjs="cdn") + main_fh.write(html) + sub_fh.write(html) + + +def write_report(files: List[str], info_cols: List[str], plan: MetricPlan, args): + name_column = "Test name" + y_axis_col = get_y_axis_col(info_cols, args.xaxis) + print("comparing : " + ", ".join(files)) - debug = args.debug - plot = args.plot - # For Plot feature, assign y axis from one of info_cols - y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6 - with open("perf_comparison.html", "w") as text_file: - for i in range(len(data_cols_to_compare)): + + with open("perf_comparison.html", "w") as main_fh: + for metric_label in plan.data_cols: output_df, raw_data_cols = compare_data_columns( files, name_column, - data_cols_to_compare[i], + metric_label, info_cols, - drop_column, - debug=debug, + plan.drop_column, + debug=args.debug, ) - # For Plot feature, insert y axis from one of info_cols - raw_data_cols.insert(0, info_cols[y_axis_index]) + raw_data_cols = list(raw_data_cols) + raw_data_cols.insert(0, y_axis_col) + + group_cols = get_group_cols(output_df, info_cols) - filtered_info_cols = info_cols[:4] - existing_group_cols = [ - c for c in filtered_info_cols if c in output_df.columns - ] - if not existing_group_cols: - raise ValueError( - f"No valid group-by columns " - f"Expected subset: {filtered_info_cols}, " - f"but DataFrame has: {list(output_df.columns)}" - ) - # output_df_sorted = output_df.sort_values(by=existing_group_cols) output_df_sorted = output_df.sort_values(by=args.xaxis) - output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False) - for name, group in output_groups: - group_name = ( - ",".join(map(str, name)).replace(",", "_").replace("/", "-") - ) - group_html_name = "perf_comparison_" + group_name + ".html" - import html as _html - name_vals = name if isinstance(name, tuple) else (name,) - group_title_suffix = " , ".join( - f"{col} : [ {val} ] " for col, val in zip(existing_group_cols, name_vals) - ) + for name, group_df in output_df_sorted.groupby(group_cols, dropna=False): + suffix = group_suffix(group_cols, name) + sub_path = group_filename(name) - # --------------------------------------------- - # DROP group columns from DISPLAY ONLY - # --------------------------------------------- - display_group = group.drop(columns=existing_group_cols, errors="ignore") + # drop group columns from display only + display_group = group_df.drop(columns=group_cols, errors="ignore") - metric_name = str(data_cols_to_compare[i]).lower() - if "tok/s" in metric_name: - styler = display_group.style - styler = highlight_ratio_columns(styler) - html = ( - f'
' - f'{_html.escape(data_cols_to_compare[i])}' - f' — {_html.escape(group_title_suffix)}' - f'
\n' - + styler.to_html(table_attributes='border="1" class="dataframe"') - ) - elif "ttft" in metric_name: - styler = _highlight_threshold(display_group, args.ttft_max_ms).format( - {c: "{:.2f}" for c in display_group.select_dtypes("number").columns}, - na_rep="—", - ) - styler = highlight_ratio_columns(styler) - html = ( - f'
' - f'{_html.escape(data_cols_to_compare[i])}' - f' — {_html.escape(group_title_suffix)}' - f'
\n' - + styler.to_html(table_attributes='border="1" class="dataframe"') + html = render_metric_table_html(display_group, metric_label, suffix, args) + + main_fh.write(html) + with open(sub_path, "a+") as sub_fh: + sub_fh.write(html) + maybe_write_plot( + main_fh, + sub_fh, + group_df=group_df, + raw_data_cols=raw_data_cols, + metric_label=metric_label, + y_axis_col=y_axis_col, + args=args, ) - elif ( - "tpot" in metric_name - or "median" in metric_name - or "p99" in metric_name - ): - styler = _highlight_threshold(display_group, args.tpot_max_ms).format( - {c: "{:.2f}" for c in display_group.select_dtypes("number").columns}, - na_rep="—", - ) - styler = highlight_ratio_columns(styler) - html = ( - f'
' - f'{_html.escape(data_cols_to_compare[i])}' - f' — {_html.escape(group_title_suffix)}' - f'
\n' - + styler.to_html(table_attributes='border="1" class="dataframe"') - ) - - text_file.write(html) - with open(group_html_name, "a+") as sub_text_file: - sub_text_file.write(html) - if plot and plotly_found: - import plotly.express as px +def main(): + args = build_parser().parse_args() - df = group[raw_data_cols] - df_sorted = df.sort_values(by=info_cols[y_axis_index]) - # Melt DataFrame for plotting - df_melted = df_sorted.melt( - id_vars=info_cols[y_axis_index], - var_name="Configuration", - value_name=data_cols_to_compare[i], - ) - title = ( - data_cols_to_compare[i] + " vs " + info_cols[y_axis_index] - ) - # Create Plotly line chart - fig = px.line( - df_melted, - x=info_cols[y_axis_index], - y=data_cols_to_compare[i], - color="Configuration", - title=title, - markers=True, - ) + info_cols = list(DEFAULT_INFO_COLS) + plan = choose_metrics(args.latency) - # ---- Add threshold lines based on metric name ---- - if "ttft" in metric_name: - _add_limit_line(fig, args.ttft_max_ms, "TTFT limit") - elif ( - "tpot" in metric_name - or "median" in metric_name - or "p99" in metric_name - ): - _add_limit_line(fig, args.tpot_max_ms, "TPOT limit") + files, info_cols = prepare_input_files(args, info_cols) + write_report(files, info_cols, plan, args) + + +if __name__ == "__main__": + main() - # Export to HTML - text_file.write( - fig.to_html(full_html=True, include_plotlyjs="cdn") - ) - sub_text_file.write( - fig.to_html(full_html=True, include_plotlyjs="cdn") - )