From b735255f1743aaa18b298ea5ef68689115d43930 Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Thu, 18 Dec 2025 16:22:30 -0800 Subject: [PATCH 01/13] improve cpu tests for 0.12.0 Signed-off-by: Tsai, Louie --- .../tests/serving-tests-cpu.json | 97 +++++++++++++++---- 1 file changed, 76 insertions(+), 21 deletions(-) diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json index 8f7200862d20c..1b031a2717610 100644 --- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json @@ -19,10 +19,8 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", - "enforce_eager": "", "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" + "max_num_seqs": 256 }, "client_parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", @@ -35,7 +33,8 @@ { "test_name": "serving_llama8B_tp1_sharegpt", "server_parameters": { - "tensor_parallel_size": 1 + "tensor_parallel_size": 1, + "enforce_eager": "" }, "client_parameters": { "dataset_name": "sharegpt", @@ -45,7 +44,8 @@ { "test_name": "serving_llama8B_tp2_sharegpt", "server_parameters": { - "tensor_parallel_size": 2 + "tensor_parallel_size": 2, + "enforce_eager": "" }, "client_parameters": { "dataset_name": "sharegpt", @@ -55,7 +55,8 @@ { "test_name": "serving_llama8B_tp1_random_128_128", "server_parameters": { - "tensor_parallel_size": 1 + "tensor_parallel_size": 1, + "enforce_eager": "" }, "client_parameters": { "dataset_name": "random", @@ -66,7 +67,8 @@ { "test_name": "serving_llama8B_tp2_random_128_128", "server_parameters": { - "tensor_parallel_size": 2 + "tensor_parallel_size": 2, + "enforce_eager": "" }, "client_parameters": { "dataset_name": "random", @@ -77,7 +79,8 @@ { "test_name": "serving_llama8B_tp4_random_128_128", "server_parameters": { - "tensor_parallel_size": 4 + "tensor_parallel_size": 4, + "enforce_eager": "" }, 
"client_parameters": { "dataset_name": "random", @@ -88,7 +91,8 @@ { "test_name": "serving_llama8B_tp1_random_128_2048", "server_parameters": { - "tensor_parallel_size": 1 + "tensor_parallel_size": 1, + "enforce_eager": "" }, "client_parameters": { "dataset_name": "random", @@ -99,7 +103,8 @@ { "test_name": "serving_llama8B_tp2_random_128_2048", "server_parameters": { - "tensor_parallel_size": 2 + "tensor_parallel_size": 2, + "enforce_eager": "" }, "client_parameters": { "dataset_name": "random", @@ -110,7 +115,8 @@ { "test_name": "serving_llama8B_tp4_random_128_2048", "server_parameters": { - "tensor_parallel_size": 4 + "tensor_parallel_size": 4, + "enforce_eager": "" }, "client_parameters": { "dataset_name": "random", @@ -121,7 +127,8 @@ { "test_name": "serving_llama8B_tp1_random_2048_128", "server_parameters": { - "tensor_parallel_size": 1 + "tensor_parallel_size": 1, + "enforce_eager": "" }, "client_parameters": { "dataset_name": "random", @@ -132,7 +139,8 @@ { "test_name": "serving_llama8B_tp2_random_2048_128", "server_parameters": { - "tensor_parallel_size": 2 + "tensor_parallel_size": 2, + "enforce_eager": "" }, "client_parameters": { "dataset_name": "random", @@ -143,7 +151,8 @@ { "test_name": "serving_llama8B_tp4_random_2048_128", "server_parameters": { - "tensor_parallel_size": 4 + "tensor_parallel_size": 4, + "enforce_eager": "" }, "client_parameters": { "dataset_name": "random", @@ -151,11 +160,51 @@ "random-output-len": 128 } }, + { + "test_name": "serving_llama8B_int4_tp1_random_128_128", + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_int4_tp2_random_128_128", + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + 
"tensor_parallel_size": 2 + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_int4_tp4_random_128_128", + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "tensor_parallel_size": 4 + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, { "test_name": "serving_llama3B_tp1_random_128_128", "server_parameters": { "model": "meta-llama/Llama-3.2-3B-Instruct", - "tensor_parallel_size": 1 + "tensor_parallel_size": 1, + "enforce_eager": "" }, "client_parameters": { "model": "meta-llama/Llama-3.2-3B-Instruct", @@ -168,7 +217,8 @@ "test_name": "serving_granite2B_tp1_random_128_128", "server_parameters": { "model": "ibm-granite/granite-3.2-2b-instruct", - "tensor_parallel_size": 1 + "tensor_parallel_size": 1, + "enforce_eager": "" }, "client_parameters": { "model": "ibm-granite/granite-3.2-2b-instruct", @@ -181,7 +231,8 @@ "test_name": "serving_qwen1.7B_tp1_random_128_128", "server_parameters": { "model": "Qwen/Qwen3-1.7B", - "tensor_parallel_size": 1 + "tensor_parallel_size": 1, + "enforce_eager": "" }, "client_parameters": { "model": "Qwen/Qwen3-1.7B", @@ -194,7 +245,8 @@ "test_name": "serving_qwen4B_tp1_random_128_128", "server_parameters": { "model": "Qwen/Qwen3-4B", - "tensor_parallel_size": 1 + "tensor_parallel_size": 1, + "enforce_eager": "" }, "client_parameters": { "model": "Qwen/Qwen3-4B", @@ -207,7 +259,8 @@ "test_name": "serving_qwen8B_tp1_random_128_128", "server_parameters": { "model": "Qwen/Qwen3-8B", - "tensor_parallel_size": 1 + "tensor_parallel_size": 1, + "enforce_eager": "" }, "client_parameters": { "model": "Qwen/Qwen3-8B", @@ -220,7 +273,8 @@ "test_name": "serving_glm9B_tp1_random_128_128", "server_parameters": { 
"model": "zai-org/glm-4-9b-hf", - "tensor_parallel_size": 1 + "tensor_parallel_size": 1, + "enforce_eager": "" }, "client_parameters": { "model": "zai-org/glm-4-9b-hf", @@ -233,7 +287,8 @@ "test_name": "serving_gemma7B_tp1_random_128_128", "server_parameters": { "model": "google/gemma-7b", - "tensor_parallel_size": 1 + "tensor_parallel_size": 1, + "enforce_eager": "" }, "client_parameters": { "model": "google/gemma-7b", From ba0bf189c85d9f0a9ea2ceabd8b15f6c0e9334fd Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Fri, 19 Dec 2025 19:16:59 -0800 Subject: [PATCH 02/13] improve table readability Signed-off-by: Tsai, Louie --- .../scripts/compare-json-results.py | 46 +++++++++++++++---- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/.buildkite/performance-benchmarks/scripts/compare-json-results.py b/.buildkite/performance-benchmarks/scripts/compare-json-results.py index c8bf7b0453662..0ecbd2be11927 100644 --- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py @@ -364,7 +364,7 @@ if __name__ == "__main__": # For Plot feature, insert y axis from one of info_cols raw_data_cols.insert(0, info_cols[y_axis_index]) - filtered_info_cols = info_cols[:-2] + filtered_info_cols = info_cols[:-4] existing_group_cols = [ c for c in filtered_info_cols if c in output_df.columns ] @@ -382,30 +382,56 @@ if __name__ == "__main__": ",".join(map(str, name)).replace(",", "_").replace("/", "-") ) group_html_name = "perf_comparison_" + group_name + ".html" + import html as _html + name_vals = name if isinstance(name, tuple) else (name,) + group_title_suffix = ", ".join( + f"{col}={val}" for col, val in zip(existing_group_cols, name_vals) + ) + + # --------------------------------------------- + # DROP group columns from DISPLAY ONLY + # --------------------------------------------- + display_group = group.drop(columns=existing_group_cols, errors="ignore") metric_name = 
str(data_cols_to_compare[i]).lower() if "tok/s" in metric_name: - html = group.to_html() + html = ( + f'
' + f'{_html.escape(data_cols_to_compare[i])}' + f' — {_html.escape(group_title_suffix)}' + f'
\n' + + display_group.to_html(index=False) + ) elif "ttft" in metric_name: - styler = _highlight_threshold(group, args.ttft_max_ms).format( - {c: "{:.2f}" for c in group.select_dtypes("number").columns}, + styler = _highlight_threshold(display_group, args.ttft_max_ms).format( + {c: "{:.2f}" for c in display_group.select_dtypes("number").columns}, na_rep="—", ) - html = styler.to_html( - table_attributes='border="1" class="dataframe"' + html = ( + f'
' + f'{_html.escape(data_cols_to_compare[i])}' + f' — {_html.escape(group_title_suffix)}' + f'
\n' + + styler.to_html(table_attributes='border="1" class="dataframe"') ) + elif ( "tpot" in metric_name or "median" in metric_name or "p99" in metric_name ): - styler = _highlight_threshold(group, args.tpot_max_ms).format( - {c: "{:.2f}" for c in group.select_dtypes("number").columns}, + styler = _highlight_threshold(display_group, args.tpot_max_ms).format( + {c: "{:.2f}" for c in display_group.select_dtypes("number").columns}, na_rep="—", ) - html = styler.to_html( - table_attributes='border="1" class="dataframe"' + html = ( + f'
' + f'{_html.escape(data_cols_to_compare[i])}' + f' — {_html.escape(group_title_suffix)}' + f'
\n' + + styler.to_html(table_attributes='border="1" class="dataframe"') ) + text_file.write(html_msgs_for_data_cols[i]) text_file.write(html) From 763d48dbcb2f5a668d44651c73a0f5126e6a0d78 Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Fri, 19 Dec 2025 19:25:28 -0800 Subject: [PATCH 03/13] highlight ratio for TTFT and TPOT Signed-off-by: Tsai, Louie --- .../scripts/compare-json-results.py | 34 ++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/.buildkite/performance-benchmarks/scripts/compare-json-results.py b/.buildkite/performance-benchmarks/scripts/compare-json-results.py index 0ecbd2be11927..fad407a545bf1 100644 --- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py @@ -267,6 +267,36 @@ def _highlight_threshold( subset=conf_cols, ) +def highlight_ratio_columns(styler): + ratio_cols = [ + c for c in styler.data.columns + if "ratio" in str(c).lower() + ] + + if not ratio_cols: + return styler + + # Highlight entire column (cells) + styler = styler.apply( + lambda _: ["background-color: #fff3b0"] * len(styler.data), + subset=ratio_cols, + axis=0, + ) + + # Highlight column headers + styler = styler.set_table_styles( + [ + { + "selector": f"th.col_heading.level0.col{i}", + "props": [("background-color", "#fff3b0")], + } + for i, col in enumerate(styler.data.columns) + if col in ratio_cols + ], + overwrite=False, + ) + + return styler if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -364,7 +394,7 @@ if __name__ == "__main__": # For Plot feature, insert y axis from one of info_cols raw_data_cols.insert(0, info_cols[y_axis_index]) - filtered_info_cols = info_cols[:-4] + filtered_info_cols = info_cols[:4] existing_group_cols = [ c for c in filtered_info_cols if c in output_df.columns ] @@ -407,6 +437,7 @@ if __name__ == "__main__": {c: "{:.2f}" for c in display_group.select_dtypes("number").columns}, na_rep="—", ) + styler = 
highlight_ratio_columns(styler) html = ( f'
' f'{_html.escape(data_cols_to_compare[i])}' @@ -424,6 +455,7 @@ if __name__ == "__main__": {c: "{:.2f}" for c in display_group.select_dtypes("number").columns}, na_rep="—", ) + styler = highlight_ratio_columns(styler) html = ( f'
' f'{_html.escape(data_cols_to_compare[i])}' From efa495545c67a89afffb9879c4fa23d625abcf11 Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Fri, 19 Dec 2025 23:19:54 -0800 Subject: [PATCH 04/13] highlight ratio in throughput Signed-off-by: Tsai, Louie --- .../scripts/compare-json-results.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/.buildkite/performance-benchmarks/scripts/compare-json-results.py b/.buildkite/performance-benchmarks/scripts/compare-json-results.py index fad407a545bf1..24d9155e64b41 100644 --- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py @@ -414,8 +414,8 @@ if __name__ == "__main__": group_html_name = "perf_comparison_" + group_name + ".html" import html as _html name_vals = name if isinstance(name, tuple) else (name,) - group_title_suffix = ", ".join( - f"{col}={val}" for col, val in zip(existing_group_cols, name_vals) + group_title_suffix = " , ".join( + f"{col} : [ {val} ] " for col, val in zip(existing_group_cols, name_vals) ) # --------------------------------------------- @@ -425,12 +425,14 @@ if __name__ == "__main__": metric_name = str(data_cols_to_compare[i]).lower() if "tok/s" in metric_name: + styler = display_group.style + styler = highlight_ratio_columns(styler) html = ( f'
' f'{_html.escape(data_cols_to_compare[i])}' f' — {_html.escape(group_title_suffix)}' f'
\n' - + display_group.to_html(index=False) + + styler.to_html(table_attributes='border="1" class="dataframe"') ) elif "ttft" in metric_name: styler = _highlight_threshold(display_group, args.ttft_max_ms).format( @@ -464,11 +466,8 @@ if __name__ == "__main__": + styler.to_html(table_attributes='border="1" class="dataframe"') ) - - text_file.write(html_msgs_for_data_cols[i]) text_file.write(html) with open(group_html_name, "a+") as sub_text_file: - sub_text_file.write(html_msgs_for_data_cols[i]) sub_text_file.write(html) if plot and plotly_found: From 63ebc2336d4e3538fdc4428115b5aafa8a6e69be Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Fri, 19 Dec 2025 23:42:19 -0800 Subject: [PATCH 05/13] code refactor to improve readability Signed-off-by: Tsai, Louie --- .../scripts/compare-json-results.py | 498 +++++++++--------- 1 file changed, 253 insertions(+), 245 deletions(-) diff --git a/.buildkite/performance-benchmarks/scripts/compare-json-results.py b/.buildkite/performance-benchmarks/scripts/compare-json-results.py index 24d9155e64b41..6cb05879cdc17 100644 --- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py @@ -1,26 +1,51 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from __future__ import annotations + import argparse +import html as _html import json import os +from dataclasses import dataclass from importlib import util +from typing import List, Tuple import pandas as pd pd.options.display.float_format = "{:.2f}".format plotly_found = util.find_spec("plotly.express") is not None +DEFAULT_INFO_COLS = [ + "Model", + "Dataset Name", + "Input Len", + "Output Len", + "TP Size", + "PP Size", + "# of max concurrency.", + "qps", +] + +# ----------------------------- +# Core data compare +# ----------------------------- def compare_data_columns( - files, name_column, data_column, info_cols, drop_column, debug=False
+ files: List[str], + name_column: str, + data_column: str, + info_cols: List[str], + drop_column: str, + debug: bool = False, ): """ Align concatenation by keys derived from info_cols instead of row order. - Pick one canonical key list: subset of info_cols present in ALL files. - For each file: set index to those keys, aggregate duplicates - - (mean for metric, first for names). + (mean for metric, first for names). - Concat along axis=1 (indexes align), then reset_index so callers can - - group by columns. + group by columns. - If --debug, add a _name column per file. """ print("\ncompare_data_column:", data_column) @@ -94,7 +119,7 @@ def compare_data_columns( frames.append(meta) meta_added = True - # (NEW) debug: aligned test-name column per file + # debug: aligned test-name column per file if debug and name_column in df_idx.columns: name_s = df_idx[name_column] if not name_s.index.is_unique: @@ -106,24 +131,22 @@ def compare_data_columns( raw_data_cols.append(file_label) compare_frames.append(s) - # Generalize ratio: for any file N>=2, add ratio (fileN / file1) + # ratio columns: fileN / file1 (throughput) or file1 / fileN (latency) if len(compare_frames) >= 2: base = compare_frames[0] current = compare_frames[-1] if "P99" in data_column or "Median" in data_column: - ratio = base / current # for latency + ratio = base / current # for latency: larger means better else: - ratio = current / base - ratio = ratio.mask(base == 0) # avoid inf when baseline is 0 + ratio = current / base # for throughput: larger means better + ratio = ratio.mask(base == 0) ratio.name = f"Ratio 1 vs {len(compare_frames)}" frames.append(ratio) - # 4) concat on columns with aligned MultiIndex; - # then reset_index to return keys as columns concat_df = pd.concat(frames, axis=1) - concat_df = concat_df.reset_index(drop=True).reset_index() - if "index" in concat_df.columns: - concat_df = concat_df.drop(columns=["index"]) + + # NOTE: meta already contains key columns as normal columns, so we 
can drop the index cleanly. + concat_df = concat_df.reset_index(drop=True) # Ensure key/info columns appear first (in your info_cols order) front = [c for c in info_cols if c in concat_df.columns] @@ -134,16 +157,18 @@ def compare_data_columns( return concat_df, raw_data_cols +# ----------------------------- +# Split helper (restored) +# ----------------------------- def split_json_by_tp_pp( input_file: str = "benchmark_results.json", output_root: str = "." -) -> list[str]: +) -> List[str]: """ Split a benchmark JSON into separate folders by (TP Size, PP Size). Creates: /tp{TP}_pp{PP}/benchmark_results.json Returns: list of file paths written. """ - # Load JSON data into DataFrame with open(input_file, encoding="utf-8") as f: data = json.load(f) @@ -161,9 +186,7 @@ def split_json_by_tp_pp( (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None ) if name_col: - df = df[ - df[name_col].astype(str).str.contains(r"serving", case=False, na=False) - ].copy() + df = df[df[name_col].astype(str).str.contains(r"serving", case=False, na=False)].copy() # Handle alias column names rename_map = { @@ -172,9 +195,7 @@ def split_json_by_tp_pp( "pp_size": "PP Size", "pipeline_parallel_size": "PP Size", } - df.rename( - columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True - ) + df.rename(columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True) # Ensure TP/PP columns exist (default to 1 if missing) if "TP Size" not in df.columns: @@ -182,16 +203,10 @@ def split_json_by_tp_pp( if "PP Size" not in df.columns: df["PP Size"] = 1 - # make sure TP/PP are numeric ints with no NaN - df["TP Size"] = ( - pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int) - ) - df["PP Size"] = ( - pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int) - ) + df["TP Size"] = pd.to_numeric(df["TP Size"], errors="coerce").fillna(1).astype(int) + df["PP Size"] = pd.to_numeric(df["PP Size"], 
errors="coerce").fillna(1).astype(int) - # Split into separate folders - saved_paths: list[str] = [] + saved_paths: List[str] = [] for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False): folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}") os.makedirs(folder_name, exist_ok=True) @@ -203,32 +218,9 @@ def split_json_by_tp_pp( return saved_paths -def _add_limit_line(fig, y_value, label): - # Visible dashed line + annotation - fig.add_hline( - y=y_value, - line_dash="dash", - line_color="red" if "ttft" in label.lower() else "blue", - annotation_text=f"{label}: {y_value} ms", - annotation_position="top left", - ) - # Optional: add a legend item (as a transparent helper trace) - if plot and plotly_found: - import plotly.graph_objects as go - - fig.add_trace( - go.Scatter( - x=[None], - y=[None], - mode="lines", - line=dict( - dash="dash", color="red" if "ttft" in label.lower() else "blue" - ), - name=f"{label}", - ) - ) - - +# ----------------------------- +# Styling helpers +# ----------------------------- def _find_concurrency_col(df: pd.DataFrame) -> str: for c in [ "# of max concurrency.", @@ -239,26 +231,17 @@ def _find_concurrency_col(df: pd.DataFrame) -> str: ]: if c in df.columns: return c - # Fallback: guess an integer-like column (harmless if unused) for c in df.columns: if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1: return c return "# of max concurrency." 
-def _highlight_threshold( - df: pd.DataFrame, threshold: float -) -> "pd.io.formats.style.Styler": +def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.style.Styler": """Highlight numeric per-configuration columns with value <= threshold.""" conc_col = _find_concurrency_col(df) - key_cols = [ - c - for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col] - if c in df.columns - ] - conf_cols = [ - c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio") - ] + key_cols = [c for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col] if c in df.columns] + conf_cols = [c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")] conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])] return df.style.map( lambda v: "background-color:#e6ffe6;font-weight:bold;" @@ -267,45 +250,71 @@ def _highlight_threshold( subset=conf_cols, ) -def highlight_ratio_columns(styler): - ratio_cols = [ - c for c in styler.data.columns - if "ratio" in str(c).lower() - ] +def highlight_ratio_columns(styler: "pd.io.formats.style.Styler"): + """Highlight entire columns whose header contains 'Ratio'.""" + ratio_cols = [c for c in styler.data.columns if "ratio" in str(c).lower()] if not ratio_cols: return styler - # Highlight entire column (cells) + # highlight cells styler = styler.apply( lambda _: ["background-color: #fff3b0"] * len(styler.data), subset=ratio_cols, axis=0, ) - # Highlight column headers + # highlight headers styler = styler.set_table_styles( [ - { - "selector": f"th.col_heading.level0.col{i}", - "props": [("background-color", "#fff3b0")], - } + {"selector": f"th.col_heading.level0.col{i}", "props": [("background-color", "#fff3b0")]} for i, col in enumerate(styler.data.columns) if col in ratio_cols ], overwrite=False, ) - return styler -if __name__ == "__main__": + +# ----------------------------- +# Plot helper +# ----------------------------- +def _add_limit_line(fig, 
y_value: float, label: str): + fig.add_hline( + y=y_value, + line_dash="dash", + line_color="red" if "ttft" in label.lower() else "blue", + annotation_text=f"{label}: {y_value} ms", + annotation_position="top left", + ) + # If plotly is available, add a legend entry + if plotly_found: + import plotly.graph_objects as go + + fig.add_trace( + go.Scatter( + x=[None], + y=[None], + mode="lines", + line=dict(dash="dash", color="red" if "ttft" in label.lower() else "blue"), + name=label, + ) + ) + + +# ----------------------------- +# Refactored "main" +# ----------------------------- +@dataclass(frozen=True) +class MetricPlan: + data_cols: List[str] + drop_column: str + + +def build_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser() - parser.add_argument( - "-f", "--file", action="append", type=str, help="input file name" - ) - parser.add_argument( - "--debug", action="store_true", help="show all information for debugging" - ) + parser.add_argument("-f", "--file", action="append", type=str, help="input file name") + parser.add_argument("--debug", action="store_true", help="show all information for debugging") parser.add_argument( "--plot", action=argparse.BooleanOptionalAction, @@ -326,188 +335,187 @@ if __name__ == "__main__": default="p99", help="take median|p99 for latency like TTFT/TPOT", ) - parser.add_argument( - "--ttft-max-ms", - type=float, - default=3000.0, - help="Reference limit for TTFT plots (ms)", - ) - parser.add_argument( - "--tpot-max-ms", - type=float, - default=100.0, - help="Reference limit for TPOT plots (ms)", - ) + parser.add_argument("--ttft-max-ms", type=float, default=3000.0, help="Reference limit for TTFT plots (ms)") + parser.add_argument("--tpot-max-ms", type=float, default=100.0, help="Reference limit for TPOT plots (ms)") + return parser - args = parser.parse_args() +def choose_metrics(latency: str) -> MetricPlan: + latency = (latency or "").lower() drop_column = "P99" - name_column = "Test name" - info_cols = [ - 
"Model", - "Dataset Name", - "Input Len", - "Output Len", - "TP Size", - "PP Size", - "# of max concurrency.", - "qps", - ] + if "median" in latency: + return MetricPlan( + data_cols=["Output Tput (tok/s)", "Median TTFT (ms)", "Median"], + drop_column=drop_column, + ) + return MetricPlan( + data_cols=["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"], + drop_column=drop_column, + ) - if "median" in args.latency: - data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"] - html_msgs_for_data_cols = [ - "Compare Output Tokens /n", - "Median TTFT /n", - "Median TPOT /n", - ] - drop_column = "P99" - elif "p99" in args.latency: - data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"] - html_msgs_for_data_cols = [ - "Compare Output Tokens /n", - "P99 TTFT /n", - "P99 TPOT /n", - ] +def prepare_input_files(args, info_cols: List[str]) -> Tuple[List[str], List[str]]: + if not args.file: + raise ValueError("No input files provided. Use -f/--file.") if len(args.file) == 1: files = split_json_by_tp_pp(args.file[0], output_root="splits") info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")] else: files = args.file + return files, info_cols + + +def get_y_axis_col(info_cols: List[str], xaxis: str) -> str: + y_axis_index = info_cols.index(xaxis) if xaxis in info_cols else 6 + return info_cols[y_axis_index] + + +def get_group_cols(output_df: pd.DataFrame, info_cols: List[str]) -> List[str]: + filtered_info_cols = info_cols[:4] + group_cols = [c for c in filtered_info_cols if c in output_df.columns] + if not group_cols: + raise ValueError( + f"No valid group-by columns. 
Expected subset: {filtered_info_cols}, " + f"but DataFrame has: {list(output_df.columns)}" + ) + return group_cols + + +def group_suffix(group_cols: List[str], name) -> str: + name_vals = name if isinstance(name, tuple) else (name,) + return " , ".join(f"{col} : [ {val} ] " for col, val in zip(group_cols, name_vals)) + + +def group_filename(name, prefix: str = "perf_comparison_") -> str: + name_vals = name if isinstance(name, tuple) else (name,) + safe = ",".join(map(str, name_vals)).replace(",", "_").replace("/", "-") + return f"{prefix}{safe}.html" + + +def render_metric_table_html(display_group: pd.DataFrame, metric_label: str, suffix: str, args) -> str: + title = ( + f'
' + f'{_html.escape(metric_label)}' + f' — {_html.escape(suffix)}' + f"
\n" + ) + + metric_name = metric_label.lower() + + if "ttft" in metric_name: + styler = _highlight_threshold(display_group, args.ttft_max_ms) + elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name): + styler = _highlight_threshold(display_group, args.tpot_max_ms) + else: + styler = display_group.style + + # format numbers + highlight ratios + styler = styler.format( + {c: "{:.2f}" for c in display_group.select_dtypes("number").columns}, + na_rep="—", + ) + styler = highlight_ratio_columns(styler) + + return title + styler.to_html(table_attributes='border="1" class="dataframe"') + + +def maybe_write_plot( + main_fh, + sub_fh, + group_df: pd.DataFrame, + raw_data_cols: List[str], + metric_label: str, + y_axis_col: str, + args, +): + if not (args.plot and plotly_found): + return + + import plotly.express as px + + df = group_df[raw_data_cols].sort_values(by=y_axis_col) + df_melted = df.melt( + id_vars=y_axis_col, + var_name="Configuration", + value_name=metric_label, + ) + + fig = px.line( + df_melted, + x=y_axis_col, + y=metric_label, + color="Configuration", + title=f"{metric_label} vs {y_axis_col}", + markers=True, + ) + + metric_name = metric_label.lower() + if "ttft" in metric_name: + _add_limit_line(fig, args.ttft_max_ms, "TTFT limit") + elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name): + _add_limit_line(fig, args.tpot_max_ms, "TPOT limit") + + html = fig.to_html(full_html=True, include_plotlyjs="cdn") + main_fh.write(html) + sub_fh.write(html) + + +def write_report(files: List[str], info_cols: List[str], plan: MetricPlan, args): + name_column = "Test name" + y_axis_col = get_y_axis_col(info_cols, args.xaxis) + print("comparing : " + ", ".join(files)) - debug = args.debug - plot = args.plot - # For Plot feature, assign y axis from one of info_cols - y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6 - with open("perf_comparison.html", "w") as text_file: - for i in 
range(len(data_cols_to_compare)): + + with open("perf_comparison.html", "w") as main_fh: + for metric_label in plan.data_cols: output_df, raw_data_cols = compare_data_columns( files, name_column, - data_cols_to_compare[i], + metric_label, info_cols, - drop_column, - debug=debug, + plan.drop_column, + debug=args.debug, ) - # For Plot feature, insert y axis from one of info_cols - raw_data_cols.insert(0, info_cols[y_axis_index]) + raw_data_cols = list(raw_data_cols) + raw_data_cols.insert(0, y_axis_col) + + group_cols = get_group_cols(output_df, info_cols) - filtered_info_cols = info_cols[:4] - existing_group_cols = [ - c for c in filtered_info_cols if c in output_df.columns - ] - if not existing_group_cols: - raise ValueError( - f"No valid group-by columns " - f"Expected subset: {filtered_info_cols}, " - f"but DataFrame has: {list(output_df.columns)}" - ) - # output_df_sorted = output_df.sort_values(by=existing_group_cols) output_df_sorted = output_df.sort_values(by=args.xaxis) - output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False) - for name, group in output_groups: - group_name = ( - ",".join(map(str, name)).replace(",", "_").replace("/", "-") - ) - group_html_name = "perf_comparison_" + group_name + ".html" - import html as _html - name_vals = name if isinstance(name, tuple) else (name,) - group_title_suffix = " , ".join( - f"{col} : [ {val} ] " for col, val in zip(existing_group_cols, name_vals) - ) + for name, group_df in output_df_sorted.groupby(group_cols, dropna=False): + suffix = group_suffix(group_cols, name) + sub_path = group_filename(name) - # --------------------------------------------- - # DROP group columns from DISPLAY ONLY - # --------------------------------------------- - display_group = group.drop(columns=existing_group_cols, errors="ignore") + # drop group columns from display only + display_group = group_df.drop(columns=group_cols, errors="ignore") - metric_name = str(data_cols_to_compare[i]).lower() - if "tok/s" in 
metric_name: - styler = display_group.style - styler = highlight_ratio_columns(styler) - html = ( - f'
' - f'{_html.escape(data_cols_to_compare[i])}' - f' — {_html.escape(group_title_suffix)}' - f'
\n' - + styler.to_html(table_attributes='border="1" class="dataframe"') - ) - elif "ttft" in metric_name: - styler = _highlight_threshold(display_group, args.ttft_max_ms).format( - {c: "{:.2f}" for c in display_group.select_dtypes("number").columns}, - na_rep="—", - ) - styler = highlight_ratio_columns(styler) - html = ( - f'
' - f'{_html.escape(data_cols_to_compare[i])}' - f' — {_html.escape(group_title_suffix)}' - f'
\n' - + styler.to_html(table_attributes='border="1" class="dataframe"') + html = render_metric_table_html(display_group, metric_label, suffix, args) + + main_fh.write(html) + with open(sub_path, "a+") as sub_fh: + sub_fh.write(html) + maybe_write_plot( + main_fh, + sub_fh, + group_df=group_df, + raw_data_cols=raw_data_cols, + metric_label=metric_label, + y_axis_col=y_axis_col, + args=args, ) - elif ( - "tpot" in metric_name - or "median" in metric_name - or "p99" in metric_name - ): - styler = _highlight_threshold(display_group, args.tpot_max_ms).format( - {c: "{:.2f}" for c in display_group.select_dtypes("number").columns}, - na_rep="—", - ) - styler = highlight_ratio_columns(styler) - html = ( - f'
' - f'{_html.escape(data_cols_to_compare[i])}' - f' — {_html.escape(group_title_suffix)}' - f'
\n' - + styler.to_html(table_attributes='border="1" class="dataframe"') - ) - - text_file.write(html) - with open(group_html_name, "a+") as sub_text_file: - sub_text_file.write(html) - if plot and plotly_found: - import plotly.express as px +def main(): + args = build_parser().parse_args() - df = group[raw_data_cols] - df_sorted = df.sort_values(by=info_cols[y_axis_index]) - # Melt DataFrame for plotting - df_melted = df_sorted.melt( - id_vars=info_cols[y_axis_index], - var_name="Configuration", - value_name=data_cols_to_compare[i], - ) - title = ( - data_cols_to_compare[i] + " vs " + info_cols[y_axis_index] - ) - # Create Plotly line chart - fig = px.line( - df_melted, - x=info_cols[y_axis_index], - y=data_cols_to_compare[i], - color="Configuration", - title=title, - markers=True, - ) + info_cols = list(DEFAULT_INFO_COLS) + plan = choose_metrics(args.latency) - # ---- Add threshold lines based on metric name ---- - if "ttft" in metric_name: - _add_limit_line(fig, args.ttft_max_ms, "TTFT limit") - elif ( - "tpot" in metric_name - or "median" in metric_name - or "p99" in metric_name - ): - _add_limit_line(fig, args.tpot_max_ms, "TPOT limit") + files, info_cols = prepare_input_files(args, info_cols) + write_report(files, info_cols, plan, args) + + +if __name__ == "__main__": + main() - # Export to HTML - text_file.write( - fig.to_html(full_html=True, include_plotlyjs="cdn") - ) - sub_text_file.write( - fig.to_html(full_html=True, include_plotlyjs="cdn") - ) From 0e01150cb4b34cfd4c96833a7cd05ca6c627bde0 Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Fri, 19 Dec 2025 23:52:46 -0800 Subject: [PATCH 06/13] group-first report instead of data-column-first Signed-off-by: Tsai, Louie --- .../scripts/compare-json-results.py | 184 +++++++++++++----- 1 file changed, 139 insertions(+), 45 deletions(-) diff --git a/.buildkite/performance-benchmarks/scripts/compare-json-results.py b/.buildkite/performance-benchmarks/scripts/compare-json-results.py index 
6cb05879cdc17..ece004107b669 100644 --- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py @@ -9,7 +9,7 @@ import json import os from dataclasses import dataclass from importlib import util -from typing import List, Tuple +from typing import Dict, List, Tuple import pandas as pd @@ -51,11 +51,11 @@ def compare_data_columns( print("\ncompare_data_column:", data_column) frames = [] - raw_data_cols = [] + raw_data_cols: List[str] = [] compare_frames = [] # 1) choose a canonical key list from info_cols that exists in ALL files - cols_per_file = [] + cols_per_file: List[set] = [] for f in files: try: df_tmp = pd.read_json(f, orient="records") @@ -143,10 +143,7 @@ def compare_data_columns( ratio.name = f"Ratio 1 vs {len(compare_frames)}" frames.append(ratio) - concat_df = pd.concat(frames, axis=1) - - # NOTE: meta already contains key columns as normal columns, so we can drop the index cleanly. - concat_df = concat_df.reset_index(drop=True) + concat_df = pd.concat(frames, axis=1).reset_index(drop=True) # Ensure key/info columns appear first (in your info_cols order) front = [c for c in info_cols if c in concat_df.columns] @@ -158,7 +155,7 @@ def compare_data_columns( # ----------------------------- -# Split helper (restored) +# Split helper # ----------------------------- def split_json_by_tp_pp( input_file: str = "benchmark_results.json", output_root: str = "." 
@@ -231,6 +228,7 @@ def _find_concurrency_col(df: pd.DataFrame) -> str: ]: if c in df.columns: return c + # Fallback: guess an integer-like column (harmless if unused) for c in df.columns: if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1: return c @@ -240,9 +238,16 @@ def _find_concurrency_col(df: pd.DataFrame) -> str: def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.style.Styler": """Highlight numeric per-configuration columns with value <= threshold.""" conc_col = _find_concurrency_col(df) - key_cols = [c for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col] if c in df.columns] - conf_cols = [c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")] + key_cols = [ + c + for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col] + if c in df.columns + ] + conf_cols = [ + c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio") + ] conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])] + return df.style.map( lambda v: "background-color:#e6ffe6;font-weight:bold;" if pd.notna(v) and v <= threshold @@ -257,17 +262,20 @@ def highlight_ratio_columns(styler: "pd.io.formats.style.Styler"): if not ratio_cols: return styler - # highlight cells + # Highlight entire column (cells) styler = styler.apply( lambda _: ["background-color: #fff3b0"] * len(styler.data), subset=ratio_cols, axis=0, ) - # highlight headers + # Highlight column headers styler = styler.set_table_styles( [ - {"selector": f"th.col_heading.level0.col{i}", "props": [("background-color", "#fff3b0")]} + { + "selector": f"th.col_heading.level0.col{i}", + "props": [("background-color", "#fff3b0")], + } for i, col in enumerate(styler.data.columns) if col in ratio_cols ], @@ -296,14 +304,17 @@ def _add_limit_line(fig, y_value: float, label: str): x=[None], y=[None], mode="lines", - line=dict(dash="dash", color="red" if "ttft" in label.lower() else "blue"), + line=dict( + 
dash="dash", + color="red" if "ttft" in label.lower() else "blue", + ), name=label, ) ) # ----------------------------- -# Refactored "main" +# Refactored main + group-first report # ----------------------------- @dataclass(frozen=True) class MetricPlan: @@ -343,11 +354,14 @@ def build_parser() -> argparse.ArgumentParser: def choose_metrics(latency: str) -> MetricPlan: latency = (latency or "").lower() drop_column = "P99" + if "median" in latency: return MetricPlan( data_cols=["Output Tput (tok/s)", "Median TTFT (ms)", "Median"], drop_column=drop_column, ) + + # default: p99 return MetricPlan( data_cols=["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"], drop_column=drop_column, @@ -357,11 +371,13 @@ def choose_metrics(latency: str) -> MetricPlan: def prepare_input_files(args, info_cols: List[str]) -> Tuple[List[str], List[str]]: if not args.file: raise ValueError("No input files provided. Use -f/--file.") + if len(args.file) == 1: files = split_json_by_tp_pp(args.file[0], output_root="splits") info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")] else: files = args.file + return files, info_cols @@ -371,6 +387,7 @@ def get_y_axis_col(info_cols: List[str], xaxis: str) -> str: def get_group_cols(output_df: pd.DataFrame, info_cols: List[str]) -> List[str]: + # Your current grouping rule: first 4 info columns filtered_info_cols = info_cols[:4] group_cols = [c for c in filtered_info_cols if c in output_df.columns] if not group_cols: @@ -381,27 +398,38 @@ def get_group_cols(output_df: pd.DataFrame, info_cols: List[str]) -> List[str]: return group_cols -def group_suffix(group_cols: List[str], name) -> str: - name_vals = name if isinstance(name, tuple) else (name,) - return " , ".join(f"{col} : [ {val} ] " for col, val in zip(group_cols, name_vals)) +def normalize_group_key(name): + """Pandas group key can be scalar (1 col) or tuple (N cols). 
Normalize to tuple.""" + return name if isinstance(name, tuple) else (name,) def group_filename(name, prefix: str = "perf_comparison_") -> str: - name_vals = name if isinstance(name, tuple) else (name,) + name_vals = normalize_group_key(name) safe = ",".join(map(str, name_vals)).replace(",", "_").replace("/", "-") return f"{prefix}{safe}.html" -def render_metric_table_html(display_group: pd.DataFrame, metric_label: str, suffix: str, args) -> str: +def build_group_suffix(group_cols: List[str], name) -> str: + name_vals = normalize_group_key(name) + return " , ".join( + f"{col} : [ {val} ] " for col, val in zip(group_cols, name_vals) + ) + + +def render_metric_table_html( + display_group: pd.DataFrame, + metric_label: str, + group_suffix: str, + args, +) -> str: title = ( f'
' f'{_html.escape(metric_label)}' - f' — {_html.escape(suffix)}' + f' — {_html.escape(group_suffix)}' f"
\n" ) metric_name = metric_label.lower() - if "ttft" in metric_name: styler = _highlight_threshold(display_group, args.ttft_max_ms) elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name): @@ -409,7 +437,6 @@ def render_metric_table_html(display_group: pd.DataFrame, metric_label: str, suf else: styler = display_group.style - # format numbers + highlight ratios styler = styler.format( {c: "{:.2f}" for c in display_group.select_dtypes("number").columns}, na_rep="—", @@ -460,41 +487,106 @@ def maybe_write_plot( sub_fh.write(html) -def write_report(files: List[str], info_cols: List[str], plan: MetricPlan, args): +def build_group_keys(df: pd.DataFrame, group_cols: List[str], sort_cols: List[str] | None = None): + """Return a stable list of group keys from df.""" + if sort_cols: + df = df.sort_values(by=sort_cols) + gb = df.groupby(group_cols, dropna=False) + return [k for k, _ in gb] + + +def write_report_group_first(files: List[str], info_cols: List[str], plan: MetricPlan, args): + """ + Group-first layout: + For each group, emit tok/s then TTFT then TPOT (or Median variants) together. 
+ """ name_column = "Test name" y_axis_col = get_y_axis_col(info_cols, args.xaxis) print("comparing : " + ", ".join(files)) + # Precompute per-metric dataframes once + metric_cache: Dict[str, Tuple[pd.DataFrame, List[str]]] = {} + group_cols_canonical: List[str] | None = None + + for metric_label in plan.data_cols: + output_df, raw_data_cols = compare_data_columns( + files, + name_column, + metric_label, + info_cols, + plan.drop_column, + debug=args.debug, + ) + + # plot expects y-axis column at the front + raw_data_cols = list(raw_data_cols) + raw_data_cols.insert(0, y_axis_col) + + group_cols = get_group_cols(output_df, info_cols) + if group_cols_canonical is None: + group_cols_canonical = group_cols + else: + # keep intersection (stable order) + group_cols_canonical = [c for c in group_cols_canonical if c in group_cols] + + metric_cache[metric_label] = (output_df.sort_values(by=args.xaxis), raw_data_cols) + + if not group_cols_canonical: + raise ValueError("No canonical group columns found across metrics.") + + # Canonical group keys from first metric (typically tok/s) + first_metric = plan.data_cols[0] + first_df_sorted, _ = metric_cache[first_metric] + group_keys = build_group_keys(first_df_sorted, group_cols_canonical, sort_cols=[args.xaxis]) + + # Pre-build groupby objects per metric + metric_groupbys = { + metric_label: df.groupby(group_cols_canonical, dropna=False) + for metric_label, (df, _) in metric_cache.items() + } + with open("perf_comparison.html", "w") as main_fh: - for metric_label in plan.data_cols: - output_df, raw_data_cols = compare_data_columns( - files, - name_column, - metric_label, - info_cols, - plan.drop_column, - debug=args.debug, + for gkey in group_keys: + gkey_tuple = normalize_group_key(gkey) + suffix = build_group_suffix(group_cols_canonical, gkey_tuple) + sub_path = group_filename(gkey_tuple) + + # Optional group header (separates each group visually) + group_header = ( + f'
' + f'{_html.escape(suffix)}' + f"
\n" ) - raw_data_cols = list(raw_data_cols) - raw_data_cols.insert(0, y_axis_col) + main_fh.write(group_header) + with open(sub_path, "w") as sub_fh: + sub_fh.write(group_header) - group_cols = get_group_cols(output_df, info_cols) + for metric_label in plan.data_cols: + gb = metric_groupbys[metric_label] + df_sorted, raw_data_cols = metric_cache[metric_label] - output_df_sorted = output_df.sort_values(by=args.xaxis) - for name, group_df in output_df_sorted.groupby(group_cols, dropna=False): - suffix = group_suffix(group_cols, name) - sub_path = group_filename(name) + try: + group_df = gb.get_group(gkey) + except KeyError: + missing = ( + f'
' + f'{_html.escape(metric_label)} — missing for this group' + f"
\n" + ) + main_fh.write(missing) + sub_fh.write(missing) + continue - # drop group columns from display only - display_group = group_df.drop(columns=group_cols, errors="ignore") + # Display-only: drop group columns + display_group = group_df.drop(columns=group_cols_canonical, errors="ignore") - html = render_metric_table_html(display_group, metric_label, suffix, args) + html = render_metric_table_html(display_group, metric_label, suffix, args) - main_fh.write(html) - with open(sub_path, "a+") as sub_fh: + main_fh.write(html) sub_fh.write(html) + maybe_write_plot( main_fh, sub_fh, @@ -513,7 +605,9 @@ def main(): plan = choose_metrics(args.latency) files, info_cols = prepare_input_files(args, info_cols) - write_report(files, info_cols, plan, args) + + # Group-first report layout + write_report_group_first(files, info_cols, plan, args) if __name__ == "__main__": From db9aaa61acf82134784bbabc213fe4e9c113650f Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Sat, 20 Dec 2025 00:00:11 -0800 Subject: [PATCH 07/13] minor function name change Signed-off-by: Tsai, Louie --- .../performance-benchmarks/scripts/compare-json-results.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/performance-benchmarks/scripts/compare-json-results.py b/.buildkite/performance-benchmarks/scripts/compare-json-results.py index ece004107b669..a9b6d256cd0a6 100644 --- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py @@ -446,7 +446,7 @@ def render_metric_table_html( return title + styler.to_html(table_attributes='border="1" class="dataframe"') -def maybe_write_plot( +def write_plot( main_fh, sub_fh, group_df: pd.DataFrame, @@ -587,7 +587,7 @@ def write_report_group_first(files: List[str], info_cols: List[str], plan: Metri main_fh.write(html) sub_fh.write(html) - maybe_write_plot( + write_plot( main_fh, sub_fh, group_df=group_df, From f825a14d56c5b56e783c1d3b5ec28c7e522e4393 
Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Sat, 20 Dec 2025 00:31:25 -0800 Subject: [PATCH 08/13] add sizing table Signed-off-by: Tsai, Louie --- .../scripts/compare-json-results.py | 245 +++++++++++++----- 1 file changed, 187 insertions(+), 58 deletions(-) diff --git a/.buildkite/performance-benchmarks/scripts/compare-json-results.py b/.buildkite/performance-benchmarks/scripts/compare-json-results.py index a9b6d256cd0a6..9d2b212d8b3b0 100644 --- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py @@ -27,6 +27,10 @@ DEFAULT_INFO_COLS = [ "qps", ] +# Safety net: if any DataFrame leaks into to_html(), keep precision at 2. +pd.set_option("display.precision", 2) +pd.set_option("display.float_format", lambda x: f"{x:.2f}") + # ----------------------------- # Core data compare @@ -54,7 +58,6 @@ def compare_data_columns( raw_data_cols: List[str] = [] compare_frames = [] - # 1) choose a canonical key list from info_cols that exists in ALL files cols_per_file: List[set] = [] for f in files: try: @@ -65,24 +68,20 @@ def compare_data_columns( key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)] if not key_cols: - # soft fallback: use any info_cols present in the first file key_cols = [c for c in info_cols if c in list(cols_per_file[0])] if not key_cols: raise ValueError( "No common key columns found from info_cols across the input files." 
) - # 2) build a single "meta" block (keys as columns) once, aligned by the key index meta_added = False for file in files: df = pd.read_json(file, orient="records") - # Keep rows that actually have the compared metric (same as original behavior) if drop_column in df.columns: df = df.dropna(subset=[drop_column], ignore_index=True) - # Stabilize numeric key columns (harmless if missing) for c in ( "Input Len", "Output Len", @@ -94,32 +93,26 @@ def compare_data_columns( if c in df.columns: df[c] = pd.to_numeric(df[c], errors="coerce") - # Ensure all key columns exist for c in key_cols: if c not in df.columns: df[c] = pd.NA - # Set index = key_cols and aggregate duplicates → unique MultiIndex df_idx = df.set_index(key_cols, drop=False) - # meta (key columns), unique per key meta = df_idx[key_cols] if not meta.index.is_unique: meta = meta.groupby(level=key_cols, dropna=False).first() - # metric series for this file, aggregated to one row per key file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file) s = df_idx[data_column] if not s.index.is_unique: s = s.groupby(level=key_cols, dropna=False).mean() - s.name = file_label # column label like original + s.name = file_label - # add meta once (from first file) so keys are the leftmost columns if not meta_added: frames.append(meta) meta_added = True - # debug: aligned test-name column per file if debug and name_column in df_idx.columns: name_s = df_idx[name_column] if not name_s.index.is_unique: @@ -131,21 +124,19 @@ def compare_data_columns( raw_data_cols.append(file_label) compare_frames.append(s) - # ratio columns: fileN / file1 (throughput) or file1 / fileN (latency) if len(compare_frames) >= 2: base = compare_frames[0] current = compare_frames[-1] if "P99" in data_column or "Median" in data_column: - ratio = base / current # for latency: larger means better + ratio = base / current else: - ratio = current / base # for throughput: larger means better + ratio = current / base ratio = ratio.mask(base == 0) 
ratio.name = f"Ratio 1 vs {len(compare_frames)}" frames.append(ratio) concat_df = pd.concat(frames, axis=1).reset_index(drop=True) - # Ensure key/info columns appear first (in your info_cols order) front = [c for c in info_cols if c in concat_df.columns] rest = [c for c in concat_df.columns if c not in front] concat_df = concat_df[front + rest] @@ -160,16 +151,9 @@ def compare_data_columns( def split_json_by_tp_pp( input_file: str = "benchmark_results.json", output_root: str = "." ) -> List[str]: - """ - Split a benchmark JSON into separate folders by (TP Size, PP Size). - - Creates: /tp{TP}_pp{PP}/benchmark_results.json - Returns: list of file paths written. - """ with open(input_file, encoding="utf-8") as f: data = json.load(f) - # If the JSON is a dict with a list under common keys, use that list if isinstance(data, dict): for key in ("results", "serving_results", "benchmarks", "data"): if isinstance(data.get(key), list): @@ -178,14 +162,12 @@ def split_json_by_tp_pp( df = pd.DataFrame(data) - # Keep only "serving" tests name_col = next( (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None ) if name_col: df = df[df[name_col].astype(str).str.contains(r"serving", case=False, na=False)].copy() - # Handle alias column names rename_map = { "tp_size": "TP Size", "tensor_parallel_size": "TP Size", @@ -194,7 +176,6 @@ def split_json_by_tp_pp( } df.rename(columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True) - # Ensure TP/PP columns exist (default to 1 if missing) if "TP Size" not in df.columns: df["TP Size"] = 1 if "PP Size" not in df.columns: @@ -228,7 +209,6 @@ def _find_concurrency_col(df: pd.DataFrame) -> str: ]: if c in df.columns: return c - # Fallback: guess an integer-like column (harmless if unused) for c in df.columns: if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1: return c @@ -236,7 +216,6 @@ def _find_concurrency_col(df: pd.DataFrame) -> str: def _highlight_threshold(df: 
pd.DataFrame, threshold: float) -> "pd.io.formats.style.Styler": - """Highlight numeric per-configuration columns with value <= threshold.""" conc_col = _find_concurrency_col(df) key_cols = [ c @@ -257,19 +236,16 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s def highlight_ratio_columns(styler: "pd.io.formats.style.Styler"): - """Highlight entire columns whose header contains 'Ratio'.""" ratio_cols = [c for c in styler.data.columns if "ratio" in str(c).lower()] if not ratio_cols: return styler - # Highlight entire column (cells) styler = styler.apply( lambda _: ["background-color: #fff3b0"] * len(styler.data), subset=ratio_cols, axis=0, ) - # Highlight column headers styler = styler.set_table_styles( [ { @@ -284,6 +260,152 @@ def highlight_ratio_columns(styler: "pd.io.formats.style.Styler"): return styler +def _apply_two_decimals(styler: "pd.io.formats.style.Styler") -> "pd.io.formats.style.Styler": + df = styler.data + num_cols = df.select_dtypes("number").columns + if len(num_cols) == 0: + return styler + return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="—") + + +# ----------------------------- +# Valid max concurrency summary helpers +# ----------------------------- +def _config_value_columns(df: pd.DataFrame, conc_col: str) -> List[str]: + key_cols = [c for c in ["Model", "Dataset Name", "Input Len", "Output Len"] if c in df.columns] + exclude = set(key_cols + [conc_col, "qps", "QPS"]) + + cols: List[str] = [] + for c in df.columns: + if c in exclude: + continue + lc = str(c).lower() + if lc.startswith("ratio"): + continue + if lc.endswith("_name") or lc == "test name" or lc == "test_name": + continue + if pd.api.types.is_numeric_dtype(df[c]): + cols.append(c) + return cols + + +def _max_concurrency_ok(df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float): + if df is None or conc_col not in df.columns or cfg_col not in df.columns: + return pd.NA + + d = df[[conc_col, cfg_col]].copy() + d[conc_col] = 
pd.to_numeric(d[conc_col], errors="coerce") + d[cfg_col] = pd.to_numeric(d[cfg_col], errors="coerce") + d = d.dropna(subset=[conc_col, cfg_col]) + + if d.empty: + return pd.NA + + ok = d[d[cfg_col] <= threshold] + if ok.empty: + return pd.NA + + return ok[conc_col].max() + + +def _value_at_concurrency(df: pd.DataFrame, conc_col: str, cfg_col: str, conc_value): + if df is None or conc_col not in df.columns or cfg_col not in df.columns or pd.isna(conc_value): + return pd.NA + + d = df[[conc_col, cfg_col]].copy() + d[conc_col] = pd.to_numeric(d[conc_col], errors="coerce") + d[cfg_col] = pd.to_numeric(d[cfg_col], errors="coerce") + + conc_value = pd.to_numeric(conc_value, errors="coerce") + if pd.isna(conc_value): + return pd.NA + + hit = d[d[conc_col] == conc_value] + if hit.empty: + return pd.NA + return hit[cfg_col].iloc[0] + + +def build_valid_max_concurrency_summary_html( + tput_group_df: pd.DataFrame | None, + ttft_group_df: pd.DataFrame | None, + tpot_group_df: pd.DataFrame | None, + conc_col: str, + args, +) -> str: + if ttft_group_df is None and tpot_group_df is None: + return "" + + ttft_cols = _config_value_columns(ttft_group_df, conc_col) if ttft_group_df is not None else [] + tpot_cols = _config_value_columns(tpot_group_df, conc_col) if tpot_group_df is not None else [] + tput_cols = _config_value_columns(tput_group_df, conc_col) if tput_group_df is not None else [] + + if ttft_group_df is not None and tpot_group_df is not None: + cfg_cols = [c for c in ttft_cols if c in tpot_cols] + if tput_group_df is not None: + cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols + else: + cfg_cols = ttft_cols or tpot_cols + + if not cfg_cols: + cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str) + + rows = [] + for cfg in cfg_cols: + ttft_max = _max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms) if ttft_group_df is not None else pd.NA + tpot_max = _max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms) if 
tpot_group_df is not None else pd.NA + both = pd.NA if (pd.isna(ttft_max) or pd.isna(tpot_max)) else min(ttft_max, tpot_max) + + tput_at_both = _value_at_concurrency(tput_group_df, conc_col, cfg, both) if tput_group_df is not None else pd.NA + ttft_at_both = _value_at_concurrency(ttft_group_df, conc_col, cfg, both) if ttft_group_df is not None else pd.NA + tpot_at_both = _value_at_concurrency(tpot_group_df, conc_col, cfg, both) if tpot_group_df is not None else pd.NA + + rows.append( + { + "Configuration": cfg, + f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max, + f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max, + f"Max {conc_col} (Both)": both, + "Output Tput @ Both (tok/s)": tput_at_both, + "TTFT @ Both (ms)": ttft_at_both, + "TPOT @ Both (ms)": tpot_at_both, + } + ) + + summary_df = pd.DataFrame(rows) + + # --- Coerce numeric columns so Styler doesn't miss them due to object dtype --- + for c in summary_df.columns: + if c == "Configuration": + continue + summary_df[c] = pd.to_numeric(summary_df[c], errors="coerce") + + both_col = f"Max {conc_col} (Both)" + + # --- Strict 2-decimal formatting for ALL non-Configuration columns --- + formatters = {} + for c in summary_df.columns: + if c == "Configuration": + continue + # default argument binds per-column formatter correctly + formatters[c] = (lambda v: "—" if pd.isna(v) else f"{float(v):.2f}") + + styler = summary_df.style.format(formatters) + + def _green(v): + return "background-color:#e6ffe6;font-weight:bold;" if pd.notna(v) else "" + + if both_col in summary_df.columns: + styler = styler.map(_green, subset=[both_col]) + + title = ( + f'
' + f'Valid Max Concurrency Summary' + f"
\n" + ) + return title + styler.to_html(table_attributes='border="1" class="dataframe"') + + # ----------------------------- # Plot helper # ----------------------------- @@ -295,7 +417,6 @@ def _add_limit_line(fig, y_value: float, label: str): annotation_text=f"{label}: {y_value} ms", annotation_position="top left", ) - # If plotly is available, add a legend entry if plotly_found: import plotly.graph_objects as go @@ -361,7 +482,6 @@ def choose_metrics(latency: str) -> MetricPlan: drop_column=drop_column, ) - # default: p99 return MetricPlan( data_cols=["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"], drop_column=drop_column, @@ -387,7 +507,6 @@ def get_y_axis_col(info_cols: List[str], xaxis: str) -> str: def get_group_cols(output_df: pd.DataFrame, info_cols: List[str]) -> List[str]: - # Your current grouping rule: first 4 info columns filtered_info_cols = info_cols[:4] group_cols = [c for c in filtered_info_cols if c in output_df.columns] if not group_cols: @@ -399,7 +518,6 @@ def get_group_cols(output_df: pd.DataFrame, info_cols: List[str]) -> List[str]: def normalize_group_key(name): - """Pandas group key can be scalar (1 col) or tuple (N cols). Normalize to tuple.""" return name if isinstance(name, tuple) else (name,) @@ -437,16 +555,13 @@ def render_metric_table_html( else: styler = display_group.style - styler = styler.format( - {c: "{:.2f}" for c in display_group.select_dtypes("number").columns}, - na_rep="—", - ) + styler = _apply_two_decimals(styler) styler = highlight_ratio_columns(styler) return title + styler.to_html(table_attributes='border="1" class="dataframe"') -def write_plot( +def maybe_write_plot( main_fh, sub_fh, group_df: pd.DataFrame, @@ -476,6 +591,10 @@ def write_plot( markers=True, ) + # Ensure plot hover + y tick labels are also 2 decimals. 
+ fig.update_traces(hovertemplate="%{y:.2f}") + fig.update_yaxes(tickformat=".2f") + metric_name = metric_label.lower() if "ttft" in metric_name: _add_limit_line(fig, args.ttft_max_ms, "TTFT limit") @@ -488,7 +607,6 @@ def write_plot( def build_group_keys(df: pd.DataFrame, group_cols: List[str], sort_cols: List[str] | None = None): - """Return a stable list of group keys from df.""" if sort_cols: df = df.sort_values(by=sort_cols) gb = df.groupby(group_cols, dropna=False) @@ -496,16 +614,11 @@ def build_group_keys(df: pd.DataFrame, group_cols: List[str], sort_cols: List[st def write_report_group_first(files: List[str], info_cols: List[str], plan: MetricPlan, args): - """ - Group-first layout: - For each group, emit tok/s then TTFT then TPOT (or Median variants) together. - """ name_column = "Test name" y_axis_col = get_y_axis_col(info_cols, args.xaxis) print("comparing : " + ", ".join(files)) - # Precompute per-metric dataframes once metric_cache: Dict[str, Tuple[pd.DataFrame, List[str]]] = {} group_cols_canonical: List[str] | None = None @@ -519,7 +632,6 @@ def write_report_group_first(files: List[str], info_cols: List[str], plan: Metri debug=args.debug, ) - # plot expects y-axis column at the front raw_data_cols = list(raw_data_cols) raw_data_cols.insert(0, y_axis_col) @@ -527,7 +639,6 @@ def write_report_group_first(files: List[str], info_cols: List[str], plan: Metri if group_cols_canonical is None: group_cols_canonical = group_cols else: - # keep intersection (stable order) group_cols_canonical = [c for c in group_cols_canonical if c in group_cols] metric_cache[metric_label] = (output_df.sort_values(by=args.xaxis), raw_data_cols) @@ -535,12 +646,10 @@ def write_report_group_first(files: List[str], info_cols: List[str], plan: Metri if not group_cols_canonical: raise ValueError("No canonical group columns found across metrics.") - # Canonical group keys from first metric (typically tok/s) first_metric = plan.data_cols[0] first_df_sorted, _ = 
metric_cache[first_metric] group_keys = build_group_keys(first_df_sorted, group_cols_canonical, sort_cols=[args.xaxis]) - # Pre-build groupby objects per metric metric_groupbys = { metric_label: df.groupby(group_cols_canonical, dropna=False) for metric_label, (df, _) in metric_cache.items() @@ -552,7 +661,6 @@ def write_report_group_first(files: List[str], info_cols: List[str], plan: Metri suffix = build_group_suffix(group_cols_canonical, gkey_tuple) sub_path = group_filename(gkey_tuple) - # Optional group header (separates each group visually) group_header = ( f'
' f'{_html.escape(suffix)}' @@ -563,6 +671,11 @@ def write_report_group_first(files: List[str], info_cols: List[str], plan: Metri with open(sub_path, "w") as sub_fh: sub_fh.write(group_header) + tput_group_df = None + ttft_group_df = None + tpot_group_df = None + conc_col = args.xaxis + for metric_label in plan.data_cols: gb = metric_groupbys[metric_label] df_sorted, raw_data_cols = metric_cache[metric_label] @@ -579,15 +692,24 @@ def write_report_group_first(files: List[str], info_cols: List[str], plan: Metri sub_fh.write(missing) continue - # Display-only: drop group columns + if conc_col not in group_df.columns: + conc_col = _find_concurrency_col(group_df) + + mn = metric_label.lower().strip() + if "tok/s" in mn: + tput_group_df = group_df + elif "ttft" in mn: + ttft_group_df = group_df + elif mn in ("p99", "median") or "tpot" in mn: + tpot_group_df = group_df + display_group = group_df.drop(columns=group_cols_canonical, errors="ignore") html = render_metric_table_html(display_group, metric_label, suffix, args) - main_fh.write(html) sub_fh.write(html) - write_plot( + maybe_write_plot( main_fh, sub_fh, group_df=group_df, @@ -597,16 +719,23 @@ def write_report_group_first(files: List[str], info_cols: List[str], plan: Metri args=args, ) + summary_html = build_valid_max_concurrency_summary_html( + tput_group_df=tput_group_df, + ttft_group_df=ttft_group_df, + tpot_group_df=tpot_group_df, + conc_col=conc_col, + args=args, + ) + if summary_html: + main_fh.write(summary_html) + sub_fh.write(summary_html) + def main(): args = build_parser().parse_args() - info_cols = list(DEFAULT_INFO_COLS) plan = choose_metrics(args.latency) - files, info_cols = prepare_input_files(args, info_cols) - - # Group-first report layout write_report_group_first(files, info_cols, plan, args) From 76862427f160fa2d2f22b97e7b6251dc13db206a Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Sun, 21 Dec 2025 15:27:11 -0800 Subject: [PATCH 09/13] pre-commit fix Signed-off-by: Tsai, Louie --- 
.../scripts/compare-json-results.py | 202 ++++++++++++------ 1 file changed, 141 insertions(+), 61 deletions(-) diff --git a/.buildkite/performance-benchmarks/scripts/compare-json-results.py b/.buildkite/performance-benchmarks/scripts/compare-json-results.py index 9d2b212d8b3b0..7ad92c2db40d4 100644 --- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py @@ -9,7 +9,6 @@ import json import os from dataclasses import dataclass from importlib import util -from typing import Dict, List, Tuple import pandas as pd @@ -36,10 +35,10 @@ pd.set_option("display.float_format", lambda x: f"{x:.2f}") # Core data compare # ----------------------------- def compare_data_columns( - files: List[str], + files: list[str], name_column: str, data_column: str, - info_cols: List[str], + info_cols: list[str], drop_column: str, debug: bool = False, ): @@ -55,10 +54,10 @@ def compare_data_columns( print("\ncompare_data_column:", data_column) frames = [] - raw_data_cols: List[str] = [] + raw_data_cols: list[str] = [] compare_frames = [] - cols_per_file: List[set] = [] + cols_per_file: list[set] = [] for f in files: try: df_tmp = pd.read_json(f, orient="records") @@ -150,7 +149,7 @@ def compare_data_columns( # ----------------------------- def split_json_by_tp_pp( input_file: str = "benchmark_results.json", output_root: str = "." 
-) -> List[str]: +) -> list[str]: with open(input_file, encoding="utf-8") as f: data = json.load(f) @@ -166,7 +165,9 @@ def split_json_by_tp_pp( (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None ) if name_col: - df = df[df[name_col].astype(str).str.contains(r"serving", case=False, na=False)].copy() + df = df[ + df[name_col].astype(str).str.contains(r"serving", case=False, na=False) + ].copy() rename_map = { "tp_size": "TP Size", @@ -174,7 +175,9 @@ def split_json_by_tp_pp( "pp_size": "PP Size", "pipeline_parallel_size": "PP Size", } - df.rename(columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True) + df.rename( + columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True + ) if "TP Size" not in df.columns: df["TP Size"] = 1 @@ -184,7 +187,7 @@ def split_json_by_tp_pp( df["TP Size"] = pd.to_numeric(df["TP Size"], errors="coerce").fillna(1).astype(int) df["PP Size"] = pd.to_numeric(df["PP Size"], errors="coerce").fillna(1).astype(int) - saved_paths: List[str] = [] + saved_paths: list[str] = [] for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False): folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}") os.makedirs(folder_name, exist_ok=True) @@ -215,7 +218,9 @@ def _find_concurrency_col(df: pd.DataFrame) -> str: return "# of max concurrency." 
-def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.style.Styler": +def _highlight_threshold( + df: pd.DataFrame, threshold: float +) -> pd.io.formats.style.Styler: conc_col = _find_concurrency_col(df) key_cols = [ c @@ -235,7 +240,7 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s ) -def highlight_ratio_columns(styler: "pd.io.formats.style.Styler"): +def highlight_ratio_columns(styler: pd.io.formats.style.Styler): ratio_cols = [c for c in styler.data.columns if "ratio" in str(c).lower()] if not ratio_cols: return styler @@ -260,7 +265,9 @@ def highlight_ratio_columns(styler: "pd.io.formats.style.Styler"): return styler -def _apply_two_decimals(styler: "pd.io.formats.style.Styler") -> "pd.io.formats.style.Styler": +def _apply_two_decimals( + styler: pd.io.formats.style.Styler, +) -> pd.io.formats.style.Styler: df = styler.data num_cols = df.select_dtypes("number").columns if len(num_cols) == 0: @@ -271,11 +278,15 @@ def _apply_two_decimals(styler: "pd.io.formats.style.Styler") -> "pd.io.formats. 
# ----------------------------- # Valid max concurrency summary helpers # ----------------------------- -def _config_value_columns(df: pd.DataFrame, conc_col: str) -> List[str]: - key_cols = [c for c in ["Model", "Dataset Name", "Input Len", "Output Len"] if c in df.columns] +def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]: + key_cols = [ + c + for c in ["Model", "Dataset Name", "Input Len", "Output Len"] + if c in df.columns + ] exclude = set(key_cols + [conc_col, "qps", "QPS"]) - cols: List[str] = [] + cols: list[str] = [] for c in df.columns: if c in exclude: continue @@ -289,7 +300,9 @@ def _config_value_columns(df: pd.DataFrame, conc_col: str) -> List[str]: return cols -def _max_concurrency_ok(df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float): +def _max_concurrency_ok( + df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float +): if df is None or conc_col not in df.columns or cfg_col not in df.columns: return pd.NA @@ -309,7 +322,12 @@ def _max_concurrency_ok(df: pd.DataFrame, conc_col: str, cfg_col: str, threshold def _value_at_concurrency(df: pd.DataFrame, conc_col: str, cfg_col: str, conc_value): - if df is None or conc_col not in df.columns or cfg_col not in df.columns or pd.isna(conc_value): + if ( + df is None + or conc_col not in df.columns + or cfg_col not in df.columns + or pd.isna(conc_value) + ): return pd.NA d = df[[conc_col, cfg_col]].copy() @@ -336,9 +354,21 @@ def build_valid_max_concurrency_summary_html( if ttft_group_df is None and tpot_group_df is None: return "" - ttft_cols = _config_value_columns(ttft_group_df, conc_col) if ttft_group_df is not None else [] - tpot_cols = _config_value_columns(tpot_group_df, conc_col) if tpot_group_df is not None else [] - tput_cols = _config_value_columns(tput_group_df, conc_col) if tput_group_df is not None else [] + ttft_cols = ( + _config_value_columns(ttft_group_df, conc_col) + if ttft_group_df is not None + else [] + ) + tpot_cols = ( + 
_config_value_columns(tpot_group_df, conc_col) + if tpot_group_df is not None + else [] + ) + tput_cols = ( + _config_value_columns(tput_group_df, conc_col) + if tput_group_df is not None + else [] + ) if ttft_group_df is not None and tpot_group_df is not None: cfg_cols = [c for c in ttft_cols if c in tpot_cols] @@ -352,13 +382,37 @@ def build_valid_max_concurrency_summary_html( rows = [] for cfg in cfg_cols: - ttft_max = _max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms) if ttft_group_df is not None else pd.NA - tpot_max = _max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms) if tpot_group_df is not None else pd.NA - both = pd.NA if (pd.isna(ttft_max) or pd.isna(tpot_max)) else min(ttft_max, tpot_max) + ttft_max = ( + _max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms) + if ttft_group_df is not None + else pd.NA + ) + tpot_max = ( + _max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms) + if tpot_group_df is not None + else pd.NA + ) + both = ( + pd.NA + if (pd.isna(ttft_max) or pd.isna(tpot_max)) + else min(ttft_max, tpot_max) + ) - tput_at_both = _value_at_concurrency(tput_group_df, conc_col, cfg, both) if tput_group_df is not None else pd.NA - ttft_at_both = _value_at_concurrency(ttft_group_df, conc_col, cfg, both) if ttft_group_df is not None else pd.NA - tpot_at_both = _value_at_concurrency(tpot_group_df, conc_col, cfg, both) if tpot_group_df is not None else pd.NA + tput_at_both = ( + _value_at_concurrency(tput_group_df, conc_col, cfg, both) + if tput_group_df is not None + else pd.NA + ) + ttft_at_both = ( + _value_at_concurrency(ttft_group_df, conc_col, cfg, both) + if ttft_group_df is not None + else pd.NA + ) + tpot_at_both = ( + _value_at_concurrency(tpot_group_df, conc_col, cfg, both) + if tpot_group_df is not None + else pd.NA + ) rows.append( { @@ -388,7 +442,7 @@ def build_valid_max_concurrency_summary_html( if c == "Configuration": continue # default argument binds per-column formatter 
correctly - formatters[c] = (lambda v: "—" if pd.isna(v) else f"{float(v):.2f}") + formatters[c] = lambda v: "—" if pd.isna(v) else f"{float(v):.2f}" styler = summary_df.style.format(formatters) @@ -399,9 +453,9 @@ def build_valid_max_concurrency_summary_html( styler = styler.map(_green, subset=[both_col]) title = ( - f'
' - f'Valid Max Concurrency Summary' - f"
\n" + '
' + "Valid Max Concurrency Summary" + "
\n" ) return title + styler.to_html(table_attributes='border="1" class="dataframe"') @@ -439,14 +493,18 @@ def _add_limit_line(fig, y_value: float, label: str): # ----------------------------- @dataclass(frozen=True) class MetricPlan: - data_cols: List[str] + data_cols: list[str] drop_column: str def build_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser() - parser.add_argument("-f", "--file", action="append", type=str, help="input file name") - parser.add_argument("--debug", action="store_true", help="show all information for debugging") + parser.add_argument( + "-f", "--file", action="append", type=str, help="input file name" + ) + parser.add_argument( + "--debug", action="store_true", help="show all information for debugging" + ) parser.add_argument( "--plot", action=argparse.BooleanOptionalAction, @@ -467,8 +525,18 @@ def build_parser() -> argparse.ArgumentParser: default="p99", help="take median|p99 for latency like TTFT/TPOT", ) - parser.add_argument("--ttft-max-ms", type=float, default=3000.0, help="Reference limit for TTFT plots (ms)") - parser.add_argument("--tpot-max-ms", type=float, default=100.0, help="Reference limit for TPOT plots (ms)") + parser.add_argument( + "--ttft-max-ms", + type=float, + default=3000.0, + help="Reference limit for TTFT plots (ms)", + ) + parser.add_argument( + "--tpot-max-ms", + type=float, + default=100.0, + help="Reference limit for TPOT plots (ms)", + ) return parser @@ -488,7 +556,7 @@ def choose_metrics(latency: str) -> MetricPlan: ) -def prepare_input_files(args, info_cols: List[str]) -> Tuple[List[str], List[str]]: +def prepare_input_files(args, info_cols: list[str]) -> tuple[list[str], list[str]]: if not args.file: raise ValueError("No input files provided. 
Use -f/--file.") @@ -501,12 +569,12 @@ def prepare_input_files(args, info_cols: List[str]) -> Tuple[List[str], List[str return files, info_cols -def get_y_axis_col(info_cols: List[str], xaxis: str) -> str: +def get_y_axis_col(info_cols: list[str], xaxis: str) -> str: y_axis_index = info_cols.index(xaxis) if xaxis in info_cols else 6 return info_cols[y_axis_index] -def get_group_cols(output_df: pd.DataFrame, info_cols: List[str]) -> List[str]: +def get_group_cols(output_df: pd.DataFrame, info_cols: list[str]) -> list[str]: filtered_info_cols = info_cols[:4] group_cols = [c for c in filtered_info_cols if c in output_df.columns] if not group_cols: @@ -527,11 +595,9 @@ def group_filename(name, prefix: str = "perf_comparison_") -> str: return f"{prefix}{safe}.html" -def build_group_suffix(group_cols: List[str], name) -> str: +def build_group_suffix(group_cols: list[str], name) -> str: name_vals = normalize_group_key(name) - return " , ".join( - f"{col} : [ {val} ] " for col, val in zip(group_cols, name_vals) - ) + return " , ".join(f"{col} : [ {val} ] " for col, val in zip(group_cols, name_vals)) def render_metric_table_html( @@ -542,8 +608,8 @@ def render_metric_table_html( ) -> str: title = ( f'
' - f'{_html.escape(metric_label)}' - f' — {_html.escape(group_suffix)}' + f"{_html.escape(metric_label)}" + f" — {_html.escape(group_suffix)}" f"
\n" ) @@ -565,7 +631,7 @@ def maybe_write_plot( main_fh, sub_fh, group_df: pd.DataFrame, - raw_data_cols: List[str], + raw_data_cols: list[str], metric_label: str, y_axis_col: str, args, @@ -606,21 +672,25 @@ def maybe_write_plot( sub_fh.write(html) -def build_group_keys(df: pd.DataFrame, group_cols: List[str], sort_cols: List[str] | None = None): +def build_group_keys( + df: pd.DataFrame, group_cols: list[str], sort_cols: list[str] | None = None +): if sort_cols: df = df.sort_values(by=sort_cols) gb = df.groupby(group_cols, dropna=False) return [k for k, _ in gb] -def write_report_group_first(files: List[str], info_cols: List[str], plan: MetricPlan, args): +def write_report_group_first( + files: list[str], info_cols: list[str], plan: MetricPlan, args +): name_column = "Test name" y_axis_col = get_y_axis_col(info_cols, args.xaxis) print("comparing : " + ", ".join(files)) - metric_cache: Dict[str, Tuple[pd.DataFrame, List[str]]] = {} - group_cols_canonical: List[str] | None = None + metric_cache: dict[str, tuple[pd.DataFrame, list[str]]] = {} + group_cols_canonical: list[str] | None = None for metric_label in plan.data_cols: output_df, raw_data_cols = compare_data_columns( @@ -641,14 +711,19 @@ def write_report_group_first(files: List[str], info_cols: List[str], plan: Metri else: group_cols_canonical = [c for c in group_cols_canonical if c in group_cols] - metric_cache[metric_label] = (output_df.sort_values(by=args.xaxis), raw_data_cols) + metric_cache[metric_label] = ( + output_df.sort_values(by=args.xaxis), + raw_data_cols, + ) if not group_cols_canonical: raise ValueError("No canonical group columns found across metrics.") first_metric = plan.data_cols[0] first_df_sorted, _ = metric_cache[first_metric] - group_keys = build_group_keys(first_df_sorted, group_cols_canonical, sort_cols=[args.xaxis]) + group_keys = build_group_keys( + first_df_sorted, group_cols_canonical, sort_cols=[args.xaxis] + ) metric_groupbys = { metric_label: df.groupby(group_cols_canonical, 
dropna=False) @@ -660,11 +735,11 @@ def write_report_group_first(files: List[str], info_cols: List[str], plan: Metri gkey_tuple = normalize_group_key(gkey) suffix = build_group_suffix(group_cols_canonical, gkey_tuple) sub_path = group_filename(gkey_tuple) - group_header = ( - f'
' - f'{_html.escape(suffix)}' - f"
\n" + '
' + f"{_html.escape(suffix)}" + "
\n" ) main_fh.write(group_header) @@ -684,10 +759,12 @@ def write_report_group_first(files: List[str], info_cols: List[str], plan: Metri group_df = gb.get_group(gkey) except KeyError: missing = ( - f'
' - f'{_html.escape(metric_label)} — missing for this group' - f"
\n" + '
' + f"{_html.escape(metric_label)} — missing for this group" + "
\n" ) + main_fh.write(missing) sub_fh.write(missing) continue @@ -703,9 +780,13 @@ def write_report_group_first(files: List[str], info_cols: List[str], plan: Metri elif mn in ("p99", "median") or "tpot" in mn: tpot_group_df = group_df - display_group = group_df.drop(columns=group_cols_canonical, errors="ignore") + display_group = group_df.drop( + columns=group_cols_canonical, errors="ignore" + ) - html = render_metric_table_html(display_group, metric_label, suffix, args) + html = render_metric_table_html( + display_group, metric_label, suffix, args + ) main_fh.write(html) sub_fh.write(html) @@ -741,4 +822,3 @@ def main(): if __name__ == "__main__": main() - From 898e868d28acfba97dc808ef6d2e9e8bce522449 Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Mon, 22 Dec 2025 00:02:42 -0800 Subject: [PATCH 10/13] fix a multiple TP/PP size comparison issue in a table Signed-off-by: Tsai, Louie --- .../scripts/compare-json-results.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.buildkite/performance-benchmarks/scripts/compare-json-results.py b/.buildkite/performance-benchmarks/scripts/compare-json-results.py index 7ad92c2db40d4..b3d0a2d3bbce0 100644 --- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py @@ -20,8 +20,8 @@ DEFAULT_INFO_COLS = [ "Dataset Name", "Input Len", "Output Len", - "TP Size", - "PP Size", + # "TP Size", + # "PP Size", "# of max concurrency.", "qps", ] @@ -272,7 +272,7 @@ def _apply_two_decimals( num_cols = df.select_dtypes("number").columns if len(num_cols) == 0: return styler - return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="—") + return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="") # ----------------------------- @@ -442,7 +442,7 @@ def build_valid_max_concurrency_summary_html( if c == "Configuration": continue # default argument binds per-column formatter correctly - formatters[c] = lambda v: "—" if 
pd.isna(v) else f"{float(v):.2f}" + formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}" styler = summary_df.style.format(formatters) @@ -730,7 +730,8 @@ def write_report_group_first( for metric_label, (df, _) in metric_cache.items() } - with open("perf_comparison.html", "w") as main_fh: + with open("perf_comparison.html", "w", encoding="utf-8") as main_fh: + main_fh.write('\n') for gkey in group_keys: gkey_tuple = normalize_group_key(gkey) suffix = build_group_suffix(group_cols_canonical, gkey_tuple) @@ -743,9 +744,9 @@ def write_report_group_first( ) main_fh.write(group_header) - with open(sub_path, "w") as sub_fh: + with open(sub_path, "w", encoding="utf-8") as sub_fh: + sub_fh.write('\n') sub_fh.write(group_header) - tput_group_df = None ttft_group_df = None tpot_group_df = None From b00fd3592ebc88af64e1f2d8791ef75671e603ee Mon Sep 17 00:00:00 2001 From: Louie Tsai Date: Tue, 23 Dec 2025 22:57:57 -0800 Subject: [PATCH 11/13] Update dashboard.md for perf_comparison.html report update Signed-off-by: Tsai, Louie --- .buildkite/performance-benchmarks/README.md | 43 ++++++++++++++---- docs/benchmarking/dashboard.md | 50 ++++++++++++++++++++- 2 files changed, 84 insertions(+), 9 deletions(-) diff --git a/.buildkite/performance-benchmarks/README.md b/.buildkite/performance-benchmarks/README.md index 64a262c6cb401..d7dac5e16810d 100644 --- a/.buildkite/performance-benchmarks/README.md +++ b/.buildkite/performance-benchmarks/README.md @@ -176,19 +176,46 @@ If you do not see the table, please wait till the benchmark finish running. The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file. The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking. +#### Performance Results Comparison The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`. 
When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`. `compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT. If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead. -Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps. + +Here is an example using the script to compare result_a and result_b with max concurrency and qps for same Model, Dataset name, input/output length. `python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json` -| | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio | -|----|---------------------------------------|--------|-----|-----|------|-----|-----------|----------|----------| -| 0 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982 | 156.526018 | 1.097396 | -| 1 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf| 241.620334 | 294.018783 | 1.216863 | +***Output Tput (tok/s) — Model : [ meta-llama/Llama-3.1-8B-Instruct ] , Dataset Name : [ random ] , Input Len : [ 2048.0 ] , Output Len : [ 2048.0 ]*** +| | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio | +|----|------|-----|-----------|----------|----------| +| 0 | 12 | inf | 24.98 | 186.03 | 7.45 | +| 1 | 16 | inf| 25.49 | 246.92 | 9.69 | +| 2 | 24 | inf| 27.74 | 293.34 | 10.57 | +| 3 | 32 | inf| 28.61 |306.69 | 10.72 | + + +***compare-json-results.py – Command-Line Parameters*** +compare-json-results.py provides configurable parameters to compare one or 
more benchmark_results.json files and generate summary tables and plots. +In most cases, users only need to specify --file to parse the desired benchmark results. +| Parameter | Type | Default Value | Description | +| ---------------------- | ------------------ | ----------------------- | ----------------------------------------------------------------------------------------------------- | +| `--file` | `str` (appendable) | *None* | Input JSON result file(s). Can be specified multiple times to compare multiple benchmark outputs. | +| `--debug` | `bool` | `False` | Enables debug mode. When set, prints all available information to aid troubleshooting and validation. | +| `--plot` / `--no-plot` | `bool` | `True` | Controls whether performance plots are generated. Use `--no-plot` to disable graph generation. | +| `--xaxis` | `str` | `# of max concurrency.` | Column name used as the X-axis in comparison plots (for example, concurrency or batch size). | +| `--latency` | `str` | `p99` | Latency aggregation method used for TTFT/TPOT. Supported values: `median` or `p99`. | +| `--ttft-max-ms` | `float` | `3000.0` | Reference upper bound (milliseconds) for TTFT plots, typically used to visualize SLA thresholds. | +| `--tpot-max-ms` | `float` | `100.0` | Reference upper bound (milliseconds) for TPOT plots, typically used to visualize SLA thresholds. | + + +***Valid Max Concurrency Summary*** +Based on the configured TTFT and TPOT SLA thresholds, compare-json-results.py computes the maximum valid concurrency for each benchmark result. +The “Max # of max concurrency. (Both)” column represents the highest concurrency level that satisfies both TTFT and TPOT constraints simultaneously. +This value is typically used in capacity planning and sizing guides. +| # | Configuration | Max # of max concurrency. (TTFT ≤ 10000 ms) | Max # of max concurrency. (TPOT ≤ 100 ms) | Max # of max concurrency. 
(Both) | Output Tput @ Both (tok/s) | TTFT @ Both (ms) | TPOT @ Both (ms) | +| - | -------------- | ------------------------------------------- | ----------------------------------------- | -------------------------------- | -------------------------- | ---------------- | ---------------- | +| 1 | results-a | 128.00 | 12.00 | 12.00 | 127.76 | 3000.82 | 93.24 | +| 2 | results-b | 128.00 | 32.00 | 32.00 | 371.42 | 2261.53 | 81.74 | + -A comparison diagram will be generated below the table. -Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3 -image diff --git a/docs/benchmarking/dashboard.md b/docs/benchmarking/dashboard.md index 4cbc1a6a0a4fb..b328d1884ad6d 100644 --- a/docs/benchmarking/dashboard.md +++ b/docs/benchmarking/dashboard.md @@ -40,7 +40,55 @@ When run, benchmark script generates results under **benchmark/results** folder, - `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string. - `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string. -For more results visualization, check the [visualizing the results](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md#visualizing-the-results). +### Visualization + +The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table with real benchmarking results. +You can find the result presented as a table inside the `buildkite/performance-benchmark` job page. +If you do not see the table, please wait till the benchmark finish running. +The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file. +The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking. 
+ +#### Performance Results Comparison +The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`. +When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`. +`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT. +If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead. + +Here is an example using the script to compare result_a and result_b with max concurrency and qps for same Model, Dataset name, input/output length. +`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json` + +***Output Tput (tok/s) — Model : [ meta-llama/Llama-3.1-8B-Instruct ] , Dataset Name : [ random ] , Input Len : [ 2048.0 ] , Output Len : [ 2048.0 ]*** +| | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio | +|----|------|-----|-----------|----------|----------| +| 0 | 12 | inf | 24.98 | 186.03 | 7.45 | +| 1 | 16 | inf| 25.49 | 246.92 | 9.69 | +| 2 | 24 | inf| 27.74 | 293.34 | 10.57 | +| 3 | 32 | inf| 28.61 |306.69 | 10.72 | + +***compare-json-results.py – Command-Line Parameters*** +compare-json-results.py provides configurable parameters to compare one or more benchmark_results.json files and generate summary tables and plots. +In most cases, users only need to specify --file to parse the desired benchmark results. +| Parameter | Type | Default Value | Description | +| ---------------------- | ------------------ | ----------------------- | ----------------------------------------------------------------------------------------------------- | +| `--file` | `str` (appendable) | *None* | Input JSON result file(s). 
Can be specified multiple times to compare multiple benchmark outputs. | +| `--debug` | `bool` | `False` | Enables debug mode. When set, prints all available information to aid troubleshooting and validation. | +| `--plot` / `--no-plot` | `bool` | `True` | Controls whether performance plots are generated. Use `--no-plot` to disable graph generation. | +| `--xaxis` | `str` | `# of max concurrency.` | Column name used as the X-axis in comparison plots (for example, concurrency or batch size). | +| `--latency` | `str` | `p99` | Latency aggregation method used for TTFT/TPOT. Supported values: `median` or `p99`. | +| `--ttft-max-ms` | `float` | `3000.0` | Reference upper bound (milliseconds) for TTFT plots, typically used to visualize SLA thresholds. | +| `--tpot-max-ms` | `float` | `100.0` | Reference upper bound (milliseconds) for TPOT plots, typically used to visualize SLA thresholds. | + + +***Valid Max Concurrency Summary*** +Based on the configured TTFT and TPOT SLA thresholds, compare-json-results.py computes the maximum valid concurrency for each benchmark result. +The “Max # of max concurrency. (Both)” column represents the highest concurrency level that satisfies both TTFT and TPOT constraints simultaneously. +This value is typically used in capacity planning and sizing guides. +| # | Configuration | Max # of max concurrency. (TTFT ≤ 10000 ms) | Max # of max concurrency. (TPOT ≤ 100 ms) | Max # of max concurrency. 
(Both) | Output Tput @ Both (tok/s) | TTFT @ Both (ms) | TPOT @ Both (ms) | +| - | -------------- | ------------------------------------------- | ----------------------------------------- | -------------------------------- | -------------------------- | ---------------- | ---------------- | +| 0 | results-a | 128.00 | 12.00 | 12.00 | 127.76 | 3000.82 | 93.24 | +| 1 | results-b | 128.00 | 32.00 | 32.00 | 371.42 | 2261.53 | 81.74 | + + More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md). From ff80f1427a88020723b772d998431cd66b469c2b Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Wed, 24 Dec 2025 10:44:05 -0800 Subject: [PATCH 12/13] remove enforce-eager according to feedback. Signed-off-by: Tsai, Louie --- .../tests/serving-tests-cpu.json | 54 +++++++------------ 1 file changed, 18 insertions(+), 36 deletions(-) diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json index 1b031a2717610..25ed7415ec0e4 100644 --- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json @@ -33,8 +33,7 @@ { "test_name": "serving_llama8B_tp1_sharegpt", "server_parameters": { - "tensor_parallel_size": 1, - "enforce_eager": "" + "tensor_parallel_size": 1 }, "client_parameters": { "dataset_name": "sharegpt", @@ -44,8 +43,7 @@ { "test_name": "serving_llama8B_tp2_sharegpt", "server_parameters": { - "tensor_parallel_size": 2, - "enforce_eager": "" + "tensor_parallel_size": 2 }, "client_parameters": { "dataset_name": "sharegpt", @@ -55,8 +53,7 @@ { "test_name": "serving_llama8B_tp1_random_128_128", "server_parameters": { - "tensor_parallel_size": 1, - 
"enforce_eager": "" + "tensor_parallel_size": 1 }, "client_parameters": { "dataset_name": "random", @@ -67,8 +64,7 @@ { "test_name": "serving_llama8B_tp2_random_128_128", "server_parameters": { - "tensor_parallel_size": 2, - "enforce_eager": "" + "tensor_parallel_size": 2 }, "client_parameters": { "dataset_name": "random", @@ -79,8 +75,7 @@ { "test_name": "serving_llama8B_tp4_random_128_128", "server_parameters": { - "tensor_parallel_size": 4, - "enforce_eager": "" + "tensor_parallel_size": 4 }, "client_parameters": { "dataset_name": "random", @@ -91,8 +86,7 @@ { "test_name": "serving_llama8B_tp1_random_128_2048", "server_parameters": { - "tensor_parallel_size": 1, - "enforce_eager": "" + "tensor_parallel_size": 1 }, "client_parameters": { "dataset_name": "random", @@ -103,8 +97,7 @@ { "test_name": "serving_llama8B_tp2_random_128_2048", "server_parameters": { - "tensor_parallel_size": 2, - "enforce_eager": "" + "tensor_parallel_size": 2 }, "client_parameters": { "dataset_name": "random", @@ -115,8 +108,7 @@ { "test_name": "serving_llama8B_tp4_random_128_2048", "server_parameters": { - "tensor_parallel_size": 4, - "enforce_eager": "" + "tensor_parallel_size": 4 }, "client_parameters": { "dataset_name": "random", @@ -127,8 +119,7 @@ { "test_name": "serving_llama8B_tp1_random_2048_128", "server_parameters": { - "tensor_parallel_size": 1, - "enforce_eager": "" + "tensor_parallel_size": 1 }, "client_parameters": { "dataset_name": "random", @@ -139,8 +130,7 @@ { "test_name": "serving_llama8B_tp2_random_2048_128", "server_parameters": { - "tensor_parallel_size": 2, - "enforce_eager": "" + "tensor_parallel_size": 2 }, "client_parameters": { "dataset_name": "random", @@ -151,8 +141,7 @@ { "test_name": "serving_llama8B_tp4_random_2048_128", "server_parameters": { - "tensor_parallel_size": 4, - "enforce_eager": "" + "tensor_parallel_size": 4 }, "client_parameters": { "dataset_name": "random", @@ -203,8 +192,7 @@ "test_name": "serving_llama3B_tp1_random_128_128", 
"server_parameters": { "model": "meta-llama/Llama-3.2-3B-Instruct", - "tensor_parallel_size": 1, - "enforce_eager": "" + "tensor_parallel_size": 1 }, "client_parameters": { "model": "meta-llama/Llama-3.2-3B-Instruct", @@ -217,8 +205,7 @@ "test_name": "serving_granite2B_tp1_random_128_128", "server_parameters": { "model": "ibm-granite/granite-3.2-2b-instruct", - "tensor_parallel_size": 1, - "enforce_eager": "" + "tensor_parallel_size": 1 }, "client_parameters": { "model": "ibm-granite/granite-3.2-2b-instruct", @@ -231,8 +218,7 @@ "test_name": "serving_qwen1.7B_tp1_random_128_128", "server_parameters": { "model": "Qwen/Qwen3-1.7B", - "tensor_parallel_size": 1, - "enforce_eager": "" + "tensor_parallel_size": 1 }, "client_parameters": { "model": "Qwen/Qwen3-1.7B", @@ -245,8 +231,7 @@ "test_name": "serving_qwen4B_tp1_random_128_128", "server_parameters": { "model": "Qwen/Qwen3-4B", - "tensor_parallel_size": 1, - "enforce_eager": "" + "tensor_parallel_size": 1 }, "client_parameters": { "model": "Qwen/Qwen3-4B", @@ -259,8 +244,7 @@ "test_name": "serving_qwen8B_tp1_random_128_128", "server_parameters": { "model": "Qwen/Qwen3-8B", - "tensor_parallel_size": 1, - "enforce_eager": "" + "tensor_parallel_size": 1 }, "client_parameters": { "model": "Qwen/Qwen3-8B", @@ -273,8 +257,7 @@ "test_name": "serving_glm9B_tp1_random_128_128", "server_parameters": { "model": "zai-org/glm-4-9b-hf", - "tensor_parallel_size": 1, - "enforce_eager": "" + "tensor_parallel_size": 1 }, "client_parameters": { "model": "zai-org/glm-4-9b-hf", @@ -287,8 +270,7 @@ "test_name": "serving_gemma7B_tp1_random_128_128", "server_parameters": { "model": "google/gemma-7b", - "tensor_parallel_size": 1, - "enforce_eager": "" + "tensor_parallel_size": 1 }, "client_parameters": { "model": "google/gemma-7b", From e41c10d5cffe898e83a47fdce9032aa33969bda3 Mon Sep 17 00:00:00 2001 From: Louie Tsai Date: Wed, 24 Dec 2025 10:55:50 -0800 Subject: [PATCH 13/13] Update dashboard.md and Update README.md to remove duplicated 
section Signed-off-by: Tsai, Louie --- .buildkite/performance-benchmarks/README.md | 42 +-------------------- docs/benchmarking/dashboard.md | 17 +++++---- 2 files changed, 11 insertions(+), 48 deletions(-) diff --git a/.buildkite/performance-benchmarks/README.md b/.buildkite/performance-benchmarks/README.md index d7dac5e16810d..289877e504bbd 100644 --- a/.buildkite/performance-benchmarks/README.md +++ b/.buildkite/performance-benchmarks/README.md @@ -177,45 +177,5 @@ The json version of the table (together with the json version of the benchmark) The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking. #### Performance Results Comparison -The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`. -When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`. -`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT. -If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead. - - -Here is an example using the script to compare result_a and result_b with max concurrency and qps for same Model, Dataset name, input/output length. 
-`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json` - -***Output Tput (tok/s) — Model : [ meta-llama/Llama-3.1-8B-Instruct ] , Dataset Name : [ random ] , Input Len : [ 2048.0 ] , Output Len : [ 2048.0 ]*** -| | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio | -|----|------|-----|-----------|----------|----------| -| 0 | 12 | inf | 24.98 | 186.03 | 7.45 | -| 1 | 16 | inf| 25.49 | 246.92 | 9.69 | -| 2 | 24 | inf| 27.74 | 293.34 | 10.57 | -| 3 | 32 | inf| 28.61 |306.69 | 10.72 | - - -***compare-json-results.py – Command-Line Parameters*** -compare-json-results.py provides configurable parameters to compare one or more benchmark_results.json files and generate summary tables and plots. -In most cases, users only need to specify --file to parse the desired benchmark results. -| Parameter | Type | Default Value | Description | -| ---------------------- | ------------------ | ----------------------- | ----------------------------------------------------------------------------------------------------- | -| `--file` | `str` (appendable) | *None* | Input JSON result file(s). Can be specified multiple times to compare multiple benchmark outputs. | -| `--debug` | `bool` | `False` | Enables debug mode. When set, prints all available information to aid troubleshooting and validation. | -| `--plot` / `--no-plot` | `bool` | `True` | Controls whether performance plots are generated. Use `--no-plot` to disable graph generation. | -| `--xaxis` | `str` | `# of max concurrency.` | Column name used as the X-axis in comparison plots (for example, concurrency or batch size). | -| `--latency` | `str` | `p99` | Latency aggregation method used for TTFT/TPOT. Supported values: `median` or `p99`. | -| `--ttft-max-ms` | `float` | `3000.0` | Reference upper bound (milliseconds) for TTFT plots, typically used to visualize SLA thresholds. 
| -| `--tpot-max-ms` | `float` | `100.0` | Reference upper bound (milliseconds) for TPOT plots, typically used to visualize SLA thresholds. | - - -***Valid Max Concurrency Summary*** -Based on the configured TTFT and TPOT SLA thresholds, compare-json-results.py computes the maximum valid concurrency for each benchmark result. -The “Max # of max concurrency. (Both)” column represents the highest concurrency level that satisfies both TTFT and TPOT constraints simultaneously. -This value is typically used in capacity planning and sizing guides. -| # | Configuration | Max # of max concurrency. (TTFT ≤ 10000 ms) | Max # of max concurrency. (TPOT ≤ 100 ms) | Max # of max concurrency. (Both) | Output Tput @ Both (tok/s) | TTFT @ Both (ms) | TPOT @ Both (ms) | -| - | -------------- | ------------------------------------------- | ----------------------------------------- | -------------------------------- | -------------------------- | ---------------- | ---------------- | -| 1 | results-a | 128.00 | 12.00 | 12.00 | 127.76 | 3000.82 | 93.24 | -| 2 | results-b | 128.00 | 32.00 | 32.00 | 371.42 | 2261.53 | 81.74 | - +Follow the instructions in [performance results comparison](https://docs.vllm.ai/en/latest/benchmarking/dashboard/#performance-results-comparison) to analyze performance results and the sizing guide. diff --git a/docs/benchmarking/dashboard.md b/docs/benchmarking/dashboard.md index b328d1884ad6d..701fb16ae2cf1 100644 --- a/docs/benchmarking/dashboard.md +++ b/docs/benchmarking/dashboard.md @@ -49,6 +49,7 @@ The json version of the table (together with the json version of the benchmark) The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking. #### Performance Results Comparison + The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`. 
When run, the benchmark script generates results under the `benchmark/results` folder, along with `benchmark_results.md` and `benchmark_results.json`. `compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratios, e.g. for Output Tput, Median TTFT and Median TPOT. @@ -58,16 +59,19 @@ Here is an example using the script to compare result_a and result_b with max co `python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json` ***Output Tput (tok/s) — Model : [ meta-llama/Llama-3.1-8B-Instruct ] , Dataset Name : [ random ] , Input Len : [ 2048.0 ] , Output Len : [ 2048.0 ]*** + | | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio | |----|------|-----|-----------|----------|----------| -| 0 | 12 | inf | 24.98 | 186.03 | 7.45 | -| 1 | 16 | inf| 25.49 | 246.92 | 9.69 | -| 2 | 24 | inf| 27.74 | 293.34 | 10.57 | +| 0 | 12 | inf | 24.98 | 186.03 | 7.45 | +| 1 | 16 | inf| 25.49 | 246.92 | 9.69 | +| 2 | 24 | inf| 27.74 | 293.34 | 10.57 | | 3 | 32 | inf| 28.61 |306.69 | 10.72 | ***compare-json-results.py – Command-Line Parameters*** + compare-json-results.py provides configurable parameters to compare one or more benchmark_results.json files and generate summary tables and plots. -In most cases, users only need to specify --file to parse the desired benchmark results. +In most cases, users only need to specify --file to parse the desired benchmark results. + | Parameter | Type | Default Value | Description | | ---------------------- | ------------------ | ----------------------- | ----------------------------------------------------------------------------------------------------- | | `--file` | `str` (appendable) | *None* | Input JSON result file(s). Can be specified multiple times to compare multiple benchmark outputs. 
| @@ -78,18 +82,17 @@ In most cases, users only need to specify --file to parse the desired benchmark | `--ttft-max-ms` | `float` | `3000.0` | Reference upper bound (milliseconds) for TTFT plots, typically used to visualize SLA thresholds. | | `--tpot-max-ms` | `float` | `100.0` | Reference upper bound (milliseconds) for TPOT plots, typically used to visualize SLA thresholds. | - ***Valid Max Concurrency Summary*** + Based on the configured TTFT and TPOT SLA thresholds, compare-json-results.py computes the maximum valid concurrency for each benchmark result. The “Max # of max concurrency. (Both)” column represents the highest concurrency level that satisfies both TTFT and TPOT constraints simultaneously. This value is typically used in capacity planning and sizing guides. + | # | Configuration | Max # of max concurrency. (TTFT ≤ 10000 ms) | Max # of max concurrency. (TPOT ≤ 100 ms) | Max # of max concurrency. (Both) | Output Tput @ Both (tok/s) | TTFT @ Both (ms) | TPOT @ Both (ms) | | - | -------------- | ------------------------------------------- | ----------------------------------------- | -------------------------------- | -------------------------- | ---------------- | ---------------- | | 0 | results-a | 128.00 | 12.00 | 12.00 | 127.76 | 3000.82 | 93.24 | | 1 | results-b | 128.00 | 32.00 | 32.00 | 371.42 | 2261.53 | 81.74 | - - More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/vllm-project/vllm/blob/main/.buildkite/performance-benchmarks/README.md) and [performance benchmark description](../../.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md). ## Continuous Benchmarking