Merge e41c10d5cffe898e83a47fdce9032aa33969bda3 into 254f6b986720c92ddf97fbb1a6a6465da8e87e29

Louie Tsai 2025-12-25 08:07:04 +08:00, committed by GitHub
commit 5011eb74da
4 changed files with 666 additions and 222 deletions


@@ -176,19 +176,6 @@ If you do not see the table, please wait until the benchmark finishes running.
The JSON version of the table (together with the JSON version of the benchmark) will also be attached to the markdown file.
The raw benchmarking results (as JSON files) are in the `Artifacts` tab of the benchmarking job.
The `compare-json-results.py` script compares benchmark results JSON files generated by `convert-results-json-to-markdown.py`.
When run, the benchmark script generates results under the `benchmark/results` folder, along with `benchmark_results.md` and `benchmark_results.json`.
`compare-json-results.py` compares two `benchmark_results.json` files and reports performance ratios, e.g. for Output Tput, Median TTFT, and Median TPOT.
If only one `benchmark_results.json` is passed, `compare-json-results.py` instead compares the different TP and PP configurations within that file.
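For the single-file case, a hypothetical invocation is `python3 compare-json-results.py -f results/benchmark_results.json`, which splits the file by TP/PP configuration and compares those splits against each other.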
#### Performance Results Comparison
Here is an example using the script to compare results_a and results_b, keyed by Model, Dataset Name, input/output length, max concurrency, and qps:
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
| | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|----|---------------------------------------|--------|-----|-----|------|-----|-----------|----------|----------|
| 0 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982 | 156.526018 | 1.097396 |
| 1 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf| 241.620334 | 294.018783 | 1.216863 |
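For throughput columns, perf_ratio is the second file divided by the first, e.g. in row 0 above, 156.526018 / 142.633982 ≈ 1.0974, meaning results_b is roughly 10% faster; for latency columns such as TTFT and TPOT the ratio is inverted so that values above 1 still mean the later result is better.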
A comparison diagram will be generated below the table.
Here is an example comparing 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3:
<img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
Follow the instructions in [performance results comparison](https://docs.vllm.ai/en/latest/benchmarking/dashboard/#performance-results-comparison) to analyze the performance results and derive a sizing guide.


@@ -1,8 +1,13 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import argparse
import html as _html
import json
import os
from dataclasses import dataclass
from importlib import util
import pandas as pd
@@ -10,27 +15,49 @@ import pandas as pd
pd.options.display.float_format = "{:.2f}".format
plotly_found = util.find_spec("plotly.express") is not None
DEFAULT_INFO_COLS = [
"Model",
"Dataset Name",
"Input Len",
"Output Len",
# "TP Size",
# "PP Size",
"# of max concurrency.",
"qps",
]
# Safety net: if any DataFrame leaks into to_html(), keep precision at 2.
pd.set_option("display.precision", 2)
pd.set_option("display.float_format", lambda x: f"{x:.2f}")
# -----------------------------
# Core data compare
# -----------------------------
def compare_data_columns(
files: list[str],
name_column: str,
data_column: str,
info_cols: list[str],
drop_column: str,
debug: bool = False,
):
"""
Align concatenation by keys derived from info_cols instead of row order.
- Pick one canonical key list: subset of info_cols present in ALL files.
- For each file: set index to those keys, aggregate duplicates
(mean for metric, first for names).
- Concat along axis=1 (indexes align), then reset_index so callers can
group by columns.
- If --debug, add a <file_label>_name column per file.
"""
print("\ncompare_data_column:", data_column)
frames = []
raw_data_cols: list[str] = []
compare_frames = []
# 1) choose a canonical key list from info_cols that exists in ALL files
cols_per_file: list[set] = []
for f in files:
try:
df_tmp = pd.read_json(f, orient="records")
@@ -40,24 +67,20 @@ def compare_data_columns(
key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
if not key_cols:
# soft fallback: use any info_cols present in the first file
key_cols = [c for c in info_cols if c in list(cols_per_file[0])]
if not key_cols:
raise ValueError(
"No common key columns found from info_cols across the input files."
)
# 2) build a single "meta" block (keys as columns) once, aligned by the key index
meta_added = False
for file in files:
df = pd.read_json(file, orient="records")
# Keep rows that actually have the compared metric (same as original behavior)
if drop_column in df.columns:
df = df.dropna(subset=[drop_column], ignore_index=True)
# Stabilize numeric key columns (harmless if missing)
for c in (
"Input Len",
"Output Len",
@@ -69,32 +92,26 @@ def compare_data_columns(
if c in df.columns:
df[c] = pd.to_numeric(df[c], errors="coerce")
# Ensure all key columns exist
for c in key_cols:
if c not in df.columns:
df[c] = pd.NA
# Set index = key_cols and aggregate duplicates → unique MultiIndex
df_idx = df.set_index(key_cols, drop=False)
# meta (key columns), unique per key
meta = df_idx[key_cols]
if not meta.index.is_unique:
meta = meta.groupby(level=key_cols, dropna=False).first()
# metric series for this file, aggregated to one row per key
file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
s = df_idx[data_column]
if not s.index.is_unique:
s = s.groupby(level=key_cols, dropna=False).mean()
s.name = file_label
# add meta once (from first file) so keys are the leftmost columns
if not meta_added:
frames.append(meta)
meta_added = True
# debug: aligned test-name column per file
if debug and name_column in df_idx.columns:
name_s = df_idx[name_column]
if not name_s.index.is_unique:
@@ -106,26 +123,19 @@ def compare_data_columns(
raw_data_cols.append(file_label)
compare_frames.append(s)
# Generalize ratio: for any file N>=2, add ratio (fileN / file1)
if len(compare_frames) >= 2:
base = compare_frames[0]
current = compare_frames[-1]
if "P99" in data_column or "Median" in data_column:
ratio = base / current # for latency
else:
ratio = current / base
ratio = ratio.mask(base == 0) # avoid inf when baseline is 0
ratio.name = f"Ratio 1 vs {len(compare_frames)}"
frames.append(ratio)
# 4) concat on columns with aligned MultiIndex;
# then reset_index to return keys as columns
concat_df = pd.concat(frames, axis=1).reset_index(drop=True)
# Ensure key/info columns appear first (in your info_cols order)
front = [c for c in info_cols if c in concat_df.columns]
rest = [c for c in concat_df.columns if c not in front]
concat_df = concat_df[front + rest]
@@ -134,20 +144,15 @@ def compare_data_columns(
return concat_df, raw_data_cols
# -----------------------------
# Split helper
# -----------------------------
def split_json_by_tp_pp(
input_file: str = "benchmark_results.json", output_root: str = "."
) -> list[str]:
"""
Split a benchmark JSON into separate folders by (TP Size, PP Size).
Creates: <output_root>/tp{TP}_pp{PP}/benchmark_results.json
Returns: list of file paths written.
"""
# Load JSON data into DataFrame
with open(input_file, encoding="utf-8") as f:
data = json.load(f)
# If the JSON is a dict with a list under common keys, use that list
if isinstance(data, dict):
for key in ("results", "serving_results", "benchmarks", "data"):
if isinstance(data.get(key), list):
@@ -156,7 +161,6 @@ def split_json_by_tp_pp(
df = pd.DataFrame(data)
# Keep only "serving" tests
name_col = next(
(c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
)
@@ -165,7 +169,6 @@ def split_json_by_tp_pp(
df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
].copy()
# Handle alias column names
rename_map = {
"tp_size": "TP Size",
"tensor_parallel_size": "TP Size",
@@ -176,21 +179,14 @@ def split_json_by_tp_pp(
columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True
)
# Ensure TP/PP columns exist (default to 1 if missing)
if "TP Size" not in df.columns:
df["TP Size"] = 1
if "PP Size" not in df.columns:
df["PP Size"] = 1
# make sure TP/PP are numeric ints with no NaN
df["TP Size"] = (
pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int)
)
df["PP Size"] = (
pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int)
)
df["TP Size"] = pd.to_numeric(df["TP Size"], errors="coerce").fillna(1).astype(int)
df["PP Size"] = pd.to_numeric(df["PP Size"], errors="coerce").fillna(1).astype(int)
# Split into separate folders
saved_paths: list[str] = []
for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False):
folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
@@ -203,32 +199,9 @@ def split_json_by_tp_pp(
return saved_paths
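# Example (hypothetical input): a combined benchmark_results.json with serving
# rows for TP1/PP1 and TP2/PP1 is written out (for output_root="splits") as
#   splits/tp1_pp1/benchmark_results.json
#   splits/tp2_pp1/benchmark_results.json
# and the returned list holds those two paths.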
# -----------------------------
# Styling helpers
# -----------------------------
def _find_concurrency_col(df: pd.DataFrame) -> str:
for c in [
"# of max concurrency.",
@@ -239,7 +212,6 @@ def _find_concurrency_col(df: pd.DataFrame) -> str:
]:
if c in df.columns:
return c
# Fallback: guess an integer-like column (harmless if unused)
for c in df.columns:
if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
return c
@@ -248,8 +220,7 @@ def _find_concurrency_col(df: pd.DataFrame) -> str:
def _highlight_threshold(
df: pd.DataFrame, threshold: float
) -> "pd.io.formats.style.Styler":
"""Highlight numeric per-configuration columns with value <= threshold."""
) -> pd.io.formats.style.Styler:
conc_col = _find_concurrency_col(df)
key_cols = [
c
@@ -260,6 +231,7 @@ def _highlight_threshold(
c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")
]
conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
return df.style.map(
lambda v: "background-color:#e6ffe6;font-weight:bold;"
if pd.notna(v) and v <= threshold
@@ -268,7 +240,264 @@ def _highlight_threshold(
)
if __name__ == "__main__":
def highlight_ratio_columns(styler: pd.io.formats.style.Styler):
ratio_cols = [c for c in styler.data.columns if "ratio" in str(c).lower()]
if not ratio_cols:
return styler
styler = styler.apply(
lambda _: ["background-color: #fff3b0"] * len(styler.data),
subset=ratio_cols,
axis=0,
)
styler = styler.set_table_styles(
[
{
"selector": f"th.col_heading.level0.col{i}",
"props": [("background-color", "#fff3b0")],
}
for i, col in enumerate(styler.data.columns)
if col in ratio_cols
],
overwrite=False,
)
return styler
def _apply_two_decimals(
styler: pd.io.formats.style.Styler,
) -> pd.io.formats.style.Styler:
df = styler.data
num_cols = df.select_dtypes("number").columns
if len(num_cols) == 0:
return styler
return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")
# -----------------------------
# Valid max concurrency summary helpers
# -----------------------------
def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:
key_cols = [
c
for c in ["Model", "Dataset Name", "Input Len", "Output Len"]
if c in df.columns
]
exclude = set(key_cols + [conc_col, "qps", "QPS"])
cols: list[str] = []
for c in df.columns:
if c in exclude:
continue
lc = str(c).lower()
if lc.startswith("ratio"):
continue
if lc.endswith("_name") or lc == "test name" or lc == "test_name":
continue
if pd.api.types.is_numeric_dtype(df[c]):
cols.append(c)
return cols
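# Example: given columns ["Model", "# of max concurrency.", "qps",
# "results_a/benchmark_results.json", "results_b/benchmark_results.json",
# "Ratio 1 vs 2"], only the two numeric per-file columns are returned;
# key, qps, and ratio columns are excluded.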
def _max_concurrency_ok(
df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float
):
if df is None or conc_col not in df.columns or cfg_col not in df.columns:
return pd.NA
d = df[[conc_col, cfg_col]].copy()
d[conc_col] = pd.to_numeric(d[conc_col], errors="coerce")
d[cfg_col] = pd.to_numeric(d[cfg_col], errors="coerce")
d = d.dropna(subset=[conc_col, cfg_col])
if d.empty:
return pd.NA
ok = d[d[cfg_col] <= threshold]
if ok.empty:
return pd.NA
return ok[conc_col].max()
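# Example: with threshold=100 ms and per-concurrency TPOT values
# {12: 93.2, 16: 101.5, 24: 110.0}, only concurrency 12 meets the limit,
# so 12 is returned; pd.NA signals that no concurrency qualified.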
def _value_at_concurrency(df: pd.DataFrame, conc_col: str, cfg_col: str, conc_value):
if (
df is None
or conc_col not in df.columns
or cfg_col not in df.columns
or pd.isna(conc_value)
):
return pd.NA
d = df[[conc_col, cfg_col]].copy()
d[conc_col] = pd.to_numeric(d[conc_col], errors="coerce")
d[cfg_col] = pd.to_numeric(d[cfg_col], errors="coerce")
conc_value = pd.to_numeric(conc_value, errors="coerce")
if pd.isna(conc_value):
return pd.NA
hit = d[d[conc_col] == conc_value]
if hit.empty:
return pd.NA
return hit[cfg_col].iloc[0]
def build_valid_max_concurrency_summary_html(
tput_group_df: pd.DataFrame | None,
ttft_group_df: pd.DataFrame | None,
tpot_group_df: pd.DataFrame | None,
conc_col: str,
args,
) -> str:
if ttft_group_df is None and tpot_group_df is None:
return ""
ttft_cols = (
_config_value_columns(ttft_group_df, conc_col)
if ttft_group_df is not None
else []
)
tpot_cols = (
_config_value_columns(tpot_group_df, conc_col)
if tpot_group_df is not None
else []
)
tput_cols = (
_config_value_columns(tput_group_df, conc_col)
if tput_group_df is not None
else []
)
if ttft_group_df is not None and tpot_group_df is not None:
cfg_cols = [c for c in ttft_cols if c in tpot_cols]
if tput_group_df is not None:
cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols
else:
cfg_cols = ttft_cols or tpot_cols
if not cfg_cols:
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
rows = []
for cfg in cfg_cols:
ttft_max = (
_max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
if ttft_group_df is not None
else pd.NA
)
tpot_max = (
_max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
if tpot_group_df is not None
else pd.NA
)
both = (
pd.NA
if (pd.isna(ttft_max) or pd.isna(tpot_max))
else min(ttft_max, tpot_max)
)
tput_at_both = (
_value_at_concurrency(tput_group_df, conc_col, cfg, both)
if tput_group_df is not None
else pd.NA
)
ttft_at_both = (
_value_at_concurrency(ttft_group_df, conc_col, cfg, both)
if ttft_group_df is not None
else pd.NA
)
tpot_at_both = (
_value_at_concurrency(tpot_group_df, conc_col, cfg, both)
if tpot_group_df is not None
else pd.NA
)
rows.append(
{
"Configuration": cfg,
f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
f"Max {conc_col} (Both)": both,
"Output Tput @ Both (tok/s)": tput_at_both,
"TTFT @ Both (ms)": ttft_at_both,
"TPOT @ Both (ms)": tpot_at_both,
}
)
summary_df = pd.DataFrame(rows)
# --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
for c in summary_df.columns:
if c == "Configuration":
continue
summary_df[c] = pd.to_numeric(summary_df[c], errors="coerce")
both_col = f"Max {conc_col} (Both)"
# --- Strict 2-decimal formatting for ALL non-Configuration columns ---
formatters = {}
for c in summary_df.columns:
if c == "Configuration":
continue
# the lambda only uses its value argument and does not capture the loop
# variable, so assigning it per column is safe
formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"
styler = summary_df.style.format(formatters)
def _green(v):
return "background-color:#e6ffe6;font-weight:bold;" if pd.notna(v) else ""
if both_col in summary_df.columns:
styler = styler.map(_green, subset=[both_col])
title = (
'<div style="font-size: 1.15em; font-weight: 700; margin: 12px 0 6px 0;">'
"Valid Max Concurrency Summary"
"</div>\n"
)
return title + styler.to_html(table_attributes='border="1" class="dataframe"')
# -----------------------------
# Plot helper
# -----------------------------
def _add_limit_line(fig, y_value: float, label: str):
# Visible dashed line + annotation
fig.add_hline(
y=y_value,
line_dash="dash",
line_color="red" if "ttft" in label.lower() else "blue",
annotation_text=f"{label}: {y_value} ms",
annotation_position="top left",
)
# Optional: add a legend item (as a transparent helper trace)
if plotly_found:
import plotly.graph_objects as go
fig.add_trace(
go.Scatter(
x=[None],
y=[None],
mode="lines",
line=dict(
dash="dash",
color="red" if "ttft" in label.lower() else "blue",
),
name=label,
)
)
# -----------------------------
# Refactored main + group-first report
# -----------------------------
@dataclass(frozen=True)
class MetricPlan:
data_cols: list[str]
drop_column: str
def build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser()
parser.add_argument(
"-f", "--file", action="append", type=str, help="input file name"
@@ -308,149 +537,289 @@ if __name__ == "__main__":
default=100.0,
help="Reference limit for TPOT plots (ms)",
)
return parser
def choose_metrics(latency: str) -> MetricPlan:
latency = (latency or "").lower()
drop_column = "P99"
name_column = "Test name"
info_cols = [
"Model",
"Dataset Name",
"Input Len",
"Output Len",
"TP Size",
"PP Size",
"# of max concurrency.",
"qps",
]
if "median" in args.latency:
data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
html_msgs_for_data_cols = [
"Compare Output Tokens /n",
"Median TTFT /n",
"Median TPOT /n",
]
drop_column = "P99"
elif "p99" in args.latency:
data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"]
html_msgs_for_data_cols = [
"Compare Output Tokens /n",
"P99 TTFT /n",
"P99 TPOT /n",
]
if "median" in latency:
return MetricPlan(
data_cols=["Output Tput (tok/s)", "Median TTFT (ms)", "Median"],
drop_column=drop_column,
)
return MetricPlan(
data_cols=["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"],
drop_column=drop_column,
)
def prepare_input_files(args, info_cols: list[str]) -> tuple[list[str], list[str]]:
if not args.file:
raise ValueError("No input files provided. Use -f/--file.")
if len(args.file) == 1:
files = split_json_by_tp_pp(args.file[0], output_root="splits")
info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
else:
files = args.file
return files, info_cols
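# Example: passing a single "-f benchmark_results.json" triggers the TP/PP
# split above, so the comparison runs across splits/tp*_pp*/ folders and the
# "TP Size"/"PP Size" key columns are dropped from info_cols.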
def get_y_axis_col(info_cols: list[str], xaxis: str) -> str:
y_axis_index = info_cols.index(xaxis) if xaxis in info_cols else 6
return info_cols[y_axis_index]
def get_group_cols(output_df: pd.DataFrame, info_cols: list[str]) -> list[str]:
filtered_info_cols = info_cols[:4]
group_cols = [c for c in filtered_info_cols if c in output_df.columns]
if not group_cols:
raise ValueError(
f"No valid group-by columns. Expected subset: {filtered_info_cols}, "
f"but DataFrame has: {list(output_df.columns)}"
)
return group_cols
def normalize_group_key(name):
return name if isinstance(name, tuple) else (name,)
def group_filename(name, prefix: str = "perf_comparison_") -> str:
name_vals = normalize_group_key(name)
safe = ",".join(map(str, name_vals)).replace(",", "_").replace("/", "-")
return f"{prefix}{safe}.html"
def build_group_suffix(group_cols: list[str], name) -> str:
name_vals = normalize_group_key(name)
return " , ".join(f"{col} : [ {val} ] " for col, val in zip(group_cols, name_vals))
def render_metric_table_html(
display_group: pd.DataFrame,
metric_label: str,
group_suffix: str,
args,
) -> str:
title = (
f'<div style="font-size: 1.25em; font-weight: 600; margin: 12px 0;">'
f"{_html.escape(metric_label)}"
f"{_html.escape(group_suffix)}"
f"</div>\n"
)
metric_name = metric_label.lower()
if "ttft" in metric_name:
styler = _highlight_threshold(display_group, args.ttft_max_ms)
elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
styler = _highlight_threshold(display_group, args.tpot_max_ms)
else:
styler = display_group.style
styler = _apply_two_decimals(styler)
styler = highlight_ratio_columns(styler)
return title + styler.to_html(table_attributes='border="1" class="dataframe"')
def maybe_write_plot(
main_fh,
sub_fh,
group_df: pd.DataFrame,
raw_data_cols: list[str],
metric_label: str,
y_axis_col: str,
args,
):
if not (args.plot and plotly_found):
return
import plotly.express as px
df = group_df[raw_data_cols].sort_values(by=y_axis_col)
df_melted = df.melt(
id_vars=y_axis_col,
var_name="Configuration",
value_name=metric_label,
)
fig = px.line(
df_melted,
x=y_axis_col,
y=metric_label,
color="Configuration",
title=f"{metric_label} vs {y_axis_col}",
markers=True,
)
# Ensure plot hover + y tick labels are also 2 decimals.
fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>")
fig.update_yaxes(tickformat=".2f")
metric_name = metric_label.lower()
if "ttft" in metric_name:
_add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
_add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
html = fig.to_html(full_html=True, include_plotlyjs="cdn")
main_fh.write(html)
sub_fh.write(html)
def build_group_keys(
df: pd.DataFrame, group_cols: list[str], sort_cols: list[str] | None = None
):
if sort_cols:
df = df.sort_values(by=sort_cols)
gb = df.groupby(group_cols, dropna=False)
return [k for k, _ in gb]
def write_report_group_first(
files: list[str], info_cols: list[str], plan: MetricPlan, args
):
name_column = "Test name"
y_axis_col = get_y_axis_col(info_cols, args.xaxis)
print("comparing : " + ", ".join(files))
metric_cache: dict[str, tuple[pd.DataFrame, list[str]]] = {}
group_cols_canonical: list[str] | None = None
for metric_label in plan.data_cols:
output_df, raw_data_cols = compare_data_columns(
files,
name_column,
metric_label,
info_cols,
plan.drop_column,
debug=args.debug,
)
raw_data_cols = list(raw_data_cols)
raw_data_cols.insert(0, y_axis_col)
group_cols = get_group_cols(output_df, info_cols)
if group_cols_canonical is None:
group_cols_canonical = group_cols
else:
group_cols_canonical = [c for c in group_cols_canonical if c in group_cols]
metric_cache[metric_label] = (
output_df.sort_values(by=args.xaxis),
raw_data_cols,
)
if not group_cols_canonical:
raise ValueError("No canonical group columns found across metrics.")
first_metric = plan.data_cols[0]
first_df_sorted, _ = metric_cache[first_metric]
group_keys = build_group_keys(
first_df_sorted, group_cols_canonical, sort_cols=[args.xaxis]
)
metric_groupbys = {
metric_label: df.groupby(group_cols_canonical, dropna=False)
for metric_label, (df, _) in metric_cache.items()
}
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
main_fh.write('<meta charset="utf-8">\n')
for gkey in group_keys:
gkey_tuple = normalize_group_key(gkey)
suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
sub_path = group_filename(gkey_tuple)
group_header = (
'<div style="font-size: 1.4em; font-weight: 700; '
'margin: 18px 0 10px 0;">'
f"{_html.escape(suffix)}"
"</div>\n"
)
main_fh.write(group_header)
with open(sub_path, "w", encoding="utf-8") as sub_fh:
sub_fh.write('<meta charset="utf-8">\n')
sub_fh.write(group_header)
tput_group_df = None
ttft_group_df = None
tpot_group_df = None
conc_col = args.xaxis
for metric_label in plan.data_cols:
gb = metric_groupbys[metric_label]
df_sorted, raw_data_cols = metric_cache[metric_label]
try:
group_df = gb.get_group(gkey)
except KeyError:
missing = (
'<div style="font-size: 1.1em; font-weight: 600; '
'margin: 10px 0;">'
f"{_html.escape(metric_label)} — missing for this group"
"</div>\n"
)
main_fh.write(missing)
sub_fh.write(missing)
continue
if conc_col not in group_df.columns:
conc_col = _find_concurrency_col(group_df)
mn = metric_label.lower().strip()
if "tok/s" in mn:
tput_group_df = group_df
elif "ttft" in mn:
ttft_group_df = group_df
elif mn in ("p99", "median") or "tpot" in mn:
tpot_group_df = group_df
display_group = group_df.drop(
columns=group_cols_canonical, errors="ignore"
)
html = render_metric_table_html(
display_group, metric_label, suffix, args
)
main_fh.write(html)
sub_fh.write(html)
maybe_write_plot(
main_fh,
sub_fh,
group_df=group_df,
raw_data_cols=raw_data_cols,
metric_label=metric_label,
y_axis_col=y_axis_col,
args=args,
)
summary_html = build_valid_max_concurrency_summary_html(
tput_group_df=tput_group_df,
ttft_group_df=ttft_group_df,
tpot_group_df=tpot_group_df,
conc_col=conc_col,
args=args,
)
if summary_html:
main_fh.write(summary_html)
sub_fh.write(summary_html)
def main():
args = build_parser().parse_args()
info_cols = list(DEFAULT_INFO_COLS)
plan = choose_metrics(args.latency)
files, info_cols = prepare_input_files(args, info_cols)
write_report_group_first(files, info_cols, plan, args)
if __name__ == "__main__":
main()


@@ -19,10 +19,8 @@
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
"max_num_seqs": 256
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
@@ -151,6 +149,45 @@
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {


@@ -40,7 +40,58 @@ When run, the benchmark script generates results under the **benchmark/results** folder,
- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
For more ways to visualize the results, see [visualizing the results](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md#visualizing-the-results).
### Visualization
The `convert-results-json-to-markdown.py` script puts the benchmarking results into a markdown table.
You can find the result presented as a table on the `buildkite/performance-benchmark` job page.
If you do not see the table, please wait until the benchmark finishes running.
The JSON version of the table (together with the JSON version of the benchmark) will also be attached to the markdown file.
The raw benchmarking results (as JSON files) are in the `Artifacts` tab of the benchmarking job.
#### Performance Results Comparison
The `compare-json-results.py` script compares benchmark results JSON files generated by `convert-results-json-to-markdown.py`.
When run, the benchmark script generates results under the `benchmark/results` folder, along with `benchmark_results.md` and `benchmark_results.json`.
`compare-json-results.py` compares two `benchmark_results.json` files and reports performance ratios, e.g. for Output Tput, Median TTFT, and Median TPOT.
If only one `benchmark_results.json` is passed, `compare-json-results.py` instead compares the different TP and PP configurations within that file.
Here is an example using the script to compare results_a and results_b across max concurrency and qps for the same Model, Dataset Name, and input/output length.
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
***Output Tput (tok/s) — Model : [ meta-llama/Llama-3.1-8B-Instruct ] , Dataset Name : [ random ] , Input Len : [ 2048.0 ] , Output Len : [ 2048.0 ]***
| | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|----|------|-----|-----------|----------|----------|
| 0 | 12 | inf | 24.98 | 186.03 | 7.45 |
| 1 | 16 | inf | 25.49 | 246.92 | 9.69 |
| 2 | 24 | inf | 27.74 | 293.34 | 10.57 |
| 3 | 32 | inf | 28.61 | 306.69 | 10.72 |
***compare-json-results.py Command-Line Parameters***
`compare-json-results.py` provides configurable parameters to compare one or more `benchmark_results.json` files and to generate summary tables and plots.
In most cases, users only need to specify `--file` to point at the desired benchmark results.
| Parameter | Type | Default Value | Description |
| ---------------------- | ------------------ | ----------------------- | ----------------------------------------------------------------------------------------------------- |
| `--file` | `str` (appendable) | *None* | Input JSON result file(s). Can be specified multiple times to compare multiple benchmark outputs. |
| `--debug` | `bool` | `False` | Enables debug mode. When set, prints all available information to aid troubleshooting and validation. |
| `--plot` / `--no-plot` | `bool` | `True` | Controls whether performance plots are generated. Use `--no-plot` to disable graph generation. |
| `--xaxis` | `str` | `# of max concurrency.` | Column name used as the X-axis in comparison plots (for example, concurrency or batch size). |
| `--latency` | `str` | `p99` | Latency aggregation method used for TTFT/TPOT. Supported values: `median` or `p99`. |
| `--ttft-max-ms` | `float` | `3000.0` | Reference upper bound (milliseconds) for TTFT plots, typically used to visualize SLA thresholds. |
| `--tpot-max-ms` | `float` | `100.0` | Reference upper bound (milliseconds) for TPOT plots, typically used to visualize SLA thresholds. |
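For example, a hypothetical invocation that compares two runs using median latency and disables plot generation:
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json --latency median --no-plot`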
***Valid Max Concurrency Summary***
Based on the configured TTFT and TPOT SLA thresholds, `compare-json-results.py` computes the maximum valid concurrency for each benchmark result.
The "Max # of max concurrency. (Both)" column is the highest concurrency level that satisfies both the TTFT and TPOT constraints simultaneously.
This value is typically used in capacity planning and sizing guides.
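For instance, in the sample table below, results-a stays within the TTFT limit up to concurrency 128 but within the TPOT limit only up to 12, so its valid max concurrency under both constraints is min(128, 12) = 12.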
| # | Configuration | Max # of max concurrency. (TTFT ≤ 10000 ms) | Max # of max concurrency. (TPOT ≤ 100 ms) | Max # of max concurrency. (Both) | Output Tput @ Both (tok/s) | TTFT @ Both (ms) | TPOT @ Both (ms) |
| - | -------------- | ------------------------------------------- | ----------------------------------------- | -------------------------------- | -------------------------- | ---------------- | ---------------- |
| 0 | results-a | 128.00 | 12.00 | 12.00 | 127.76 | 3000.82 | 93.24 |
| 1 | results-b | 128.00 | 32.00 | 32.00 | 371.42 | 2261.53 | 81.74 |
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md).