add SLA information into comparison graph for vLLM Benchmark Suite (#25525)

Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
Signed-off-by: louie-tsai <louie.tsai@intel.com>
Signed-off-by: Louie Tsai <louie.tsai@intel.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Commit 3b7bdf983b (parent 50b788a17a), committed by GitHub on behalf of Louie Tsai, 2025-10-23 01:04:59 -07:00
7 changed files with 571 additions and 113 deletions

[File 1 of 7 — Python script that compares benchmark result JSONs and renders the comparison graphs; file paths are not preserved in this view]

@@ -7,6 +7,7 @@ from importlib import util
 import pandas as pd
+pd.options.display.float_format = "{:.2f}".format
 plotly_found = util.find_spec("plotly.express") is not None
@@ -109,7 +110,10 @@ def compare_data_columns(
     if len(compare_frames) >= 2:
         base = compare_frames[0]
         current = compare_frames[-1]
-        ratio = current / base
+        if "P99" in data_column or "Median" in data_column:
+            ratio = base / current  # for latency
+        else:
+            ratio = current / base
         ratio = ratio.mask(base == 0)  # avoid inf when baseline is 0
         ratio.name = f"Ratio 1 vs {len(compare_frames)}"
         frames.append(ratio)
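
The ratio column is defined so that values above 1.0 consistently mean the newer run is better: throughput uses current/base, while latency metrics (Median/P99) are inverted to base/current because lower latency is better. A minimal, self-contained sketch of the same convention on toy numbers (values are illustrative):

import pandas as pd

base = pd.Series([100.0, 200.0])     # baseline metric per row
current = pd.Series([120.0, 150.0])  # newer configuration

tput_ratio = current / base                       # throughput: >1 means faster
latency_ratio = (base / current).mask(base == 0)  # latency: >1 means lower latency
print(tput_ratio.tolist(), latency_ratio.tolist())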
@@ -199,6 +203,71 @@ def split_json_by_tp_pp(
     return saved_paths
+
+
+def _add_limit_line(fig, y_value, label):
+    # Visible dashed line + annotation
+    fig.add_hline(
+        y=y_value,
+        line_dash="dash",
+        line_color="red" if "ttft" in label.lower() else "blue",
+        annotation_text=f"{label}: {y_value} ms",
+        annotation_position="top left",
+    )
+    # Optional: add a legend item (as a transparent helper trace)
+    if plot and plotly_found:
+        import plotly.graph_objects as go
+
+        fig.add_trace(
+            go.Scatter(
+                x=[None],
+                y=[None],
+                mode="lines",
+                line=dict(
+                    dash="dash", color="red" if "ttft" in label.lower() else "blue"
+                ),
+                name=f"{label}",
+            )
+        )
+
+
+def _find_concurrency_col(df: pd.DataFrame) -> str:
+    for c in [
+        "# of max concurrency.",
+        "# of max concurrency",
+        "Max Concurrency",
+        "max_concurrency",
+        "Concurrency",
+    ]:
+        if c in df.columns:
+            return c
+    # Fallback: guess an integer-like column (harmless if unused)
+    for c in df.columns:
+        if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
+            return c
+    return "# of max concurrency."
+
+
+def _highlight_threshold(
+    df: pd.DataFrame, threshold: float
+) -> "pd.io.formats.style.Styler":
+    """Highlight numeric per-configuration columns with value <= threshold."""
+    conc_col = _find_concurrency_col(df)
+    key_cols = [
+        c
+        for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col]
+        if c in df.columns
+    ]
+    conf_cols = [
+        c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")
+    ]
+    conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
+    return df.style.map(
+        lambda v: "background-color:#e6ffe6;font-weight:bold;"
+        if pd.notna(v) and v <= threshold
+        else "",
+        subset=conf_cols,
+    )
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
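
A hedged usage sketch of the new styling helper (assumes the functions above are in scope and pandas >= 2.1 for Styler.map; column names and threshold are illustrative):

import pandas as pd

df = pd.DataFrame({
    "Model": ["llama8B", "llama8B"],
    "# of max concurrency.": [32, 64],
    "cfg_tp2": [2500.0, 3500.0],  # e.g. P99 TTFT (ms) for one configuration
})
# Cells at or under 3000 ms get the green/bold style in the exported HTML
styler = _highlight_threshold(df, 3000.0)
html = styler.to_html(table_attributes='border="1" class="dataframe"')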
@@ -220,6 +289,26 @@ if __name__ == "__main__":
         default="# of max concurrency.",
         help="column name to use as X Axis in comparison graph",
     )
+    parser.add_argument(
+        "-l",
+        "--latency",
+        type=str,
+        default="p99",
+        help="take median|p99 for latency like TTFT/TPOT",
+    )
+    parser.add_argument(
+        "--ttft-max-ms",
+        type=float,
+        default=3000.0,
+        help="Reference limit for TTFT plots (ms)",
+    )
+    parser.add_argument(
+        "--tpot-max-ms",
+        type=float,
+        default=100.0,
+        help="Reference limit for TPOT plots (ms)",
+    )
     args = parser.parse_args()

     drop_column = "P99"
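
A minimal, self-contained sketch of the three new flags (mirroring the definitions above; the argv values are illustrative):

import argparse

p = argparse.ArgumentParser()
p.add_argument("-l", "--latency", type=str, default="p99")
p.add_argument("--ttft-max-ms", type=float, default=3000.0)
p.add_argument("--tpot-max-ms", type=float, default=100.0)

args = p.parse_args(["--latency", "median", "--ttft-max-ms", "2000"])
assert args.latency == "median" and args.ttft_max_ms == 2000.0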
@@ -234,12 +323,22 @@ if __name__ == "__main__":
         "# of max concurrency.",
         "qps",
     ]
-    data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
-    html_msgs_for_data_cols = [
-        "Compare Output Tokens /n",
-        "Median TTFT /n",
-        "Median TPOT /n",
-    ]
+    if "median" in args.latency:
+        data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
+        html_msgs_for_data_cols = [
+            "Compare Output Tokens /n",
+            "Median TTFT /n",
+            "Median TPOT /n",
+        ]
+        drop_column = "P99"
+    elif "p99" in args.latency:
+        data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"]
+        html_msgs_for_data_cols = [
+            "Compare Output Tokens /n",
+            "P99 TTFT /n",
+            "P99 TPOT /n",
+        ]

     if len(args.file) == 1:
         files = split_json_by_tp_pp(args.file[0], output_root="splits")
@ -275,33 +374,83 @@ if __name__ == "__main__":
f"Expected subset: {filtered_info_cols}, " f"Expected subset: {filtered_info_cols}, "
f"but DataFrame has: {list(output_df.columns)}" f"but DataFrame has: {list(output_df.columns)}"
) )
output_df_sorted = output_df.sort_values(by=existing_group_cols) # output_df_sorted = output_df.sort_values(by=existing_group_cols)
output_df_sorted = output_df.sort_values(by=args.xaxis)
output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False) output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
for name, group in output_groups: for name, group in output_groups:
html = group.to_html() group_name = (
",".join(map(str, name)).replace(",", "_").replace("/", "-")
)
group_html_name = "perf_comparison_" + group_name + ".html"
metric_name = str(data_cols_to_compare[i]).lower()
if "tok/s" in metric_name:
html = group.to_html()
elif "ttft" in metric_name:
styler = _highlight_threshold(group, args.ttft_max_ms).format(
{c: "{:.2f}" for c in group.select_dtypes("number").columns},
na_rep="",
)
html = styler.to_html(
table_attributes='border="1" class="dataframe"'
)
elif (
"tpot" in metric_name
or "median" in metric_name
or "p99" in metric_name
):
styler = _highlight_threshold(group, args.tpot_max_ms).format(
{c: "{:.2f}" for c in group.select_dtypes("number").columns},
na_rep="",
)
html = styler.to_html(
table_attributes='border="1" class="dataframe"'
)
text_file.write(html_msgs_for_data_cols[i]) text_file.write(html_msgs_for_data_cols[i])
text_file.write(html) text_file.write(html)
with open(group_html_name, "a+") as sub_text_file:
sub_text_file.write(html_msgs_for_data_cols[i])
sub_text_file.write(html)
if plot and plotly_found: if plot and plotly_found:
import plotly.express as px import plotly.express as px
df = group[raw_data_cols] df = group[raw_data_cols]
df_sorted = df.sort_values(by=info_cols[y_axis_index]) df_sorted = df.sort_values(by=info_cols[y_axis_index])
# Melt DataFrame for plotting # Melt DataFrame for plotting
df_melted = df_sorted.melt( df_melted = df_sorted.melt(
id_vars=info_cols[y_axis_index], id_vars=info_cols[y_axis_index],
var_name="Configuration", var_name="Configuration",
value_name=data_cols_to_compare[i], value_name=data_cols_to_compare[i],
) )
title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index] title = (
# Create Plotly line chart data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
fig = px.line( )
df_melted, # Create Plotly line chart
x=info_cols[y_axis_index], fig = px.line(
y=data_cols_to_compare[i], df_melted,
color="Configuration", x=info_cols[y_axis_index],
title=title, y=data_cols_to_compare[i],
markers=True, color="Configuration",
) title=title,
# Export to HTML markers=True,
text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn")) )
# ---- Add threshold lines based on metric name ----
if "ttft" in metric_name:
_add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
elif (
"tpot" in metric_name
or "median" in metric_name
or "p99" in metric_name
):
_add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
# Export to HTML
text_file.write(
fig.to_html(full_html=True, include_plotlyjs="cdn")
)
sub_text_file.write(
fig.to_html(full_html=True, include_plotlyjs="cdn")
)
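
A self-contained sketch of the SLA reference-line mechanics on toy data (the real script draws these via _add_limit_line with args.ttft_max_ms / args.tpot_max_ms; the values and output filename here are illustrative):

import plotly.express as px

fig = px.line(
    x=[32, 64, 128], y=[1800.0, 2600.0, 4100.0],
    title="P99 TTFT (ms) vs concurrency (toy data)", markers=True,
)
fig.add_hline(
    y=3000.0, line_dash="dash", line_color="red",
    annotation_text="TTFT limit: 3000.0 ms", annotation_position="top left",
)
fig.write_html("ttft_limit_example.html", include_plotlyjs="cdn")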

[File 2 of 7 — Python script that converts raw benchmark results into summary tables]

@@ -63,9 +63,11 @@ serving_column_mapping = {
     "mean_ttft_ms": "Mean TTFT (ms)",
     "median_ttft_ms": "Median TTFT (ms)",
     "p99_ttft_ms": "P99 TTFT (ms)",
+    "std_ttft_ms": "STD TTFT (ms)",
     "mean_tpot_ms": "Mean TPOT (ms)",
     "median_tpot_ms": "Median",
     "p99_tpot_ms": "P99",
+    "std_tpot_ms": "STD TPOT (ms)",
     "mean_itl_ms": "Mean ITL (ms)",
     "median_itl_ms": "Median ITL (ms)",
     "p99_itl_ms": "P99 ITL (ms)",

@@ -368,7 +370,7 @@ if __name__ == "__main__":
         # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
         # we want to turn it into "8xGPUTYPE"
         df["GPU"] = df["GPU"].apply(
-            lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
+            lambda x: "{}x{}".format(len(x.split("\n")), x.split("\n")[0])
        )

    # get markdown tables
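
The GPU-column lambda collapses a newline-separated list of identical GPU names into a count prefix; for example (the GPU name is illustrative):

gpu_field = "A100\nA100\nA100\nA100"
print("{}x{}".format(len(gpu_field.split("\n")), gpu_field.split("\n")[0]))  # -> 4xA100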

[File 3 of 7 — Bash driver script that launches the benchmark suites]

@@ -471,6 +471,11 @@ main() {
   mkdir -p $RESULTS_FOLDER
   QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

+  # dump vllm info via vllm collect-env
+  env_output=$(vllm collect-env)
+  echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
+
   # benchmarking
   run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
   run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
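
The same environment capture can be reproduced from Python; a hedged sketch (the output directory stands in for $RESULTS_FOLDER):

import subprocess
from pathlib import Path

# `vllm collect-env` is the command used by the script above
out = subprocess.run(["vllm", "collect-env"], capture_output=True, text=True, check=True)
results = Path("results")  # illustrative path
results.mkdir(exist_ok=True)
(results / "vllm_env.txt").write_text(out.stdout)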

[File 4 of 7 — JSON latency test definitions]

@@ -1,28 +1,24 @@
 [
     {
-        "test_name": "latency_llama8B_tp1",
+        "test_name": "latency_llama8B_tp2",
         "environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
             "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
             "VLLM_CPU_KVCACHE_SPACE": 40
         },
         "parameters": {
             "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "num_iters_warmup": 5,
-            "num_iters": 15
-        }
-    },
-    {
-        "test_name": "latency_llama8B_tp4",
-        "environment_variables": {
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
-            "load_format": "dummy",
+            "tensor_parallel_size": 2,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
             "num_iters_warmup": 5,
             "num_iters": 15
         }
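
These test definitions are plain JSON, so they are easy to sanity-check before a run; a minimal sketch (the filename is an assumption):

import json

with open("latency-tests-cpu.json") as f:  # assumed filename
    tests = json.load(f)
for t in tests:
    print(t["test_name"], t["parameters"].get("tensor_parallel_size"))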

[File 5 of 7 — JSON serving test definitions (bf16/int8/int4 suites; adds tp4 variants)]

@@ -95,6 +95,38 @@
             "num_prompts": 200
         }
     },
+    {
+        "test_name": "serving_llama8B_bf16_tp4_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 4,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
     {
         "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
         "qps_list": ["inf"],
@@ -233,6 +265,41 @@
             "num_prompts": 1000
         }
     },
+    {
+        "test_name": "serving_llama8B_bf16_tp4_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 4,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "random",
+            "random-input-len": 128,
+            "random-output-len": 128,
+            "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
     {
         "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
         "qps_list": ["inf"],
@@ -365,6 +432,38 @@
             "num_prompts": 200
         }
     },
+    {
+        "test_name": "serving_llama8B_int8_tp4_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 4,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
     {
         "test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
         "qps_list": ["inf"],
@@ -503,6 +602,41 @@
             "num_prompts": 1000
         }
     },
+    {
+        "test_name": "serving_llama8B_int8_tp4_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 4,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "random",
+            "random-input-len": 128,
+            "random-output-len": 128,
+            "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
     {
         "test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
         "qps_list": ["inf"],
@@ -638,6 +772,39 @@
             "num_prompts": 200
         }
     },
+    {
+        "test_name": "serving_llama8B_int4_tp4_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "quantization": "awq",
+            "tensor_parallel_size": 4,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
     {
         "test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
         "qps_list": ["inf"],
@@ -780,6 +947,42 @@
             "num_prompts": 1000
         }
     },
+    {
+        "test_name": "serving_llama8B_int4_tp4_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "quantization": "awq",
+            "tensor_parallel_size": 4,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "random",
+            "random-input-len": 128,
+            "random-output-len": 128,
+            "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
     {
         "test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
         "qps_list": ["inf"],

[File 6 of 7 — JSON serving test definitions, reworked into tp1/tp2 sweeps]

@@ -2,7 +2,7 @@
     {
         "test_name": "serving_llama8B_tp1_sharegpt",
         "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "max_concurrency_list": [32],
         "server_environment_variables": {
             "VLLM_RPC_TIMEOUT": 100000,
             "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -28,13 +28,13 @@
             "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
+            "num_prompts": 32
         }
     },
     {
         "test_name": "serving_llama8B_tp2_sharegpt",
         "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "max_concurrency_list": [32],
         "server_environment_variables": {
             "VLLM_RPC_TIMEOUT": 100000,
             "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -60,13 +60,13 @@
             "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
+            "num_prompts": 32
         }
     },
     {
-        "test_name": "serving_llama8B_tp4_sharegpt",
+        "test_name": "serving_llama8B_tp1_random_128_128",
         "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "max_concurrency_list": [32],
         "server_environment_variables": {
             "VLLM_RPC_TIMEOUT": 100000,
             "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -76,39 +76,7 @@
         },
         "server_parameters": {
             "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_tp4_random_1024_128",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
+            "tensor_parallel_size": 1,
             "dtype": "bfloat16",
             "distributed_executor_backend": "mp",
             "block_size": 128,
@@ -124,16 +92,16 @@
             "model": "meta-llama/Llama-3.1-8B-Instruct",
             "backend": "vllm",
             "dataset_name": "random",
-            "random-input-len": 1024,
+            "random-input-len": 128,
             "random-output-len": 128,
             "ignore-eos": "",
-            "num_prompts": 100
+            "num_prompts": 32
         }
     },
     {
-        "test_name": "serving_llama8B_pp6_random_1024_128",
+        "test_name": "serving_llama8B_tp2_random_128_128",
         "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "max_concurrency_list": [32],
         "server_environment_variables": {
             "VLLM_RPC_TIMEOUT": 100000,
             "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -143,7 +111,7 @@
         },
         "server_parameters": {
             "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "pipeline_parallel_size": 6,
+            "tensor_parallel_size": 2,
             "dtype": "bfloat16",
             "distributed_executor_backend": "mp",
             "block_size": 128,
@@ -159,10 +127,150 @@
             "model": "meta-llama/Llama-3.1-8B-Instruct",
             "backend": "vllm",
             "dataset_name": "random",
-            "random-input-len": 1024,
+            "random-input-len": 128,
             "random-output-len": 128,
             "ignore-eos": "",
-            "num_prompts": 100
+            "num_prompts": 32
+        }
+    },
+    {
+        "test_name": "serving_llama8B_tp1_random_128_2048",
+        "qps_list": [1, 4, 16, "inf"],
+        "max_concurrency_list": [32],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "random",
+            "random-input-len": 128,
+            "random-output-len": 2048,
+            "ignore-eos": "",
+            "num_prompts": 32
+        }
+    },
+    {
+        "test_name": "serving_llama8B_tp2_random_128_2048",
+        "qps_list": [1, 4, 16, "inf"],
+        "max_concurrency_list": [32],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 2,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "random",
+            "random-input-len": 128,
+            "random-output-len": 2048,
+            "ignore-eos": "",
+            "num_prompts": 32
+        }
+    },
+    {
+        "test_name": "serving_llama8B_tp1_random_2048_128",
+        "qps_list": [1, 4, 16, "inf"],
+        "max_concurrency_list": [32],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "random",
+            "random-input-len": 2048,
+            "random-output-len": 128,
+            "ignore-eos": "",
+            "num_prompts": 32
+        }
+    },
+    {
+        "test_name": "serving_llama8B_tp2_random_2048_128",
+        "qps_list": [1, 4, 16, "inf"],
+        "max_concurrency_list": [32],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 2,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "random",
+            "random-input-len": 2048,
+            "random-output-len": 128,
+            "ignore-eos": "",
+            "num_prompts": 32
         }
     }
 ]

[File 7 of 7 — JSON throughput test definitions]

@@ -1,29 +1,24 @@
 [
     {
-        "test_name": "throughput_llama8B_tp1",
+        "test_name": "throughput_llama8B_tp2",
         "environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
             "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
             "VLLM_CPU_KVCACHE_SPACE": 40
         },
         "parameters": {
             "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200,
-            "backend": "vllm"
-        }
-    },
-    {
-        "test_name": "throughput_llama8B_tp4",
-        "environment_variables": {
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
-            "load_format": "dummy",
+            "tensor_parallel_size": 2,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
             "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200,
             "backend": "vllm"