mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-10 12:45:33 +08:00
add SLA information into comparison graph for vLLM Benchmark Suite (#25525)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com> Signed-off-by: louie-tsai <louie.tsai@intel.com> Signed-off-by: Louie Tsai <louie.tsai@intel.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
parent
50b788a17a
commit
3b7bdf983b
@ -7,6 +7,7 @@ from importlib import util
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
pd.options.display.float_format = "{:.2f}".format
|
||||||
plotly_found = util.find_spec("plotly.express") is not None
|
plotly_found = util.find_spec("plotly.express") is not None
|
||||||
|
|
||||||
|
|
||||||
@ -109,7 +110,10 @@ def compare_data_columns(
|
|||||||
if len(compare_frames) >= 2:
|
if len(compare_frames) >= 2:
|
||||||
base = compare_frames[0]
|
base = compare_frames[0]
|
||||||
current = compare_frames[-1]
|
current = compare_frames[-1]
|
||||||
ratio = current / base
|
if "P99" in data_column or "Median" in data_column:
|
||||||
|
ratio = base / current # for latency
|
||||||
|
else:
|
||||||
|
ratio = current / base
|
||||||
ratio = ratio.mask(base == 0) # avoid inf when baseline is 0
|
ratio = ratio.mask(base == 0) # avoid inf when baseline is 0
|
||||||
ratio.name = f"Ratio 1 vs {len(compare_frames)}"
|
ratio.name = f"Ratio 1 vs {len(compare_frames)}"
|
||||||
frames.append(ratio)
|
frames.append(ratio)
|
||||||
@ -199,6 +203,71 @@ def split_json_by_tp_pp(
|
|||||||
return saved_paths
|
return saved_paths
|
||||||
|
|
||||||
|
|
||||||
|
def _add_limit_line(fig, y_value, label):
|
||||||
|
# Visible dashed line + annotation
|
||||||
|
fig.add_hline(
|
||||||
|
y=y_value,
|
||||||
|
line_dash="dash",
|
||||||
|
line_color="red" if "ttft" in label.lower() else "blue",
|
||||||
|
annotation_text=f"{label}: {y_value} ms",
|
||||||
|
annotation_position="top left",
|
||||||
|
)
|
||||||
|
# Optional: add a legend item (as a transparent helper trace)
|
||||||
|
if plot and plotly_found:
|
||||||
|
import plotly.graph_objects as go
|
||||||
|
|
||||||
|
fig.add_trace(
|
||||||
|
go.Scatter(
|
||||||
|
x=[None],
|
||||||
|
y=[None],
|
||||||
|
mode="lines",
|
||||||
|
line=dict(
|
||||||
|
dash="dash", color="red" if "ttft" in label.lower() else "blue"
|
||||||
|
),
|
||||||
|
name=f"{label}",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _find_concurrency_col(df: pd.DataFrame) -> str:
|
||||||
|
for c in [
|
||||||
|
"# of max concurrency.",
|
||||||
|
"# of max concurrency",
|
||||||
|
"Max Concurrency",
|
||||||
|
"max_concurrency",
|
||||||
|
"Concurrency",
|
||||||
|
]:
|
||||||
|
if c in df.columns:
|
||||||
|
return c
|
||||||
|
# Fallback: guess an integer-like column (harmless if unused)
|
||||||
|
for c in df.columns:
|
||||||
|
if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
|
||||||
|
return c
|
||||||
|
return "# of max concurrency."
|
||||||
|
|
||||||
|
|
||||||
|
def _highlight_threshold(
|
||||||
|
df: pd.DataFrame, threshold: float
|
||||||
|
) -> "pd.io.formats.style.Styler":
|
||||||
|
"""Highlight numeric per-configuration columns with value <= threshold."""
|
||||||
|
conc_col = _find_concurrency_col(df)
|
||||||
|
key_cols = [
|
||||||
|
c
|
||||||
|
for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col]
|
||||||
|
if c in df.columns
|
||||||
|
]
|
||||||
|
conf_cols = [
|
||||||
|
c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")
|
||||||
|
]
|
||||||
|
conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
|
||||||
|
return df.style.map(
|
||||||
|
lambda v: "background-color:#e6ffe6;font-weight:bold;"
|
||||||
|
if pd.notna(v) and v <= threshold
|
||||||
|
else "",
|
||||||
|
subset=conf_cols,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -220,6 +289,26 @@ if __name__ == "__main__":
|
|||||||
default="# of max concurrency.",
|
default="# of max concurrency.",
|
||||||
help="column name to use as X Axis in comparison graph",
|
help="column name to use as X Axis in comparison graph",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-l",
|
||||||
|
"--latency",
|
||||||
|
type=str,
|
||||||
|
default="p99",
|
||||||
|
help="take median|p99 for latency like TTFT/TPOT",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--ttft-max-ms",
|
||||||
|
type=float,
|
||||||
|
default=3000.0,
|
||||||
|
help="Reference limit for TTFT plots (ms)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--tpot-max-ms",
|
||||||
|
type=float,
|
||||||
|
default=100.0,
|
||||||
|
help="Reference limit for TPOT plots (ms)",
|
||||||
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
drop_column = "P99"
|
drop_column = "P99"
|
||||||
@ -234,12 +323,22 @@ if __name__ == "__main__":
|
|||||||
"# of max concurrency.",
|
"# of max concurrency.",
|
||||||
"qps",
|
"qps",
|
||||||
]
|
]
|
||||||
data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
|
|
||||||
html_msgs_for_data_cols = [
|
if "median" in args.latency:
|
||||||
"Compare Output Tokens /n",
|
data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
|
||||||
"Median TTFT /n",
|
html_msgs_for_data_cols = [
|
||||||
"Median TPOT /n",
|
"Compare Output Tokens /n",
|
||||||
]
|
"Median TTFT /n",
|
||||||
|
"Median TPOT /n",
|
||||||
|
]
|
||||||
|
drop_column = "P99"
|
||||||
|
elif "p99" in args.latency:
|
||||||
|
data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"]
|
||||||
|
html_msgs_for_data_cols = [
|
||||||
|
"Compare Output Tokens /n",
|
||||||
|
"P99 TTFT /n",
|
||||||
|
"P99 TPOT /n",
|
||||||
|
]
|
||||||
|
|
||||||
if len(args.file) == 1:
|
if len(args.file) == 1:
|
||||||
files = split_json_by_tp_pp(args.file[0], output_root="splits")
|
files = split_json_by_tp_pp(args.file[0], output_root="splits")
|
||||||
@ -275,33 +374,83 @@ if __name__ == "__main__":
|
|||||||
f"Expected subset: {filtered_info_cols}, "
|
f"Expected subset: {filtered_info_cols}, "
|
||||||
f"but DataFrame has: {list(output_df.columns)}"
|
f"but DataFrame has: {list(output_df.columns)}"
|
||||||
)
|
)
|
||||||
output_df_sorted = output_df.sort_values(by=existing_group_cols)
|
# output_df_sorted = output_df.sort_values(by=existing_group_cols)
|
||||||
|
output_df_sorted = output_df.sort_values(by=args.xaxis)
|
||||||
output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
|
output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
|
||||||
for name, group in output_groups:
|
for name, group in output_groups:
|
||||||
html = group.to_html()
|
group_name = (
|
||||||
|
",".join(map(str, name)).replace(",", "_").replace("/", "-")
|
||||||
|
)
|
||||||
|
group_html_name = "perf_comparison_" + group_name + ".html"
|
||||||
|
|
||||||
|
metric_name = str(data_cols_to_compare[i]).lower()
|
||||||
|
if "tok/s" in metric_name:
|
||||||
|
html = group.to_html()
|
||||||
|
elif "ttft" in metric_name:
|
||||||
|
styler = _highlight_threshold(group, args.ttft_max_ms).format(
|
||||||
|
{c: "{:.2f}" for c in group.select_dtypes("number").columns},
|
||||||
|
na_rep="—",
|
||||||
|
)
|
||||||
|
html = styler.to_html(
|
||||||
|
table_attributes='border="1" class="dataframe"'
|
||||||
|
)
|
||||||
|
elif (
|
||||||
|
"tpot" in metric_name
|
||||||
|
or "median" in metric_name
|
||||||
|
or "p99" in metric_name
|
||||||
|
):
|
||||||
|
styler = _highlight_threshold(group, args.tpot_max_ms).format(
|
||||||
|
{c: "{:.2f}" for c in group.select_dtypes("number").columns},
|
||||||
|
na_rep="—",
|
||||||
|
)
|
||||||
|
html = styler.to_html(
|
||||||
|
table_attributes='border="1" class="dataframe"'
|
||||||
|
)
|
||||||
|
|
||||||
text_file.write(html_msgs_for_data_cols[i])
|
text_file.write(html_msgs_for_data_cols[i])
|
||||||
text_file.write(html)
|
text_file.write(html)
|
||||||
|
with open(group_html_name, "a+") as sub_text_file:
|
||||||
|
sub_text_file.write(html_msgs_for_data_cols[i])
|
||||||
|
sub_text_file.write(html)
|
||||||
|
|
||||||
if plot and plotly_found:
|
if plot and plotly_found:
|
||||||
import plotly.express as px
|
import plotly.express as px
|
||||||
|
|
||||||
df = group[raw_data_cols]
|
df = group[raw_data_cols]
|
||||||
df_sorted = df.sort_values(by=info_cols[y_axis_index])
|
df_sorted = df.sort_values(by=info_cols[y_axis_index])
|
||||||
# Melt DataFrame for plotting
|
# Melt DataFrame for plotting
|
||||||
df_melted = df_sorted.melt(
|
df_melted = df_sorted.melt(
|
||||||
id_vars=info_cols[y_axis_index],
|
id_vars=info_cols[y_axis_index],
|
||||||
var_name="Configuration",
|
var_name="Configuration",
|
||||||
value_name=data_cols_to_compare[i],
|
value_name=data_cols_to_compare[i],
|
||||||
)
|
)
|
||||||
title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
|
title = (
|
||||||
# Create Plotly line chart
|
data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
|
||||||
fig = px.line(
|
)
|
||||||
df_melted,
|
# Create Plotly line chart
|
||||||
x=info_cols[y_axis_index],
|
fig = px.line(
|
||||||
y=data_cols_to_compare[i],
|
df_melted,
|
||||||
color="Configuration",
|
x=info_cols[y_axis_index],
|
||||||
title=title,
|
y=data_cols_to_compare[i],
|
||||||
markers=True,
|
color="Configuration",
|
||||||
)
|
title=title,
|
||||||
# Export to HTML
|
markers=True,
|
||||||
text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
|
)
|
||||||
|
|
||||||
|
# ---- Add threshold lines based on metric name ----
|
||||||
|
if "ttft" in metric_name:
|
||||||
|
_add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
|
||||||
|
elif (
|
||||||
|
"tpot" in metric_name
|
||||||
|
or "median" in metric_name
|
||||||
|
or "p99" in metric_name
|
||||||
|
):
|
||||||
|
_add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
|
||||||
|
|
||||||
|
# Export to HTML
|
||||||
|
text_file.write(
|
||||||
|
fig.to_html(full_html=True, include_plotlyjs="cdn")
|
||||||
|
)
|
||||||
|
sub_text_file.write(
|
||||||
|
fig.to_html(full_html=True, include_plotlyjs="cdn")
|
||||||
|
)
|
||||||
|
|||||||
@ -63,9 +63,11 @@ serving_column_mapping = {
|
|||||||
"mean_ttft_ms": "Mean TTFT (ms)",
|
"mean_ttft_ms": "Mean TTFT (ms)",
|
||||||
"median_ttft_ms": "Median TTFT (ms)",
|
"median_ttft_ms": "Median TTFT (ms)",
|
||||||
"p99_ttft_ms": "P99 TTFT (ms)",
|
"p99_ttft_ms": "P99 TTFT (ms)",
|
||||||
|
"std_ttft_ms": "STD TTFT (ms)",
|
||||||
"mean_tpot_ms": "Mean TPOT (ms)",
|
"mean_tpot_ms": "Mean TPOT (ms)",
|
||||||
"median_tpot_ms": "Median",
|
"median_tpot_ms": "Median",
|
||||||
"p99_tpot_ms": "P99",
|
"p99_tpot_ms": "P99",
|
||||||
|
"std_tpot_ms": "STD TPOT (ms)",
|
||||||
"mean_itl_ms": "Mean ITL (ms)",
|
"mean_itl_ms": "Mean ITL (ms)",
|
||||||
"median_itl_ms": "Median ITL (ms)",
|
"median_itl_ms": "Median ITL (ms)",
|
||||||
"p99_itl_ms": "P99 ITL (ms)",
|
"p99_itl_ms": "P99 ITL (ms)",
|
||||||
@ -368,7 +370,7 @@ if __name__ == "__main__":
|
|||||||
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
|
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
|
||||||
# we want to turn it into "8xGPUTYPE"
|
# we want to turn it into "8xGPUTYPE"
|
||||||
df["GPU"] = df["GPU"].apply(
|
df["GPU"] = df["GPU"].apply(
|
||||||
lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
|
lambda x: "{}x{}".format(len(x.split("\n")), x.split("\n")[0])
|
||||||
)
|
)
|
||||||
|
|
||||||
# get markdown tables
|
# get markdown tables
|
||||||
|
|||||||
@ -471,6 +471,11 @@ main() {
|
|||||||
mkdir -p $RESULTS_FOLDER
|
mkdir -p $RESULTS_FOLDER
|
||||||
QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
|
QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
|
||||||
|
|
||||||
|
# dump vllm info via vllm collect-env
|
||||||
|
env_output=$(vllm collect-env)
|
||||||
|
|
||||||
|
echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
|
||||||
|
|
||||||
# benchmarking
|
# benchmarking
|
||||||
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
|
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
|
||||||
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
|
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
|
||||||
|
|||||||
@ -1,28 +1,24 @@
|
|||||||
[
|
[
|
||||||
{
|
{
|
||||||
"test_name": "latency_llama8B_tp1",
|
"test_name": "latency_llama8B_tp2",
|
||||||
"environment_variables": {
|
"environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
},
|
},
|
||||||
"parameters": {
|
"parameters": {
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
"tensor_parallel_size": 1,
|
"tensor_parallel_size": 2,
|
||||||
"load_format": "dummy",
|
"dtype": "bfloat16",
|
||||||
"num_iters_warmup": 5,
|
"distributed_executor_backend": "mp",
|
||||||
"num_iters": 15
|
"block_size": 128,
|
||||||
}
|
"trust_remote_code": "",
|
||||||
},
|
"disable_log_stats": "",
|
||||||
{
|
"enforce_eager": "",
|
||||||
"test_name": "latency_llama8B_tp4",
|
"max_num_batched_tokens": 2048,
|
||||||
"environment_variables": {
|
"max_num_seqs": 256,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"load_format": "dummy",
|
|
||||||
"num_iters_warmup": 5,
|
"num_iters_warmup": 5,
|
||||||
"num_iters": 15
|
"num_iters": 15
|
||||||
}
|
}
|
||||||
|
|||||||
@ -95,6 +95,38 @@
|
|||||||
"num_prompts": 200
|
"num_prompts": 200
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_bf16_tp4_sharegpt",
|
||||||
|
"qps_list": ["inf"],
|
||||||
|
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 4,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
|
"test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
|
||||||
"qps_list": ["inf"],
|
"qps_list": ["inf"],
|
||||||
@ -233,6 +265,41 @@
|
|||||||
"num_prompts": 1000
|
"num_prompts": 1000
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_bf16_tp4_random_128_128",
|
||||||
|
"qps_list": ["inf"],
|
||||||
|
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 4,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"num_prompts": 1000
|
||||||
|
}
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
|
"test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
|
||||||
"qps_list": ["inf"],
|
"qps_list": ["inf"],
|
||||||
@ -365,6 +432,38 @@
|
|||||||
"num_prompts": 200
|
"num_prompts": 200
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_int8_tp4_sharegpt",
|
||||||
|
"qps_list": ["inf"],
|
||||||
|
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||||
|
"tensor_parallel_size": 4,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
|
"test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
|
||||||
"qps_list": ["inf"],
|
"qps_list": ["inf"],
|
||||||
@ -503,6 +602,41 @@
|
|||||||
"num_prompts": 1000
|
"num_prompts": 1000
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_int8_tp4_random_128_128",
|
||||||
|
"qps_list": ["inf"],
|
||||||
|
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||||
|
"tensor_parallel_size": 4,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"num_prompts": 1000
|
||||||
|
}
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
|
"test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
|
||||||
"qps_list": ["inf"],
|
"qps_list": ["inf"],
|
||||||
@ -638,6 +772,39 @@
|
|||||||
"num_prompts": 200
|
"num_prompts": 200
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_int4_tp4_sharegpt",
|
||||||
|
"qps_list": ["inf"],
|
||||||
|
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||||
|
"quantization": "awq",
|
||||||
|
"tensor_parallel_size": 4,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
|
"test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
|
||||||
"qps_list": ["inf"],
|
"qps_list": ["inf"],
|
||||||
@ -780,6 +947,42 @@
|
|||||||
"num_prompts": 1000
|
"num_prompts": 1000
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_int4_tp4_random_128_128",
|
||||||
|
"qps_list": ["inf"],
|
||||||
|
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||||
|
"quantization": "awq",
|
||||||
|
"tensor_parallel_size": 4,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"num_prompts": 1000
|
||||||
|
}
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
|
"test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
|
||||||
"qps_list": ["inf"],
|
"qps_list": ["inf"],
|
||||||
|
|||||||
@ -2,7 +2,7 @@
|
|||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
"max_concurrency_list": [32],
|
||||||
"server_environment_variables": {
|
"server_environment_variables": {
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
@ -28,13 +28,13 @@
|
|||||||
"backend": "vllm",
|
"backend": "vllm",
|
||||||
"dataset_name": "sharegpt",
|
"dataset_name": "sharegpt",
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
"num_prompts": 200
|
"num_prompts": 32
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_tp2_sharegpt",
|
"test_name": "serving_llama8B_tp2_sharegpt",
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
"max_concurrency_list": [32],
|
||||||
"server_environment_variables": {
|
"server_environment_variables": {
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
@ -60,13 +60,13 @@
|
|||||||
"backend": "vllm",
|
"backend": "vllm",
|
||||||
"dataset_name": "sharegpt",
|
"dataset_name": "sharegpt",
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
"num_prompts": 200
|
"num_prompts": 32
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_tp4_sharegpt",
|
"test_name": "serving_llama8B_tp1_random_128_128",
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
"max_concurrency_list": [32],
|
||||||
"server_environment_variables": {
|
"server_environment_variables": {
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
@ -76,39 +76,7 @@
|
|||||||
},
|
},
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
"tensor_parallel_size": 4,
|
"tensor_parallel_size": 1,
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp4_random_1024_128",
|
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
"dtype": "bfloat16",
|
||||||
"distributed_executor_backend": "mp",
|
"distributed_executor_backend": "mp",
|
||||||
"block_size": 128,
|
"block_size": 128,
|
||||||
@ -124,16 +92,16 @@
|
|||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
"backend": "vllm",
|
"backend": "vllm",
|
||||||
"dataset_name": "random",
|
"dataset_name": "random",
|
||||||
"random-input-len": 1024,
|
"random-input-len": 128,
|
||||||
"random-output-len": 128,
|
"random-output-len": 128,
|
||||||
"ignore-eos": "",
|
"ignore-eos": "",
|
||||||
"num_prompts": 100
|
"num_prompts": 32
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_pp6_random_1024_128",
|
"test_name": "serving_llama8B_tp2_random_128_128",
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
"max_concurrency_list": [32],
|
||||||
"server_environment_variables": {
|
"server_environment_variables": {
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
@ -143,7 +111,7 @@
|
|||||||
},
|
},
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
"pipeline_parallel_size": 6,
|
"tensor_parallel_size": 2,
|
||||||
"dtype": "bfloat16",
|
"dtype": "bfloat16",
|
||||||
"distributed_executor_backend": "mp",
|
"distributed_executor_backend": "mp",
|
||||||
"block_size": 128,
|
"block_size": 128,
|
||||||
@ -159,10 +127,150 @@
|
|||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
"backend": "vllm",
|
"backend": "vllm",
|
||||||
"dataset_name": "random",
|
"dataset_name": "random",
|
||||||
"random-input-len": 1024,
|
"random-input-len": 128,
|
||||||
"random-output-len": 128,
|
"random-output-len": 128,
|
||||||
"ignore-eos": "",
|
"ignore-eos": "",
|
||||||
"num_prompts": 100
|
"num_prompts": 32
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp1_random_128_2048",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"max_concurrency_list": [32],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 2048,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"num_prompts": 32
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp2_random_128_2048",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"max_concurrency_list": [32],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 2,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 2048,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"num_prompts": 32
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp1_random_2048_128",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"max_concurrency_list": [32],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 2048,
|
||||||
|
"random-output-len": 128,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"num_prompts": 32
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp2_random_2048_128",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"max_concurrency_list": [32],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 2,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 2048,
|
||||||
|
"random-output-len": 128,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"num_prompts": 32
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|||||||
@ -1,29 +1,24 @@
|
|||||||
[
|
[
|
||||||
{
|
{
|
||||||
"test_name": "throughput_llama8B_tp1",
|
"test_name": "throughput_llama8B_tp2",
|
||||||
"environment_variables": {
|
"environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
},
|
},
|
||||||
"parameters": {
|
"parameters": {
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
"tensor_parallel_size": 1,
|
"tensor_parallel_size": 2,
|
||||||
"load_format": "dummy",
|
"dtype": "bfloat16",
|
||||||
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
"distributed_executor_backend": "mp",
|
||||||
"num_prompts": 200,
|
"block_size": 128,
|
||||||
"backend": "vllm"
|
"trust_remote_code": "",
|
||||||
}
|
"disable_log_stats": "",
|
||||||
},
|
"enforce_eager": "",
|
||||||
{
|
"max_num_batched_tokens": 2048,
|
||||||
"test_name": "throughput_llama8B_tp4",
|
"max_num_seqs": 256,
|
||||||
"environment_variables": {
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"load_format": "dummy",
|
|
||||||
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
"num_prompts": 200,
|
"num_prompts": 200,
|
||||||
"backend": "vllm"
|
"backend": "vllm"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user