diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
index 5ea5a50a258a4..c8bf7b0453662 100644
--- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
@@ -7,6 +7,7 @@ from importlib import util
 
 import pandas as pd
+pd.options.display.float_format = "{:.2f}".format
 
 plotly_found = util.find_spec("plotly.express") is not None
 
@@ -109,7 +110,10 @@ def compare_data_columns(
     if len(compare_frames) >= 2:
         base = compare_frames[0]
         current = compare_frames[-1]
-        ratio = current / base
+        if "P99" in data_column or "Median" in data_column:
+            ratio = base / current  # latency columns: lower is better, so invert
+        else:
+            ratio = current / base
         ratio = ratio.mask(base == 0)  # avoid inf when baseline is 0
         ratio.name = f"Ratio 1 vs {len(compare_frames)}"
         frames.append(ratio)
@@ -199,6 +203,71 @@ def split_json_by_tp_pp(
     return saved_paths
 
 
+def _add_limit_line(fig, y_value, label):
+    # Visible dashed line + annotation
+    fig.add_hline(
+        y=y_value,
+        line_dash="dash",
+        line_color="red" if "ttft" in label.lower() else "blue",
+        annotation_text=f"{label}: {y_value} ms",
+        annotation_position="top left",
+    )
+    # Optional: add a legend item (as a transparent helper trace)
+    if plot and plotly_found:
+        import plotly.graph_objects as go
+
+        fig.add_trace(
+            go.Scatter(
+                x=[None],
+                y=[None],
+                mode="lines",
+                line=dict(
+                    dash="dash", color="red" if "ttft" in label.lower() else "blue"
+                ),
+                name=f"{label}",
+            )
+        )
+
+
+def _find_concurrency_col(df: pd.DataFrame) -> str:
+    for c in [
+        "# of max concurrency.",
+        "# of max concurrency",
+        "Max Concurrency",
+        "max_concurrency",
+        "Concurrency",
+    ]:
+        if c in df.columns:
+            return c
+    # Fallback: guess an integer-like column (harmless if unused)
+    for c in df.columns:
+        if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
+            return c
+    return "# of max concurrency."
+
+
+def _highlight_threshold(
+    df: pd.DataFrame, threshold: float
+) -> "pd.io.formats.style.Styler":
+    """Highlight numeric per-configuration columns with value <= threshold."""
+    conc_col = _find_concurrency_col(df)
+    key_cols = [
+        c
+        for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col]
+        if c in df.columns
+    ]
+    conf_cols = [
+        c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")
+    ]
+    conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
+    return df.style.map(
+        lambda v: "background-color:#e6ffe6;font-weight:bold;"
+        if pd.notna(v) and v <= threshold
+        else "",
+        subset=conf_cols,
+    )
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
@@ -220,6 +289,26 @@ if __name__ == "__main__":
         default="# of max concurrency.",
         help="column name to use as X Axis in comparison graph",
     )
+    parser.add_argument(
+        "-l",
+        "--latency",
+        type=str,
+        default="p99",
+        help="use median or p99 for latency metrics such as TTFT/TPOT",
+    )
+    parser.add_argument(
+        "--ttft-max-ms",
+        type=float,
+        default=3000.0,
+        help="Reference limit for TTFT plots (ms)",
+    )
+    parser.add_argument(
+        "--tpot-max-ms",
+        type=float,
+        default=100.0,
+        help="Reference limit for TPOT plots (ms)",
+    )
+
     args = parser.parse_args()
 
     drop_column = "P99"
@@ -234,12 +323,22 @@ if __name__ == "__main__":
         "# of max concurrency.",
         "qps",
     ]
-    data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
-    html_msgs_for_data_cols = [
-        "Compare Output Tokens /n",
-        "Median TTFT /n",
-        "Median TPOT /n",
-    ]
+
+    if "median" in args.latency:
+        data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
+        html_msgs_for_data_cols = [
+            "Compare Output Tokens /n",
+            "Median TTFT /n",
+            "Median TPOT /n",
+        ]
+        drop_column = "P99"
+    elif "p99" in args.latency:
+        data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"]
+        html_msgs_for_data_cols = [
+            "Compare Output Tokens /n",
+            "P99 TTFT /n",
+            "P99 TPOT /n",
+        ]
 
     if len(args.file) == 1:
         files = split_json_by_tp_pp(args.file[0], output_root="splits")
@@ -275,33 +374,83 @@ if __name__ == "__main__":
                 f"Expected subset: {filtered_info_cols}, "
                 f"but DataFrame has: {list(output_df.columns)}"
             )
-        output_df_sorted = output_df.sort_values(by=existing_group_cols)
+        # output_df_sorted = output_df.sort_values(by=existing_group_cols)
+        output_df_sorted = output_df.sort_values(by=args.xaxis)
        output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
         for name, group in output_groups:
-            html = group.to_html()
+            group_name = (
+                ",".join(map(str, name)).replace(",", "_").replace("/", "-")
+            )
+            group_html_name = "perf_comparison_" + group_name + ".html"
+
+            metric_name = str(data_cols_to_compare[i]).lower()
+            if "tok/s" in metric_name:
+                html = group.to_html()
+            elif "ttft" in metric_name:
+                styler = _highlight_threshold(group, args.ttft_max_ms).format(
+                    {c: "{:.2f}" for c in group.select_dtypes("number").columns},
+                    na_rep="—",
+                )
+                html = styler.to_html(
+                    table_attributes='border="1" class="dataframe"'
+                )
+            elif (
+                "tpot" in metric_name
+                or "median" in metric_name
+                or "p99" in metric_name
+            ):
+                styler = _highlight_threshold(group, args.tpot_max_ms).format(
+                    {c: "{:.2f}" for c in group.select_dtypes("number").columns},
+                    na_rep="—",
+                )
+                html = styler.to_html(
+                    table_attributes='border="1" class="dataframe"'
+                )
+
             text_file.write(html_msgs_for_data_cols[i])
             text_file.write(html)
+            with open(group_html_name, "a+") as sub_text_file:
+                sub_text_file.write(html_msgs_for_data_cols[i])
+                sub_text_file.write(html)
 
-            if plot and plotly_found:
-                import plotly.express as px
+                if plot and plotly_found:
+                    import plotly.express as px
 
-                df = group[raw_data_cols]
-                df_sorted = df.sort_values(by=info_cols[y_axis_index])
-                # Melt DataFrame for plotting
-                df_melted = df_sorted.melt(
-                    id_vars=info_cols[y_axis_index],
-                    var_name="Configuration",
-                    value_name=data_cols_to_compare[i],
-                )
-                title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
-                # Create Plotly line chart
-                fig = px.line(
-                    df_melted,
-                    x=info_cols[y_axis_index],
-                    y=data_cols_to_compare[i],
-                    color="Configuration",
-                    title=title,
-                    markers=True,
-                )
-                # Export to HTML
-                text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
+                    df = group[raw_data_cols]
+                    df_sorted = df.sort_values(by=info_cols[y_axis_index])
+                    # Melt DataFrame for plotting
+                    df_melted = df_sorted.melt(
+                        id_vars=info_cols[y_axis_index],
+                        var_name="Configuration",
+                        value_name=data_cols_to_compare[i],
+                    )
+                    title = (
+                        data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
+                    )
+                    # Create Plotly line chart
+                    fig = px.line(
+                        df_melted,
+                        x=info_cols[y_axis_index],
+                        y=data_cols_to_compare[i],
+                        color="Configuration",
+                        title=title,
+                        markers=True,
+                    )
+
+                    # ---- Add threshold lines based on metric name ----
+                    if "ttft" in metric_name:
+                        _add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
+                    elif (
+                        "tpot" in metric_name
+                        or "median" in metric_name
+                        or "p99" in metric_name
+                    ):
+                        _add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
+
+                    # Export to HTML
+                    text_file.write(
+                        fig.to_html(full_html=True, include_plotlyjs="cdn")
+                    )
+                    sub_text_file.write(
+                        fig.to_html(full_html=True, include_plotlyjs="cdn")
+                    )
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
index a655a650cb325..a7544aeef4c74 100644
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -63,9 +63,11 @@ serving_column_mapping = {
     "mean_ttft_ms": "Mean TTFT (ms)",
     "median_ttft_ms": "Median TTFT (ms)",
     "p99_ttft_ms": "P99 TTFT (ms)",
+    "std_ttft_ms": "STD TTFT (ms)",
     "mean_tpot_ms": "Mean TPOT (ms)",
     "median_tpot_ms": "Median",
     "p99_tpot_ms": "P99",
+    "std_tpot_ms": "STD TPOT (ms)",
     "mean_itl_ms": "Mean ITL (ms)",
     "median_itl_ms": "Median ITL (ms)",
     "p99_itl_ms": "P99 ITL (ms)",
@@ -368,7 +370,7 @@ if __name__ == "__main__":
     # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
     # we want to turn it into "8xGPUTYPE"
     df["GPU"] = df["GPU"].apply(
-        lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
+        lambda x: "{}x{}".format(len(x.split("\n")), x.split("\n")[0])
     )
 
     # get markdown tables
diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
index c64e5638029e7..5a47576483bbf 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -471,6 +471,11 @@ main() {
   mkdir -p $RESULTS_FOLDER
   QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
 
+  # dump vllm info via vllm collect-env
+  env_output=$(vllm collect-env)
+
+  echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
+
   # benchmarking
   run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
   run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
index 569117aae852d..77d1694ec8641 100644
--- a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
@@ -1,28 +1,24 @@
 [
     {
-        "test_name": "latency_llama8B_tp1",
+        "test_name": "latency_llama8B_tp2",
         "environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
             "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
             "VLLM_CPU_KVCACHE_SPACE": 40
         },
         "parameters": {
             "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "num_iters_warmup": 5,
-            "num_iters": 15
-        }
-    },
-    {
-        "test_name": "latency_llama8B_tp4",
-        "environment_variables": {
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
-            "load_format": "dummy",
+            "tensor_parallel_size": 2,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
             "num_iters_warmup": 5,
             "num_iters": 15
         }
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
index ce396d6e54f27..0b1a42e790255 100644
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
@@ -95,6 +95,38 @@
             "num_prompts": 200
         }
     },
+    {
+        "test_name": "serving_llama8B_bf16_tp4_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 4,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
     {
         "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
         "qps_list": ["inf"],
@@ -233,6 +265,41 @@
             "num_prompts": 1000
         }
     },
+    {
+        "test_name": "serving_llama8B_bf16_tp4_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 4,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
"client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, { "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128", "qps_list": ["inf"], @@ -365,6 +432,38 @@ "num_prompts": 200 } }, + { + "test_name": "serving_llama8B_int8_tp4_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, { "test_name": "serving_llama8B_int8_tp2pp3_sharegpt", "qps_list": ["inf"], @@ -503,6 +602,41 @@ "num_prompts": 1000 } }, + { + "test_name": "serving_llama8B_int8_tp4_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, { "test_name": "serving_llama8B_int8_tp2pp3_random_128_128", "qps_list": ["inf"], @@ -638,6 +772,39 @@ "num_prompts": 200 } }, + { + "test_name": "serving_llama8B_int4_tp4_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, { "test_name": 
"serving_llama8B_int4_tp2pp3_sharegpt", "qps_list": ["inf"], @@ -780,6 +947,42 @@ "num_prompts": 1000 } }, + { + "test_name": "serving_llama8B_int4_tp4_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, { "test_name": "serving_llama8B_int4_tp2pp3_random_128_128", "qps_list": ["inf"], diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json index e21c8df0a9fe9..f792956f39472 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json @@ -2,7 +2,7 @@ { "test_name": "serving_llama8B_tp1_sharegpt", "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "max_concurrency_list": [32], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -28,13 +28,13 @@ "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 + "num_prompts": 32 } }, { "test_name": "serving_llama8B_tp2_sharegpt", "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "max_concurrency_list": [32], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -60,13 +60,13 @@ "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 + "num_prompts": 32 } }, { - "test_name": "serving_llama8B_tp4_sharegpt", + "test_name": "serving_llama8B_tp1_random_128_128", "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "max_concurrency_list": [32], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -76,39 +76,7 @@ }, "server_parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_tp4_random_1024_128", - "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - 
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, + "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, @@ -124,16 +92,16 @@ "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", - "random-input-len": 1024, + "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", - "num_prompts": 100 + "num_prompts": 32 } }, { - "test_name": "serving_llama8B_pp6_random_1024_128", + "test_name": "serving_llama8B_tp2_random_128_128", "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "max_concurrency_list": [32], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -143,7 +111,7 @@ }, "server_parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", - "pipeline_parallel_size": 6, + "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, @@ -159,10 +127,150 @@ "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", - "random-input-len": 1024, + "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", - "num_prompts": 100 + "num_prompts": 32 + } + }, + { + "test_name": "serving_llama8B_tp1_random_128_2048", + "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [32], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048, + "ignore-eos": "", + "num_prompts": 32 + } + }, + { + "test_name": "serving_llama8B_tp2_random_128_2048", + "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [32], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048, + "ignore-eos": "", + "num_prompts": 32 + } + }, + { + "test_name": "serving_llama8B_tp1_random_2048_128", + "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [32], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + 
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 32 + } + }, + { + "test_name": "serving_llama8B_tp2_random_2048_128", + "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [32], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 32 } } ] diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json index 48c015aa8403b..dc214ddfb27e3 100644 --- a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json +++ b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json @@ -1,29 +1,24 @@ [ { - "test_name": "throughput_llama8B_tp1", + "test_name": "throughput_llama8B_tp2", "environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 1, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm" - } - }, - { - "test_name": "throughput_llama8B_tp4", - "environment_variables": { - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "load_format": "dummy", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, "backend": "vllm"