diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md
index 3f2e2da397977..b39f9899a8f28 100644
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@@ -7,7 +7,7 @@ This directory contains two sets of benchmark for vllm.
- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
-See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
+See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
## Performance benchmark quick overview
@@ -138,28 +138,20 @@ The raw benchmarking results (in the format of json files) are in the `Artifacts
The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
-`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
+`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
+If only one `benchmark_results.json` is passed, `compare-json-results.py` instead compares the different TP and PP configurations found in that file.
-Here is an example using the script to compare result_a and result_b without detail test name.
-`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json --ignore_test_name`
-
-| | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
-|----|----------------------------------------|----------------------------------------|----------|
-| 0 | 142.633982 | 156.526018 | 1.097396 |
-| 1 | 241.620334 | 294.018783 | 1.216863 |
-| 2 | 218.298905 | 262.664916 | 1.203235 |
-| 3 | 242.743860 | 299.816190 | 1.235113 |
-
-Here is an example using the script to compare result_a and result_b with detail test name.
+Here is an example using the script to compare results_a and results_b with Model, Dataset name, input/output length, max concurrency and qps.
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
-| | results_a/benchmark_results.json_name | results_a/benchmark_results.json | results_b/benchmark_results.json_name | results_b/benchmark_results.json | perf_ratio |
-|---|---------------------------------------------|----------------------------------------|---------------------------------------------|----------------------------------------|----------|
-| 0 | serving_llama8B_tp1_sharegpt_qps_1 | 142.633982 | serving_llama8B_tp1_sharegpt_qps_1 | 156.526018 | 1.097396 |
-| 1 | serving_llama8B_tp1_sharegpt_qps_16 | 241.620334 | serving_llama8B_tp1_sharegpt_qps_16 | 294.018783 | 1.216863 |
-| 2 | serving_llama8B_tp1_sharegpt_qps_4 | 218.298905 | serving_llama8B_tp1_sharegpt_qps_4 | 262.664916 | 1.203235 |
-| 3 | serving_llama8B_tp1_sharegpt_qps_inf | 242.743860 | serving_llama8B_tp1_sharegpt_qps_inf | 299.816190 | 1.235113 |
-| 4 | serving_llama8B_tp2_random_1024_128_qps_1 | 96.613390 | serving_llama8B_tp4_random_1024_128_qps_1 | 108.404853 | 1.122048 |
+| | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
+|----|---------------------------------------|--------|-----|-----|------|-----|-----------|----------|----------|
+| 0 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982 | 156.526018 | 1.097396 |
+| 1 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf| 241.620334 | 294.018783 | 1.216863 |
+
+A comparison diagram will be generated below the table.
+Here is an example comparing 96c/results_gnr_96c_091_tp2pp3 with 128c/results_gnr_128c_091_tp2pp3.
+
## Nightly test details
diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
index 20c106234935c..12c4ba6aa69a6 100644
--- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
@@ -1,24 +1,38 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
+import json
+import os
import pandas as pd
def compare_data_columns(
- files, name_column, data_column, drop_column, ignore_test_name=False
+ files, name_column, data_column, info_cols, drop_column, debug=False
):
print("\ncompare_data_column: " + data_column)
frames = []
+ raw_data_cols = []
compare_frames = []
for file in files:
data_df = pd.read_json(file)
serving_df = data_df.dropna(subset=[drop_column], ignore_index=True)
- if ignore_test_name is False:
+ # Show all info columns in the first couple columns
+ if not frames:
+ for col in info_cols:
+ if col not in serving_df.columns:
+ print(f"Skipping missing column: {col}")
+ continue
+ frames.append(serving_df[col])
+ # only show test name under debug mode
+ if debug is True:
serving_df = serving_df.rename(columns={name_column: file + "_name"})
frames.append(serving_df[file + "_name"])
+
+ file = "/".join(file.split("/")[:-1])
serving_df = serving_df.rename(columns={data_column: file})
frames.append(serving_df[file])
+ raw_data_cols.append(file)
compare_frames.append(serving_df[file])
if len(compare_frames) >= 2:
# Compare numbers among two files
@@ -27,7 +41,68 @@ def compare_data_columns(
compare_frames.pop(1)
concat_df = pd.concat(frames, axis=1)
- return concat_df
+ print(raw_data_cols)
+ return concat_df, raw_data_cols
+
+
+def split_json_by_tp_pp(
+ input_file: str = "benchmark_results.json", output_root: str = "."
+) -> list[str]:
+ """
+ Split a benchmark JSON into separate folders by (TP Size, PP Size).
+
+ Creates: {output_root}/tp{TP}_pp{PP}/benchmark_results.json
+ Returns: list of file paths written.
+ """
+ # Load JSON data into DataFrame
+ with open(input_file, encoding="utf-8") as f:
+ data = json.load(f)
+
+ # If the JSON is a dict with a list under common keys, use that list
+ if isinstance(data, dict):
+ for key in ("results", "serving_results", "benchmarks", "data"):
+ if isinstance(data.get(key), list):
+ data = data[key]
+ break
+
+ df = pd.DataFrame(data)
+
+ # Handle alias column names
+ rename_map = {
+ "tp_size": "TP Size",
+ "tensor_parallel_size": "TP Size",
+ "pp_size": "PP Size",
+ "pipeline_parallel_size": "PP Size",
+ }
+ df.rename(
+ columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True
+ )
+
+ # Ensure TP/PP columns exist (default to 1 if missing)
+ if "TP Size" not in df.columns:
+ df["TP Size"] = 1
+ if "PP Size" not in df.columns:
+ df["PP Size"] = 1
+
+ # make sure TP/PP are numeric ints with no NaN
+ df["TP Size"] = (
+ pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int)
+ )
+ df["PP Size"] = (
+ pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int)
+ )
+
+ # Split into separate folders
+ saved_paths: list[str] = []
+ for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False):
+ folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
+ os.makedirs(folder_name, exist_ok=True)
+ filepath = os.path.join(folder_name, "benchmark_results.json")
+ group_df.to_json(filepath, orient="records", indent=2, force_ascii=False)
+ print(f"Saved: {filepath}")
+ saved_paths.append(filepath)
+
+ return saved_paths
if __name__ == "__main__":
@@ -36,31 +111,105 @@ if __name__ == "__main__":
"-f", "--file", action="append", type=str, help="input file name"
)
parser.add_argument(
- "--ignore_test_name", action="store_true", help="ignore_test_name or not"
+ "--debug", action="store_true", help="show all information for debugging"
+ )
+ parser.add_argument(
+ "--plot",
+ action=argparse.BooleanOptionalAction,
+ default=True,
+ help="plot perf diagrams or not --no-plot --plot",
+ )
+ parser.add_argument(
+ "-x",
+ "--xaxis",
+ type=str,
+ default="# of max concurrency.",
+ help="column name to use as X Axis in comparision graph",
)
args = parser.parse_args()
- files = args.file
- print("comparing : " + ", ".join(files))
drop_column = "P99"
name_column = "Test name"
+ info_cols = [
+ "Model",
+ "Dataset Name",
+ "Input Len",
+ "Output Len",
+ "TP Size",
+ "PP Size",
+ "# of max concurrency.",
+ "qps",
+ ]
data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
html_msgs_for_data_cols = [
"Compare Output Tokens /n",
"Median TTFT /n",
"Median TPOT /n",
]
- ignore_test_name = args.ignore_test_name
+
+ if len(args.file) == 1:
+ files = split_json_by_tp_pp(args.file[0], output_root="splits")
+ info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
+ else:
+ files = args.file
+ print("comparing : " + ", ".join(files))
+ debug = args.debug
+ plot = args.plot
+ # For the plot feature, pick the x-axis column (args.xaxis) from info_cols
+ y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6
with open("perf_comparison.html", "w") as text_file:
for i in range(len(data_cols_to_compare)):
- output_df = compare_data_columns(
+ output_df, raw_data_cols = compare_data_columns(
files,
name_column,
data_cols_to_compare[i],
+ info_cols,
drop_column,
- ignore_test_name=ignore_test_name,
+ debug=debug,
)
- print(output_df)
- html = output_df.to_html()
- text_file.write(html_msgs_for_data_cols[i])
- text_file.write(html)
+
+ # For the plot feature, put the x-axis column first in raw_data_cols
+ raw_data_cols.insert(0, info_cols[y_axis_index])
+
+ filtered_info_cols = info_cols[:-2]
+ existing_group_cols = [
+ c for c in filtered_info_cols if c in output_df.columns
+ ]
+ if not existing_group_cols:
+ raise ValueError(
+ f"No valid group-by columns "
+ f"Expected subset: {filtered_info_cols}, "
+ f"but DataFrame has: {list(output_df.columns)}"
+ )
+
+ output_df_sorted = output_df.sort_values(by=existing_group_cols)
+ output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
+ for name, group in output_groups:
+ html = group.to_html()
+ text_file.write(html_msgs_for_data_cols[i])
+ text_file.write(html)
+
+ if plot is True:
+ import pandas as pd
+ import plotly.express as px
+
+ df = group[raw_data_cols]
+ df_sorted = df.sort_values(by=info_cols[y_axis_index])
+ # Melt DataFrame for plotting
+ df_melted = df_sorted.melt(
+ id_vars=info_cols[y_axis_index],
+ var_name="Configuration",
+ value_name=data_cols_to_compare[i],
+ )
+ title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
+ # Create Plotly line chart
+ fig = px.line(
+ df_melted,
+ x=info_cols[y_axis_index],
+ y=data_cols_to_compare[i],
+ color="Configuration",
+ title=title,
+ markers=True,
+ )
+ # Export to HTML
+ text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
index 554256b4bdb8b..496ee6083abde 100644
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -1,17 +1,19 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
import json
import os
+import re
+import shlex
from importlib import util
from pathlib import Path
+from typing import Any
import pandas as pd
import psutil
from tabulate import tabulate
-results_folder = Path("results/")
-
# latency results and the keys that will be printed into markdown
latency_results = []
latency_column_mapping = {
@@ -42,14 +44,22 @@ throughput_results_column_mapping = {
serving_results = []
serving_column_mapping = {
"test_name": "Test name",
+ "model_id": "Model",
+ "dataset_name": "Dataset Name",
+ "input_len": "Input Len",
+ "output_len": "Output Len",
+ "tp_size": "TP Size",
+ "pp_size": "PP Size",
+ "dtype": "dtype",
"gpu_type": "GPU",
"completed": "# of req.",
+ "qps": "qps",
"max_concurrency": "# of max concurrency.",
"request_throughput": "Tput (req/s)",
"total_token_throughput": "Total Token Tput (tok/s)",
"output_throughput": "Output Tput (tok/s)",
- "total_input_tokens": "Total input tokens",
- "total_output_tokens": "Total output tokens",
+ # "total_input_tokens": "Total input tokens",
+ # "total_output_tokens": "Total output tokens",
"mean_ttft_ms": "Mean TTFT (ms)",
"median_ttft_ms": "Median TTFT (ms)",
"p99_ttft_ms": "P99 TTFT (ms)",
@@ -94,7 +104,104 @@ def get_size_with_unit(bytes, suffix="B"):
bytes /= factor
+def _coerce(val: str) -> Any:
+ """Best-effort type coercion from string to Python types."""
+ low = val.lower()
+ if low == "null":
+ return None
+ if low == "true":
+ return True
+ if low == "false":
+ return False
+ # integers
+ if re.fullmatch(r"[+-]?\d+", val):
+ try:
+ return int(val)
+ except ValueError:
+ pass
+ # floats (keep 'inf'/'-inf'/'nan' as strings)
+ if re.fullmatch(r"[+-]?\d*\.\d+", val):
+ try:
+ return float(val)
+ except ValueError:
+ pass
+ return val
+
+
+def parse_client_command(cmd: str) -> dict[str, Any]:
+ """Parse the client_command shell string into {executable, script, args}."""
+ toks = shlex.split(cmd)
+ if len(toks) < 2:
+ raise ValueError("client_command must include an executable and a script")
+ executable, script = toks[0], toks[1]
+ args: dict[str, Any] = {}
+
+ i = 2
+ while i < len(toks):
+ t = toks[i]
+ if t.startswith("--"):
+ # --key=value or --key (value) or boolean flag
+ if "=" in t:
+ key, val = t.split("=", 1)
+ if key == "--metadata":
+ md = {}
+ if val:
+ if "=" in val:
+ k, v = val.split("=", 1)
+ md[k] = _coerce(v)
+ else:
+ md[val] = True
+ args[key] = md
+ else:
+ args[key] = _coerce(val)
+ i += 1
+ continue
+
+ key = t
+
+ # Special: consume metadata k=v pairs until next --flag
+ if key == "--metadata":
+ i += 1
+ md = {}
+ while i < len(toks) and not toks[i].startswith("--"):
+ pair = toks[i]
+ if "=" in pair:
+ k, v = pair.split("=", 1)
+ md[k] = _coerce(v)
+ else:
+ md[pair] = True
+ i += 1
+ args[key] = md
+ continue
+
+ # Standard: check if next token is a value (not a flag)
+ if i + 1 < len(toks) and not toks[i + 1].startswith("--"):
+ args[key] = _coerce(toks[i + 1])
+ i += 2
+ else:
+ # lone flag -> True
+ args[key] = True
+ i += 1
+ else:
+ # unexpected positional; skip
+ i += 1
+
+ return {"executable": executable, "script": script, "args": args}
+
+
if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "-r",
+ "--result",
+ type=str,
+ default="results",
+ help="Folder name for benchmark output results.",
+ )
+ args = parser.parse_args()
+ results_folder = Path(args.result)
+ if not results_folder.exists():
+ raise FileNotFoundError(f"results folder does not exist: {results_folder}")
# collect results
for test_file in results_folder.glob("*.json"):
with open(test_file) as f:
@@ -102,7 +209,6 @@ if __name__ == "__main__":
if "serving" in str(test_file):
# this result is generated via `vllm bench serve` command
-
# attach the benchmarking command to raw_result
try:
with open(test_file.with_suffix(".commands")) as f:
@@ -110,12 +216,44 @@ if __name__ == "__main__":
except OSError as e:
print(e)
continue
+ # Parse Server Command Arg
+ out: dict[str, Any] = {
+ "server_command": parse_client_command(command["server_command"])
+ }
+ parse_args = [
+ "--tensor-parallel-size",
+ "--pipeline-parallel-size",
+ "--dtype",
+ ]
+ col_mapping = ["tp_size", "pp_size", "dtype"]
+ for index, arg in enumerate(parse_args):
+ if arg in out["server_command"]["args"]:
+ raw_result.update(
+ {col_mapping[index]: out["server_command"]["args"][arg]}
+ )
+ # Parse Client Command Arg
+ out: dict[str, Any] = {
+ "client_command": parse_client_command(command["client_command"])
+ }
+ parse_args = [
+ "--dataset-name",
+ "--random-input-len",
+ "--random-output-len",
+ "--request-rate",
+ ]
+ col_mapping = ["dataset_name", "input_len", "output_len", "qps"]
+
+ for index, arg in enumerate(parse_args):
+ if arg in out["client_command"]["args"]:
+ raw_result.update(
+ {col_mapping[index]: out["client_command"]["args"][arg]}
+ )
+ # Add Server, Client command
raw_result.update(command)
# update the test name of this result
raw_result.update({"test_name": test_file.stem})
-
# add the result to raw_result
serving_results.append(raw_result)
continue
@@ -205,7 +343,10 @@ if __name__ == "__main__":
columns=latency_column_mapping
)
if not serving_results.empty:
- serving_results = serving_results[list(serving_column_mapping.keys())].rename(
+ valid_columns = [
+ col for col in serving_column_mapping if col in serving_results.columns
+ ]
+ serving_results = serving_results[valid_columns].rename(
columns=serving_column_mapping
)
if not throughput_results.empty:
@@ -245,7 +386,9 @@ if __name__ == "__main__":
)
# document the result
- with open(results_folder / "benchmark_results.md", "w") as f:
+ md_file = "benchmark_results.md"
+ json_file = "benchmark_results.json"
+ with open(results_folder / md_file, "w") as f:
results = read_markdown(
"../.buildkite/nightly-benchmarks/"
+ "performance-benchmarks-descriptions.md"
@@ -260,7 +403,7 @@ if __name__ == "__main__":
f.write(results)
# document benchmarking results in json
- with open(results_folder / "benchmark_results.json", "w") as f:
+ with open(results_folder / json_file, "w") as f:
results = (
latency_results.to_dict(orient="records")
+ throughput_results.to_dict(orient="records")
diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
index 2c57666a81aa3..b1b7d2d77a44d 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -194,9 +194,11 @@ run_latency_tests() {
# check if there is enough GPU to run the test
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
- if [ "$ON_CPU" == "1" ];then
- if [[ $numa_count -lt $tp ]]; then
- echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
+ if [ "$ON_CPU" == "1" ]; then
+ pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
+ world_size=$(($tp*$pp))
+ if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
+ echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue
fi
else
@@ -261,9 +263,11 @@ run_throughput_tests() {
# check if there is enough GPU to run the test
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
- if [ "$ON_CPU" == "1" ];then
- if [[ $numa_count -lt $tp ]]; then
- echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
+ if [ "$ON_CPU" == "1" ]; then
+ pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
+ world_size=$(($tp*$pp))
+ if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
+ echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue
fi
else
@@ -329,12 +333,21 @@ run_serving_tests() {
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
+ max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
+ if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
+ num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
+ max_concurrency_list="[$num_prompts]"
+ fi
+ max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
+ echo "Running over max concurrency list $max_concurrency_list"
# check if there is enough resources to run the test
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
- if [ "$ON_CPU" == "1" ];then
- if [[ $numa_count -lt $tp ]]; then
- echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
+ if [ "$ON_CPU" == "1" ]; then
+ pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
+ world_size=$(($tp*$pp))
+ if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
+ echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue
fi
else
@@ -390,35 +403,39 @@ run_serving_tests() {
echo "now qps is $qps"
fi
- new_test_name=$test_name"_qps_"$qps
+ # iterate over different max_concurrency
+ for max_concurrency in $max_concurrency_list; do
+ new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
+ echo " new test name $new_test_name"
+ # pass the tensor parallel size to the client so that it can be displayed
+ # on the benchmark dashboard
+ client_command="vllm bench serve \
+ --save-result \
+ --result-dir $RESULTS_FOLDER \
+ --result-filename ${new_test_name}.json \
+ --request-rate $qps \
+ --max-concurrency $max_concurrency \
+ --metadata "tensor_parallel_size=$tp" \
+ $client_args $client_remote_args "
- # pass the tensor parallel size to the client so that it can be displayed
- # on the benchmark dashboard
- client_command="vllm bench serve \
- --save-result \
- --result-dir $RESULTS_FOLDER \
- --result-filename ${new_test_name}.json \
- --request-rate $qps \
- --metadata "tensor_parallel_size=$tp" \
- $client_args $client_remote_args "
+ echo "Running test case $test_name with qps $qps"
+ echo "Client command: $client_command"
- echo "Running test case $test_name with qps $qps"
- echo "Client command: $client_command"
+ bash -c "$client_command"
- bash -c "$client_command"
-
- # record the benchmarking commands
- jq_output=$(jq -n \
- --arg server "$server_command" \
- --arg client "$client_command" \
- --arg gpu "$gpu_type" \
- '{
- server_command: $server,
- client_command: $client,
- gpu_type: $gpu
- }')
- echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+ # record the benchmarking commands
+ jq_output=$(jq -n \
+ --arg server "$server_command" \
+ --arg client "$client_command" \
+ --arg gpu "$gpu_type" \
+ '{
+ server_command: $server,
+ client_command: $client,
+ gpu_type: $gpu
+ }')
+ echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+ done
done
# clean up
diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
index da93fdd1dbac1..569117aae852d 100644
--- a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
@@ -6,7 +6,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"num_iters_warmup": 5,
@@ -20,7 +20,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"num_iters_warmup": 5,
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
index dd0e24edff98d..2d88a0b30c4f8 100644
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
@@ -1,7 +1,8 @@
[
{
"test_name": "serving_llama8B_tp1_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -10,7 +11,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -23,17 +24,17 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -42,7 +43,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -55,17 +56,17 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp4_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -74,7 +75,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -87,17 +88,17 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp1_random_128_128",
- "qps_list": [1, 4, 16, "inf"],
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -106,7 +107,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -120,19 +121,19 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
- "max_concurrency": 1000,
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_tp2_random_128_128",
- "qps_list": [1, 4, 16, "inf"],
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -141,7 +142,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -155,19 +156,19 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
- "max_concurrency": 1000,
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_tp4_random_128_128",
- "qps_list": [1, 4, 16, "inf"],
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -176,7 +177,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -190,13 +191,11 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
- "ignore-eos": "",
- "max_concurrency": 1000,
"num_prompts": 1000
}
}
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
index f1bda65a7590b..823abbaa99f86 100644
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
@@ -1,7 +1,8 @@
[
{
"test_name": "serving_llama8B_pp1_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -10,7 +11,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -23,17 +24,17 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_pp3_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -42,7 +43,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -55,17 +56,17 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "max_concurrency": 60,
"num_prompts": 200
}
},
{
- "test_name": "serving_llama8B_tp2pp6_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
+ "test_name": "serving_llama8B_tp2pp3_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -74,7 +75,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
@@ -88,17 +89,17 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_pp1_random_128_128",
- "qps_list": [1, 4, 16, "inf"],
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -107,7 +108,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -121,28 +122,28 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
- "max_concurrency": 1000,
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_pp3_random_128_128",
- "qps_list": [1, 4, 16, "inf"],
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL:": 1,
+ "VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -156,19 +157,19 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
- "max_concurrency": 1000,
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_tp2pp3_random_128_128",
- "qps_list": [1, 4, 16, "inf"],
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -177,7 +178,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
@@ -192,13 +193,12 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
- "max_concurrency": 1000,
"num_prompts": 1000
}
}
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json
index f150b9abeea45..e21c8df0a9fe9 100644
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json
@@ -2,6 +2,7 @@
{
"test_name": "serving_llama8B_tp1_sharegpt",
"qps_list": [1, 4, 16, "inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -10,7 +11,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -23,17 +24,17 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"qps_list": [1, 4, 16, "inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -42,7 +43,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -55,17 +56,17 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp4_sharegpt",
"qps_list": [1, 4, 16, "inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -74,7 +75,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -87,17 +88,17 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp4_random_1024_128",
"qps_list": [1, 4, 16, "inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -106,7 +107,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -120,19 +121,19 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 1024,
"random-output-len": 128,
"ignore-eos": "",
- "max_concurrency": 100,
"num_prompts": 100
}
},
{
"test_name": "serving_llama8B_pp6_random_1024_128",
"qps_list": [1, 4, 16, "inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -141,7 +142,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 6,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -155,13 +156,12 @@
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 1024,
"random-output-len": 128,
"ignore-eos": "",
- "max_concurrency": 100,
"num_prompts": 100
}
}
diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json
index f159c30637d34..48c015aa8403b 100644
--- a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json
@@ -6,7 +6,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@@ -21,7 +21,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index d4aceab4472fa..1b30c1292df85 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -1,11 +1,5 @@
-# Essential Elements of an Effective PR Description Checklist
-
-- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
-- [ ] The test plan, such as providing test command.
-- [ ] The test results, such as pasting the results comparison before and after, or e2e results
-- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
-
-PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED.
+
+PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED.
## Purpose
@@ -15,4 +9,14 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE B
## (Optional) Documentation Update
+---
+
+ Essential Elements of an Effective PR Description Checklist
+
+- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
+- [ ] The test plan, such as providing test command.
+- [ ] The test results, such as pasting the results comparison before and after, or e2e results
+- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
+
+
**BEFORE SUBMITTING, PLEASE READ ** (anything written below this line will be removed by GitHub Actions)
diff --git a/.github/scripts/cleanup_pr_body.sh b/.github/scripts/cleanup_pr_body.sh
index 8d65936fba1d8..25af344aab2be 100755
--- a/.github/scripts/cleanup_pr_body.sh
+++ b/.github/scripts/cleanup_pr_body.sh
@@ -15,11 +15,11 @@ NEW=/tmp/new_pr_body.txt
gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
cp "${OLD}" "${NEW}"
-# Remove "FIX #xxxx (*link existing issues this PR will resolve*)"
-sed -i '/FIX #xxxx.*$/d' "${NEW}"
+# Remove markdown comments (like the <!-- markdown --> at the start)
+sed -i '/<!--.*-->$/d' "${NEW}"
-# Remove "FILL IN THE PR DESCRIPTION HERE"
-sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}"
+# Remove "PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED."
+sed -i '/PLEASE FILL IN THE PR DESCRIPTION HERE.*$/d' "${NEW}"
# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
diff --git a/benchmarks/benchmark_block_pool.py b/benchmarks/benchmark_block_pool.py
new file mode 100644
index 0000000000000..fd363c2ad0514
--- /dev/null
+++ b/benchmarks/benchmark_block_pool.py
@@ -0,0 +1,74 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import gc
+
+from tabulate import tabulate
+
+from benchmark_utils import TimeCollector
+from vllm.utils import FlexibleArgumentParser
+from vllm.v1.core.block_pool import BlockPool
+
+
+def main(args):
+ rows = []
+ for allocate_block in args.allocate_blocks:
+ # Enforce a GC collect ahead to minimize the impact among runs
+ gc.collect()
+ block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True)
+
+ get_blocks_times = TimeCollector(TimeCollector.US)
+ free_blocks_times = TimeCollector(TimeCollector.US)
+ for _ in range(args.num_iteration):
+ with get_blocks_times:
+ blocks = block_pool.get_new_blocks(allocate_block)
+ with free_blocks_times:
+ block_pool.free_blocks(blocks)
+
+ rows.append(
+ [get_blocks_times.cnt, args.num_gpu_blocks, allocate_block]
+ + get_blocks_times.dump_avg_max()
+ + free_blocks_times.dump_avg_max()
+ )
+
+ print(
+ tabulate(
+ rows,
+ headers=[
+ "Iterations",
+ "Total\nBlocks",
+ "Allocated\nBlocks",
+ "Get Blocks\nAvg (us)",
+ "Get Blocks\nMax (us)",
+ "Free Blocks\nAvg (us)",
+ "Free Blocks\nMax (us)",
+ ],
+ tablefmt="grid",
+ floatfmt=".3f",
+ )
+ )
+
+
+def invoke_main() -> None:
+ parser = FlexibleArgumentParser(
+ description="Benchmark the performance of BlockPool for KV Cache."
+ )
+ parser.add_argument("--num-gpu-blocks", type=int, default=100000)
+ parser.add_argument(
+ "--num-iteration",
+ type=int,
+ default=1000,
+ help="Number of iterations to run to stablize final data readings",
+ )
+ parser.add_argument(
+ "--allocate-blocks",
+ type=int,
+ nargs="*",
+ default=[10, 50, 100, 500, 1000],
+ help="Number of blocks to allocate",
+ )
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == "__main__":
+ invoke_main() # pragma: no cover
diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py
new file mode 100644
index 0000000000000..c60040d05ab7a
--- /dev/null
+++ b/benchmarks/benchmark_ngram_proposer.py
@@ -0,0 +1,112 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import gc
+
+import numpy as np
+from tabulate import tabulate
+
+from benchmark_utils import TimeCollector
+from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
+from vllm.utils import FlexibleArgumentParser
+from vllm.v1.spec_decode.ngram_proposer import NgramProposer
+
+
+def main(args):
+ rows = []
+ for max_ngram in args.max_ngram:
+ collector = TimeCollector(TimeCollector.US)
+
+ model_config = ModelConfig(
+ model="facebook/opt-125m",
+ task="generate",
+ max_model_len=args.num_token + args.num_spec_token,
+ tokenizer="facebook/opt-125m",
+ tokenizer_mode="auto",
+ dtype="auto",
+ seed=None,
+ trust_remote_code=False,
+ )
+ proposer = NgramProposer(
+ vllm_config=VllmConfig(
+ model_config=model_config,
+ speculative_config=SpeculativeConfig(
+ prompt_lookup_min=args.min_ngram,
+ prompt_lookup_max=max_ngram,
+ num_speculative_tokens=args.num_spec_token,
+ method="ngram",
+ ),
+ )
+ )
+
+ # Warm up
+ proposer.propose(np.random.randint(0, 20, (args.num_token,)))
+
+ gc.collect()
+ for _ in range(args.num_iteration):
+ tokens = np.random.randint(0, 20, (args.num_req, args.num_token))
+ with collector:
+ for i in range(args.num_req):
+ proposer.propose(tokens[i, :])
+ rows.append(
+ [args.num_req, args.num_token, args.min_ngram, max_ngram]
+ + collector.dump_avg_max()
+ )
+
+ print(
+ tabulate(
+ rows,
+ headers=[
+ "# Request",
+ "# Token",
+ "Min Ngram",
+ "Max Ngram",
+ "Avg (us)",
+ "Max (us)",
+ ],
+ tablefmt="grid",
+ floatfmt=".3f",
+ )
+ )
+
+
+def invoke_main() -> None:
+ parser = FlexibleArgumentParser(
+ description="Benchmark the performance of N-gram speculative decode drafting"
+ )
+ parser.add_argument(
+ "--num-iteration",
+ type=int,
+ default=100,
+ help="Number of iterations to run to stablize final data readings",
+ )
+ parser.add_argument(
+ "--num-req", type=int, default=128, help="Number of requests in the batch"
+ )
+ parser.add_argument(
+ "--num-token", type=int, default=1500, help="Number of tokens for each request"
+ )
+ parser.add_argument(
+ "--min-ngram",
+ type=int,
+ default=3,
+ help="Minimum n-gram to match",
+ )
+ parser.add_argument(
+ "--max-ngram",
+ type=int,
+ nargs="*",
+ default=[5, 7, 10, 15, 20],
+ help="Maximum n-gram to match",
+ )
+ parser.add_argument(
+ "--num-spec-token",
+ type=int,
+ default=3,
+ help="Number of speculative tokens to generate",
+ )
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == "__main__":
+ invoke_main() # pragma: no cover
diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py
index 283f938df50af..98624abdf49fb 100644
--- a/benchmarks/benchmark_utils.py
+++ b/benchmarks/benchmark_utils.py
@@ -1,11 +1,12 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
import argparse
import json
import math
import os
-from typing import Any
+import time
+from types import TracebackType
+from typing import Any, Optional, Union
def convert_to_pytorch_benchmark_format(
@@ -72,3 +73,53 @@ def write_to_json(filename: str, records: list) -> None:
cls=InfEncoder,
default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
)
+
+
+# Collect time and generate time metrics
+#
+# Example Usage:
+# collector = TimeCollector(TimeCollector.US)
+# for _ in range(total_iteration):
+# with collector:
+# ...
+# collector.dump_avg_max()
+class TimeCollector:
+ NS: int = 1
+ US: int = NS * 1000
+ MS: int = US * 1000
+ S: int = MS * 1000
+
+ def __init__(self, scale: int) -> None:
+ self.cnt: int = 0
+ self._sum: int = 0
+ self._max: Optional[int] = None
+ self.scale = scale
+ self.start_time: int = time.monotonic_ns()
+
+ def collect(self, v: int) -> None:
+ self.cnt += 1
+ self._sum += v
+ if self._max is None:
+ self._max = v
+ else:
+ self._max = max(self._max, v)
+
+ def avg(self) -> Union[float, str]:
+ return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
+
+ def max(self) -> Union[float, str]:
+ return self._max / self.scale if self._max else "N/A"
+
+ def dump_avg_max(self) -> list[Union[float, str]]:
+ return [self.avg(), self.max()]
+
+ def __enter__(self) -> None:
+ self.start_time = time.monotonic_ns()
+
+ def __exit__(
+ self,
+ exc_type: Optional[type[BaseException]],
+ exc_value: Optional[BaseException],
+ exc_traceback: Optional[TracebackType],
+ ) -> None:
+ self.collect(time.monotonic_ns() - self.start_time)
diff --git a/benchmarks/kv_cache/benchmark_block_pool.py b/benchmarks/kv_cache/benchmark_block_pool.py
deleted file mode 100644
index 134551bb61285..0000000000000
--- a/benchmarks/kv_cache/benchmark_block_pool.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import gc
-import time
-from typing import Optional
-
-from tabulate import tabulate
-
-from vllm.utils import FlexibleArgumentParser
-from vllm.v1.core.block_pool import BlockPool
-
-
-class Metric:
- def __init__(self) -> None:
- self.cnt: int = 0
- self.sum_v: int = 0
- self.max_v: Optional[int] = None
-
- def update(self, v: int) -> None:
- self.cnt += 1
- self.sum_v += v
- if self.max_v is None:
- self.max_v = v
- else:
- self.max_v = max(self.max_v, v)
-
- def avg_v(self) -> float:
- return self.sum_v * 1.0 / self.cnt
-
-
-def main(args):
- rows = []
- for allocate_block in args.allocate_blocks:
- # Enforce a GC collect ahead to minimize the impact among runs
- gc.collect()
- block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True)
-
- get_blocks_metric: Metric = Metric()
- free_blocks_metric: Metric = Metric()
- for _ in range(args.num_iteration):
- t1 = time.monotonic_ns()
- blocks = block_pool.get_new_blocks(allocate_block)
- t2 = time.monotonic_ns()
- block_pool.free_blocks(blocks)
- t3 = time.monotonic_ns()
- get_blocks_metric.update(t2 - t1)
- free_blocks_metric.update(t3 - t2)
-
- if get_blocks_metric.max_v is not None and free_blocks_metric.max_v is not None:
- rows.append(
- [
- get_blocks_metric.cnt,
- args.num_gpu_blocks,
- allocate_block,
- get_blocks_metric.avg_v() / 1000000,
- get_blocks_metric.max_v / 1000000.0,
- free_blocks_metric.avg_v() / 1000000,
- free_blocks_metric.max_v / 1000000.0,
- ]
- )
- else:
- print(
- "No valid metrics found."
- f" {get_blocks_metric.max_v=} {free_blocks_metric.max_v=}"
- )
-
- print(
- tabulate(
- rows,
- headers=[
- "Iterations",
- "Total\nBlocks",
- "Allocated\nBlocks",
- "Get Blocks\nAvg (ms)",
- "Get Blocks\nMax (ms)",
- "Free Blocks\nAvg (ms)",
- "Free Blocks\nMax (ms)",
- ],
- tablefmt="grid",
- floatfmt=".6f",
- )
- )
-
-
-def invoke_main() -> None:
- parser = FlexibleArgumentParser(
- description="Benchmark the performance of BlockPool for KV Cache."
- )
- parser.add_argument("--num-gpu-blocks", type=int, default=100000)
- parser.add_argument(
- "--num-iteration",
- type=int,
- default=1000,
- help="Number of iterations to run to stablize final data readings",
- )
- parser.add_argument(
- "--allocate-blocks",
- type=int,
- nargs="*",
- default=[10, 50, 100, 500, 1000],
- help="Number of blocks to allocate",
- )
- args = parser.parse_args()
- main(args)
-
-
-if __name__ == "__main__":
- invoke_main() # pragma: no cover
diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu
index 946c137db6366..99c52ef17d08b 100644
--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
@@ -423,12 +423,27 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert);
}
+#ifndef USE_ROCM
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
- static_assert(WARP_SIZE == 32 || WARP_SIZE == 64, \
- "Unsupported warp size. Only 32 and 64 are supported."); \
+ static_assert(WARP_SIZE == 32, \
+ "Unsupported warp size. Only 32 is supported for CUDA"); \
topkGatingSoftmaxLauncherHelper( \
gating_output, nullptr, topk_weights, topk_indices, \
token_expert_indices, num_tokens, topk, 0, num_experts, stream);
+#else
+#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
+ if (WARP_SIZE == 64) { \
+ topkGatingSoftmaxLauncherHelper( \
+ gating_output, nullptr, topk_weights, topk_indices, \
+ token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
+ } else if (WARP_SIZE == 32) { \
+ topkGatingSoftmaxLauncherHelper( \
+ gating_output, nullptr, topk_weights, topk_indices, \
+ token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
+ } else { \
+ assert(false && "Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
+ }
+#endif
template
void topkGatingSoftmaxKernelLauncher(
@@ -443,7 +458,9 @@ void topkGatingSoftmaxKernelLauncher(
cudaStream_t stream) {
static constexpr int WARPS_PER_TB = 4;
static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16;
+#ifndef USE_ROCM
static constexpr int BYTES_PER_LDG_MULTIPLE_64 = 8;
+#endif
switch (num_experts) {
case 1:
LAUNCH_SOFTMAX(1, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md
index 0ebd99ba5ae12..2bbed778f3c6a 100644
--- a/docs/contributing/benchmarks.md
+++ b/docs/contributing/benchmarks.md
@@ -11,7 +11,7 @@ vLLM contains two sets of benchmarks:
The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM.
-The latest performance results are hosted on the public [vLLM Performance Dashboard](https://perf.vllm.ai).
+The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
More information on the performance benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
diff --git a/docs/getting_started/installation/README.md b/docs/getting_started/installation/README.md
index f6ecceb85d862..0ee680f5c688c 100644
--- a/docs/getting_started/installation/README.md
+++ b/docs/getting_started/installation/README.md
@@ -18,7 +18,7 @@ vLLM supports the following hardware platforms:
## Hardware Plugins
The backends below live **outside** the main `vllm` repository and follow the
-[Hardware-Pluggable RFC](../design/plugin_system.md).
+[Hardware-Pluggable RFC](../../design/plugin_system.md).
| Accelerator | PyPI / package | Repository |
|-------------|----------------|------------|
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index dbbbc5122b803..a24fa4bcce333 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -615,7 +615,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ |
| `GLM4VForCausalLM`^ | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + IE+ + VE+ | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V`, etc. | | ✅︎ | ✅︎ |
+| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
| `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
diff --git a/tests/entrypoints/openai/test_default_mm_loras.py b/tests/entrypoints/openai/test_default_mm_loras.py
index 1fc87c8b42a7a..372e9b1fecd42 100644
--- a/tests/entrypoints/openai/test_default_mm_loras.py
+++ b/tests/entrypoints/openai/test_default_mm_loras.py
@@ -24,18 +24,7 @@ ACTIVE_MM_LORA_RESPONSE = "Spoken text: The first words I spoke in the original
@pytest.fixture(scope="module")
-def monkeypatch_module():
- from _pytest.monkeypatch import MonkeyPatch
- mpatch = MonkeyPatch()
- yield mpatch
- mpatch.undo()
-
-
-@pytest.fixture(scope="module", params=[False, True])
-def multimodal_server(request, monkeypatch_module): # noqa: F811
-
- use_v1 = request.param
- monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
+def multimodal_server(): # noqa: F811
args = [
# use half precision for speed and memory savings in CI environment
diff --git a/tests/entrypoints/openai/test_openai_schema.py b/tests/entrypoints/openai/test_openai_schema.py
index 771119d04ea31..246bd014aa690 100644
--- a/tests/entrypoints/openai/test_openai_schema.py
+++ b/tests/entrypoints/openai/test_openai_schema.py
@@ -54,38 +54,54 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
op = context.operation
assert op is not None
- def no_file_type(case: schemathesis.models.Case):
+ def no_invalid_types(case: schemathesis.models.Case):
"""
- This filter skips test cases for the `POST /tokenize` endpoint where the
- HTTP request body uses `"type": "file"` in any message's content.
- We expect these cases to fail because that type isn't implemented here
- https://github.com/vllm-project/vllm/blob/0b34593017953051b3225b1483ce0f4670e3eb0e/vllm/entrypoints/chat_utils.py#L1038-L1095
+ This filter skips test cases with invalid data that schemathesis
+ incorrectly generates due to permissive schema configurations.
+
+ 1. Skips `POST /tokenize` endpoint cases with `"type": "file"` in
+ message content, which isn't implemented.
+
+ 2. Skips tool_calls with `"type": "custom"` which schemathesis
+ incorrectly generates instead of the valid `"type": "function"`.
Example test cases that are skipped:
curl -X POST -H 'Content-Type: application/json' \
- -d '{"messages": [{"role": "assistant"}, {"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
+ -d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
http://localhost:8000/tokenize
curl -X POST -H 'Content-Type: application/json' \
- -d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
- http://localhost:8000/tokenize
+ -d '{"messages": [{"role": "assistant", "tool_calls": [{"custom": {"input": "", "name": ""}, "id": "", "type": "custom"}]}]}' \
+ http://localhost:8000/v1/chat/completions
""" # noqa: E501
- if (op.method.lower() == "post" and op.path == "/tokenize"
- and hasattr(case, "body") and isinstance(case.body, dict)
+ if (hasattr(case, "body") and isinstance(case.body, dict)
and "messages" in case.body
and isinstance(case.body["messages"], list)
and len(case.body["messages"]) > 0):
+
for message in case.body["messages"]:
if not isinstance(message, dict):
continue
- content = message.get("content", [])
- if not isinstance(content, list) or len(content) == 0:
- continue
- if any(item.get("type") == "file" for item in content):
- return False
+
+ # Check for invalid file type in tokenize endpoint
+ if op.method.lower() == "post" and op.path == "/tokenize":
+ content = message.get("content", [])
+ if (isinstance(content, list) and len(content) > 0 and any(
+ item.get("type") == "file" for item in content)):
+ return False
+
+ # Check for invalid tool_calls with non-function types
+ tool_calls = message.get("tool_calls", [])
+ if isinstance(tool_calls, list):
+ for tool_call in tool_calls:
+ if isinstance(tool_call, dict):
+ if tool_call.get("type") != "function":
+ return False
+ if "custom" in tool_call:
+ return False
return True
- return strategy.filter(no_file_type)
+ return strategy.filter(no_invalid_types)
@schema.parametrize()
diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py
index 28fd02171b954..e103bd206b54c 100644
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@@ -80,9 +80,6 @@ async def test_bad_requests(mary_had_lamb):
async def test_long_audio_request(mary_had_lamb, model_name):
server_args = ["--enforce-eager"]
- if model_name.startswith("openai"):
- return
-
mary_had_lamb.seek(0)
audio, sr = librosa.load(mary_had_lamb)
# Add small silence after each audio for repeatability in the split process
diff --git a/tests/models/language/pooling/test_intfloat.py b/tests/models/language/pooling/test_intfloat.py
index e48bdbe940be7..6cae53a660ad8 100644
--- a/tests/models/language/pooling/test_intfloat.py
+++ b/tests/models/language/pooling/test_intfloat.py
@@ -36,7 +36,7 @@ MODELS = [
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
- mteb_test_embed_models(hf_runner, vllm_runner, model_info)
+ mteb_test_embed_models(hf_runner, vllm_runner, model_info, atol=0.02)
@pytest.mark.parametrize("model_info", MODELS)
diff --git a/tests/models/language/pooling/test_snowflake_arctic_embed.py b/tests/models/language/pooling/test_snowflake_arctic_embed.py
index 585fa0e683da2..c22c78592e535 100644
--- a/tests/models/language/pooling/test_snowflake_arctic_embed.py
+++ b/tests/models/language/pooling/test_snowflake_arctic_embed.py
@@ -46,7 +46,7 @@ MODELS = [
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
- mteb_test_embed_models(hf_runner, vllm_runner, model_info)
+ mteb_test_embed_models(hf_runner, vllm_runner, model_info, atol=0.02)
@pytest.mark.parametrize("model_info", MODELS)
diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py
index 2a65d7e244d71..2919bdbe91bbd 100644
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -561,7 +561,7 @@ VLM_TEST_SETTINGS = {
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
- # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
+ # FIXME: https://huggingface.co/openbmb/MiniCPM-o-2_6/discussions/49
marks=[pytest.mark.skip("HF import fails")],
),
"minicpmv_26": VLMTestInfo(
@@ -574,8 +574,6 @@ VLM_TEST_SETTINGS = {
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
- # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
- marks=[pytest.mark.skip("HF import fails")],
),
"minimax_vl_01": VLMTestInfo(
models=["MiniMaxAI/MiniMax-VL-01"],
@@ -611,18 +609,6 @@ VLM_TEST_SETTINGS = {
patch_hf_runner=model_utils.ovis_patch_hf_runner,
marks=[large_gpu_mark(min_gb=32)],
),
- "ovis1_6": VLMTestInfo(
- models=["AIDC-AI/Ovis1.6-Llama3.2-3B"],
- test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
- prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful and honest multimodal assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
- img_idx_to_prompt=lambda idx: "\n", # noqa: E501
- max_model_len=4096,
- max_num_seqs=2,
- dtype="half",
- # use sdpa mode for hf runner since ovis2 didn't work with flash_attn
- hf_model_kwargs={"llm_attn_implementation": "sdpa"},
- patch_hf_runner=model_utils.ovis_patch_hf_runner,
- ),
"ovis2": VLMTestInfo(
models=["AIDC-AI/Ovis2-1B"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
diff --git a/tests/models/multimodal/test_tensor_schema.py b/tests/models/multimodal/test_tensor_schema.py
index a4cb1a68833a5..92390d8c2f7ee 100644
--- a/tests/models/multimodal/test_tensor_schema.py
+++ b/tests/models/multimodal/test_tensor_schema.py
@@ -153,4 +153,4 @@ def test_model_tensor_schema(model_arch: str, vllm_runner: type[VllmRunner],
if hasattr(model, method_name):
getattr(model, method_name)(**mm_kwargs)
- vllm_model.apply_model(validate_model_input)
+ vllm_model.apply_model(validate_model_input)
\ No newline at end of file
diff --git a/tests/models/registry.py b/tests/models/registry.py
index d7d20d1f3abf7..eb48c0f6a7738 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -195,7 +195,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2",
{"alias": "gpt2"}),
"GPTBigCodeForCausalLM": _HfExamplesInfo("bigcode/starcoder",
- {"tiny": "bigcode/tiny_starcoder_py"}), # noqa: E501
+ extras={"tiny": "bigcode/tiny_starcoder_py"}, # noqa: E501
+ min_transformers_version="4.55.1"),
"GPTJForCausalLM": _HfExamplesInfo("Milos/slovak-gpt-j-405M",
{"6b": "EleutherAI/gpt-j-6b"}),
"GPTNeoXForCausalLM": _HfExamplesInfo("EleutherAI/pythia-70m",
diff --git a/tests/v1/e2e/test_kv_sharing_fast_prefill.py b/tests/v1/e2e/test_kv_sharing_fast_prefill.py
index f5a7b9cc276b3..d72e50e5196b8 100644
--- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py
+++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py
@@ -11,7 +11,8 @@ from vllm import LLM, SamplingParams
from vllm.config import CompilationConfig, CompilationLevel
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.forward_context import get_forward_context
-from vllm.model_executor.models.gemma3n import Gemma3nForConditionalGeneration
+from vllm.model_executor.models.gemma3n_mm import (
+ Gemma3nForConditionalGeneration)
from vllm.model_executor.models.registry import ModelRegistry
from vllm.model_executor.models.utils import extract_layer_index
from vllm.sequence import IntermediateTensors
@@ -32,12 +33,13 @@ class TestGemma3nForConditionalGeneration(Gemma3nForConditionalGeneration):
inputs_embeds: Optional[torch.Tensor] = None,
**kwargs,
) -> Union[torch.Tensor, IntermediateTensors]:
- hidden_states = self.model(input_ids, positions, intermediate_tensors,
- inputs_embeds, **kwargs)
+ hidden_states = super().forward(input_ids, positions,
+ intermediate_tensors, inputs_embeds,
+ **kwargs)
attn_metadata = get_forward_context().attn_metadata
# attn_metadata is None during dummy runs
if (attn_metadata is not None
- and self.cache_config.kv_sharing_fast_prefill):
+ and self.language_model.cache_config.kv_sharing_fast_prefill):
assert isinstance(attn_metadata, dict) # true in V1
# Gemma3n-E2B has 30 layers, with last 20 layers being
# cross-decoder layers. Check attention metadata is correct
@@ -52,7 +54,7 @@ class TestGemma3nForConditionalGeneration(Gemma3nForConditionalGeneration):
# Last layer will be a KV sharing layer
layer_attn_metadata = attn_metadata[
- self.model.language_model.layers[-1].self_attn.attn.layer_name]
+ self.language_model.model.layers[-1].self_attn.attn.layer_name]
logits_indices_padded = (layer_attn_metadata.logits_indices_padded)
assert logits_indices_padded is not None
num_logits_indices = layer_attn_metadata.num_logits_indices
diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index 599916c0d1cfb..dde95fbe590b3 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -146,7 +146,11 @@ def test_ngram_correctness(
marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
],
ids=[
- "qwen3_eagle3", "llama3_eagle", "llama3_eagle3", "llama4_eagle",
+ # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611 # noqa: E501
+ # "qwen3_eagle3",
+ "llama3_eagle",
+ "llama3_eagle3",
+ "llama4_eagle",
"llama4_eagle_mm"
])
@pytest.mark.parametrize("attn_backend",
diff --git a/tests/v1/spec_decode/test_ngram.py b/tests/v1/spec_decode/test_ngram.py
index b7303e0443d32..4193f4041b32b 100644
--- a/tests/v1/spec_decode/test_ngram.py
+++ b/tests/v1/spec_decode/test_ngram.py
@@ -1,43 +1,63 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
import numpy as np
from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
-from vllm.v1.spec_decode.ngram_proposer import (NgramProposer,
- _find_subarray_kmp,
- _kmp_lps_array)
+from vllm.v1.spec_decode.ngram_proposer import (
+ NgramProposer, _find_longest_matched_ngram_and_propose_tokens)
-def test_kmp_lps_array():
- np.testing.assert_array_equal(_kmp_lps_array(np.array([])), np.array([]))
- np.testing.assert_array_equal(_kmp_lps_array(np.array([1])), np.array([0]))
- np.testing.assert_array_equal(_kmp_lps_array(np.array([1, 1, 1])),
- np.array([0, 1, 2]))
- np.testing.assert_array_equal(_kmp_lps_array(np.array([1, 2, 3, 4])),
- np.array([0, 0, 0, 0]))
- np.testing.assert_array_equal(_kmp_lps_array(np.array([1, 2, 1, 2, 3])),
- np.array([0, 0, 1, 2, 0]))
+def test_find_longest_matched_ngram_and_propose_tokens():
+ tokens = np.array([1, 2, 3, 4, 1, 2, 3, 5, 6])
+ assert _find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens,
+ min_ngram=2,
+ max_ngram=2,
+ max_model_len=1024,
+ k=2) is None
+ tokens = np.array([1, 2, 3, 4, 1, 2, 3])
+ np.testing.assert_array_equal(
+ _find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens,
+ min_ngram=2,
+ max_ngram=2,
+ max_model_len=1024,
+ k=3),
+ np.array([4, 1, 2]))
+ np.testing.assert_array_equal(
+ _find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens,
+ min_ngram=2,
+ max_ngram=2,
+ max_model_len=1024,
+ k=2), np.array([4, 1]))
+ np.testing.assert_array_equal(
+ _find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens,
+ min_ngram=1,
+ max_ngram=1,
+ max_model_len=1024,
+ k=3),
+ np.array([4, 1, 2]))
+ np.testing.assert_array_equal(
+ _find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens,
+ min_ngram=1,
+ max_ngram=1,
+ max_model_len=1024,
+ k=2), np.array([4, 1]))
-def test_find_subarray_kmp():
- X = np.array([1, 2, 3, 4, 1, 2, 3, 5, 6])
- assert _find_subarray_kmp(X, 2, 2) is None
- X = np.array([1, 2, 3, 4, 1, 2, 3])
- np.testing.assert_array_equal(_find_subarray_kmp(X, 2, 3),
- np.array([4, 1, 2]))
- np.testing.assert_array_equal(_find_subarray_kmp(X, 2, 2), np.array([4,
- 1]))
- np.testing.assert_array_equal(_find_subarray_kmp(X, 1, 3),
- np.array([4, 1, 2]))
- np.testing.assert_array_equal(_find_subarray_kmp(X, 1, 2), np.array([4,
- 1]))
- X = np.array([1, 3, 6, 2, 3, 4, 1, 2, 3])
- np.testing.assert_array_equal(_find_subarray_kmp(X, 2, 3),
- np.array([4, 1, 2]))
+ tokens = np.array([1, 3, 6, 2, 3, 4, 1, 2, 3])
+ np.testing.assert_array_equal(
+ _find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens,
+ min_ngram=2,
+ max_ngram=2,
+ max_model_len=1024,
+ k=3),
+ np.array([4, 1, 2]))
# Return on the first match
- np.testing.assert_array_equal(_find_subarray_kmp(X, 1, 3),
- np.array([6, 2, 3]))
+ np.testing.assert_array_equal(
+ _find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens,
+ min_ngram=1,
+ max_ngram=1,
+ max_model_len=1024,
+ k=2), np.array([6, 2]))
def test_ngram_proposer():
@@ -56,27 +76,35 @@ def test_ngram_proposer():
# No match.
result = ngram_proposer(
- 2, 2, 2).propose(context_token_ids=np.array([1, 2, 3, 4, 5]))
+ min_n=2, max_n=2,
+ k=2).propose(context_token_ids=np.array([1, 2, 3, 4, 5]))
assert result is None
# No match for 4-gram.
result = ngram_proposer(
- 4, 4, 2).propose(context_token_ids=np.array([1, 2, 3, 4, 1, 2, 3]))
+ min_n=4, max_n=4,
+ k=2).propose(context_token_ids=np.array([1, 2, 3, 4, 1, 2, 3]))
assert result is None
# No match for 4-gram but match for 3-gram.
result = ngram_proposer(
- 3, 4, 2).propose(context_token_ids=np.array([1, 2, 3, 4, 1, 2, 3]))
+ min_n=3, max_n=4,
+ k=2).propose(context_token_ids=np.array([1, 2, 3, 4, 1, 2, 3]))
assert np.array_equal(result, np.array([4, 1]))
# Match for both 4-gram and 3-gram.
# In this case, the proposer should return the 4-gram match.
- result = ngram_proposer(3, 4, 2).propose(
+ result = ngram_proposer(min_n=3, max_n=4, k=2).propose(
context_token_ids=np.array([2, 3, 4, 5, 1, 2, 3, 4, 1, 2, 3, 4]))
assert np.array_equal(result, np.array([1, 2])) # Not [5, 1]
# Match for 2-gram and 3-gram, but not 4-gram.
- result = ngram_proposer(
- 2, 4,
- 2).propose(context_token_ids=np.array([3, 4, 5, 2, 3, 4, 1, 2, 3, 4]))
+ result = ngram_proposer(min_n=2, max_n=4, k=2).propose(
+ context_token_ids=np.array([3, 4, 5, 2, 3, 4, 1, 2, 3, 4]))
assert np.array_equal(result, np.array([1, 2])) # Not [5, 2]
+
+ # Multiple 3-grams match, but always pick the first one.
+ result = ngram_proposer(
+ min_n=3, max_n=3, k=2).propose(context_token_ids=np.array(
+ [1, 2, 3, 100, 1, 2, 3, 200, 1, 2, 3, 300, 1, 2, 3]))
+ assert np.array_equal(result, np.array([100, 1]))
diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py
index 7dd104a4fcc4e..8dfb7959a510d 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -297,7 +297,7 @@ class CustomAllreduce:
@staticmethod
def free_shared_buffer(pointers: list[int],
group: Optional[ProcessGroup] = None,
- rank: Optional[int] = 0) -> None:
+ rank: Optional[int] = None) -> None:
if rank is None:
rank = dist.get_rank(group=group)
if ops is not None:
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index c058001ceb974..dd1072da08447 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -711,7 +711,7 @@ class EngineArgs:
"--mm-processor-cache-gb",
**multimodal_kwargs["mm_processor_cache_gb"])
multimodal_group.add_argument("--disable-mm-preprocessor-cache",
- type=bool,
+ action="store_true",
deprecated=True)
multimodal_group.add_argument(
"--interleave-mm-strings",
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index b6ee4105340a1..73726eeab5fc7 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -1092,6 +1092,7 @@ class AsyncLLMEngine(EngineClient):
self.engine.reset_prefix_cache(device)
async def sleep(self, level: int = 1) -> None:
+ await self.reset_prefix_cache()
self.engine.sleep(level)
async def wake_up(self, tags: Optional[list[str]] = None) -> None:
diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py
index 131c042c3c2db..aff491f9596c3 100644
--- a/vllm/model_executor/models/glm4_moe.py
+++ b/vllm/model_executor/models/glm4_moe.py
@@ -41,7 +41,6 @@ from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
QKVParallelLinear,
- ReplicatedLinear,
RowParallelLinear)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -118,14 +117,15 @@ class Glm4MoE(nn.Module):
if config.hidden_act != "silu":
raise ValueError(f"Unsupported activation: {config.hidden_act}. "
"Only silu is supported for now.")
-
- self.gate = ReplicatedLinear(config.hidden_size,
- config.n_routed_experts,
- bias=False,
- quant_config=None,
- params_dtype=torch.float32,
- prefix=f"{prefix}.gate")
-
+ # NOTE In the transformers implementation, the gate isn't an nn.Linear,
+ # so we cannot use ReplicatedLinear here.
+ # See: https://github.com/huggingface/transformers/blob/v4.55.1/src/transformers/models/glm4_moe/modeling_glm4_moe.py#L260
+ self.gate = nn.Linear(
+ config.hidden_size,
+ config.n_routed_experts,
+ bias=False,
+ dtype=torch.float32,
+ )
self.gate.e_score_correction_bias = nn.Parameter(
torch.empty(config.n_routed_experts, dtype=torch.float32))
@@ -181,7 +181,7 @@ class Glm4MoE(nn.Module):
if self.n_shared_experts is not None:
shared_output = self.shared_experts(hidden_states)
- router_logits, _ = self.gate(hidden_states.to(dtype=torch.float32))
+ router_logits = self.gate(hidden_states.to(dtype=torch.float32))
final_hidden_states = self.experts(
hidden_states=hidden_states,
router_logits=router_logits) * self.routed_scaling_factor
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 89d2817b57e0e..4927d6b62c6d8 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -72,8 +72,9 @@ class PixtralHFImagePixelInputs(TensorSchema):
in which case the data is passed as a list instead of a batched tensor.
"""
type: Literal["pixel_values_pixtral"] = "pixel_values_pixtral"
- pixel_values: Annotated[Union[torch.Tensor, list[torch.Tensor]],
- TensorShape("bn", "c", "h", "w")]
+ pixel_values: Annotated[
+ Union[torch.Tensor, list[torch.Tensor]],
+ TensorShape("bn", "c", "h", "w", dynamic_dims={"h", "w"})]
class LlavaImageEmbeddingInputs(TensorSchema):
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 7db3a1bb90b47..47ce771d8c901 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -91,7 +91,7 @@ class MiniCPMVImagePixelInputs(TensorSchema):
# batched tensor.
pixel_values: Annotated[
list[torch.Tensor],
- TensorShape("bns", "c", "h", "w"),
+ TensorShape("bns", "c", "h", "w", dynamic_dims={"h", "w"}),
]
tgt_sizes: Annotated[
torch.Tensor,
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 02ea0814ddefa..d8c964fb2a4a4 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -449,23 +449,6 @@ def get_config(
raise e
config = _maybe_remap_hf_config_attrs(config)
- # Phi4Flash misuses this config as list[int]. Convert it to int and add
- # the layer_types list[str] to make it HF compatible
- if (config.model_type == "phi4flash"):
- # TODO: Remove after the following PR is merged:
- # https://huggingface.co/microsoft/Phi-4-mini-flash-reasoning/discussions/6
- if not hasattr(config, "layer_types"):
- config.layer_types = [
- "sliding_attention" if i < config.num_hidden_layers // 2
- and i % 2 == 1 else "full_attention"
- for i in range(config.num_hidden_layers)
- ]
- # TODO: Remove after the following PR is merged:
- # https://huggingface.co/microsoft/Phi-4-mini-flash-reasoning/discussions/7
- if isinstance(config.sliding_window, list):
- config.sliding_window = next(
- filter(None, config.sliding_window), None)
-
elif config_format == ConfigFormat.MISTRAL:
# This function loads a params.json config which
# should be used when loading models in mistral format
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index 6ccc636efaf1b..4dd8b2439b3f5 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -2,13 +2,13 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
-from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any, Optional, Union, cast
import huggingface_hub
import regex as re
from huggingface_hub import HfApi, hf_hub_download
+from transformers.tokenization_utils_base import BatchEncoding
from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer_base import TokenizerBase
@@ -27,11 +27,6 @@ if TYPE_CHECKING:
logger = init_logger(__name__)
-@dataclass
-class Encoding:
- input_ids: Union[list[int], list[list[int]]]
-
-
def maybe_serialize_tool_calls(request: "ChatCompletionRequest"):
# SEE: https://github.com/vllm-project/vllm/pull/9951
# Credits go to: @gcalmettes
@@ -359,7 +354,7 @@ class MistralTokenizer(TokenizerBase):
# For str, single prompt text
else:
input_ids = self.encode_one(text, truncation, max_length)
- return Encoding(input_ids=input_ids)
+ return BatchEncoding({"input_ids": input_ids})
def get_vocab(self) -> dict[str, int]:
# NB: the dictionary form of the vocabulary collapses token ids that map
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index 095829db83944..cae4eecc0deeb 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -709,8 +709,28 @@ class AsyncMicrobatchTokenizer:
def cancel_task_threadsafe(task: Task):
- if task and not task.done() and not (loop := task.get_loop()).is_closed():
- loop.call_soon_threadsafe(task.cancel)
+ if task and not task.done():
+ run_in_loop(task.get_loop(), task.cancel)
+
+
+def close_sockets(sockets: Sequence[Union[zmq.Socket, zmq.asyncio.Socket]]):
+ for sock in sockets:
+ if sock is not None:
+ sock.close(linger=0)
+
+
+def run_in_loop(loop: AbstractEventLoop, function: Callable, *args):
+ if in_loop(loop):
+ function(*args)
+ elif not loop.is_closed():
+ loop.call_soon_threadsafe(function, *args)
+
+
+def in_loop(event_loop: AbstractEventLoop) -> bool:
+ try:
+ return asyncio.get_running_loop() == event_loop
+ except RuntimeError:
+ return False
def make_async(
diff --git a/vllm/utils/tensor_schema.py b/vllm/utils/tensor_schema.py
index 4c3acf0094c74..21d3249fe1547 100644
--- a/vllm/utils/tensor_schema.py
+++ b/vllm/utils/tensor_schema.py
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Annotated, Any, Union, get_args, get_origin, get_type_hints
+from typing import (Annotated, Any, Optional, Union, get_args, get_origin,
+ get_type_hints)
import torch
@@ -11,9 +12,13 @@ logger = init_logger(__name__)
class TensorShape:
- def __init__(self,
- *dims: Union[int, str],
- dynamic_dims: set[str, ...] = None) -> None:
+ def __init__(
+ self,
+ *dims: Union[int, str],
+ dynamic_dims: Optional[set[str]] = None,
+ ) -> None:
+ super().__init__()
+
self.dims = dims
self.dynamic_dims = dynamic_dims if dynamic_dims else set()
@@ -44,11 +49,15 @@ class TensorShape:
class TensorSchema:
- def __init__(self,
- *,
- validate: bool = True,
- resolve_bindings: dict[str, int] = None,
- **kwargs: Any) -> None:
+ def __init__(
+ self,
+ *,
+ validate: bool = True,
+ resolve_bindings: Optional[dict[str, int]] = None,
+ **kwargs: Any,
+ ) -> None:
+ super().__init__()
+
self._resolve_bindings = resolve_bindings if resolve_bindings else {}
for key, value in kwargs.items():
@@ -57,16 +66,19 @@ class TensorSchema:
if validate:
self.validate()
- def __getitem__(self, item) -> Any:
- return getattr(self, item)
+ def __getitem__(self, key: str) -> Any:
+ return getattr(self, key)
- def get(self, item, default=None) -> Any:
- return getattr(self, item, default)
+ def get(self, key: str, default: Any = None) -> Any:
+ return getattr(self, key, default)
- def _match_shape_with_dynamic(self, actual: tuple[int, ...],
- reference: tuple[int, ...],
- expected_shape: tuple[Union[int, str], ...],
- dynamic_dims: set[str, ...]) -> bool:
+ def _match_shape_with_dynamic(
+ self,
+ actual: tuple[int, ...],
+ reference: tuple[int, ...],
+ expected_shape: tuple[Union[int, str], ...],
+ dynamic_dims: set[str],
+ ) -> bool:
if len(actual) != len(reference) or len(actual) > len(expected_shape):
return False
@@ -84,10 +96,12 @@ class TensorSchema:
return True
def _validate_nested_tensors(
- self, value: Union[list[torch.Tensor, ...],
- tuple[torch.Tensor, ...]], field_name: str,
- expected_shape: tuple[Union[int, str], ...],
- dynamic_dims: set[str, ...]) -> tuple[int, ...]:
+ self,
+ value: Union[list[torch.Tensor], tuple[torch.Tensor, ...]],
+ field_name: str,
+ expected_shape: tuple[Union[int, str], ...],
+ dynamic_dims: set[str],
+ ) -> tuple[int, ...]:
"""Validate a list/tuple of tensors and return the actual shape."""
# Ensure all tensors in the list have the same
# shape, besides dynamic dimensions
@@ -110,12 +124,14 @@ class TensorSchema:
# shape = (len(list), *tensor.shape)
return (len(value), ) + first.shape
- def _validate_tensor_shape_expected(self, actual_shape: tuple[int, ...],
- expected_shape: tuple[Union[int, str],
- ...],
- field_name: str, shape_env: dict[str,
- int],
- dynamic_dims: set[str, ...]) -> None:
+ def _validate_tensor_shape_expected(
+ self,
+ actual_shape: tuple[int, ...],
+ expected_shape: tuple[Union[int, str], ...],
+ field_name: str,
+ shape_env: dict[str, int],
+ dynamic_dims: set[str],
+ ) -> None:
"""Validate that the actual tensor shape matches the expected shape."""
if len(actual_shape) != len(expected_shape):
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index a2706327914c5..edc2e235c3c3f 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -576,6 +576,7 @@ class AsyncLLM(EngineClient):
await self.engine_core.reset_prefix_cache_async()
async def sleep(self, level: int = 1) -> None:
+ await self.reset_prefix_cache()
await self.engine_core.sleep_async(level)
async def wake_up(self, tags: Optional[list[str]] = None) -> None:
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index 05b4d72608963..5ffa555570a22 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -23,8 +23,8 @@ from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.tasks import SupportedTask
-from vllm.utils import (cancel_task_threadsafe, get_open_port,
- get_open_zmq_inproc_path, make_zmq_socket)
+from vllm.utils import (close_sockets, get_open_port, get_open_zmq_inproc_path,
+ in_loop, make_zmq_socket)
from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest,
EngineCoreRequestType,
ReconfigureDistributedRequest, ReconfigureRankType,
@@ -317,7 +317,7 @@ class BackgroundResources:
"""Used as a finalizer for clean shutdown, avoiding
circular reference back to the client object."""
- ctx: Union[zmq.Context]
+ ctx: zmq.Context
# If CoreEngineProcManager, it manages local engines;
# if CoreEngineActorManager, it manages all engines.
engine_manager: Optional[Union[CoreEngineProcManager,
@@ -326,6 +326,8 @@ class BackgroundResources:
output_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None
input_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None
first_req_send_socket: Optional[zmq.asyncio.Socket] = None
+ first_req_rcv_socket: Optional[zmq.asyncio.Socket] = None
+ stats_update_socket: Optional[zmq.asyncio.Socket] = None
output_queue_task: Optional[asyncio.Task] = None
stats_update_task: Optional[asyncio.Task] = None
shutdown_path: Optional[str] = None
@@ -343,23 +345,47 @@ class BackgroundResources:
if self.coordinator is not None:
self.coordinator.close()
- cancel_task_threadsafe(self.output_queue_task)
- cancel_task_threadsafe(self.stats_update_task)
+ if isinstance(self.output_socket, zmq.asyncio.Socket):
+ # Async case.
+ loop = self.output_socket._get_loop()
+ asyncio.get_running_loop()
+ sockets = (self.output_socket, self.input_socket,
+ self.first_req_send_socket, self.first_req_rcv_socket,
+ self.stats_update_socket)
- # ZMQ context termination can hang if the sockets
- # aren't explicitly closed first.
- for socket in (self.output_socket, self.input_socket,
- self.first_req_send_socket):
- if socket is not None:
- socket.close(linger=0)
+ tasks = (self.output_queue_task, self.stats_update_task)
- if self.shutdown_path is not None:
- # We must ensure that the sync output socket is
- # closed cleanly in its own thread.
- with self.ctx.socket(zmq.PAIR) as shutdown_sender:
- shutdown_sender.connect(self.shutdown_path)
- # Send shutdown signal.
- shutdown_sender.send(b'')
+ def close_sockets_and_tasks():
+ close_sockets(sockets)
+ for task in tasks:
+ if task is not None and not task.done():
+ task.cancel()
+
+ if in_loop(loop):
+ close_sockets_and_tasks()
+ elif not loop.is_closed():
+ loop.call_soon_threadsafe(close_sockets_and_tasks)
+ else:
+ # Loop has been closed, try to clean up directly.
+ del tasks
+ del close_sockets_and_tasks
+ close_sockets(sockets)
+ del self.output_queue_task
+ del self.stats_update_task
+ else:
+ # Sync case.
+
+ # ZMQ context termination can hang if the sockets
+ # aren't explicitly closed first.
+ close_sockets((self.output_socket, self.input_socket))
+
+ if self.shutdown_path is not None:
+ # We must ensure that the sync output socket is
+ # closed cleanly in its own thread.
+ with self.ctx.socket(zmq.PAIR) as shutdown_sender:
+ shutdown_sender.connect(self.shutdown_path)
+ # Send shutdown signal.
+ shutdown_sender.send(b'')
def validate_alive(self, frames: Sequence[zmq.Frame]):
if len(frames) == 1 and (frames[0].buffer
@@ -969,14 +995,19 @@ class DPAsyncMPClient(AsyncMPClient):
self.engine_ranks_managed[-1] + 1)
async def run_engine_stats_update_task():
- with make_zmq_socket(self.ctx, self.stats_update_address,
- zmq.XSUB) as socket, make_zmq_socket(
- self.ctx,
- self.first_req_sock_addr,
- zmq.PAIR,
- bind=False) as first_req_rcv_socket:
+ with (make_zmq_socket(self.ctx,
+ self.stats_update_address,
+ zmq.XSUB,
+ linger=0) as socket,
+ make_zmq_socket(self.ctx,
+ self.first_req_sock_addr,
+ zmq.PAIR,
+ bind=False,
+ linger=0) as first_req_rcv_socket):
assert isinstance(socket, zmq.asyncio.Socket)
assert isinstance(first_req_rcv_socket, zmq.asyncio.Socket)
+ self.resources.stats_update_socket = socket
+ self.resources.first_req_rcv_socket = first_req_rcv_socket
# Send subscription message.
await socket.send(b'\x01')
diff --git a/vllm/v1/spec_decode/ngram_proposer.py b/vllm/v1/spec_decode/ngram_proposer.py
index 6b90d0970bd77..fbcf2cb50d371 100644
--- a/vllm/v1/spec_decode/ngram_proposer.py
+++ b/vllm/v1/spec_decode/ngram_proposer.py
@@ -11,6 +11,10 @@ from vllm.config import VllmConfig
class NgramProposer:
def __init__(self, vllm_config: VllmConfig):
+ assert vllm_config.speculative_config is not None
+ assert vllm_config.speculative_config.prompt_lookup_min is not None
+ assert vllm_config.speculative_config.prompt_lookup_max is not None
+
# Minimum length of the n-gram to match.
self.min_n = vllm_config.speculative_config.prompt_lookup_min
# Maximum length of the n-gram to match.
@@ -54,17 +58,13 @@ class NgramProposer:
followed that pattern. Here we will return [4,2,3] because
we only have three tokens after the match.
"""
- # Do not generate draft tokens beyond the max model length.
- k = min(self.k, self.max_model_len - context_token_ids.shape[0])
- if k <= 0:
- return None
-
# TODO(woosuk): Optimize this.
- for n in range(self.max_n, self.min_n - 1, -1):
- result = _find_subarray_kmp(context_token_ids, n, k)
- if result is not None:
- return result
- return None
+ return _find_longest_matched_ngram_and_propose_tokens(
+ origin_tokens=context_token_ids,
+ min_ngram=self.min_n,
+ max_ngram=self.max_n,
+ max_model_len=self.max_model_len,
+ k=self.k)
def load_model(self, *args, **kwargs):
# No model to load.
@@ -72,61 +72,86 @@ class NgramProposer:
@jit(nopython=True)
-def _kmp_lps_array(pattern: np.ndarray) -> np.ndarray:
+def _find_longest_matched_ngram_and_propose_tokens(
+ origin_tokens: np.ndarray, min_ngram: int, max_ngram: int,
+ max_model_len: int, k: int) -> Optional[np.ndarray]:
"""
- Build the lps (longest proper prefix which is also suffix)
- array for the pattern.
+ Find the longest n-gram which matches the suffix of the given tokens
+ whose length is within [min_ngram, max_ngram] (inclusive).
+
+ If found, we will extract up to k tokens right after the matched ngram.
"""
- lps = np.zeros(len(pattern), dtype=np.int32)
- prev_lps = 0 # length of the previous longest prefix suffix
+ # Do not generate draft tokens if context is shorter than minimum n-gram
+ total_token = origin_tokens.shape[0]
+ if total_token < min_ngram:
+ return None
+
+ # Do not generate draft tokens beyond the max model length.
+ k = min(k, max_model_len - total_token)
+ if k <= 0:
+ return None
+
+ # Flip tokens, so the goal becomes to find the longest ngram
+ # on the rightmost position which matches the prefix with
+ # length [min_n, max_n] (inclusive).
+ tokens = origin_tokens[::-1]
+
+ # Longest prefix (not including itself) which is a suffix of
+ # the current position.
+ # lps[i] = max{v, where tokens[0:v] == tokens[i+1-v:i+1]}
+ #
+ # As ngram is capped by max_ngram to save memory, we only need to
+ # store lps for the first max_ngram prefix.
+ lps = np.zeros(max_ngram, dtype=np.int32)
+
+ longest_ngram = 0
+ position = 0
+
+ # lps[0] always equals 0, so we start with index 1
+ prev_lps = 0
i = 1
-
- while i < len(pattern):
- if pattern[i] == pattern[prev_lps]:
+ while i < total_token:
+ # tokens[:prev_lps] is the longest prefix as a suffix of tokens[:i]
+ if tokens[prev_lps] == tokens[i]:
+ # Token match: tokens[:prev_lps+1] is the longest prefix as
+ # a suffix of tokens[:i+1]
prev_lps += 1
- lps[i] = prev_lps
+ # Check if we found a longer valid ngram.
+ #
+ # Update position when longest_ngram matched prev_lps,
+ # as we want to get the target n-gram of the earliest position
+ # in the original tokens (i.e.
+ # latest position in the reversed tokens)
+ if prev_lps >= longest_ngram:
+ longest_ngram = prev_lps
+ position = i
+ if i < max_ngram:
+ # Store LPS for the first max_ngram prefix
+ lps[i] = prev_lps
+ if prev_lps == max_ngram:
+ # When prev_lps reached max_ngram, update prev_lps
+ # to lps[max_ngram-1] to avoid matching ngram
+ # longer than max_ngram
+ prev_lps = lps[max_ngram - 1]
i += 1
+ elif prev_lps != 0:
+ # Token mismatch: try the second longest prefix
+ # among all suffixes of tokens[:i],
+ # which is the longest prefix of tokens[:prev_lps]
+ prev_lps = lps[prev_lps - 1]
else:
- if prev_lps != 0:
- prev_lps = lps[prev_lps - 1]
- else:
- lps[i] = 0
- i += 1
- return lps
-
-
-@jit(nopython=True)
-def _find_subarray_kmp(
- context_token_ids: np.ndarray,
- n: int,
- k: int,
-) -> Optional[np.ndarray]:
- context_len = context_token_ids.shape[0]
- assert n > 0
-
- pattern = context_token_ids[-n:]
- # Precompute lps array for Y
- lps = _kmp_lps_array(pattern)
-
- i = 0
- j = 0
- # -n because the last n tokens are used as pattern
- while i < context_len - n:
- if context_token_ids[i] == pattern[j]:
+ # Token mismatch, and no more prefix (except empty string)
+ # as a suffix of tokens[:i]
i += 1
- j += 1
- # If we have matched the entire Y
- if j == n:
- # Found pattern in context, gather the next K elements
- return context_token_ids[i:i + k]
- else:
- # Mismatch
- if j != 0:
- # Use the lps array to avoid re-checking elements
- j = lps[j - 1]
- else:
- i += 1
+ if longest_ngram < min_ngram:
+ # No valid ngram is found
+ return None
- # Y not found
- return None
+ # Flip the position back, so in origin_tokens,
+ # origin_tokens[total_token-1-position:total_token-1-position+longest_ngram]
+ # is the matched ngram, so we should start drafting tokens from
+ # total_token-1-position+longest_ngram
+ start_position = total_token - 1 - position + longest_ngram
+ k = min(k, total_token - start_position)
+ return origin_tokens[start_position:start_position + k]
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index a03e860a91c71..8fb9641844fb5 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -341,13 +341,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
model_kwargs = dict[str, Any]()
num_reqs = self.input_batch.num_reqs
- pooling_params = self.input_batch.pooling_metadata.pooling_params
-
- num_pooling_reqs = len(pooling_params)
+ num_pooling_reqs = len(self.input_batch.pooling_params)
if num_pooling_reqs == 0:
return model_kwargs
+ pooling_params = self.input_batch.pooling_metadata.pooling_params
+
assert num_pooling_reqs == num_reqs
token_type_id_requests = dict[int, Any]()