mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-11 05:42:15 +08:00
Merge branch 'vllm-project:main' into wye-refactor-quant-folder
This commit is contained in:
commit
52c905a3d4
@ -7,7 +7,7 @@ This directory contains two sets of benchmark for vllm.
|
||||
- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
|
||||
- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
|
||||
|
||||
See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
|
||||
See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
|
||||
|
||||
## Performance benchmark quick overview
|
||||
|
||||
@ -138,28 +138,20 @@ The raw benchmarking results (in the format of json files) are in the `Artifacts
|
||||
|
||||
The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
|
||||
When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
|
||||
`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
|
||||
`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
|
||||
If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
|
||||
|
||||
Here is an example using the script to compare result_a and result_b without detail test name.
|
||||
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json --ignore_test_name`
|
||||
|
||||
| | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|
||||
|----|----------------------------------------|----------------------------------------|----------|
|
||||
| 0 | 142.633982 | 156.526018 | 1.097396 |
|
||||
| 1 | 241.620334 | 294.018783 | 1.216863 |
|
||||
| 2 | 218.298905 | 262.664916 | 1.203235 |
|
||||
| 3 | 242.743860 | 299.816190 | 1.235113 |
|
||||
|
||||
Here is an example using the script to compare result_a and result_b with detail test name.
|
||||
Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output lenght, max concurrency and qps.
|
||||
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
|
||||
|
||||
| | results_a/benchmark_results.json_name | results_a/benchmark_results.json | results_b/benchmark_results.json_name | results_b/benchmark_results.json | perf_ratio |
|
||||
|---|---------------------------------------------|----------------------------------------|---------------------------------------------|----------------------------------------|----------|
|
||||
| 0 | serving_llama8B_tp1_sharegpt_qps_1 | 142.633982 | serving_llama8B_tp1_sharegpt_qps_1 | 156.526018 | 1.097396 |
|
||||
| 1 | serving_llama8B_tp1_sharegpt_qps_16 | 241.620334 | serving_llama8B_tp1_sharegpt_qps_16 | 294.018783 | 1.216863 |
|
||||
| 2 | serving_llama8B_tp1_sharegpt_qps_4 | 218.298905 | serving_llama8B_tp1_sharegpt_qps_4 | 262.664916 | 1.203235 |
|
||||
| 3 | serving_llama8B_tp1_sharegpt_qps_inf | 242.743860 | serving_llama8B_tp1_sharegpt_qps_inf | 299.816190 | 1.235113 |
|
||||
| 4 | serving_llama8B_tp2_random_1024_128_qps_1 | 96.613390 | serving_llama8B_tp4_random_1024_128_qps_1 | 108.404853 | 1.122048 |
|
||||
| | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|
||||
|----|---------------------------------------|--------|-----|-----|------|-----|-----------|----------|----------|
|
||||
| 0 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982 | 156.526018 | 1.097396 |
|
||||
| 1 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf| 241.620334 | 294.018783 | 1.216863 |
|
||||
|
||||
A comparison diagram will be generated below the table.
|
||||
Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
|
||||
<img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
|
||||
|
||||
## Nightly test details
|
||||
|
||||
|
||||
@ -1,24 +1,38 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def compare_data_columns(
|
||||
files, name_column, data_column, drop_column, ignore_test_name=False
|
||||
files, name_column, data_column, info_cols, drop_column, debug=False
|
||||
):
|
||||
print("\ncompare_data_column: " + data_column)
|
||||
frames = []
|
||||
raw_data_cols = []
|
||||
compare_frames = []
|
||||
for file in files:
|
||||
data_df = pd.read_json(file)
|
||||
serving_df = data_df.dropna(subset=[drop_column], ignore_index=True)
|
||||
if ignore_test_name is False:
|
||||
# Show all info columns in the first couple columns
|
||||
if not frames:
|
||||
for col in info_cols:
|
||||
if col not in serving_df.columns:
|
||||
print(f"Skipping missing column: {col}")
|
||||
continue
|
||||
frames.append(serving_df[col])
|
||||
# only show test name under debug mode
|
||||
if debug is True:
|
||||
serving_df = serving_df.rename(columns={name_column: file + "_name"})
|
||||
frames.append(serving_df[file + "_name"])
|
||||
|
||||
file = "/".join(file.split("/")[:-1])
|
||||
serving_df = serving_df.rename(columns={data_column: file})
|
||||
frames.append(serving_df[file])
|
||||
raw_data_cols.append(file)
|
||||
compare_frames.append(serving_df[file])
|
||||
if len(compare_frames) >= 2:
|
||||
# Compare numbers among two files
|
||||
@ -27,7 +41,68 @@ def compare_data_columns(
|
||||
compare_frames.pop(1)
|
||||
|
||||
concat_df = pd.concat(frames, axis=1)
|
||||
return concat_df
|
||||
print(raw_data_cols)
|
||||
return concat_df, raw_data_cols
|
||||
|
||||
|
||||
def split_json_by_tp_pp(
|
||||
input_file: str = "benchmark_results.json", output_root: str = "."
|
||||
) -> list[str]:
|
||||
"""
|
||||
Split a benchmark JSON into separate folders by (TP Size, PP Size).
|
||||
|
||||
Creates: <output_root>/tp{TP}_pp{PP}/benchmark_results.json
|
||||
Returns: list of file paths written.
|
||||
"""
|
||||
# Load JSON data into DataFrame
|
||||
with open(input_file, encoding="utf-8") as f:
|
||||
data = json.load(f)
|
||||
|
||||
# If the JSON is a dict with a list under common keys, use that list
|
||||
if isinstance(data, dict):
|
||||
for key in ("results", "serving_results", "benchmarks", "data"):
|
||||
if isinstance(data.get(key), list):
|
||||
data = data[key]
|
||||
break
|
||||
|
||||
df = pd.DataFrame(data)
|
||||
|
||||
# Handle alias column names
|
||||
rename_map = {
|
||||
"tp_size": "TP Size",
|
||||
"tensor_parallel_size": "TP Size",
|
||||
"pp_size": "PP Size",
|
||||
"pipeline_parallel_size": "PP Size",
|
||||
}
|
||||
df.rename(
|
||||
columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True
|
||||
)
|
||||
|
||||
# Ensure TP/PP columns exist (default to 1 if missing)
|
||||
if "TP Size" not in df.columns:
|
||||
df["TP Size"] = 1
|
||||
if "PP Size" not in df.columns:
|
||||
df["PP Size"] = 1
|
||||
|
||||
# make sure TP/PP are numeric ints with no NaN
|
||||
df["TP Size"] = (
|
||||
pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int)
|
||||
)
|
||||
df["PP Size"] = (
|
||||
pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int)
|
||||
)
|
||||
|
||||
# Split into separate folders
|
||||
saved_paths: list[str] = []
|
||||
for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False):
|
||||
folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
|
||||
os.makedirs(folder_name, exist_ok=True)
|
||||
filepath = os.path.join(folder_name, "benchmark_results.json")
|
||||
group_df.to_json(filepath, orient="records", indent=2, force_ascii=False)
|
||||
print(f"Saved: {filepath}")
|
||||
saved_paths.append(filepath)
|
||||
|
||||
return saved_paths
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@ -36,31 +111,105 @@ if __name__ == "__main__":
|
||||
"-f", "--file", action="append", type=str, help="input file name"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ignore_test_name", action="store_true", help="ignore_test_name or not"
|
||||
"--debug", action="store_true", help="show all information for debugging"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--plot",
|
||||
action=argparse.BooleanOptionalAction,
|
||||
default=True,
|
||||
help="plot perf diagrams or not --no-plot --plot",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-x",
|
||||
"--xaxis",
|
||||
type=str,
|
||||
default="# of max concurrency.",
|
||||
help="column name to use as X Axis in comparision graph",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
files = args.file
|
||||
print("comparing : " + ", ".join(files))
|
||||
|
||||
drop_column = "P99"
|
||||
name_column = "Test name"
|
||||
info_cols = [
|
||||
"Model",
|
||||
"Dataset Name",
|
||||
"Input Len",
|
||||
"Output Len",
|
||||
"TP Size",
|
||||
"PP Size",
|
||||
"# of max concurrency.",
|
||||
"qps",
|
||||
]
|
||||
data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
|
||||
html_msgs_for_data_cols = [
|
||||
"Compare Output Tokens /n",
|
||||
"Median TTFT /n",
|
||||
"Median TPOT /n",
|
||||
]
|
||||
ignore_test_name = args.ignore_test_name
|
||||
|
||||
if len(args.file) == 1:
|
||||
files = split_json_by_tp_pp(args.file[0], output_root="splits")
|
||||
info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
|
||||
else:
|
||||
files = args.file
|
||||
print("comparing : " + ", ".join(files))
|
||||
debug = args.debug
|
||||
plot = args.plot
|
||||
# For Plot feature, assign y axis from one of info_cols
|
||||
y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6
|
||||
with open("perf_comparison.html", "w") as text_file:
|
||||
for i in range(len(data_cols_to_compare)):
|
||||
output_df = compare_data_columns(
|
||||
output_df, raw_data_cols = compare_data_columns(
|
||||
files,
|
||||
name_column,
|
||||
data_cols_to_compare[i],
|
||||
info_cols,
|
||||
drop_column,
|
||||
ignore_test_name=ignore_test_name,
|
||||
debug=debug,
|
||||
)
|
||||
print(output_df)
|
||||
html = output_df.to_html()
|
||||
text_file.write(html_msgs_for_data_cols[i])
|
||||
text_file.write(html)
|
||||
|
||||
# For Plot feature, insert y axis from one of info_cols
|
||||
raw_data_cols.insert(0, info_cols[y_axis_index])
|
||||
|
||||
filtered_info_cols = info_cols[:-2]
|
||||
existing_group_cols = [
|
||||
c for c in filtered_info_cols if c in output_df.columns
|
||||
]
|
||||
if not existing_group_cols:
|
||||
raise ValueError(
|
||||
f"No valid group-by columns "
|
||||
f"Expected subset: {filtered_info_cols}, "
|
||||
f"but DataFrame has: {list(output_df.columns)}"
|
||||
)
|
||||
|
||||
output_df_sorted = output_df.sort_values(by=existing_group_cols)
|
||||
output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
|
||||
for name, group in output_groups:
|
||||
html = group.to_html()
|
||||
text_file.write(html_msgs_for_data_cols[i])
|
||||
text_file.write(html)
|
||||
|
||||
if plot is True:
|
||||
import pandas as pd
|
||||
import plotly.express as px
|
||||
|
||||
df = group[raw_data_cols]
|
||||
df_sorted = df.sort_values(by=info_cols[y_axis_index])
|
||||
# Melt DataFrame for plotting
|
||||
df_melted = df_sorted.melt(
|
||||
id_vars=info_cols[y_axis_index],
|
||||
var_name="Configuration",
|
||||
value_name=data_cols_to_compare[i],
|
||||
)
|
||||
title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
|
||||
# Create Plotly line chart
|
||||
fig = px.line(
|
||||
df_melted,
|
||||
x=info_cols[y_axis_index],
|
||||
y=data_cols_to_compare[i],
|
||||
color="Configuration",
|
||||
title=title,
|
||||
markers=True,
|
||||
)
|
||||
# Export to HTML
|
||||
text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
|
||||
|
||||
@ -1,17 +1,19 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shlex
|
||||
from importlib import util
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pandas as pd
|
||||
import psutil
|
||||
from tabulate import tabulate
|
||||
|
||||
results_folder = Path("results/")
|
||||
|
||||
# latency results and the keys that will be printed into markdown
|
||||
latency_results = []
|
||||
latency_column_mapping = {
|
||||
@ -42,14 +44,22 @@ throughput_results_column_mapping = {
|
||||
serving_results = []
|
||||
serving_column_mapping = {
|
||||
"test_name": "Test name",
|
||||
"model_id": "Model",
|
||||
"dataset_name": "Dataset Name",
|
||||
"input_len": "Input Len",
|
||||
"output_len": "Output Len",
|
||||
"tp_size": "TP Size",
|
||||
"pp_size": "PP Size",
|
||||
"dtype": "dtype",
|
||||
"gpu_type": "GPU",
|
||||
"completed": "# of req.",
|
||||
"qps": "qps",
|
||||
"max_concurrency": "# of max concurrency.",
|
||||
"request_throughput": "Tput (req/s)",
|
||||
"total_token_throughput": "Total Token Tput (tok/s)",
|
||||
"output_throughput": "Output Tput (tok/s)",
|
||||
"total_input_tokens": "Total input tokens",
|
||||
"total_output_tokens": "Total output tokens",
|
||||
# "total_input_tokens": "Total input tokens",
|
||||
# "total_output_tokens": "Total output tokens",
|
||||
"mean_ttft_ms": "Mean TTFT (ms)",
|
||||
"median_ttft_ms": "Median TTFT (ms)",
|
||||
"p99_ttft_ms": "P99 TTFT (ms)",
|
||||
@ -94,7 +104,104 @@ def get_size_with_unit(bytes, suffix="B"):
|
||||
bytes /= factor
|
||||
|
||||
|
||||
def _coerce(val: str) -> Any:
|
||||
"""Best-effort type coercion from string to Python types."""
|
||||
low = val.lower()
|
||||
if low == "null":
|
||||
return None
|
||||
if low == "true":
|
||||
return True
|
||||
if low == "false":
|
||||
return False
|
||||
# integers
|
||||
if re.fullmatch(r"[+-]?\d+", val):
|
||||
try:
|
||||
return int(val)
|
||||
except ValueError:
|
||||
pass
|
||||
# floats (keep 'inf'/'-inf'/'nan' as strings)
|
||||
if re.fullmatch(r"[+-]?\d*\.\d+", val):
|
||||
try:
|
||||
return float(val)
|
||||
except ValueError:
|
||||
pass
|
||||
return val
|
||||
|
||||
|
||||
def parse_client_command(cmd: str) -> dict[str, Any]:
|
||||
"""Parse the client_command shell string into {executable, script, args}."""
|
||||
toks = shlex.split(cmd)
|
||||
if len(toks) < 2:
|
||||
raise ValueError("client_command must include an executable and a script")
|
||||
executable, script = toks[0], toks[1]
|
||||
args: dict[str, Any] = {}
|
||||
|
||||
i = 2
|
||||
while i < len(toks):
|
||||
t = toks[i]
|
||||
if t.startswith("--"):
|
||||
# --key=value or --key (value) or boolean flag
|
||||
if "=" in t:
|
||||
key, val = t.split("=", 1)
|
||||
if key == "--metadata":
|
||||
md = {}
|
||||
if val:
|
||||
if "=" in val:
|
||||
k, v = val.split("=", 1)
|
||||
md[k] = _coerce(v)
|
||||
else:
|
||||
md[val] = True
|
||||
args[key] = md
|
||||
else:
|
||||
args[key] = _coerce(val)
|
||||
i += 1
|
||||
continue
|
||||
|
||||
key = t
|
||||
|
||||
# Special: consume metadata k=v pairs until next --flag
|
||||
if key == "--metadata":
|
||||
i += 1
|
||||
md = {}
|
||||
while i < len(toks) and not toks[i].startswith("--"):
|
||||
pair = toks[i]
|
||||
if "=" in pair:
|
||||
k, v = pair.split("=", 1)
|
||||
md[k] = _coerce(v)
|
||||
else:
|
||||
md[pair] = True
|
||||
i += 1
|
||||
args[key] = md
|
||||
continue
|
||||
|
||||
# Standard: check if next token is a value (not a flag)
|
||||
if i + 1 < len(toks) and not toks[i + 1].startswith("--"):
|
||||
args[key] = _coerce(toks[i + 1])
|
||||
i += 2
|
||||
else:
|
||||
# lone flag -> True
|
||||
args[key] = True
|
||||
i += 1
|
||||
else:
|
||||
# unexpected positional; skip
|
||||
i += 1
|
||||
|
||||
return {"executable": executable, "script": script, "args": args}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"-r",
|
||||
"--result",
|
||||
type=str,
|
||||
default="results",
|
||||
help="Folder name for benchmark output results.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
results_folder = Path(args.result)
|
||||
if not results_folder.exists():
|
||||
raise FileNotFoundError(f"results folder does not exist: {results_folder}")
|
||||
# collect results
|
||||
for test_file in results_folder.glob("*.json"):
|
||||
with open(test_file) as f:
|
||||
@ -102,7 +209,6 @@ if __name__ == "__main__":
|
||||
|
||||
if "serving" in str(test_file):
|
||||
# this result is generated via `vllm bench serve` command
|
||||
|
||||
# attach the benchmarking command to raw_result
|
||||
try:
|
||||
with open(test_file.with_suffix(".commands")) as f:
|
||||
@ -110,12 +216,44 @@ if __name__ == "__main__":
|
||||
except OSError as e:
|
||||
print(e)
|
||||
continue
|
||||
# Parse Server Command Arg
|
||||
out: dict[str, Any] = {
|
||||
"server_command": parse_client_command(command["server_command"])
|
||||
}
|
||||
parse_args = [
|
||||
"--tensor-parallel-size",
|
||||
"--pipeline-parallel-size",
|
||||
"--dtype",
|
||||
]
|
||||
col_mapping = ["tp_size", "pp_size", "dtype"]
|
||||
for index, arg in enumerate(parse_args):
|
||||
if arg in out["server_command"]["args"]:
|
||||
raw_result.update(
|
||||
{col_mapping[index]: out["server_command"]["args"][arg]}
|
||||
)
|
||||
|
||||
# Parse Client Command Arg
|
||||
out: dict[str, Any] = {
|
||||
"client_command": parse_client_command(command["client_command"])
|
||||
}
|
||||
parse_args = [
|
||||
"--dataset-name",
|
||||
"--random-input-len",
|
||||
"--random-output-len",
|
||||
"--request-rate",
|
||||
]
|
||||
col_mapping = ["dataset_name", "input_len", "output_len", "qps"]
|
||||
|
||||
for index, arg in enumerate(parse_args):
|
||||
if arg in out["client_command"]["args"]:
|
||||
raw_result.update(
|
||||
{col_mapping[index]: out["client_command"]["args"][arg]}
|
||||
)
|
||||
# Add Server, Client command
|
||||
raw_result.update(command)
|
||||
|
||||
# update the test name of this result
|
||||
raw_result.update({"test_name": test_file.stem})
|
||||
|
||||
# add the result to raw_result
|
||||
serving_results.append(raw_result)
|
||||
continue
|
||||
@ -205,7 +343,10 @@ if __name__ == "__main__":
|
||||
columns=latency_column_mapping
|
||||
)
|
||||
if not serving_results.empty:
|
||||
serving_results = serving_results[list(serving_column_mapping.keys())].rename(
|
||||
valid_columns = [
|
||||
col for col in serving_column_mapping if col in serving_results.columns
|
||||
]
|
||||
serving_results = serving_results[valid_columns].rename(
|
||||
columns=serving_column_mapping
|
||||
)
|
||||
if not throughput_results.empty:
|
||||
@ -245,7 +386,9 @@ if __name__ == "__main__":
|
||||
)
|
||||
|
||||
# document the result
|
||||
with open(results_folder / "benchmark_results.md", "w") as f:
|
||||
md_file = "benchmark_results.md"
|
||||
json_file = "benchmark_results.json"
|
||||
with open(results_folder / md_file, "w") as f:
|
||||
results = read_markdown(
|
||||
"../.buildkite/nightly-benchmarks/"
|
||||
+ "performance-benchmarks-descriptions.md"
|
||||
@ -260,7 +403,7 @@ if __name__ == "__main__":
|
||||
f.write(results)
|
||||
|
||||
# document benchmarking results in json
|
||||
with open(results_folder / "benchmark_results.json", "w") as f:
|
||||
with open(results_folder / json_file, "w") as f:
|
||||
results = (
|
||||
latency_results.to_dict(orient="records")
|
||||
+ throughput_results.to_dict(orient="records")
|
||||
|
||||
@ -194,9 +194,11 @@ run_latency_tests() {
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
|
||||
if [ "$ON_CPU" == "1" ];then
|
||||
if [[ $numa_count -lt $tp ]]; then
|
||||
echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
||||
if [ "$ON_CPU" == "1" ]; then
|
||||
pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
|
||||
world_size=$(($tp*$pp))
|
||||
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
|
||||
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
else
|
||||
@ -261,9 +263,11 @@ run_throughput_tests() {
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
|
||||
if [ "$ON_CPU" == "1" ];then
|
||||
if [[ $numa_count -lt $tp ]]; then
|
||||
echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
||||
if [ "$ON_CPU" == "1" ]; then
|
||||
pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
|
||||
world_size=$(($tp*$pp))
|
||||
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
|
||||
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
else
|
||||
@ -329,12 +333,21 @@ run_serving_tests() {
|
||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||
echo "Running over qps list $qps_list"
|
||||
max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
|
||||
if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
|
||||
num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
|
||||
max_concurrency_list="[$num_prompts]"
|
||||
fi
|
||||
max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
|
||||
echo "Running over max concurrency list $max_concurrency_list"
|
||||
|
||||
# check if there is enough resources to run the test
|
||||
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
|
||||
if [ "$ON_CPU" == "1" ];then
|
||||
if [[ $numa_count -lt $tp ]]; then
|
||||
echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
||||
if [ "$ON_CPU" == "1" ]; then
|
||||
pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
|
||||
world_size=$(($tp*$pp))
|
||||
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
|
||||
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
else
|
||||
@ -390,35 +403,39 @@ run_serving_tests() {
|
||||
echo "now qps is $qps"
|
||||
fi
|
||||
|
||||
new_test_name=$test_name"_qps_"$qps
|
||||
# iterate over different max_concurrency
|
||||
for max_concurrency in $max_concurrency_list; do
|
||||
new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
|
||||
echo " new test name $new_test_name"
|
||||
# pass the tensor parallel size to the client so that it can be displayed
|
||||
# on the benchmark dashboard
|
||||
client_command="vllm bench serve \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
--max-concurrency $max_concurrency \
|
||||
--metadata "tensor_parallel_size=$tp" \
|
||||
$client_args $client_remote_args "
|
||||
|
||||
# pass the tensor parallel size to the client so that it can be displayed
|
||||
# on the benchmark dashboard
|
||||
client_command="vllm bench serve \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
--metadata "tensor_parallel_size=$tp" \
|
||||
$client_args $client_remote_args "
|
||||
echo "Running test case $test_name with qps $qps"
|
||||
echo "Client command: $client_command"
|
||||
|
||||
echo "Running test case $test_name with qps $qps"
|
||||
echo "Client command: $client_command"
|
||||
bash -c "$client_command"
|
||||
|
||||
bash -c "$client_command"
|
||||
|
||||
# record the benchmarking commands
|
||||
jq_output=$(jq -n \
|
||||
--arg server "$server_command" \
|
||||
--arg client "$client_command" \
|
||||
--arg gpu "$gpu_type" \
|
||||
'{
|
||||
server_command: $server,
|
||||
client_command: $client,
|
||||
gpu_type: $gpu
|
||||
}')
|
||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
||||
# record the benchmarking commands
|
||||
jq_output=$(jq -n \
|
||||
--arg server "$server_command" \
|
||||
--arg client "$client_command" \
|
||||
--arg gpu "$gpu_type" \
|
||||
'{
|
||||
server_command: $server,
|
||||
client_command: $client,
|
||||
gpu_type: $gpu
|
||||
}')
|
||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
||||
|
||||
done
|
||||
done
|
||||
|
||||
# clean up
|
||||
|
||||
@ -6,7 +6,7 @@
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"load_format": "dummy",
|
||||
"num_iters_warmup": 5,
|
||||
@ -20,7 +20,7 @@
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"load_format": "dummy",
|
||||
"num_iters_warmup": 5,
|
||||
|
||||
@ -1,7 +1,8 @@
|
||||
[
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
@ -10,7 +11,7 @@
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
@ -23,17 +24,17 @@
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"max_concurrency": 60,
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_sharegpt",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
@ -42,7 +43,7 @@
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
@ -55,17 +56,17 @@
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"max_concurrency": 60,
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_sharegpt",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
@ -74,7 +75,7 @@
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
@ -87,17 +88,17 @@
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"max_concurrency": 60,
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_128_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
@ -106,7 +107,7 @@
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
@ -120,19 +121,19 @@
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"max_concurrency": 1000,
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_128_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
@ -141,7 +142,7 @@
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
@ -155,19 +156,19 @@
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"max_concurrency": 1000,
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_128_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
@ -176,7 +177,7 @@
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
@ -190,13 +191,11 @@
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"max_concurrency": 1000,
|
||||
"num_prompts": 1000
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,7 +1,8 @@
|
||||
[
|
||||
{
|
||||
"test_name": "serving_llama8B_pp1_sharegpt",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
@ -10,7 +11,7 @@
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"pipeline_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
@ -23,17 +24,17 @@
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"max_concurrency": 60,
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_pp3_sharegpt",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
@ -42,7 +43,7 @@
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"pipeline_parallel_size": 3,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
@ -55,17 +56,17 @@
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"max_concurrency": 60,
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2pp6_sharegpt",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"test_name": "serving_llama8B_tp2pp3_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
@ -74,7 +75,7 @@
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 2,
|
||||
"pipeline_parallel_size": 3,
|
||||
"dtype": "bfloat16",
|
||||
@ -88,17 +89,17 @@
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"max_concurrency": 60,
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_pp1_random_128_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
@ -107,7 +108,7 @@
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"pipeline_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
@ -121,28 +122,28 @@
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"max_concurrency": 1000,
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_pp3_random_128_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL:": 1,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"pipeline_parallel_size": 3,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
@ -156,19 +157,19 @@
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"max_concurrency": 1000,
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2pp3_random_128_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
@ -177,7 +178,7 @@
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 2,
|
||||
"pipeline_parallel_size": 3,
|
||||
"dtype": "bfloat16",
|
||||
@ -192,13 +193,12 @@
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"max_concurrency": 1000,
|
||||
"num_prompts": 1000
|
||||
}
|
||||
}
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
@ -10,7 +11,7 @@
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
@ -23,17 +24,17 @@
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"max_concurrency": 60,
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_sharegpt",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
@ -42,7 +43,7 @@
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
@ -55,17 +56,17 @@
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"max_concurrency": 60,
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_sharegpt",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
@ -74,7 +75,7 @@
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
@ -87,17 +88,17 @@
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"max_concurrency": 60,
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_1024_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
@ -106,7 +107,7 @@
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
@ -120,19 +121,19 @@
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 1024,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"max_concurrency": 100,
|
||||
"num_prompts": 100
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_pp6_random_1024_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
@ -141,7 +142,7 @@
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"pipeline_parallel_size": 6,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
@ -155,13 +156,12 @@
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 1024,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"max_concurrency": 100,
|
||||
"num_prompts": 100
|
||||
}
|
||||
}
|
||||
|
||||
@ -6,7 +6,7 @@
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"load_format": "dummy",
|
||||
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
@ -21,7 +21,7 @@
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"load_format": "dummy",
|
||||
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
|
||||
20
.github/PULL_REQUEST_TEMPLATE.md
vendored
20
.github/PULL_REQUEST_TEMPLATE.md
vendored
@ -1,11 +1,5 @@
|
||||
# Essential Elements of an Effective PR Description Checklist
|
||||
|
||||
- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
|
||||
- [ ] The test plan, such as providing test command.
|
||||
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
|
||||
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
|
||||
|
||||
PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED.
|
||||
<!-- markdownlint-disable -->
|
||||
PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED.
|
||||
|
||||
## Purpose
|
||||
|
||||
@ -15,4 +9,14 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE B
|
||||
|
||||
## (Optional) Documentation Update
|
||||
|
||||
---
|
||||
<details>
|
||||
<summary> Essential Elements of an Effective PR Description Checklist </summary>
|
||||
|
||||
- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
|
||||
- [ ] The test plan, such as providing test command.
|
||||
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
|
||||
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
|
||||
</details>
|
||||
|
||||
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
|
||||
|
||||
8
.github/scripts/cleanup_pr_body.sh
vendored
8
.github/scripts/cleanup_pr_body.sh
vendored
@ -15,11 +15,11 @@ NEW=/tmp/new_pr_body.txt
|
||||
gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
|
||||
cp "${OLD}" "${NEW}"
|
||||
|
||||
# Remove "FIX #xxxx (*link existing issues this PR will resolve*)"
|
||||
sed -i '/FIX #xxxx.*$/d' "${NEW}"
|
||||
# Remove markdown comments (like the <!-- markdownlint-disable --> at the start)
|
||||
sed -i '/<!--.*-->$/d' "${NEW}"
|
||||
|
||||
# Remove "FILL IN THE PR DESCRIPTION HERE"
|
||||
sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}"
|
||||
# Remove "PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED."
|
||||
sed -i '/PLEASE FILL IN THE PR DESCRIPTION HERE.*$/d' "${NEW}"
|
||||
|
||||
# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
|
||||
sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
|
||||
|
||||
74
benchmarks/benchmark_block_pool.py
Normal file
74
benchmarks/benchmark_block_pool.py
Normal file
@ -0,0 +1,74 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import gc
|
||||
|
||||
from tabulate import tabulate
|
||||
|
||||
from benchmark_utils import TimeCollector
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.v1.core.block_pool import BlockPool
|
||||
|
||||
|
||||
def main(args):
|
||||
rows = []
|
||||
for allocate_block in args.allocate_blocks:
|
||||
# Enforce a GC collect ahead to minimize the impact among runs
|
||||
gc.collect()
|
||||
block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True)
|
||||
|
||||
get_blocks_times = TimeCollector(TimeCollector.US)
|
||||
free_blocks_times = TimeCollector(TimeCollector.US)
|
||||
for _ in range(args.num_iteration):
|
||||
with get_blocks_times:
|
||||
blocks = block_pool.get_new_blocks(allocate_block)
|
||||
with free_blocks_times:
|
||||
block_pool.free_blocks(blocks)
|
||||
|
||||
rows.append(
|
||||
[get_blocks_times.cnt, args.num_gpu_blocks, allocate_block]
|
||||
+ get_blocks_times.dump_avg_max()
|
||||
+ free_blocks_times.dump_avg_max()
|
||||
)
|
||||
|
||||
print(
|
||||
tabulate(
|
||||
rows,
|
||||
headers=[
|
||||
"Iterations",
|
||||
"Total\nBlocks",
|
||||
"Allocated\nBlocks",
|
||||
"Get Blocks\nAvg (us)",
|
||||
"Get Blocks\nMax (us)",
|
||||
"Free Blocks\nAvg (us)",
|
||||
"Free Blocks\nMax (us)",
|
||||
],
|
||||
tablefmt="grid",
|
||||
floatfmt=".3f",
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def invoke_main() -> None:
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark the performance of BlockPool for KV Cache."
|
||||
)
|
||||
parser.add_argument("--num-gpu-blocks", type=int, default=100000)
|
||||
parser.add_argument(
|
||||
"--num-iteration",
|
||||
type=int,
|
||||
default=1000,
|
||||
help="Number of iterations to run to stablize final data readings",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--allocate-blocks",
|
||||
type=int,
|
||||
nargs="*",
|
||||
default=[10, 50, 100, 500, 1000],
|
||||
help="Number of blocks to allocate",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
invoke_main() # pragma: no cover
|
||||
112
benchmarks/benchmark_ngram_proposer.py
Normal file
112
benchmarks/benchmark_ngram_proposer.py
Normal file
@ -0,0 +1,112 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import gc
|
||||
|
||||
import numpy as np
|
||||
from tabulate import tabulate
|
||||
|
||||
from benchmark_utils import TimeCollector
|
||||
from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
|
||||
|
||||
|
||||
def main(args):
|
||||
rows = []
|
||||
for max_ngram in args.max_ngram:
|
||||
collector = TimeCollector(TimeCollector.US)
|
||||
|
||||
model_config = ModelConfig(
|
||||
model="facebook/opt-125m",
|
||||
task="generate",
|
||||
max_model_len=args.num_token + args.num_spec_token,
|
||||
tokenizer="facebook/opt-125m",
|
||||
tokenizer_mode="auto",
|
||||
dtype="auto",
|
||||
seed=None,
|
||||
trust_remote_code=False,
|
||||
)
|
||||
proposer = NgramProposer(
|
||||
vllm_config=VllmConfig(
|
||||
model_config=model_config,
|
||||
speculative_config=SpeculativeConfig(
|
||||
prompt_lookup_min=args.min_ngram,
|
||||
prompt_lookup_max=max_ngram,
|
||||
num_speculative_tokens=args.num_spec_token,
|
||||
method="ngram",
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
# Warm up
|
||||
proposer.propose(np.random.randint(0, 20, (args.num_token,)))
|
||||
|
||||
gc.collect()
|
||||
for _ in range(args.num_iteration):
|
||||
tokens = np.random.randint(0, 20, (args.num_req, args.num_token))
|
||||
with collector:
|
||||
for i in range(args.num_req):
|
||||
proposer.propose(tokens[i, :])
|
||||
rows.append(
|
||||
[args.num_req, args.num_token, args.min_ngram, max_ngram]
|
||||
+ collector.dump_avg_max()
|
||||
)
|
||||
|
||||
print(
|
||||
tabulate(
|
||||
rows,
|
||||
headers=[
|
||||
"# Request",
|
||||
"# Token",
|
||||
"Min Ngram",
|
||||
"Max Ngram",
|
||||
"Avg (us)",
|
||||
"Max (us)",
|
||||
],
|
||||
tablefmt="grid",
|
||||
floatfmt=".3f",
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def invoke_main() -> None:
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark the performance of N-gram speculative decode drafting"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-iteration",
|
||||
type=int,
|
||||
default=100,
|
||||
help="Number of iterations to run to stablize final data readings",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-req", type=int, default=128, help="Number of requests in the batch"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-token", type=int, default=1500, help="Number of tokens for each request"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--min-ngram",
|
||||
type=int,
|
||||
default=3,
|
||||
help="Minimum n-gram to match",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-ngram",
|
||||
type=int,
|
||||
nargs="*",
|
||||
default=[5, 7, 10, 15, 20],
|
||||
help="Maximum n-gram to match",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-spec-token",
|
||||
type=int,
|
||||
default=3,
|
||||
help="Number of speculative tokens to generate",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
invoke_main() # pragma: no cover
|
||||
@ -1,11 +1,12 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
from typing import Any
|
||||
import time
|
||||
from types import TracebackType
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
|
||||
def convert_to_pytorch_benchmark_format(
|
||||
@ -72,3 +73,53 @@ def write_to_json(filename: str, records: list) -> None:
|
||||
cls=InfEncoder,
|
||||
default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
|
||||
)
|
||||
|
||||
|
||||
# Collect time and generate time metrics
|
||||
#
|
||||
# Example Usage:
|
||||
# collector = TimeCollector(TimeCollector.US)
|
||||
# for _ in range(total_iteration):
|
||||
# with collector:
|
||||
# ...
|
||||
# collector.dump_avg_max()
|
||||
class TimeCollector:
|
||||
NS: int = 1
|
||||
US: int = NS * 1000
|
||||
MS: int = US * 1000
|
||||
S: int = MS * 1000
|
||||
|
||||
def __init__(self, scale: int) -> None:
|
||||
self.cnt: int = 0
|
||||
self._sum: int = 0
|
||||
self._max: Optional[int] = None
|
||||
self.scale = scale
|
||||
self.start_time: int = time.monotonic_ns()
|
||||
|
||||
def collect(self, v: int) -> None:
|
||||
self.cnt += 1
|
||||
self._sum += v
|
||||
if self._max is None:
|
||||
self._max = v
|
||||
else:
|
||||
self._max = max(self._max, v)
|
||||
|
||||
def avg(self) -> Union[float, str]:
|
||||
return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
|
||||
|
||||
def max(self) -> Union[float, str]:
|
||||
return self._max / self.scale if self._max else "N/A"
|
||||
|
||||
def dump_avg_max(self) -> list[Union[float, str]]:
|
||||
return [self.avg(), self.max()]
|
||||
|
||||
def __enter__(self) -> None:
|
||||
self.start_time = time.monotonic_ns()
|
||||
|
||||
def __exit__(
|
||||
self,
|
||||
exc_type: Optional[type[BaseException]],
|
||||
exc_value: Optional[BaseException],
|
||||
exc_traceback: Optional[TracebackType],
|
||||
) -> None:
|
||||
self.collect(time.monotonic_ns() - self.start_time)
|
||||
|
||||
@ -1,108 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import gc
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
from tabulate import tabulate
|
||||
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.v1.core.block_pool import BlockPool
|
||||
|
||||
|
||||
class Metric:
|
||||
def __init__(self) -> None:
|
||||
self.cnt: int = 0
|
||||
self.sum_v: int = 0
|
||||
self.max_v: Optional[int] = None
|
||||
|
||||
def update(self, v: int) -> None:
|
||||
self.cnt += 1
|
||||
self.sum_v += v
|
||||
if self.max_v is None:
|
||||
self.max_v = v
|
||||
else:
|
||||
self.max_v = max(self.max_v, v)
|
||||
|
||||
def avg_v(self) -> float:
|
||||
return self.sum_v * 1.0 / self.cnt
|
||||
|
||||
|
||||
def main(args):
|
||||
rows = []
|
||||
for allocate_block in args.allocate_blocks:
|
||||
# Enforce a GC collect ahead to minimize the impact among runs
|
||||
gc.collect()
|
||||
block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True)
|
||||
|
||||
get_blocks_metric: Metric = Metric()
|
||||
free_blocks_metric: Metric = Metric()
|
||||
for _ in range(args.num_iteration):
|
||||
t1 = time.monotonic_ns()
|
||||
blocks = block_pool.get_new_blocks(allocate_block)
|
||||
t2 = time.monotonic_ns()
|
||||
block_pool.free_blocks(blocks)
|
||||
t3 = time.monotonic_ns()
|
||||
get_blocks_metric.update(t2 - t1)
|
||||
free_blocks_metric.update(t3 - t2)
|
||||
|
||||
if get_blocks_metric.max_v is not None and free_blocks_metric.max_v is not None:
|
||||
rows.append(
|
||||
[
|
||||
get_blocks_metric.cnt,
|
||||
args.num_gpu_blocks,
|
||||
allocate_block,
|
||||
get_blocks_metric.avg_v() / 1000000,
|
||||
get_blocks_metric.max_v / 1000000.0,
|
||||
free_blocks_metric.avg_v() / 1000000,
|
||||
free_blocks_metric.max_v / 1000000.0,
|
||||
]
|
||||
)
|
||||
else:
|
||||
print(
|
||||
"No valid metrics found."
|
||||
f" {get_blocks_metric.max_v=} {free_blocks_metric.max_v=}"
|
||||
)
|
||||
|
||||
print(
|
||||
tabulate(
|
||||
rows,
|
||||
headers=[
|
||||
"Iterations",
|
||||
"Total\nBlocks",
|
||||
"Allocated\nBlocks",
|
||||
"Get Blocks\nAvg (ms)",
|
||||
"Get Blocks\nMax (ms)",
|
||||
"Free Blocks\nAvg (ms)",
|
||||
"Free Blocks\nMax (ms)",
|
||||
],
|
||||
tablefmt="grid",
|
||||
floatfmt=".6f",
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def invoke_main() -> None:
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark the performance of BlockPool for KV Cache."
|
||||
)
|
||||
parser.add_argument("--num-gpu-blocks", type=int, default=100000)
|
||||
parser.add_argument(
|
||||
"--num-iteration",
|
||||
type=int,
|
||||
default=1000,
|
||||
help="Number of iterations to run to stablize final data readings",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--allocate-blocks",
|
||||
type=int,
|
||||
nargs="*",
|
||||
default=[10, 50, 100, 500, 1000],
|
||||
help="Number of blocks to allocate",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
invoke_main() # pragma: no cover
|
||||
@ -423,12 +423,27 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
|
||||
input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert);
|
||||
}
|
||||
|
||||
#ifndef USE_ROCM
|
||||
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
|
||||
static_assert(WARP_SIZE == 32 || WARP_SIZE == 64, \
|
||||
"Unsupported warp size. Only 32 and 64 are supported."); \
|
||||
static_assert(WARP_SIZE == 32, \
|
||||
"Unsupported warp size. Only 32 is supported for CUDA"); \
|
||||
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES>( \
|
||||
gating_output, nullptr, topk_weights, topk_indices, \
|
||||
token_expert_indices, num_tokens, topk, 0, num_experts, stream);
|
||||
#else
|
||||
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
|
||||
if (WARP_SIZE == 64) { \
|
||||
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES>( \
|
||||
gating_output, nullptr, topk_weights, topk_indices, \
|
||||
token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
|
||||
} else if (WARP_SIZE == 32) { \
|
||||
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES>( \
|
||||
gating_output, nullptr, topk_weights, topk_indices, \
|
||||
token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
|
||||
} else { \
|
||||
assert(false && "Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename IndType>
|
||||
void topkGatingSoftmaxKernelLauncher(
|
||||
@ -443,7 +458,9 @@ void topkGatingSoftmaxKernelLauncher(
|
||||
cudaStream_t stream) {
|
||||
static constexpr int WARPS_PER_TB = 4;
|
||||
static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16;
|
||||
#ifndef USE_ROCM
|
||||
static constexpr int BYTES_PER_LDG_MULTIPLE_64 = 8;
|
||||
#endif
|
||||
switch (num_experts) {
|
||||
case 1:
|
||||
LAUNCH_SOFTMAX(1, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
|
||||
@ -11,7 +11,7 @@ vLLM contains two sets of benchmarks:
|
||||
|
||||
The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM.
|
||||
|
||||
The latest performance results are hosted on the public [vLLM Performance Dashboard](https://perf.vllm.ai).
|
||||
The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
|
||||
|
||||
More information on the performance benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
|
||||
|
||||
|
||||
@ -18,7 +18,7 @@ vLLM supports the following hardware platforms:
|
||||
## Hardware Plugins
|
||||
|
||||
The backends below live **outside** the main `vllm` repository and follow the
|
||||
[Hardware-Pluggable RFC](../design/plugin_system.md).
|
||||
[Hardware-Pluggable RFC](../../design/plugin_system.md).
|
||||
|
||||
| Accelerator | PyPI / package | Repository |
|
||||
|-------------|----------------|------------|
|
||||
|
||||
@ -615,7 +615,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ |
|
||||
| `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.5V`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
|
||||
|
||||
@ -24,18 +24,7 @@ ACTIVE_MM_LORA_RESPONSE = "Spoken text: The first words I spoke in the original
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def monkeypatch_module():
|
||||
from _pytest.monkeypatch import MonkeyPatch
|
||||
mpatch = MonkeyPatch()
|
||||
yield mpatch
|
||||
mpatch.undo()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", params=[False, True])
|
||||
def multimodal_server(request, monkeypatch_module): # noqa: F811
|
||||
|
||||
use_v1 = request.param
|
||||
monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
|
||||
def multimodal_server(): # noqa: F811
|
||||
|
||||
args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
|
||||
@ -54,38 +54,54 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
|
||||
op = context.operation
|
||||
assert op is not None
|
||||
|
||||
def no_file_type(case: schemathesis.models.Case):
|
||||
def no_invalid_types(case: schemathesis.models.Case):
|
||||
"""
|
||||
This filter skips test cases for the `POST /tokenize` endpoint where the
|
||||
HTTP request body uses `"type": "file"` in any message's content.
|
||||
We expect these cases to fail because that type isn't implemented here
|
||||
https://github.com/vllm-project/vllm/blob/0b34593017953051b3225b1483ce0f4670e3eb0e/vllm/entrypoints/chat_utils.py#L1038-L1095
|
||||
This filter skips test cases with invalid data that schemathesis
|
||||
incorrectly generates due to permissive schema configurations.
|
||||
|
||||
1. Skips `POST /tokenize` endpoint cases with `"type": "file"` in
|
||||
message content, which isn't implemented.
|
||||
|
||||
2. Skips tool_calls with `"type": "custom"` which schemathesis
|
||||
incorrectly generates instead of the valid `"type": "function"`.
|
||||
|
||||
Example test cases that are skipped:
|
||||
curl -X POST -H 'Content-Type: application/json' \
|
||||
-d '{"messages": [{"role": "assistant"}, {"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
|
||||
-d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
|
||||
http://localhost:8000/tokenize
|
||||
|
||||
curl -X POST -H 'Content-Type: application/json' \
|
||||
-d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
|
||||
http://localhost:8000/tokenize
|
||||
-d '{"messages": [{"role": "assistant", "tool_calls": [{"custom": {"input": "", "name": ""}, "id": "", "type": "custom"}]}]}' \
|
||||
http://localhost:8000/v1/chat/completions
|
||||
""" # noqa: E501
|
||||
if (op.method.lower() == "post" and op.path == "/tokenize"
|
||||
and hasattr(case, "body") and isinstance(case.body, dict)
|
||||
if (hasattr(case, "body") and isinstance(case.body, dict)
|
||||
and "messages" in case.body
|
||||
and isinstance(case.body["messages"], list)
|
||||
and len(case.body["messages"]) > 0):
|
||||
|
||||
for message in case.body["messages"]:
|
||||
if not isinstance(message, dict):
|
||||
continue
|
||||
content = message.get("content", [])
|
||||
if not isinstance(content, list) or len(content) == 0:
|
||||
continue
|
||||
if any(item.get("type") == "file" for item in content):
|
||||
return False
|
||||
|
||||
# Check for invalid file type in tokenize endpoint
|
||||
if op.method.lower() == "post" and op.path == "/tokenize":
|
||||
content = message.get("content", [])
|
||||
if (isinstance(content, list) and len(content) > 0 and any(
|
||||
item.get("type") == "file" for item in content)):
|
||||
return False
|
||||
|
||||
# Check for invalid tool_calls with non-function types
|
||||
tool_calls = message.get("tool_calls", [])
|
||||
if isinstance(tool_calls, list):
|
||||
for tool_call in tool_calls:
|
||||
if isinstance(tool_call, dict):
|
||||
if tool_call.get("type") != "function":
|
||||
return False
|
||||
if "custom" in tool_call:
|
||||
return False
|
||||
return True
|
||||
|
||||
return strategy.filter(no_file_type)
|
||||
return strategy.filter(no_invalid_types)
|
||||
|
||||
|
||||
@schema.parametrize()
|
||||
|
||||
@ -80,9 +80,6 @@ async def test_bad_requests(mary_had_lamb):
|
||||
async def test_long_audio_request(mary_had_lamb, model_name):
|
||||
server_args = ["--enforce-eager"]
|
||||
|
||||
if model_name.startswith("openai"):
|
||||
return
|
||||
|
||||
mary_had_lamb.seek(0)
|
||||
audio, sr = librosa.load(mary_had_lamb)
|
||||
# Add small silence after each audio for repeatability in the split process
|
||||
|
||||
@ -36,7 +36,7 @@ MODELS = [
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo) -> None:
|
||||
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
|
||||
mteb_test_embed_models(hf_runner, vllm_runner, model_info, atol=0.02)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
|
||||
@ -46,7 +46,7 @@ MODELS = [
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo) -> None:
|
||||
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
|
||||
mteb_test_embed_models(hf_runner, vllm_runner, model_info, atol=0.02)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
|
||||
@ -561,7 +561,7 @@ VLM_TEST_SETTINGS = {
|
||||
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
|
||||
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
||||
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
|
||||
# FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
|
||||
# FIXME: https://huggingface.co/openbmb/MiniCPM-o-2_6/discussions/49
|
||||
marks=[pytest.mark.skip("HF import fails")],
|
||||
),
|
||||
"minicpmv_26": VLMTestInfo(
|
||||
@ -574,8 +574,6 @@ VLM_TEST_SETTINGS = {
|
||||
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
|
||||
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
||||
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
|
||||
# FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
|
||||
marks=[pytest.mark.skip("HF import fails")],
|
||||
),
|
||||
"minimax_vl_01": VLMTestInfo(
|
||||
models=["MiniMaxAI/MiniMax-VL-01"],
|
||||
@ -611,18 +609,6 @@ VLM_TEST_SETTINGS = {
|
||||
patch_hf_runner=model_utils.ovis_patch_hf_runner,
|
||||
marks=[large_gpu_mark(min_gb=32)],
|
||||
),
|
||||
"ovis1_6": VLMTestInfo(
|
||||
models=["AIDC-AI/Ovis1.6-Llama3.2-3B"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful and honest multimodal assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
dtype="half",
|
||||
# use sdpa mode for hf runner since ovis2 didn't work with flash_attn
|
||||
hf_model_kwargs={"llm_attn_implementation": "sdpa"},
|
||||
patch_hf_runner=model_utils.ovis_patch_hf_runner,
|
||||
),
|
||||
"ovis2": VLMTestInfo(
|
||||
models=["AIDC-AI/Ovis2-1B"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
|
||||
@ -153,4 +153,4 @@ def test_model_tensor_schema(model_arch: str, vllm_runner: type[VllmRunner],
|
||||
if hasattr(model, method_name):
|
||||
getattr(model, method_name)(**mm_kwargs)
|
||||
|
||||
vllm_model.apply_model(validate_model_input)
|
||||
vllm_model.apply_model(validate_model_input)
|
||||
@ -195,7 +195,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
"GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2",
|
||||
{"alias": "gpt2"}),
|
||||
"GPTBigCodeForCausalLM": _HfExamplesInfo("bigcode/starcoder",
|
||||
{"tiny": "bigcode/tiny_starcoder_py"}), # noqa: E501
|
||||
extras={"tiny": "bigcode/tiny_starcoder_py"}, # noqa: E501
|
||||
min_transformers_version="4.55.1"),
|
||||
"GPTJForCausalLM": _HfExamplesInfo("Milos/slovak-gpt-j-405M",
|
||||
{"6b": "EleutherAI/gpt-j-6b"}),
|
||||
"GPTNeoXForCausalLM": _HfExamplesInfo("EleutherAI/pythia-70m",
|
||||
|
||||
@ -11,7 +11,8 @@ from vllm import LLM, SamplingParams
|
||||
from vllm.config import CompilationConfig, CompilationLevel
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.forward_context import get_forward_context
|
||||
from vllm.model_executor.models.gemma3n import Gemma3nForConditionalGeneration
|
||||
from vllm.model_executor.models.gemma3n_mm import (
|
||||
Gemma3nForConditionalGeneration)
|
||||
from vllm.model_executor.models.registry import ModelRegistry
|
||||
from vllm.model_executor.models.utils import extract_layer_index
|
||||
from vllm.sequence import IntermediateTensors
|
||||
@ -32,12 +33,13 @@ class TestGemma3nForConditionalGeneration(Gemma3nForConditionalGeneration):
|
||||
inputs_embeds: Optional[torch.Tensor] = None,
|
||||
**kwargs,
|
||||
) -> Union[torch.Tensor, IntermediateTensors]:
|
||||
hidden_states = self.model(input_ids, positions, intermediate_tensors,
|
||||
inputs_embeds, **kwargs)
|
||||
hidden_states = super().forward(input_ids, positions,
|
||||
intermediate_tensors, inputs_embeds,
|
||||
**kwargs)
|
||||
attn_metadata = get_forward_context().attn_metadata
|
||||
# attn_metadata is None during dummy runs
|
||||
if (attn_metadata is not None
|
||||
and self.cache_config.kv_sharing_fast_prefill):
|
||||
and self.language_model.cache_config.kv_sharing_fast_prefill):
|
||||
assert isinstance(attn_metadata, dict) # true in V1
|
||||
# Gemma3n-E2B has 30 layers, with last 20 layers being
|
||||
# cross-decoder layers. Check attention metadata is correct
|
||||
@ -52,7 +54,7 @@ class TestGemma3nForConditionalGeneration(Gemma3nForConditionalGeneration):
|
||||
|
||||
# Last layer will be a KV sharing layer
|
||||
layer_attn_metadata = attn_metadata[
|
||||
self.model.language_model.layers[-1].self_attn.attn.layer_name]
|
||||
self.language_model.model.layers[-1].self_attn.attn.layer_name]
|
||||
logits_indices_padded = (layer_attn_metadata.logits_indices_padded)
|
||||
assert logits_indices_padded is not None
|
||||
num_logits_indices = layer_attn_metadata.num_logits_indices
|
||||
|
||||
@ -146,7 +146,11 @@ def test_ngram_correctness(
|
||||
marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
|
||||
],
|
||||
ids=[
|
||||
"qwen3_eagle3", "llama3_eagle", "llama3_eagle3", "llama4_eagle",
|
||||
# TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611 # noqa: E501
|
||||
# "qwen3_eagle3",
|
||||
"llama3_eagle",
|
||||
"llama3_eagle3",
|
||||
"llama4_eagle",
|
||||
"llama4_eagle_mm"
|
||||
])
|
||||
@pytest.mark.parametrize("attn_backend",
|
||||
|
||||
@ -1,43 +1,63 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import numpy as np
|
||||
|
||||
from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
|
||||
from vllm.v1.spec_decode.ngram_proposer import (NgramProposer,
|
||||
_find_subarray_kmp,
|
||||
_kmp_lps_array)
|
||||
from vllm.v1.spec_decode.ngram_proposer import (
|
||||
NgramProposer, _find_longest_matched_ngram_and_propose_tokens)
|
||||
|
||||
|
||||
def test_kmp_lps_array():
|
||||
np.testing.assert_array_equal(_kmp_lps_array(np.array([])), np.array([]))
|
||||
np.testing.assert_array_equal(_kmp_lps_array(np.array([1])), np.array([0]))
|
||||
np.testing.assert_array_equal(_kmp_lps_array(np.array([1, 1, 1])),
|
||||
np.array([0, 1, 2]))
|
||||
np.testing.assert_array_equal(_kmp_lps_array(np.array([1, 2, 3, 4])),
|
||||
np.array([0, 0, 0, 0]))
|
||||
np.testing.assert_array_equal(_kmp_lps_array(np.array([1, 2, 1, 2, 3])),
|
||||
np.array([0, 0, 1, 2, 0]))
|
||||
def test_find_longest_matched_ngram_and_propose_tokens():
|
||||
tokens = np.array([1, 2, 3, 4, 1, 2, 3, 5, 6])
|
||||
assert _find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens,
|
||||
min_ngram=2,
|
||||
max_ngram=2,
|
||||
max_model_len=1024,
|
||||
k=2) is None
|
||||
|
||||
tokens = np.array([1, 2, 3, 4, 1, 2, 3])
|
||||
np.testing.assert_array_equal(
|
||||
_find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens,
|
||||
min_ngram=2,
|
||||
max_ngram=2,
|
||||
max_model_len=1024,
|
||||
k=3),
|
||||
np.array([4, 1, 2]))
|
||||
np.testing.assert_array_equal(
|
||||
_find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens,
|
||||
min_ngram=2,
|
||||
max_ngram=2,
|
||||
max_model_len=1024,
|
||||
k=2), np.array([4, 1]))
|
||||
np.testing.assert_array_equal(
|
||||
_find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens,
|
||||
min_ngram=1,
|
||||
max_ngram=1,
|
||||
max_model_len=1024,
|
||||
k=3),
|
||||
np.array([4, 1, 2]))
|
||||
np.testing.assert_array_equal(
|
||||
_find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens,
|
||||
min_ngram=1,
|
||||
max_ngram=1,
|
||||
max_model_len=1024,
|
||||
k=2), np.array([4, 1]))
|
||||
|
||||
def test_find_subarray_kmp():
|
||||
X = np.array([1, 2, 3, 4, 1, 2, 3, 5, 6])
|
||||
assert _find_subarray_kmp(X, 2, 2) is None
|
||||
X = np.array([1, 2, 3, 4, 1, 2, 3])
|
||||
np.testing.assert_array_equal(_find_subarray_kmp(X, 2, 3),
|
||||
np.array([4, 1, 2]))
|
||||
np.testing.assert_array_equal(_find_subarray_kmp(X, 2, 2), np.array([4,
|
||||
1]))
|
||||
np.testing.assert_array_equal(_find_subarray_kmp(X, 1, 3),
|
||||
np.array([4, 1, 2]))
|
||||
np.testing.assert_array_equal(_find_subarray_kmp(X, 1, 2), np.array([4,
|
||||
1]))
|
||||
X = np.array([1, 3, 6, 2, 3, 4, 1, 2, 3])
|
||||
np.testing.assert_array_equal(_find_subarray_kmp(X, 2, 3),
|
||||
np.array([4, 1, 2]))
|
||||
tokens = np.array([1, 3, 6, 2, 3, 4, 1, 2, 3])
|
||||
np.testing.assert_array_equal(
|
||||
_find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens,
|
||||
min_ngram=2,
|
||||
max_ngram=2,
|
||||
max_model_len=1024,
|
||||
k=3),
|
||||
np.array([4, 1, 2]))
|
||||
# Return on the first match
|
||||
np.testing.assert_array_equal(_find_subarray_kmp(X, 1, 3),
|
||||
np.array([6, 2, 3]))
|
||||
np.testing.assert_array_equal(
|
||||
_find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens,
|
||||
min_ngram=1,
|
||||
max_ngram=1,
|
||||
max_model_len=1024,
|
||||
k=2), np.array([6, 2]))
|
||||
|
||||
|
||||
def test_ngram_proposer():
|
||||
@ -56,27 +76,35 @@ def test_ngram_proposer():
|
||||
|
||||
# No match.
|
||||
result = ngram_proposer(
|
||||
2, 2, 2).propose(context_token_ids=np.array([1, 2, 3, 4, 5]))
|
||||
min_n=2, max_n=2,
|
||||
k=2).propose(context_token_ids=np.array([1, 2, 3, 4, 5]))
|
||||
assert result is None
|
||||
|
||||
# No match for 4-gram.
|
||||
result = ngram_proposer(
|
||||
4, 4, 2).propose(context_token_ids=np.array([1, 2, 3, 4, 1, 2, 3]))
|
||||
min_n=4, max_n=4,
|
||||
k=2).propose(context_token_ids=np.array([1, 2, 3, 4, 1, 2, 3]))
|
||||
assert result is None
|
||||
|
||||
# No match for 4-gram but match for 3-gram.
|
||||
result = ngram_proposer(
|
||||
3, 4, 2).propose(context_token_ids=np.array([1, 2, 3, 4, 1, 2, 3]))
|
||||
min_n=3, max_n=4,
|
||||
k=2).propose(context_token_ids=np.array([1, 2, 3, 4, 1, 2, 3]))
|
||||
assert np.array_equal(result, np.array([4, 1]))
|
||||
|
||||
# Match for both 4-gram and 3-gram.
|
||||
# In this case, the proposer should return the 4-gram match.
|
||||
result = ngram_proposer(3, 4, 2).propose(
|
||||
result = ngram_proposer(min_n=3, max_n=4, k=2).propose(
|
||||
context_token_ids=np.array([2, 3, 4, 5, 1, 2, 3, 4, 1, 2, 3, 4]))
|
||||
assert np.array_equal(result, np.array([1, 2])) # Not [5, 1]
|
||||
|
||||
# Match for 2-gram and 3-gram, but not 4-gram.
|
||||
result = ngram_proposer(
|
||||
2, 4,
|
||||
2).propose(context_token_ids=np.array([3, 4, 5, 2, 3, 4, 1, 2, 3, 4]))
|
||||
result = ngram_proposer(min_n=2, max_n=4, k=2).propose(
|
||||
context_token_ids=np.array([3, 4, 5, 2, 3, 4, 1, 2, 3, 4]))
|
||||
assert np.array_equal(result, np.array([1, 2])) # Not [5, 2]
|
||||
|
||||
# Multiple 3-gram matched, but always pick the first one.
|
||||
result = ngram_proposer(
|
||||
min_n=3, max_n=3, k=2).propose(context_token_ids=np.array(
|
||||
[1, 2, 3, 100, 1, 2, 3, 200, 1, 2, 3, 300, 1, 2, 3]))
|
||||
assert np.array_equal(result, np.array([100, 1]))
|
||||
|
||||
@ -297,7 +297,7 @@ class CustomAllreduce:
|
||||
@staticmethod
|
||||
def free_shared_buffer(pointers: list[int],
|
||||
group: Optional[ProcessGroup] = None,
|
||||
rank: Optional[int] = 0) -> None:
|
||||
rank: Optional[int] = None) -> None:
|
||||
if rank is None:
|
||||
rank = dist.get_rank(group=group)
|
||||
if ops is not None:
|
||||
|
||||
@ -711,7 +711,7 @@ class EngineArgs:
|
||||
"--mm-processor-cache-gb",
|
||||
**multimodal_kwargs["mm_processor_cache_gb"])
|
||||
multimodal_group.add_argument("--disable-mm-preprocessor-cache",
|
||||
type=bool,
|
||||
action="store_true",
|
||||
deprecated=True)
|
||||
multimodal_group.add_argument(
|
||||
"--interleave-mm-strings",
|
||||
|
||||
@ -1092,6 +1092,7 @@ class AsyncLLMEngine(EngineClient):
|
||||
self.engine.reset_prefix_cache(device)
|
||||
|
||||
async def sleep(self, level: int = 1) -> None:
|
||||
await self.reset_prefix_cache()
|
||||
self.engine.sleep(level)
|
||||
|
||||
async def wake_up(self, tags: Optional[list[str]] = None) -> None:
|
||||
|
||||
@ -41,7 +41,6 @@ from vllm.model_executor.layers.fused_moe import FusedMoE
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
|
||||
QKVParallelLinear,
|
||||
ReplicatedLinear,
|
||||
RowParallelLinear)
|
||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
@ -118,14 +117,15 @@ class Glm4MoE(nn.Module):
|
||||
if config.hidden_act != "silu":
|
||||
raise ValueError(f"Unsupported activation: {config.hidden_act}. "
|
||||
"Only silu is supported for now.")
|
||||
|
||||
self.gate = ReplicatedLinear(config.hidden_size,
|
||||
config.n_routed_experts,
|
||||
bias=False,
|
||||
quant_config=None,
|
||||
params_dtype=torch.float32,
|
||||
prefix=f"{prefix}.gate")
|
||||
|
||||
# NOTE In the transformers implementation, the gate isn't an nn.Linear,
|
||||
# so we cannot use ReplicatedLinear here.
|
||||
# See: https://github.com/huggingface/transformers/blob/v4.55.1/src/transformers/models/glm4_moe/modeling_glm4_moe.py#L260
|
||||
self.gate = nn.Linear(
|
||||
config.hidden_size,
|
||||
config.n_routed_experts,
|
||||
bias=False,
|
||||
dtype=torch.float32,
|
||||
)
|
||||
self.gate.e_score_correction_bias = nn.Parameter(
|
||||
torch.empty(config.n_routed_experts, dtype=torch.float32))
|
||||
|
||||
@ -181,7 +181,7 @@ class Glm4MoE(nn.Module):
|
||||
|
||||
if self.n_shared_experts is not None:
|
||||
shared_output = self.shared_experts(hidden_states)
|
||||
router_logits, _ = self.gate(hidden_states.to(dtype=torch.float32))
|
||||
router_logits = self.gate(hidden_states.to(dtype=torch.float32))
|
||||
final_hidden_states = self.experts(
|
||||
hidden_states=hidden_states,
|
||||
router_logits=router_logits) * self.routed_scaling_factor
|
||||
|
||||
@ -72,8 +72,9 @@ class PixtralHFImagePixelInputs(TensorSchema):
|
||||
in which case the data is passed as a list instead of a batched tensor.
|
||||
"""
|
||||
type: Literal["pixel_values_pixtral"] = "pixel_values_pixtral"
|
||||
pixel_values: Annotated[Union[torch.Tensor, list[torch.Tensor]],
|
||||
TensorShape("bn", "c", "h", "w")]
|
||||
pixel_values: Annotated[
|
||||
Union[torch.Tensor, list[torch.Tensor]],
|
||||
TensorShape("bn", "c", "h", "w", dynamic_dims={"h", "w"})]
|
||||
|
||||
|
||||
class LlavaImageEmbeddingInputs(TensorSchema):
|
||||
|
||||
@ -91,7 +91,7 @@ class MiniCPMVImagePixelInputs(TensorSchema):
|
||||
# batched tensor.
|
||||
pixel_values: Annotated[
|
||||
list[torch.Tensor],
|
||||
TensorShape("bns", "c", "h", "w"),
|
||||
TensorShape("bns", "c", "h", "w", dynamic_dims={"h", "w"}),
|
||||
]
|
||||
tgt_sizes: Annotated[
|
||||
torch.Tensor,
|
||||
|
||||
@ -449,23 +449,6 @@ def get_config(
|
||||
raise e
|
||||
config = _maybe_remap_hf_config_attrs(config)
|
||||
|
||||
# Phi4Flash misuses this config as list[int]. Convert it to int and add
|
||||
# the layer_types list[str] to make it HF compatible
|
||||
if (config.model_type == "phi4flash"):
|
||||
# TODO: Remove after the following PR is merged:
|
||||
# https://huggingface.co/microsoft/Phi-4-mini-flash-reasoning/discussions/6
|
||||
if not hasattr(config, "layer_types"):
|
||||
config.layer_types = [
|
||||
"sliding_attention" if i < config.num_hidden_layers // 2
|
||||
and i % 2 == 1 else "full_attention"
|
||||
for i in range(config.num_hidden_layers)
|
||||
]
|
||||
# TODO: Remove after the following PR is merged:
|
||||
# https://huggingface.co/microsoft/Phi-4-mini-flash-reasoning/discussions/7
|
||||
if isinstance(config.sliding_window, list):
|
||||
config.sliding_window = next(
|
||||
filter(None, config.sliding_window), None)
|
||||
|
||||
elif config_format == ConfigFormat.MISTRAL:
|
||||
# This function loads a params.json config which
|
||||
# should be used when loading models in mistral format
|
||||
|
||||
@ -2,13 +2,13 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union, cast
|
||||
|
||||
import huggingface_hub
|
||||
import regex as re
|
||||
from huggingface_hub import HfApi, hf_hub_download
|
||||
from transformers.tokenization_utils_base import BatchEncoding
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.tokenizer_base import TokenizerBase
|
||||
@ -27,11 +27,6 @@ if TYPE_CHECKING:
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Encoding:
|
||||
input_ids: Union[list[int], list[list[int]]]
|
||||
|
||||
|
||||
def maybe_serialize_tool_calls(request: "ChatCompletionRequest"):
|
||||
# SEE: https://github.com/vllm-project/vllm/pull/9951
|
||||
# Credits go to: @gcalmettes
|
||||
@ -359,7 +354,7 @@ class MistralTokenizer(TokenizerBase):
|
||||
# For str, single prompt text
|
||||
else:
|
||||
input_ids = self.encode_one(text, truncation, max_length)
|
||||
return Encoding(input_ids=input_ids)
|
||||
return BatchEncoding({"input_ids": input_ids})
|
||||
|
||||
def get_vocab(self) -> dict[str, int]:
|
||||
# NB: the dictionary form of the vocabulary collapses token ids that map
|
||||
|
||||
@ -709,8 +709,28 @@ class AsyncMicrobatchTokenizer:
|
||||
|
||||
|
||||
def cancel_task_threadsafe(task: Task):
|
||||
if task and not task.done() and not (loop := task.get_loop()).is_closed():
|
||||
loop.call_soon_threadsafe(task.cancel)
|
||||
if task and not task.done():
|
||||
run_in_loop(task.get_loop(), task.cancel)
|
||||
|
||||
|
||||
def close_sockets(sockets: Sequence[Union[zmq.Socket, zmq.asyncio.Socket]]):
|
||||
for sock in sockets:
|
||||
if sock is not None:
|
||||
sock.close(linger=0)
|
||||
|
||||
|
||||
def run_in_loop(loop: AbstractEventLoop, function: Callable, *args):
|
||||
if in_loop(loop):
|
||||
function(*args)
|
||||
elif not loop.is_closed():
|
||||
loop.call_soon_threadsafe(function, *args)
|
||||
|
||||
|
||||
def in_loop(event_loop: AbstractEventLoop) -> bool:
|
||||
try:
|
||||
return asyncio.get_running_loop() == event_loop
|
||||
except RuntimeError:
|
||||
return False
|
||||
|
||||
|
||||
def make_async(
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Annotated, Any, Union, get_args, get_origin, get_type_hints
|
||||
from typing import (Annotated, Any, Optional, Union, get_args, get_origin,
|
||||
get_type_hints)
|
||||
|
||||
import torch
|
||||
|
||||
@ -11,9 +12,13 @@ logger = init_logger(__name__)
|
||||
|
||||
class TensorShape:
|
||||
|
||||
def __init__(self,
|
||||
*dims: Union[int, str],
|
||||
dynamic_dims: set[str, ...] = None) -> None:
|
||||
def __init__(
|
||||
self,
|
||||
*dims: Union[int, str],
|
||||
dynamic_dims: Optional[set[str]] = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.dims = dims
|
||||
self.dynamic_dims = dynamic_dims if dynamic_dims else set()
|
||||
|
||||
@ -44,11 +49,15 @@ class TensorShape:
|
||||
|
||||
class TensorSchema:
|
||||
|
||||
def __init__(self,
|
||||
*,
|
||||
validate: bool = True,
|
||||
resolve_bindings: dict[str, int] = None,
|
||||
**kwargs: Any) -> None:
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
validate: bool = True,
|
||||
resolve_bindings: Optional[dict[str, int]] = None,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
self._resolve_bindings = resolve_bindings if resolve_bindings else {}
|
||||
|
||||
for key, value in kwargs.items():
|
||||
@ -57,16 +66,19 @@ class TensorSchema:
|
||||
if validate:
|
||||
self.validate()
|
||||
|
||||
def __getitem__(self, item) -> Any:
|
||||
return getattr(self, item)
|
||||
def __getitem__(self, key: str) -> Any:
|
||||
return getattr(self, key)
|
||||
|
||||
def get(self, item, default=None) -> Any:
|
||||
return getattr(self, item, default)
|
||||
def get(self, key: str, default: Any = None) -> Any:
|
||||
return getattr(self, key, default)
|
||||
|
||||
def _match_shape_with_dynamic(self, actual: tuple[int, ...],
|
||||
reference: tuple[int, ...],
|
||||
expected_shape: tuple[Union[int, str], ...],
|
||||
dynamic_dims: set[str, ...]) -> bool:
|
||||
def _match_shape_with_dynamic(
|
||||
self,
|
||||
actual: tuple[int, ...],
|
||||
reference: tuple[int, ...],
|
||||
expected_shape: tuple[Union[int, str], ...],
|
||||
dynamic_dims: set[str],
|
||||
) -> bool:
|
||||
if len(actual) != len(reference) or len(actual) > len(expected_shape):
|
||||
return False
|
||||
|
||||
@ -84,10 +96,12 @@ class TensorSchema:
|
||||
return True
|
||||
|
||||
def _validate_nested_tensors(
|
||||
self, value: Union[list[torch.Tensor, ...],
|
||||
tuple[torch.Tensor, ...]], field_name: str,
|
||||
expected_shape: tuple[Union[int, str], ...],
|
||||
dynamic_dims: set[str, ...]) -> tuple[int, ...]:
|
||||
self,
|
||||
value: Union[list[torch.Tensor], tuple[torch.Tensor, ...]],
|
||||
field_name: str,
|
||||
expected_shape: tuple[Union[int, str], ...],
|
||||
dynamic_dims: set[str],
|
||||
) -> tuple[int, ...]:
|
||||
"""Validate a list/tuple of tensors and return the actual shape."""
|
||||
# Ensure all tensors in the list have the same
|
||||
# shape, besides dynamic dimensions
|
||||
@ -110,12 +124,14 @@ class TensorSchema:
|
||||
# shape = (len(list), *tensor.shape)
|
||||
return (len(value), ) + first.shape
|
||||
|
||||
def _validate_tensor_shape_expected(self, actual_shape: tuple[int, ...],
|
||||
expected_shape: tuple[Union[int, str],
|
||||
...],
|
||||
field_name: str, shape_env: dict[str,
|
||||
int],
|
||||
dynamic_dims: set[str, ...]) -> None:
|
||||
def _validate_tensor_shape_expected(
|
||||
self,
|
||||
actual_shape: tuple[int, ...],
|
||||
expected_shape: tuple[Union[int, str], ...],
|
||||
field_name: str,
|
||||
shape_env: dict[str, int],
|
||||
dynamic_dims: set[str],
|
||||
) -> None:
|
||||
"""Validate that the actual tensor shape matches the expected shape."""
|
||||
|
||||
if len(actual_shape) != len(expected_shape):
|
||||
|
||||
@ -576,6 +576,7 @@ class AsyncLLM(EngineClient):
|
||||
await self.engine_core.reset_prefix_cache_async()
|
||||
|
||||
async def sleep(self, level: int = 1) -> None:
|
||||
await self.reset_prefix_cache()
|
||||
await self.engine_core.sleep_async(level)
|
||||
|
||||
async def wake_up(self, tags: Optional[list[str]] = None) -> None:
|
||||
|
||||
@ -23,8 +23,8 @@ from vllm.config import VllmConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.tasks import SupportedTask
|
||||
from vllm.utils import (cancel_task_threadsafe, get_open_port,
|
||||
get_open_zmq_inproc_path, make_zmq_socket)
|
||||
from vllm.utils import (close_sockets, get_open_port, get_open_zmq_inproc_path,
|
||||
in_loop, make_zmq_socket)
|
||||
from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest,
|
||||
EngineCoreRequestType,
|
||||
ReconfigureDistributedRequest, ReconfigureRankType,
|
||||
@ -317,7 +317,7 @@ class BackgroundResources:
|
||||
"""Used as a finalizer for clean shutdown, avoiding
|
||||
circular reference back to the client object."""
|
||||
|
||||
ctx: Union[zmq.Context]
|
||||
ctx: zmq.Context
|
||||
# If CoreEngineProcManager, it manages local engines;
|
||||
# if CoreEngineActorManager, it manages all engines.
|
||||
engine_manager: Optional[Union[CoreEngineProcManager,
|
||||
@ -326,6 +326,8 @@ class BackgroundResources:
|
||||
output_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None
|
||||
input_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None
|
||||
first_req_send_socket: Optional[zmq.asyncio.Socket] = None
|
||||
first_req_rcv_socket: Optional[zmq.asyncio.Socket] = None
|
||||
stats_update_socket: Optional[zmq.asyncio.Socket] = None
|
||||
output_queue_task: Optional[asyncio.Task] = None
|
||||
stats_update_task: Optional[asyncio.Task] = None
|
||||
shutdown_path: Optional[str] = None
|
||||
@ -343,23 +345,47 @@ class BackgroundResources:
|
||||
if self.coordinator is not None:
|
||||
self.coordinator.close()
|
||||
|
||||
cancel_task_threadsafe(self.output_queue_task)
|
||||
cancel_task_threadsafe(self.stats_update_task)
|
||||
if isinstance(self.output_socket, zmq.asyncio.Socket):
|
||||
# Async case.
|
||||
loop = self.output_socket._get_loop()
|
||||
asyncio.get_running_loop()
|
||||
sockets = (self.output_socket, self.input_socket,
|
||||
self.first_req_send_socket, self.first_req_rcv_socket,
|
||||
self.stats_update_socket)
|
||||
|
||||
# ZMQ context termination can hang if the sockets
|
||||
# aren't explicitly closed first.
|
||||
for socket in (self.output_socket, self.input_socket,
|
||||
self.first_req_send_socket):
|
||||
if socket is not None:
|
||||
socket.close(linger=0)
|
||||
tasks = (self.output_queue_task, self.stats_update_task)
|
||||
|
||||
if self.shutdown_path is not None:
|
||||
# We must ensure that the sync output socket is
|
||||
# closed cleanly in its own thread.
|
||||
with self.ctx.socket(zmq.PAIR) as shutdown_sender:
|
||||
shutdown_sender.connect(self.shutdown_path)
|
||||
# Send shutdown signal.
|
||||
shutdown_sender.send(b'')
|
||||
def close_sockets_and_tasks():
|
||||
close_sockets(sockets)
|
||||
for task in tasks:
|
||||
if task is not None and not task.done():
|
||||
task.cancel()
|
||||
|
||||
if in_loop(loop):
|
||||
close_sockets_and_tasks()
|
||||
elif not loop.is_closed():
|
||||
loop.call_soon_threadsafe(close_sockets_and_tasks)
|
||||
else:
|
||||
# Loop has been closed, try to clean up directly.
|
||||
del tasks
|
||||
del close_sockets_and_tasks
|
||||
close_sockets(sockets)
|
||||
del self.output_queue_task
|
||||
del self.stats_update_task
|
||||
else:
|
||||
# Sync case.
|
||||
|
||||
# ZMQ context termination can hang if the sockets
|
||||
# aren't explicitly closed first.
|
||||
close_sockets((self.output_socket, self.input_socket))
|
||||
|
||||
if self.shutdown_path is not None:
|
||||
# We must ensure that the sync output socket is
|
||||
# closed cleanly in its own thread.
|
||||
with self.ctx.socket(zmq.PAIR) as shutdown_sender:
|
||||
shutdown_sender.connect(self.shutdown_path)
|
||||
# Send shutdown signal.
|
||||
shutdown_sender.send(b'')
|
||||
|
||||
def validate_alive(self, frames: Sequence[zmq.Frame]):
|
||||
if len(frames) == 1 and (frames[0].buffer
|
||||
@ -969,14 +995,19 @@ class DPAsyncMPClient(AsyncMPClient):
|
||||
self.engine_ranks_managed[-1] + 1)
|
||||
|
||||
async def run_engine_stats_update_task():
|
||||
with make_zmq_socket(self.ctx, self.stats_update_address,
|
||||
zmq.XSUB) as socket, make_zmq_socket(
|
||||
self.ctx,
|
||||
self.first_req_sock_addr,
|
||||
zmq.PAIR,
|
||||
bind=False) as first_req_rcv_socket:
|
||||
with (make_zmq_socket(self.ctx,
|
||||
self.stats_update_address,
|
||||
zmq.XSUB,
|
||||
linger=0) as socket,
|
||||
make_zmq_socket(self.ctx,
|
||||
self.first_req_sock_addr,
|
||||
zmq.PAIR,
|
||||
bind=False,
|
||||
linger=0) as first_req_rcv_socket):
|
||||
assert isinstance(socket, zmq.asyncio.Socket)
|
||||
assert isinstance(first_req_rcv_socket, zmq.asyncio.Socket)
|
||||
self.resources.stats_update_socket = socket
|
||||
self.resources.first_req_rcv_socket = first_req_rcv_socket
|
||||
# Send subscription message.
|
||||
await socket.send(b'\x01')
|
||||
|
||||
|
||||
@ -11,6 +11,10 @@ from vllm.config import VllmConfig
|
||||
class NgramProposer:
|
||||
|
||||
def __init__(self, vllm_config: VllmConfig):
|
||||
assert vllm_config.speculative_config is not None
|
||||
assert vllm_config.speculative_config.prompt_lookup_min is not None
|
||||
assert vllm_config.speculative_config.prompt_lookup_max is not None
|
||||
|
||||
# Minimum length of the n-gram to match.
|
||||
self.min_n = vllm_config.speculative_config.prompt_lookup_min
|
||||
# Maximum length of the n-gram to match.
|
||||
@ -54,17 +58,13 @@ class NgramProposer:
|
||||
followed that pattern. Here we will return [4,2,3] because
|
||||
we only have three tokens after the match.
|
||||
"""
|
||||
# Do not generate draft tokens beyond the max model length.
|
||||
k = min(self.k, self.max_model_len - context_token_ids.shape[0])
|
||||
if k <= 0:
|
||||
return None
|
||||
|
||||
# TODO(woosuk): Optimize this.
|
||||
for n in range(self.max_n, self.min_n - 1, -1):
|
||||
result = _find_subarray_kmp(context_token_ids, n, k)
|
||||
if result is not None:
|
||||
return result
|
||||
return None
|
||||
return _find_longest_matched_ngram_and_propose_tokens(
|
||||
origin_tokens=context_token_ids,
|
||||
min_ngram=self.min_n,
|
||||
max_ngram=self.max_n,
|
||||
max_model_len=self.max_model_len,
|
||||
k=self.k)
|
||||
|
||||
def load_model(self, *args, **kwargs):
|
||||
# No model to load.
|
||||
@ -72,61 +72,86 @@ class NgramProposer:
|
||||
|
||||
|
||||
@jit(nopython=True)
|
||||
def _kmp_lps_array(pattern: np.ndarray) -> np.ndarray:
|
||||
def _find_longest_matched_ngram_and_propose_tokens(
|
||||
origin_tokens: np.ndarray, min_ngram: int, max_ngram: int,
|
||||
max_model_len: int, k: int) -> Optional[np.ndarray]:
|
||||
"""
|
||||
Build the lps (longest proper prefix which is also suffix)
|
||||
array for the pattern.
|
||||
Find the longest n-gram which matches the suffix of the given tokens
|
||||
whose length is within [min_ngram, max_ngram] (inclusive).
|
||||
|
||||
If found, we will extract k right after the matched ngram.
|
||||
"""
|
||||
lps = np.zeros(len(pattern), dtype=np.int32)
|
||||
prev_lps = 0 # length of the previous longest prefix suffix
|
||||
# Do not generate draft tokens is context is shorter than minimum n-gram
|
||||
total_token = origin_tokens.shape[0]
|
||||
if total_token < min_ngram:
|
||||
return None
|
||||
|
||||
# Do not generate draft tokens beyond the max model length.
|
||||
k = min(k, max_model_len - total_token)
|
||||
if k <= 0:
|
||||
return None
|
||||
|
||||
# Flip tokens, and the goal become to find longest ngram
|
||||
# on the rightmost position which matches the prefix with
|
||||
# length [min_n, max_n] (inclusive).
|
||||
tokens = origin_tokens[::-1]
|
||||
|
||||
# Longest prefix (not including itself) which is a suffix of
|
||||
# the current position.
|
||||
# lps[i] = max{v, where tokens[0:v] == tokens[i+1-v:i+1]}
|
||||
#
|
||||
# As ngram is capped by max_ngram to save memory, we only need to
|
||||
# store lps for the first max_ngram prefix.
|
||||
lps = np.zeros(max_ngram, dtype=np.int32)
|
||||
|
||||
longest_ngram = 0
|
||||
position = 0
|
||||
|
||||
# lps[0] always equal to 0, we starts with index 1
|
||||
prev_lps = 0
|
||||
i = 1
|
||||
|
||||
while i < len(pattern):
|
||||
if pattern[i] == pattern[prev_lps]:
|
||||
while i < total_token:
|
||||
# tokens[:prev_lps] is the longest prefix as a suffix of tokens[:i]
|
||||
if tokens[prev_lps] == tokens[i]:
|
||||
# Token match: tokens[:prev_lps+1] is the longest prefix as
|
||||
# a suffix of tokens[:i+1]
|
||||
prev_lps += 1
|
||||
lps[i] = prev_lps
|
||||
# Check if we found a longer valid ngram.
|
||||
#
|
||||
# Update position when longest_ngram matched prev_lps,
|
||||
# as we want to get the target n-gram of the earliest position
|
||||
# in the original tokens (i.e.
|
||||
# latest position in the reversed tokens)
|
||||
if prev_lps >= longest_ngram:
|
||||
longest_ngram = prev_lps
|
||||
position = i
|
||||
if i < max_ngram:
|
||||
# Store LPS for the first max_ngram prefix
|
||||
lps[i] = prev_lps
|
||||
if prev_lps == max_ngram:
|
||||
# When prev_lps reached max_ngram, update prev_lps
|
||||
# to lps[max_ngram-1] to avoid matching ngram
|
||||
# longer than max_ngram
|
||||
prev_lps = lps[max_ngram - 1]
|
||||
i += 1
|
||||
elif prev_lps != 0:
|
||||
# Token mismatch: try the second longest prefix
|
||||
# among all suffix of tokens[:i],
|
||||
# which is the longest prefix of tokens[:prev_lps]
|
||||
prev_lps = lps[prev_lps - 1]
|
||||
else:
|
||||
if prev_lps != 0:
|
||||
prev_lps = lps[prev_lps - 1]
|
||||
else:
|
||||
lps[i] = 0
|
||||
i += 1
|
||||
return lps
|
||||
|
||||
|
||||
@jit(nopython=True)
|
||||
def _find_subarray_kmp(
|
||||
context_token_ids: np.ndarray,
|
||||
n: int,
|
||||
k: int,
|
||||
) -> Optional[np.ndarray]:
|
||||
context_len = context_token_ids.shape[0]
|
||||
assert n > 0
|
||||
|
||||
pattern = context_token_ids[-n:]
|
||||
# Precompute lps array for Y
|
||||
lps = _kmp_lps_array(pattern)
|
||||
|
||||
i = 0
|
||||
j = 0
|
||||
# -n because the last n tokens are used as pattern
|
||||
while i < context_len - n:
|
||||
if context_token_ids[i] == pattern[j]:
|
||||
# Token mismatch, and no more prefix (except empty string)
|
||||
# as a suffix of tokens[:i]
|
||||
i += 1
|
||||
j += 1
|
||||
|
||||
# If we have matched the entire Y
|
||||
if j == n:
|
||||
# Found pattern in context, gather the next K elements
|
||||
return context_token_ids[i:i + k]
|
||||
else:
|
||||
# Mismatch
|
||||
if j != 0:
|
||||
# Use the lps array to avoid re-checking elements
|
||||
j = lps[j - 1]
|
||||
else:
|
||||
i += 1
|
||||
if longest_ngram < min_ngram:
|
||||
# No valid ngram is found
|
||||
return None
|
||||
|
||||
# Y not found
|
||||
return None
|
||||
# Flip the position back, so in origin_tokens,
|
||||
# origin_tokens[total_token-1-position:total_token-1-position+longest_ngram]
|
||||
# is the matched ngram, so we should start drafting tokens from
|
||||
# total_token-1-position+longest_ngram
|
||||
start_position = total_token - 1 - position + longest_ngram
|
||||
k = min(k, total_token - start_position)
|
||||
return origin_tokens[start_position:start_position + k]
|
||||
|
||||
@ -341,13 +341,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
model_kwargs = dict[str, Any]()
|
||||
num_reqs = self.input_batch.num_reqs
|
||||
|
||||
pooling_params = self.input_batch.pooling_metadata.pooling_params
|
||||
|
||||
num_pooling_reqs = len(pooling_params)
|
||||
num_pooling_reqs = len(self.input_batch.pooling_params)
|
||||
|
||||
if num_pooling_reqs == 0:
|
||||
return model_kwargs
|
||||
|
||||
pooling_params = self.input_batch.pooling_metadata.pooling_params
|
||||
|
||||
assert num_pooling_reqs == num_reqs
|
||||
|
||||
token_type_id_requests = dict[int, Any]()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user