diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py
index 68aff793ae6aa..76f6d7aeca0d8 100644
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@@ -5,11 +5,11 @@ import os
import sys
import zipfile
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
-# Note that we have 400 MiB quota, please use it wisely.
-# See https://github.com/pypi/support/issues/3792 .
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
+# Note that we have 800 MiB quota, please use it wisely.
+# See https://github.com/pypi/support/issues/6326 .
# Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
def print_top_10_largest_files(zip_file):
diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md
index b39f9899a8f28..e6f5c8b60f459 100644
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@@ -141,7 +141,7 @@ When run, benchmark script generates results under `benchmark/results` folder, a
`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
-Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output lenght, max concurrency and qps.
+Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps.
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
| | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
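For the single-file mode described in the README text above (comparing TP/PP configurations within one results file), here is a minimal invocation sketch, assuming the same `-f` flag is simply given one path; the path itself is illustrative:

```bash
# Compare TP/PP configurations recorded in a single benchmark_results.json
python3 compare-json-results.py -f results_a/benchmark_results.json
```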
diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
index 8afde017d383e..2ef36089b6afb 100644
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -8,7 +8,7 @@ This benchmark aims to:
Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
-Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
## Setup
@@ -17,7 +17,7 @@ Latest reproduction guilde: [github issue link](https://github.com/vllm-project/
- SGLang: `lmsysorg/sglang:v0.3.2-cu121`
- LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
- TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
- - *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
+ - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
- Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
- Hardware
- 8x Nvidia A100 GPUs
diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
index 50431d0cd4c5e..5ea5a50a258a4 100644
--- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
@@ -218,7 +218,7 @@ if __name__ == "__main__":
"--xaxis",
type=str,
default="# of max concurrency.",
- help="column name to use as X Axis in comparision graph",
+ help="column name to use as X Axis in comparison graph",
)
args = parser.parse_args()
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
index 77047636bb951..a655a650cb325 100644
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -368,7 +368,7 @@ if __name__ == "__main__":
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
# we want to turn it into "8xGPUTYPE"
df["GPU"] = df["GPU"].apply(
- lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
+ lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
)
# get markdown tables
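A side note on the `splitlines()` change above: unlike `split('\n')`, `splitlines()` does not yield a trailing empty element when the GPU string ends with a newline, and it keeps backslashes out of the f-string expression (not allowed before Python 3.12). An illustrative sketch with a made-up GPU string:

```python
# Illustrative only: the behaviour difference behind the splitlines() change
gpu = "NVIDIA A100-SXM4-80GB\nNVIDIA A100-SXM4-80GB\n"  # example value, not from CI

print(len(gpu.split("\n")))   # 3 -- the trailing newline adds an empty element
print(len(gpu.splitlines()))  # 2 -- only the actual GPU lines are counted

# Same transformation as applied to the "GPU" column in the dataframe
print(f"{len(gpu.splitlines())}x{gpu.splitlines()[0]}")  # 2xNVIDIA A100-SXM4-80GB
```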
diff --git a/.buildkite/nightly-benchmarks/scripts/launch-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-server.sh
index fb5063db86942..ebacdcbd6821b 100644
--- a/.buildkite/nightly-benchmarks/scripts/launch-server.sh
+++ b/.buildkite/nightly-benchmarks/scripts/launch-server.sh
@@ -181,18 +181,14 @@ launch_vllm_server() {
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
- server_command="python3 \
- -m vllm.entrypoints.openai.api_server \
+ server_command="vllm serve $model \
-tp $tp \
- --model $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
- server_command="python3 \
- -m vllm.entrypoints.openai.api_server \
+ server_command="vllm serve $model \
-tp $tp \
- --model $model \
--port $port \
$server_args"
fi
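For reference, with this change the constructed command takes the `vllm serve` form instead of invoking the module entrypoint; a hedged sketch with placeholder values (the model, TP size and port are not taken from the test configs):

```bash
# Roughly what $server_command expands to; the remaining flags come from $server_args
vllm serve meta-llama/Llama-3.1-8B-Instruct -tp 2 --port 8000 $server_args
```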
diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
index b1b7d2d77a44d..c64e5638029e7 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -365,8 +365,7 @@ run_serving_tests() {
continue
fi
- server_command="$server_envs python3 \
- -m vllm.entrypoints.openai.api_server \
+ server_command="$server_envs vllm serve \
$server_args"
# run the server
@@ -455,11 +454,6 @@ main() {
fi
check_hf_token
- # Set to v1 to run v1 benchmark
- if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
- export VLLM_USE_V1=1
- fi
-
# dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
index 2d88a0b30c4f8..f758097e098e4 100644
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
@@ -1,6 +1,6 @@
[
{
- "test_name": "serving_llama8B_tp1_sharegpt",
+ "test_name": "serving_llama8B_bf16_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
@@ -32,7 +32,7 @@
}
},
{
- "test_name": "serving_llama8B_tp2_sharegpt",
+ "test_name": "serving_llama8B_bf16_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
@@ -64,7 +64,7 @@
}
},
{
- "test_name": "serving_llama8B_tp4_sharegpt",
+ "test_name": "serving_llama8B_bf16_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
@@ -96,7 +96,7 @@
}
},
{
- "test_name": "serving_llama8B_tp1_random_128_128",
+ "test_name": "serving_llama8B_bf16_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
@@ -131,7 +131,7 @@
}
},
{
- "test_name": "serving_llama8B_tp2_random_128_128",
+ "test_name": "serving_llama8B_bf16_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
@@ -166,7 +166,7 @@
}
},
{
- "test_name": "serving_llama8B_tp4_random_128_128",
+ "test_name": "serving_llama8B_bf16_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
@@ -198,5 +198,413 @@
"random-output-len": 128,
"num_prompts": 1000
}
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp1_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp2_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp4_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 4,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp1_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp2_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp4_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 4,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp1_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp2_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp4_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 4,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp1_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp2_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp4_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 4,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
}
]
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
index 823abbaa99f86..ce396d6e54f27 100644
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
@@ -1,6 +1,6 @@
[
{
- "test_name": "serving_llama8B_pp1_sharegpt",
+ "test_name": "serving_llama8B_bf16_pp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
@@ -32,7 +32,39 @@
}
},
{
- "test_name": "serving_llama8B_pp3_sharegpt",
+ "test_name": "serving_llama8B_bf16_tp2_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_bf16_pp3_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
@@ -64,7 +96,7 @@
}
},
{
- "test_name": "serving_llama8B_tp2pp3_sharegpt",
+ "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
@@ -97,7 +129,7 @@
}
},
{
- "test_name": "serving_llama8B_pp1_random_128_128",
+ "test_name": "serving_llama8B_bf16_pp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
@@ -132,7 +164,42 @@
}
},
{
- "test_name": "serving_llama8B_pp3_random_128_128",
+ "test_name": "serving_llama8B_bf16_tp2_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_bf16_pp3_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
@@ -167,7 +234,7 @@
}
},
{
- "test_name": "serving_llama8B_tp2pp3_random_128_128",
+ "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
@@ -201,5 +268,553 @@
"ignore-eos": "",
"num_prompts": 1000
}
+ },
+ {
+ "test_name": "serving_llama8B_int8_pp1_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "pipeline_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp2_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_pp3_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 2,
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_pp1_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "pipeline_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp2_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_pp3_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 2,
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_pp1_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "pipeline_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp2_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_pp3_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 2,
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_pp1_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "pipeline_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp2_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_pp3_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 2,
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
}
]
diff --git a/.buildkite/pyproject.toml b/.buildkite/pyproject.toml
deleted file mode 100644
index d5cad1c73c6f8..0000000000000
--- a/.buildkite/pyproject.toml
+++ /dev/null
@@ -1,46 +0,0 @@
-# This local pyproject file is part of the migration from yapf to ruff format.
-# It uses the same core rules as the main pyproject.toml file, but with the
-# following differences:
-# - ruff line length is overridden to 88
-# - deprecated typing ignores (UP006, UP035) have been removed
-
-[tool.ruff]
-line-length = 88
-
-[tool.ruff.lint.per-file-ignores]
-"vllm/third_party/**" = ["ALL"]
-"vllm/version.py" = ["F401"]
-"vllm/_version.py" = ["ALL"]
-
-[tool.ruff.lint]
-select = [
- # pycodestyle
- "E",
- # Pyflakes
- "F",
- # pyupgrade
- "UP",
- # flake8-bugbear
- "B",
- # flake8-simplify
- "SIM",
- # isort
- "I",
- # flake8-logging-format
- "G",
-]
-ignore = [
- # star imports
- "F405", "F403",
- # lambda expression assignment
- "E731",
- # Loop control variable not used within loop body
- "B007",
- # f-string format
- "UP032",
- # Can remove once 3.10+ is the minimum Python version
- "UP007",
-]
-
-[tool.ruff.format]
-docstring-code-format = true
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index f96c38bf57db7..505323bc2b654 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,21 +1,22 @@
steps:
- # aarch64 + CUDA builds
- - label: "Build arm64 wheel - CUDA 12.8"
- id: build-wheel-arm64-cuda-12-8
+ # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
+ - label: "Build arm64 wheel - CUDA 12.9"
+ depends_on: ~
+ id: build-wheel-arm64-cuda-12-9
agents:
queue: arm64_cpu_queue_postmerge
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- # x86 + CUDA builds
- label: "Build wheel - CUDA 12.8"
+ depends_on: ~
id: build-wheel-cuda-12-8
agents:
queue: cpu_queue_postmerge
@@ -27,12 +28,8 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- - block: "Build CUDA 12.6 wheel"
- key: block-build-cu126-wheel
- depends_on: ~
-
- label: "Build wheel - CUDA 12.6"
- depends_on: block-build-cu126-wheel
+ depends_on: ~
id: build-wheel-cuda-12-6
agents:
queue: cpu_queue_postmerge
@@ -44,44 +41,61 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
- # However, this block can be uncommented to save some compute hours.
- # - block: "Build CUDA 11.8 wheel"
- # key: block-build-cu118-wheel
-
- - label: "Build wheel - CUDA 11.8"
- # depends_on: block-build-cu118-wheel
- id: build-wheel-cuda-11-8
+ # x86 + CUDA builds
+ - label: "Build wheel - CUDA 12.9"
+ depends_on: ~
+ id: build-wheel-cuda-12-9
agents:
queue: cpu_queue_postmerge
commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- - block: "Build release image"
+ - label: "Build release image (x86)"
depends_on: ~
- key: block-release-image-build
-
- - label: "Build release image"
- depends_on: block-release-image-build
- id: build-release-image
+ id: build-release-image-x86
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
+      # re-tag to the default image tag and push, in case the arm64 build fails
+ - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+ # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
+ - label: "Build release image (arm64)"
+ depends_on: ~
+ id: build-release-image-arm64
+ agents:
+ queue: arm64_cpu_queue_postmerge
+ commands:
+ - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
+
+ # Add job to create multi-arch manifest
+ - label: "Create multi-arch manifest"
+ depends_on:
+ - build-release-image-x86
+ - build-release-image-arm64
+ id: create-multi-arch-manifest
+ agents:
+ queue: cpu_queue_postmerge
+ commands:
+ - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+ - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
+ - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+
- label: "Annotate release workflow"
depends_on:
- - build-release-image
+ - create-multi-arch-manifest
- build-wheel-cuda-12-8
- - build-wheel-cuda-12-6
- - build-wheel-cuda-11-8
id: annotate-release-workflow
agents:
queue: cpu_queue_postmerge
@@ -128,18 +142,30 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- - block: "Build Neuron release image"
- key: block-neuron-release-image-build
- depends_on: ~
-
- - label: "Build and publish Neuron release image"
- depends_on: block-neuron-release-image-build
+ - label: "Build and publish nightly multi-arch image to DockerHub"
+ depends_on:
+ - create-multi-arch-manifest
+ if: build.env("NIGHTLY") == "1"
agents:
- queue: neuron-postmerge
+ queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
- - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
- - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
+ - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
+ - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
+ - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
+ - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
+ - "docker push vllm/vllm-openai:nightly-x86_64"
+ - "docker push vllm/vllm-openai:nightly-aarch64"
+ - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
+ - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
+ - "docker manifest push vllm/vllm-openai:nightly"
+ - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+ # Clean up old nightly builds (keep only last 14)
+ - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
+ plugins:
+ - docker-login#v3.0.0:
+ username: vllmbot
+ password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
+ DOCKERHUB_USERNAME: "vllmbot"
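Once both per-architecture images are pushed, the manifest steps above stitch them together under the commit tag; the result can be checked with the standard `docker manifest inspect` command, e.g.:

```bash
# Verify that the multi-arch manifest lists both amd64 and arm64 entries
docker manifest inspect public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT
```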
diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh
index 94e0ac2398f34..fde48603ad3cd 100755
--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@@ -14,18 +14,33 @@ buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel:
\`\`\`
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
+
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`
To download and upload the image:
\`\`\`
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
-docker tag vllm/vllm-openai vllm/vllm-openai:latest
-docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
-docker push vllm/vllm-openai:latest
-docker push vllm/vllm-openai:v${RELEASE_VERSION}
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
+docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
+docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+docker push vllm/vllm-openai:latest-x86_64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
+docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
+docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+docker push vllm/vllm-openai:latest-aarch64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+
+docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
+docker manifest push vllm/vllm-openai:latest
+docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
\`\`\`
EOF
\ No newline at end of file
diff --git a/.buildkite/scripts/cleanup-nightly-builds.sh b/.buildkite/scripts/cleanup-nightly-builds.sh
new file mode 100755
index 0000000000000..f02a128c67726
--- /dev/null
+++ b/.buildkite/scripts/cleanup-nightly-builds.sh
@@ -0,0 +1,120 @@
+#!/bin/bash
+
+set -ex
+
+# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
+# This script uses the DockerHub API to list and delete old tags with the "nightly-" prefix
+
+# DockerHub API endpoint for vllm/vllm-openai repository
+REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
+
+# Get DockerHub credentials from environment
+if [ -z "$DOCKERHUB_TOKEN" ]; then
+ echo "Error: DOCKERHUB_TOKEN environment variable is not set"
+ exit 1
+fi
+
+if [ -z "$DOCKERHUB_USERNAME" ]; then
+ echo "Error: DOCKERHUB_USERNAME environment variable is not set"
+ exit 1
+fi
+
+# Get DockerHub bearer token
+echo "Getting DockerHub bearer token..."
+set +x
+BEARER_TOKEN=$(curl -s -X POST \
+ -H "Content-Type: application/json" \
+ -d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \
+ "https://hub.docker.com/v2/users/login" | jq -r '.token')
+set -x
+
+if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then
+ echo "Error: Failed to get DockerHub bearer token"
+ exit 1
+fi
+
+# Function to get all tags from DockerHub
+get_all_tags() {
+ local page=1
+ local all_tags=""
+
+ while true; do
+ set +x
+ local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
+ "$REPO_API_URL?page=$page&page_size=100")
+ set -x
+
+ # Get both last_updated timestamp and tag name, separated by |
+ local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
+
+ if [ -z "$tags" ]; then
+ break
+ fi
+
+ all_tags="$all_tags$tags"$'\n'
+ page=$((page + 1))
+ done
+
+ # Sort by timestamp (newest first) and extract just the tag names
+ echo "$all_tags" | sort -r | cut -d'|' -f2
+}
+
+delete_tag() {
+ local tag_name="$1"
+ echo "Deleting tag: $tag_name"
+
+ local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
+ set +x
+ local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
+ set -x
+
+ if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
+ echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
+ else
+ echo "Successfully deleted tag: $tag_name"
+ fi
+}
+
+# Get all nightly- prefixed tags, sorted by last_updated timestamp (newest first)
+echo "Fetching all tags from DockerHub..."
+all_tags=$(get_all_tags)
+
+if [ -z "$all_tags" ]; then
+ echo "No tags found to clean up"
+ exit 0
+fi
+
+# Count total tags
+total_tags=$(echo "$all_tags" | wc -l)
+echo "Found $total_tags tags"
+
+# Keep only the last 14 builds (including the current one)
+tags_to_keep=14
+tags_to_delete=$((total_tags - tags_to_keep))
+
+if [ $tags_to_delete -le 0 ]; then
+ echo "No tags need to be deleted (only $total_tags tags found, keeping $tags_to_keep)"
+ exit 0
+fi
+
+echo "Will delete $tags_to_delete old tags, keeping the newest $tags_to_keep"
+
+# Get tags to delete (skip the first $tags_to_keep tags)
+tags_to_delete_list=$(echo "$all_tags" | tail -n +$((tags_to_keep + 1)))
+
+if [ -z "$tags_to_delete_list" ]; then
+ echo "No tags to delete"
+ exit 0
+fi
+
+# Delete old tags
+echo "Deleting old tags..."
+while IFS= read -r tag; do
+ if [ -n "$tag" ]; then
+ delete_tag "$tag"
+ # Add a small delay to avoid rate limiting
+ sleep 1
+ fi
+done <<< "$tags_to_delete_list"
+
+echo "Cleanup completed successfully"
diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index df0bae0c9cbff..aa4cc7b35a543 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -86,10 +86,6 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
-if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
- commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
-fi
-
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
fi
@@ -164,16 +160,9 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
--ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \
- --ignore=entrypoints/llm/test_generate_multiple_loras.py \
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi
-#Obsolete currently
-##ignore certain Entrypoints/llm tests
-#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
-# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
-#fi
-
# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
index 36bcb015d308e..39ea180173081 100755
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -25,25 +25,28 @@ function cpu_tests() {
# offline inference
podman exec -it "$container_id" bash -c "
- set -e
- python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+ set -xve
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
# Run basic model test
podman exec -it "$container_id" bash -c "
- set -e
+ set -evx
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
pip install sentence-transformers datamodel_code_generator
- pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
+
+      # Note: disable Bart until it supports V1
+ # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
- pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
+      # TODO: The test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for the time being.
+ # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
}
# All of CPU tests are expected to be finished less than 40 mins.
export container_id
export -f cpu_tests
-timeout 40m bash -c cpu_tests
+timeout 120m bash -c cpu_tests
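The ppc64le runner keeps the same `export -f` + `timeout` pattern while raising the budget to 120 minutes. A small self-contained sketch of how that pattern behaves (the function and messages are made up for illustration):

run_suite() {
    echo "pretending to run the CPU suite..."
    sleep 2
}
export -f run_suite              # make the function visible to the child bash
timeout 120m bash -c run_suite   # the child (and the suite) is killed if it overruns
echo "exit status: $?"           # 124 signals a timeout; anything else is the suite's own status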
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
index 9dec9f8e9eb32..7512cb1bbed01 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -25,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
function cpu_tests() {
set -e
@@ -49,57 +49,69 @@ function cpu_tests() {
# Run kernel tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
- pytest -v -s tests/kernels/test_onednn.py"
+ pytest -x -v -s tests/kernels/test_onednn.py"
# Run basic model test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
# Note: disable until supports V1
- # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
- # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+ # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
+ # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
- # Note: disable Bart until supports V1
- pytest -v -s tests/models/language/generation -m cpu_model \
- --ignore=tests/models/language/generation/test_bart.py
- VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
- --ignore=tests/models/language/generation/test_bart.py
+ pytest -x -v -s tests/models/language/generation -m cpu_model
+ VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
- pytest -v -s tests/models/language/pooling -m cpu_model
- pytest -v -s tests/models/multimodal/generation \
- --ignore=tests/models/multimodal/generation/test_mllama.py \
+ pytest -x -v -s tests/models/language/pooling -m cpu_model
+ pytest -x -v -s tests/models/multimodal/generation \
--ignore=tests/models/multimodal/generation/test_pixtral.py \
-m cpu_model"
# Run compressed-tensor test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
- pytest -s -v \
+ pytest -x -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
# Note: disable it until supports V1
# Run AWQ test
# docker exec cpu-test-"$NUMA_NODE" bash -c "
# set -e
- # VLLM_USE_V1=0 pytest -s -v \
+ # VLLM_USE_V1=0 pytest -x -s -v \
# tests/quantization/test_ipex_quant.py"
# Run multi-lora tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
- pytest -s -v \
+ pytest -x -s -v \
tests/lora/test_qwen2vl.py"
- # online serving
+ # online serving: tp+pp
docker exec cpu-test-"$NUMA_NODE" bash -c '
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
+ server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
- --endpoint /v1/completions'
+ --endpoint /v1/completions
+ kill -s SIGTERM $server_pid &'
+
+ # online serving: tp+dp
+ docker exec cpu-test-"$NUMA_NODE" bash -c '
+ set -e
+ VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
+ server_pid=$!
+ timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+ vllm bench serve \
+ --backend vllm \
+ --dataset-name random \
+ --model meta-llama/Llama-3.2-3B-Instruct \
+ --num-prompts 20 \
+ --endpoint /v1/completions
+ kill -s SIGTERM $server_pid &'
}
# All of CPU tests are expected to be finished less than 40 mins.
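The new online-serving blocks follow the same start/poll/bench/stop shape for both the tp+pp and tp+dp cases. A condensed sketch of that shape with the container plumbing stripped away (model, port, and flags mirror the commands above; nothing new is assumed):

vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &   # launch the server in the background
server_pid=$!                                               # remember its PID for shutdown
# wait (up to 10 minutes) until the OpenAI-compatible API answers
timeout 600 bash -c "until curl -s localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve --backend vllm --dataset-name random \
    --model meta-llama/Llama-3.2-3B-Instruct --num-prompts 20 --endpoint /v1/completions
kill -s SIGTERM "$server_pid"                               # stop the server once the benchmark is done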
diff --git a/.buildkite/scripts/hardware_ci/run-neuron-test.sh b/.buildkite/scripts/hardware_ci/run-neuron-test.sh
deleted file mode 100644
index a397457c83261..0000000000000
--- a/.buildkite/scripts/hardware_ci/run-neuron-test.sh
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-
-# This script build the Neuron docker image and run the API server inside the container.
-# It serves a sanity check for compilation and basic model usage.
-set -e
-set -v
-
-image_name="neuron/vllm-ci"
-container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
-
-HF_CACHE="$(realpath ~)/huggingface"
-mkdir -p "${HF_CACHE}"
-HF_MOUNT="/root/.cache/huggingface"
-HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
-
-NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
-mkdir -p "${NEURON_COMPILE_CACHE_URL}"
-NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
-
-# Try building the docker image
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
-
-# prune old image and containers to save disk space, and only once a day
-# by using a timestamp file in tmp.
-if [ -f /tmp/neuron-docker-build-timestamp ]; then
- last_build=$(cat /tmp/neuron-docker-build-timestamp)
- current_time=$(date +%s)
- if [ $((current_time - last_build)) -gt 86400 ]; then
- # Remove dangling images (those that are not tagged and not used by any container)
- docker image prune -f
- # Remove unused volumes / force the system prune for old images as well.
- docker volume prune -f && docker system prune -f
- echo "$current_time" > /tmp/neuron-docker-build-timestamp
- fi
-else
- date "+%s" > /tmp/neuron-docker-build-timestamp
-fi
-
-docker build -t "${image_name}" -f docker/Dockerfile.neuron .
-
-# Setup cleanup
-remove_docker_container() {
- docker image rm -f "${image_name}" || true;
-}
-trap remove_docker_container EXIT
-
-# Run the image
-docker run --rm -it --device=/dev/neuron0 --network bridge \
- -v "${HF_CACHE}:${HF_MOUNT}" \
- -e "HF_HOME=${HF_MOUNT}" \
- -e "HF_TOKEN=${HF_TOKEN}" \
- -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
- -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
- --name "${container_name}" \
- ${image_name} \
- /bin/bash -c "
- set -e; # Exit on first error
- python3 /workspace/vllm/examples/offline_inference/neuron.py;
- python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
- for f in /workspace/vllm/tests/neuron/2_core/*.py; do
- echo \"Running test file: \$f\";
- python3 -m pytest \$f -v --capture=tee-sys;
- done
- "
\ No newline at end of file
diff --git a/.buildkite/scripts/hardware_ci/run-npu-test.sh b/.buildkite/scripts/hardware_ci/run-npu-test.sh
new file mode 100644
index 0000000000000..29c8f5ed5a91a
--- /dev/null
+++ b/.buildkite/scripts/hardware_ci/run-npu-test.sh
@@ -0,0 +1,191 @@
+#!/bin/bash
+
+# This script builds the Ascend NPU docker image and runs offline inference inside the container.
+# It serves a sanity check for compilation and basic model usage.
+set -ex
+
+# Base ubuntu image with basic ascend development libraries and python installed
+VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
+CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
+TEST_RUN_CONFIG_FILE="vllm_test.cfg"
+VLLM_ASCEND_TMP_DIR=
+# Get the test run configuration file from the vllm-ascend repository
+fetch_vllm_test_cfg() {
+ VLLM_ASCEND_TMP_DIR=$(mktemp -d)
+ # Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval
+ cleanup() {
+ rm -rf "${VLLM_ASCEND_TMP_DIR}"
+ }
+ trap cleanup EXIT
+
+ GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
+ if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
+        echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the repository" >&2
+ exit 1
+ fi
+
+ # If the file already exists locally, just overwrite it
+ cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
+ echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"
+
+    # The EXIT trap set above will be cleared below; the configuration file has already been
+    # copied successfully, so clean up the temporary clone manually here.
+ rm -rf "${VLLM_ASCEND_TMP_DIR}"
+ trap - EXIT
+}
+
+# Loads the test run configuration file fetched above from the vllm-ascend repository
+# into the current script environment.
+get_config() {
+ if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
+        echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist locally" >&2
+ exit 1
+ fi
+ source "${TEST_RUN_CONFIG_FILE}"
+    echo "Base docker image name from configuration: ${BASE_IMAGE_NAME}"
+ return 0
+}
+
+# Get the test run configuration.
+fetch_vllm_test_cfg
+get_config
+# Check if the function call was successful. If not, exit the script.
+if [ $? -ne 0 ]; then
+ exit 1
+fi
+
+image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
+container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+
+# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
+agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
+echo "agent_idx: ${agent_idx}"
+builder_name="cachebuilder${agent_idx}"
+builder_cache_dir="/mnt/docker-cache${agent_idx}"
+mkdir -p ${builder_cache_dir}
+
+# Try building the docker image
+cat <<EOF | docker build --builder ${builder_name} --cache-to type=local,dest=${builder_cache_dir} --cache-from type=local,src=${builder_cache_dir} -t "${image_name}" -f - .
+FROM ${BASE_IMAGE_NAME}
+
+# Install test tooling and the modelscope client used to fetch model weights in CI
+RUN pip install "pytest>=6.0" modelscope
+
+WORKDIR /workspace/vllm
+
+# Install vLLM dependencies in advance so the docker cache layer stays valid as long as common.txt is unchanged.
+COPY requirements/common.txt /workspace/vllm/requirements/common.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+ pip install -r requirements/common.txt
+
+COPY . .
+
+# Install vLLM
+RUN --mount=type=cache,target=/root/.cache/pip \
+ VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
+ python3 -m pip uninstall -y triton
+
+# Install vllm-ascend
+WORKDIR /workspace
+ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
+ARG VLLM_ASCEND_TAG=main
+RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
+ git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend
+
+# Install vllm-ascend dependencies in advance so the docker cache layer stays valid as long as requirements.txt is unchanged.
+RUN --mount=type=cache,target=/root/.cache/pip \
+ pip install -r /workspace/vllm-ascend/requirements.txt
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+ export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
+ source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
+ source /usr/local/Ascend/nnal/atb/set_env.sh && \
+ export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
+ python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
+
+ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
+ENV VLLM_USE_MODELSCOPE=True
+
+WORKDIR /workspace/vllm-ascend
+
+CMD ["/bin/bash"]
+
+EOF
+
+# Setup cleanup
+remove_docker_container() {
+ docker rm -f "${container_name}" || true;
+ docker image rm -f "${image_name}" || true;
+ docker system prune -f || true;
+}
+trap remove_docker_container EXIT
+
+# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
+# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
+# e.g. atlas-a2-001-1-2cards means this is the 1st agent on the atlas-a2-001 host, and it has 2 NPU cards.
+# returns --device /dev/davinci0 --device /dev/davinci1
+parse_and_gen_devices() {
+ local input="$1"
+ local index cards_num
+ if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
+ index="${BASH_REMATCH[1]}"
+ cards_num="${BASH_REMATCH[2]}"
+ else
+ echo "parse error" >&2
+ return 1
+ fi
+
+ local devices=""
+ local i=0
+ while (( i < cards_num )); do
+ local dev_idx=$(((index - 1)*cards_num + i ))
+ devices="$devices --device /dev/davinci${dev_idx}"
+ ((i++))
+ done
+
+ # trim leading space
+ devices="${devices#"${devices%%[![:space:]]*}"}"
+    # Print the devices string so the caller can capture it via command substitution
+ printf '%s' "$devices"
+}
+
+devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
+
+# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
+# This test checks whether the OOT platform interface is functioning properly in conjunction with
+# the hardware plugin vllm-ascend.
+model_cache_dir=/mnt/modelscope${agent_idx}
+mkdir -p ${model_cache_dir}
+docker run \
+ ${devices} \
+ --device /dev/davinci_manager \
+ --device /dev/devmm_svm \
+ --device /dev/hisi_hdc \
+ -v /usr/local/dcmi:/usr/local/dcmi \
+ -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
+ -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
+ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
+ -v /etc/ascend_install.info:/etc/ascend_install.info \
+ -v ${model_cache_dir}:/root/.cache/modelscope \
+ --entrypoint="" \
+ --name "${container_name}" \
+ "${image_name}" \
+ bash -c '
+ set -e
+ pytest -v -s tests/e2e/vllm_interface/
+'
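For reference, parse_and_gen_devices gives each agent a contiguous block of cards_num davinci devices starting at (agent_idx - 1) * cards_num. A quick check, assuming the function above has been sourced (the agent names are hypothetical):

for name in atlas-a2-001-1-2cards atlas-a2-001-2-2cards; do
    echo "$name -> $(parse_and_gen_devices "$name")"
done
# expected output:
# atlas-a2-001-1-2cards -> --device /dev/davinci0 --device /dev/davinci1
# atlas-a2-001-2-2cards -> --device /dev/davinci2 --device /dev/davinci3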
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
index b571618f48c2b..cbb2527a4ff0a 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@@ -61,13 +61,12 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
- && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
- && python3 -m pip install --progress-bar off hf-transfer
+ && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
+ && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
+
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
-echo "Using VLLM V1"
echo "--- Hardware Information ---"
# tpu-info
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
index d55a786e41e8b..f022fa3672eeb 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -61,13 +61,12 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
- && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
- && python3 -m pip install --progress-bar off hf-transfer
+ && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
+ && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
+
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
-echo "Using VLLM V1"
echo "--- Hardware Information ---"
# tpu-info
diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
index 445cd2735c190..250a64fdd071c 100644
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -30,18 +30,19 @@ docker run \
bash -c '
set -e
echo $ZE_AFFINITY_MASK
- VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
- VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
- VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+ pip install tblib==3.1.0
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+ VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
cd tests
pytest -v -s v1/core
pytest -v -s v1/engine
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output
- pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
+ pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
pytest -v -s v1/test_serial_utils.py
- pytest -v -s v1/test_utils.py
- pytest -v -s v1/test_metrics_reader.py
'
diff --git a/.buildkite/scripts/run-benchmarks.sh b/.buildkite/scripts/run-benchmarks.sh
index 72812218cb668..51536b36b808d 100644
--- a/.buildkite/scripts/run-benchmarks.sh
+++ b/.buildkite/scripts/run-benchmarks.sh
@@ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_
bench_throughput_exit_code=$?
# run server-based benchmarks and upload the result to buildkite
-python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
+vllm serve meta-llama/Llama-2-7b-chat-hf &
server_pid=$!
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
diff --git a/.buildkite/scripts/run-prime-rl-test.sh b/.buildkite/scripts/run-prime-rl-test.sh
new file mode 100755
index 0000000000000..5b25c358fc4aa
--- /dev/null
+++ b/.buildkite/scripts/run-prime-rl-test.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Setup script for Prime-RL integration tests
+# This script prepares the environment for running Prime-RL tests with nightly vLLM
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
+PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
+PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
+
+echo "Setting up Prime-RL integration test environment..."
+
+# Clean up any existing Prime-RL directory
+if [ -d "${PRIME_RL_DIR}" ]; then
+ echo "Removing existing Prime-RL directory..."
+ rm -rf "${PRIME_RL_DIR}"
+fi
+
+# Install UV if not available
+if ! command -v uv &> /dev/null; then
+ echo "Installing UV package manager..."
+ curl -LsSf https://astral.sh/uv/install.sh | sh
+ source $HOME/.local/bin/env
+fi
+
+# Clone Prime-RL repository at specific branch for reproducible tests
+PRIME_RL_BRANCH="integ-vllm-main"
+echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
+git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
+cd "${PRIME_RL_DIR}"
+
+echo "Setting up UV project environment..."
+export UV_PROJECT_ENVIRONMENT=/usr/local
+ln -s /usr/bin/python3 /usr/local/bin/python
+
+# Remove vllm pin from pyproject.toml
+echo "Removing vllm pin from pyproject.toml..."
+sed -i '/vllm==/d' pyproject.toml
+
+# Sync Prime-RL dependencies
+echo "Installing Prime-RL dependencies..."
+uv sync --inexact && uv sync --inexact --all-extras
+
+# Verify installation
+echo "Verifying installations..."
+uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
+uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
+
+echo "Prime-RL integration test environment setup complete!"
+
+echo "Running Prime-RL integration tests..."
+export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
+uv run pytest -vs tests/integration/test_rl.py -m gpu
+
+echo "Prime-RL integration tests completed!"
diff --git a/.buildkite/scripts/tpu/quantized_v6e_1.env b/.buildkite/scripts/tpu/quantized_v6e_1.env
index bd25c803081a6..ecb98d4516bd5 100644
--- a/.buildkite/scripts/tpu/quantized_v6e_1.env
+++ b/.buildkite/scripts/tpu/quantized_v6e_1.env
@@ -9,6 +9,6 @@ MAX_NUM_BATCHED_TOKENS=1024
TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048
DOWNLOAD_DIR=/mnt/disks/persist
-EXPECTED_THROUGHPUT=10.0
+EXPECTED_THROUGHPUT=8.7
INPUT_LEN=1800
OUTPUT_LEN=128
diff --git a/.buildkite/scripts/tpu/run_bm.sh b/.buildkite/scripts/tpu/run_bm.sh
index b1e17b438578d..3364fce8e1fdc 100755
--- a/.buildkite/scripts/tpu/run_bm.sh
+++ b/.buildkite/scripts/tpu/run_bm.sh
@@ -42,7 +42,7 @@ echo "lanching vllm..."
echo "logging to $VLLM_LOG"
echo
-VLLM_USE_V1=1 vllm serve $MODEL \
+vllm serve $MODEL \
--seed 42 \
--max-num-seqs $MAX_NUM_SEQS \
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh
index 745f285c008ad..43aa8c47be299 100644
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@@ -58,14 +58,15 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-if [[ $normal_wheel == *"cu118"* ]]; then
- # if $normal_wheel matches cu118, do not upload the index.html
- echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu126"* ]]; then
+if [[ $normal_wheel == *"cu126"* ]]; then
# if $normal_wheel matches cu126, do not upload the index.html
echo "Skipping index files for cu126 wheels"
+elif [[ $normal_wheel == *"cu128"* ]]; then
+ # if $normal_wheel matches cu128, do not upload the index.html
+ echo "Skipping index files for cu128 wheels"
else
- # only upload index.html for cu128 wheels (default wheels)
+ # only upload index.html for cu129 wheels (default wheels) as it
+ # is available on both x86 and arm64
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
fi
@@ -74,14 +75,15 @@ fi
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
-if [[ $normal_wheel == *"cu118"* ]]; then
- # if $normal_wheel matches cu118, do not upload the index.html
- echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu126"* ]]; then
+if [[ $normal_wheel == *"cu126"* ]]; then
# if $normal_wheel matches cu126, do not upload the index.html
echo "Skipping index files for cu126 wheels"
+elif [[ $normal_wheel == *"cu128"* ]]; then
+ # if $normal_wheel matches cu128, do not upload the index.html
+ echo "Skipping index files for cu128 wheels"
else
- # only upload index.html for cu128 wheels (default wheels)
+ # only upload index.html for cu129 wheels (default wheels) as it
+ # is available on both x86 and arm64
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
fi
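The cu126/cu128/else branches above implement a small decision table: only the default (cu129) wheels publish index files, since they exist for both x86 and arm64. The same rule written as a case statement, as a sketch only (the wheel filenames below are assumptions):

decide_index_upload() {
    case "$1" in
        *cu126*|*cu128*) echo "skip index upload for $1" ;;
        *)               echo "upload index.html (cu129 default wheels)" ;;
    esac
}
decide_index_upload "vllm-0.11.0+cu128-cp38-abi3-manylinux1_x86_64.whl"   # skip
decide_index_upload "vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl"         # upload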
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 20f3ce1adb46d..ebe0602a1b5db 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -6,24 +6,28 @@
# to generate the final pipeline yaml file.
# Documentation
-# label(str): the name of the test. emoji allowed.
-# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
-# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
-# fast_check_only(bool): run this test on fastcheck pipeline only
-# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
+# label(str): the name of the test. emojis allowed.
+# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
+# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
+# fast_check_only(bool): run this test on the fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
+# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
# command(str): the single command to run for tests. incompatible with commands.
-# commands(list): the list of commands to run for test. incompatbile with command.
-# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
-# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
-# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
-# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
-# in this case, commands must be specified. the first command runs on first host, the second
+# commands(list): the list of commands to run for the test. incompatible with command.
+# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
+# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
+# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
+# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
+# in this case, commands must be specified. the first command runs on the first host, the second
# command runs on the second host.
-# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
-# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run.
+# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
+# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
+# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
+# working_dir(str): specify the place where the command should execute, defaults to /vllm-workspace/tests
+# source_file_dependencies(list): the list of path prefixes that opt the test in; if empty, the test will always run.
# When adding a test
-# - If the test belong to an existing group, add it there
+# - If the test belongs to an existing group, add it there
# - If the test is short, add to any existing step
# - If the test takes more than 10min, then it is okay to create a new step.
# Note that all steps execute in parallel.
@@ -41,29 +45,36 @@ steps:
commands:
- bash standalone_tests/pytorch_nightly_dependency.sh
-- label: Async Engine, Inputs, Utils, Worker Test # 24min
+- label: Async Engine, Inputs, Utils, Worker Test # 36min
+ timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- - tests/mq_llm_engine
- - tests/async_engine
+ - tests/multimodal
+ - tests/utils_
+ commands:
+ - pytest -v -s -m 'not cpu_test' multimodal
+ - pytest -v -s utils_
+
+- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
+ timeout_in_minutes: 10
+ source_file_dependencies:
+ - vllm/
- tests/test_inputs.py
- tests/test_outputs.py
- tests/multimodal
- - tests/utils_
- - tests/worker
- tests/standalone_tests/lazy_imports.py
+ - tests/transformers_utils
+ no_gpu: true
commands:
- python3 standalone_tests/lazy_imports.py
- - pytest -v -s mq_llm_engine # MQLLMEngine
- - pytest -v -s async_engine # AsyncLLMEngine
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- - pytest -v -s multimodal
- - pytest -v -s utils_ # Utils
- - pytest -v -s worker # Worker
+ - pytest -v -s -m 'cpu_test' multimodal
+ - pytest -v -s transformers_utils
-- label: Python-only Installation Test
+- label: Python-only Installation Test # 10min
+ timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- tests/standalone_tests/python_only_compile.sh
@@ -71,7 +82,8 @@ steps:
commands:
- bash standalone_tests/python_only_compile.sh
-- label: Basic Correctness Test # 30min
+- label: Basic Correctness Test # 20min
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
fast_check: true
torch_nightly: true
@@ -79,26 +91,26 @@ steps:
- vllm/
- tests/basic_correctness/test_basic_correctness
- tests/basic_correctness/test_cpu_offload
- - tests/basic_correctness/test_preemption
- tests/basic_correctness/test_cumem.py
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s basic_correctness/test_cumem.py
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
- - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
-- label: Core Test # 10min
- mirror_hardwares: [amdexperimental]
+- label: Entrypoints Unit Tests # 5min
+ timeout_in_minutes: 10
+ working_dir: "/vllm-workspace/tests"
fast_check: true
source_file_dependencies:
- - vllm/core
- - vllm/distributed
- - tests/core
+ - vllm/entrypoints
+ - tests/entrypoints/
commands:
- - pytest -v -s core
+ - pytest -v -s entrypoints/openai/tool_parsers
+ - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
-- label: Entrypoints Test (LLM) # 40min
+- label: Entrypoints Integration Test (LLM) # 30min
+ timeout_in_minutes: 40
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
fast_check: true
@@ -109,13 +121,12 @@ steps:
- tests/entrypoints/offline_mode
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_collective_rpc.py
- - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
+ - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+ - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
-- label: Entrypoints Test (API Server) # 40min
+- label: Entrypoints Integration Test (API Server) # 100min
+ timeout_in_minutes: 130
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
fast_check: true
@@ -127,16 +138,29 @@ steps:
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
- - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py
+ - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
- pytest -v -s entrypoints/test_chat_utils.py
-- label: Distributed Tests (4 GPUs) # 10min
+- label: Entrypoints Integration Test (Pooling)
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ working_dir: "/vllm-workspace/tests"
+ fast_check: true
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/pooling
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s entrypoints/pooling
+
+- label: Distributed Tests (4 GPUs) # 35min
+ timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/
- - vllm/core/
- tests/distributed/test_utils
- tests/distributed/test_pynccl
- tests/distributed/test_events
@@ -144,28 +168,34 @@ steps:
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py
- - tests/v1/test_async_llm_dp.py
- - tests/v1/test_external_lb_dp.py
- - tests/v1/test_internal_lb_dp.py
- - tests/v1/test_hybrid_lb_dp.py
+ - tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
+ - tests/distributed/test_symm_mem_allreduce.py
commands:
- # test with tp=2 and external_dp=2
- - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+ # test with torchrun tp=2 and external_dp=2
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
- # test with tp=2 and pp=2
+ # test with torchrun tp=2 and pp=2
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+ # test with torchrun tp=4 and dp=1
+ - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=2, pp=2 and dp=1
+ - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=1 and dp=4 with ep
+ - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=2 and dp=2 with ep
+ - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
- - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py
- - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+ - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
+ - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py
+ - pytest -v -s distributed/test_symm_mem_allreduce.py
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
- pushd ../examples/offline_inference
@@ -173,7 +203,8 @@ steps:
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- popd
-- label: EPLB Algorithm Test
+- label: EPLB Algorithm Test # 5min
+ timeout_in_minutes: 15
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/distributed/eplb
@@ -182,6 +213,7 @@ steps:
- pytest -v -s distributed/test_eplb_algo.py
- label: EPLB Execution Test # 5min
+ timeout_in_minutes: 15
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
@@ -190,26 +222,26 @@ steps:
commands:
- pytest -v -s distributed/test_eplb_execute.py
-- label: Metrics, Tracing Test # 10min
+- label: Metrics, Tracing Test # 12min
+ timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
num_gpus: 2
source_file_dependencies:
- vllm/
- - tests/metrics
- - tests/tracing
+ - tests/v1/tracing
commands:
- - pytest -v -s metrics
- "pip install \
'opentelemetry-sdk>=1.26.0' \
'opentelemetry-api>=1.26.0' \
'opentelemetry-exporter-otlp>=1.26.0' \
'opentelemetry-semantic-conventions-ai>=0.4.1'"
- - pytest -v -s tracing
+ - pytest -v -s v1/tracing
##### fast check tests #####
##### 1 GPU test #####
-- label: Regression Test # 5min
+- label: Regression Test # 7min
+ timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@@ -219,7 +251,8 @@ steps:
- pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
-- label: Engine Test # 10min
+- label: Engine Test # 25min
+ timeout_in_minutes: 40
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@@ -234,36 +267,66 @@ steps:
# OOM in the CI unless we run this separately
- pytest -v -s tokenization
-- label: V1 Test
+- label: V1 Test e2e + engine # 30min
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+ # TODO: accuracy does not match, whether setting
+ # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
+ - pytest -v -s v1/e2e
+ - pytest -v -s v1/engine
+
+- label: V1 Test entrypoints # 35min
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+ - pytest -v -s v1/entrypoints
+
+- label: V1 Test others # 42min
+ timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/v1
commands:
# split the test to avoid interference
- - pytest -v -s v1/core
- - pytest -v -s v1/engine
- - pytest -v -s v1/entrypoints
+ - pytest -v -s -m 'not cpu_test' v1/core
- pytest -v -s v1/executor
+ - pytest -v -s v1/kv_offload
- pytest -v -s v1/sample
- pytest -v -s v1/logits_processors
- pytest -v -s v1/worker
- - pytest -v -s v1/structured_output
- pytest -v -s v1/spec_decode
- - pytest -v -s v1/kv_connector/unit
- - pytest -v -s v1/metrics
- - pytest -v -s v1/test_serial_utils.py
- - pytest -v -s v1/test_utils.py
+ - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
+ - pytest -v -s -m 'not cpu_test' v1/metrics
- pytest -v -s v1/test_oracle.py
- - pytest -v -s v1/test_metrics_reader.py
- # TODO: accuracy does not match, whether setting
- # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- - pytest -v -s v1/e2e
+ - pytest -v -s v1/test_request.py
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
-- label: Examples Test # 25min
+- label: V1 Test others (CPU) # 5 mins
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ no_gpu: true
+ commands:
+ # split the test to avoid interference
+ - pytest -v -s -m 'cpu_test' v1/core
+ - pytest -v -s v1/structured_output
+ - pytest -v -s v1/test_serial_utils.py
+ - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
+ - pytest -v -s -m 'cpu_test' v1/metrics
+
+
+- label: Examples Test # 30min
+ timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
@@ -280,15 +343,16 @@ steps:
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- - python3 offline_inference/encoder_decoder.py
+ - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
- - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
+ - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+ - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
-- label: Platform Tests (CUDA)
+- label: Platform Tests (CUDA) # 4min
+ timeout_in_minutes: 15
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@@ -296,7 +360,8 @@ steps:
commands:
- pytest -v -s cuda/test_cuda_context.py
-- label: Samplers Test # 36min
+- label: Samplers Test # 56min
+ timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/model_executor/layers
@@ -307,15 +372,23 @@ steps:
- pytest -v -s samplers
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
-- label: LoRA Test %N # 15min each
+- label: LoRA Test %N # 20min each
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/lora
- tests/lora
- command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
+ commands:
+ - pytest -v -s lora \
+ --shard-id=$$BUILDKITE_PARALLEL_JOB \
+ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+ --ignore=lora/test_chatglm3_tp.py \
+ --ignore=lora/test_llama_tp.py \
+ --ignore=lora/test_llm_with_multi_loras.py
parallelism: 4
-- label: PyTorch Compilation Unit Tests
+- label: PyTorch Compilation Unit Tests # 15min
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@@ -325,13 +398,15 @@ steps:
- pytest -v -s compile/test_pass_manager.py
- pytest -v -s compile/test_fusion.py
- pytest -v -s compile/test_fusion_attn.py
+ - pytest -v -s compile/test_functionalization.py
- pytest -v -s compile/test_silu_mul_quant_fusion.py
- - pytest -v -s compile/test_sequence_parallelism.py
- - pytest -v -s compile/test_async_tp.py
- pytest -v -s compile/test_fusion_all_reduce.py
- pytest -v -s compile/test_decorator.py
+ - pytest -v -s compile/test_noop_elimination.py
+ - pytest -v -s compile/test_aot_compile.py
-- label: PyTorch Fullgraph Smoke Test # 9min
+- label: PyTorch Fullgraph Smoke Test # 15min
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@@ -339,13 +414,10 @@ steps:
- tests/compile
commands:
- pytest -v -s compile/test_basic_correctness.py
- # these tests need to be separated, cannot combine
- - pytest -v -s compile/piecewise/test_simple.py
- - pytest -v -s compile/piecewise/test_toy_llama.py
- - pytest -v -s compile/piecewise/test_full_cudagraph.py
- - pytest -v -s compile/piecewise/test_multiple_graphs.py
+ - pytest -v -s compile/piecewise/
-- label: PyTorch Fullgraph Test # 18min
+- label: PyTorch Fullgraph Test # 20min
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@@ -354,15 +426,18 @@ steps:
commands:
- pytest -v -s compile/test_full_graph.py
-- label: Kernels Core Operation Test
+- label: Kernels Core Operation Test # 48min
+ timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/
- tests/kernels/core
+ - tests/kernels/test_top_k_per_row.py
commands:
- - pytest -v -s kernels/core
+ - pytest -v -s kernels/core kernels/test_top_k_per_row.py
-- label: Kernels Attention Test %N
+- label: Kernels Attention Test %N # 23min
+ timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/attention/
@@ -373,7 +448,8 @@ steps:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
-- label: Kernels Quantization Test %N
+- label: Kernels Quantization Test %N # 64min
+ timeout_in_minutes: 90
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/quantization/
@@ -383,48 +459,44 @@ steps:
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
-- label: Kernels MoE Test %N
+- label: Kernels MoE Test %N # 40min
+ timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/quantization/cutlass_w8a8/moe/
- csrc/moe/
- tests/kernels/moe
- vllm/model_executor/layers/fused_moe/
+ - vllm/distributed/device_communicators/
commands:
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
-- label: Kernels Mamba Test
+- label: Kernels Mamba Test # 31min
+ timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/mamba/
- tests/kernels/mamba
+ - vllm/model_executor/layers/mamba/ops
commands:
- pytest -v -s kernels/mamba
-- label: Tensorizer Test # 11min
- mirror_hardwares: [amdexperimental]
- source_file_dependencies:
- - vllm/model_executor/model_loader
- - tests/tensorizer_loader
- - tests/entrypoints/openai/test_tensorizer_entrypoint.py
- commands:
- - apt-get update && apt-get install -y curl libsodium23
- - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- - pytest -v -s tensorizer_loader
- - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
-
-- label: Model Executor Test
+- label: Model Executor Test # 23min
+ timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/model_executor
- tests/model_executor
+ - tests/entrypoints/openai/test_tensorizer_entrypoint.py
commands:
- apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s model_executor
+ - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
-- label: Benchmarks # 9min
+- label: Benchmarks # 11min
+ timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/.buildkite"
source_file_dependencies:
@@ -432,7 +504,8 @@ steps:
commands:
- bash scripts/run-benchmarks.sh
-- label: Benchmarks CLI Test # 10min
+- label: Benchmarks CLI Test # 7min
+ timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@@ -440,7 +513,8 @@ steps:
commands:
- pytest -v -s benchmarks/
-- label: Quantization Test
+- label: Quantization Test # 70min
+ timeout_in_minutes: 90
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/
@@ -448,11 +522,16 @@ steps:
- tests/quantization
commands:
# temporary install here since we need nightly, will move to requirements/test.in
- # after torchao 0.12 release
- - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
- - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
+ # after torchao 0.12 release, and pin a working version of torchao nightly here
+
+ # since torchao nightly is only compatible with torch nightly currently
+ # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
+ # we can only upgrade after this is resolved
+ - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
+ - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
- label: LM Eval Small Models # 53min
+ timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/
@@ -460,7 +539,8 @@ steps:
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
-- label: OpenAI API correctness
+- label: OpenAI API correctness # 22min
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/
@@ -469,51 +549,109 @@ steps:
commands: # LMEval+Transcription WER check
- pytest -s entrypoints/openai/correctness/
-- label: Encoder Decoder tests # 5min
- mirror_hardwares: [amdexperimental]
- source_file_dependencies:
- - vllm/
- - tests/encoder_decoder
- commands:
- - pytest -v -s encoder_decoder
-
-- label: OpenAI-Compatible Tool Use # 20 min
+- label: OpenAI-Compatible Tool Use # 23 min
+ timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
fast_check: false
source_file_dependencies:
- vllm/
- tests/tool_use
- - tests/mistral_tool_use
commands:
- - pytest -v -s tool_use
- - pytest -v -s mistral_tool_use
+ - pytest -v -s -m 'not cpu_test' tool_use
+
+- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
+ timeout_in_minutes: 10
+ source_file_dependencies:
+ - vllm/
+ - tests/tool_use
+ no_gpu: true
+ commands:
+ - pytest -v -s -m 'cpu_test' tool_use
##### models test #####
-- label: Basic Models Test # 24min
+- label: Basic Models Tests (Initialization)
+ timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- - tests/models
+ - tests/models/test_initialization.py
commands:
- - pytest -v -s models/test_transformers.py
- - pytest -v -s models/test_registry.py
- - pytest -v -s models/test_utils.py
- - pytest -v -s models/test_vision.py
- - pytest -v -s models/test_initialization.py
+ # Run a subset of model initialization tests
+ - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
-- label: Language Models Test (Standard)
+- label: Basic Models Tests (Extra Initialization) %N
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/model_executor/models/
+ - tests/models/test_initialization.py
+ commands:
+ # Only when vLLM model source is modified - test initialization of a large
+ # subset of supported models (the complement of the small subset in the above
+ # test.) Also run if model initialization test file is modified
+ - pytest -v -s models/test_initialization.py \
+ -k 'not test_can_initialize_small_subset' \
+ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+ --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Basic Models Tests (Other)
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_transformers.py
+ - tests/models/test_registry.py
+ commands:
+ - pytest -v -s models/test_transformers.py models/test_registry.py
+
+- label: Basic Models Test (Other CPU) # 5min
+ timeout_in_minutes: 10
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_utils.py
+ - tests/models/test_vision.py
+ no_gpu: true
+ commands:
+ - pytest -v -s models/test_utils.py models/test_vision.py
+
+- label: Language Models Tests (Standard)
+ timeout_in_minutes: 25
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/language
commands:
+ # Test standard language models, excluding a subset of slow tests
- pip freeze | grep -E 'torch'
- - pytest -v -s models/language -m core_model
+ - pytest -v -s models/language -m 'core_model and (not slow_test)'
-- label: Language Models Test (Hybrid) # 35 min
+- label: Language Models Tests (Extra Standard) %N
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/model_executor/models/
+ - tests/models/language/pooling/test_embedding.py
+ - tests/models/language/generation/test_common.py
+ - tests/models/language/pooling/test_classification.py
+ commands:
+ # Shard slow subset of standard language models tests. Only run when model
+ # source is modified, or when specified test files are modified
+ - pip freeze | grep -E 'torch'
+ - pytest -v -s models/language -m 'core_model and slow_test' \
+ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+ --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Language Models Tests (Hybrid) %N
+ timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@@ -524,9 +662,15 @@ steps:
# Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- - pytest -v -s models/language/generation -m hybrid_model
+ # Shard hybrid language model tests
+ - pytest -v -s models/language/generation \
+ -m hybrid_model \
+ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+ --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
-- label: Language Models Test (Extended Generation) # 1hr20min
+- label: Language Models Test (Extended Generation) # 80min
+ timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
@@ -537,7 +681,18 @@ steps:
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
+- label: Language Models Test (PPL)
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/generation_ppl_test
+ commands:
+ - pytest -v -s models/language/generation_ppl_test
+
- label: Language Models Test (Extended Pooling) # 36min
+ timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
@@ -546,16 +701,27 @@ steps:
commands:
- pytest -v -s models/language/pooling -m 'not core_model'
-- label: Multi-Modal Processor Test
+- label: Language Models Test (MTEB)
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/pooling_mteb_test
+ commands:
+ - pytest -v -s models/language/pooling_mteb_test
+
+- label: Multi-Modal Processor Test # 44min
+ timeout_in_minutes: 60
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
- - pytest -v -s models/multimodal/processing/test_tensor_schema.py
+ - pytest -v -s models/multimodal/processing
-- label: Multi-Modal Models Test (Standard)
+- label: Multi-Modal Models Test (Standard) # 60min
+ timeout_in_minutes: 80
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@@ -565,7 +731,7 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pip freeze | grep -E 'torch'
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
+ - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Models Test (Extended) 1
mirror_hardwares: [amdexperimental]
@@ -597,7 +763,8 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
-- label: Quantized Models Test
+- label: Quantized Models Test # 45 min
+ timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/model_executor/layers/quantization
@@ -621,13 +788,16 @@ steps:
commands:
- pip install --upgrade git+https://github.com/huggingface/transformers
- pytest -v -s tests/models/test_initialization.py
+ - pytest -v -s tests/models/test_transformers.py
- pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py
- - python3 examples/offline_inference/audio_language.py --model-type whisper
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+ # Whisper needs spawn method to avoid deadlock
+ - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
-- label: Blackwell Test
+- label: Blackwell Test # 38 min
+ timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
gpu: b200
# optional: true
@@ -649,23 +819,71 @@ steps:
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
- - pytest -v -s tests/kernels/test_cutlass_mla_decode.py
+ - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
+ - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
# Quantization
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
+ - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
+ - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- - pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
+ - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
# Fusion
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
- pytest -v -s tests/kernels/moe/test_flashinfer.py
+ - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+ - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
+ - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
+
+- label: Blackwell GPT-OSS Eval
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ optional: true # run on nightlies
+ source_file_dependencies:
+ - tests/evals/gpt_oss
+ - vllm/model_executor/models/gpt_oss.py
+ - vllm/model_executor/layers/quantization/mxfp4.py
+ - vllm/v1/attention/backends/flashinfer.py
+ commands:
+ - uv pip install --system 'gpt-oss[eval]==0.0.5'
+ - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
+- label: Blackwell Quantized MoE Test
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ source_file_dependencies:
+ - tests/quantization/test_blackwell_moe.py
+ - vllm/model_executor/models/deepseek_v2.py
+ - vllm/model_executor/models/gpt_oss.py
+ - vllm/model_executor/models/llama4.py
+ - vllm/model_executor/layers/fused_moe
+ - vllm/model_executor/layers/quantization/compressed_tensors
+ - vllm/model_executor/layers/quantization/modelopt.py
+ - vllm/model_executor/layers/quantization/mxfp4.py
+ - vllm/v1/attention/backends/flashinfer.py
+ commands:
+ - pytest -s -v tests/quantization/test_blackwell_moe.py
+
+- label: Blackwell LM Eval Small Models
+ timeout_in_minutes: 120
+ gpu: b200
+ optional: true # run on nightlies
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
##### 1 GPU test #####
##### multi gpus test #####
- label: Distributed Comm Ops Test # 7min
+ timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
@@ -675,8 +893,11 @@ steps:
commands:
- pytest -v -s distributed/test_comm_ops.py
- pytest -v -s distributed/test_shm_broadcast.py
+ - pytest -v -s distributed/test_shm_buffer.py
+ - pytest -v -s distributed/test_shm_storage.py
- label: 2 Node Tests (4 GPUs in total) # 16min
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
@@ -700,47 +921,61 @@ steps:
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
-- label: Distributed Tests (2 GPUs) # 40min
+- label: Distributed Tests (2 GPUs) # 68min
+ timeout_in_minutes: 90
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
+ - vllm/compilation/
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- - vllm/model_executor/models/
- - tests/distributed/
- - vllm/compilation
- vllm/worker/worker_base.py
- - vllm/worker/worker.py
- - vllm/worker/model_runner.py
- - entrypoints/llm/test_collective_rpc.py
- - tests/v1/test_async_llm_dp.py
- - tests/v1/test_external_lb_dp.py
- - tests/v1/entrypoints/openai/test_multi_api_servers.py
- vllm/v1/engine/
+ - vllm/v1/worker/
+ - tests/compile/test_basic_correctness.py
+ - tests/compile/test_wrapper.py
+ - tests/distributed/
+ - tests/entrypoints/llm/test_collective_rpc.py
+ - tests/v1/distributed
+ - tests/v1/entrypoints/openai/test_multi_api_servers.py
+ - tests/v1/shutdown
+ - tests/v1/worker/test_worker_memory_snapshot.py
commands:
- - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+ - pytest -v -s distributed/test_sequence_parallel.py
+ - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+ - pytest -v -s v1/worker/test_worker_memory_snapshot.py
+
+- label: Distributed Model Tests (2 GPUs) # 37min
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/model_executor/model_loader/sharded_state_loader.py
+ - vllm/model_executor/models/
+ - tests/basic_correctness/
+ - tests/model_executor/model_loader/test_sharded_state_loader.py
+ - tests/models/
+ commands:
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+ - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
- - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
- # test sequence parallel
- - pytest -v -s distributed/test_sequence_parallel.py
- # this test fails consistently.
- # TODO: investigate and fix
- - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- - pytest -v -s models/multimodal/generation/test_maverick.py
+ - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
+ - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
- label: Plugin Tests (2 GPUs) # 40min
+ timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
@@ -753,6 +988,11 @@ steps:
- pytest -v -s plugins_tests/test_platform_plugins.py
- pip uninstall vllm_add_dummy_platform -y
# end platform plugin tests
+  # begin io_processor plugins test; all the code in between uses the prithvi_io_processor plugin
+ - pip install -e ./plugins/prithvi_io_processor_plugin
+ - pytest -v -s plugins_tests/test_io_processor_plugins.py
+ - pip uninstall prithvi_io_processor_plugin -y
+ # end io_processor plugins test
# other tests continue here:
- pytest -v -s plugins_tests/test_scheduler_plugins.py
- pip install -e ./plugins/vllm_add_dummy_model
@@ -761,7 +1001,8 @@ steps:
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
-- label: Pipeline Parallelism Test # 45min
+- label: Pipeline + Context Parallelism Test # 45min
+ timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 4
@@ -775,7 +1016,8 @@ steps:
- pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py
-- label: LoRA TP Test (Distributed)
+- label: LoRA TP Test (Distributed) # 17 min
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
num_gpus: 4
source_file_dependencies:
@@ -789,13 +1031,15 @@ steps:
# requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py
- - pytest -v -s -x lora/test_multi_loras_with_tp.py
+ - pytest -v -s -x lora/test_llm_with_multi_loras.py
- label: Weight Loading Multiple GPU Test # 33min
+ timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
+ optional: true
source_file_dependencies:
- vllm/
- tests/weight_loading
@@ -844,9 +1088,36 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
-- label: Qwen MoE EP Test # optional
+##### H200 test #####
+- label: Distributed Tests (H200) # optional
gpu: h200
optional: true
+ working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 /vllm-workspace/examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+ - pytest -v -s tests/compile/test_async_tp.py
+ - pytest -v -s tests/compile/test_sequence_parallelism.py
+ - pytest -v -s tests/distributed/test_context_parallel.py
+ - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+
+##### B200 test #####
+- label: Distributed Tests (B200) # optional
+ gpu: b200
+ optional: true
+ working_dir: "/vllm-workspace/"
+ num_gpus: 2
+ commands:
+ - pytest -v -s tests/distributed/test_context_parallel.py
+ - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+
+##### RL Integration Tests #####
+- label: Prime-RL Integration Test # 15min
+ timeout_in_minutes: 30
+ optional: true
+ num_gpus: 2
+ working_dir: "/vllm-workspace"
+ source_file_dependencies:
+ - vllm/
+ - .buildkite/scripts/run-prime-rl-test.sh
+ commands:
+ - bash .buildkite/scripts/run-prime-rl-test.sh
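Several of the new steps above shard one pytest suite across parallel Buildkite jobs by pairing `parallelism` with `--num-shards`/`--shard-id`. A minimal sketch of that pattern, for orientation only: the step label and test path below are hypothetical and not part of this patch, and it assumes the sharding flags come from a pytest sharding plugin already present in the CI image, with Buildkite exposing `BUILDKITE_PARALLEL_JOB_COUNT` and the zero-based `BUILDKITE_PARALLEL_JOB` to each job and `$$` deferring interpolation to the agent.

- label: Example Sharded Test %N          # hypothetical step, for illustration only
  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/
  - tests/example_suite                   # hypothetical path
  commands:
    # Each of the N parallel jobs collects the suite and runs roughly 1/N of it.
    - pytest -v -s example_suite \
        --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
        --shard-id=$$BUILDKITE_PARALLEL_JOB
  parallelism: 2

The `%N` suffix in the label mirrors how the sharded steps above distinguish the individual parallel jobs in the Buildkite UI.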
diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 0000000000000..bc6342956109b
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,32 @@
+[run]
+source = vllm
+omit =
+ */tests/*
+ */test_*
+ */__pycache__/*
+ */build/*
+ */dist/*
+ */vllm.egg-info/*
+ */third_party/*
+ */examples/*
+ */benchmarks/*
+ */docs/*
+
+[report]
+exclude_lines =
+ pragma: no cover
+ def __repr__
+ if self.debug:
+ if settings.DEBUG
+ raise AssertionError
+ raise NotImplementedError
+ if 0:
+ if __name__ == .__main__.:
+ class .*\bProtocol\):
+ @(abc\.)?abstractmethod
+
+[html]
+directory = htmlcov
+
+[xml]
+output = coverage.xml
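The new `.coveragerc` scopes collection to the `vllm` package and routes reports to `htmlcov/` and `coverage.xml`. A rough sketch of how a test step could consume it (hypothetical step label and test path; coverage.py picks up `.coveragerc` from the working directory by default):

- label: Example Coverage Run             # hypothetical, for illustration only
  commands:
    - coverage run -m pytest -v -s tests/v1/core
    - coverage xml                        # writes coverage.xml per the [xml] section
    - coverage html                       # writes htmlcov/ per the [html] section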
diff --git a/.github/.bc-linter.yml b/.github/.bc-linter.yml
new file mode 100644
index 0000000000000..443dfa45af22c
--- /dev/null
+++ b/.github/.bc-linter.yml
@@ -0,0 +1,24 @@
+# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
+version: 1
+paths:
+# We temporarily disable globally, and will only enable with `annotations.include`
+# include:
+# - "vllm/v1/attention/*.py"
+# - "vllm/v1/core/*.py"
+exclude:
+ - "**/*.py"
+
+scan:
+ functions: true # check free functions and methods
+ classes: true # check classes/dataclasses
+ public_only: true # ignore names starting with "_" at any level
+
+annotations:
+ include: # decorators that force‑include a symbol
+ - name: "bc_linter_include" # matched by simple name or dotted suffix
+ propagate_to_members: false # for classes, include methods/inner classes
+ exclude: # decorators that force‑exclude a symbol
+ - name: "bc_linter_skip" # matched by simple name or dotted suffix
+ propagate_to_members: true # for classes, exclude methods/inner classes
+
+excluded_violations: [] # e.g. ["ParameterRenamed", "FieldTypeChanged"]
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index ce9590f02ce71..dbcad3aa308f5 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -2,64 +2,88 @@
# for more info about CODEOWNERS file
# This lists cover the "core" components of vLLM that require careful review
+/vllm/attention @LucasWilkinson
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
+/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
+/vllm/model_executor/layers/fused_moe @mgoin
+/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
/vllm/model_executor/layers/mamba @tdoublep
-/vllm/multimodal @DarkLight1337 @ywang96
+/vllm/model_executor/model_loader @22quinn
+/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
-/vllm/reasoning @aarnphm
-/vllm/entrypoints @aarnphm
+/vllm/reasoning @aarnphm @chaunceyjiang
+/vllm/entrypoints @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
+/vllm/distributed/kv_transfer @NickLucche @ApostaC
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig changes can have a large user-facing impact,
# so spam a lot of people
/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
+/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/structured_output @mgoin @russellb @aarnphm
+/vllm/v1/attention @LucasWilkinson
+/vllm/v1/attention/backends/flashinfer.py @mgoin
/vllm/v1/attention/backends/triton_attn.py @tdoublep
+/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
+/vllm/v1/sample @22quinn @houseroad @njhill
+/vllm/v1/spec_decode @benchislett @luccafong
+/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
+/vllm/v1/kv_cache_interface.py @heheda12345
+/vllm/v1/offloading @ApostaC
# Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo
-/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
-/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
+/tests/evals @mgoin
+/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
/tests/models @DarkLight1337 @ywang96
-/tests/multimodal @DarkLight1337 @ywang96
-/tests/prefix_caching @comaniac @KuntaiDu
+/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
+/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
/tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/lora @jeejeelee
/tests/models/language/generation/test_hybrid.py @tdoublep
+/tests/v1/kv_connector/nixl_integration @NickLucche
+/tests/v1/kv_connector @ApostaC
+/tests/v1/offloading @ApostaC
+
+# Transformers backend
+/vllm/model_executor/models/transformers.py @hmellor
+/tests/models/test_transformers.py @hmellor
# Docs
-/docs @hmellor
+/docs/mkdocs @hmellor
+/docs/**/*.yml @hmellor
+/requirements/docs.txt @hmellor
+.readthedocs.yaml @hmellor
mkdocs.yaml @hmellor
+# Linting
+.markdownlint.yaml @hmellor
+.pre-commit-config.yaml @hmellor
+/tools/pre_commit @hmellor
+
# CPU
-/vllm/v1/worker/^cpu @bigPYJ1151
+/vllm/v1/worker/cpu* @bigPYJ1151
/csrc/cpu @bigPYJ1151
/vllm/platforms/cpu.py @bigPYJ1151
/cmake/cpu_extension.cmake @bigPYJ1151
/docker/Dockerfile.cpu @bigPYJ1151
# Intel GPU
-/vllm/v1/worker/^xpu @jikunshang
+/vllm/v1/worker/xpu* @jikunshang
/vllm/platforms/xpu.py @jikunshang
/docker/Dockerfile.xpu @jikunshang
@@ -67,6 +91,9 @@ mkdocs.yaml @hmellor
/vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow
/vllm/model_executor/models/qwen* @sighingnow
+# MTP-specific files
+/vllm/model_executor/models/deepseek_mtp.py @luccafong
+
# Mistral-specific files
/vllm/model_executor/models/mistral*.py @patrickvonplaten
/vllm/model_executor/models/mixtral*.py @patrickvonplaten
@@ -79,4 +106,18 @@ mkdocs.yaml @hmellor
/vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep
/vllm/attention/ops/triton_unified_attention.py @tdoublep
+# ROCm related: specify owner with write access to notify AMD folks for careful code review
+/docker/Dockerfile.rocm* @gshtras
+/vllm/v1/attention/backends/rocm*.py @gshtras
+/vllm/v1/attention/backends/mla/rocm*.py @gshtras
+/vllm/attention/ops/rocm*.py @gshtras
+/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
+# TPU
+/vllm/v1/worker/tpu* @NickLucche
+/vllm/platforms/tpu.py @NickLucche
+/vllm/v1/sample/tpu @NickLucche
+/vllm/tests/v1/tpu @NickLucche
+
+# KVConnector installation files
+/requirements/kv_connectors.txt @NickLucche
diff --git a/.github/ISSUE_TEMPLATE/750-RFC.yml b/.github/ISSUE_TEMPLATE/750-RFC.yml
index 7ee57c42895ca..c0e009855964a 100644
--- a/.github/ISSUE_TEMPLATE/750-RFC.yml
+++ b/.github/ISSUE_TEMPLATE/750-RFC.yml
@@ -43,10 +43,6 @@ body:
Any other things you would like to mention.
validations:
required: false
-- type: markdown
- attributes:
- value: >
- Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit).
- type: checkboxes
id: askllm
attributes:
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 1b30c1292df85..8043df65d5585 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -7,8 +7,6 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTT
## Test Result
-## (Optional) Documentation Update
-
---
Essential Elements of an Effective PR Description Checklist
@@ -17,6 +15,7 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTT
- [ ] The test plan, such as providing test command.
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
+- [ ] (Optional) Release notes update. If your change is user facing, please update the release notes draft in the [Google Doc](https://docs.google.com/document/d/1YyVqrgX4gHTtrstbq8oWUImOyPCKSGnJ7xtTpmXzlRs/edit?tab=t.0).
**BEFORE SUBMITTING, PLEASE READ ** (anything written below this line will be removed by GitHub Actions)
diff --git a/.github/mergify.yml b/.github/mergify.yml
index 495d207d44260..de1a8314a4ecd 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -2,6 +2,7 @@ pull_request_rules:
- name: label-documentation
description: Automatically apply documentation label
conditions:
+ - label != stale
- or:
- files~=^[^/]+\.md$
- files~=^docs/
@@ -10,10 +11,13 @@ pull_request_rules:
label:
add:
- documentation
+ comment:
+ message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
- name: label-ci-build
description: Automatically apply ci/build label
conditions:
+ - label != stale
- or:
- files~=^\.github/
- files~=\.buildkite/
@@ -30,6 +34,7 @@ pull_request_rules:
- name: label-deepseek
description: Automatically apply deepseek label
conditions:
+ - label != stale
- or:
- files~=^examples/.*deepseek.*\.py
- files~=^tests/.*deepseek.*\.py
@@ -46,6 +51,7 @@ pull_request_rules:
- name: label-frontend
description: Automatically apply frontend label
conditions:
+ - label != stale
- files~=^vllm/entrypoints/
actions:
label:
@@ -55,6 +61,7 @@ pull_request_rules:
- name: label-llama
description: Automatically apply llama label
conditions:
+ - label != stale
- or:
- files~=^examples/.*llama.*\.py
- files~=^tests/.*llama.*\.py
@@ -70,6 +77,7 @@ pull_request_rules:
- name: label-multi-modality
description: Automatically apply multi-modality label
conditions:
+ - label != stale
- or:
- files~=^vllm/multimodal/
- files~=^tests/multimodal/
@@ -83,6 +91,7 @@ pull_request_rules:
- name: label-new-model
description: Automatically apply new-model label
conditions:
+ - label != stale
- and:
- files~=^vllm/model_executor/models/
- files=vllm/model_executor/models/registry.py
@@ -94,6 +103,7 @@ pull_request_rules:
- name: label-performance
description: Automatically apply performance label
conditions:
+ - label != stale
- or:
- files~=^benchmarks/
- files~=^vllm/benchmarks/
@@ -107,6 +117,7 @@ pull_request_rules:
- name: label-qwen
description: Automatically apply qwen label
conditions:
+ - label != stale
- or:
- files~=^examples/.*qwen.*\.py
- files~=^tests/.*qwen.*\.py
@@ -121,12 +132,20 @@ pull_request_rules:
- name: label-gpt-oss
description: Automatically apply gpt-oss label
conditions:
+ - label != stale
- or:
- files~=^examples/.*gpt[-_]?oss.*\.py
- files~=^tests/.*gpt[-_]?oss.*\.py
+ - files~=^tests/entrypoints/openai/test_response_api_with_harmony.py
+ - files~=^tests/entrypoints/test_context.py
- files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
- files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
+ - files~=^vllm/entrypoints/harmony_utils.py
+ - files~=^vllm/entrypoints/tool_server.py
+ - files~=^vllm/entrypoints/tool.py
+ - files~=^vllm/entrypoints/context.py
- title~=(?i)gpt[-_]?oss
+ - title~=(?i)harmony
actions:
label:
add:
@@ -135,6 +154,7 @@ pull_request_rules:
- name: label-rocm
description: Automatically apply rocm label
conditions:
+ - label != stale
- or:
- files~=^csrc/rocm/
- files~=^docker/Dockerfile.rocm
@@ -155,6 +175,7 @@ pull_request_rules:
- name: label-structured-output
description: Automatically apply structured-output label
conditions:
+ - label != stale
- or:
- files~=^benchmarks/structured_schemas/
- files=benchmarks/benchmark_serving_structured_output.py
@@ -164,7 +185,7 @@ pull_request_rules:
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
- files~=^tests/v1/structured_output/
- - files=tests/v1/entrypoints/llm/test_guided_generate.py
+ - files=tests/v1/entrypoints/llm/test_struct_output_generate.py
- files~=^vllm/v1/structured_output/
actions:
label:
@@ -174,6 +195,7 @@ pull_request_rules:
- name: label-speculative-decoding
description: Automatically apply speculative-decoding label
conditions:
+ - label != stale
- or:
- files~=^vllm/v1/spec_decode/
- files~=^tests/v1/spec_decode/
@@ -189,6 +211,7 @@ pull_request_rules:
- name: label-v1
description: Automatically apply v1 label
conditions:
+ - label != stale
- or:
- files~=^vllm/v1/
- files~=^tests/v1/
@@ -201,6 +224,7 @@ pull_request_rules:
description: Automatically apply tpu label
# Keep this list in sync with `label-tpu-remove` conditions
conditions:
+ - label != stale
- or:
- files~=tpu.py
- files~=_tpu
@@ -216,6 +240,7 @@ pull_request_rules:
description: Automatically remove tpu label
# Keep this list in sync with `label-tpu` conditions
conditions:
+ - label != stale
- and:
- -files~=tpu.py
- -files~=_tpu
@@ -230,9 +255,9 @@ pull_request_rules:
- name: label-tool-calling
description: Automatically add tool-calling label
conditions:
+ - label != stale
- or:
- files~=^tests/tool_use/
- - files~=^tests/mistral_tool_use/
- files~=^tests/entrypoints/openai/tool_parsers/
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
- files~=^vllm/entrypoints/openai/tool_parsers/
@@ -249,8 +274,9 @@ pull_request_rules:
- name: ping author on conflicts and add 'needs-rebase' label
conditions:
- - conflict
- - -closed
+ - label != stale
+ - conflict
+ - -closed
actions:
label:
add:
@@ -264,20 +290,55 @@ pull_request_rules:
- name: assign reviewer for tensorizer changes
conditions:
+ - label != stale
+ - or:
- files~=^vllm/model_executor/model_loader/tensorizer.py
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
- - files~=^tests/tensorizer_loader/
+ - files~=^tests/model_executor/model_loader/tensorizer_loader/
actions:
assign:
users:
- "sangstar"
+- name: assign reviewer for modelopt changes
+ conditions:
+ - label != stale
+ - or:
+ - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
+ - files~=^vllm/model_executor/layers/quantization/__init__\.py$
+ - files~=^tests/models/quantization/test_modelopt\.py$
+ - files~=^tests/quantization/test_modelopt\.py$
+ - files~=^tests/models/quantization/test_nvfp4\.py$
+ - files~=^docs/features/quantization/modelopt\.md$
+ actions:
+ assign:
+ users:
+ - "Edwardf0t1"
+
- name: remove 'needs-rebase' label when conflict is resolved
conditions:
- - -conflict
- - -closed
+ - -conflict
+ - -closed
actions:
label:
remove:
- needs-rebase
+
+- name: label-kv-connector
+ description: Automatically apply kv-connector label
+ conditions:
+ - label != stale
+ - or:
+ - files~=^examples/online_serving/disaggregated[^/]*/.*
+ - files~=^examples/offline_inference/disaggregated[^/]*/.*
+ - files~=^examples/others/lmcache/
+ - files~=^tests/v1/kv_connector/
+ - files~=^vllm/distributed/kv_transfer/
+ - title~=(?i)\bP/?D\b
+ - title~=(?i)NIXL
+ - title~=(?i)LMCache
+ actions:
+ label:
+ add:
+ - kv-connector
\ No newline at end of file
diff --git a/.github/scale-config.yml b/.github/scale-config.yml
new file mode 100644
index 0000000000000..c41a3ee3eb196
--- /dev/null
+++ b/.github/scale-config.yml
@@ -0,0 +1,21 @@
+# scale-config.yml:
+# Powers what instance types are available for GHA auto-scaled
+# runners. Runners listed here will be available as self hosted
+# runners, configuration is directly pulled from the main branch.
+# runner_types:
+# runner_label:
+# instance_type: m4.large
+# os: linux
+# # min_available defaults to the global cfg in the ALI Terraform
+# min_available: undefined
+# # when max_available value is not defined, no max runners is enforced
+# max_available: undefined
+# disk_size: 50
+# is_ephemeral: true
+
+runner_types:
+ linux.2xlarge:
+ disk_size: 150
+ instance_type: c5.2xlarge
+ is_ephemeral: true
+ os: linux
diff --git a/.github/workflows/add_label_automerge.yml b/.github/workflows/add_label_automerge.yml
index 315042fbf5cf4..d8bbedef3174b 100644
--- a/.github/workflows/add_label_automerge.yml
+++ b/.github/workflows/add_label_automerge.yml
@@ -10,7 +10,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Add label
- uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
github.rest.issues.addLabels({
diff --git a/.github/workflows/bc-lint.yml b/.github/workflows/bc-lint.yml
new file mode 100644
index 0000000000000..823695a921321
--- /dev/null
+++ b/.github/workflows/bc-lint.yml
@@ -0,0 +1,29 @@
+name: BC Lint
+
+on:
+ pull_request:
+ types:
+ - opened
+ - synchronize
+ - reopened
+ - labeled
+ - unlabeled
+
+jobs:
+ bc_lint:
+ if: github.repository_owner == 'vllm-project'
+ runs-on: ubuntu-latest
+ steps:
+ - name: Run BC Lint Action
+ uses: pytorch/test-infra/.github/actions/bc-lint@main
+ with:
+ repo: ${{ github.event.pull_request.head.repo.full_name }}
+ base_sha: ${{ github.event.pull_request.base.sha }}
+ head_sha: ${{ github.event.pull_request.head.sha }}
+ suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }}
+ docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter'
+ config_dir: .github
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
+ cancel-in-progress: true
diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml
index d5c6b8d43a6ef..c3e132a536a42 100644
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@@ -16,7 +16,7 @@ jobs:
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python
- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+ uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: '3.12'
diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml
new file mode 100644
index 0000000000000..c2b17abe811cd
--- /dev/null
+++ b/.github/workflows/issue_autolabel.yml
@@ -0,0 +1,309 @@
+name: Label issues based on keywords
+on:
+ issues:
+ types: [opened, edited, reopened]
+permissions:
+ issues: write # needed so the workflow can add labels
+ contents: read
+concurrency:
+ group: issue-labeler-${{ github.event.issue.number }}
+ cancel-in-progress: true
+jobs:
+ add-labels:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Label issues based on keywords
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
+ with:
+ script: |
+ // Configuration: Add new labels and keywords here
+ const labelConfig = {
+ rocm: {
+ // Keyword search - matches whole words only (with word boundaries)
+ keywords: [
+ {
+ term: "composable kernel",
+ searchIn: "both"
+ },
+ {
+ term: "rccl",
+ searchIn: "body" // only search in body
+ },
+ {
+ term: "migraphx",
+ searchIn: "title" // only search in title
+ },
+ {
+ term: "hipgraph",
+ searchIn: "both"
+ },
+ {
+ term: "ROCm System Management Interface",
+ searchIn: "body"
+ },
+ ],
+
+ // Substring search - matches anywhere in text (partial matches)
+ substrings: [
+ {
+ term: "VLLM_ROCM_",
+ searchIn: "both"
+ },
+ {
+ term: "aiter",
+ searchIn: "title"
+ },
+ {
+ term: "rocm",
+ searchIn: "title"
+ },
+ {
+ term: "amd",
+ searchIn: "title"
+ },
+ {
+ term: "hip-",
+ searchIn: "both"
+ },
+ {
+ term: "gfx",
+ searchIn: "both"
+ },
+ {
+ term: "cdna",
+ searchIn: "both"
+ },
+ {
+ term: "rdna",
+ searchIn: "both"
+ },
+ {
+ term: "torch_hip",
+ searchIn: "body" // only in body
+ },
+ {
+ term: "_hip",
+ searchIn: "both"
+ },
+ {
+ term: "hip_",
+ searchIn: "both"
+ },
+
+ // ROCm tools and libraries
+ {
+ term: "hipify",
+ searchIn: "both"
+ },
+ ],
+
+ // Regex patterns - for complex pattern matching
+ regexPatterns: [
+ {
+ pattern: "\\bmi\\d{3}[a-z]*\\b",
+ description: "AMD GPU names (mi + 3 digits + optional letters)",
+ flags: "gi",
+ searchIn: "both" // "title", "body", or "both"
+ }
+ ],
+ },
+ };
+
+ // Helper function to create regex based on search type
+ function createSearchRegex(term, type) {
+ // Escape special regex characters in the term
+ const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+
+ switch (type) {
+ case 'keyword':
+ // Word boundary search - matches whole words only
+ return new RegExp(`\\b${escapedTerm}\\b`, "gi");
+ case 'substring':
+ // Substring search - matches anywhere in the text
+ return new RegExp(escapedTerm, "gi");
+ default:
+ throw new Error(`Unknown search type: ${type}`);
+ }
+ }
+
+ // Helper function to find matching terms in text with line information
+ function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
+ const matches = [];
+ const lines = text.split('\n');
+
+ for (const termConfig of searchTerms) {
+ let regex;
+ let term, searchIn, pattern, description, flags;
+
+ // Handle different input formats (string or object)
+ if (typeof termConfig === 'string') {
+ term = termConfig;
+ searchIn = 'both'; // default
+ } else {
+ term = termConfig.term;
+ searchIn = termConfig.searchIn || 'both';
+ pattern = termConfig.pattern;
+ description = termConfig.description;
+ flags = termConfig.flags;
+ }
+
+ // Skip if this term shouldn't be searched in the current location
+ if (searchIn !== 'both' && searchIn !== searchLocation) {
+ continue;
+ }
+
+ // Create appropriate regex
+ if (searchType === 'regex') {
+ regex = new RegExp(pattern, flags || "gi");
+ } else {
+ regex = createSearchRegex(term, searchType);
+ }
+
+ const termMatches = [];
+
+ // Check each line for matches
+ lines.forEach((line, lineIndex) => {
+ const lineMatches = line.match(regex);
+ if (lineMatches) {
+ lineMatches.forEach(match => {
+ termMatches.push({
+ match: match,
+ lineNumber: lineIndex + 1,
+ lineContent: line.trim(),
+ searchType: searchType,
+ searchLocation: searchLocation,
+ originalTerm: term || pattern,
+ description: description,
+ // Show context around the match in the line
+ context: line.length > 100 ?
+ line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
+ line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
+ : line.trim()
+ });
+ });
+ }
+ });
+
+ if (termMatches.length > 0) {
+ matches.push({
+ term: term || (description || pattern),
+ searchType: searchType,
+ searchLocation: searchLocation,
+ searchIn: searchIn,
+ pattern: pattern,
+ matches: termMatches,
+ count: termMatches.length
+ });
+ }
+ }
+
+ return matches;
+ }
+
+ // Helper function to check if label should be added
+ async function processLabel(labelName, config) {
+ const body = context.payload.issue.body || "";
+ const title = context.payload.issue.title || "";
+
+ core.notice(`Processing label: ${labelName}`);
+ core.notice(`Issue Title: "${title}"`);
+ core.notice(`Issue Body length: ${body.length} characters`);
+
+ let shouldAddLabel = false;
+ let allMatches = [];
+ let reason = '';
+
+ const keywords = config.keywords || [];
+ const substrings = config.substrings || [];
+ const regexPatterns = config.regexPatterns || [];
+
+ core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
+
+ // Search in title
+ if (title.trim()) {
+ core.notice(`Searching in title: "${title}"`);
+
+ const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
+ const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
+ const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
+
+ allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
+ }
+
+ // Search in body
+ if (body.trim()) {
+ core.notice(`Searching in body (${body.length} characters)`);
+
+ const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
+ const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
+ const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
+
+ allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
+ }
+
+ if (allMatches.length > 0) {
+ core.notice(`Found ${allMatches.length} matching term(s):`);
+
+ for (const termMatch of allMatches) {
+ const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
+ const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
+
+ if (termMatch.searchType === 'regex') {
+ core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
+ } else {
+ core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
+ }
+
+ // Show details for each match
+ termMatch.matches.forEach((match, index) => {
+ core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
+ if (match.description) {
+ core.notice(` Description: ${match.description}`);
+ }
+ core.notice(` Context: ${match.context}`);
+ if (match.lineContent !== match.context) {
+ core.notice(` Full line: ${match.lineContent}`);
+ }
+ });
+ }
+
+ shouldAddLabel = true;
+ const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
+ const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
+ const bodyMatches = allMatches.filter(t => t.searchLocation === 'body').reduce((sum, t) => sum + t.count, 0);
+ const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
+ const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
+ const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
+
+ reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
+ }
+
+ core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
+ core.notice(`Reason: ${reason || 'No matching terms found'}`);
+
+ if (shouldAddLabel) {
+ const existingLabels = context.payload.issue.labels.map(l => l.name);
+ if (!existingLabels.includes(labelName)) {
+ await github.rest.issues.addLabels({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ issue_number: context.issue.number,
+ labels: [labelName],
+ });
+ core.notice(`Label "${labelName}" added. ${reason}`);
+ return true;
+ }
+ core.notice(`Label "${labelName}" already present.`);
+ return false;
+ }
+
+ core.notice(`No matching terms found for label "${labelName}".`);
+ return false;
+ }
+
+ // Process all configured labels
+ const processLabels = Object.entries(labelConfig)
+ .map(([labelName, config]) => processLabel(labelName, config));
+ const labelsAdded = await Promise.all(processLabels);
+ const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
+ core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
\ No newline at end of file
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index 195579f206a2f..e21d13b8161f3 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -17,7 +17,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+ - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: "3.12"
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml
index 1ee605dc7bb0d..8884359fa0ce4 100644
--- a/.github/workflows/reminder_comment.yml
+++ b/.github/workflows/reminder_comment.yml
@@ -9,7 +9,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Remind to run full CI on PR
- uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
try {
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index 656f3d3fa7bc4..dca3089f496c9 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -13,7 +13,7 @@ jobs:
actions: write
runs-on: ubuntu-latest
steps:
- - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
+ - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
with:
# Increasing this value ensures that changes to this workflow
# propagate to all issues and PRs in days rather than months
diff --git a/.gitignore b/.gitignore
index 465935d488f84..b1df673e83ca8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,7 +4,7 @@
# vllm-flash-attn built from source
vllm/vllm_flash_attn/*
-# triton jit
+# triton jit
.triton
# Byte-compiled / optimized / DLL files
@@ -177,6 +177,14 @@ cython_debug/
# VSCode
.vscode/
+# Claude
+CLAUDE.md
+.claude/
+
+# Codex
+AGENTS.md
+.codex/
+
# DS Store
.DS_Store
@@ -209,4 +217,4 @@ shellcheck*/
csrc/moe/marlin_moe_wna16/kernel_*
# Ignore ep_kernels_workspace folder
-ep_kernels_workspace/
\ No newline at end of file
+ep_kernels_workspace/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 612b290e88d46..832c3edcdc7fe 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -6,30 +6,18 @@ default_stages:
- manual # Run in CI
exclude: 'vllm/third_party/.*'
repos:
-- repo: https://github.com/google/yapf
- rev: v0.43.0
- hooks:
- - id: yapf
- args: [--in-place, --verbose]
- # Keep the same list from yapfignore here to avoid yapf failing without any inputs
- exclude: '(.buildkite|benchmarks|build|examples)/.*'
- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.11.7
+ rev: v0.14.0
hooks:
- - id: ruff
+ - id: ruff-check
args: [--output-format, github, --fix]
- id: ruff-format
- files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/crate-ci/typos
- rev: v1.34.0
+ rev: v1.38.1
hooks:
- id: typos
-- repo: https://github.com/PyCQA/isort
- rev: 6.0.1
- hooks:
- - id: isort
- repo: https://github.com/pre-commit/mirrors-clang-format
- rev: v20.1.3
+ rev: v21.1.2
hooks:
- id: clang-format
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
@@ -46,10 +34,10 @@ repos:
hooks:
- id: actionlint
- repo: https://github.com/astral-sh/uv-pre-commit
- rev: 0.6.17
+ rev: 0.9.1
hooks:
- id: pip-compile
- args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
+ args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
files: ^requirements/test\.(in|txt)$
- repo: local
hooks:
@@ -60,38 +48,32 @@ repos:
files: ^requirements/test\.(in|txt)$
- id: mypy-local
name: Run mypy for local Python installation
- entry: tools/mypy.sh 0 "local"
- language: python
- types: [python]
- additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
+ entry: python tools/pre_commit/mypy.py 0 "local"
stages: [pre-commit] # Don't run in CI
- - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
- name: Run mypy for Python 3.9
- entry: tools/mypy.sh 1 "3.9"
- language: python
- types: [python]
- additional_dependencies: *mypy_deps
- stages: [manual] # Only run in CI
+ <<: &mypy_common
+ language: python
+ types_or: [python, pyi]
+ require_serial: true
+ additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.10
- entry: tools/mypy.sh 1 "3.10"
- language: python
- types: [python]
- additional_dependencies: *mypy_deps
+ entry: python tools/pre_commit/mypy.py 1 "3.10"
+ <<: *mypy_common
stages: [manual] # Only run in CI
- id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.11
- entry: tools/mypy.sh 1 "3.11"
- language: python
- types: [python]
- additional_dependencies: *mypy_deps
+ entry: python tools/pre_commit/mypy.py 1 "3.11"
+ <<: *mypy_common
stages: [manual] # Only run in CI
- id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.12
- entry: tools/mypy.sh 1 "3.12"
- language: python
- types: [python]
- additional_dependencies: *mypy_deps
+ entry: python tools/pre_commit/mypy.py 1 "3.12"
+ <<: *mypy_common
+ stages: [manual] # Only run in CI
+ - id: mypy-3.13 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+ name: Run mypy for Python 3.13
+ entry: python tools/pre_commit/mypy.py 1 "3.13"
+ <<: *mypy_common
stages: [manual] # Only run in CI
- id: shellcheck
name: Lint shell scripts
@@ -155,18 +137,15 @@ repos:
additional_dependencies: [regex]
- id: check-pickle-imports
name: Prevent new pickle/cloudpickle imports
- entry: python tools/check_pickle_imports.py
+ entry: python tools/pre_commit/check_pickle_imports.py
language: python
types: [python]
- pass_filenames: false
- additional_dependencies: [pathspec, regex]
+ additional_dependencies: [regex]
- id: validate-config
name: Validate configuration has default values and that each field has a docstring
entry: python tools/validate_config.py
language: python
- types: [python]
- pass_filenames: true
- files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py
+ additional_dependencies: [regex]
# Keep `suggestion` last
- id: suggestion
name: Suggestion
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 4329750090683..d83d6df35ed9a 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -13,6 +13,7 @@ build:
mkdocs:
configuration: mkdocs.yaml
+ fail_on_warning: true
# Optionally declare the Python requirements required to build your docs
python:
diff --git a/.yapfignore b/.yapfignore
index 2d6dcf8380cac..38158259032a6 100644
--- a/.yapfignore
+++ b/.yapfignore
@@ -1 +1,2 @@
collect_env.py
+vllm/model_executor/layers/fla/ops/*.py
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a1deefb07f09c..005590445361a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,6 +13,10 @@ cmake_minimum_required(VERSION 3.26)
# cmake --install . --component _C
project(vllm_extensions LANGUAGES CXX)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
@@ -30,10 +34,10 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
#
-set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12", "3.13")
+set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
# Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
#
# Supported/expected torch versions for CUDA/ROCm.
@@ -45,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm
#
-set(TORCH_SUPPORTED_VERSION_CUDA "2.7.1")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")
#
# Try to find python package with an executable that exactly matches
@@ -82,6 +86,9 @@ find_package(Torch REQUIRED)
# Supported NVIDIA architectures.
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
+ CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
+ set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0")
+elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
else()
@@ -171,6 +178,25 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif()
+#
+# Set compression mode for CUDA >=13.x.
+#
+if(VLLM_GPU_LANG STREQUAL "CUDA" AND
+ DEFINED CMAKE_CUDA_COMPILER_VERSION AND
+ CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
+ list(APPEND VLLM_GPU_FLAGS "--compress-mode=size")
+endif()
+
+#
+# Set CUDA include flags for CXX compiler.
+#
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include")
+ if(CUDA_VERSION VERSION_GREATER_EQUAL 13.0)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include/cccl")
+ endif()
+endif()
+
#
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
@@ -243,8 +269,8 @@ set(VLLM_EXT_SRC
"csrc/sampler.cu"
"csrc/cuda_view.cu"
"csrc/quantization/gptq/q_gemm.cu"
- "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
- "csrc/quantization/fp8/common.cu"
+ "csrc/quantization/w8a8/int8/scaled_quant.cu"
+ "csrc/quantization/w8a8/fp8/common.cu"
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/quantization/activation_kernels.cu"
@@ -256,7 +282,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
- set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")
+ set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -288,14 +314,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_EXT_SRC
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/permute_cols.cu"
- "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
+ "csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
- "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/cutlass_extensions/common.cpp"
- "csrc/attention/mla/cutlass_mla_entry.cu"
- "csrc/quantization/fp8/per_token_group_quant.cu")
+ "csrc/quantization/w8a8/fp8/per_token_group_quant.cu"
+ "csrc/quantization/w8a8/int8/per_token_group_quant.cu")
set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}"
@@ -399,11 +424,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
set(SRCS
- "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
- "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
- "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
- "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
- "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
+ "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu"
+ "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu"
+ "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu"
+ "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu"
+ "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -427,12 +452,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.8 or later
- cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
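+  # CUDA 13 understands family-specific targets (the "f" suffix, e.g. 12.0f)
+  # that cover every chip in the family; older toolkits get the chip-specific
+  # "a" targets instead.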
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}")
+ else()
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}")
+ endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS
- "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
- "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
- "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu"
+ "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu"
+ "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu"
+ "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu"
)
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@@ -457,12 +486,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
# require CUDA 12.8 or later
- cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+ else()
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+ endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS
- "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
- "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
- "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
+ "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu"
+ "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu"
+ "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu"
)
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@@ -493,7 +526,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS)
- set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
+ set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
@@ -537,10 +570,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
# CUDA 12.8 or later
- cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+ cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}")
+ else()
+ cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}")
+ endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
+ "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@@ -555,10 +593,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
# FP4 Archs and flags
- cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+ cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+ else()
+ cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
+ endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
+ "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
"csrc/quantization/fp4/nvfp4_experts_quant.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
@@ -576,10 +619,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
# CUTLASS MLA Archs and flags
- cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+ cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+ else()
+ cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+ endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
set(SRCS
- "csrc/attention/mla/cutlass_mla_kernels.cu"
"csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@@ -603,7 +649,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# if it's possible to compile MoE kernels that use its output.
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
- set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu")
+ set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -621,9 +667,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()
- cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
+ else()
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+ endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
- set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
+ set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -642,9 +692,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
# moe_data.cu is used by all CUTLASS MoE kernels.
- cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+ cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+ else()
+ cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+ endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
- set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
+ set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
@@ -661,9 +715,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()
- cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+ else()
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+ endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
- set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
+ set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -750,6 +808,44 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"found in CUDA target architectures")
endif()
endif()
+
+ # Only build W4A8 kernels if we are building for something compatible with sm90a
+ cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
+ set(SRCS
+ "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")
+
+ set_gencode_flags_for_srcs(
+ SRCS "${SRCS}"
+ CUDA_ARCHS "${W4A8_ARCHS}")
+
+ list(APPEND VLLM_EXT_SRC "${SRCS}")
+
+ message(STATUS "Building W4A8 kernels for archs: ${W4A8_ARCHS}")
+ else()
+ if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0
+ AND W4A8_ARCHS)
+ message(STATUS "Not building W4A8 kernels as CUDA Compiler version is "
+                   "not >= 12.0; we recommend upgrading to CUDA 12.0 or "
+                   "later if you intend to run w4a8 quantized models on "
+ "Hopper.")
+ else()
+ message(STATUS "Not building W4A8 kernels as no compatible archs "
+ "found in CUDA target architectures")
+ endif()
+ endif()
+
+ # Hadacore kernels
+ cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
+ if(HADACORE_ARCHS)
+ set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
+ set_gencode_flags_for_srcs(
+ SRCS "${SRCS}"
+ CUDA_ARCHS "${HADACORE_ARCHS}")
+ list(APPEND VLLM_EXT_SRC "${SRCS}")
+ message(STATUS "Building hadacore")
+ endif()
+
# if CUDA endif
endif()
@@ -790,7 +886,9 @@ set(VLLM_MOE_EXT_SRC
"csrc/moe/topk_softmax_kernels.cu")
if(VLLM_GPU_LANG STREQUAL "CUDA")
- list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")
+ list(APPEND VLLM_MOE_EXT_SRC
+ "csrc/moe/moe_wna16.cu"
+ "csrc/moe/grouped_topk_kernels.cu")
endif()
if(VLLM_GPU_LANG STREQUAL "CUDA")
@@ -909,6 +1007,7 @@ endif()
# For CUDA we also build and ship some external projects.
if (VLLM_GPU_LANG STREQUAL "CUDA")
include(cmake/external_projects/flashmla.cmake)
+ include(cmake/external_projects/qutlass.cmake)
# vllm-flash-attn should be last as it overwrites some CMake functions
include(cmake/external_projects/vllm_flash_attn.cmake)
diff --git a/MANIFEST.in b/MANIFEST.in
index 82fd22b845f09..fb3cccbb4a9c1 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -2,7 +2,6 @@ include LICENSE
include requirements/common.txt
include requirements/cuda.txt
include requirements/rocm.txt
-include requirements/neuron.txt
include requirements/cpu.txt
include CMakeLists.txt
diff --git a/README.md b/README.md
index fd8b02ac1f781..3dcdd7dc00942 100644
--- a/README.md
+++ b/README.md
@@ -14,18 +14,26 @@ Easy, fast, and cheap LLM serving for everyone
| Documentation | Blog | Paper | Twitter/X | User Forum | Developer Slack |
+---
+Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundation.org/pytorch-conference/) and [Ray Summit, November 3-5](https://www.anyscale.com/ray-summit/2025) in San Francisco for our latest updates on vLLM and to meet the vLLM team! Register now for the largest vLLM community events of the year!
+
---
*Latest News* 🔥
-- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
-- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
+- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
+- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
+- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
+- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
Previous News
+- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
+- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
+- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
@@ -74,7 +82,7 @@ vLLM is flexible and easy to use with:
- Tensor, pipeline, data and expert parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
-- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
+- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU, as well as hardware plugins for platforms such as Intel Gaudi, IBM Spyre, and Huawei Ascend
- Prefix caching support
- Multi-LoRA support
@@ -141,6 +149,7 @@ Compute Resources:
- Trainy
- UC Berkeley
- UC San Diego
+- Volcengine
Slack Sponsor: Anyscale
diff --git a/SECURITY.md b/SECURITY.md
index 414669fb3712e..d6319cdb1ac27 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -42,4 +42,9 @@ For certain security issues of CRITICAL, HIGH, or MODERATE severity level, we ma
* If you wish to be added to the prenotification group, please send an email copying all the members of the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). Each vendor contact will be analyzed on a case-by-case basis.
+* Organizations and vendors that either ship or use vLLM are eligible to join the prenotification group if they meet at least one of the following qualifications:
+ * Substantial internal deployment leveraging the upstream vLLM project.
+ * Established internal security teams and comprehensive compliance measures.
+ * Active and consistent contributions to the upstream vLLM project.
+
* We may withdraw organizations from receiving future prenotifications if they release fixes or any other information about issues before they are public. Group membership may also change based on policy refinements for who may be included.
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 176b40212978f..269a4d51ec2ef 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1,725 +1,20 @@
-# Benchmarking vLLM
+# Benchmarks
-This README guides you through running benchmark tests with the extensive
-datasets supported on vLLM. It’s a living document, updated as new features and datasets
-become available.
+This directory used to contain vLLM's benchmark scripts and utilities for performance testing and evaluation.
-## Dataset Overview
+## Contents
-
-| Dataset | Online | Offline | Data Path |
-|---------|--------|---------|-----------|
-| ShareGPT | ✅ | ✅ | `wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json` |
-| ShareGPT4V (Image) | ✅ | ✅ | `wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/blob/main/sharegpt4v_instruct_gpt4-vision_cap100k.json` (images must be downloaded separately, e.g. COCO's 2017 Train images: `wget http://images.cocodataset.org/zips/train2017.zip`) |
-| ShareGPT4Video (Video) | ✅ | ✅ | `git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video` |
-| BurstGPT | ✅ | ✅ | `wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv` |
-| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
-| Random | ✅ | ✅ | synthetic |
-| Prefix Repetition | ✅ | ✅ | synthetic |
-| HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
-| HuggingFace-InstructCoder | ✅ | ✅ | `likaixin/InstructCoder` |
-| HuggingFace-AIMO | ✅ | ✅ | `AI-MO/aimo-validation-aime`, `AI-MO/NuminaMath-1.5`, `AI-MO/NuminaMath-CoT` |
-| HuggingFace-Other | ✅ | ✅ | `lmms-lab/LLaVA-OneVision-Data`, `Aeala/ShareGPT_Vicuna_unfiltered` |
-| Custom | ✅ | ✅ | Local file: `data.jsonl` |
-
+- **Serving benchmarks**: Scripts for testing online inference performance (latency, throughput)
+- **Throughput benchmarks**: Scripts for testing offline batch inference performance
+- **Specialized benchmarks**: Tools for testing specific features like structured output, prefix caching, long document QA, request prioritization, and multi-modal inference
+- **Dataset utilities**: Framework for loading and sampling from various benchmark datasets (ShareGPT, HuggingFace datasets, synthetic data, etc.)
-✅: supported
+## Usage
-🟡: Partial support
+For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/contributing/benchmarks.html#benchmark-cli).
-🚧: to be supported
+For full CLI reference see:
-**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
-
-## 🚀 Example - Online Benchmark
-
-
-
-
-
-First start serving your model
-
-```bash
-vllm serve NousResearch/Hermes-3-Llama-3.1-8B
-```
-
-Then run the benchmarking script
-
-```bash
-# download dataset
-# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-vllm bench serve \
- --backend vllm \
- --model NousResearch/Hermes-3-Llama-3.1-8B \
- --endpoint /v1/completions \
- --dataset-name sharegpt \
- --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json \
- --num-prompts 10
-```
-
-If successful, you will see the following output
-
-```text
-============ Serving Benchmark Result ============
-Successful requests: 10
-Benchmark duration (s): 5.78
-Total input tokens: 1369
-Total generated tokens: 2212
-Request throughput (req/s): 1.73
-Output token throughput (tok/s): 382.89
-Total Token throughput (tok/s): 619.85
----------------Time to First Token----------------
-Mean TTFT (ms): 71.54
-Median TTFT (ms): 73.88
-P99 TTFT (ms): 79.49
------Time per Output Token (excl. 1st token)------
-Mean TPOT (ms): 7.91
-Median TPOT (ms): 7.96
-P99 TPOT (ms): 8.03
----------------Inter-token Latency----------------
-Mean ITL (ms): 7.74
-Median ITL (ms): 7.70
-P99 ITL (ms): 8.39
-==================================================
-```
-
-### Custom Dataset
-
-If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
-
-```json
-{"prompt": "What is the capital of India?"}
-{"prompt": "What is the capital of Iran?"}
-{"prompt": "What is the capital of China?"}
-```
-
-```bash
-# start server
-VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct
-```
-
-```bash
-# run benchmarking script
-vllm bench serve --port 9001 --save-result --save-detailed \
- --backend vllm \
- --model meta-llama/Llama-3.1-8B-Instruct \
- --endpoint /v1/completions \
- --dataset-name custom \
- --dataset-path \
- --custom-skip-chat-template \
- --num-prompts 80 \
- --max-concurrency 1 \
- --temperature=0.3 \
- --top-p=0.75 \
- --result-dir "./log/"
-```
-
-You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
-
-### VisionArena Benchmark for Vision Language Models
-
-```bash
-# need a model with vision capability here
-vllm serve Qwen/Qwen2-VL-7B-Instruct
-```
-
-```bash
-vllm bench serve \
- --backend openai-chat \
- --endpoint-type openai-chat \
- --model Qwen/Qwen2-VL-7B-Instruct \
- --endpoint /v1/chat/completions \
- --dataset-name hf \
- --dataset-path lmarena-ai/VisionArena-Chat \
- --hf-split train \
- --num-prompts 1000
-```
-
-### InstructCoder Benchmark with Speculative Decoding
-
-``` bash
-VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
- --speculative-config $'{"method": "ngram",
- "num_speculative_tokens": 5, "prompt_lookup_max": 5,
- "prompt_lookup_min": 2}'
-```
-
-``` bash
-vllm bench serve \
- --model meta-llama/Meta-Llama-3-8B-Instruct \
- --dataset-name hf \
- --dataset-path likaixin/InstructCoder \
- --num-prompts 2048
-```
-
-### Other HuggingFaceDataset Examples
-
-```bash
-vllm serve Qwen/Qwen2-VL-7B-Instruct
-```
-
-`lmms-lab/LLaVA-OneVision-Data`:
-
-```bash
-vllm bench serve \
- --backend openai-chat \
- --endpoint-type openai-chat \
- --model Qwen/Qwen2-VL-7B-Instruct \
- --endpoint /v1/chat/completions \
- --dataset-name hf \
- --dataset-path lmms-lab/LLaVA-OneVision-Data \
- --hf-split train \
- --hf-subset "chart2text(cauldron)" \
- --num-prompts 10
-```
-
-`Aeala/ShareGPT_Vicuna_unfiltered`:
-
-```bash
-vllm bench serve \
- --backend openai-chat \
- --endpoint-type openai-chat \
- --model Qwen/Qwen2-VL-7B-Instruct \
- --endpoint /v1/chat/completions \
- --dataset-name hf \
- --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
- --hf-split train \
- --num-prompts 10
-```
-
-`AI-MO/aimo-validation-aime`:
-
-``` bash
-vllm bench serve \
- --model Qwen/QwQ-32B \
- --dataset-name hf \
- --dataset-path AI-MO/aimo-validation-aime \
- --num-prompts 10 \
- --seed 42
-```
-
-`philschmid/mt-bench`:
-
-``` bash
-vllm bench serve \
- --model Qwen/QwQ-32B \
- --dataset-name hf \
- --dataset-path philschmid/mt-bench \
- --num-prompts 80
-```
-
-### Running With Sampling Parameters
-
-When using OpenAI-compatible backends such as `vllm`, optional sampling
-parameters can be specified. Example client command:
-
-```bash
-vllm bench serve \
- --backend vllm \
- --model NousResearch/Hermes-3-Llama-3.1-8B \
- --endpoint /v1/completions \
- --dataset-name sharegpt \
- --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json \
- --top-k 10 \
- --top-p 0.9 \
- --temperature 0.5 \
- --num-prompts 10
-```
-
-### Running With Ramp-Up Request Rate
-
-The benchmark tool also supports ramping up the request rate over the
-duration of the benchmark run. This can be useful for stress testing the
-server or finding the maximum throughput that it can handle, given some latency budget.
-
-Two ramp-up strategies are supported:
-
-- `linear`: Increases the request rate linearly from a start value to an end value.
-- `exponential`: Increases the request rate exponentially.
-
-The following arguments can be used to control the ramp-up:
-
-- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
-- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
-- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
-
-
-
-## 📈 Example - Offline Throughput Benchmark
-
-
-
-
-
-```bash
-vllm bench throughput \
- --model NousResearch/Hermes-3-Llama-3.1-8B \
- --dataset-name sonnet \
- --dataset-path vllm/benchmarks/sonnet.txt \
- --num-prompts 10
-```
-
-If successful, you will see the following output
-
-```text
-Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
-Total num prompt tokens: 5014
-Total num output tokens: 1500
-```
-
-### VisionArena Benchmark for Vision Language Models
-
-```bash
-vllm bench throughput \
- --model Qwen/Qwen2-VL-7B-Instruct \
- --backend vllm-chat \
- --dataset-name hf \
- --dataset-path lmarena-ai/VisionArena-Chat \
- --num-prompts 1000 \
- --hf-split train
-```
-
-The `num prompt tokens` now includes image token counts
-
-```text
-Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
-Total num prompt tokens: 14527
-Total num output tokens: 1280
-```
-
-### InstructCoder Benchmark with Speculative Decoding
-
-``` bash
-VLLM_WORKER_MULTIPROC_METHOD=spawn \
-VLLM_USE_V1=1 \
-vllm bench throughput \
- --dataset-name=hf \
- --dataset-path=likaixin/InstructCoder \
- --model=meta-llama/Meta-Llama-3-8B-Instruct \
- --input-len=1000 \
- --output-len=100 \
- --num-prompts=2048 \
- --async-engine \
- --speculative-config $'{"method": "ngram",
- "num_speculative_tokens": 5, "prompt_lookup_max": 5,
- "prompt_lookup_min": 2}'
-```
-
-```text
-Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
-Total num prompt tokens: 261136
-Total num output tokens: 204800
-```
-
-### Other HuggingFaceDataset Examples
-
-`lmms-lab/LLaVA-OneVision-Data`:
-
-```bash
-vllm bench throughput \
- --model Qwen/Qwen2-VL-7B-Instruct \
- --backend vllm-chat \
- --dataset-name hf \
- --dataset-path lmms-lab/LLaVA-OneVision-Data \
- --hf-split train \
- --hf-subset "chart2text(cauldron)" \
- --num-prompts 10
-```
-
-`Aeala/ShareGPT_Vicuna_unfiltered`:
-
-```bash
-vllm bench throughput \
- --model Qwen/Qwen2-VL-7B-Instruct \
- --backend vllm-chat \
- --dataset-name hf \
- --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
- --hf-split train \
- --num-prompts 10
-```
-
-`AI-MO/aimo-validation-aime`:
-
-```bash
-vllm bench throughput \
- --model Qwen/QwQ-32B \
- --backend vllm \
- --dataset-name hf \
- --dataset-path AI-MO/aimo-validation-aime \
- --hf-split train \
- --num-prompts 10
-```
-
-Benchmark with LoRA adapters:
-
-``` bash
-# download dataset
-# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-vllm bench throughput \
- --model meta-llama/Llama-2-7b-hf \
- --backend vllm \
- --dataset_path /ShareGPT_V3_unfiltered_cleaned_split.json \
- --dataset_name sharegpt \
- --num-prompts 10 \
- --max-loras 2 \
- --max-lora-rank 8 \
- --enable-lora \
- --lora-path yard1/llama-2-7b-sql-lora-test
- ```
-
-
-
-## 🛠️ Example - Structured Output Benchmark
-
-
-
-
-
-Benchmark the performance of structured output generation (JSON, grammar, regex).
-
-### Server Setup
-
-```bash
-vllm serve NousResearch/Hermes-3-Llama-3.1-8B
-```
-
-### JSON Schema Benchmark
-
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
- --backend vllm \
- --model NousResearch/Hermes-3-Llama-3.1-8B \
- --dataset json \
- --structured-output-ratio 1.0 \
- --request-rate 10 \
- --num-prompts 1000
-```
-
-### Grammar-based Generation Benchmark
-
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
- --backend vllm \
- --model NousResearch/Hermes-3-Llama-3.1-8B \
- --dataset grammar \
- --structure-type grammar \
- --request-rate 10 \
- --num-prompts 1000
-```
-
-### Regex-based Generation Benchmark
-
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
- --backend vllm \
- --model NousResearch/Hermes-3-Llama-3.1-8B \
- --dataset regex \
- --request-rate 10 \
- --num-prompts 1000
-```
-
-### Choice-based Generation Benchmark
-
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
- --backend vllm \
- --model NousResearch/Hermes-3-Llama-3.1-8B \
- --dataset choice \
- --request-rate 10 \
- --num-prompts 1000
-```
-
-### XGrammar Benchmark Dataset
-
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
- --backend vllm \
- --model NousResearch/Hermes-3-Llama-3.1-8B \
- --dataset xgrammar_bench \
- --request-rate 10 \
- --num-prompts 1000
-```
-
-
-
-## 📚 Example - Long Document QA Benchmark
-
-
-
-
-
-Benchmark the performance of long document question-answering with prefix caching.
-
-### Basic Long Document QA Test
-
-```bash
-python3 benchmarks/benchmark_long_document_qa_throughput.py \
- --model meta-llama/Llama-2-7b-chat-hf \
- --enable-prefix-caching \
- --num-documents 16 \
- --document-length 2000 \
- --output-len 50 \
- --repeat-count 5
-```
-
-### Different Repeat Modes
-
-```bash
-# Random mode (default) - shuffle prompts randomly
-python3 benchmarks/benchmark_long_document_qa_throughput.py \
- --model meta-llama/Llama-2-7b-chat-hf \
- --enable-prefix-caching \
- --num-documents 8 \
- --document-length 3000 \
- --repeat-count 3 \
- --repeat-mode random
-
-# Tile mode - repeat entire prompt list in sequence
-python3 benchmarks/benchmark_long_document_qa_throughput.py \
- --model meta-llama/Llama-2-7b-chat-hf \
- --enable-prefix-caching \
- --num-documents 8 \
- --document-length 3000 \
- --repeat-count 3 \
- --repeat-mode tile
-
-# Interleave mode - repeat each prompt consecutively
-python3 benchmarks/benchmark_long_document_qa_throughput.py \
- --model meta-llama/Llama-2-7b-chat-hf \
- --enable-prefix-caching \
- --num-documents 8 \
- --document-length 3000 \
- --repeat-count 3 \
- --repeat-mode interleave
-```
-
-
-
-## 🗂️ Example - Prefix Caching Benchmark
-
-
-
-
-
-Benchmark the efficiency of automatic prefix caching.
-
-### Fixed Prompt with Prefix Caching
-
-```bash
-python3 benchmarks/benchmark_prefix_caching.py \
- --model meta-llama/Llama-2-7b-chat-hf \
- --enable-prefix-caching \
- --num-prompts 1 \
- --repeat-count 100 \
- --input-length-range 128:256
-```
-
-### ShareGPT Dataset with Prefix Caching
-
-```bash
-# download dataset
-# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-
-python3 benchmarks/benchmark_prefix_caching.py \
- --model meta-llama/Llama-2-7b-chat-hf \
- --dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
- --enable-prefix-caching \
- --num-prompts 20 \
- --repeat-count 5 \
- --input-length-range 128:256
-```
-
-### Prefix Repetition Dataset
-
-```bash
-vllm bench serve \
- --backend openai \
- --model meta-llama/Llama-2-7b-chat-hf \
- --dataset-name prefix_repetition \
- --num-prompts 100 \
- --prefix-repetition-prefix-len 512 \
- --prefix-repetition-suffix-len 128 \
- --prefix-repetition-num-prefixes 5 \
- --prefix-repetition-output-len 128
-```
-
-
-
-## ⚡ Example - Request Prioritization Benchmark
-
-
-
-
-
-Benchmark the performance of request prioritization in vLLM.
-
-### Basic Prioritization Test
-
-```bash
-python3 benchmarks/benchmark_prioritization.py \
- --model meta-llama/Llama-2-7b-chat-hf \
- --input-len 128 \
- --output-len 64 \
- --num-prompts 100 \
- --scheduling-policy priority
-```
-
-### Multiple Sequences per Prompt
-
-```bash
-python3 benchmarks/benchmark_prioritization.py \
- --model meta-llama/Llama-2-7b-chat-hf \
- --input-len 128 \
- --output-len 64 \
- --num-prompts 100 \
- --scheduling-policy priority \
- --n 2
-```
-
-
-
-## 👁️ Example - Multi-Modal Benchmark
-
-
-
-
-
-Benchmark the performance of multi-modal requests in vLLM.
-
-### Images (ShareGPT4V)
-
-Start vLLM:
-
-```bash
-python -m vllm.entrypoints.openai.api_server \
- --model Qwen/Qwen2.5-VL-7B-Instruct \
- --dtype bfloat16 \
- --limit-mm-per-prompt '{"image": 1}' \
- --allowed-local-media-path /path/to/sharegpt4v/images
-```
-
-Send requests with images:
-
-```bash
-python benchmarks/benchmark_serving.py \
- --backend openai-chat \
- --model Qwen/Qwen2.5-VL-7B-Instruct \
- --dataset-name sharegpt \
- --dataset-path /path/to/ShareGPT4V/sharegpt4v_instruct_gpt4-vision_cap100k.json \
- --num-prompts 100 \
- --save-result \
- --result-dir ~/vllm_benchmark_results \
- --save-detailed \
- --endpoint /v1/chat/completion
-```
-
-### Videos (ShareGPT4Video)
-
-Start vLLM:
-
-```bash
-python -m vllm.entrypoints.openai.api_server \
- --model Qwen/Qwen2.5-VL-7B-Instruct \
- --dtype bfloat16 \
- --limit-mm-per-prompt '{"video": 1}' \
- --allowed-local-media-path /path/to/sharegpt4video/videos
-```
-
-Send requests with videos:
-
-```bash
-python benchmarks/benchmark_serving.py \
- --backend openai-chat \
- --model Qwen/Qwen2.5-VL-7B-Instruct \
- --dataset-name sharegpt \
- --dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \
- --num-prompts 100 \
- --save-result \
- --result-dir ~/vllm_benchmark_results \
- --save-detailed \
- --endpoint /v1/chat/completion
-```
-
-
+-
+-
+-
diff --git a/benchmarks/auto_tune/README.md b/benchmarks/auto_tune/README.md
index 9aad51df6e003..d1bdb4c43f10b 100644
--- a/benchmarks/auto_tune/README.md
+++ b/benchmarks/auto_tune/README.md
@@ -31,6 +31,12 @@ cd vllm
You must set the following variables at the top of the script before execution.
+ Note: You can also override the default values below via environment variables when running the script.
+
+```bash
+MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LEN=128 OUTPUT_LEN=2048 MAX_MODEL_LEN=2300 MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 NUM_SEQS_LIST="128 256" NUM_BATCHED_TOKENS_LIST="1024 2048 4096" VLLM_LOGGING_LEVEL=DEBUG bash auto_tune.sh
+```
+
| Variable | Description | Example Value |
| --- | --- | --- |
| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
@@ -143,3 +149,70 @@ The script follows a systematic process to find the optimal parameters:
4. **Track Best Result**: Throughout the process, the script tracks the parameter combination that has yielded the highest valid throughput so far.
+5. **Profile Collection**: For the best-performing run, the script saves the vLLM profiler output, which can be used for deep-dive performance analysis with tools like TensorBoard (see the sketch below).
+
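+The profiler traces are written to the run's profile folder (`$BASE/auto-benchmark/<TAG>/profile`, where `<TAG>` is the run timestamp). A minimal sketch for opening them, assuming the `torch-tb-profiler` TensorBoard plugin is installed:
+
+```bash
+pip install torch-tb-profiler   # assumption: plugin not installed yet
+tensorboard --logdir "$BASE/auto-benchmark/<TAG>/profile"
+```
+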
+## Batched `auto_tune`
+
+The `batch_auto_tune.sh` script allows you to run multiple `auto_tune.sh` experiments sequentially from a single configuration file. It iterates through a list of parameter sets, executes `auto_tune.sh` for each, and records the results back into the input file.
+
+### Prerequisites
+
+- **jq**: This script requires `jq` to parse the JSON configuration file.
+- **gcloud**: If you plan to upload results to Google Cloud Storage, the `gcloud` CLI must be installed and authenticated.
+
+### How to Run
+
+1. **Create a JSON configuration file**: Create a file (e.g., `runs_config.json`) containing an array of JSON objects. Each object defines the parameters for a single `auto_tune.sh` run.
+
+2. **Execute the script**:
+
+ ```bash
+   bash batch_auto_tune.sh <config_file> [gcs_upload_path]
+ ```
+
+    - `<config_file>`: **Required.** Path to your JSON configuration file.
+ - `[gcs_upload_path]`: **Optional.** A GCS path (e.g., `gs://my-bucket/benchmark-results`) where the detailed results and profiles for each run will be uploaded. If this is empty, the results will be available on the local filesystem (see the log for `RESULT_FILE=/path/to/results/file.txt`).
+
+### Configuration File
+
+The JSON configuration file should contain an array of objects. Each object's keys correspond to the configuration variables for `auto_tune.sh` (see the [Configuration table above](#configuration)). These keys will be converted to uppercase environment variables for each run.
+
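+For example, a minimal sketch of that conversion (the script itself uses `jq` together with `tr`):
+
+```bash
+echo '{"max_latency_allowed_ms": 500, "model": "meta-llama/Llama-3.1-8B-Instruct"}' |
+  jq -r 'to_entries[] | "\(.key | ascii_upcase)=\(.value)"'
+# MAX_LATENCY_ALLOWED_MS=500
+# MODEL=meta-llama/Llama-3.1-8B-Instruct
+```
+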
+Here is an example `runs_config.json` with two benchmark configurations (the `system` field can be `TPU` or `GPU`):
+
+```json
+[
+ {
+ "base": "/home/user",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "system": "TPU",
+ "tp": 8,
+ "input_len": 128,
+ "output_len": 2048,
+ "max_model_len": 2300,
+ "num_seqs_list": "128 256",
+ "num_batched_tokens_list": "8192 16384"
+ },
+ {
+ "base": "/home/user",
+ "model": "meta-llama/Llama-3.1-70B-Instruct",
+    "system": "TPU",
+ "tp": 8,
+ "input_len": 4000,
+ "output_len": 16,
+ "max_model_len": 4096,
+ "num_seqs_list": "64 128",
+ "num_batched_tokens_list": "4096 8192",
+ "max_latency_allowed_ms": 500
+ }
+]
+```
+
+### Output
+
+The script modifies the input JSON file in place, adding the results of each run to the corresponding object. The following fields are added:
+
+- `run_id`: A unique identifier for the run, derived from the timestamp.
+- `status`: The outcome of the run (`SUCCESS`, `FAILURE`, or `WARNING_NO_RESULT_FILE`).
+- `results`: The content of the `result.txt` file from the `auto_tune.sh` run.
+- `gcs_results`: The GCS URL where the run's artifacts are stored (if a GCS path was provided).
+
+A summary of successful and failed runs is also printed to the console upon completion.
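+
+Since the updated file remains plain JSON, `jq` can give a quick overview of the recorded runs, for example:
+
+```bash
+jq -r '.[] | "\(.run_id)\t\(.status)"' runs_config.json
+```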
diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh
index 82c20ffa6554c..56b721cbb4021 100644
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@@ -5,25 +5,41 @@
TAG=$(date +"%Y_%m_%d_%H_%M")
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-BASE="$SCRIPT_DIR/../../.."
-MODEL="meta-llama/Llama-3.1-8B-Instruct"
-SYSTEM="TPU"
-TP=1
-DOWNLOAD_DIR=""
-INPUT_LEN=4000
-OUTPUT_LEN=16
-MAX_MODEL_LEN=4096
-MIN_CACHE_HIT_PCT=0
-MAX_LATENCY_ALLOWED_MS=100000000000
-NUM_SEQS_LIST="128 256"
-NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
+VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO}
+BASE=${BASE:-"$SCRIPT_DIR/../../.."}
+MODEL=${MODEL:-"meta-llama/Llama-3.1-8B-Instruct"}
+SYSTEM=${SYSTEM:-"TPU"}
+TP=${TP:-1}
+DOWNLOAD_DIR=${DOWNLOAD_DIR:-""}
+INPUT_LEN=${INPUT_LEN:-4000}
+OUTPUT_LEN=${OUTPUT_LEN:-16}
+MAX_MODEL_LEN=${MAX_MODEL_LEN:-4096}
+MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
+MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
+NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
+NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"
PROFILE_PATH="$LOG_FOLDER/profile"
-echo "result file: $RESULT"
-echo "model: $MODEL"
+echo "====================== AUTO TUNE PARAMETERS ===================="
+echo "SCRIPT_DIR=$SCRIPT_DIR"
+echo "BASE=$BASE"
+echo "MODEL=$MODEL"
+echo "SYSTEM=$SYSTEM"
+echo "TP=$TP"
+echo "DOWNLOAD_DIR=$DOWNLOAD_DIR"
+echo "INPUT_LEN=$INPUT_LEN"
+echo "OUTPUT_LEN=$OUTPUT_LEN"
+echo "MAX_MODEL_LEN=$MAX_MODEL_LEN"
+echo "MIN_CACHE_HIT_PCT=$MIN_CACHE_HIT_PCT"
+echo "MAX_LATENCY_ALLOWED_MS=$MAX_LATENCY_ALLOWED_MS"
+echo "NUM_SEQS_LIST=$NUM_SEQS_LIST"
+echo "NUM_BATCHED_TOKENS_LIST=$NUM_BATCHED_TOKENS_LIST"
+echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL"
+echo "RESULT_FILE=$RESULT"
+echo "====================== AUTO TUNEPARAMETERS ===================="
rm -rf $LOG_FOLDER
rm -rf $PROFILE_PATH
@@ -58,7 +74,7 @@ start_server() {
local vllm_log=$4
local profile_dir=$5
- pkill -if vllm
+ pkill -if "vllm serve" || true
# Define the common arguments as a bash array.
# Each argument and its value are separate elements.
@@ -80,17 +96,22 @@ start_server() {
# This correctly passes each element as a separate argument.
if [[ -n "$profile_dir" ]]; then
# Start server with profiling enabled
- VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
+ VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
else
# Start server without profiling
- VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
+ VLLM_SERVER_DEV_MODE=1 \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
fi
+ local server_pid=$!
# wait for 10 minutes...
server_started=0
for i in {1..60}; do
+    # Check whether the server process is still alive; since it is our own
+    # child process, we always have permission to send it a signal.
+ kill -0 $server_pid 2> /dev/null || break
+
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
if [[ "$STATUS_CODE" -eq 200 ]]; then
@@ -102,7 +123,7 @@ start_server() {
done
if (( ! server_started )); then
- echo "server did not start within 10 minutes. Please check server log at $vllm_log".
+ echo "server did not start within 10 minutes or crashed. Please check server log at $vllm_log".
return 1
else
return 0
@@ -118,7 +139,7 @@ run_benchmark() {
echo "vllm_log: $vllm_log"
echo
rm -f $vllm_log
- pkill -if vllm
+ pkill -if "vllm serve" || true
echo "starting server..."
# Call start_server without a profile_dir to avoid profiling overhead
@@ -211,9 +232,9 @@ run_benchmark() {
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
- pkill -if vllm
+ pkill -if "vllm serve" || true
sleep 10
- printf '=%.0s' $(seq 1 20)
+ echo "===================="
return 0
}
@@ -287,6 +308,6 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
else
echo "No configuration met the latency requirements. Skipping final profiling run."
fi
-pkill -if vllm
+pkill -if "vllm serve" || true
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
diff --git a/benchmarks/auto_tune/batch_auto_tune.sh b/benchmarks/auto_tune/batch_auto_tune.sh
new file mode 100755
index 0000000000000..57ef20daf6b71
--- /dev/null
+++ b/benchmarks/auto_tune/batch_auto_tune.sh
@@ -0,0 +1,128 @@
+#!/bin/bash
+
+INPUT_JSON="$1"
+GCS_PATH="$2" # Optional GCS path for uploading results for each run
+
+SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
+AUTOTUNE_SCRIPT="$SCRIPT_DIR/auto_tune.sh"
+
+if [[ -z "$INPUT_JSON" ]]; then
+ echo "Error: Input JSON file not provided."
+ echo "Usage: $0 [gcs_upload_path]"
+ exit 1
+fi
+
+if [[ ! -f "$INPUT_JSON" ]]; then
+ echo "Error: File not found at '$INPUT_JSON'"
+ exit 1
+fi
+
+if ! command -v jq &> /dev/null; then
+ echo "Error: 'jq' command not found. Please install jq to process the JSON input."
+ exit 1
+fi
+
+if [[ -n "$GCS_PATH" ]] && ! command -v gcloud &> /dev/null; then
+ echo "Error: 'gcloud' command not found, but a GCS_PATH was provided."
+ exit 1
+fi
+
+SUCCESS_COUNT=0
+FAILURE_COUNT=0
+FAILED_RUNS=()
+SCRIPT_START_TIME=$(date +%s)
+
+json_content=$(cat "$INPUT_JSON")
+if ! num_runs=$(echo "$json_content" | jq 'length'); then
+ echo "Error: Invalid JSON in $INPUT_JSON. 'jq' failed to get array length." >&2
+ exit 1
+fi
+
+echo "Found $num_runs benchmark configurations in $INPUT_JSON."
+echo "Starting benchmark runs..."
+echo "--------------------------------------------------"
+
+for i in $(seq 0 $(($num_runs - 1))); do
+ run_object=$(echo "$json_content" | jq ".[$i]")
+
+ RUN_START_TIME=$(date +%s)
+ ENV_VARS_ARRAY=()
+ # Dynamically create env vars from the JSON object's keys
+ for key in $(echo "$run_object" | jq -r 'keys_unsorted[]'); do
+ value=$(echo "$run_object" | jq -r ".$key")
+ var_name=$(echo "$key" | tr '[:lower:]' '[:upper:]' | tr -cd 'A-Z0-9_')
+ ENV_VARS_ARRAY+=("${var_name}=${value}")
+ done
+
+ echo "Executing run #$((i+1))/$num_runs with parameters: ${ENV_VARS_ARRAY[*]}"
+
+ # Execute auto_tune.sh and capture output
+ RUN_OUTPUT_FILE=$(mktemp)
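+  # Stream the run's output to the console while also capturing it in the temp
+  # file, so the RESULT_FILE= line can be grepped out afterwards.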
+ if env "${ENV_VARS_ARRAY[@]}" bash "$AUTOTUNE_SCRIPT" > >(tee -a "$RUN_OUTPUT_FILE") 2>&1; then
+ STATUS="SUCCESS"
+ ((SUCCESS_COUNT++))
+ else
+ STATUS="FAILURE"
+ ((FAILURE_COUNT++))
+ FAILED_RUNS+=("Run #$((i+1)): $(echo $run_object | jq -c .)")
+ fi
+
+ RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE")
+ rm "$RUN_OUTPUT_FILE"
+
+ # Parse results and optionally upload them to GCS
+ RUN_ID=""
+ RESULTS=""
+ GCS_RESULTS_URL=""
+ if [[ "$STATUS" == "SUCCESS" ]]; then
+ RESULT_FILE_PATH=$(echo "$RUN_OUTPUT" | grep 'RESULT_FILE=' | tail -n 1 | cut -d'=' -f2 | tr -s '/' || true)
+
+ if [[ -n "$RESULT_FILE_PATH" && -f "$RESULT_FILE_PATH" ]]; then
+ RUN_ID=$(basename "$(dirname "$RESULT_FILE_PATH")")
+ RESULT_DIR=$(dirname "$RESULT_FILE_PATH")
+ RESULTS=$(cat "$RESULT_FILE_PATH")
+
+ if [[ -n "$GCS_PATH" ]]; then
+ GCS_RESULTS_URL="${GCS_PATH}/${RUN_ID}"
+ echo "Uploading results to GCS..."
+ if gcloud storage rsync --recursive "$RESULT_DIR/" "$GCS_RESULTS_URL"; then
+ echo "GCS upload successful."
+ else
+ echo "Warning: GCS upload failed for RUN_ID $RUN_ID."
+ fi
+ fi
+ else
+ echo "Warning: Could not find result file for a successful run."
+ STATUS="WARNING_NO_RESULT_FILE"
+ fi
+ fi
+
+ # Add the results back into the JSON object for this run
+ json_content=$(echo "$json_content" | jq --argjson i "$i" --arg run_id "$RUN_ID" --arg status "$STATUS" --arg results "$RESULTS" --arg gcs_results "$GCS_RESULTS_URL" \
+ '.[$i] += {run_id: $run_id, status: $status, results: $results, gcs_results: $gcs_results}')
+
+ RUN_END_TIME=$(date +%s)
+ echo "Run finished in $((RUN_END_TIME - RUN_START_TIME)) seconds. Status: $STATUS"
+ echo "--------------------------------------------------"
+
+ # Save intermediate progress back to the file
+ echo "$json_content" > "$INPUT_JSON.tmp" && mv "$INPUT_JSON.tmp" "$INPUT_JSON"
+
+done
+
+SCRIPT_END_TIME=$(date +%s)
+echo "All benchmark runs completed in $((SCRIPT_END_TIME - SCRIPT_START_TIME)) seconds."
+echo
+echo "====================== SUMMARY ======================"
+echo "Successful runs: $SUCCESS_COUNT"
+echo "Failed runs: $FAILURE_COUNT"
+echo "==================================================="
+
+if [[ $FAILURE_COUNT -gt 0 ]]; then
+ echo "Details of failed runs (see JSON file for full parameters):"
+ for failed in "${FAILED_RUNS[@]}"; do
+ echo " - $failed"
+ done
+fi
+
+echo "Updated results have been saved to '$INPUT_JSON'."
diff --git a/benchmarks/benchmark_block_pool.py b/benchmarks/benchmark_block_pool.py
index fd363c2ad0514..5434f8b6a4e44 100644
--- a/benchmarks/benchmark_block_pool.py
+++ b/benchmarks/benchmark_block_pool.py
@@ -2,9 +2,9 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
+from benchmark_utils import TimeCollector
from tabulate import tabulate
-from benchmark_utils import TimeCollector
from vllm.utils import FlexibleArgumentParser
from vllm.v1.core.block_pool import BlockPool
@@ -57,7 +57,7 @@ def invoke_main() -> None:
"--num-iteration",
type=int,
default=1000,
- help="Number of iterations to run to stablize final data readings",
+ help="Number of iterations to run to stabilize final data readings",
)
parser.add_argument(
"--allocate-blocks",
diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
deleted file mode 100644
index 2ea4f9ccaff2b..0000000000000
--- a/benchmarks/benchmark_dataset.py
+++ /dev/null
@@ -1,1288 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-This module defines a framework for sampling benchmark requests from various
-datasets. Each dataset subclass of BenchmarkDataset must implement sample
-generation. Supported dataset types include:
- - ShareGPT
- - Random (synthetic)
- - Sonnet
- - BurstGPT
- - HuggingFace
- - VisionArena
-"""
-
-import base64
-import io
-import json
-import logging
-import random
-from abc import ABC, abstractmethod
-from collections.abc import Mapping
-from copy import deepcopy
-from dataclasses import dataclass
-from functools import cache
-from io import BytesIO
-from typing import Any, Callable, Optional, Union
-
-import numpy as np
-import pandas as pd
-from datasets import load_dataset
-from PIL import Image
-from transformers import PreTrainedTokenizerBase
-
-from vllm.lora.request import LoRARequest
-from vllm.lora.utils import get_adapter_absolute_path
-from vllm.multimodal import MultiModalDataDict
-from vllm.multimodal.image import convert_image_mode
-from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
-
-logger = logging.getLogger(__name__)
-
-# -----------------------------------------------------------------------------
-# Data Classes
-# -----------------------------------------------------------------------------
-
-
-@dataclass
-class SampleRequest:
- """
- Represents a single inference request for benchmarking.
- """
-
- prompt: Union[str, Any]
- prompt_len: int
- expected_output_len: int
- multi_modal_data: Optional[Union[MultiModalDataDict, dict, list[dict]]] = None
- lora_request: Optional[LoRARequest] = None
- request_id: Optional[str] = None
-
-
-# -----------------------------------------------------------------------------
-# Benchmark Dataset Base Class
-# -----------------------------------------------------------------------------
-
-
-class BenchmarkDataset(ABC):
- DEFAULT_SEED = 0
- IS_MULTIMODAL = False
-
- def __init__(
- self,
- dataset_path: Optional[str] = None,
- random_seed: int = DEFAULT_SEED,
- ) -> None:
- """
- Initialize the BenchmarkDataset with an optional dataset path and random
- seed. Args:
- dataset_path (Optional[str]): Path to the dataset. If None, it
- indicates that a default or random dataset might be used.
- random_seed (int): Seed value for reproducible shuffling or
- sampling. Defaults to DEFAULT_SEED.
- """
- self.dataset_path = dataset_path
- # Set the random seed, ensuring that a None value is replaced with the
- # default seed.
- self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED
- self.data = None
-
- def apply_multimodal_chat_transformation(
- self, prompt: str, mm_content: Optional[MultiModalDataDict] = None
- ) -> list[dict]:
- """
- Transform a prompt and optional multimodal content into a chat format.
- This method is used for chat models that expect a specific conversation
- format.
- """
- content = [{"text": prompt, "type": "text"}]
- if mm_content is not None:
- content.append(mm_content)
- return [{"role": "user", "content": content}]
-
- def load_data(self) -> None:
- """
- Load data from the dataset path into self.data.
-
- This method must be overridden by subclasses since the method to load
- data will vary depending on the dataset format and source.
-
- Raises:
- NotImplementedError: If a subclass does not implement this method.
- """
- # TODO (jenniferzhao): add support for downloading data
- raise NotImplementedError("load_data must be implemented in subclasses.")
-
- def get_random_lora_request(
- self,
- tokenizer: PreTrainedTokenizerBase,
- max_loras: Optional[int] = None,
- lora_path: Optional[str] = None,
- ) -> tuple[Optional[LoRARequest], AnyTokenizer]:
- """
- Optionally select a random LoRA request and return its associated
- tokenizer.
-
- This method is used when LoRA parameters are provided. It randomly
- selects a LoRA based on max_loras and retrieves a cached tokenizer for
- that LoRA if available. Otherwise, it returns the base tokenizer.
-
- Args:
- tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no
- LoRA is selected. max_loras (Optional[int]): The maximum number of
- LoRAs available. If None, LoRA is not used. lora_path
- (Optional[str]): Path to the LoRA parameters on disk. If None, LoRA
- is not used.
-
- Returns:
- tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first
- element is a LoRARequest (or None if not applicable) and the second
- element is the tokenizer associated with the LoRA request (or the
- base tokenizer).
- """
- if max_loras is None or lora_path is None:
- return None, tokenizer
-
- # Generate a random LoRA ID in the range [1, max_loras].
- lora_id = random.randint(1, max_loras)
- lora_request = LoRARequest(
- lora_name=str(lora_id),
- lora_int_id=lora_id,
- lora_path=lora_path_on_disk(lora_path),
- )
- if lora_id not in lora_tokenizer_cache:
- lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request)
- # Return lora_request and the cached tokenizer if available; otherwise,
- # return the base tokenizer
- return lora_request, lora_tokenizer_cache[lora_id] or tokenizer
-
- @abstractmethod
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- request_id_prefix: str = "",
- ) -> list[SampleRequest]:
- """
- Abstract method to generate sample requests from the dataset.
-
- Subclasses must override this method to implement dataset-specific logic
- for generating a list of SampleRequest objects.
-
- Args:
- tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
- for processing the dataset's text.
- num_requests (int): The number of sample requests to generate.
- request_id_prefix (str) The prefix of request_id.
-
- Returns:
- list[SampleRequest]: A list of sample requests generated from the
- dataset.
- """
- raise NotImplementedError("sample must be implemented in subclasses.")
-
- def maybe_oversample_requests(
- self,
- requests: list[SampleRequest],
- num_requests: int,
- request_id_prefix: str = "",
- ) -> None:
- """
- Oversamples the list of requests if its size is less than the desired
- number.
-
- Args:
- requests (List[SampleRequest]): The current list of sampled
- requests.
- num_requests (int): The target number of requests.
- request_id_prefix (str) The prefix of the request ids.
- """
- if len(requests) < num_requests:
- random.seed(self.random_seed)
- additional = deepcopy(
- random.choices(requests, k=num_requests - len(requests))
- )
- for i in range(len(additional)):
- req = additional[i]
- req.request_id = request_id_prefix + str(len(requests) + i)
- requests.extend(additional)
- logger.info("Oversampled requests to reach %d total samples.", num_requests)
-
-
-# -----------------------------------------------------------------------------
-# Utility Functions and Global Caches
-# -----------------------------------------------------------------------------
-
-
-def is_valid_sequence(
- prompt_len: int,
- output_len: int,
- min_len: int = 4,
- max_prompt_len: int = 1024,
- max_total_len: int = 2048,
- skip_min_output_len_check: bool = False,
-) -> bool:
- """
- Validate a sequence based on prompt and output lengths.
-
- Default pruning criteria are copied from the original `sample_hf_requests`
- and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as
- from `sample_requests` in benchmark_throughput.py.
- """
- # Check for invalid conditions
- prompt_too_short = prompt_len < min_len
- output_too_short = (not skip_min_output_len_check) and (output_len < min_len)
- prompt_too_long = prompt_len > max_prompt_len
- combined_too_long = (prompt_len + output_len) > max_total_len
-
- # Return True if none of the invalid conditions are met
- return not (
- prompt_too_short or output_too_short or prompt_too_long or combined_too_long
- )
-
-
-@cache
-def lora_path_on_disk(lora_path: str) -> str:
- return get_adapter_absolute_path(lora_path)
-
-
-# Global cache for LoRA tokenizers.
-lora_tokenizer_cache: dict[int, AnyTokenizer] = {}
-
-
-def process_image(image: Any) -> Mapping[str, Any]:
- """
- Process a single image input and return a multimedia content dictionary.
-
- Supports three input types:
-
- 1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
- containing raw image data. - Loads the bytes as a PIL.Image.Image.
-
- 2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as
- a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns
- a dictionary with the image as a base64 data URL.
-
- 3. String input: - Treats the string as a URL or local file path. -
- Prepends "file://" if the string doesn't start with "http://" or
- "file://". - Returns a dictionary with the image URL.
-
- Raises:
- ValueError: If the input is not a supported type.
- """
- if isinstance(image, dict) and "bytes" in image:
- image = Image.open(BytesIO(image["bytes"]))
- if isinstance(image, Image.Image):
- image = convert_image_mode(image, "RGB")
- with io.BytesIO() as image_data:
- image.save(image_data, format="JPEG")
- image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
- return {
- "type": "image_url",
- "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
- }
-
- if isinstance(image, str):
- image_url = (
- image if image.startswith(("http://", "file://")) else f"file://{image}"
- )
- return {"type": "image_url", "image_url": {"url": image_url}}
-
- raise ValueError(
- f"Invalid image input {image}. Must be a PIL.Image.Image"
- " or str or dictionary with raw image bytes."
- )
-
-
-def process_video(video: Any) -> Mapping[str, Any]:
- """
- Process a single video input and return a multimedia content dictionary.
-
- Supports the following input types:
-
-    1. Dictionary with raw video bytes:
-       - Expects a dict with a 'bytes' key containing raw video data.
-       - Returns a dictionary with the video as a base64 data URL.
-
-    2. String input:
-       - Treats the string as a URL or local file path.
-       - Prepends "file://" if the string doesn't start with "http://"
-         or "file://".
-       - Returns a dictionary with the video URL.
-
- Raises:
- ValueError: If the input is not a supported type.
- """
- if isinstance(video, dict) and "bytes" in video:
- video_bytes = video["bytes"]
- video_base64 = base64.b64encode(video_bytes).decode("utf-8")
- return {
- "type": "video_url",
- "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
- }
-
- if isinstance(video, str):
- video_url = (
- video if video.startswith(("http://", "file://")) else f"file://{video}"
- )
- return {"type": "video_url", "video_url": {"url": video_url}}
-
- raise ValueError(
- f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`." # noqa: E501
- )
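The same pattern for videos; the path below is hypothetical and is never opened, and the byte string is only a stand-in:

```python
from benchmark_dataset import process_video  # assumed module name

# Bare paths get a file:// prefix, exactly like process_image().
content = process_video("/data/clip.mp4")  # hypothetical path
assert content["video_url"]["url"] == "file:///data/clip.mp4"

# Raw bytes are embedded as a base64 data URL.
content = process_video({"bytes": b"\x00\x00\x00\x18ftypmp42"})  # stand-in bytes
assert content["video_url"]["url"].startswith("data:video/mp4;base64,")
```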
-
-
-# -----------------------------------------------------------------------------
-# Random Dataset Implementation (Synthetic Data)
-# -----------------------------------------------------------------------------
-
-
-class RandomDataset(BenchmarkDataset):
- # Default values copied from benchmark_serving.py for the random dataset.
- DEFAULT_PREFIX_LEN = 0
- DEFAULT_RANGE_RATIO = 0.0
- DEFAULT_INPUT_LEN = 1024
- DEFAULT_OUTPUT_LEN = 128
-
- def __init__(
- self,
- **kwargs,
- ) -> None:
- super().__init__(**kwargs)
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- prefix_len: int = DEFAULT_PREFIX_LEN,
- range_ratio: float = DEFAULT_RANGE_RATIO,
- input_len: int = DEFAULT_INPUT_LEN,
- output_len: int = DEFAULT_OUTPUT_LEN,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list[SampleRequest]:
- # Enforce range_ratio < 1
- assert range_ratio < 1.0, (
- "random_range_ratio must be < 1.0 to ensure a valid sampling range"
- )
-
- vocab_size = tokenizer.vocab_size
- num_special_tokens = tokenizer.num_special_tokens_to_add()
- real_input_len = input_len - num_special_tokens
-
- prefix_token_ids = (
- np.random.randint(0, vocab_size, size=prefix_len).tolist()
- if prefix_len > 0
- else []
- )
-
- # New sampling logic: [X * (1 - b), X * (1 + b)]
- input_low = int(real_input_len * (1 - range_ratio))
- input_high = int(real_input_len * (1 + range_ratio))
- output_low = int(output_len * (1 - range_ratio))
- # Ensure the lower bound for output length is at least 1 to prevent
- # sampling 0 tokens, which can cause request failures.
- output_low = max(output_low, 1)
- output_high = int(output_len * (1 + range_ratio))
-
- # Add logging for debugging
- logger.info("Sampling input_len from [%s, %s]", input_low, input_high)
- logger.info("Sampling output_len from [%s, %s]", output_low, output_high)
-
- input_lens = np.random.randint(input_low, input_high + 1, size=num_requests)
- output_lens = np.random.randint(output_low, output_high + 1, size=num_requests)
- offsets = np.random.randint(0, vocab_size, size=num_requests)
-
- requests = []
- for i in range(num_requests):
- inner_seq = (
- (offsets[i] + i + np.arange(input_lens[i])) % vocab_size
- ).tolist()
- token_sequence = prefix_token_ids + inner_seq
- prompt = tokenizer.decode(token_sequence)
-            # After decoding the prompt we have to encode and decode it again.
-            # This is done because in some cases N consecutive tokens
-            # give a string that is tokenized into a different number of tokens.
-            # For example, for GPT2Tokenizer:
-            # [6880, 6881] -> ['Ġcalls', 'here'] ->
-            # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
-            # To avoid an uncontrolled change of the prompt length,
-            # the encoded sequence is truncated before being decoded again.
- total_input_len = prefix_len + int(input_lens[i])
- re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
- :total_input_len
- ]
- prompt = tokenizer.decode(re_encoded_sequence)
- total_input_len = len(re_encoded_sequence)
- requests.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=total_input_len,
- expected_output_len=int(output_lens[i]),
- request_id=request_id_prefix + str(i),
- )
- )
-
- return requests
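The decode/re-encode/truncate round trip above exists because consecutive token ids do not always survive a decode-encode cycle; a sketch using an example checkpoint (`gpt2` here is only illustrative):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # example checkpoint

token_ids = [6880, 6881]                 # the example from the comment above
prompt = tok.decode(token_ids)
re_encoded = tok.encode(prompt, add_special_tokens=False)
print(len(token_ids), len(re_encoded))   # 2 vs. 3: the text re-tokenizes differently

# Truncating before decoding again, as sample() does, keeps the prompt length
# from drifting upwards.
prompt = tok.decode(re_encoded[: len(token_ids)])
```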
-
-
-# -----------------------------------------------------------------------------
-# ShareGPT Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class ShareGPTDataset(BenchmarkDataset):
- """
- Implements the ShareGPT dataset. Loads data from a JSON file and generates
- sample requests based on conversation turns.
- """
-
- def __init__(self, **kwargs) -> None:
- super().__init__(**kwargs)
- self.load_data()
-
- def load_data(self) -> None:
- if self.dataset_path is None:
- raise ValueError("dataset_path must be provided for loading data.")
-
- with open(self.dataset_path, encoding="utf-8") as f:
- self.data = json.load(f)
- # Filter entries with at least two conversation turns.
- self.data = [
- entry
- for entry in self.data
- if "conversations" in entry and len(entry["conversations"]) >= 2
- ]
- random.seed(self.random_seed)
- random.shuffle(self.data)
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- lora_path: Optional[str] = None,
- max_loras: Optional[int] = None,
- output_len: Optional[int] = None,
- enable_multimodal_chat: bool = False,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list:
- samples: list = []
- ind = 0
- for entry in self.data:
- if len(samples) >= num_requests:
- break
- prompt, completion = (
- entry["conversations"][0]["value"],
- entry["conversations"][1]["value"],
- )
-
- lora_request, tokenizer = self.get_random_lora_request(
- tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path
- )
- prompt_ids = tokenizer(prompt).input_ids
- completion_ids = tokenizer(completion).input_ids
- prompt_len = len(prompt_ids)
- new_output_len = len(completion_ids) if output_len is None else output_len
- if not is_valid_sequence(
- prompt_len,
- new_output_len,
- skip_min_output_len_check=output_len is not None,
- ):
- continue
- if image_path := entry.get("image"):
- mm_content = process_image(image_path)
- elif video_path := entry.get("video"):
- mm_content = process_video(video_path)
- else:
- mm_content = None
- if enable_multimodal_chat:
- prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
- samples.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=prompt_len,
- expected_output_len=new_output_len,
- lora_request=lora_request,
- multi_modal_data=mm_content,
- request_id=request_id_prefix + str(ind),
- )
- )
- ind += 1
- self.maybe_oversample_requests(samples, num_requests, request_id_prefix)
- return samples
-
-
-# -----------------------------------------------------------------------------
-# Custom Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class CustomDataset(BenchmarkDataset):
- """
-    Implements the Custom dataset. Loads data from a JSONL file in which each
-    line is a JSON object with a "prompt" field, and generates one sample
-    request per prompt. E.g.,
- ```
- {"prompt": "What is the capital of India?"}
- {"prompt": "What is the capital of Iran?"}
- {"prompt": "What is the capital of China?"}
- ```
- """
-
- def __init__(self, **kwargs) -> None:
- super().__init__(**kwargs)
- self.load_data()
-
- def load_data(self) -> None:
- if self.dataset_path is None:
- raise ValueError("dataset_path must be provided for loading data.")
-
-        # self.data will be a list of dictionaries,
-        # e.g., [{"prompt": "What is the capital of India?"}, ...]
-        # This is the standardized format into which load_data() must convert
-        # the file contents, whatever the filetype of dataset_path.
-        # sample() assumes self.data is already in this standardized format.
- self.data = []
-
- # Load the JSONL file
- if self.dataset_path.endswith(".jsonl"):
- jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True)
-
- # check if the JSONL file has a 'prompt' column
- if "prompt" not in jsonl_data.columns:
- raise ValueError("JSONL file must contain a 'prompt' column.")
-
- # Convert each row to a dictionary and append to self.data
- # This will convert the DataFrame to a list of dictionaries
- # where each dictionary corresponds to a row in the DataFrame.
- # This is the standardized format we want for self.data
- for _, row in jsonl_data.iterrows():
- self.data.append(row.to_dict())
- else:
- raise NotImplementedError(
- "Only JSONL format is supported for CustomDataset."
- )
-
- random.seed(self.random_seed)
- random.shuffle(self.data)
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- lora_path: Optional[str] = None,
- max_loras: Optional[int] = None,
- output_len: Optional[int] = None,
- enable_multimodal_chat: bool = False,
- skip_chat_template: bool = False,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list:
- sampled_requests = []
- for i, item in enumerate(self.data):
- if len(sampled_requests) >= num_requests:
- break
- prompt = item["prompt"]
-
- # apply template
- if not skip_chat_template:
- prompt = tokenizer.apply_chat_template(
- [{"role": "user", "content": prompt}],
- add_generation_prompt=True,
- tokenize=False,
- )
-
- prompt_len = len(tokenizer(prompt).input_ids)
- sampled_requests.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=prompt_len,
- expected_output_len=output_len,
- request_id=request_id_prefix + str(i),
- )
- )
- self.maybe_oversample_requests(
- sampled_requests, num_requests, request_id_prefix
- )
-
- return sampled_requests
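A usage sketch, assuming a local `prompts.jsonl` exists and that the base constructor accepts `dataset_path` as it does for the HuggingFace subclasses below; `gpt2` is only an example tokenizer (it has no chat template, hence `skip_chat_template=True`):

```python
from transformers import AutoTokenizer

from benchmark_dataset import CustomDataset  # assumed module name

tokenizer = AutoTokenizer.from_pretrained("gpt2")
dataset = CustomDataset(dataset_path="prompts.jsonl")  # one {"prompt": ...} per line
requests = dataset.sample(
    tokenizer=tokenizer,
    num_requests=8,
    output_len=64,
    skip_chat_template=True,
    request_id_prefix="custom-",
)
print(requests[0].prompt, requests[0].prompt_len, requests[0].request_id)
```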
-
-
-# -----------------------------------------------------------------------------
-# Sonnet Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class SonnetDataset(BenchmarkDataset):
- """
- Simplified implementation of the Sonnet dataset. Loads poem lines from a
- text file and generates sample requests. Default values here copied from
- `benchmark_serving.py` for the sonnet dataset.
- """
-
- DEFAULT_PREFIX_LEN = 200
- DEFAULT_INPUT_LEN = 550
- DEFAULT_OUTPUT_LEN = 150
-
- def __init__(
- self,
- **kwargs,
- ) -> None:
- super().__init__(**kwargs)
- self.load_data()
-
- def load_data(self) -> None:
- if not self.dataset_path:
- raise ValueError("dataset_path must be provided.")
- with open(self.dataset_path, encoding="utf-8") as f:
- self.data = f.readlines()
-
- def sample(
- self,
- tokenizer,
- num_requests: int,
- prefix_len: int = DEFAULT_PREFIX_LEN,
- input_len: int = DEFAULT_INPUT_LEN,
- output_len: int = DEFAULT_OUTPUT_LEN,
- return_prompt_formatted: bool = False,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list:
- # Calculate average token length for a poem line.
- tokenized_lines = [tokenizer(line).input_ids for line in self.data]
- avg_len = sum(len(tokens) for tokens in tokenized_lines) / len(tokenized_lines)
-
- # Build the base prompt.
- base_prompt = "Pick as many lines as you can from these poem lines:\n"
- base_msg = [{"role": "user", "content": base_prompt}]
- base_fmt = tokenizer.apply_chat_template(
- base_msg, add_generation_prompt=True, tokenize=False
- )
- base_offset = len(tokenizer(base_fmt).input_ids)
- if input_len <= base_offset:
- raise ValueError(
- f"'input_len' must be higher than the base prompt length "
- f"({base_offset})."
- )
-
- # Determine how many poem lines to use.
- num_input_lines = round((input_len - base_offset) / avg_len)
- num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0)
- prefix_lines = self.data[:num_prefix_lines]
-
- samples = []
- ind = 0
- while len(samples) < num_requests:
- extra_lines = random.choices(
- self.data, k=num_input_lines - num_prefix_lines
- )
- prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}"
- msg = [{"role": "user", "content": prompt}]
- prompt_formatted = tokenizer.apply_chat_template(
- msg, add_generation_prompt=True, tokenize=False
- )
- prompt_len = len(tokenizer(prompt_formatted).input_ids)
-
- if prompt_len <= input_len:
- samples.append(
- SampleRequest(
- prompt=prompt_formatted if return_prompt_formatted else prompt,
- prompt_len=prompt_len,
- expected_output_len=output_len,
- request_id=request_id_prefix + str(ind),
- )
- )
- ind += 1
- return samples
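A worked example of the line-count arithmetic in `sample()` above, with made-up numbers:

```python
# Suppose the chat-template preamble costs base_offset = 30 tokens and a poem
# line averages avg_len = 10 tokens.
input_len, prefix_len, base_offset, avg_len = 550, 200, 30, 10.0

num_input_lines = round((input_len - base_offset) / avg_len)            # 52
num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0)  # 17
print(num_input_lines, num_prefix_lines)
# 52 lines total per prompt, the first 17 of which form the shared prefix.
```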
-
-
-# -----------------------------------------------------------------------------
-# BurstGPT Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class BurstGPTDataset(BenchmarkDataset):
- """
-    Implements the BurstGPT dataset. Loads request/response token counts from
-    a CSV file and generates synthetic prompts matching those lengths. Only
-    rows with Model "GPT-4" and a positive response token count are used.
- """
-
- def __init__(self, **kwargs) -> None:
- super().__init__(**kwargs)
- self.load_data()
-
- def load_data(
- self,
- ):
- if self.dataset_path is None:
- raise ValueError("dataset_path must be provided for loading data.")
-
- df = pd.read_csv(self.dataset_path)
- # Filter to keep only GPT-4 rows.
- gpt4_df = df[df["Model"] == "GPT-4"]
- # Remove failed requests (where Response tokens is 0 or less).
- gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0]
- # Sample the desired number of rows.
- self.data = gpt4_df
-
- def _sample_loaded_data(self, num_requests: int) -> list:
- if num_requests <= len(self.data):
- data = self.data.sample(n=num_requests, random_state=self.random_seed)
- else:
- data = self.data.sample(
- n=num_requests,
- random_state=self.random_seed,
- replace=True,
- )
- # Convert the dataframe to a list of lists.
- return data.values.tolist()
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- max_loras: Optional[int] = None,
- lora_path: Optional[str] = None,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list[SampleRequest]:
- samples = []
- data = self._sample_loaded_data(num_requests=num_requests)
- for i in range(num_requests):
- input_len = int(data[i][2])
- output_len = int(data[i][3])
- lora_req, tokenizer = self.get_random_lora_request(
- tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path
- )
- vocab_size = tokenizer.vocab_size
- # Generate a synthetic prompt: a list of token IDs computed as (i +
- # j) modulo vocab_size.
- token_ids = [(i + j) % vocab_size for j in range(input_len)]
- prompt = tokenizer.decode(token_ids)
- samples.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=input_len,
- expected_output_len=output_len,
- lora_request=lora_req,
- request_id=request_id_prefix + str(i),
- )
- )
- return samples
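A sketch of the synthetic-prompt construction used above: token j of request i is simply (i + j) modulo the vocabulary size, so prompts are deterministic ramps of token ids:

```python
vocab_size = 32000  # illustrative value
input_len = 8

for i in range(2):
    token_ids = [(i + j) % vocab_size for j in range(input_len)]
    print(i, token_ids)
# 0 [0, 1, 2, 3, 4, 5, 6, 7]
# 1 [1, 2, 3, 4, 5, 6, 7, 8]
```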
-
-
-# -----------------------------------------------------------------------------
-# HuggingFace Dataset Base Implementation
-# -----------------------------------------------------------------------------
-class HuggingFaceDataset(BenchmarkDataset):
- """Base class for datasets hosted on HuggingFace."""
-
- SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set()
-
- def __init__(
- self,
- dataset_path: str,
- dataset_split: str,
- no_stream: bool = False,
- dataset_subset: Optional[str] = None,
- **kwargs,
- ) -> None:
- super().__init__(dataset_path=dataset_path, **kwargs)
-
- self.dataset_split = dataset_split
- self.dataset_subset = dataset_subset
- self.load_stream = not no_stream
- self.load_data()
-
- def load_data(self) -> None:
- """Load data from HuggingFace datasets."""
- self.data = load_dataset(
- self.dataset_path,
- name=self.dataset_subset,
- split=self.dataset_split,
- streaming=self.load_stream,
- )
- self.data = self.data.shuffle(seed=self.random_seed)
-
-
-# -----------------------------------------------------------------------------
-# Conversation Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class ConversationDataset(HuggingFaceDataset):
- """Dataset for conversation data with multimodal support."""
-
- SUPPORTED_DATASET_PATHS = {
- "lmms-lab/LLaVA-OneVision-Data",
- "Aeala/ShareGPT_Vicuna_unfiltered",
- }
- IS_MULTIMODAL = True
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- output_len: Optional[int] = None,
- enable_multimodal_chat: bool = False,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list:
- # Filter examples with at least 2 conversations
- filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2)
- sampled_requests = []
- dynamic_output = output_len is None
- ind = 0
-
- for item in filtered_data:
- if len(sampled_requests) >= num_requests:
- break
- conv = item["conversations"]
- prompt, completion = conv[0]["value"], conv[1]["value"]
-
- prompt_ids = tokenizer(prompt).input_ids
- completion_ids = tokenizer(completion).input_ids
- prompt_len = len(prompt_ids)
- completion_len = len(completion_ids)
- output_len = completion_len if dynamic_output else output_len
- assert isinstance(output_len, int) and output_len > 0
- if dynamic_output and not is_valid_sequence(prompt_len, completion_len):
- continue
- mm_content = process_image(item["image"]) if "image" in item else None
- if enable_multimodal_chat:
-                # Note: when chat is enabled, the request prompt_len is no
-                # longer accurate, and the request output is used to count
-                # the actual prompt and output lengths.
- prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
- sampled_requests.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=prompt_len,
- expected_output_len=output_len,
- multi_modal_data=mm_content,
- request_id=request_id_prefix + str(ind),
- )
- )
- ind += 1
- self.maybe_oversample_requests(
- sampled_requests, num_requests, request_id_prefix
- )
- return sampled_requests
-
-
-# -----------------------------------------------------------------------------
-# Vision Arena Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class VisionArenaDataset(HuggingFaceDataset):
- """
- Vision Arena Dataset.
- """
-
- DEFAULT_OUTPUT_LEN = 128
- SUPPORTED_DATASET_PATHS = {
- "lmarena-ai/VisionArena-Chat": lambda x: x["conversation"][0][0]["content"],
- "lmarena-ai/vision-arena-bench-v0.1": lambda x: x["turns"][0][0]["content"],
- }
- IS_MULTIMODAL = True
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- output_len: Optional[int] = None,
- enable_multimodal_chat: bool = False,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list:
- output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
- sampled_requests = []
- for i, item in enumerate(self.data):
- if len(sampled_requests) >= num_requests:
- break
- parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path)
- if parser_fn is None:
- raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
- prompt = parser_fn(item)
- mm_content = process_image(item["images"][0])
- prompt_len = len(tokenizer(prompt).input_ids)
- if enable_multimodal_chat:
-                # Note: when chat is enabled, the request prompt_len is no
-                # longer accurate, and the request output is used to count
-                # the actual prompt length.
- prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
- sampled_requests.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=prompt_len,
- expected_output_len=output_len,
- multi_modal_data=mm_content,
- request_id=request_id_prefix + str(i),
- )
- )
- self.maybe_oversample_requests(
- sampled_requests, num_requests, request_id_prefix
- )
- return sampled_requests
-
-
-# -----------------------------------------------------------------------------
-# Instruct Coder Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class InstructCoderDataset(HuggingFaceDataset):
- """
- InstructCoder Dataset.
- https://huggingface.co/datasets/likaixin/InstructCoder
-
-    InstructCoder is a dataset designed for general code editing. It consists
-    of 114,239 instruction-input-output triplets and covers multiple distinct
-    code editing scenarios.
- """
-
- DEFAULT_OUTPUT_LEN = 200 # this is the average default output length
- SUPPORTED_DATASET_PATHS = {
- "likaixin/InstructCoder",
- }
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- output_len: Optional[int] = None,
- enable_multimodal_chat: bool = False,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list:
- output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
- sampled_requests = []
- for i, item in enumerate(self.data):
- if len(sampled_requests) >= num_requests:
- break
- prompt = (
- f"{item['input']}\n\n{item['instruction']} Just output "
- "the code, do not include any explanation."
- )
-
- # apply template
- prompt = tokenizer.apply_chat_template(
- [{"role": "user", "content": prompt}],
- add_generation_prompt=True,
- tokenize=False,
- )
- prompt_len = len(tokenizer(prompt).input_ids)
- sampled_requests.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=prompt_len,
- expected_output_len=output_len,
- request_id=request_id_prefix + str(i),
- )
- )
- self.maybe_oversample_requests(
- sampled_requests, num_requests, request_id_prefix
- )
- return sampled_requests
-
-
-# -----------------------------------------------------------------------------
-# MT-Bench Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class MTBenchDataset(HuggingFaceDataset):
- """
- MT-Bench Dataset.
- https://huggingface.co/datasets/philschmid/mt-bench
-
- We create a single turn dataset for MT-Bench.
- This is similar to Spec decoding benchmark setup in vLLM
- https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18
- """ # noqa: E501
-
- DEFAULT_OUTPUT_LEN = 256 # avg len used in SD bench in vLLM
- SUPPORTED_DATASET_PATHS = {
- "philschmid/mt-bench",
- }
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- output_len: Optional[int] = None,
- enable_multimodal_chat: bool = False,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list:
- output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
- sampled_requests = []
-
- for i, item in enumerate(self.data):
- if len(sampled_requests) >= num_requests:
- break
- prompt = item["turns"][0]
-
- # apply template
- prompt = tokenizer.apply_chat_template(
- [{"role": "user", "content": prompt}],
- add_generation_prompt=True,
- tokenize=False,
- )
-
- prompt_len = len(tokenizer(prompt).input_ids)
- sampled_requests.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=prompt_len,
- expected_output_len=output_len,
- request_id=request_id_prefix + str(i),
- )
- )
- self.maybe_oversample_requests(
- sampled_requests, num_requests, request_id_prefix
- )
- return sampled_requests
-
-
-# -----------------------------------------------------------------------------
-# AIMO Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class AIMODataset(HuggingFaceDataset):
- """
-    Dataset class for processing an AIMO dataset with reasoning questions.
- """
-
- SUPPORTED_DATASET_PATHS = {
- "AI-MO/aimo-validation-aime",
- "AI-MO/NuminaMath-1.5",
- "AI-MO/NuminaMath-CoT",
- }
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- output_len: Optional[int] = None,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list:
- sampled_requests = []
- dynamic_output = output_len is None
- ind = 0
-
- for item in self.data:
- if len(sampled_requests) >= num_requests:
- break
- prompt, completion = item["problem"], item["solution"]
-
- prompt_ids = tokenizer(prompt).input_ids
- completion_ids = tokenizer(completion).input_ids
- prompt_len = len(prompt_ids)
- completion_len = len(completion_ids)
- output_len = completion_len if dynamic_output else output_len
- assert isinstance(output_len, int) and output_len > 0
- if dynamic_output and not is_valid_sequence(
- prompt_len, completion_len, max_prompt_len=2048, max_total_len=32000
- ):
- continue
- sampled_requests.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=prompt_len,
- expected_output_len=output_len,
- multi_modal_data=None,
- request_id=request_id_prefix + str(ind),
- )
- )
- ind += 1
- self.maybe_oversample_requests(
- sampled_requests, num_requests, request_id_prefix
- )
- return sampled_requests
-
-
-# -----------------------------------------------------------------------------
-# Next Edit Prediction Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-zeta_prompt = """### Instruction:
-You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location.
-
-### User Edits:
-
-{}
-
-### User Excerpt:
-
-{}
-
-### Response:
-
-""" # noqa: E501
-
-
-def _format_zeta_prompt(
- sample: dict, original_start_marker: str = "<|editable_region_start|>"
-) -> dict:
- """Format the zeta prompt for the Next Edit Prediction (NEP) dataset.
-
- This function formats examples from the NEP dataset
- into prompts and expected outputs. It could be
- further extended to support more NEP datasets.
-
- Args:
- sample: The dataset sample containing events,
- inputs, and outputs.
- original_start_marker: The marker indicating the
- start of the editable region. Defaults to
- "<|editable_region_start|>".
-
- Returns:
- A dictionary with the formatted prompts and expected outputs.
- """
- events = sample["events"]
- input = sample["input"]
- output = sample["output"]
- prompt = zeta_prompt.format(events, input)
-
- # following the original implementation, extract the focused region
- # from the raw output
- output_start_index = output.find(original_start_marker)
- output_focused_region = output[output_start_index:]
- expected_output = output_focused_region
-
- return {"prompt": prompt, "expected_output": expected_output}
-
-
-class NextEditPredictionDataset(HuggingFaceDataset):
- """
- Dataset class for processing a Next Edit Prediction dataset.
- """
-
- SUPPORTED_DATASET_PATHS = {
- "zed-industries/zeta",
- }
- MAPPING_PROMPT_FUNCS = {
- "zed-industries/zeta": _format_zeta_prompt,
- }
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- request_id_prefix: str = "",
- **kwargs,
- ):
- formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.dataset_path)
- if formatting_prompt_func is None:
- raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
- samples = []
- for i, sample in enumerate(self.data):
- sample = formatting_prompt_func(sample)
- samples.append(
- SampleRequest(
- prompt=sample["prompt"],
- prompt_len=len(tokenizer(sample["prompt"]).input_ids),
- expected_output_len=len(
- tokenizer(sample["expected_output"]).input_ids
- ),
- request_id=request_id_prefix + str(i),
- )
- )
- if len(samples) >= num_requests:
- break
- self.maybe_oversample_requests(samples, num_requests, request_id_prefix)
- return samples
-
-
-# -----------------------------------------------------------------------------
-# ASR Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class ASRDataset(HuggingFaceDataset):
- """
-    Dataset class for processing an ASR dataset for transcription.
- Tested on the following set:
-
- +----------------+----------------------------------------+--------------------------+-----------------------------+
- | Dataset | Domain | Speaking Style | hf-subset |
- +----------------+----------------------------------------+--------------------------+-----------------------------+
- | TED-LIUM | TED talks | Oratory | release1, release2, release3|
- | | | | release3-speaker-adaptation |
- | VoxPopuli | European Parliament | Oratory | en, de, it, fr, ... |
-    | LibriSpeech    | Audiobook                              | Narrated                 | clean, other                |
- | GigaSpeech | Audiobook, podcast, YouTube | Narrated, spontaneous | xs, s, m, l, xl, dev, test |
- | SPGISpeech | Financial meetings | Oratory, spontaneous | S, M, L, dev, test |
- | AMI | Meetings | Spontaneous | ihm, sdm |
- +----------------+----------------------------------------+--------------------------+-----------------------------+
-
- """ # noqa: E501
-
- SUPPORTED_DATASET_PATHS = {
- "openslr/librispeech_asr",
- "facebook/voxpopuli",
- "LIUM/tedlium",
- "edinburghcstr/ami",
- "speechcolab/gigaspeech",
- "kensho/spgispeech",
- }
-
- DEFAULT_OUTPUT_LEN = 128
- IS_MULTIMODAL = True
-
- # TODO Whisper-specific. Abstract interface when more models are supported.
- TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
- skip_long_audios: bool = True
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- output_len: Optional[int] = None,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list:
- import librosa
-
- output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
- prompt = ASRDataset.TRANSCRIPTION_PREAMBLE
- prompt_len = len(tokenizer(prompt).input_ids)
- sampled_requests = []
- skipped = 0
- ind = 0
- for item in self.data:
- if len(sampled_requests) >= num_requests:
- break
- audio = item["audio"]
- y, sr = audio["array"], audio["sampling_rate"]
- duration_s = librosa.get_duration(y=y, sr=sr)
- # Whisper max supported duration
- if self.skip_long_audios and duration_s > 30:
- skipped += 1
- continue
-
- mm_content = {"audio": (y, sr)}
- sampled_requests.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=prompt_len,
- expected_output_len=output_len,
- multi_modal_data=mm_content,
- request_id=request_id_prefix + str(ind),
- )
- )
- ind += 1
- if skipped:
- logger.warning(
- "%d samples discarded from dataset due to"
- " their length being greater than"
- " what Whisper supports.",
- skipped,
- )
- self.maybe_oversample_requests(
- sampled_requests, num_requests, request_id_prefix
- )
- return sampled_requests
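A sketch of the 30-second screen applied above, using synthetic silence instead of a real dataset item:

```python
import librosa
import numpy as np

sr = 16000
y = np.zeros(45 * sr, dtype=np.float32)      # 45 s of silence
duration_s = librosa.get_duration(y=y, sr=sr)
print(duration_s > 30)                       # True -> this clip would be skipped
```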
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index d8b960edaa468..a7892f3f71243 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -1,191 +1,17 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Benchmark the latency of processing a single batch of requests."""
-
-import argparse
-import dataclasses
-import json
-import os
-import time
-from typing import Any, Optional
-
-import numpy as np
-from tqdm import tqdm
-from typing_extensions import deprecated
-
-import vllm.envs as envs
-from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
-from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import EngineArgs
-from vllm.inputs import PromptType
-from vllm.sampling_params import BeamSearchParams
-from vllm.utils import FlexibleArgumentParser
-
-
-def save_to_pytorch_benchmark_format(
- args: argparse.Namespace, results: dict[str, Any]
-) -> None:
- pt_records = convert_to_pytorch_benchmark_format(
- args=args,
- metrics={"latency": results["latencies"]},
- extra_info={k: results[k] for k in ["avg_latency", "percentiles"]},
- )
- if pt_records:
- pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
- write_to_json(pt_file, pt_records)
-
-
-@deprecated(
- "benchmark_latency.py is deprecated and will be removed in a "
- "future version. Please use 'vllm bench latency' instead.",
-)
-def main(args: argparse.Namespace):
- print(args)
-
- engine_args = EngineArgs.from_cli_args(args)
-
- # NOTE(woosuk): If the request cannot be processed in a single batch,
- # the engine will automatically process the request in multiple batches.
- llm = LLM(**dataclasses.asdict(engine_args))
- assert llm.llm_engine.model_config.max_model_len >= (
- args.input_len + args.output_len
- ), (
- "Please ensure that max_model_len is greater than"
- " the sum of input_len and output_len."
- )
-
- sampling_params = SamplingParams(
- n=args.n,
- temperature=1.0,
- top_p=1.0,
- ignore_eos=True,
- max_tokens=args.output_len,
- detokenize=not args.disable_detokenize,
- )
- print(sampling_params)
- dummy_prompt_token_ids = np.random.randint(
- 10000, size=(args.batch_size, args.input_len)
- )
- dummy_prompts: list[PromptType] = [
- {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
- ]
-
- def llm_generate():
- if not args.use_beam_search:
- llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
- else:
- llm.beam_search(
- dummy_prompts,
- BeamSearchParams(
- beam_width=args.n,
- max_tokens=args.output_len,
- ignore_eos=True,
- ),
- )
-
- def run_to_completion(profile_dir: Optional[str] = None):
- if profile_dir:
- llm.start_profile()
- llm_generate()
- llm.stop_profile()
- else:
- start_time = time.perf_counter()
- llm_generate()
- end_time = time.perf_counter()
- latency = end_time - start_time
- return latency
-
- print("Warming up...")
- for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
- run_to_completion(profile_dir=None)
-
- if args.profile:
- profile_dir = envs.VLLM_TORCH_PROFILER_DIR
- print(f"Profiling (results will be saved to '{profile_dir}')...")
- run_to_completion(profile_dir=profile_dir)
- return
-
- # Benchmark.
- latencies = []
- for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
- latencies.append(run_to_completion(profile_dir=None))
- latencies = np.array(latencies)
- percentages = [10, 25, 50, 75, 90, 99]
- percentiles = np.percentile(latencies, percentages)
- print(f"Avg latency: {np.mean(latencies)} seconds")
- for percentage, percentile in zip(percentages, percentiles):
- print(f"{percentage}% percentile latency: {percentile} seconds")
-
- # Output JSON results if specified
- if args.output_json:
- results = {
- "avg_latency": np.mean(latencies),
- "latencies": latencies.tolist(),
- "percentiles": dict(zip(percentages, percentiles.tolist())),
- }
- with open(args.output_json, "w") as f:
- json.dump(results, f, indent=4)
- save_to_pytorch_benchmark_format(args, results)
-
-
-def create_argument_parser():
- parser = FlexibleArgumentParser(
- description="Benchmark the latency of processing a single batch of "
- "requests till completion."
- )
- parser.add_argument("--input-len", type=int, default=32)
- parser.add_argument("--output-len", type=int, default=128)
- parser.add_argument("--batch-size", type=int, default=8)
- parser.add_argument(
- "--n",
- type=int,
- default=1,
- help="Number of generated sequences per prompt.",
- )
- parser.add_argument("--use-beam-search", action="store_true")
- parser.add_argument(
- "--num-iters-warmup",
- type=int,
- default=10,
- help="Number of iterations to run for warmup.",
- )
- parser.add_argument(
- "--num-iters", type=int, default=30, help="Number of iterations to run."
- )
- parser.add_argument(
- "--profile",
- action="store_true",
- help="profile the generation process of a single batch",
- )
- parser.add_argument(
- "--output-json",
- type=str,
- default=None,
- help="Path to save the latency results in JSON format.",
- )
- parser.add_argument(
- "--disable-detokenize",
- action="store_true",
- help=(
- "Do not detokenize responses (i.e. do not include "
- "detokenization time in the latency measurement)"
- ),
- )
-
- parser = EngineArgs.add_cli_args(parser)
-    # V1 enables prefix caching by default, which skews the latency
-    # numbers, so we disable it here by default.
- parser.set_defaults(enable_prefix_caching=False)
-
- return parser
-
+import sys
if __name__ == "__main__":
- parser = create_argument_parser()
- args = parser.parse_args()
- if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
- raise OSError(
- "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
- "Please set it to a valid path to use torch profiler."
- )
- main(args)
+ print("""DEPRECATED: This script has been moved to the vLLM CLI.
+
+Please use the following command instead:
+ vllm bench latency
+
+For help with the new command, run:
+ vllm bench latency --help
+
+Alternatively, you can run the new command directly with:
+ python -m vllm.entrypoints.cli.main bench latency --help
+""")
+ sys.exit(1)
diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py
index c60040d05ab7a..626b150ee4ce0 100644
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@@ -1,17 +1,31 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
+import time
+from unittest import mock
import numpy as np
+from benchmark_utils import TimeCollector
from tabulate import tabulate
-from benchmark_utils import TimeCollector
-from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
+from vllm.config import (
+ CacheConfig,
+ DeviceConfig,
+ LoadConfig,
+ ModelConfig,
+ ParallelConfig,
+ SchedulerConfig,
+ SpeculativeConfig,
+ VllmConfig,
+)
+from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
+from vllm.v1.worker.gpu_input_batch import InputBatch
+from vllm.v1.worker.gpu_model_runner import GPUModelRunner
-def main(args):
+def benchmark_propose(args):
rows = []
for max_ngram in args.max_ngram:
collector = TimeCollector(TimeCollector.US)
@@ -69,15 +83,93 @@ def main(args):
)
+def benchmark_batched_propose(args):
+ NUM_SPECULATIVE_TOKENS_NGRAM = 10
+ PROMPT_LOOKUP_MIN = 5
+ PROMPT_LOOKUP_MAX = 15
+ MAX_MODEL_LEN = int(1e7)
+ DEVICE = current_platform.device_type
+
+ model_config = ModelConfig(model="facebook/opt-125m", runner="generate")
+
+ speculative_config = SpeculativeConfig(
+ target_model_config=model_config,
+ target_parallel_config=ParallelConfig(),
+ method="ngram",
+ num_speculative_tokens=NUM_SPECULATIVE_TOKENS_NGRAM,
+ prompt_lookup_max=PROMPT_LOOKUP_MAX,
+ prompt_lookup_min=PROMPT_LOOKUP_MIN,
+ )
+
+ vllm_config = VllmConfig(
+ model_config=model_config,
+ cache_config=CacheConfig(),
+ speculative_config=speculative_config,
+ device_config=DeviceConfig(device=current_platform.device_type),
+ parallel_config=ParallelConfig(),
+ load_config=LoadConfig(),
+ scheduler_config=SchedulerConfig(),
+ )
+
+ # monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
+ mock_pp_group = mock.MagicMock()
+ mock_pp_group.world_size = 1
+ with mock.patch(
+ "vllm.v1.worker.gpu_model_runner.get_pp_group", return_value=mock_pp_group
+ ):
+ runner = GPUModelRunner(vllm_config, DEVICE)
+
+ # hack max model len
+ runner.max_model_len = MAX_MODEL_LEN
+ runner.drafter.max_model_len = MAX_MODEL_LEN
+
+ dummy_input_batch = InputBatch(
+ max_num_reqs=args.num_req,
+ max_model_len=MAX_MODEL_LEN,
+ max_num_batched_tokens=args.num_req * args.num_token,
+ device=DEVICE,
+ pin_memory=False,
+ vocab_size=256000,
+ block_sizes=[16],
+ )
+ dummy_input_batch._req_ids = list(str(id) for id in range(args.num_req))
+ dummy_input_batch.spec_decode_unsupported_reqs = ()
+ dummy_input_batch.num_tokens_no_spec = [args.num_token] * args.num_req
+ dummy_input_batch.token_ids_cpu = np.random.randint(
+ 0, 20, (args.num_req, args.num_token)
+ )
+
+ runner.input_batch = dummy_input_batch
+
+ sampled_token_ids = [[0]] * args.num_req
+
+ print("Starting benchmark")
+ # first run is warmup so ignore it
+ for _ in range(args.num_iteration):
+ start = time.time()
+ runner.drafter.propose(
+ sampled_token_ids,
+ dummy_input_batch.req_ids,
+ dummy_input_batch.num_tokens_no_spec,
+ dummy_input_batch.token_ids_cpu,
+ dummy_input_batch.spec_decode_unsupported_reqs,
+ )
+ end = time.time()
+ print(f"Iteration time (s): {end - start}")
+
+
def invoke_main() -> None:
parser = FlexibleArgumentParser(
description="Benchmark the performance of N-gram speculative decode drafting"
)
+ parser.add_argument(
+ "--batched", action="store_true", help="consider time to prepare batch"
+ )
parser.add_argument(
"--num-iteration",
type=int,
default=100,
- help="Number of iterations to run to stablize final data readings",
+ help="Number of iterations to run to stabilize final data readings",
)
parser.add_argument(
"--num-req", type=int, default=128, help="Number of requests in the batch"
@@ -105,8 +197,17 @@ def invoke_main() -> None:
help="Number of speculative tokens to generate",
)
args = parser.parse_args()
- main(args)
+
+ if not args.batched:
+ benchmark_propose(args)
+ else:
+ benchmark_batched_propose(args)
+"""
+# Example command lines:
+# time python3 benchmarks/benchmark_ngram_proposer.py
+# time python3 benchmarks/benchmark_ngram_proposer.py --batched --num-iteration 4 --num-token 1000000 --num-req 128
+""" # noqa: E501
if __name__ == "__main__":
invoke_main() # pragma: no cover
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 02f5f585c0c16..76cf51498020b 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -1,1324 +1,17 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-r"""Benchmark online serving throughput.
-
-On the server side, run one of the following commands:
- vLLM OpenAI API server
- vllm serve \
- --swap-space 16
-
-On the client side, run:
- python benchmarks/benchmark_serving.py \
- --backend \
- --model \
- --dataset-name sharegpt \
- --dataset-path \
- --request-rate \ # By default is inf
- --num-prompts # By default is 1000
-
- when using tgi backend, add
- --endpoint /generate_stream
- to the end of the command above.
-"""
-
-import argparse
-import asyncio
-import gc
-import json
-import os
-import random
-import time
-import warnings
-from collections.abc import Iterable
-from dataclasses import dataclass
-from datetime import datetime
-from typing import Any, Literal, Optional
-
-import numpy as np
-from tqdm.asyncio import tqdm
-from transformers import PreTrainedTokenizerBase
-from typing_extensions import deprecated
-
-from backend_request_func import (
- ASYNC_REQUEST_FUNCS,
- OPENAI_COMPATIBLE_BACKENDS,
- RequestFuncInput,
- RequestFuncOutput,
-)
-
-try:
- from vllm.transformers_utils.tokenizer import get_tokenizer
-except ImportError:
- from backend_request_func import get_tokenizer
-
-try:
- from vllm.utils import FlexibleArgumentParser
-except ImportError:
- from argparse import ArgumentParser as FlexibleArgumentParser
-
-from benchmark_dataset import (
- AIMODataset,
- ASRDataset,
- BurstGPTDataset,
- ConversationDataset,
- CustomDataset,
- HuggingFaceDataset,
- InstructCoderDataset,
- MTBenchDataset,
- NextEditPredictionDataset,
- RandomDataset,
- SampleRequest,
- ShareGPTDataset,
- SonnetDataset,
- VisionArenaDataset,
-)
-from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
-from vllm.benchmarks.serve import get_request
-
-MILLISECONDS_TO_SECONDS_CONVERSION = 1000
-
-
-@dataclass
-class BenchmarkMetrics:
- completed: int
- total_input: int
- total_output: int
- request_throughput: float
- request_goodput: float
- output_throughput: float
- total_token_throughput: float
- mean_ttft_ms: float
- median_ttft_ms: float
- std_ttft_ms: float
- percentiles_ttft_ms: list[tuple[float, float]]
- mean_tpot_ms: float
- median_tpot_ms: float
- std_tpot_ms: float
- percentiles_tpot_ms: list[tuple[float, float]]
- mean_itl_ms: float
- median_itl_ms: float
- std_itl_ms: float
- percentiles_itl_ms: list[tuple[float, float]]
- # E2EL stands for end-to-end latency per request.
- # It is the time taken on the client side from sending
- # a request to receiving a complete response.
- mean_e2el_ms: float
- median_e2el_ms: float
- std_e2el_ms: float
- percentiles_e2el_ms: list[tuple[float, float]]
-
-
-def calculate_metrics(
- input_requests: list[SampleRequest],
- outputs: list[RequestFuncOutput],
- dur_s: float,
- tokenizer: PreTrainedTokenizerBase,
- selected_percentile_metrics: list[str],
- selected_percentiles: list[float],
- goodput_config_dict: dict[str, float],
-) -> tuple[BenchmarkMetrics, list[int]]:
- actual_output_lens: list[int] = []
- total_input = 0
- completed = 0
- good_completed = 0
- itls: list[float] = []
- tpots: list[float] = []
- all_tpots: list[float] = []
- ttfts: list[float] = []
- e2els: list[float] = []
- for i in range(len(outputs)):
- if outputs[i].success:
- output_len = outputs[i].output_tokens
-
- if not output_len:
- # We use the tokenizer to count the number of output tokens
- # for some serving backends instead of looking at
- # len(outputs[i].itl) since multiple output tokens may be
- # bundled together
-                # Note: this may inflate the output token count slightly
- output_len = len(
- tokenizer(
- outputs[i].generated_text, add_special_tokens=False
- ).input_ids
- )
- actual_output_lens.append(output_len)
- total_input += input_requests[i].prompt_len
- tpot = 0
- if output_len > 1:
- latency_minus_ttft = outputs[i].latency - outputs[i].ttft
- tpot = latency_minus_ttft / (output_len - 1)
- tpots.append(tpot)
- # Note: if output_len <= 1, we regard tpot as 0 for goodput
- all_tpots.append(tpot)
- itls += outputs[i].itl
- ttfts.append(outputs[i].ttft)
- e2els.append(outputs[i].latency)
- completed += 1
- else:
- actual_output_lens.append(0)
-
- if goodput_config_dict:
- valid_metrics = []
- slo_values = []
-
- if "ttft" in goodput_config_dict:
- valid_metrics.append(ttfts)
- slo_values.append(
- goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION
- )
- if "tpot" in goodput_config_dict:
- valid_metrics.append(all_tpots)
- slo_values.append(
- goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION
- )
- if "e2el" in goodput_config_dict:
- valid_metrics.append(e2els)
- slo_values.append(
- goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION
- )
-
- for req_metric in zip(*valid_metrics):
- is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
- if is_good_req:
- good_completed += 1
-
- if completed == 0:
- warnings.warn(
- "All requests failed. This is likely due to a misconfiguration "
- "on the benchmark arguments.",
- stacklevel=2,
- )
- metrics = BenchmarkMetrics(
- completed=completed,
- total_input=total_input,
- total_output=sum(actual_output_lens),
- request_throughput=completed / dur_s,
- request_goodput=good_completed / dur_s,
- output_throughput=sum(actual_output_lens) / dur_s,
- total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
- mean_ttft_ms=np.mean(ttfts or 0)
- * 1000, # ttfts is empty if streaming is not supported by backend
- std_ttft_ms=np.std(ttfts or 0) * 1000,
- median_ttft_ms=np.median(ttfts or 0) * 1000,
- percentiles_ttft_ms=[
- (p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles
- ],
- mean_tpot_ms=np.mean(tpots or 0) * 1000,
- std_tpot_ms=np.std(tpots or 0) * 1000,
- median_tpot_ms=np.median(tpots or 0) * 1000,
- percentiles_tpot_ms=[
- (p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles
- ],
- mean_itl_ms=np.mean(itls or 0) * 1000,
- std_itl_ms=np.std(itls or 0) * 1000,
- median_itl_ms=np.median(itls or 0) * 1000,
- percentiles_itl_ms=[
- (p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles
- ],
- mean_e2el_ms=np.mean(e2els or 0) * 1000,
- std_e2el_ms=np.std(e2els or 0) * 1000,
- median_e2el_ms=np.median(e2els or 0) * 1000,
- percentiles_e2el_ms=[
- (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles
- ],
- )
-
- return metrics, actual_output_lens
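A worked example of the per-request TPOT computed above: the time to first token is excluded, and the remainder of the latency is averaged over the remaining output tokens.

```python
latency_s, ttft_s, output_len = 2.0, 0.4, 9   # made-up request measurements

tpot_s = (latency_s - ttft_s) / (output_len - 1)
print(tpot_s)        # 0.2 s per output token, excluding the first token
print(tpot_s * 1e3)  # 200.0 ms, the unit reported in BenchmarkMetrics
```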
-
-
-async def benchmark(
- backend: str,
- api_url: str,
- base_url: str,
- model_id: str,
- model_name: str,
- tokenizer: PreTrainedTokenizerBase,
- input_requests: list[SampleRequest],
- logprobs: Optional[int],
- request_rate: float,
- burstiness: float,
- disable_tqdm: bool,
- profile: bool,
- selected_percentile_metrics: list[str],
- selected_percentiles: list[float],
- ignore_eos: bool,
- goodput_config_dict: dict[str, float],
- max_concurrency: Optional[int],
- lora_modules: Optional[Iterable[str]],
- extra_body: Optional[dict],
- ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
- ramp_up_start_rps: Optional[int] = None,
- ramp_up_end_rps: Optional[int] = None,
-):
- if backend in ASYNC_REQUEST_FUNCS:
- request_func = ASYNC_REQUEST_FUNCS[backend]
- else:
- raise ValueError(f"Unknown backend: {backend}")
-
- print("Starting initial single prompt test run...")
- test_prompt, test_prompt_len, test_output_len, test_mm_content = (
- input_requests[0].prompt,
- input_requests[0].prompt_len,
- input_requests[0].expected_output_len,
- input_requests[0].multi_modal_data,
- )
-
- assert (
- test_mm_content is None
- or isinstance(test_mm_content, dict)
- or (
- isinstance(test_mm_content, list)
- and all(isinstance(item, dict) for item in test_mm_content)
- )
- ), "multi_modal_data must be a dict or list[dict]"
- test_input = RequestFuncInput(
- model=model_id,
- model_name=model_name,
- prompt=test_prompt,
- api_url=api_url,
- prompt_len=test_prompt_len,
- output_len=test_output_len,
- logprobs=logprobs,
- multi_modal_content=test_mm_content,
- ignore_eos=ignore_eos,
- extra_body=extra_body,
- )
-
- test_output = await request_func(request_func_input=test_input)
- if not test_output.success:
- raise ValueError(
- "Initial test run failed - Please make sure benchmark arguments "
- f"are correctly specified. Error: {test_output.error}"
- )
- else:
- print("Initial test run completed. Starting main benchmark run...")
-
- if lora_modules:
- # For each input request, choose a LoRA module at random.
- lora_modules = iter(
- [random.choice(lora_modules) for _ in range(len(input_requests))]
- )
-
- if profile:
- print("Starting profiler...")
- profile_input = RequestFuncInput(
- model=model_id,
- model_name=model_name,
- prompt=test_prompt,
- api_url=base_url + "/start_profile",
- prompt_len=test_prompt_len,
- output_len=test_output_len,
- logprobs=logprobs,
- multi_modal_content=test_mm_content,
- ignore_eos=ignore_eos,
- extra_body=extra_body,
- )
- profile_output = await request_func(request_func_input=profile_input)
- if profile_output.success:
- print("Profiler started")
-
- distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution"
-
- if ramp_up_strategy is not None:
- print(
- f"Traffic ramp-up strategy: {ramp_up_strategy}. Will increase "
- f"RPS from {ramp_up_start_rps} to {ramp_up_end_rps} RPS over "
- "the duration of the benchmark."
- )
- else:
- print(f"Traffic request rate: {request_rate} RPS.")
-
- print(f"Burstiness factor: {burstiness} ({distribution})")
- print(f"Maximum request concurrency: {max_concurrency}")
-
- pbar = None if disable_tqdm else tqdm(total=len(input_requests))
-
- # This can be used once the minimum Python version is 3.10 or higher,
- # and it will simplify the code in limited_request_func.
- # semaphore = (asyncio.Semaphore(max_concurrency)
- # if max_concurrency else contextlib.nullcontext())
- semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
-
- async def limited_request_func(request_func_input, pbar):
- if semaphore is None:
- return await request_func(request_func_input=request_func_input, pbar=pbar)
- async with semaphore:
- return await request_func(request_func_input=request_func_input, pbar=pbar)
-
- benchmark_start_time = time.perf_counter()
- tasks: list[asyncio.Task] = []
-
- rps_change_events = []
- last_int_rps = -1
- if ramp_up_strategy is not None and ramp_up_start_rps is not None:
- last_int_rps = ramp_up_start_rps
- rps_change_events.append(
- {
- "rps": last_int_rps,
- "timestamp": datetime.now().isoformat(),
- }
- )
-
- async for request, current_request_rate in get_request(
- input_requests,
- request_rate,
- burstiness,
- ramp_up_strategy,
- ramp_up_start_rps,
- ramp_up_end_rps,
- ):
- if ramp_up_strategy is not None:
- current_int_rps = int(current_request_rate)
- if current_int_rps > last_int_rps:
- timestamp = datetime.now().isoformat()
- for rps_val in range(last_int_rps + 1, current_int_rps + 1):
- rps_change_events.append({"rps": rps_val, "timestamp": timestamp})
- last_int_rps = current_int_rps
-
- prompt, prompt_len, output_len, mm_content, request_id = (
- request.prompt,
- request.prompt_len,
- request.expected_output_len,
- request.multi_modal_data,
- request.request_id,
- )
- req_model_id, req_model_name = model_id, model_name
- if lora_modules:
- req_lora_module = next(lora_modules)
- req_model_id, req_model_name = req_lora_module, req_lora_module
-
- request_func_input = RequestFuncInput(
- model=req_model_id,
- model_name=req_model_name,
- prompt=prompt,
- api_url=api_url,
- prompt_len=prompt_len,
- output_len=output_len,
- logprobs=logprobs,
- multi_modal_content=mm_content,
- ignore_eos=ignore_eos,
- extra_body=extra_body,
- request_id=request_id,
- )
- task = limited_request_func(request_func_input=request_func_input, pbar=pbar)
- tasks.append(asyncio.create_task(task))
- outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
-
- if pbar is not None:
- pbar.close()
-
- benchmark_duration = time.perf_counter() - benchmark_start_time
-
- metrics, actual_output_lens = calculate_metrics(
- input_requests=input_requests,
- outputs=outputs,
- dur_s=benchmark_duration,
- tokenizer=tokenizer,
- selected_percentile_metrics=selected_percentile_metrics,
- selected_percentiles=selected_percentiles,
- goodput_config_dict=goodput_config_dict,
- )
-
- print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
- print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
- if max_concurrency is not None:
- print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency))
- if request_rate != float("inf"):
- print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate))
- print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
- print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
- print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
- print(
- "{:<40} {:<10.2f}".format(
- "Request throughput (req/s):", metrics.request_throughput
- )
- )
- if goodput_config_dict:
- print(
- "{:<40} {:<10.2f}".format(
- "Request goodput (req/s):", metrics.request_goodput
- )
- )
- print(
- "{:<40} {:<10.2f}".format(
- "Output token throughput (tok/s):", metrics.output_throughput
- )
- )
- print(
- "{:<40} {:<10.2f}".format(
- "Total Token throughput (tok/s):", metrics.total_token_throughput
- )
- )
-
- result = {
- "duration": benchmark_duration,
- "completed": metrics.completed,
- "total_input_tokens": metrics.total_input,
- "total_output_tokens": metrics.total_output,
- "request_throughput": metrics.request_throughput,
- "request_goodput": metrics.request_goodput if goodput_config_dict else None,
- "output_throughput": metrics.output_throughput,
- "total_token_throughput": metrics.total_token_throughput,
- "input_lens": [output.prompt_len for output in outputs],
- "output_lens": actual_output_lens,
- "ttfts": [output.ttft for output in outputs],
- "itls": [output.itl for output in outputs],
- "generated_texts": [output.generated_text for output in outputs],
- "errors": [output.error for output in outputs],
- }
-
- if rps_change_events:
- result["rps_change_events"] = rps_change_events
-
- def process_one_metric(
- # E.g., "ttft"
- metric_attribute_name: str,
- # E.g., "TTFT"
- metric_name: str,
- # E.g., "Time to First Token"
- metric_header: str,
- ):
- # This function prints and adds statistics of the specified
- # metric.
- if metric_attribute_name not in selected_percentile_metrics:
- return
- print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
- print(
- "{:<40} {:<10.2f}".format(
- f"Mean {metric_name} (ms):",
- getattr(metrics, f"mean_{metric_attribute_name}_ms"),
- )
- )
- print(
- "{:<40} {:<10.2f}".format(
- f"Median {metric_name} (ms):",
- getattr(metrics, f"median_{metric_attribute_name}_ms"),
- )
- )
- result[f"mean_{metric_attribute_name}_ms"] = getattr(
- metrics, f"mean_{metric_attribute_name}_ms"
- )
- result[f"median_{metric_attribute_name}_ms"] = getattr(
- metrics, f"median_{metric_attribute_name}_ms"
- )
- result[f"std_{metric_attribute_name}_ms"] = getattr(
- metrics, f"std_{metric_attribute_name}_ms"
- )
- for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
- p_word = str(int(p)) if int(p) == p else str(p)
- print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
- result[f"p{p_word}_{metric_attribute_name}_ms"] = value
-
- process_one_metric("ttft", "TTFT", "Time to First Token")
- process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
- process_one_metric("itl", "ITL", "Inter-token Latency")
- process_one_metric("e2el", "E2EL", "End-to-end Latency")
-
- print("=" * 50)
-
- if profile:
- print("Stopping profiler...")
- profile_input = RequestFuncInput(
- model=model_id,
- prompt=test_prompt,
- api_url=base_url + "/stop_profile",
- prompt_len=test_prompt_len,
- output_len=test_output_len,
- logprobs=logprobs,
- )
- profile_output = await request_func(request_func_input=profile_input)
- if profile_output.success:
- print("Profiler stopped")
-
- return result
-
-
-def check_goodput_args(args):
- # Check and parse goodput arguments
- goodput_config_dict = {}
- VALID_NAMES = ["ttft", "tpot", "e2el"]
- if args.goodput:
- goodput_config_dict = parse_goodput(args.goodput)
- for slo_name, slo_val in goodput_config_dict.items():
- if slo_name not in VALID_NAMES:
- raise ValueError(
- f"Invalid metric name found, {slo_name}: {slo_val}. "
- "The service level objective name should be one of "
- f"{str(VALID_NAMES)}. "
- )
- if slo_val < 0:
- raise ValueError(
- f"Invalid value found, {slo_name}: {slo_val}. "
- "The service level objective value should be "
- "non-negative."
- )
- return goodput_config_dict
-
-
-def parse_goodput(slo_pairs):
- goodput_config_dict = {}
- try:
- for slo_pair in slo_pairs:
- slo_name, slo_val = slo_pair.split(":")
- goodput_config_dict[slo_name] = float(slo_val)
- except ValueError as err:
- raise argparse.ArgumentTypeError(
- "Invalid format found for service level objectives. "
- 'Specify service level objectives for goodput as "KEY:VALUE" '
- "pairs, where the key is a metric name, and the value is a "
- "number in milliseconds."
- ) from err
- return goodput_config_dict
-
-
-def save_to_pytorch_benchmark_format(
- args: argparse.Namespace, results: dict[str, Any], file_name: str
-) -> None:
- metrics = [
- "median_ttft_ms",
- "mean_ttft_ms",
- "std_ttft_ms",
- "p99_ttft_ms",
- "mean_tpot_ms",
- "median_tpot_ms",
- "std_tpot_ms",
- "p99_tpot_ms",
- "median_itl_ms",
- "mean_itl_ms",
- "std_itl_ms",
- "p99_itl_ms",
- ]
- # These raw data might be useful, but they are rather big. They can be added
- # later if needed
- ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
- pt_records = convert_to_pytorch_benchmark_format(
- args=args,
- metrics={k: [results[k]] for k in metrics},
- extra_info={
- k: results[k]
- for k in results
- if k not in metrics and k not in ignored_metrics
- },
- )
- if pt_records:
- # Don't use json suffix here as we don't want CI to pick it up
- pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json"
- write_to_json(pt_file, pt_records)
-
-
-@deprecated(
- "benchmark_serving.py is deprecated and will be removed in a future "
- "version. Please use 'vllm bench serve' instead.",
-)
-def main(args: argparse.Namespace):
- print(args)
- random.seed(args.seed)
- np.random.seed(args.seed)
-
- backend = args.backend
- model_id = args.model
- model_name = args.served_model_name
- tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
- tokenizer_mode = args.tokenizer_mode
-
- # Validate ramp-up arguments
- if args.ramp_up_strategy is not None:
- if args.request_rate != float("inf"):
- raise ValueError(
- "When using ramp-up, do not specify --request-rate. "
- "The request rate will be controlled by ramp-up parameters. "
- "Please remove the --request-rate argument."
- )
- if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None:
- raise ValueError(
- "When using --ramp-up-strategy, both --ramp-up-start-rps and "
- "--ramp-up-end-rps must be specified"
- )
- if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0:
- raise ValueError("Ramp-up start and end RPS must be non-negative")
- if args.ramp_up_start_rps > args.ramp_up_end_rps:
- raise ValueError("Ramp-up start RPS must be less than end RPS")
- if args.ramp_up_strategy == "exponential" and args.ramp_up_start_rps == 0:
- raise ValueError("For exponential ramp-up, the start RPS cannot be 0.")
-
- if args.base_url is not None:
- api_url = f"{args.base_url}{args.endpoint}"
- base_url = f"{args.base_url}"
- else:
- api_url = f"http://{args.host}:{args.port}{args.endpoint}"
- base_url = f"http://{args.host}:{args.port}"
-
- tokenizer = get_tokenizer(
- tokenizer_id,
- tokenizer_mode=tokenizer_mode,
- trust_remote_code=args.trust_remote_code,
- )
-
- if args.dataset_name is None:
- raise ValueError(
- "Please specify '--dataset-name' and the corresponding "
- "'--dataset-path' if required."
- )
-
- if args.dataset_name == "custom":
- dataset = CustomDataset(dataset_path=args.dataset_path)
- input_requests = dataset.sample(
- num_requests=args.num_prompts,
- tokenizer=tokenizer,
- output_len=args.custom_output_len,
- skip_chat_template=args.custom_skip_chat_template,
- request_id_prefix=args.request_id_prefix,
- )
-
- elif args.dataset_name == "sonnet":
- dataset = SonnetDataset(dataset_path=args.dataset_path)
- # For the "sonnet" dataset, formatting depends on the backend.
- if args.backend == "openai-chat":
- input_requests = dataset.sample(
- num_requests=args.num_prompts,
- input_len=args.sonnet_input_len,
- output_len=args.sonnet_output_len,
- prefix_len=args.sonnet_prefix_len,
- tokenizer=tokenizer,
- return_prompt_formatted=False,
- request_id_prefix=args.request_id_prefix,
- )
- else:
- assert tokenizer.chat_template or tokenizer.default_chat_template, (
- "Tokenizer/model must have chat template for sonnet dataset."
- )
- input_requests = dataset.sample(
- num_requests=args.num_prompts,
- input_len=args.sonnet_input_len,
- output_len=args.sonnet_output_len,
- prefix_len=args.sonnet_prefix_len,
- tokenizer=tokenizer,
- return_prompt_formatted=True,
- request_id_prefix=args.request_id_prefix,
- )
-
- elif args.dataset_name == "hf":
- # all following datasets are implemented from the
- # HuggingFaceDataset base class
- if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
- dataset_class = VisionArenaDataset
- args.hf_split = "train"
- args.hf_subset = None
- elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
- dataset_class = InstructCoderDataset
- args.hf_split = "train"
- elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS:
- dataset_class = MTBenchDataset
- args.hf_split = "train"
- elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
- dataset_class = ConversationDataset
- elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
- dataset_class = AIMODataset
- args.hf_split = "train"
- elif args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS: # noqa: E501
- dataset_class = NextEditPredictionDataset
- args.hf_split = "train"
- elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS:
- dataset_class = ASRDataset
- args.hf_split = "train"
- else:
- supported_datasets = set(
- [
- dataset_name
- for cls in HuggingFaceDataset.__subclasses__()
- for dataset_name in cls.SUPPORTED_DATASET_PATHS
- ]
- )
- raise ValueError(
- f"Unsupported dataset path: {args.dataset_path}. "
- "Huggingface dataset only supports dataset_path"
- f" from one of following: {supported_datasets}. "
- "Please consider contributing if you would "
- "like to add support for additional dataset formats."
- )
-
- if dataset_class.IS_MULTIMODAL and backend not in [
- "openai-chat",
- "openai-audio",
- ]:
- # multi-modal benchmark is only available on OpenAI Chat backend.
- raise ValueError(
- "Multi-modal content is only supported on 'openai-chat' and "
- "'openai-audio' backend."
- )
- input_requests = dataset_class(
- dataset_path=args.dataset_path,
- dataset_subset=args.hf_subset,
- dataset_split=args.hf_split,
- random_seed=args.seed,
- no_stream=args.no_stream,
- ).sample(
- num_requests=args.num_prompts,
- tokenizer=tokenizer,
- output_len=args.hf_output_len,
- request_id_prefix=args.request_id_prefix,
- )
-
- else:
- # For datasets that follow a similar structure, use a mapping.
- dataset_mapping = {
- "sharegpt": lambda: ShareGPTDataset(
- random_seed=args.seed, dataset_path=args.dataset_path
- ).sample(
- tokenizer=tokenizer,
- num_requests=args.num_prompts,
- output_len=args.sharegpt_output_len,
- request_id_prefix=args.request_id_prefix,
- ),
- "burstgpt": lambda: BurstGPTDataset(
- random_seed=args.seed, dataset_path=args.dataset_path
- ).sample(
- tokenizer=tokenizer,
- num_requests=args.num_prompts,
- request_id_prefix=args.request_id_prefix,
- ),
- "random": lambda: RandomDataset(dataset_path=args.dataset_path).sample(
- tokenizer=tokenizer,
- num_requests=args.num_prompts,
- prefix_len=args.random_prefix_len,
- input_len=args.random_input_len,
- output_len=args.random_output_len,
- range_ratio=args.random_range_ratio,
- request_id_prefix=args.request_id_prefix,
- ),
- }
-
- try:
- input_requests = dataset_mapping[args.dataset_name]()
- except KeyError as err:
- raise ValueError(f"Unknown dataset: {args.dataset_name}") from err
- goodput_config_dict = check_goodput_args(args)
-
- # Collect the sampling parameters.
- sampling_params = {
- k: v
- for k, v in {
- "top_p": args.top_p,
- "top_k": args.top_k,
- "min_p": args.min_p,
- "temperature": args.temperature,
- }.items()
- if v is not None
- }
-
- # Sampling parameters are only supported by openai-compatible backend.
- if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
- raise ValueError(
- "Sampling parameters are only supported by openai-compatible backends."
- )
-
- if "temperature" not in sampling_params:
- sampling_params["temperature"] = 0.0 # Default to greedy decoding.
-
- if args.backend == "llama.cpp":
- # Disable prompt caching in llama.cpp backend
- sampling_params["cache_prompt"] = False
-
- # Avoid GC processing "static" data - reduce pause times.
- gc.collect()
- gc.freeze()
-
- benchmark_result = asyncio.run(
- benchmark(
- backend=backend,
- api_url=api_url,
- base_url=base_url,
- model_id=model_id,
- model_name=model_name,
- tokenizer=tokenizer,
- input_requests=input_requests,
- logprobs=args.logprobs,
- request_rate=args.request_rate,
- burstiness=args.burstiness,
- disable_tqdm=args.disable_tqdm,
- profile=args.profile,
- selected_percentile_metrics=args.percentile_metrics.split(","),
- selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
- ignore_eos=args.ignore_eos,
- goodput_config_dict=goodput_config_dict,
- max_concurrency=args.max_concurrency,
- lora_modules=args.lora_modules,
- extra_body=sampling_params,
- ramp_up_strategy=args.ramp_up_strategy,
- ramp_up_start_rps=args.ramp_up_start_rps,
- ramp_up_end_rps=args.ramp_up_end_rps,
- )
- )
-
- # Save config and results to json
- if args.save_result or args.append_result:
- result_json: dict[str, Any] = {}
-
- # Setup
- current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
- result_json["date"] = current_dt
- result_json["backend"] = backend
- result_json["model_id"] = model_id
- result_json["tokenizer_id"] = tokenizer_id
- result_json["num_prompts"] = args.num_prompts
-
- # Metadata
- if args.metadata:
- for item in args.metadata:
- if "=" in item:
- kvstring = item.split("=")
- result_json[kvstring[0].strip()] = kvstring[1].strip()
- else:
- raise ValueError(
- "Invalid metadata format. Please use KEY=VALUE format."
- )
- # Traffic
- result_json["request_rate"] = (
- args.request_rate if args.request_rate < float("inf") else "inf"
- )
- result_json["burstiness"] = args.burstiness
- result_json["max_concurrency"] = args.max_concurrency
-
- if args.ramp_up_strategy is not None:
- result_json["ramp_up_strategy"] = args.ramp_up_strategy
- result_json["ramp_up_start_rps"] = args.ramp_up_start_rps
- result_json["ramp_up_end_rps"] = args.ramp_up_end_rps
-
- # Merge with benchmark result
- result_json = {**result_json, **benchmark_result}
-
- if not args.save_detailed:
- # Remove fields with too many data points
- for field in [
- "input_lens",
- "output_lens",
- "ttfts",
- "itls",
- "generated_texts",
- "errors",
- ]:
- if field in result_json:
- del result_json[field]
- if field in benchmark_result:
- del benchmark_result[field]
-
- # Save to file
- base_model_id = model_id.split("/")[-1]
- max_concurrency_str = (
- f"-concurrency{args.max_concurrency}"
- if args.max_concurrency is not None
- else ""
- )
- if args.ramp_up_strategy is not None:
- file_name = f"{backend}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa
- else:
- file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa
- if args.result_filename:
- file_name = args.result_filename
- if args.result_dir:
- os.makedirs(args.result_dir, exist_ok=True)
- file_name = os.path.join(args.result_dir, file_name)
- with open(
- file_name, mode="a+" if args.append_result else "w", encoding="utf-8"
- ) as outfile:
- # Append a newline.
- if args.append_result and outfile.tell() != 0:
- outfile.write("\n")
- json.dump(result_json, outfile)
- save_to_pytorch_benchmark_format(args, result_json, file_name)
-
-
-def create_argument_parser():
- parser = FlexibleArgumentParser(
- description="Benchmark the online serving throughput."
- )
- parser.add_argument(
- "--backend",
- type=str,
- default="vllm",
- choices=list(ASYNC_REQUEST_FUNCS.keys()),
- )
- parser.add_argument(
- "--base-url",
- type=str,
- default=None,
- help="Server or API base url if not using http host and port.",
- )
- # Use 127.0.0.1 here instead of localhost to force the use of ipv4
- parser.add_argument("--host", type=str, default="127.0.0.1")
- parser.add_argument("--port", type=int, default=8000)
- parser.add_argument(
- "--endpoint",
- type=str,
- default="/v1/completions",
- help="API endpoint.",
- )
- parser.add_argument(
- "--dataset-name",
- type=str,
- default="sharegpt",
- choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"],
- help="Name of the dataset to benchmark on.",
- )
- parser.add_argument(
- "--dataset-path",
- type=str,
- default=None,
- help="Path to the sharegpt/sonnet dataset. "
- "Or the huggingface dataset ID if using HF dataset.",
- )
- parser.add_argument(
- "--no-stream",
- action="store_true",
- help="Do not load the dataset in streaming mode.",
- )
- parser.add_argument(
- "--max-concurrency",
- type=int,
- default=None,
- help="Maximum number of concurrent requests. This can be used "
- "to help simulate an environment where a higher level component "
- "is enforcing a maximum number of concurrent requests. While the "
- "--request-rate argument controls the rate at which requests are "
- "initiated, this argument will control how many are actually allowed "
- "to execute at a time. This means that when used in combination, the "
- "actual request rate may be lower than specified with --request-rate, "
- "if the server is not processing requests fast enough to keep up.",
- )
-
- parser.add_argument(
- "--model",
- type=str,
- required=True,
- help="Name of the model.",
- )
- parser.add_argument(
- "--tokenizer",
- type=str,
- help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
- )
- parser.add_argument("--use-beam-search", action="store_true")
- parser.add_argument(
- "--num-prompts",
- type=int,
- default=1000,
- help="Number of prompts to process.",
- )
- parser.add_argument(
- "--logprobs",
- type=int,
- default=None,
- help=(
- "Number of logprobs-per-token to compute & return as part of "
- "the request. If unspecified, then either (1) if beam search "
- "is disabled, no logprobs are computed & a single dummy "
- "logprob is returned for each token; or (2) if beam search "
- "is enabled 1 logprob per token is computed"
- ),
- )
- parser.add_argument(
- "--request-rate",
- type=float,
- default=float("inf"),
- help="Number of requests per second. If this is inf, "
- "then all the requests are sent at time 0. "
- "Otherwise, we use Poisson process or gamma distribution "
- "to synthesize the request arrival times.",
- )
- parser.add_argument(
- "--burstiness",
- type=float,
- default=1.0,
- help="Burstiness factor of the request generation. "
- "Only take effect when request_rate is not inf. "
- "Default value is 1, which follows Poisson process. "
- "Otherwise, the request intervals follow a gamma distribution. "
- "A lower burstiness value (0 < burstiness < 1) results in more "
- "bursty requests. A higher burstiness value (burstiness > 1) "
- "results in a more uniform arrival of requests.",
- )
- parser.add_argument("--seed", type=int, default=0)
- parser.add_argument(
- "--trust-remote-code",
- action="store_true",
- help="Trust remote code from huggingface",
- )
- parser.add_argument(
- "--disable-tqdm",
- action="store_true",
- help="Specify to disable tqdm progress bar.",
- )
- parser.add_argument(
- "--profile",
- action="store_true",
- help="Use Torch Profiler. The endpoint must be launched with "
- "VLLM_TORCH_PROFILER_DIR to enable profiler.",
- )
- parser.add_argument(
- "--save-result",
- action="store_true",
- help="Specify to save benchmark results to a json file",
- )
- parser.add_argument(
- "--save-detailed",
- action="store_true",
- help="When saving the results, whether to include per request "
- "information such as response, error, ttfs, tpots, etc.",
- )
- parser.add_argument(
- "--append-result",
- action="store_true",
- help="Append the benchmark result to the existing json file.",
- )
- parser.add_argument(
- "--metadata",
- metavar="KEY=VALUE",
- nargs="*",
- help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) "
- "for metadata of this run to be saved in the result JSON file "
- "for record keeping purposes.",
- )
- parser.add_argument(
- "--result-dir",
- type=str,
- default=None,
- help="Specify directory to save benchmark json results."
- "If not specified, results are saved in the current directory.",
- )
- parser.add_argument(
- "--result-filename",
- type=str,
- default=None,
- help="Specify the filename to save benchmark json results."
- "If not specified, results will be saved in "
- "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
- " format.",
- )
- parser.add_argument(
- "--ignore-eos",
- action="store_true",
- help="Set ignore_eos flag when sending the benchmark request."
- "Warning: ignore_eos is not supported in deepspeed_mii and tgi.",
- )
- parser.add_argument(
- "--percentile-metrics",
- type=str,
- default="ttft,tpot,itl",
- help="Comma-separated list of selected metrics to report percentils. "
- "This argument specifies the metrics to report percentiles. "
- 'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
- 'Default value is "ttft,tpot,itl".',
- )
- parser.add_argument(
- "--metric-percentiles",
- type=str,
- default="99",
- help="Comma-separated list of percentiles for selected metrics. "
- 'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". '
- 'Default value is "99". '
- 'Use "--percentile-metrics" to select metrics.',
- )
- parser.add_argument(
- "--goodput",
- nargs="+",
- required=False,
- help='Specify service level objectives for goodput as "KEY:VALUE" '
- "pairs, where the key is a metric name, and the value is in "
- 'milliseconds. Multiple "KEY:VALUE" pairs can be provided, '
- "separated by spaces. Allowed request level metric names are "
- '"ttft", "tpot", "e2el". For more context on the definition of '
- "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
- "and the blog: https://hao-ai-lab.github.io/blogs/distserve",
- )
- parser.add_argument(
- "--request-id-prefix",
- type=str,
- required=False,
- default="benchmark-serving",
- help="Specify the prefix of request id.",
- )
-
- # group for dataset specific arguments
- custom_group = parser.add_argument_group("custom dataset options")
- custom_group.add_argument(
- "--custom-output-len",
- type=int,
- default=256,
- help="Number of output tokens per request, used only for custom dataset.",
- )
- custom_group.add_argument(
- "--custom-skip-chat-template",
- action="store_true",
- help="Skip applying chat template to prompt, used only for custom dataset.",
- )
-
- sonnet_group = parser.add_argument_group("sonnet dataset options")
- sonnet_group.add_argument(
- "--sonnet-input-len",
- type=int,
- default=550,
- help="Number of input tokens per request, used only for sonnet dataset.",
- )
- sonnet_group.add_argument(
- "--sonnet-output-len",
- type=int,
- default=150,
- help="Number of output tokens per request, used only for sonnet dataset.",
- )
- sonnet_group.add_argument(
- "--sonnet-prefix-len",
- type=int,
- default=200,
- help="Number of prefix tokens per request, used only for sonnet dataset.",
- )
-
- sharegpt_group = parser.add_argument_group("sharegpt dataset options")
- sharegpt_group.add_argument(
- "--sharegpt-output-len",
- type=int,
- default=None,
- help="Output length for each request. Overrides the output length "
- "from the ShareGPT dataset.",
- )
-
- random_group = parser.add_argument_group("random dataset options")
- random_group.add_argument(
- "--random-input-len",
- type=int,
- default=1024,
- help="Number of input tokens per request, used only for random sampling.",
- )
- random_group.add_argument(
- "--random-output-len",
- type=int,
- default=128,
- help="Number of output tokens per request, used only for random sampling.",
- )
- random_group.add_argument(
- "--random-range-ratio",
- type=float,
- default=0.0,
- help="Range ratio for sampling input/output length, "
- "used only for random sampling. Must be in the range [0, 1) to define "
- "a symmetric sampling range"
- "[length * (1 - range_ratio), length * (1 + range_ratio)].",
- )
- random_group.add_argument(
- "--random-prefix-len",
- type=int,
- default=0,
- help=(
- "Number of fixed prefix tokens before the random context "
- "in a request. "
- "The total input length is the sum of `random-prefix-len` and "
- "a random "
- "context length sampled from [input_len * (1 - range_ratio), "
- "input_len * (1 + range_ratio)]."
- ),
- )
-
- hf_group = parser.add_argument_group("hf dataset options")
- hf_group.add_argument(
- "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
- )
- hf_group.add_argument(
- "--hf-split", type=str, default=None, help="Split of the HF dataset."
- )
- hf_group.add_argument(
- "--hf-output-len",
- type=int,
- default=None,
- help="Output length for each request. Overrides the output lengths "
- "from the sampled HF dataset.",
- )
-
- sampling_group = parser.add_argument_group("sampling parameters")
- sampling_group.add_argument(
- "--top-p",
- type=float,
- default=None,
- help="Top-p sampling parameter. Only has effect on openai-compatible backends.",
- )
- sampling_group.add_argument(
- "--top-k",
- type=int,
- default=None,
- help="Top-k sampling parameter. Only has effect on openai-compatible backends.",
- )
- sampling_group.add_argument(
- "--min-p",
- type=float,
- default=None,
- help="Min-p sampling parameter. Only has effect on openai-compatible backends.",
- )
- sampling_group.add_argument(
- "--temperature",
- type=float,
- default=None,
- help="Temperature sampling parameter. Only has effect on "
- "openai-compatible backends. If not specified, default to greedy "
- "decoding (i.e. temperature==0.0).",
- )
-
- parser.add_argument(
- "--tokenizer-mode",
- type=str,
- default="auto",
- choices=["auto", "slow", "mistral", "custom"],
- help='The tokenizer mode.\n\n* "auto" will use the '
- 'fast tokenizer if available.\n* "slow" will '
- "always use the slow tokenizer. \n* "
- '"mistral" will always use the `mistral_common` tokenizer. \n*'
- '"custom" will use --tokenizer to select the preregistered tokenizer.',
- )
-
- parser.add_argument(
- "--served-model-name",
- type=str,
- default=None,
- help="The model name used in the API. "
- "If not specified, the model name will be the "
- "same as the ``--model`` argument. ",
- )
-
- parser.add_argument(
- "--lora-modules",
- nargs="+",
- default=None,
- help="A subset of LoRA module names passed in when "
- "launching the server. For each request, the "
- "script chooses a LoRA module at random.",
- )
-
- parser.add_argument(
- "--ramp-up-strategy",
- type=str,
- default=None,
- choices=["linear", "exponential"],
- help="The ramp-up strategy. This would be used to "
- "ramp up the request rate from initial RPS to final "
- "RPS rate (specified by --ramp-up-start-rps and --ramp-up-end-rps). "
- "over the duration of the benchmark.",
- )
- parser.add_argument(
- "--ramp-up-start-rps",
- type=int,
- default=None,
- help="The starting request rate for ramp-up (RPS). "
- "Needs to be specified when --ramp-up-strategy is used.",
- )
- parser.add_argument(
- "--ramp-up-end-rps",
- type=int,
- default=None,
- help="The ending request rate for ramp-up (RPS). "
- "Needs to be specified when --ramp-up-strategy is used.",
- )
-
- return parser
-
+import sys
if __name__ == "__main__":
- parser = create_argument_parser()
- args = parser.parse_args()
- main(args)
+ print("""DEPRECATED: This script has been moved to the vLLM CLI.
+
+Please use the following command instead:
+ vllm bench serve
+
+For help with the new command, run:
+ vllm bench serve --help
+
+Alternatively, you can run the new command directly with:
+ python -m vllm.entrypoints.cli.main bench serve --help
+""")
+ sys.exit(1)
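
For reference, the deleted benchmark accepted goodput service level objectives as space-separated "KEY:VALUE" pairs in milliseconds, with keys limited to "ttft", "tpot" and "e2el". The standalone sketch below mirrors the removed parse_goodput/check_goodput_args helpers; it is illustrative only and not the relocated `vllm bench serve` implementation.

# Standalone sketch mirroring the removed parse_goodput/check_goodput_args.
# Values are interpreted as milliseconds; names outside VALID_NAMES are rejected.
VALID_NAMES = ["ttft", "tpot", "e2el"]


def parse_goodput(slo_pairs: list[str]) -> dict[str, float]:
    config: dict[str, float] = {}
    for pair in slo_pairs:
        name, value = pair.split(":")
        config[name] = float(value)
    return config


def check_goodput(config: dict[str, float]) -> dict[str, float]:
    for name, value in config.items():
        if name not in VALID_NAMES:
            raise ValueError(f"Invalid metric name: {name}")
        if value < 0:
            raise ValueError(f"SLO value must be non-negative: {name}={value}")
    return config


# e.g. --goodput ttft:500 tpot:50 e2el:2000
print(check_goodput(parse_goodput(["ttft:500", "tpot:50", "e2el:2000"])))
# -> {'ttft': 500.0, 'tpot': 50.0, 'e2el': 2000.0}
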
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index ca6843a72aa36..58b9767d09390 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -37,14 +37,13 @@ from typing import Optional
import datasets
import numpy as np
import pandas as pd
-from tqdm.asyncio import tqdm
-from transformers import PreTrainedTokenizerBase
-
from backend_request_func import (
ASYNC_REQUEST_FUNCS,
RequestFuncInput,
RequestFuncOutput,
)
+from tqdm.asyncio import tqdm
+from transformers import PreTrainedTokenizerBase
try:
from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -449,7 +448,8 @@ async def benchmark(
def prepare_extra_body(request) -> dict:
extra_body = {}
# Add the schema to the extra_body
- extra_body[request.structure_type] = request.schema
+ extra_body["structured_outputs"] = {}
+ extra_body["structured_outputs"][request.structure_type] = request.schema
return extra_body
print("Starting initial single prompt test run...")
@@ -696,11 +696,11 @@ def evaluate(ret, args):
return re.match(args.regex, actual) is not None
def _eval_correctness(expected, actual):
- if args.structure_type == "guided_json":
+ if args.structure_type == "json":
return _eval_correctness_json(expected, actual)
- elif args.structure_type == "guided_regex":
+ elif args.structure_type == "regex":
return _eval_correctness_regex(expected, actual)
- elif args.structure_type == "guided_choice":
+ elif args.structure_type == "choice":
return _eval_correctness_choice(expected, actual)
else:
return None
@@ -780,18 +780,18 @@ def main(args: argparse.Namespace):
)
if args.dataset == "grammar":
- args.structure_type = "guided_grammar"
+ args.structure_type = "grammar"
elif args.dataset == "regex":
- args.structure_type = "guided_regex"
+ args.structure_type = "regex"
elif args.dataset == "choice":
- args.structure_type = "guided_choice"
+ args.structure_type = "choice"
else:
- args.structure_type = "guided_json"
+ args.structure_type = "json"
if args.no_structured_output:
args.structured_output_ratio = 0
if args.save_results:
- result_file_name = f"{args.structured_output_ratio}guided"
+ result_file_name = f"{args.structured_output_ratio}so"
result_file_name += f"_{backend}"
result_file_name += f"_{args.request_rate}qps"
result_file_name += f"_{args.model.split('/')[-1]}"
@@ -909,13 +909,13 @@ def create_argument_parser():
parser.add_argument(
"--tokenizer",
type=str,
- help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
+ help="Name or path of the tokenizer, if not using the default tokenizer.",
)
parser.add_argument(
"--tokenizer-mode",
type=str,
default="auto",
- help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
+ help="Name or path of the tokenizer, if not using the default tokenizer.",
)
parser.add_argument(
"--num-prompts",
@@ -998,7 +998,7 @@ def create_argument_parser():
"--percentile-metrics",
type=str,
default="ttft,tpot,itl",
- help="Comma-separated list of selected metrics to report percentils. "
+ help="Comma-separated list of selected metrics to report percentiles. "
"This argument specifies the metrics to report percentiles. "
'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
'Default value is "ttft,tpot,itl".',
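
To make the payload change in this file concrete, here is a minimal sketch of the extra_body layout before and after the patch. The schema value is hypothetical; only the key structure is taken from the diff.

# Sketch of the request extra_body layout before and after this change.
schema = {"type": "object", "properties": {"name": {"type": "string"}}}

# Before: the structure type was a top-level "guided_*" key.
old_extra_body = {"guided_json": schema}

# After: plain structure-type names ("json", "regex", "choice", "grammar")
# are nested under a single "structured_outputs" key.
new_extra_body = {"structured_outputs": {"json": schema}}
print(old_extra_body)
print(new_extra_body)
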
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index c7f290e1eb88e..b6dc0918fd4d1 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -1,742 +1,17 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Benchmark offline inference throughput."""
-
-import argparse
-import dataclasses
-import json
-import os
-import random
-import time
-import warnings
-from typing import Any, Optional, Union
-
-import torch
-import uvloop
-from tqdm import tqdm
-from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
-from typing_extensions import deprecated
-
-from benchmark_dataset import (
- AIMODataset,
- BurstGPTDataset,
- ConversationDataset,
- InstructCoderDataset,
- RandomDataset,
- SampleRequest,
- ShareGPTDataset,
- SonnetDataset,
- VisionArenaDataset,
-)
-from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
-from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
-from vllm.entrypoints.openai.api_server import (
- build_async_engine_client_from_engine_args,
-)
-from vllm.inputs import TextPrompt, TokensPrompt
-from vllm.lora.request import LoRARequest
-from vllm.outputs import RequestOutput
-from vllm.sampling_params import BeamSearchParams
-from vllm.utils import FlexibleArgumentParser, merge_async_iterators
-
-
-def run_vllm(
- requests: list[SampleRequest],
- n: int,
- engine_args: EngineArgs,
- disable_detokenize: bool = False,
-) -> tuple[float, Optional[list[RequestOutput]]]:
- from vllm import LLM, SamplingParams
-
- llm = LLM(**dataclasses.asdict(engine_args))
- assert all(
- llm.llm_engine.model_config.max_model_len
- >= (request.prompt_len + request.expected_output_len)
- for request in requests
- ), (
- "Please ensure that max_model_len is greater than the sum of"
- " prompt_len and expected_output_len for all requests."
- )
- # Add the requests to the engine.
- prompts: list[Union[TextPrompt, TokensPrompt]] = []
- sampling_params: list[SamplingParams] = []
- for request in requests:
- prompts.append(
- TokensPrompt(
- prompt_token_ids=request.prompt["prompt_token_ids"],
- multi_modal_data=request.multi_modal_data,
- )
- if "prompt_token_ids" in request.prompt
- else TextPrompt(
- prompt=request.prompt, multi_modal_data=request.multi_modal_data
- )
- )
- sampling_params.append(
- SamplingParams(
- n=n,
- temperature=1.0,
- top_p=1.0,
- ignore_eos=True,
- max_tokens=request.expected_output_len,
- detokenize=not disable_detokenize,
- )
- )
- lora_requests: Optional[list[LoRARequest]] = None
- if engine_args.enable_lora:
- lora_requests = [request.lora_request for request in requests]
-
- use_beam_search = False
-
- outputs = None
- if not use_beam_search:
- start = time.perf_counter()
- outputs = llm.generate(
- prompts, sampling_params, lora_request=lora_requests, use_tqdm=True
- )
- end = time.perf_counter()
- else:
- assert lora_requests is None, "BeamSearch API does not support LoRA"
- prompts = [request.prompt for request in requests]
- # output_len should be the same for all requests.
- output_len = requests[0].expected_output_len
- for request in requests:
- assert request.expected_output_len == output_len
- start = time.perf_counter()
- llm.beam_search(
- prompts,
- BeamSearchParams(
- beam_width=n,
- max_tokens=output_len,
- ignore_eos=True,
- ),
- )
- end = time.perf_counter()
- return end - start, outputs
-
-
-def run_vllm_chat(
- requests: list[SampleRequest],
- n: int,
- engine_args: EngineArgs,
- disable_detokenize: bool = False,
-) -> tuple[float, list[RequestOutput]]:
- """
- Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
- multimodal models as it properly handles multimodal inputs and chat
- formatting. For non-multimodal models, use run_vllm() instead.
- """
- from vllm import LLM, SamplingParams
-
- llm = LLM(**dataclasses.asdict(engine_args))
-
- assert all(
- llm.llm_engine.model_config.max_model_len
- >= (request.prompt_len + request.expected_output_len)
- for request in requests
- ), (
- "Please ensure that max_model_len is greater than the sum of "
- "prompt_len and expected_output_len for all requests."
- )
-
- prompts = []
- sampling_params: list[SamplingParams] = []
- for request in requests:
- prompts.append(request.prompt)
- sampling_params.append(
- SamplingParams(
- n=n,
- temperature=1.0,
- top_p=1.0,
- ignore_eos=True,
- max_tokens=request.expected_output_len,
- detokenize=not disable_detokenize,
- )
- )
- start = time.perf_counter()
- outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
- end = time.perf_counter()
- return end - start, outputs
-
-
-async def run_vllm_async(
- requests: list[SampleRequest],
- n: int,
- engine_args: AsyncEngineArgs,
- disable_frontend_multiprocessing: bool = False,
- disable_detokenize: bool = False,
-) -> float:
- from vllm import SamplingParams
-
- async with build_async_engine_client_from_engine_args(
- engine_args,
- disable_frontend_multiprocessing=disable_frontend_multiprocessing,
- ) as llm:
- model_config = await llm.get_model_config()
- assert all(
- model_config.max_model_len
- >= (request.prompt_len + request.expected_output_len)
- for request in requests
- ), (
- "Please ensure that max_model_len is greater than the sum of"
- " prompt_len and expected_output_len for all requests."
- )
-
- # Add the requests to the engine.
- prompts: list[Union[TextPrompt, TokensPrompt]] = []
- sampling_params: list[SamplingParams] = []
- lora_requests: list[Optional[LoRARequest]] = []
- for request in requests:
- prompts.append(
- TokensPrompt(
- prompt_token_ids=request.prompt["prompt_token_ids"],
- multi_modal_data=request.multi_modal_data,
- )
- if "prompt_token_ids" in request.prompt
- else TextPrompt(
- prompt=request.prompt, multi_modal_data=request.multi_modal_data
- )
- )
- sampling_params.append(
- SamplingParams(
- n=n,
- temperature=1.0,
- top_p=1.0,
- ignore_eos=True,
- max_tokens=request.expected_output_len,
- detokenize=not disable_detokenize,
- )
- )
- lora_requests.append(request.lora_request)
-
- generators = []
- start = time.perf_counter()
- for i, (prompt, sp, lr) in enumerate(
- zip(prompts, sampling_params, lora_requests)
- ):
- generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}")
- generators.append(generator)
- all_gens = merge_async_iterators(*generators)
- async for i, res in all_gens:
- pass
- end = time.perf_counter()
- return end - start
-
-
-def run_hf(
- requests: list[SampleRequest],
- model: str,
- tokenizer: PreTrainedTokenizerBase,
- n: int,
- max_batch_size: int,
- trust_remote_code: bool,
- disable_detokenize: bool = False,
-) -> float:
- llm = AutoModelForCausalLM.from_pretrained(
- model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
- )
- if llm.config.model_type == "llama":
- # To enable padding in the HF backend.
- tokenizer.pad_token = tokenizer.eos_token
- llm = llm.cuda()
-
- pbar = tqdm(total=len(requests))
- start = time.perf_counter()
- batch: list[str] = []
- max_prompt_len = 0
- max_output_len = 0
- for i in range(len(requests)):
- prompt = requests[i].prompt
- prompt_len = requests[i].prompt_len
- output_len = requests[i].expected_output_len
- # Add the prompt to the batch.
- batch.append(prompt)
- max_prompt_len = max(max_prompt_len, prompt_len)
- max_output_len = max(max_output_len, output_len)
- if len(batch) < max_batch_size and i != len(requests) - 1:
- # Check if we can add more requests to the batch.
- next_prompt_len = requests[i + 1].prompt_len
- next_output_len = requests[i + 1].expected_output_len
- if (
- max(max_prompt_len, next_prompt_len)
- + max(max_output_len, next_output_len)
- ) <= 2048:
- # We can add more requests to the batch.
- continue
-
- # Generate the sequences.
- input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids
- llm_outputs = llm.generate(
- input_ids=input_ids.cuda(),
- do_sample=True,
- num_return_sequences=n,
- temperature=1.0,
- top_p=1.0,
- use_cache=True,
- max_new_tokens=max_output_len,
- )
- if not disable_detokenize:
- # Include the decoding time.
- tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
- pbar.update(len(batch))
-
- # Clear the batch.
- batch = []
- max_prompt_len = 0
- max_output_len = 0
- end = time.perf_counter()
- return end - start
-
-
-def run_mii(
- requests: list[SampleRequest],
- model: str,
- tensor_parallel_size: int,
- output_len: int,
-) -> float:
- from mii import client, serve
-
- llm = serve(model, tensor_parallel=tensor_parallel_size)
- prompts = [request.prompt for request in requests]
-
- start = time.perf_counter()
- llm.generate(prompts, max_new_tokens=output_len)
- end = time.perf_counter()
- client = client(model)
- client.terminate_server()
- return end - start
-
-
-def save_to_pytorch_benchmark_format(
- args: argparse.Namespace, results: dict[str, Any]
-) -> None:
- pt_records = convert_to_pytorch_benchmark_format(
- args=args,
- metrics={
- "requests_per_second": [results["requests_per_second"]],
- "tokens_per_second": [results["tokens_per_second"]],
- },
- extra_info={
- k: results[k] for k in ["elapsed_time", "num_requests", "total_num_tokens"]
- },
- )
- if pt_records:
- # Don't use json suffix here as we don't want CI to pick it up
- pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
- write_to_json(pt_file, pt_records)
-
-
-def get_requests(args, tokenizer):
- # Common parameters for all dataset types.
- common_kwargs = {
- "dataset_path": args.dataset_path,
- "random_seed": args.seed,
- }
- sample_kwargs = {
- "tokenizer": tokenizer,
- "lora_path": args.lora_path,
- "max_loras": args.max_loras,
- "num_requests": args.num_prompts,
- "input_len": args.input_len,
- "output_len": args.output_len,
- }
-
- if args.dataset_path is None or args.dataset_name == "random":
- sample_kwargs["range_ratio"] = args.random_range_ratio
- sample_kwargs["prefix_len"] = args.prefix_len
- dataset_cls = RandomDataset
- elif args.dataset_name == "sharegpt":
- dataset_cls = ShareGPTDataset
- if args.backend == "vllm-chat":
- sample_kwargs["enable_multimodal_chat"] = True
- elif args.dataset_name == "sonnet":
- assert tokenizer.chat_template or tokenizer.default_chat_template, (
- "Tokenizer/model must have chat template for sonnet dataset."
- )
- dataset_cls = SonnetDataset
- sample_kwargs["prefix_len"] = args.prefix_len
- sample_kwargs["return_prompt_formatted"] = True
- elif args.dataset_name == "burstgpt":
- dataset_cls = BurstGPTDataset
- elif args.dataset_name == "hf":
- common_kwargs["no_stream"] = args.no_stream
- if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
- dataset_cls = VisionArenaDataset
- common_kwargs["dataset_subset"] = None
- common_kwargs["dataset_split"] = "train"
- sample_kwargs["enable_multimodal_chat"] = True
- elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
- dataset_cls = InstructCoderDataset
- common_kwargs["dataset_split"] = "train"
- elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
- dataset_cls = ConversationDataset
- common_kwargs["dataset_subset"] = args.hf_subset
- common_kwargs["dataset_split"] = args.hf_split
- sample_kwargs["enable_multimodal_chat"] = True
- elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
- dataset_cls = AIMODataset
- common_kwargs["dataset_subset"] = None
- common_kwargs["dataset_split"] = "train"
- else:
- raise ValueError(f"Unknown dataset name: {args.dataset_name}")
- # Remove None values
- sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None}
- return dataset_cls(**common_kwargs).sample(**sample_kwargs)
-
-
-@deprecated(
- "benchmark_throughput.py is deprecated and will be removed in a "
- "future version. Please use 'vllm bench throughput' instead.",
-)
-def main(args: argparse.Namespace):
- if args.seed is None:
- args.seed = 0
- print(args)
- random.seed(args.seed)
- # Sample the requests.
- tokenizer = AutoTokenizer.from_pretrained(
- args.tokenizer, trust_remote_code=args.trust_remote_code
- )
- requests = get_requests(args, tokenizer)
- is_multi_modal = any(request.multi_modal_data is not None for request in requests)
- request_outputs: Optional[list[RequestOutput]] = None
- if args.backend == "vllm":
- if args.async_engine:
- elapsed_time = uvloop.run(
- run_vllm_async(
- requests,
- args.n,
- AsyncEngineArgs.from_cli_args(args),
- args.disable_frontend_multiprocessing,
- args.disable_detokenize,
- )
- )
- else:
- elapsed_time, request_outputs = run_vllm(
- requests,
- args.n,
- EngineArgs.from_cli_args(args),
- args.disable_detokenize,
- )
- elif args.backend == "hf":
- assert args.tensor_parallel_size == 1
- elapsed_time = run_hf(
- requests,
- args.model,
- tokenizer,
- args.n,
- args.hf_max_batch_size,
- args.trust_remote_code,
- args.disable_detokenize,
- )
- elif args.backend == "mii":
- elapsed_time = run_mii(
- requests, args.model, args.tensor_parallel_size, args.output_len
- )
- elif args.backend == "vllm-chat":
- elapsed_time, request_outputs = run_vllm_chat(
- requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize
- )
- else:
- raise ValueError(f"Unknown backend: {args.backend}")
-
- if request_outputs:
- # Note: with the vllm and vllm-chat backends,
- # we have request_outputs, which we use to count tokens.
- total_prompt_tokens = 0
- total_output_tokens = 0
- for ro in request_outputs:
- if not isinstance(ro, RequestOutput):
- continue
- total_prompt_tokens += (
- len(ro.prompt_token_ids) if ro.prompt_token_ids else 0
- )
- total_output_tokens += sum(len(o.token_ids) for o in ro.outputs if o)
- total_num_tokens = total_prompt_tokens + total_output_tokens
- else:
- total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests)
- total_output_tokens = sum(r.expected_output_len for r in requests)
- total_prompt_tokens = total_num_tokens - total_output_tokens
-
- if is_multi_modal and args.backend != "vllm-chat":
- print(
- "\033[91mWARNING\033[0m: Multi-modal request with "
- f"{args.backend} backend detected. The "
- "following metrics are not accurate because image tokens are not"
- " counted. See vllm-project/vllm/issues/9778 for details."
- )
- # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
- # vllm-chat backend counts the image tokens now
-
- print(
- f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
- f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
- f"{total_output_tokens / elapsed_time:.2f} output tokens/s"
- )
- print(f"Total num prompt tokens: {total_prompt_tokens}")
- print(f"Total num output tokens: {total_output_tokens}")
-
- # Output JSON results if specified
- if args.output_json:
- results = {
- "elapsed_time": elapsed_time,
- "num_requests": len(requests),
- "total_num_tokens": total_num_tokens,
- "requests_per_second": len(requests) / elapsed_time,
- "tokens_per_second": total_num_tokens / elapsed_time,
- }
- with open(args.output_json, "w") as f:
- json.dump(results, f, indent=4)
- save_to_pytorch_benchmark_format(args, results)
-
-
-def validate_args(args):
- """
- Validate command-line arguments.
- """
-
- # === Deprecation and Defaulting ===
- if args.dataset is not None:
- warnings.warn(
- "The '--dataset' argument will be deprecated in the next release. "
- "Please use '--dataset-name' and '--dataset-path' instead.",
- stacklevel=2,
- )
- args.dataset_path = args.dataset
-
- if not getattr(args, "tokenizer", None):
- args.tokenizer = args.model
-
- # === Backend Validation ===
- valid_backends = {"vllm", "hf", "mii", "vllm-chat"}
- if args.backend not in valid_backends:
- raise ValueError(f"Unsupported backend: {args.backend}")
-
- # === Dataset Configuration ===
- if not args.dataset and not args.dataset_path:
- print("When dataset path is not set, it will default to random dataset")
- args.dataset_name = "random"
- if args.input_len is None:
- raise ValueError("input_len must be provided for a random dataset")
-
- # === Dataset Name Specific Checks ===
- # --hf-subset and --hf-split: only used
- # when dataset_name is 'hf'
- if args.dataset_name != "hf" and (
- getattr(args, "hf_subset", None) is not None
- or getattr(args, "hf_split", None) is not None
- ):
- warnings.warn(
- "--hf-subset and --hf-split will be ignored \
- since --dataset-name is not 'hf'.",
- stacklevel=2,
- )
- elif args.dataset_name == "hf":
- if args.dataset_path in (
- VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
- | ConversationDataset.SUPPORTED_DATASET_PATHS
- ):
- assert args.backend == "vllm-chat", (
- f"{args.dataset_path} needs to use vllm-chat as the backend."
- ) # noqa: E501
- elif args.dataset_path in (
- InstructCoderDataset.SUPPORTED_DATASET_PATHS
- | AIMODataset.SUPPORTED_DATASET_PATHS
- ):
- assert args.backend == "vllm", (
- f"{args.dataset_path} needs to use vllm as the backend."
- ) # noqa: E501
- else:
- raise ValueError(f"{args.dataset_path} is not supported by hf dataset.")
-
- # --random-range-ratio: only used when dataset_name is 'random'
- if args.dataset_name != "random" and args.random_range_ratio is not None:
- warnings.warn(
- "--random-range-ratio will be ignored since \
- --dataset-name is not 'random'.",
- stacklevel=2,
- )
-
- # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
- # set.
- if (
- args.dataset_name not in {"random", "sonnet", None}
- and args.prefix_len is not None
- ):
- warnings.warn(
- "--prefix-len will be ignored since --dataset-name\
- is not 'random', 'sonnet', or not set.",
- stacklevel=2,
- )
-
- # === LoRA Settings ===
- if getattr(args, "enable_lora", False) and args.backend != "vllm":
- raise ValueError("LoRA benchmarking is only supported for vLLM backend")
- if getattr(args, "enable_lora", False) and args.lora_path is None:
- raise ValueError("LoRA path must be provided when enable_lora is True")
-
- # === Backend-specific Validations ===
- if args.backend == "hf" and args.hf_max_batch_size is None:
- raise ValueError("HF max batch size is required for HF backend")
- if args.backend != "hf" and args.hf_max_batch_size is not None:
- raise ValueError("HF max batch size is only for HF backend.")
-
- if (
- args.backend in {"hf", "mii"}
- and getattr(args, "quantization", None) is not None
- ):
- raise ValueError("Quantization is only for vLLM backend.")
-
- if args.backend == "mii" and args.dtype != "auto":
- raise ValueError("dtype must be auto for MII backend.")
- if args.backend == "mii" and args.n != 1:
- raise ValueError("n must be 1 for MII backend.")
- if args.backend == "mii" and args.tokenizer != args.model:
- raise ValueError("Tokenizer must be the same as the model for MII backend.")
-
- # --data-parallel is not supported currently.
- # https://github.com/vllm-project/vllm/issues/16222
- if args.data_parallel_size > 1:
- raise ValueError(
- "Data parallel is not supported in offline benchmark, "
- "please use benchmark serving instead"
- )
-
-
-def create_argument_parser():
- parser = FlexibleArgumentParser(description="Benchmark the throughput.")
- parser.add_argument(
- "--backend",
- type=str,
- choices=["vllm", "hf", "mii", "vllm-chat"],
- default="vllm",
- )
- parser.add_argument(
- "--dataset-name",
- type=str,
- choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
- help="Name of the dataset to benchmark on.",
- default="sharegpt",
- )
- parser.add_argument(
- "--no-stream",
- action="store_true",
- help="Do not load the dataset in streaming mode.",
- )
- parser.add_argument(
- "--dataset",
- type=str,
- default=None,
- help="Path to the ShareGPT dataset, will be deprecated in\
- the next release. The dataset is expected to "
- "be a json in form of list[dict[..., conversations: "
- "list[dict[..., value: ]]]]",
- )
- parser.add_argument(
- "--dataset-path", type=str, default=None, help="Path to the dataset"
- )
- parser.add_argument(
- "--input-len",
- type=int,
- default=None,
- help="Input prompt length for each request",
- )
- parser.add_argument(
- "--output-len",
- type=int,
- default=None,
- help="Output length for each request. Overrides the "
- "output length from the dataset.",
- )
- parser.add_argument(
- "--n", type=int, default=1, help="Number of generated sequences per prompt."
- )
- parser.add_argument(
- "--num-prompts", type=int, default=1000, help="Number of prompts to process."
- )
- parser.add_argument(
- "--hf-max-batch-size",
- type=int,
- default=None,
- help="Maximum batch size for HF backend.",
- )
- parser.add_argument(
- "--output-json",
- type=str,
- default=None,
- help="Path to save the throughput results in JSON format.",
- )
- parser.add_argument(
- "--async-engine",
- action="store_true",
- default=False,
- help="Use vLLM async engine rather than LLM class.",
- )
- parser.add_argument(
- "--disable-frontend-multiprocessing",
- action="store_true",
- default=False,
- help="Disable decoupled async engine frontend.",
- )
- parser.add_argument(
- "--disable-detokenize",
- action="store_true",
- help=(
- "Do not detokenize the response (i.e. do not include "
- "detokenization time in the measurement)"
- ),
- )
- # LoRA
- parser.add_argument(
- "--lora-path",
- type=str,
- default=None,
- help="Path to the LoRA adapters to use. This can be an absolute path, "
- "a relative path, or a Hugging Face model identifier.",
- )
- parser.add_argument(
- "--prefix-len",
- type=int,
- default=None,
- help=f"Number of prefix tokens to be used in RandomDataset "
- "and SonnetDataset. For RandomDataset, the total input "
- "length is the sum of prefix-len (default: "
- f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length "
- "sampled from [input_len * (1 - range_ratio), "
- "input_len * (1 + range_ratio)]. For SonnetDataset, "
- f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) "
- "controls how much of the input is fixed lines versus "
- "random lines, but the total input length remains approximately "
- "input_len tokens.",
- )
- # random dataset
- parser.add_argument(
- "--random-range-ratio",
- type=float,
- default=None,
- help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) "
- "for sampling input/output length, "
- "used only for RandomDataset. Must be in the range [0, 1) to "
- "define a symmetric sampling range "
- "[length * (1 - range_ratio), length * (1 + range_ratio)].",
- )
-
- # hf dtaset
- parser.add_argument(
- "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
- )
- parser.add_argument(
- "--hf-split", type=str, default=None, help="Split of the HF dataset."
- )
-
- parser = AsyncEngineArgs.add_cli_args(parser)
-
- return parser
-
+import sys
if __name__ == "__main__":
- parser = create_argument_parser()
- args = parser.parse_args()
- if args.tokenizer is None:
- args.tokenizer = args.model
- validate_args(args)
- main(args)
+ print("""DEPRECATED: This script has been moved to the vLLM CLI.
+
+Please use the following command instead:
+ vllm bench throughput
+
+For help with the new command, run:
+ vllm bench throughput --help
+
+Alternatively, you can run the new command directly with:
+ python -m vllm.entrypoints.cli.main bench throughput --help
+""")
+ sys.exit(1)
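
The deleted script reported requests/s, total tokens/s and output tokens/s as simple ratios over the measured wall-clock time. A small sketch of those formulas with made-up numbers; the relocated `vllm bench throughput` command is assumed to report the same metrics.

# Formulas from the removed throughput report, with made-up example numbers.
num_requests = 1000
total_prompt_tokens = 200_000
total_output_tokens = 128_000
elapsed_time = 50.0  # seconds

total_num_tokens = total_prompt_tokens + total_output_tokens
print(
    f"Throughput: {num_requests / elapsed_time:.2f} requests/s, "
    f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
    f"{total_output_tokens / elapsed_time:.2f} output tokens/s"
)
# -> Throughput: 20.00 requests/s, 6560.00 total tokens/s, 2560.00 output tokens/s
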
diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
index a5a5b52f60397..02f8c593392c4 100644
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -17,7 +17,7 @@ from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
- w8a8_block_fp8_matmul,
+ w8a8_triton_block_scaled_mm,
)
from vllm.utils import FlexibleArgumentParser, cdiv
@@ -158,7 +158,7 @@ def bench_fp8(
"cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
),
- "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
+ "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm(
a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
),
"cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
index 92f97ffabea2a..d683835db96a4 100644
--- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@@ -55,24 +55,20 @@ benchmark() {
output_len=$2
- CUDA_VISIBLE_DEVICES=0 python3 \
- -m vllm.entrypoints.openai.api_server \
- --model $model \
+ CUDA_VISIBLE_DEVICES=0 vllm serve $model \
--port 8100 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
- '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
- CUDA_VISIBLE_DEVICES=1 python3 \
- -m vllm.entrypoints.openai.api_server \
- --model $model \
+ CUDA_VISIBLE_DEVICES=1 vllm serve $model \
--port 8200 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
- '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
wait_for_server 8100
wait_for_server 8200
diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
index af2bcba3ea57a..35c86cc845221 100644
--- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@@ -38,16 +38,12 @@ wait_for_server() {
launch_chunked_prefill() {
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
# disagg prefill
- CUDA_VISIBLE_DEVICES=0 python3 \
- -m vllm.entrypoints.openai.api_server \
- --model $model \
+ CUDA_VISIBLE_DEVICES=0 vllm serve $model \
--port 8100 \
--max-model-len 10000 \
--enable-chunked-prefill \
--gpu-memory-utilization 0.6 &
- CUDA_VISIBLE_DEVICES=1 python3 \
- -m vllm.entrypoints.openai.api_server \
- --model $model \
+ CUDA_VISIBLE_DEVICES=1 vllm serve $model \
--port 8200 \
--max-model-len 10000 \
--enable-chunked-prefill \
@@ -62,23 +58,19 @@ launch_chunked_prefill() {
launch_disagg_prefill() {
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
# disagg prefill
- CUDA_VISIBLE_DEVICES=0 python3 \
- -m vllm.entrypoints.openai.api_server \
- --model $model \
+ CUDA_VISIBLE_DEVICES=0 vllm serve $model \
--port 8100 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
- '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
- CUDA_VISIBLE_DEVICES=1 python3 \
- -m vllm.entrypoints.openai.api_server \
- --model $model \
+ CUDA_VISIBLE_DEVICES=1 vllm serve $model \
--port 8200 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
- '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
wait_for_server 8100
wait_for_server 8200
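
In both disaggregated-prefill scripts the producer and consumer differ only in kv_role and kv_rank; the connector, parallel size and buffer size are shared. The Python sketch below reconstructs the two --kv-transfer-config JSON payloads for illustration only; the shell scripts pass the JSON literals directly on the command line.

# Reconstructs the two --kv-transfer-config JSON payloads used above.
# Field values are copied from the scripts; json.dumps is only illustrative.
import json

common = {
    "kv_connector": "P2pNcclConnector",
    "kv_parallel_size": 2,
    "kv_buffer_size": 5e9,
}
producer = json.dumps({**common, "kv_role": "kv_producer", "kv_rank": 0})
consumer = json.dumps({**common, "kv_role": "kv_consumer", "kv_rank": 1})
print(producer)
print(consumer)
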
diff --git a/benchmarks/kernels/bench_block_fp8_gemm.py b/benchmarks/kernels/bench_block_fp8_gemm.py
new file mode 100644
index 0000000000000..f1e504499eaf6
--- /dev/null
+++ b/benchmarks/kernels/bench_block_fp8_gemm.py
@@ -0,0 +1,145 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+ apply_w8a8_block_fp8_linear,
+)
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+ CUTLASS_BLOCK_FP8_SUPPORTED,
+)
+from vllm.platforms import current_platform
+from vllm.triton_utils import triton as vllm_triton
+
+assert current_platform.is_cuda(), (
+ "Only support benchmarking w8a8 block fp8 kernel on CUDA device."
+)
+
+# DeepSeek-V3 weight shapes
+DEEPSEEK_V3_SHAPES = [
+ (512 + 64, 7168),
+ (2112, 7168),
+ ((128 + 64) * 128, 7168),
+ (128 * (128 + 128), 512),
+ (7168, 16384),
+ (7168, 18432),
+ (18432 * 2, 7168),
+ (24576, 1536),
+ (12288, 7168),
+ (4096, 7168),
+ (7168, 2048),
+]
+
+
+def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
+ """Build runner function for w8a8 block fp8 matmul."""
+ factor_for_scale = 1e-2
+
+ fp8_info = torch.finfo(torch.float8_e4m3fn)
+ fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+    # Create random input and weight tensors; weights are cast to FP8 below
+ A_ref = (torch.rand(M, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
+
+ B_ref = (torch.rand(N, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
+ B = B_ref.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+ # Create scales
+ block_n, block_k = block_size[0], block_size[1]
+ n_tiles = (N + block_n - 1) // block_n
+ k_tiles = (K + block_k - 1) // block_k
+
+ Bs = (
+ torch.rand(n_tiles, k_tiles, dtype=torch.float32, device=device)
+ * factor_for_scale
+ )
+
+ # SM90 CUTLASS requires row-major format for scales
+ if use_cutlass and current_platform.is_device_capability(90):
+ Bs = Bs.T.contiguous()
+
+ def run():
+ if use_cutlass:
+ return apply_w8a8_block_fp8_linear(
+ A_ref, B, block_size, Bs, cutlass_block_fp8_supported=True
+ )
+ else:
+ return apply_w8a8_block_fp8_linear(
+ A_ref, B, block_size, Bs, cutlass_block_fp8_supported=False
+ )
+
+ return run
+
+
+# Determine available providers
+available_providers = ["torch-bf16", "w8a8-block-fp8-triton"]
+plot_title = "BF16 vs W8A8 Block FP8 GEMMs"
+
+if CUTLASS_BLOCK_FP8_SUPPORTED:
+ available_providers.append("w8a8-block-fp8-cutlass")
+
+
+@vllm_triton.testing.perf_report(
+ vllm_triton.testing.Benchmark(
+ x_names=["batch_size"],
+ x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
+ x_log=False,
+ line_arg="provider",
+ line_vals=available_providers,
+ line_names=available_providers,
+ ylabel="TFLOP/s (larger is better)",
+ plot_name="BF16 vs W8A8 Block FP8 GEMMs",
+ args={},
+ )
+)
+def benchmark_tflops(batch_size, provider, N, K, block_size=(128, 128)):
+ M = batch_size
+ device = "cuda"
+
+ quantiles = [0.5, 0.2, 0.8]
+
+ if provider == "torch-bf16":
+ a = torch.randn((M, K), device=device, dtype=torch.bfloat16)
+ b = torch.randn((N, K), device=device, dtype=torch.bfloat16)
+ ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
+ lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
+ )
+ elif provider == "w8a8-block-fp8-triton":
+ run_w8a8_triton = build_w8a8_block_fp8_runner(
+ M, N, K, block_size, device, use_cutlass=False
+ )
+ ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
+ lambda: run_w8a8_triton(), quantiles=quantiles
+ )
+ elif provider == "w8a8-block-fp8-cutlass":
+ run_w8a8_cutlass = build_w8a8_block_fp8_runner(
+ M, N, K, block_size, device, use_cutlass=True
+ )
+ ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
+ lambda: run_w8a8_cutlass(), quantiles=quantiles
+ )
+ else:
+ raise ValueError(f"Unknown provider: {provider}")
+
+ to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
+ return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
+
+
+if __name__ == "__main__":
+ block_size = (128, 128)
+
+ for N, K in DEEPSEEK_V3_SHAPES:
+ print(f"\nBenchmarking DeepSeek-V3, N={N} K={K}")
+
+ print(f"TFLOP/s comparison (block_size={block_size}):")
+ benchmark_tflops.run(
+ print_data=True,
+ # show_plots=False,
+ # save_path=f"bench_w8a8_block_fp8_tflops_n{N}_k{K}",
+ N=N,
+ K=K,
+ block_size=block_size,
+ )
+
+ print("\nBenchmark finished!")
diff --git a/benchmarks/kernels/bench_mxfp4_qutlass.py b/benchmarks/kernels/bench_mxfp4_qutlass.py
new file mode 100644
index 0000000000000..dfc7721876a17
--- /dev/null
+++ b/benchmarks/kernels/bench_mxfp4_qutlass.py
@@ -0,0 +1,191 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import argparse
+import copy
+import itertools
+
+import torch
+from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
+from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
+from vllm.triton_utils import triton
+
+PROVIDER_CFGS = {
+ "torch-bf16": dict(enabled=True),
+ "mxfp4": dict(no_a_quant=False, enabled=True),
+ "mxfp4-noquant": dict(no_a_quant=True, enabled=True),
+}
+
+_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
+
+
+def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
+ return (
+ deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
+ * group_size**-0.5
+ )
+
+
+def _quant_weight_mxfp4(
+ b: torch.Tensor, forward_hadamard_matrix: torch.Tensor, device: str
+):
+ weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeMx(
+ b, forward_hadamard_matrix, method="abs_max"
+ )
+ weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton")
+ return weight_hf_e2m1, weight_hf_scale_block
+
+
+def build_mxfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device):
+ weight_hf_e2m1, weight_hf_scale_block = _quant_weight_mxfp4(
+ b, forward_hadamard_matrix, device
+ )
+ alpha = torch.tensor([1.0], device="cuda")
+
+ if cfg["no_a_quant"]:
+ # Pre-quantize activation
+ input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
+ a, forward_hadamard_matrix, method="abs_max"
+ )
+ input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
+
+ def run():
+ return matmul_mxf4_bf16_tn(
+ input_hf_e2m1,
+ weight_hf_e2m1,
+ input_hf_scale_block,
+ weight_hf_scale_block,
+ alpha,
+ )
+
+ return run
+
+ # Quantize activation on-the-fly
+ def run():
+ input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
+ a, forward_hadamard_matrix, method="abs_max"
+ )
+ input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
+ return matmul_mxf4_bf16_tn(
+ input_hf_e2m1,
+ weight_hf_e2m1,
+ input_hf_scale_block,
+ weight_hf_scale_block,
+ alpha,
+ )
+
+ return run
+
+
+@triton.testing.perf_report(
+ triton.testing.Benchmark(
+ x_names=["batch_size"],
+ x_vals=[
+ 1,
+ 4,
+ 8,
+ 16,
+ 32,
+ 64,
+ 128,
+ 256,
+ 512,
+ 1024,
+ 2048,
+ 4096,
+ 8192,
+ 16384,
+ 24576,
+ 32768,
+ ],
+ x_log=False,
+ line_arg="provider",
+ line_vals=_enabled,
+ line_names=_enabled,
+ ylabel="TFLOP/s (larger is better)",
+ plot_name="BF16 vs MXFP4 GEMMs",
+ args={},
+ )
+)
+def benchmark(batch_size, provider, N, K, had_size):
+ M = batch_size
+ device = "cuda"
+ dtype = torch.bfloat16
+
+ a = torch.randn((M, K), device=device, dtype=dtype)
+ b = torch.randn((N, K), device=device, dtype=dtype)
+ forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
+
+ quantiles = [0.5, 0.2, 0.8]
+
+ if provider == "torch-bf16":
+ ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+ lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
+ )
+ else:
+ cfg = PROVIDER_CFGS[provider]
+ run_quant = build_mxfp4_runner(
+ cfg, a, b, forward_hadamard_matrix, dtype, device
+ )
+ ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+ lambda: run_quant(), rep=200, quantiles=quantiles
+ )
+
+ to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
+ return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
+
+
+def prepare_shapes(args):
+ out = []
+ for model, tp_size in itertools.product(args.models, args.tp_sizes):
+ for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
+ KN[tp_dim] //= tp_size
+ KN.append(model)
+ out.append(KN)
+ return out
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--models",
+ nargs="+",
+ type=str,
+ default=["meta-llama/Llama-3.3-70B-Instruct"],
+ choices=list(WEIGHT_SHAPES.keys()),
+ )
+ parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
+ args = parser.parse_args()
+
+ for K, N, model in prepare_shapes(args):
+ for had_size in [32, 64, 128]:
+ print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs MXFP4 GEMMs TFLOP/s:")
+ benchmark.run(
+ print_data=True,
+ show_plots=True,
+ save_path=f"bench_mxfp4_res_n{N}_k{K}",
+ N=N,
+ K=K,
+ had_size=had_size,
+ )
+
+ print("Benchmark finished!")
diff --git a/benchmarks/kernels/bench_nvfp4_gemm.py b/benchmarks/kernels/bench_nvfp4_gemm.py
index 9e832c9faa8e8..6b19eb113f3e7 100644
--- a/benchmarks/kernels/bench_nvfp4_gemm.py
+++ b/benchmarks/kernels/bench_nvfp4_gemm.py
@@ -3,6 +3,7 @@
import argparse
import copy
import itertools
+import os
import torch
from weight_shapes import WEIGHT_SHAPES
@@ -23,21 +24,45 @@ PROVIDER_CFGS = {
"torch-bf16": dict(enabled=True),
"nvfp4": dict(no_a_quant=False, enabled=True),
"nvfp4-noquant": dict(no_a_quant=True, enabled=True),
+ "fbgemm-nvfp4": dict(fbgemm=True, no_a_quant=False, enabled=True),
+ "fbgemm-nvfp4-noquant": dict(fbgemm=True, no_a_quant=True, enabled=True),
}
+_needs_fbgemm = any(
+ v.get("fbgemm", False) for v in PROVIDER_CFGS.values() if v.get("enabled", False)
+)
+if _needs_fbgemm:
+ try:
+ from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
+ triton_scale_nvfp4_quant,
+ )
+ except ImportError:
+ print(
+ "WARNING: FBGEMM providers are enabled but fbgemm_gpu is not installed. "
+ "These providers will be skipped. Please install fbgemm_gpu with: "
+ "'pip install fbgemm-gpu-genai' to run them."
+ )
+ # Disable FBGEMM providers so the benchmark can run.
+ for cfg in PROVIDER_CFGS.values():
+ if cfg.get("fbgemm"):
+ cfg["enabled"] = False
+
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
-def _quant_weight_nvfp4(b: torch.Tensor, device: str):
+def _quant_weight_nvfp4(b: torch.Tensor, device: str, cfg):
# Compute global scale for weight
b_amax = torch.abs(b).max().to(torch.float32)
b_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax
- b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
+ if "fbgemm" in cfg and cfg["fbgemm"]:
+ b_fp4, scale_b_fp4 = triton_scale_nvfp4_quant(b, b_global_scale)
+ else:
+ b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
return b_fp4, scale_b_fp4, b_global_scale
def build_nvfp4_runner(cfg, a, b, dtype, device):
- b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device)
+ b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device, cfg)
# Compute global scale for activation
# NOTE: This is generally provided ahead-of-time by the model checkpoint.
@@ -46,6 +71,35 @@ def build_nvfp4_runner(cfg, a, b, dtype, device):
# Alpha for the GEMM operation
alpha = 1.0 / (a_global_scale * b_global_scale)
+ if "fbgemm" in cfg and cfg["fbgemm"]:
+ if cfg["no_a_quant"]:
+ a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
+
+ def run():
+ return torch.ops.fbgemm.f4f4bf16(
+ a_fp4,
+ b_fp4,
+ scale_a_fp4,
+ scale_b_fp4,
+ global_scale=alpha,
+ use_mx=False,
+ )
+
+ return run
+ else:
+
+ def run():
+ a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
+ return torch.ops.fbgemm.f4f4bf16(
+ a_fp4,
+ b_fp4,
+ scale_a_fp4,
+ scale_b_fp4,
+ global_scale=alpha,
+ use_mx=False,
+ )
+
+ return run
if cfg["no_a_quant"]:
# Pre-quantize activation
@@ -130,10 +184,13 @@ if __name__ == "__main__":
for K, N, model in prepare_shapes(args):
print(f"{model}, N={N} K={K}, BF16 vs NVFP4 GEMMs TFLOP/s:")
+ save_dir = f"bench_nvfp4_res_n{N}_k{K}"
+ os.makedirs(save_dir, exist_ok=True)
+
benchmark.run(
print_data=True,
show_plots=True,
- save_path=f"bench_nvfp4_res_n{N}_k{K}",
+ save_path=save_dir,
N=N,
K=K,
)
diff --git a/benchmarks/kernels/bench_nvfp4_qutlass.py b/benchmarks/kernels/bench_nvfp4_qutlass.py
new file mode 100644
index 0000000000000..6fecc816f9466
--- /dev/null
+++ b/benchmarks/kernels/bench_nvfp4_qutlass.py
@@ -0,0 +1,207 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import argparse
+import copy
+import itertools
+
+import torch
+from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm import _custom_ops as ops # use existing nvfp4 gemm in vllm
+from vllm._custom_ops import fusedQuantizeNv
+from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
+from vllm.triton_utils import triton
+
+PROVIDER_CFGS = {
+ "torch-bf16": dict(enabled=True),
+ "nvfp4": dict(no_a_quant=False, enabled=True),
+ "nvfp4-noquant": dict(no_a_quant=True, enabled=True),
+}
+
+_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
+
+
+def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
+ return (
+ deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
+ * group_size**-0.5
+ )
+
+
+def _quant_weight_nvfp4(
+ b: torch.Tensor,
+ forward_hadamard_matrix: torch.Tensor,
+ global_scale: torch.Tensor,
+ device: str,
+ M: int,
+ N: int,
+ K: int,
+):
+ weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeNv(
+ b, forward_hadamard_matrix, global_scale
+ )
+ weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton").view(
+ -1, K // 16
+ )
+ return weight_hf_e2m1, weight_hf_scale_block
+
+
+def build_nvfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K):
+ alpha = torch.tensor([1.0], device="cuda")
+ global_scale = torch.tensor([1.0], device="cuda")
+ weight_hf_e2m1, weight_hf_scale_block = _quant_weight_nvfp4(
+ b, forward_hadamard_matrix, global_scale, device, M, N, K
+ )
+
+ if cfg["no_a_quant"]:
+ # Pre-quantize activation
+ input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
+ a, forward_hadamard_matrix, global_scale
+ )
+ input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
+ -1, K // 16
+ )
+
+ def run():
+ return ops.cutlass_scaled_fp4_mm(
+ input_hf_e2m1,
+ weight_hf_e2m1,
+ input_hf_scale_block,
+ weight_hf_scale_block,
+ alpha,
+ torch.bfloat16,
+ )
+
+ return run
+
+ # Quantize activation on-the-fly
+ def run():
+ input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
+ a, forward_hadamard_matrix, global_scale
+ )
+ input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
+ -1, K // 16
+ )
+ return ops.cutlass_scaled_fp4_mm(
+ input_hf_e2m1,
+ weight_hf_e2m1,
+ input_hf_scale_block,
+ weight_hf_scale_block,
+ alpha,
+ torch.bfloat16,
+ )
+
+ return run
+
+
+@triton.testing.perf_report(
+ triton.testing.Benchmark(
+ x_names=["batch_size"],
+ x_vals=[
+ 1,
+ 4,
+ 8,
+ 16,
+ 32,
+ 64,
+ 128,
+ 256,
+ 512,
+ 1024,
+ 2048,
+ 4096,
+ 8192,
+ 16384,
+ 24576,
+ 32768,
+ ],
+ x_log=False,
+ line_arg="provider",
+ line_vals=_enabled,
+ line_names=_enabled,
+ ylabel="TFLOP/s (larger is better)",
+ plot_name="BF16 vs NVFP4 GEMMs",
+ args={},
+ )
+)
+def benchmark(batch_size, provider, N, K, had_size):
+ M = batch_size
+ device = "cuda"
+ dtype = torch.bfloat16
+
+ a = torch.randn((M, K), device=device, dtype=dtype)
+ b = torch.randn((N, K), device=device, dtype=dtype)
+ forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
+
+ quantiles = [0.5, 0.2, 0.8]
+
+ if provider == "torch-bf16":
+ ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+ lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
+ )
+ else:
+ cfg = PROVIDER_CFGS[provider]
+ run_quant = build_nvfp4_runner(
+ cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K
+ )
+ ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+ lambda: run_quant(), rep=200, quantiles=quantiles
+ )
+
+ to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
+ return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
+
+
+def prepare_shapes(args):
+ out = []
+ for model, tp_size in itertools.product(args.models, args.tp_sizes):
+ for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
+ KN[tp_dim] //= tp_size
+ KN.append(model)
+ out.append(KN)
+ return out
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--models",
+ nargs="+",
+ type=str,
+ default=["meta-llama/Llama-3.3-70B-Instruct"],
+ choices=list(WEIGHT_SHAPES.keys()),
+ )
+ parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
+ args = parser.parse_args()
+
+ for K, N, model in prepare_shapes(args):
+ for had_size in [16, 32, 64, 128]:
+ print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs NVFP4 GEMMs TFLOP/s:")
+ benchmark.run(
+ print_data=True,
+ show_plots=True,
+ save_path=f"bench_nvfp4_res_n{N}_k{K}",
+ N=N,
+ K=K,
+ had_size=had_size,
+ )
+
+ print("Benchmark finished!")
diff --git a/benchmarks/kernels/bench_per_token_quant_fp8.py b/benchmarks/kernels/bench_per_token_quant_fp8.py
index 923d678f1f2db..e08e5680c191e 100644
--- a/benchmarks/kernels/bench_per_token_quant_fp8.py
+++ b/benchmarks/kernels/bench_per_token_quant_fp8.py
@@ -2,14 +2,25 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
from typing import Callable
+from unittest.mock import patch
+import pandas as pd
import torch
-from vllm import _custom_ops as ops
-from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.triton_utils import triton
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+
+
+def with_triton_mode(fn):
+ """Temporarily force the Triton fallback path"""
+
+ def wrapped(*args, **kwargs):
+ with patch("vllm.platforms.current_platform.is_cuda", return_value=False):
+ return fn(*args, **kwargs)
+
+ return wrapped
# TODO(luka): use standalone_compile utility
@@ -21,78 +32,238 @@ def with_dyn_arg(fn: Callable, arg_index: int, dim_index: int):
return inner
-torch._dynamo.config.recompile_limit = 8888
-compilation_config = CompilationConfig(custom_ops=["none"])
-with set_current_vllm_config(VllmConfig(compilation_config=compilation_config)):
- torch_per_token_quant_fp8 = torch.compile(
- QuantFP8(False, GroupShape.PER_TOKEN),
- fullgraph=True,
- dynamic=False, # recompile for different shapes
- )
+def bench_compile(fn: Callable):
+ # recompile for different shapes
+ fwd = torch.compile(fn, fullgraph=True, dynamic=False)
# First dim is explicitly dynamic to simulate vLLM usage
- torch_per_token_quant_fp8 = with_dyn_arg(torch_per_token_quant_fp8, 0, 0)
+ return with_dyn_arg(fwd, 0, 0)
-def cuda_per_token_quant_fp8(
- input: torch.Tensor,
-) -> tuple[torch.Tensor, torch.Tensor]:
- return ops.scaled_fp8_quant(input)
+torch._dynamo.config.recompile_limit = 8888
-def calculate_diff(batch_size: int, seq_len: int):
- """Calculate difference between Triton and CUDA implementations."""
+def calculate_diff(
+ batch_size: int,
+ hidden_size: int,
+ group_shape: GroupShape,
+ dtype: torch.dtype,
+):
+ """Calculate the difference between Inductor and CUDA implementations."""
device = torch.device("cuda")
- x = torch.rand((batch_size * seq_len, 4096), dtype=torch.float16, device=device)
+ x = torch.randn((batch_size, hidden_size), dtype=dtype, device=device)
- torch_out, torch_scale = torch_per_token_quant_fp8(x)
- cuda_out, cuda_scale = cuda_per_token_quant_fp8(x)
+ quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False)
- if torch.allclose(
- cuda_out.to(torch.float32), torch_out.to(torch.float32), rtol=1e-3, atol=1e-5
- ) and torch.allclose(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5):
+ torch_out, torch_scale = bench_compile(quant_fp8.forward_native)(x)
+ torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x)
+ cuda_out, cuda_scale = quant_fp8.forward_cuda(x)
+
+ try:
+ torch.testing.assert_close(
+ cuda_out.to(torch.float32),
+ torch_out.to(torch.float32),
+ rtol=1e-3,
+ atol=1e-5,
+ )
+ torch.testing.assert_close(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5)
+ torch.testing.assert_close(
+ cuda_out.to(torch.float32),
+ torch_eager_out.to(torch.float32),
+ rtol=1e-3,
+ atol=1e-5,
+ )
+ torch.testing.assert_close(cuda_scale, torch_eager_scale, rtol=1e-3, atol=1e-5)
print("✅ All implementations match")
- else:
+ except AssertionError as e:
print("❌ Implementations differ")
+ print(e)
-batch_size_range = [1, 16, 32, 64, 128]
-seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
-
-configs = list(itertools.product(batch_size_range, seq_len_range))
+configs = []
-@triton.testing.perf_report(
- triton.testing.Benchmark(
- x_names=["batch_size", "seq_len"],
- x_vals=configs,
- line_arg="provider",
- line_vals=["torch", "cuda"],
- line_names=["Torch", "CUDA"],
- styles=[("blue", "-"), ("green", "-")],
- ylabel="us",
- plot_name="per-token-dynamic-quant-fp8-performance",
- args={},
- )
-)
-def benchmark_quantization(batch_size, seq_len, provider):
- dtype = torch.float16
+def benchmark_quantization(
+ batch_size,
+ hidden_size,
+ provider,
+ group_shape: GroupShape,
+ col_major: bool,
+ dtype: torch.dtype,
+):
device = torch.device("cuda")
- x = torch.randn(batch_size * seq_len, 4096, device=device, dtype=dtype)
+ x = torch.randn(batch_size, hidden_size, device=device, dtype=dtype)
quantiles = [0.5, 0.2, 0.8]
+ quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major)
if provider == "torch":
- fn = lambda: torch_per_token_quant_fp8(x.clone())
+ fn = lambda: bench_compile(quant_fp8.forward_native)(x.clone())
elif provider == "cuda":
- fn = lambda: cuda_per_token_quant_fp8(x.clone())
+ fn = lambda: quant_fp8.forward_cuda(x.clone())
+ elif provider == "triton":
+ if not group_shape.is_per_group():
+ # Triton only supported for per-group
+ return 0, 0, 0
+
+ fn = lambda: with_triton_mode(quant_fp8.forward_cuda)(x.clone())
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(fn, quantiles=quantiles)
return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+# TODO(luka) extract to utils
+def compute_geomean_speedups(
+ df: pd.DataFrame,
+ baseline_col: str,
+ speedup_cols: list[str],
+ groupby_cols: list[str] | None = None,
+) -> pd.DataFrame:
+ """
+ Compute geometric mean speedups over a baseline column.
+
+ Args:
+ df: Input dataframe
+ baseline_col: Column to use as baseline
+ speedup_cols: Columns to compute speedups for
+ groupby_cols: Columns to group by. If None, compute over entire df.
+
+ Returns:
+ pd.DataFrame with geometric mean speedups
+ """
+ from scipy.stats import gmean
+
+ def geo_speedup(group: pd.DataFrame) -> pd.Series:
+ ratios = {
+ col: (group[baseline_col] / group[col]).values for col in speedup_cols
+ }
+ return pd.Series({col: gmean(vals) for col, vals in ratios.items()})
+
+ if groupby_cols is None:
+ result = geo_speedup(df).to_frame().T
+ else:
+ result = (
+ df.groupby(groupby_cols)
+ .apply(geo_speedup, include_groups=False)
+ .reset_index()
+ )
+
+ return result
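+
+
+# Usage sketch (a hypothetical call mirroring the one at the end of this script;
+# the column names come from the perf_report line_names defined in the main block):
+#   compute_geomean_speedups(df, baseline_col="Torch (Compiled)",
+#                            speedup_cols=["CUDA", "Triton"],
+#                            groupby_cols=["col_major", "group_shape"])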
+
+
if __name__ == "__main__":
- calculate_diff(batch_size=4, seq_len=4096)
- benchmark_quantization.run(print_data=True)
+ parser = FlexibleArgumentParser(
+ description="Benchmark the various implementations of QuantFP8 (dynamic-only)"
+ )
+ parser.add_argument("-c", "--check", action="store_true")
+ parser.add_argument(
+ "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
+ )
+ parser.add_argument(
+ "--hidden-sizes",
+ type=int,
+ nargs="+",
+ default=[896, 1024, 2048, 4096, 7168],
+ help="Hidden sizes to benchmark",
+ )
+ parser.add_argument(
+ "--batch-sizes",
+ type=int,
+ nargs="+",
+ default=[1, 16, 128, 512, 1024],
+ help="Batch sizes to benchmark",
+ )
+ parser.add_argument(
+ "--group-sizes",
+ type=int,
+ nargs="+",
+ default=None,
+ help="Group sizes for GroupShape(1,N) to benchmark. "
+ "Use 0 for PER_TENSOR, -1 for PER_TOKEN (default: 0,-1,64,128)",
+ )
+ parser.add_argument(
+ "--no-column-major",
+ action="store_true",
+ help="Disable column-major scales testing",
+ )
+
+ args = parser.parse_args()
+ assert args
+
+ dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
+
+ hidden_sizes = args.hidden_sizes
+ batch_sizes = args.batch_sizes
+
+ if args.group_sizes is not None:
+ group_shapes = []
+ for size in args.group_sizes:
+ if size == 0:
+ group_shapes.append(GroupShape.PER_TENSOR)
+ elif size == -1:
+ group_shapes.append(GroupShape.PER_TOKEN)
+ else:
+ group_shapes.append(GroupShape(1, size))
+ else:
+ group_shapes = [
+ GroupShape.PER_TENSOR,
+ GroupShape.PER_TOKEN,
+ GroupShape(1, 64),
+ GroupShape(1, 128),
+ ]
+
+ column_major_scales = [False] if args.no_column_major else [True, False]
+
+ config_gen = itertools.product(
+ group_shapes,
+ column_major_scales,
+ batch_sizes,
+ hidden_sizes,
+ )
+
+    # Keep column-major scales only for per-group shapes, and reverse each tuple so
+    # its order matches x_names: (hidden_size, batch_size, col_major, group_shape)
+ configs.extend(c[::-1] for c in config_gen if (c[0].is_per_group() or not c[1]))
+
+ print(f"Running {len(configs)} configurations:")
+ print(f" Hidden sizes: {hidden_sizes}")
+ print(f" Batch sizes: {batch_sizes}")
+ print(f" Group shapes: {[str(g) for g in group_shapes]}")
+ print(f" Column major scales: {column_major_scales}")
+ print()
+
+ if args.check:
+ for group_shape in group_shapes:
+ group_size = group_shape[1]
+ print(f"{group_size=}")
+ calculate_diff(
+ batch_size=4, hidden_size=4096, group_shape=group_shape, dtype=dtype
+ )
+
+ benchmark = triton.testing.perf_report(
+ triton.testing.Benchmark(
+ x_names=["hidden_size", "batch_size", "col_major", "group_shape"],
+ x_vals=configs,
+ line_arg="provider",
+ line_vals=["torch", "cuda", "triton"],
+ line_names=["Torch (Compiled)", "CUDA", "Triton"],
+ styles=[("blue", "-"), ("green", "-"), ("black", "-")],
+ ylabel="us",
+ plot_name="QuantFP8 performance",
+ args={},
+ )
+ )(benchmark_quantization)
+
+ df = benchmark.run(print_data=True, dtype=dtype, return_df=True)
+
+ # Print geomean speedups
+ geo_table_grouped = compute_geomean_speedups(
+ df,
+ baseline_col="Torch (Compiled)",
+ speedup_cols=["CUDA", "Triton"],
+ groupby_cols=["col_major", "group_shape"],
+ )
+
+ print("Speedup over Torch (Compiled)")
+ print(geo_table_grouped.to_string(index=False))
diff --git a/benchmarks/kernels/benchmark_activation.py b/benchmarks/kernels/benchmark_activation.py
new file mode 100644
index 0000000000000..93edbcc9391fc
--- /dev/null
+++ b/benchmarks/kernels/benchmark_activation.py
@@ -0,0 +1,104 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# benchmark custom activation op performance
+import itertools
+
+import torch
+
+import vllm.model_executor.layers.activation # noqa F401
+from vllm.model_executor.custom_op import CustomOp
+from vllm.platforms import current_platform
+from vllm.triton_utils import triton
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+
+batch_size_range = [1, 16, 32, 64, 128]
+seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
+intermediate_size = [3072, 9728, 12288]
+configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size))
+
+
+def benchmark_activation(
+ batch_size: int,
+ seq_len: int,
+ intermediate_size: int,
+ provider: str,
+ func_name: str,
+ dtype: torch.dtype,
+):
+ device = "cuda"
+ num_tokens = batch_size * seq_len
+ dim = intermediate_size
+ current_platform.seed_everything(42)
+ torch.set_default_device(device)
+
+ if func_name == "gelu_and_mul":
+ layer = CustomOp.op_registry[func_name](approximate="none")
+ elif func_name == "gelu_and_mul_tanh":
+ layer = CustomOp.op_registry["gelu_and_mul"](approximate="tanh")
+ elif func_name == "fatrelu_and_mul":
+ threshold = 0.5
+ layer = CustomOp.op_registry[func_name](threshold)
+ else:
+ layer = CustomOp.op_registry[func_name]()
+
+ x = torch.randn(num_tokens, dim, dtype=dtype, device=device)
+ compiled_layer = torch.compile(layer.forward_native)
+
+ if provider == "custom":
+ fn = lambda: layer(x)
+ elif provider == "compiled":
+ fn = lambda: compiled_layer(x)
+
+ ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+ fn, quantiles=[0.5, 0.2, 0.8]
+ )
+ return ms, max_ms, min_ms
+
+
+if __name__ == "__main__":
+ parser = FlexibleArgumentParser(description="Benchmark the custom activation op.")
+ parser.add_argument(
+ "--func-name",
+ type=str,
+ choices=[
+ "mul_and_silu",
+ "silu_and_mul",
+ "gelu_and_mul",
+ "gelu_and_mul_tanh",
+ "fatrelu_and_mul",
+ "swigluoai_and_mul",
+ "gelu_new",
+ "gelu_fast",
+ "quick_gelu",
+ ],
+ default="silu_and_mul",
+ )
+ parser.add_argument(
+ "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
+ )
+ args = parser.parse_args()
+ assert args
+
+ func_name = args.func_name
+ dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
+
+ perf_report = triton.testing.perf_report(
+ triton.testing.Benchmark(
+ x_names=["batch_size", "seq_len", "intermediate_size"],
+ x_vals=configs,
+ line_arg="provider",
+ line_vals=["custom", "compiled"],
+ line_names=["Custom OP", "Compiled"],
+ styles=[("blue", "-"), ("green", "-")],
+ ylabel="ms",
+ plot_name=f"{func_name}-op-performance",
+ args={},
+ )
+ )
+
+ perf_report(
+ lambda batch_size, seq_len, intermediate_size, provider: benchmark_activation(
+ batch_size, seq_len, intermediate_size, provider, func_name, dtype
+ )
+ ).run(print_data=True)
diff --git a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
index 35c20ee41b9a9..726a2a371d109 100644
--- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
+++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
@@ -13,6 +13,10 @@ import torch.utils.benchmark as benchmark
from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.config import (
+ fp8_w8a8_moe_quant_config,
+ nvfp4_moe_quant_config,
+)
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.scalar_type import scalar_types
@@ -140,6 +144,12 @@ def bench_run(
a_fp8_scale: torch.Tensor,
num_repeats: int,
):
+ quant_config = fp8_w8a8_moe_quant_config(
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a_fp8_scale,
+ )
+
for _ in range(num_repeats):
fused_experts(
a,
@@ -147,10 +157,7 @@ def bench_run(
w2,
topk_weights,
topk_ids,
- use_fp8_w8a8=True,
- w1_scale=w1_scale,
- w2_scale=w2_scale,
- a1_scale=a_fp8_scale,
+ quant_config=quant_config,
)
def run_cutlass_moe_fp4(
@@ -172,25 +179,27 @@ def bench_run(
device: torch.device,
num_repeats: int,
):
+ quant_config = nvfp4_moe_quant_config(
+ a1_gscale=a1_gs,
+ a2_gscale=a2_gs,
+ w1_scale=w1_blockscale,
+ w2_scale=w2_blockscale,
+ g1_alphas=w1_gs,
+ g2_alphas=w2_gs,
+ )
for _ in range(num_repeats):
with nvtx.annotate("cutlass_moe_fp4", color="green"):
cutlass_moe_fp4(
a=a,
- a1_gscale=a1_gs,
- a2_gscale=a2_gs,
w1_fp4=w1_fp4,
- w1_blockscale=w1_blockscale,
- w1_alphas=w1_gs,
w2_fp4=w2_fp4,
- w2_blockscale=w2_blockscale,
- w2_alphas=w2_gs,
topk_weights=topk_weights,
topk_ids=topk_ids,
m=m,
n=n,
k=k,
e=num_experts,
- device=device,
+ quant_config=quant_config,
)
def run_cutlass_from_graph(
@@ -211,26 +220,29 @@ def bench_run(
e: int,
device: torch.device,
):
+ quant_config = nvfp4_moe_quant_config(
+ a1_gscale=a1_gs,
+ a2_gscale=a2_gs,
+ w1_scale=w1_blockscale,
+ w2_scale=w2_blockscale,
+ g1_alphas=w1_gs,
+ g2_alphas=w2_gs,
+ )
+
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
return cutlass_moe_fp4(
a=a,
- a1_gscale=a1_gs,
w1_fp4=w1_fp4,
- w1_blockscale=w1_blockscale,
- w1_alphas=w1_alphas,
- a2_gscale=a2_gs,
w2_fp4=w2_fp4,
- w2_blockscale=w2_blockscale,
- w2_alphas=w2_alphas,
topk_weights=topk_weights,
topk_ids=topk_ids,
m=m,
n=n,
k=k,
e=num_experts,
- device=device,
+ quant_config=quant_config,
)
def run_triton_from_graph(
@@ -246,16 +258,18 @@ def bench_run(
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
+ quant_config = fp8_w8a8_moe_quant_config(
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a_fp8_scale,
+ )
return fused_experts(
a,
w1,
w2,
topk_weights,
topk_ids,
- use_fp8_w8a8=True,
- w1_scale=w1_scale,
- w2_scale=w2_scale,
- a1_scale=a_fp8_scale,
+ quant_config=quant_config,
)
def replay_graph(graph, num_repeats):
diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
new file mode 100644
index 0000000000000..b419b2fa0e3eb
--- /dev/null
+++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
@@ -0,0 +1,406 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Benchmark the performance of the cutlass_moe_fp8 kernel vs the triton_moe
+kernel. Both kernels take in fp8 quantized weights and 16-bit activations,
+but use different quantization strategies and backends.
+"""
+
+import nvtx
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
+from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
+from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
+from vllm.platforms import current_platform
+from vllm.utils import FlexibleArgumentParser
+
+# Weight shapes for different models: [num_experts, topk, hidden_size,
+# intermediate_size]
+WEIGHT_SHAPES_MOE = {
+ "mixtral-8x7b": [
+ [8, 2, 4096, 14336],
+ ],
+ "deepseek-v2": [
+ [160, 6, 5120, 12288],
+ ],
+ "custom-small": [
+ [8, 2, 2048, 7168],
+ ],
+ "glm45-fp8": [
+ [128, 8, 4096, 1408],
+ ],
+ "Llama-4-Maverick-17B-128E-Instruct-FP8": [
+ [128, 1, 5120, 8192],
+ ],
+}
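+# For example, per the layout comment above, the "mixtral-8x7b" entry unpacks as
+# num_experts=8, topk=2, hidden_size=4096, intermediate_size=14336.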
+
+DEFAULT_MODELS = [
+ "mixtral-8x7b",
+]
+
+DEFAULT_BATCH_SIZES = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
+DEFAULT_TP_SIZES = [1]
+
+PER_ACT_TOKEN_OPTS = [False, True]
+PER_OUT_CH_OPTS = [False, True]
+
+FP8_DTYPE = current_platform.fp8_dtype()
+
+
+def bench_run(
+ results: list,
+ model: str,
+ num_experts: int,
+ topk: int,
+ per_act_token: bool,
+ per_out_ch: bool,
+ mkn: tuple[int, int, int],
+):
+ (m, k, n) = mkn
+
+ dtype = torch.half
+ device = "cuda"
+
+ # Create input activations
+ a = torch.randn((m, k), device=device, dtype=dtype) / 10
+
+ # Create weights
+ w1 = torch.randn((num_experts, 2 * n, k), device=device, dtype=dtype) / 10
+ w2 = torch.randn((num_experts, k, n), device=device, dtype=dtype) / 10
+
+ # Create FP8 quantized weights and scales for both kernels
+ w1_fp8q = torch.empty((num_experts, 2 * n, k), device=device, dtype=FP8_DTYPE)
+ w2_fp8q = torch.empty((num_experts, k, n), device=device, dtype=FP8_DTYPE)
+
+ # Create scales based on quantization strategy
+ if per_out_ch:
+ # Per-channel quantization
+ w1_scale = torch.empty(
+ (num_experts, 2 * n, 1), device=device, dtype=torch.float32
+ )
+ w2_scale = torch.empty((num_experts, k, 1), device=device, dtype=torch.float32)
+ else:
+ # Per-tensor quantization
+ w1_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
+ w2_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
+
+ # Quantize weights
+ for expert in range(num_experts):
+ if per_out_ch:
+ # Per-channel quantization - not yet implemented properly
+ # For now, fall back to per-tensor quantization
+ w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
+ w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
+ # Expand scalar scales to the expected per-channel shape
+ w1_scale[expert] = w1_scale_temp.expand(2 * n, 1)
+ w2_scale[expert] = w2_scale_temp.expand(k, 1)
+ else:
+ # Per-tensor quantization
+ w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
+ w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
+ # Store scalar scales in [1, 1] tensors
+ w1_scale[expert, 0, 0] = w1_scale_temp
+ w2_scale[expert, 0, 0] = w2_scale_temp
+
+ # Prepare weights for CUTLASS (no transpose needed)
+ w1_fp8q_cutlass = w1_fp8q # Keep original [E, 2N, K]
+ w2_fp8q_cutlass = w2_fp8q # Keep original [E, K, N]
+
+ # Create router scores and get topk
+ score = torch.randn((m, num_experts), device=device, dtype=dtype)
+ topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
+
+ # WORKAROUND: CUTLASS MoE FP8 has issues with per-token quantization
+ # Force per-tensor quantization for all cases to match working e2e setup
+ a1_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
+ a2_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
+
+ # Force per-tensor quantization for all cases
+ per_act_token = False
+
+ # Create stride tensors for CUTLASS
+ ab_strides1 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
+ ab_strides2 = torch.full((num_experts,), n, dtype=torch.int64, device=device)
+ c_strides1 = torch.full((num_experts,), 2 * n, dtype=torch.int64, device=device)
+ c_strides2 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
+
+ def run_triton_moe(
+ a: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_scale: torch.Tensor,
+ w2_scale: torch.Tensor,
+ a1_scale: torch.Tensor,
+ a2_scale: torch.Tensor,
+ num_repeats: int,
+ ):
+ quant_config = fp8_w8a8_moe_quant_config(
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a1_scale,
+ a2_scale=a2_scale,
+ per_act_token_quant=per_act_token,
+ per_out_ch_quant=per_out_ch,
+ )
+
+ for _ in range(num_repeats):
+ fused_experts(
+ a,
+ w1,
+ w2,
+ topk_weights,
+ topk_ids,
+ quant_config=quant_config,
+ )
+
+ def run_cutlass_moe_fp8(
+ a: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ ab_strides1: torch.Tensor,
+ ab_strides2: torch.Tensor,
+ c_strides1: torch.Tensor,
+ c_strides2: torch.Tensor,
+ w1_scale: torch.Tensor,
+ w2_scale: torch.Tensor,
+ a1_scale: torch.Tensor,
+ a2_scale: torch.Tensor,
+ num_repeats: int,
+ ):
+ quant_config = fp8_w8a8_moe_quant_config(
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a1_scale,
+ a2_scale=a2_scale,
+ per_act_token_quant=per_act_token,
+ per_out_ch_quant=per_out_ch,
+ )
+
+ for _ in range(num_repeats):
+ with nvtx.annotate("cutlass_moe_fp8", color="blue"):
+ cutlass_moe_fp8(
+ a=a,
+ w1_q=w1,
+ w2_q=w2,
+ topk_weights=topk_weights,
+ topk_ids=topk_ids,
+ ab_strides1=ab_strides1,
+ ab_strides2=ab_strides2,
+ c_strides1=c_strides1,
+ c_strides2=c_strides2,
+ quant_config=quant_config,
+ activation="silu",
+ global_num_experts=num_experts,
+ )
+
+ # Pre-create quantization config to avoid creating it inside CUDA graph
+ quant_config = fp8_w8a8_moe_quant_config(
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a1_scale,
+ a2_scale=a2_scale,
+ per_act_token_quant=per_act_token,
+ per_out_ch_quant=per_out_ch,
+ )
+
+ # Create CUDA graphs for CUTLASS (match benchmark_moe.py pattern exactly)
+ cutlass_stream = torch.cuda.Stream()
+ cutlass_graph = torch.cuda.CUDAGraph()
+ with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
+ # Capture 10 invocations like benchmark_moe.py
+ for _ in range(10):
+ cutlass_moe_fp8(
+ a=a,
+ w1_q=w1_fp8q_cutlass,
+ w2_q=w2_fp8q_cutlass,
+ topk_weights=topk_weights,
+ topk_ids=topk_ids,
+ ab_strides1=ab_strides1,
+ ab_strides2=ab_strides2,
+ c_strides1=c_strides1,
+ c_strides2=c_strides2,
+ quant_config=quant_config,
+ activation="silu",
+ global_num_experts=num_experts,
+ )
+ torch.cuda.synchronize()
+
+ # Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
+ triton_stream = torch.cuda.Stream()
+ triton_graph = torch.cuda.CUDAGraph()
+ with torch.cuda.graph(triton_graph, stream=triton_stream):
+ # Capture 10 invocations like benchmark_moe.py
+ for _ in range(10):
+ fused_experts(
+ a,
+ w1_fp8q,
+ w2_fp8q,
+ topk_weights,
+ topk_ids,
+ quant_config=quant_config,
+ )
+ torch.cuda.synchronize()
+
+ def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
+ """Benchmark CUDA graph using events like benchmark_moe.py"""
+ # Warmup
+ for _ in range(num_warmup):
+ graph.replay()
+ torch.cuda.synchronize()
+
+ # Timing
+ start_event = torch.cuda.Event(enable_timing=True)
+ end_event = torch.cuda.Event(enable_timing=True)
+
+ latencies = []
+ for _ in range(num_iters):
+ torch.cuda.synchronize()
+ start_event.record()
+ graph.replay()
+ end_event.record()
+ end_event.synchronize()
+ latencies.append(start_event.elapsed_time(end_event))
+
+ # Divide by 10 since graph contains 10 calls
+ return sum(latencies) / (num_iters * 10)
+
+ # Benchmark parameters
+ num_warmup = 5
+ num_iters = 100
+
+ # Benchmark only CUDA graphs (more reliable and faster)
+ # Benchmark Triton MoE with CUDA graphs
+ triton_graph_time = bench_cuda_graph(
+ triton_graph, num_warmup=num_warmup, num_iters=num_iters
+ )
+
+ # Benchmark CUTLASS MoE with CUDA graphs
+ cutlass_graph_time = bench_cuda_graph(
+ cutlass_graph, num_warmup=num_warmup, num_iters=num_iters
+ )
+
+ # Convert ms to us and return results
+ triton_time_us = triton_graph_time * 1000
+ cutlass_time_us = cutlass_graph_time * 1000
+
+ return {
+ "batch_size": m,
+ "triton_time_us": triton_time_us,
+ "cutlass_time_us": cutlass_time_us,
+ }
+
+
+def main(args):
+ print("Benchmarking models:")
+ for i, model in enumerate(args.models):
+ print(f"[{i}] {model}")
+
+ all_results = []
+
+ for model in args.models:
+ for tp in args.tp_sizes:
+ for layer in WEIGHT_SHAPES_MOE[model]:
+ num_experts = layer[0]
+ topk = layer[1]
+ size_k = layer[2]
+ size_n = layer[3] // tp
+
+ if len(args.limit_k) > 0 and size_k not in args.limit_k:
+ continue
+
+ if len(args.limit_n) > 0 and size_n not in args.limit_n:
+ continue
+
+ for per_act_token in args.per_act_token_opts:
+ for per_out_ch in args.per_out_ch_opts:
+ print(
+                            f"\n=== {model}, experts={num_experts}, topk={topk}, "
+ f"per_act={per_act_token}, per_out_ch={per_out_ch} ==="
+ )
+
+ config_results = []
+ for size_m in args.batch_sizes:
+ mkn = (size_m, size_k, size_n)
+ result = bench_run(
+ [], # Not used anymore
+ model,
+ num_experts,
+ topk,
+ per_act_token,
+ per_out_ch,
+ mkn,
+ )
+ if result:
+ config_results.append(result)
+
+ # Print results table for this configuration
+ if config_results:
+ print(
+ f"\n{'Batch Size':<12}"
+ f"{'Triton (us)':<15}"
+ f"{'CUTLASS (us)':<15}"
+ )
+ print("-" * 45)
+ for result in config_results:
+ print(
+ f"{result['batch_size']:<12}"
+ f"{result['triton_time_us']:<15.2f}"
+ f"{result['cutlass_time_us']:<15.2f}"
+ )
+
+ all_results.extend(config_results)
+
+ print(f"\nTotal benchmarks completed: {len(all_results)}")
+
+
+if __name__ == "__main__":
+ parser = FlexibleArgumentParser(
+ description="""Benchmark CUTLASS FP8 MOE vs Triton FP8 FUSED MOE
+ across specified models/shapes/batches
+
+ Example usage:
+ python benchmark_cutlass_moe_fp8.py \
+ --model "Llama-4-Maverick-17B-128E-Instruct-FP8" \
+ --tp-sizes 8 \
+        --batch-sizes 2 4 8 \
+ --per-act-token-opts false \
+ --per-out-ch-opts false
+
+ """
+ )
+ parser.add_argument(
+ "--models",
+ nargs="+",
+ type=str,
+ default=DEFAULT_MODELS,
+ choices=WEIGHT_SHAPES_MOE.keys(),
+ )
+ parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES)
+ parser.add_argument(
+ "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES
+ )
+ parser.add_argument("--limit-k", nargs="+", type=int, default=[])
+ parser.add_argument("--limit-n", nargs="+", type=int, default=[])
+ parser.add_argument(
+ "--per-act-token-opts",
+ nargs="+",
+ type=lambda x: x.lower() == "true",
+ default=[False, True],
+ help="Per-activation token quantization options (true/false)",
+ )
+ parser.add_argument(
+ "--per-out-ch-opts",
+ nargs="+",
+ type=lambda x: x.lower() == "true",
+ default=[False, True],
+ help="Per-output channel quantization options (true/false)",
+ )
+
+ args = parser.parse_args()
+ main(args)
diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py
new file mode 100644
index 0000000000000..4cbdde5a5b2ca
--- /dev/null
+++ b/benchmarks/kernels/benchmark_device_communicators.py
@@ -0,0 +1,508 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Benchmark script for device communicators:
+CustomAllreduce (one-shot, two-shot), PyNcclCommunicator,
+and SymmMemCommunicator (multimem, two-shot).
+
+For NCCL symmetric memory, you need to set the environment variables
+NCCL_NVLS_ENABLE=1 NCCL_CUMEM_ENABLE=1 VLLM_USE_NCCL_SYMM_MEM=1; otherwise NCCL does
+not use the fast NVLS implementation for all-reduce.
+
+Usage:
+    torchrun --nproc_per_node=<N> benchmark_device_communicators.py [options]
+
+Example:
+    torchrun --nproc_per_node=2 benchmark_device_communicators.py \
+ --sequence-lengths 512 1024 2048 --num-warmup 10 --num-trials 100
+"""
+
+import json
+import os
+import time
+from contextlib import nullcontext
+from typing import Callable, Optional
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
+from vllm.distributed.device_communicators.pynccl import (
+ PyNcclCommunicator,
+ register_nccl_symmetric_ops,
+)
+from vllm.distributed.device_communicators.pynccl_allocator import (
+ set_graph_pool_id,
+)
+from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
+from vllm.logger import init_logger
+from vllm.utils import FlexibleArgumentParser
+
+logger = init_logger(__name__)
+
+# Default sequence lengths to benchmark
+DEFAULT_SEQUENCE_LENGTHS = [128, 512, 1024, 2048, 4096, 8192]
+
+# Fixed hidden size and dtype for all benchmarks
+HIDDEN_SIZE = 8192
+BENCHMARK_DTYPE = torch.bfloat16
+
+# CUDA graph settings
+CUDA_GRAPH_CAPTURE_CYCLES = 10
+
+
+class CommunicatorBenchmark:
+ """Benchmark class for testing device communicators."""
+
+ def __init__(
+ self,
+ rank: int,
+ world_size: int,
+ device: torch.device,
+ cpu_group: ProcessGroup,
+ sequence_lengths: list[int],
+ ):
+ self.rank = rank
+ self.world_size = world_size
+ self.device = device
+ self.cpu_group = cpu_group
+
+ # Calculate max_size_override based on largest sequence length
+ max_seq_len = max(sequence_lengths)
+ max_tensor_elements = max_seq_len * HIDDEN_SIZE
+ self.max_size_override = max_tensor_elements * BENCHMARK_DTYPE.itemsize + 1
+
+ # Initialize communicators
+ self.custom_allreduce = None
+ self.pynccl_comm = None
+ self.symm_mem_comm = None
+ self.symm_mem_comm_multimem = None
+ self.symm_mem_comm_two_shot = None
+
+ self._init_communicators()
+
+ def _init_communicators(self):
+ """Initialize all available communicators."""
+ try:
+ self.custom_allreduce = CustomAllreduce(
+ group=self.cpu_group,
+ device=self.device,
+ max_size=self.max_size_override,
+ )
+ if not self.custom_allreduce.disabled:
+ logger.info("Rank %s: CustomAllreduce initialized", self.rank)
+ else:
+ logger.info("Rank %s: CustomAllreduce disabled", self.rank)
+ except Exception as e:
+ logger.warning(
+ "Rank %s: Failed to initialize CustomAllreduce: %s", self.rank, e
+ )
+ self.custom_allreduce = None
+
+ try:
+ self.pynccl_comm = PyNcclCommunicator(
+ group=self.cpu_group, device=self.device
+ )
+ if not self.pynccl_comm.disabled:
+ logger.info("Rank %s: PyNcclCommunicator initialized", self.rank)
+ register_nccl_symmetric_ops(self.pynccl_comm)
+ else:
+ logger.info("Rank %s: PyNcclCommunicator disabled", self.rank)
+ self.pynccl_comm = None
+ except Exception as e:
+ logger.warning(
+ "Rank %s: Failed to initialize PyNcclCommunicator: %s", self.rank, e
+ )
+ self.pynccl_comm = None
+
+ # Initialize variants for SymmMemCommunicator
+ try:
+ self.symm_mem_comm_multimem = SymmMemCommunicator(
+ group=self.cpu_group,
+ device=self.device,
+ force_multimem=True,
+ max_size_override=self.max_size_override,
+ )
+ if not self.symm_mem_comm_multimem.disabled:
+ logger.info(
+ "Rank %s: SymmMemCommunicator (multimem) initialized", self.rank
+ )
+ else:
+ self.symm_mem_comm_multimem = None
+ except Exception as e:
+ logger.warning(
+ "Rank %s: Failed to initialize SymmMemCommunicator (multimem): %s",
+ self.rank,
+ e,
+ )
+ self.symm_mem_comm_multimem = None
+
+ try:
+ self.symm_mem_comm_two_shot = SymmMemCommunicator(
+ group=self.cpu_group,
+ device=self.device,
+ force_multimem=False,
+ max_size_override=self.max_size_override,
+ )
+ if not self.symm_mem_comm_two_shot.disabled:
+ logger.info(
+ "Rank %s: SymmMemCommunicator (two_shot) initialized", self.rank
+ )
+ else:
+ self.symm_mem_comm_two_shot = None
+ except Exception as e:
+ logger.warning(
+ "Rank %s: Failed to initialize SymmMemCommunicator (two_shot): %s",
+ self.rank,
+ e,
+ )
+ self.symm_mem_comm_two_shot = None
+
+ def benchmark_allreduce(
+ self, sequence_length: int, num_warmup: int, num_trials: int
+ ) -> dict[str, float]:
+ """Benchmark allreduce operations for all available communicators."""
+
+ results = {}
+
+ # Define communicators with their benchmark functions
+ communicators = []
+
+ if self.custom_allreduce is not None:
+ comm = self.custom_allreduce
+ # CustomAllreduce one-shot
+ communicators.append(
+ (
+ "ca_1stage",
+ lambda t, c=comm: c.custom_all_reduce(t),
+ lambda t, c=comm: c.should_custom_ar(t),
+ comm.capture(),
+ "1stage", # env variable value
+ )
+ )
+ # CustomAllreduce two-shot
+ communicators.append(
+ (
+ "ca_2stage",
+ lambda t, c=comm: c.custom_all_reduce(t),
+ lambda t, c=comm: c.should_custom_ar(t),
+ comm.capture(),
+ "2stage", # env variable value
+ )
+ )
+
+ if self.pynccl_comm is not None:
+ comm = self.pynccl_comm
+ communicators.append(
+ (
+ "pynccl",
+ lambda t, c=comm: c.all_reduce(t),
+ lambda t: True, # Always available if initialized
+ nullcontext(),
+ None, # no env variable needed
+ )
+ )
+ communicators.append(
+ (
+ "pynccl-symm",
+ lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t),
+ lambda t: True, # Always available if initialized
+ nullcontext(),
+ None, # no env variable needed
+ )
+ )
+
+ if self.symm_mem_comm_multimem is not None:
+ comm = self.symm_mem_comm_multimem
+ communicators.append(
+ (
+ "symm_mem_multimem",
+ lambda t, c=comm: c.all_reduce(t),
+ lambda t, c=comm: c.should_use_symm_mem(t),
+ nullcontext(),
+ None, # no env variable needed
+ )
+ )
+
+ if self.symm_mem_comm_two_shot is not None:
+ comm = self.symm_mem_comm_two_shot
+ communicators.append(
+ (
+ "symm_mem_two_shot",
+ lambda t, c=comm: c.all_reduce(t),
+ lambda t, c=comm: c.should_use_symm_mem(t),
+ nullcontext(),
+ None, # no env variable needed
+ )
+ )
+
+ # Benchmark each communicator
+ for name, allreduce_fn, should_use_fn, context, env_var in communicators:
+ # Set environment variable if needed
+ if env_var is not None:
+ os.environ["VLLM_CUSTOM_ALLREDUCE_ALGO"] = env_var
+ else:
+ # Clear the environment variable to avoid interference
+ os.environ.pop("VLLM_CUSTOM_ALLREDUCE_ALGO", None)
+
+ latency = self.benchmark_allreduce_single(
+ sequence_length,
+ allreduce_fn,
+ should_use_fn,
+ context,
+ num_warmup,
+ num_trials,
+ )
+ if latency is not None:
+ results[name] = latency
+
+ return results
+
+ def benchmark_allreduce_single(
+ self,
+ sequence_length: int,
+ allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
+ should_use_fn: Callable[[torch.Tensor], bool],
+ context,
+ num_warmup: int,
+ num_trials: int,
+ ) -> Optional[float]:
+ """Benchmark method with CUDA graph optimization."""
+ try:
+ # Create test tensor (2D: sequence_length x hidden_size)
+ tensor = torch.randn(
+ sequence_length, HIDDEN_SIZE, dtype=BENCHMARK_DTYPE, device=self.device
+ )
+ if not should_use_fn(tensor):
+ return None
+
+ torch.cuda.synchronize()
+ stream = torch.cuda.Stream()
+ with torch.cuda.stream(stream):
+ graph_input = tensor.clone()
+
+ # Warmup before capture
+ for _ in range(3):
+ allreduce_fn(graph_input)
+
+ # Capture the graph using context manager
+ with context:
+ graph = torch.cuda.CUDAGraph()
+ graph_pool = torch.cuda.graph_pool_handle()
+ set_graph_pool_id(graph_pool)
+ with torch.cuda.graph(graph, pool=graph_pool):
+ for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
+ allreduce_fn(graph_input)
+
+ torch.cuda.synchronize()
+ for _ in range(num_warmup):
+ graph.replay()
+ torch.cuda.synchronize()
+
+ torch.cuda.synchronize()
+ start_time = time.perf_counter()
+
+ for _ in range(num_trials):
+ graph.replay()
+ torch.cuda.synchronize()
+
+ end_time = time.perf_counter()
+
+ # Convert to ms and divide by CUDA_GRAPH_CAPTURE_CYCLES
+ return (
+ (end_time - start_time) / num_trials / CUDA_GRAPH_CAPTURE_CYCLES * 1000
+ )
+
+ except Exception as e:
+ logger.error("CUDA graph benchmark failed: %s", e)
+ raise RuntimeError(
+ f"CUDA graph benchmark failed for communicator: {e}"
+ ) from e
+
+
+def _calculate_speedup_info(comm_results: dict[str, float]) -> str:
+ """Calculate speedup information for a single tensor size."""
+ if not comm_results:
+ return "N/A"
+
+ # Find the fastest communicator
+ fastest_comm = min(comm_results.keys(), key=lambda k: comm_results[k])
+ fastest_time = comm_results[fastest_comm]
+
+ # Calculate speedup vs PyNccl if available
+ if "pynccl" in comm_results:
+ pynccl_time = comm_results["pynccl"]
+ speedup = pynccl_time / fastest_time
+ return f"{fastest_comm} ({speedup:.2f}x)"
+ else:
+ return f"{fastest_comm} (N/A)"
+
+
+def print_results(
+ results: dict[str, dict[str, float]], sequence_lengths: list[int], world_size: int
+):
+ """Print benchmark results in a formatted table."""
+
+ print(f"\n{'=' * 130}")
+ print("Device Communicator Benchmark Results")
+ print(
+ f"World Size: {world_size}, Data Type: {BENCHMARK_DTYPE}, "
+ f"Hidden Size: {HIDDEN_SIZE}"
+ )
+ print(f"{'=' * 130}")
+
+ # Get all communicator names
+ all_comms = set()
+ for size_results in results.values():
+ all_comms.update(size_results.keys())
+
+ all_comms = sorted(list(all_comms))
+
+ # Print header
+ header = f"{'Tensor Shape':<20}{'Tensor Size':<15}"
+ for comm in all_comms:
+ header += f"{comm:<20}"
+ header += f"{'Best (Speedup vs PyNccl)':<30}"
+ print(header)
+ print("-" * len(header))
+
+ # Print results for each sequence length
+ for seq_len in sequence_lengths:
+ if seq_len in results:
+ # Calculate tensor size in elements and bytes
+ tensor_elements = seq_len * HIDDEN_SIZE
+ tensor_bytes = tensor_elements * BENCHMARK_DTYPE.itemsize
+
+ # Format tensor size (MB)
+ tensor_size_mb = tensor_bytes / (1024 * 1024)
+ tensor_size_str = f"{tensor_size_mb:.2f} MB"
+
+ # Format tensor shape
+ tensor_shape = f"({seq_len}, {HIDDEN_SIZE})"
+
+ row = f"{tensor_shape:<20}{tensor_size_str:<15}"
+ for comm in all_comms:
+ if comm in results[seq_len]:
+ row += f"{results[seq_len][comm]:<20.3f}"
+ else:
+ row += f"{'N/A':<20}"
+
+ # Calculate speedup information
+ speedup_info = _calculate_speedup_info(results[seq_len])
+ row += f"{speedup_info:<30}"
+
+ print(row)
+
+ print(f"{'=' * 130}")
+ print("All times are in milliseconds (ms) per allreduce operation")
+ print("Speedup column shows: fastest_algorithm (speedup_vs_pynccl)")
+
+
+def main():
+ parser = FlexibleArgumentParser(description="Benchmark device communicators")
+
+ parser.add_argument(
+ "--sequence-lengths",
+ type=int,
+ nargs="+",
+ default=DEFAULT_SEQUENCE_LENGTHS,
+ help="Sequence lengths to benchmark (tensor shape: seq_len x hidden_size)",
+ )
+
+ parser.add_argument(
+ "--num-warmup", type=int, default=5, help="Number of warmup iterations"
+ )
+
+ parser.add_argument(
+ "--num-trials", type=int, default=50, help="Number of benchmark trials"
+ )
+
+ parser.add_argument("--output-json", type=str, help="Output results to JSON file")
+
+ args = parser.parse_args()
+
+ # Initialize distributed
+ if not dist.is_initialized():
+ dist.init_process_group(backend="gloo")
+ rank = dist.get_rank()
+ world_size = dist.get_world_size()
+
+ # Set device
+ device = torch.device(f"cuda:{rank}")
+ torch.cuda.set_device(device)
+
+ # Get CPU process group
+ cpu_group = dist.new_group(backend="gloo")
+
+    # Disable VLLM_ALLREDUCE_USE_SYMM_MEM so it does not affect the max sizes
+    # used by symm_mem and custom_all_reduce during the benchmark
+ os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"
+
+ # Initialize benchmark
+ benchmark = CommunicatorBenchmark(
+ rank, world_size, device, cpu_group, args.sequence_lengths
+ )
+
+ # Run benchmarks
+ all_results = {}
+
+ for seq_len in args.sequence_lengths:
+ if rank == 0:
+ logger.info(
+ "Benchmarking sequence length: %s (tensor shape: %s x %s)",
+ seq_len,
+ seq_len,
+ HIDDEN_SIZE,
+ )
+
+ results = benchmark.benchmark_allreduce(
+ sequence_length=seq_len,
+ num_warmup=args.num_warmup,
+ num_trials=args.num_trials,
+ )
+
+ all_results[seq_len] = results
+
+ # Synchronize between ranks
+ dist.barrier()
+
+ # Print results (only rank 0)
+ if rank == 0:
+ print_results(all_results, args.sequence_lengths, world_size)
+
+ # Save to JSON if requested
+ if args.output_json:
+ # Add speedup information to results
+ enhanced_results = {}
+ for seq_len, comm_results in all_results.items():
+ enhanced_results[seq_len] = {
+ "timings": comm_results,
+ "speedup_info": _calculate_speedup_info(comm_results),
+ }
+
+ output_data = {
+ "world_size": world_size,
+ "dtype": str(BENCHMARK_DTYPE),
+ "hidden_size": HIDDEN_SIZE,
+ "sequence_lengths": args.sequence_lengths,
+ "num_warmup": args.num_warmup,
+ "num_trials": args.num_trials,
+ "cuda_graph_capture_cycles": CUDA_GRAPH_CAPTURE_CYCLES,
+ "results": enhanced_results,
+ }
+
+ with open(args.output_json, "w") as f:
+ json.dump(output_data, f, indent=2)
+
+ logger.info("Results saved to %s", args.output_json)
+
+ # Cleanup
+ if cpu_group != dist.group.WORLD:
+ dist.destroy_process_group(cpu_group)
+
+
+if __name__ == "__main__":
+ main()
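
The last column printed by print_results can be reproduced in isolation. The sketch below is a standalone restatement of the speedup logic shown in _calculate_speedup_info above, using illustrative communicator names and timings (in milliseconds):

def speedup_summary(comm_results: dict[str, float]) -> str:
    # Pick the fastest communicator and report its speedup relative to PyNccl
    # when a PyNccl timing is available.
    fastest_comm = min(comm_results, key=comm_results.get)
    fastest_time = comm_results[fastest_comm]
    if "pynccl" in comm_results:
        return f"{fastest_comm} ({comm_results['pynccl'] / fastest_time:.2f}x)"
    return f"{fastest_comm} (N/A)"

# Illustrative timings: the second communicator is twice as fast as PyNccl.
print(speedup_summary({"pynccl": 1.80, "custom_all_reduce": 0.90}))
# -> custom_all_reduce (2.00x)
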
diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
index a6b42406b5cb0..14330ae6f03c5 100644
--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@@ -7,6 +7,7 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE
from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_experts,
@@ -96,6 +97,11 @@ def bench_run(
a_scale: torch.Tensor,
num_repeats: int,
):
+ quant_config = fp8_w8a8_moe_quant_config(
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a_scale,
+ )
for _ in range(num_repeats):
fused_experts(
a,
@@ -103,10 +109,7 @@ def bench_run(
w2,
topk_weights,
topk_ids,
- use_fp8_w8a8=True,
- w1_scale=w1_scale,
- w2_scale=w2_scale,
- a1_scale=a_scale,
+ quant_config=quant_config,
)
def run_cutlass_moe(
@@ -125,6 +128,12 @@ def bench_run(
per_act_token: bool,
num_repeats: int,
):
+ quant_config = fp8_w8a8_moe_quant_config(
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ per_act_token_quant=per_act_token,
+ )
+
for _ in range(num_repeats):
cutlass_moe_fp8(
a,
@@ -132,14 +141,11 @@ def bench_run(
w2,
topk_weights,
topk_ids,
- w1_scale,
- w2_scale,
ab_strides1,
ab_strides2,
c_strides1,
c_strides2,
- per_act_token,
- a1_scale=None,
+ quant_config=quant_config,
)
def run_cutlass_from_graph(
@@ -156,6 +162,12 @@ def bench_run(
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
):
+ quant_config = fp8_w8a8_moe_quant_config(
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ per_act_token_quant=per_act_token,
+ )
+
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
@@ -165,14 +177,11 @@ def bench_run(
w2_q,
topk_weights,
topk_ids,
- w1_scale,
- w2_scale,
ab_strides1,
ab_strides2,
c_strides1,
c_strides2,
- per_act_token,
- a1_scale=None,
+ quant_config=quant_config,
)
def run_triton_from_graph(
@@ -185,6 +194,11 @@ def bench_run(
w2_scale: torch.Tensor,
a_scale: torch.Tensor,
):
+ quant_config = fp8_w8a8_moe_quant_config(
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a_scale,
+ )
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
@@ -194,10 +208,7 @@ def bench_run(
w2,
topk_weights,
topk_ids,
- use_fp8_w8a8=True,
- w1_scale=w1_scale,
- w2_scale=w2_scale,
- a1_scale=a_scale,
+ quant_config=quant_config,
)
def replay_graph(graph, num_repeats):
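
The hunks above all follow the same pattern: the fp8 scales that used to be passed as individual keyword arguments are now bundled into a single quant config. A minimal construction sketch, with placeholder scale tensors (real ones come from the quantized MoE weights):

import torch

from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config

# Placeholder scales, for illustration only.
w1_scale = torch.ones(1, dtype=torch.float32)
w2_scale = torch.ones(1, dtype=torch.float32)
a_scale = torch.ones(1, dtype=torch.float32)

quant_config = fp8_w8a8_moe_quant_config(
    w1_scale=w1_scale,
    w2_scale=w2_scale,
    a1_scale=a_scale,
)
# The config is then forwarded as quant_config=... to fused_experts or
# cutlass_moe_fp8 instead of the old use_fp8_w8a8/w1_scale/w2_scale kwargs.
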
diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py
index 3d38d4b3534e8..799b16999873f 100644
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -79,9 +79,9 @@ def make_rand_lora_weight_tensor(
def make_rand_tensors(
- a_shape: tuple[int],
- b_shape: tuple[int],
- c_shape: tuple[int],
+ a_shape: tuple[int, ...],
+ b_shape: tuple[int, ...],
+ c_shape: tuple[int, ...],
a_dtype: torch.dtype,
b_dtype: torch.dtype,
c_dtype: torch.dtype,
@@ -243,7 +243,7 @@ class OpType(Enum):
lora_rank: int,
num_loras: int,
num_slices: int,
- ) -> tuple[tuple[int], tuple[int], tuple[int]]:
+ ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
"""
Given num_slices, return the shapes of the A, B, and C matrices
in A x B = C, for the op_type
@@ -464,7 +464,11 @@ class BenchmarkTensors:
for field_name in LoRAKernelMeta.__dataclass_fields__:
field = getattr(self.lora_kernel_meta, field_name)
assert isinstance(field, torch.Tensor)
- setattr(self.lora_kernel_meta, field_name, to_device(field))
+ setattr(
+ self.lora_kernel_meta,
+ field_name,
+ to_device(field) if field_name != "no_lora_flag_cpu" else field,
+ )
def metadata(self) -> tuple[int, int, int]:
"""
@@ -512,6 +516,7 @@ class BenchmarkTensors:
"lora_token_start_loc": self.lora_kernel_meta.lora_token_start_loc,
"lora_ids": self.lora_kernel_meta.active_lora_ids,
"scaling": 1.0,
+ "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
}
def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
@@ -552,6 +557,7 @@ class BenchmarkTensors:
"lora_ids": self.lora_kernel_meta.active_lora_ids,
"offset_start": 0,
"add_inputs": add_inputs,
+ "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
}
def bench_fn_kwargs(
@@ -637,7 +643,7 @@ def bench_optype(
# Clear LoRA optimization hash-maps.
_LORA_A_PTR_DICT.clear()
_LORA_B_PTR_DICT.clear()
- # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are setup
+ # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
for kwargs in kwargs_list:
op_type.bench_fn()(**kwargs)
torch.cuda.synchronize()
diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py
index a9c4d30d9b189..1b1c3b321cce4 100644
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -284,6 +284,25 @@ def machete_create_bench_fn(
)
+def cutlass_w4a8_create_bench_fn(
+    bt: BenchmarkTensors, out_type: torch.dtype, schedule=None
+) -> Callable:
+ w_q = bt.w_q.t().contiguous().t() # make col major
+ w_q = ops.cutlass_encode_and_reorder_int4b(w_q)
+ # expects fp8 scales
+ w_s = ops.cutlass_pack_scale_fp8(bt.w_g_s.to(torch.float8_e4m3fn))
+
+ return lambda: ops.cutlass_w4a8_mm(
+ a=bt.a,
+ b_q=w_q,
+ b_group_scales=w_s,
+ b_group_size=bt.group_size,
+ b_channel_scales=bt.w_ch_s,
+ a_token_scales=bt.w_tok_s,
+ maybe_schedule=schedule,
+ )
+
+
# impl
# bench
@@ -385,6 +404,20 @@ def bench(
)
)
+ # cutlass w4a8
+ if types.act_type == torch.float8_e4m3fn and group_size == 128:
+ timers.append(
+ bench_fns(
+ label,
+ sub_label,
+ f"cutlass w4a8 ({name_type_string})",
+ [
+ cutlass_w4a8_create_bench_fn(bt, out_type=types.output_type)
+ for bt in benchmark_tensors
+ ],
+ )
+ )
+
if sweep_schedules:
global _SWEEP_SCHEDULES_RESULTS
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 752c2d0082167..d3040e9738f7b 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -14,6 +14,10 @@ import ray
import torch
from ray.experimental.tqdm_ray import tqdm
+from vllm.model_executor.layers.fused_moe.config import (
+ FusedMoEQuantConfig,
+ _get_config_dtype_str,
+)
from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config
@@ -134,43 +138,36 @@ def benchmark_config(
def run():
from vllm.model_executor.layers.fused_moe import override_config
+ if use_fp8_w8a8:
+ quant_dtype = torch.float8_e4m3fn
+ elif use_int8_w8a16:
+ quant_dtype = torch.int8
+ else:
+ quant_dtype = None
+
+ quant_config = FusedMoEQuantConfig.make(
+ quant_dtype=quant_dtype,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a1_scale,
+ a2_scale=a2_scale,
+ block_shape=block_quant_shape,
+ )
+
with override_config(config):
- if use_deep_gemm:
- topk_weights, topk_ids, token_expert_indices = fused_topk(
- x, input_gating, topk, False
- )
- return fused_experts(
- x,
- w1,
- w2,
- topk_weights,
- topk_ids,
- inplace=True,
- use_fp8_w8a8=use_fp8_w8a8,
- w1_scale=w1_scale,
- w2_scale=w2_scale,
- a1_scale=a1_scale,
- a2_scale=a2_scale,
- block_shape=block_quant_shape,
- allow_deep_gemm=True,
- )
- else:
- fused_moe(
- x,
- w1,
- w2,
- input_gating,
- topk,
- renormalize=True,
- inplace=True,
- use_fp8_w8a8=use_fp8_w8a8,
- use_int8_w8a16=use_int8_w8a16,
- w1_scale=w1_scale,
- w2_scale=w2_scale,
- a1_scale=a1_scale,
- a2_scale=a2_scale,
- block_shape=block_quant_shape,
- )
+ topk_weights, topk_ids, token_expert_indices = fused_topk(
+ x, input_gating, topk, renormalize=not use_deep_gemm
+ )
+ return fused_experts(
+ x,
+ w1,
+ w2,
+ topk_weights,
+ topk_ids,
+ inplace=True,
+ quant_config=quant_config,
+ allow_deep_gemm=use_deep_gemm,
+ )
# JIT compilation & warmup
run()
@@ -414,13 +411,15 @@ class BenchmarkWorker:
use_deep_gemm: bool = False,
) -> tuple[dict[str, int], float]:
current_platform.seed_everything(self.seed)
- dtype_str = get_config_dtype_str(
+ dtype_str = _get_config_dtype_str(
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
)
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
# is the intermediate size after silu_and_mul.
+ block_n = block_quant_shape[0] if block_quant_shape else None
+ block_k = block_quant_shape[1] if block_quant_shape else None
op_config = get_moe_configs(
- num_experts, shard_intermediate_size // 2, dtype_str
+ num_experts, shard_intermediate_size // 2, dtype_str, block_n, block_k
)
if op_config is None:
config = get_default_config(
@@ -430,6 +429,7 @@ class BenchmarkWorker:
hidden_size,
topk,
dtype_str,
+ block_quant_shape,
)
else:
config = op_config[min(op_config.keys(), key=lambda x: abs(x - num_tokens))]
@@ -544,7 +544,7 @@ def save_configs(
block_quant_shape: list[int],
save_dir: str,
) -> None:
- dtype_str = get_config_dtype_str(
+ dtype_str = _get_config_dtype_str(
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
)
@@ -557,7 +557,7 @@ def save_configs(
filename = os.path.join(save_dir, filename)
print(f"Writing best config to {filename}...")
with open(filename, "w") as f:
- json.dump(configs, f, indent=4)
+ json.dump({"triton_version": triton.__version__, **configs}, f, indent=4)
f.write("\n")
@@ -579,26 +579,42 @@ def main(args: argparse.Namespace):
E = config.ffn_config.moe_num_experts
topk = config.ffn_config.moe_top_k
intermediate_size = config.ffn_config.ffn_hidden_size
+ hidden_size = config.hidden_size
elif config.architectures[0] == "JambaForCausalLM":
E = config.num_experts
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
+ hidden_size = config.hidden_size
elif config.architectures[0] in (
- "DeepseekV3ForCausalLM",
"DeepseekV2ForCausalLM",
+ "DeepseekV3ForCausalLM",
+ "DeepseekV32ForCausalLM",
"Glm4MoeForCausalLM",
):
E = config.n_routed_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
- elif config.architectures[0] in ("Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"):
+ hidden_size = config.hidden_size
+ elif config.architectures[0] in (
+ "Qwen2MoeForCausalLM",
+ "Qwen3MoeForCausalLM",
+ "Qwen3NextForCausalLM",
+ ):
E = config.num_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
+ hidden_size = config.hidden_size
+ elif config.architectures[0] == "Qwen3VLMoeForConditionalGeneration":
+ text_config = config.get_text_config()
+ E = text_config.num_experts
+ topk = text_config.num_experts_per_tok
+ intermediate_size = text_config.moe_intermediate_size
+ hidden_size = text_config.hidden_size
elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
E = config.num_experts
topk = config.moe_topk[0]
intermediate_size = config.moe_intermediate_size[0]
+ hidden_size = config.hidden_size
else:
# Support for llama4
config = config.get_text_config()
@@ -606,6 +622,7 @@ def main(args: argparse.Namespace):
E = config.num_local_experts
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
+ hidden_size = config.hidden_size
enable_ep = bool(args.enable_expert_parallel)
if enable_ep:
ensure_divisibility(E, args.tp_size, "Number of experts")
@@ -614,7 +631,6 @@ def main(args: argparse.Namespace):
else:
ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
shard_intermediate_size = 2 * intermediate_size // args.tp_size
- hidden_size = config.hidden_size
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
@@ -675,7 +691,11 @@ def main(args: argparse.Namespace):
is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
search_space = get_configs_compute_bound(is_fp16, block_quant_shape)
print(f"Start tuning over {len(search_space)} configurations...")
-
+ if use_deep_gemm:
+ raise ValueError(
+ "Tuning with --use-deep-gemm is not supported as it only tunes Triton "
+ "kernels. Please remove the flag."
+ )
start = time.time()
configs = _distribute(
"tune",
diff --git a/benchmarks/kernels/benchmark_polynorm.py b/benchmarks/kernels/benchmark_polynorm.py
new file mode 100644
index 0000000000000..9ac8f5e6594e4
--- /dev/null
+++ b/benchmarks/kernels/benchmark_polynorm.py
@@ -0,0 +1,155 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import itertools
+
+import torch
+
+from vllm import _custom_ops as vllm_ops
+from vllm.triton_utils import triton
+
+
+def polynorm_naive(
+ x: torch.Tensor,
+ weight: torch.Tensor,
+ bias: torch.Tensor,
+ eps: float = 1e-6,
+):
+ orig_shape = x.shape
+ x = x.view(-1, x.shape[-1])
+
+ def norm(x, eps: float):
+ return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
+
+ x = x.float()
+ return (
+ (
+ weight[0] * norm(x**3, eps)
+ + weight[1] * norm(x**2, eps)
+ + weight[2] * norm(x, eps)
+ + bias
+ )
+ .to(weight.dtype)
+ .view(orig_shape)
+ )
+
+
+def polynorm_vllm(
+ x: torch.Tensor,
+ weight: torch.Tensor,
+ bias: torch.Tensor,
+ eps: float = 1e-6,
+):
+ orig_shape = x.shape
+ x = x.view(-1, x.shape[-1])
+
+ out = torch.empty_like(x)
+ vllm_ops.poly_norm(out, x, weight, bias, eps)
+ output = out
+
+ output = output.view(orig_shape)
+ return output
+
+
+def calculate_diff(batch_size, seq_len, hidden_dim):
+ dtype = torch.bfloat16
+ x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
+ weight = torch.ones(3, dtype=dtype, device="cuda")
+ bias = torch.ones(1, dtype=dtype, device="cuda")
+
+ output_naive = polynorm_naive(x, weight, bias)
+ output_vllm = polynorm_vllm(x, weight, bias)
+
+ if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2):
+ print("✅ All implementations match")
+ else:
+ print("❌ Implementations differ")
+
+
+batch_size_range = [2**i for i in range(0, 7, 2)]
+seq_length_range = [2**i for i in range(6, 11, 1)]
+dim_range = [2048, 4096]
+configs = list(itertools.product(dim_range, batch_size_range, seq_length_range))
+
+
+def get_benchmark():
+ @triton.testing.perf_report(
+ triton.testing.Benchmark(
+ x_names=["dim", "batch_size", "seq_len"],
+ x_vals=[list(_) for _ in configs],
+ line_arg="provider",
+ line_vals=["naive", "vllm"],
+ line_names=["Naive", "vLLM"],
+ styles=[("blue", "-"), ("red", "-")],
+ ylabel="us",
+ plot_name="polynorm-perf",
+ args={},
+ )
+ )
+ def benchmark(dim, batch_size, seq_len, provider):
+ dtype = torch.bfloat16
+ hidden_dim = dim * 4
+
+ x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
+ weight = torch.ones(3, dtype=dtype, device="cuda")
+ bias = torch.ones(1, dtype=dtype, device="cuda")
+
+ quantiles = [0.5, 0.2, 0.8]
+
+ if provider == "naive":
+ ms, min_ms, max_ms = triton.testing.do_bench(
+ lambda: polynorm_naive(x, weight, bias),
+ quantiles=quantiles,
+ )
+ else:
+ ms, min_ms, max_ms = triton.testing.do_bench(
+ lambda: polynorm_vllm(x, weight, bias),
+ quantiles=quantiles,
+ )
+
+ return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+ return benchmark
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--batch-size",
+ type=int,
+ default=4,
+ help="Batch size",
+ )
+ parser.add_argument(
+ "--seq-len",
+ type=int,
+ default=128,
+ help="Sequence length",
+ )
+ parser.add_argument(
+ "--hidden-dim",
+ type=int,
+ default=8192,
+ help="Intermediate size of MLP",
+ )
+ parser.add_argument(
+ "--save-path",
+ type=str,
+ default="./configs/polnorm/",
+ help="Path to save polnorm benchmark results",
+ )
+
+ args = parser.parse_args()
+
+ # Run correctness test
+ calculate_diff(
+ batch_size=args.batch_size,
+ seq_len=args.seq_len,
+ hidden_dim=args.hidden_dim,
+ )
+
+ benchmark = get_benchmark()
+ # Run performance benchmark
+ benchmark.run(print_data=True, save_path=args.save_path)
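
For reference, the polynomial-norm formula exercised by this benchmark can be checked on CPU in a few lines; this mirrors polynorm_naive above and is only a sketch (tensor shapes are illustrative):

import torch

def rmsnorm(t: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    return t / torch.sqrt(t.pow(2).mean(-1, keepdim=True) + eps)

x = torch.randn(2, 8)
weight = torch.tensor([0.5, 0.25, 1.0])
bias = torch.tensor(0.1)

# out = w0 * rmsnorm(x**3) + w1 * rmsnorm(x**2) + w2 * rmsnorm(x) + bias
out = (
    weight[0] * rmsnorm(x**3)
    + weight[1] * rmsnorm(x**2)
    + weight[2] * rmsnorm(x)
    + bias
)
print(out.shape)  # torch.Size([2, 8])
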
diff --git a/benchmarks/kernels/benchmark_reshape_and_cache.py b/benchmarks/kernels/benchmark_reshape_and_cache.py
new file mode 100644
index 0000000000000..af9841daadf24
--- /dev/null
+++ b/benchmarks/kernels/benchmark_reshape_and_cache.py
@@ -0,0 +1,174 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+import random
+import time
+
+import torch
+from tabulate import tabulate
+
+from vllm import _custom_ops as ops
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.utils import (
+ STR_DTYPE_TO_TORCH_DTYPE,
+ FlexibleArgumentParser,
+ create_kv_caches_with_random,
+)
+
+logger = init_logger(__name__)
+
+
+@torch.inference_mode()
+def run_benchmark(
+ num_tokens: int,
+ num_heads: int,
+ head_size: int,
+ block_size: int,
+ num_blocks: int,
+ dtype: torch.dtype,
+ kv_cache_dtype: str,
+ num_iters: int,
+ benchmark_mode: str,
+ device: str = "cuda",
+) -> float:
+ """Return latency (seconds) for given num_tokens."""
+
+ if kv_cache_dtype == "fp8" and head_size % 16:
+ raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
+
+ current_platform.seed_everything(42)
+ torch.set_default_device(device)
+
+ # create random key / value tensors [T, H, D].
+ key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device)
+ value = torch.randn_like(key)
+
+ # prepare the slot mapping.
+ # each token is assigned a unique slot in the KV-cache.
+ num_slots = block_size * num_blocks
+ if num_tokens > num_slots:
+ raise ValueError("num_tokens cannot exceed the total number of cache slots")
+ slot_mapping_lst = random.sample(range(num_slots), num_tokens)
+ slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)
+
+ key_caches, value_caches = create_kv_caches_with_random(
+ num_blocks,
+ block_size,
+ 1, # num_layers
+ num_heads,
+ head_size,
+ kv_cache_dtype,
+ dtype,
+ device=device,
+ )
+ key_cache, value_cache = key_caches[0], value_caches[0]
+ # to free unused memory
+ del key_caches, value_caches
+
+ # compute per-kernel scaling factors for fp8 conversion (if used).
+ k_scale = (key.amax() / 64.0).to(torch.float32)
+ v_scale = (value.amax() / 64.0).to(torch.float32)
+
+ function_under_test = lambda: ops.reshape_and_cache(
+ key, # noqa: F821
+ value, # noqa: F821
+ key_cache, # noqa: F821
+ value_cache, # noqa: F821
+ slot_mapping, # noqa: F821
+ kv_cache_dtype,
+ k_scale,
+ v_scale,
+ )
+
+ if benchmark_mode == "cudagraph":
+ g = torch.cuda.CUDAGraph()
+ with torch.cuda.graph(g):
+ function_under_test()
+ torch.cuda.synchronize()
+ function_under_test = lambda: g.replay()
+
+ def run_cuda_benchmark(n_iters: int) -> float:
+ nonlocal key, value, key_cache, value_cache, slot_mapping
+ torch.cuda.synchronize()
+ start = time.perf_counter()
+ for _ in range(n_iters):
+ function_under_test()
+ torch.cuda.synchronize()
+ end = time.perf_counter()
+ return (end - start) / n_iters
+
+ # warm-up
+ run_cuda_benchmark(3)
+
+ lat = run_cuda_benchmark(num_iters)
+
+ # free tensors to mitigate OOM when sweeping
+ del key, value, key_cache, value_cache, slot_mapping
+ torch.cuda.empty_cache()
+
+ return lat
+
+
+def main(args):
+ rows = []
+ for exp in range(1, 17):
+ n_tok = 2**exp
+ lat = run_benchmark(
+ num_tokens=n_tok,
+ num_heads=args.num_heads,
+ head_size=args.head_size,
+ block_size=args.block_size,
+ num_blocks=args.num_blocks,
+ dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
+ kv_cache_dtype=args.kv_cache_dtype,
+ num_iters=args.iters,
+ benchmark_mode=args.mode,
+ device="cuda",
+ )
+ rows.append([n_tok, lat * 1e6]) # convert to microseconds
+
+ print(f"Benchmark results for implementation cuda (measuring with {args.mode}):")
+ print(tabulate(rows, headers=["num_tokens", "latency (µs)"], floatfmt=".3f"))
+
+
+if __name__ == "__main__":
+ parser = FlexibleArgumentParser()
+
+ parser.add_argument("--num-heads", type=int, default=128)
+ parser.add_argument(
+ "--head-size",
+ type=int,
+ choices=[64, 80, 96, 112, 120, 128, 192, 256],
+ default=128,
+ )
+ parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
+ parser.add_argument("--num-blocks", type=int, default=128 * 128)
+
+ parser.add_argument(
+ "--dtype",
+ type=str,
+ choices=["half", "bfloat16", "float"],
+ default="bfloat16",
+ )
+
+ parser.add_argument(
+ "--kv-cache-dtype",
+ type=str,
+ choices=["auto", "fp8"],
+ default="auto",
+ )
+
+ parser.add_argument("--iters", type=int, default=200)
+
+ parser.add_argument(
+ "--mode",
+ type=str,
+ choices=["cudagraph", "no_graph"],
+ default="cudagraph",
+ )
+
+ args = parser.parse_args()
+
+ main(args)
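
The "cudagraph" mode above captures the kernel launch once and then times graph replays, which removes per-launch CPU overhead from the measurement. A minimal, self-contained sketch of that capture-then-replay timing pattern (the matmul is just a stand-in for the op under test):

import torch

def time_with_cuda_graph(fn, iters: int = 100) -> float:
    """Capture fn into a CUDA graph once, then time replays (ms per iteration)."""
    g = torch.cuda.CUDAGraph()
    with torch.cuda.graph(g):
        fn()
    torch.cuda.synchronize()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iters):
        g.replay()
    end.record()
    end.synchronize()
    return start.elapsed_time(end) / iters

if __name__ == "__main__":
    a = torch.randn(1024, 1024, device="cuda")
    b = torch.randn(1024, 1024, device="cuda")
    out = torch.empty(1024, 1024, device="cuda")
    op = lambda: torch.matmul(a, b, out=out)
    op()  # warm up and allocate outputs before capture
    torch.cuda.synchronize()
    print(f"{time_with_cuda_graph(op):.4f} ms per replay")
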
diff --git a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
index d4648c18f31d5..0aace571064a0 100644
--- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
@@ -9,6 +9,9 @@ import torch
from tabulate import tabulate
from vllm import _custom_ops as ops
+from vllm.attention.ops.triton_reshape_and_cache_flash import (
+ triton_reshape_and_cache_flash,
+)
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
@@ -31,6 +34,8 @@ def run_benchmark(
kv_cache_dtype: str,
kv_cache_layout: str,
num_iters: int,
+ implementation: str,
+ benchmark_mode: str,
device: str = "cuda",
) -> float:
"""Return latency (seconds) for given num_tokens."""
@@ -38,6 +43,14 @@ def run_benchmark(
if kv_cache_dtype == "fp8" and head_size % 16:
raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
+ if implementation not in ("cuda", "triton"):
+ raise ValueError(
+ f"Unsupported implementation: {implementation}. "
+ "Only 'cuda' and 'triton' are supported."
+ )
+ if implementation == "triton" and kv_cache_layout == "HND":
+ return float("nan") # Triton does not support HND layout yet.
+
current_platform.seed_everything(42)
torch.set_default_device(device)
@@ -65,27 +78,49 @@ def run_benchmark(
cache_layout=kv_cache_layout,
)
key_cache, value_cache = key_caches[0], value_caches[0]
+ # to free unused memory
+ del key_caches, value_caches
# compute per-kernel scaling factors for fp8 conversion (if used).
k_scale = (key.amax() / 64.0).to(torch.float32)
v_scale = (value.amax() / 64.0).to(torch.float32)
+ if implementation == "cuda":
+ function_under_test = lambda: ops.reshape_and_cache_flash(
+ key, # noqa: F821
+ value, # noqa: F821
+ key_cache, # noqa: F821
+ value_cache, # noqa: F821
+ slot_mapping, # noqa: F821
+ kv_cache_dtype,
+ k_scale,
+ v_scale,
+ )
+ else:
+ function_under_test = lambda: triton_reshape_and_cache_flash(
+ key, # noqa: F821
+ value, # noqa: F821
+ key_cache, # noqa: F821
+ value_cache, # noqa: F821
+ slot_mapping, # noqa: F821
+ kv_cache_dtype,
+ k_scale,
+ v_scale,
+ )
+ if benchmark_mode == "cudagraph":
+ g = torch.cuda.CUDAGraph()
+ with torch.cuda.graph(g):
+ function_under_test()
+ torch.cuda.synchronize()
+ function_under_test = lambda: g.replay()
+
def run_cuda_benchmark(n_iters: int) -> float:
nonlocal key, value, key_cache, value_cache, slot_mapping
torch.cuda.synchronize()
start = time.perf_counter()
for _ in range(n_iters):
- ops.reshape_and_cache_flash(
- key,
- value,
- key_cache,
- value_cache,
- slot_mapping,
- kv_cache_dtype,
- k_scale,
- v_scale,
- )
- torch.cuda.synchronize()
+ function_under_test()
+ torch.cuda.synchronize()
end = time.perf_counter()
return (end - start) / n_iters
@@ -116,10 +151,16 @@ def main(args):
kv_cache_dtype=args.kv_cache_dtype,
kv_cache_layout=layout,
num_iters=args.iters,
+ implementation=args.implementation,
+ benchmark_mode=args.mode,
device="cuda",
)
rows.append([n_tok, layout, f"{lat * 1e6:.3f}"])
+ print(
+ f"Benchmark results for implementation {args.implementation}"
+ f" (measuring with {args.mode}):"
+ )
print(tabulate(rows, headers=["num_tokens", "layout", "latency (µs)"]))
@@ -151,6 +192,21 @@ if __name__ == "__main__":
)
parser.add_argument("--iters", type=int, default=100)
+
+ parser.add_argument(
+ "--implementation",
+ type=str,
+ choices=["cuda", "triton"],
+ default="cuda",
+ )
+
+ parser.add_argument(
+ "--mode",
+ type=str,
+ choices=["cudagraph", "no_graph"],
+ default="cudagraph",
+ )
+
args = parser.parse_args()
main(args)
diff --git a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
index 0650cbf3cc18e..a5887aafd30d6 100644
--- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
@@ -1,77 +1,720 @@
-#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import time
+"""
+Comprehensive SiLU Benchmark Suite
+
+This benchmark compares two SiLU implementations:
+1. SiLU V2 (CUDA) - Optimized CUDA kernel implementation
+2. Triton Kernel - Triton-based implementation
+
+The suite generates detailed performance comparisons including:
+- Memory bandwidth utilization
+- Speedup ratios (baseline vs optimized implementations)
+- Performance across different expert configurations and token distributions
+"""
+
+from collections.abc import Callable
+
+import matplotlib.pyplot as plt
+import numpy as np
import torch
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
- silu_mul_fp8_quant_deep_gemm,
+ persistent_masked_m_silu_mul_quant,
)
from vllm.platforms import current_platform
+from vllm.triton_utils import tl, triton
+from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
-def benchmark(E, T, H, G=128, runs=50):
- current_platform.seed_everything(42)
- y = torch.randn((E, T, 2 * H), dtype=torch.bfloat16, device="cuda")
- tokens_per_expert = torch.randint(
- T // 2, T, size=(E,), dtype=torch.int32, device="cuda"
+@triton.jit
+def _silu_mul_fp8_quant_deep_gemm(
+ # Pointers ------------------------------------------------------------
+ input_ptr, # 16-bit activations (E, T, 2*H)
+ y_q_ptr, # fp8 quantized activations (E, T, H)
+    y_s_ptr,  # fp32 scales (E, T, G)
+ counts_ptr, # int32 num tokens per expert (E)
+ # Sizes ---------------------------------------------------------------
+ H: tl.constexpr, # hidden dimension (per output)
+ GROUP_SIZE: tl.constexpr, # elements per group (usually 128)
+ # Strides for input (elements) ---------------------------------------
+ stride_i_e,
+ stride_i_t,
+ stride_i_h,
+ # Strides for y_q (elements) -----------------------------------------
+ stride_yq_e,
+ stride_yq_t,
+ stride_yq_h,
+ # Strides for y_s (elements) -----------------------------------------
+ stride_ys_e,
+ stride_ys_t,
+ stride_ys_g,
+ # Stride for counts (elements)
+ stride_counts_e,
+ # Numeric params ------------------------------------------------------
+ eps: tl.constexpr,
+ fp8_min: tl.constexpr,
+ fp8_max: tl.constexpr,
+ use_ue8m0: tl.constexpr,
+ # Meta ---------------------------------------------------------------
+ BLOCK: tl.constexpr,
+ NUM_STAGES: tl.constexpr,
+):
+ G = H // GROUP_SIZE
+
+ # map program id -> (e, g)
+ pid = tl.program_id(0)
+ e = pid // G
+ g = pid % G
+
+ e = e.to(tl.int64)
+ g = g.to(tl.int64)
+
+ # number of valid tokens for this expert
+ n_tokens = tl.load(counts_ptr + e * stride_counts_e).to(tl.int64)
+
+ cols = tl.arange(0, BLOCK).to(tl.int64)
+ mask = cols < BLOCK
+
+ base_input_offset = e * stride_i_e + g * GROUP_SIZE * stride_i_h
+ base_gate_offset = base_input_offset + cols * stride_i_h
+ base_up_offset = base_input_offset + H * stride_i_h + cols * stride_i_h
+ base_yq_offset = e * stride_yq_e + g * GROUP_SIZE * stride_yq_h + cols * stride_yq_h
+ base_ys_offset = e * stride_ys_e + g * stride_ys_g
+
+ for t in tl.range(0, n_tokens, num_stages=NUM_STAGES):
+ gate = tl.load(
+ input_ptr + base_gate_offset + t * stride_i_t, mask=mask, other=0.0
+ ).to(tl.float32)
+ up = tl.load(input_ptr + base_up_offset + t * stride_i_t, mask=mask, other=0.0)
+
+ gate = gate * (1.0 / (1.0 + tl.exp(-gate)))
+ y = gate * up
+
+ y_s = tl.maximum(tl.max(tl.abs(y)), eps) / fp8_max
+ if use_ue8m0:
+ y_s = tl.exp2(tl.ceil(tl.log2(y_s)))
+
+ y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)
+
+ tl.store(y_q_ptr + base_yq_offset + t * stride_yq_t, y_q, mask=mask)
+ tl.store(y_s_ptr + base_ys_offset + t * stride_ys_t, y_s)
+
+
+def silu_mul_fp8_quant_deep_gemm_triton(
+ y: torch.Tensor, # (E, T, 2*H)
+ tokens_per_expert: torch.Tensor, # (E,) number of valid tokens per expert
+ num_parallel_tokens,
+ group_size: int = 128,
+ eps: float = 1e-10,
+ expert_offsets: torch.Tensor = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
+ """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales
+
+ y has shape (E, T, 2*H). The first half of the last dimension is
+ silu-activated, multiplied by the second half, then quantized into FP8.
+
+ Returns `(y_q, y_s)` where
+ * `y_q`: FP8 tensor, shape (E, T, H), same layout as y[..., :H]
+ * `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T)
+ """
+ assert y.ndim == 3, "y must be (E, T, 2*H)"
+ E, T, H2 = y.shape
+ assert H2 % 2 == 0, "last dim of y must be even (2*H)"
+ H = H2 // 2
+ G = (H + group_size - 1) // group_size
+ assert H % group_size == 0, "H must be divisible by group_size"
+ assert tokens_per_expert.ndim == 1 and tokens_per_expert.shape[0] == E, (
+ "tokens_per_expert must be shape (E,)"
+ )
+ tokens_per_expert = tokens_per_expert.to(device=y.device, dtype=torch.int32)
+
+ # allocate outputs
+ fp8_dtype = torch.float8_e4m3fn
+ y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device)
+
+ # strides (elements)
+ stride_i_e, stride_i_t, stride_i_h = y.stride()
+ stride_yq_e, stride_yq_t, stride_yq_h = y_q.stride()
+
+ # desired scale strides (elements): (T*G, 1, T)
+ stride_ys_e = T * G
+ stride_ys_t = 1
+ stride_ys_g = T
+ y_s = torch.empty_strided(
+ (E, T, G),
+ (stride_ys_e, stride_ys_t, stride_ys_g),
+ dtype=torch.float32,
+ device=y.device,
)
+ stride_cnt_e = tokens_per_expert.stride()[0]
+
+ # Static grid over experts and H-groups.
+ # A loop inside the kernel handles the token dim
+ grid = (E * G,)
+
+ f_info = torch.finfo(fp8_dtype)
+ fp8_max = f_info.max
+ fp8_min = f_info.min
+
+ _silu_mul_fp8_quant_deep_gemm[grid](
+ y,
+ y_q,
+ y_s,
+ tokens_per_expert,
+ H,
+ group_size,
+ stride_i_e,
+ stride_i_t,
+ stride_i_h,
+ stride_yq_e,
+ stride_yq_t,
+ stride_yq_h,
+ stride_ys_e,
+ stride_ys_t,
+ stride_ys_g,
+ stride_cnt_e,
+ eps,
+ fp8_min,
+ fp8_max,
+ is_deep_gemm_e8m0_used(),
+ BLOCK=group_size,
+ NUM_STAGES=4,
+ num_warps=1,
+ )
+
+ return y_q, y_s
+
+
+# Token-distribution strategies to benchmark
+strategies = ["random_imbalanced", "uniform", "max_t"]
+
+
+def benchmark(
+ kernel: Callable,
+ E: int,
+ T: int,
+ H: int,
+ total_tokens: int,
+ num_parallel_tokens: int = 64,
+ G: int = 128,
+ runs: int = 200,
+ num_warmups: int = 20,
+ gen_strategy: str = "default",
+ iterations_per_run: int = 20,
+):
+ def generate_data(seed_offset=0):
+ """Generate input data with given seed offset"""
+ current_platform.seed_everything(42 + seed_offset)
+ y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()
+
+ if gen_strategy == "random_imbalanced":
+
+ def generate_expert_loads(n_e, total_tokens, ratio, device="cuda"):
+ mean = total_tokens // n_e
+ min_max = mean // ratio
+            e = torch.ones(size=(n_e,), dtype=torch.int64, device=device) * mean
+            e[0] = min_max
+            r = torch.rand(size=(n_e - 1,))
+ r /= r.sum()
+ r *= total_tokens - min_max
+ r = r.round().long()
+ e[1:] = r.to(device=device)
+ return e
+
+ tokens_per_expert = generate_expert_loads(E, total_tokens, 0.7, "cuda")
+ elif gen_strategy == "uniform":
+ r = torch.rand(size=(E,))
+ r /= r.sum()
+ r *= total_tokens
+ r = r.round().long()
+ tokens_per_expert = r
+ elif gen_strategy == "max_t":
+ tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda")
+        tokens_per_expert.fill_(total_tokens // E)
+ elif gen_strategy == "first_t":
+ tokens_per_expert = torch.zeros(size=(E,), dtype=torch.int32, device="cuda")
+ tokens_per_expert[0] = min(T, total_tokens)
+ else:
+ raise ValueError(f"Unknown generation strategy: {gen_strategy}")
+ return y, tokens_per_expert
+
+ dataset_count = 4
+ # Pre-generate different input matrices for each iteration to avoid cache effects
+ data_sets = [generate_data(i) for i in range(dataset_count)]
+
# Warmup
- for _ in range(10):
- silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert, group_size=G)
- torch.cuda.synchronize()
+ y, tokens_per_expert = data_sets[0]
+ for _ in range(num_warmups):
+ kernel(
+ y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G
+ )
+ torch.cuda.synchronize()
+
+ start_event = torch.cuda.Event(enable_timing=True)
+ end_event = torch.cuda.Event(enable_timing=True)
# Benchmark
- torch.cuda.synchronize()
- start = time.perf_counter()
+ latencies: list[float] = []
for _ in range(runs):
- silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert, group_size=G)
- torch.cuda.synchronize()
+ torch.cuda.synchronize()
- avg_time = (time.perf_counter() - start) / runs * 1000
+ start_event.record()
+ for i in range(iterations_per_run):
+ y, tokens_per_expert = data_sets[i % dataset_count]
+ kernel(
+ y,
+ tokens_per_expert,
+ num_parallel_tokens=num_parallel_tokens,
+ group_size=G,
+ )
+ end_event.record()
+ end_event.synchronize()
- # Calculate actual work done (only count valid tokens)
+ total_time_ms = start_event.elapsed_time(end_event)
+ per_iter_time_ms = total_time_ms / iterations_per_run
+ latencies.append(per_iter_time_ms)
+
+ # Use median instead of average for better outlier handling
+ median_time_ms = np.median(latencies)
+ median_time_s = median_time_ms / 1000
+
+ # Calculate actual work done (using first dataset for consistency)
+ _, tokens_per_expert = data_sets[0]
actual_tokens = tokens_per_expert.sum().item()
actual_elements = actual_tokens * H
# GFLOPS: operations per element = exp + 3 muls + 1 div + quantization ops ≈ 8 ops
ops_per_element = 8
total_ops = actual_elements * ops_per_element
- gflops = total_ops / (avg_time / 1000) / 1e9
+ gflops = total_ops / median_time_s / 1e9
# Memory bandwidth: bfloat16 inputs (2 bytes), fp8 output (1 byte), scales (4 bytes)
input_bytes = actual_tokens * 2 * H * 2 # 2*H bfloat16 inputs
output_bytes = actual_tokens * H * 1 # H fp8 outputs
scale_bytes = actual_tokens * (H // G) * 4 # scales in float32
total_bytes = input_bytes + output_bytes + scale_bytes
- memory_bw = total_bytes / (avg_time / 1000) / 1e9
+ memory_bw = total_bytes / median_time_s / 1e9
- return avg_time, gflops, memory_bw
+ HOPPER_BANDWIDTH_TBPS = 3.35
+ return (
+ median_time_ms,
+ gflops,
+ memory_bw,
+ (memory_bw / (HOPPER_BANDWIDTH_TBPS * 1024)) * 100,
+ )
+def create_comparison_plot(
+ ratios, silu_v2_times, triton_times, config_labels, strategy_name, id
+):
+ fig, ax = plt.subplots(1, 1, figsize=(18, 6))
+
+ # Configure x-axis positions
+ x = np.arange(len(config_labels))
+ width = 0.25
+
+ # Execution Time plot (lower is better)
+ ax.bar(x, silu_v2_times, width, label="SiLU V2 (CUDA)", alpha=0.8, color="blue")
+ ax.bar(
+ x + width, triton_times, width, label="Triton Kernel", alpha=0.8, color="green"
+ )
+
+    # Add speedup labels over each bar pair
+ for i in range(len(x)):
+ triton_v2_speedup = ratios[i][1] # triton/v2
+ max_height = max(silu_v2_times[i], triton_times[i])
+
+ # Triton/V2 speedup
+ ax.text(
+ x[i] + width / 2,
+ max_height + max_height * 0.02,
+ f"{triton_v2_speedup:.2f}x",
+ ha="center",
+ va="bottom",
+ fontweight="bold",
+ fontsize=8,
+ )
+
+ ax.set_xlabel("Configuration")
+ ax.set_ylabel("% Utilization")
+ ax.set_title(
+ f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)"
+ )
+ ax.set_xticks(x)
+ ax.set_xticklabels(config_labels, rotation=45, ha="right")
+ ax.legend()
+ ax.grid(True, alpha=0.3)
+
+ plt.tight_layout()
+ return fig, ax
+
+
+def create_combined_plot(all_results):
+ num_strategies = len(all_results)
+ fig, axes = plt.subplots(num_strategies, 1, figsize=(22, 7 * num_strategies))
+
+ if num_strategies == 1:
+ axes = [axes]
+
+ for idx, (
+ strategy_name,
+ all_ratios,
+ all_silu_v2_results,
+ all_triton_results,
+ config_labels,
+ config_x_axis,
+ ) in enumerate(all_results):
+ ax = axes[idx]
+
+ # Flatten the nested results to get bandwidth percentages for plotting
+ silu_v2_bandwidths = []
+ triton_bandwidths = []
+ flat_ratios = []
+
+ for config_results in all_silu_v2_results:
+ for result in config_results:
+ silu_v2_bandwidths.append(result[3]) # bandwidth percentage
+
+ for config_results in all_triton_results:
+ for result in config_results:
+ triton_bandwidths.append(result[3]) # bandwidth percentage
+
+ for config_ratios in all_ratios:
+ for ratio in config_ratios:
+ flat_ratios.append(ratio)
+
+ # Configure x-axis positions
+ x = np.arange(len(config_labels))
+ width = 0.25
+
+ # Bandwidth utilization plot (higher is better)
+ ax.bar(
+ x,
+ silu_v2_bandwidths,
+ width,
+ label="SiLU V2 (CUDA)",
+ alpha=0.8,
+ color="blue",
+ )
+ ax.bar(
+ x + width,
+ triton_bandwidths,
+ width,
+ label="Triton Kernel",
+ alpha=0.8,
+ color="green",
+ )
+
+        # Add speedup labels over each bar pair
+ for i in range(len(x)):
+ triton_v2_speedup = flat_ratios[i] # triton/v2
+ max_height = max(silu_v2_bandwidths[i], triton_bandwidths[i])
+
+ # Triton/V2 speedup
+ ax.text(
+ x[i] + width / 2,
+ max_height + max_height * 0.02,
+ f"{triton_v2_speedup:.2f}x",
+ ha="center",
+ va="bottom",
+ fontweight="bold",
+ fontsize=8,
+ )
+
+ ax.set_xlabel("Configuration")
+ ax.set_ylabel("% Utilization")
+ ax.set_title(
+ f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)"
+ )
+ ax.set_xticks(x)
+ ax.set_xticklabels(config_labels, rotation=45, ha="right")
+ ax.legend()
+ ax.grid(True, alpha=0.3)
+
+ plt.tight_layout()
+ filename = "silu_benchmark_combined_3way.png"
+ plt.savefig(filename, dpi=300, bbox_inches="tight")
+ plt.show()
+
+ return filename
+
+
+outer_dim = 7168
configs = [
- (8, 32, 1024),
- (16, 64, 2048),
- (32, 128, 4096),
# DeepSeekV3 Configs
- (256, 16, 7168),
- (256, 32, 7168),
- (256, 64, 7168),
- (256, 128, 7168),
- (256, 256, 7168),
- (256, 512, 7168),
+ # (1, 56, 7168),
+ (8, 1024, 7168),
+    # (32, 56, 7168),
+    (32, 1024, 7168),
(256, 1024, 7168),
]
-print(f"GPU: {torch.cuda.get_device_name()}")
-print(f"{'Config':<20} {'Time(ms)':<10} {'GFLOPS':<10} {'GB/s':<10}")
-print("-" * 50)
+runs = 100
+num_warmups = 20
-for E, T, H in configs:
- try:
- time_ms, gflops, gbps = benchmark(E, T, H)
- print(f"E={E:3d},T={T:4d},H={H:4d} {time_ms:8.3f} {gflops:8.1f} {gbps:8.1f}")
- except Exception:
- print(f"E={E:3d},T={T:4d},H={H:4d} FAILED")
+strategy_descriptions = {
+ "uniform": "Uniform Random",
+ "random_imbalanced": "Imbalanced Random",
+ "max_t": "Even Assignment",
+ "first_t": "experts[0] = T, experts[1:] = 0",
+}
+
+print(f"GPU: {torch.cuda.get_device_name()}")
+print(f"Testing strategies: {', '.join(strategies)}")
+print(f"Configurations: {len(configs)} configs")
+
+all_results = []
+
+# Run benchmarks for each strategy
+for id, strategy in enumerate(strategies):
+ print(f"\n{'=' * 60}")
+ print(f"Testing strategy: {strategy_descriptions[strategy]}")
+ print(f"{'=' * 60}")
+
+ # Collect benchmark data for all three algorithms
+ config_labels = []
+ config_x_axis = []
+ all_silu_v2_results = []
+ all_triton_results = []
+ all_ratios = []
+
+ for E, T, H in configs:
+ total_tokens_config = []
+ for i in [8, 16, 32, 64, 128, 256, 512]:
+ if i <= T:
+ total_tokens_config.append(i * E)
+ config_x_axis.append(total_tokens_config)
+
+ silu_v2_results = []
+ triton_results = []
+ ratios = []
+
+ for total_tokens in total_tokens_config:
+ config_label = f"E={E},T={T},H={H},TT={total_tokens}"
+ config_labels.append(config_label)
+
+ # SiLU V2 (CUDA kernel) results
+ time_ms_silu_v2, gflops, gbps, perc = benchmark(
+ persistent_masked_m_silu_mul_quant,
+ E,
+ T,
+ H,
+ total_tokens,
+ runs=runs,
+ num_warmups=num_warmups,
+ gen_strategy=strategy,
+ )
+ silu_v2_results.append((time_ms_silu_v2, gflops, gbps, perc))
+
+ # Triton kernel results
+ time_ms_triton, gflops, gbps, perc = benchmark(
+ silu_mul_fp8_quant_deep_gemm_triton,
+ E,
+ T,
+ H,
+ total_tokens,
+ runs=runs,
+ num_warmups=num_warmups,
+ gen_strategy=strategy,
+ )
+ triton_results.append((time_ms_triton, gflops, gbps, perc))
+
+ # Calculate speedup ratios (triton baseline / implementation)
+ triton_v2_ratio = time_ms_triton / time_ms_silu_v2
+ ratios.append(triton_v2_ratio)
+
+ print(
+ f"Completed: {config_label}:"
+ f" V2: {time_ms_silu_v2:.3f}ms,"
+ f" Triton: {time_ms_triton:.3f}ms"
+ )
+
+ all_silu_v2_results.append(silu_v2_results)
+ all_triton_results.append(triton_results)
+ all_ratios.append(ratios)
+
+ # Store results for combined plotting
+ all_results.append(
+ (
+ strategy_descriptions[strategy],
+ all_ratios,
+ all_silu_v2_results,
+ all_triton_results,
+ config_labels,
+ config_x_axis,
+ )
+ )
+
+ # Print summary table for this strategy
+ print(f"\nSummary Table - {strategy_descriptions[strategy]}:")
+ print(f" {'V2 Time(ms)':<12} {'Triton Time(ms)':<14} {'Triton/V2':<10}")
+ print("-" * 90)
+
+ for i, (E, T, H) in enumerate(configs):
+        # Use the first total-token result of each config for the summary
+        v2_time = all_silu_v2_results[i][0][0]
+        triton_time = all_triton_results[i][0][0]
+ triton_v2_speedup = triton_time / v2_time
+ config_label = f"E={E:3d},T={T:4d},H={H:4d}"
+ print(
+ f"{config_label:<20} {v2_time:8.5f} {triton_time:10.5f} "
+ f"{triton_v2_speedup:8.2f}x"
+ )
+
+
+def create_total_tokens_plot(all_results):
+ num_strategies = len(all_results)
+ num_configs = len(configs)
+
+ fig, axs = plt.subplots(
+ num_strategies, num_configs * 2, figsize=(32, 8 * num_strategies)
+ )
+
+ # Add main title to the entire figure
+ fig.suptitle(
+ "Performance Analysis: Speedup vs Bandwidth Utilization (SiLU V2, and Triton)",
+ fontsize=18,
+ fontweight="bold",
+ y=0.98,
+ )
+
+ # Handle single strategy case
+ if num_strategies == 1:
+ axs = axs.reshape(1, -1)
+
+ # Handle single config case
+ if num_configs == 1:
+ axs = axs.reshape(-1, 2)
+
+ for strategy_idx, result in enumerate(all_results):
+ (
+ strategy_name,
+ all_ratios,
+ all_silu_v2_results,
+ all_triton_results,
+ config_labels,
+ config_x_axis,
+ ) = result
+
+ for config_idx in range(num_configs):
+ # Speedup plot (left column)
+ ax_speedup = axs[strategy_idx, config_idx * 2]
+ # Bandwidth plot (right column)
+ ax_bandwidth = axs[strategy_idx, config_idx * 2 + 1]
+
+ E, T, H = configs[config_idx]
+ ratios = all_ratios[config_idx]
+ total_tokens_values = config_x_axis[config_idx]
+
+ # Extract speedup ratios
+ triton_v2_ratios = [ratio for ratio in ratios]
+
+ # Extract bandwidth percentages for all implementations
+ v2_bandwidth_percentages = [
+ result[3] for result in all_silu_v2_results[config_idx]
+ ]
+ triton_bandwidth_percentages = [
+ result[3] for result in all_triton_results[config_idx]
+ ]
+
+ # Plot speedup ratios vs total tokens (left plot)
+ ax_speedup.plot(
+ total_tokens_values,
+ triton_v2_ratios,
+ "go-",
+ linewidth=3,
+ markersize=8,
+ label="Triton/V2 Speedup",
+ )
+ ax_speedup.set_title(
+ f"{strategy_name}\nSpeedup vs Baseline (Triton)\nE={E}, T={T}, H={H}",
+ fontsize=12,
+ fontweight="bold",
+ )
+ ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
+ ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11)
+ ax_speedup.legend(prop={"weight": "bold"})
+ ax_speedup.grid(True, alpha=0.3)
+
+ # Plot bandwidth utilization (right plot)
+ ax_bandwidth.plot(
+ total_tokens_values,
+ v2_bandwidth_percentages,
+ "o-",
+ linewidth=3,
+ markersize=8,
+ label="SiLU V2",
+ color="blue",
+ )
+ ax_bandwidth.plot(
+ total_tokens_values,
+ triton_bandwidth_percentages,
+ "o-",
+ linewidth=3,
+ markersize=8,
+ label="Triton",
+ color="green",
+ )
+ ax_bandwidth.set_title(
+ f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}",
+ fontsize=12,
+ fontweight="bold",
+ )
+ ax_bandwidth.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
+ ax_bandwidth.set_ylabel(
+ "% of Peak Bandwidth", fontweight="bold", fontsize=11
+ )
+ ax_bandwidth.legend(prop={"weight": "bold"})
+ ax_bandwidth.grid(True, alpha=0.3)
+
+ # Format x-axis labels for both plots
+ for ax in [ax_speedup, ax_bandwidth]:
+ ax.set_xticks(total_tokens_values)
+ ax.set_xticklabels(
+ [
+ f"{tt // 1000}K" if tt >= 1000 else str(tt)
+ for tt in total_tokens_values
+ ],
+ fontweight="bold",
+ )
+ # Make tick labels bold
+ for label in ax.get_xticklabels() + ax.get_yticklabels():
+ label.set_fontweight("bold")
+
+ # Add value labels on Triton/V2 speedup points
+ for x, y in zip(total_tokens_values, triton_v2_ratios):
+ ax_speedup.annotate(
+ f"{y:.2f}x",
+ (x, y),
+ textcoords="offset points",
+ xytext=(0, -15),
+ ha="center",
+ fontsize=9,
+ fontweight="bold",
+ bbox=dict(boxstyle="round,pad=0.2", facecolor="green", alpha=0.3),
+ )
+
+ plt.tight_layout()
+ plt.subplots_adjust(top=0.93) # Make room for main title
+ filename = "silu_benchmark_total_tokens_3way.png"
+ plt.savefig(filename, dpi=300, bbox_inches="tight")
+ plt.show()
+
+ return filename
+
+
+# Create comprehensive comparison plots
+combined_plot_filename = create_combined_plot(all_results)
+total_tokens_plot_filename = create_total_tokens_plot(all_results)
+
+print(f"\n{'=' * 80}")
+print("3-Way Benchmark Suite Complete!")
+print(f"Generated combined comparison plot: {combined_plot_filename}")
+print(f"Generated total tokens analysis plot: {total_tokens_plot_filename}")
+print("Compared: SiLU V2 (CUDA), and Triton implementations")
+print(f"{'=' * 80}")
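
Both kernels benchmarked above implement the same math: silu(gate) * up, followed by one fp8 scale per group of group_size elements along the hidden dimension. A plain eager-mode reference, ignoring the per-expert token counts and masking that the real kernels handle, looks like this sketch (requires a PyTorch build with float8_e4m3fn support):

import torch

def silu_mul_group_quant_ref(
    y: torch.Tensor, group_size: int = 128, eps: float = 1e-10
) -> tuple[torch.Tensor, torch.Tensor]:
    H = y.shape[-1] // 2
    gate, up = y[..., :H].float(), y[..., H:].float()
    act = torch.nn.functional.silu(gate) * up
    fp8 = torch.float8_e4m3fn
    fmax = torch.finfo(fp8).max
    # One scale per group of group_size elements along the hidden dim.
    groups = act.unflatten(-1, (H // group_size, group_size))
    scales = groups.abs().amax(dim=-1, keepdim=True).clamp_min(eps) / fmax
    q = (groups / scales).clamp(-fmax, fmax).to(fp8).flatten(-2)
    return q, scales.squeeze(-1)

y = torch.randn(2, 4, 2 * 256, dtype=torch.bfloat16)  # (E, T, 2*H) with H=256
y_q, y_s = silu_mul_group_quant_ref(y)
print(y_q.shape, y_s.shape)  # torch.Size([2, 4, 256]) torch.Size([2, 4, 2])
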
diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
index 603ce5ecf0d2c..6ddab46214577 100644
--- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
@@ -259,6 +259,7 @@ if __name__ == "__main__":
# (q_quant_dtype, kv_quant_dtype, o_quant_dtype)
(None, None, None),
(None, FP8_DTYPE, None),
+ (FP8_DTYPE, FP8_DTYPE, None),
(FP8_DTYPE, FP8_DTYPE, FP8_DTYPE),
(FP8_DTYPE, FP8_DTYPE, FP4_DTYPE),
]
diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
index 40903c6c3444f..131df74c7de1b 100644
--- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
@@ -274,6 +274,7 @@ if __name__ == "__main__":
quant_dtypes = [
# (q_quant_dtype, kv_quant_dtype, o_quant_dtype)
(None, None, None),
+ (FP8_DTYPE, FP8_DTYPE, None),
(FP8_DTYPE, FP8_DTYPE, FP8_DTYPE),
(FP8_DTYPE, FP8_DTYPE, FP4_DTYPE),
]
diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
index 4fcdbadd65ecd..602fad1810748 100644
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@@ -11,13 +11,13 @@ from datetime import datetime
from typing import Any
import torch
-import tqdm
-import triton
+from tqdm import tqdm
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
- _w8a8_block_fp8_matmul,
+ _w8a8_triton_block_scaled_mm,
)
from vllm.platforms import current_platform
+from vllm.triton_utils import triton
from vllm.utils import FlexibleArgumentParser
mp.set_start_method("spawn", force=True)
@@ -56,7 +56,7 @@ def w8a8_block_matmul(
Bs: The per-block quantization scale for `B`.
block_size: The block size for per-block quantization.
It should be 2-dim, e.g., [128, 128].
- output_dytpe: The dtype of the returned tensor.
+ output_dtype: The dtype of the returned tensor.
Returns:
torch.Tensor: The result of matmul.
@@ -83,7 +83,7 @@ def w8a8_block_matmul(
)
if A.dtype == torch.float8_e4m3fn:
- kernel = _w8a8_block_fp8_matmul
+ kernel = _w8a8_triton_block_scaled_mm
else:
raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.")
@@ -141,6 +141,7 @@ def get_weight_shapes(tp_size):
# cannot TP
total = [
(512 + 64, 7168),
+ (2112, 7168),
((128 + 64) * 128, 7168),
(128 * (128 + 128), 512),
(7168, 16384),
diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
index b99c2099f2c38..ba31bc5638298 100644
--- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
+++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
@@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# fmt: off
# ruff: noqa: E501
import time
@@ -8,27 +7,33 @@ import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
- get_col_major_tma_aligned_tensor,
per_token_group_quant_fp8,
- w8a8_block_fp8_matmul,
+ w8a8_triton_block_scaled_mm,
)
from vllm.triton_utils import triton
-from vllm.utils.deep_gemm import calc_diff, fp8_gemm_nt, per_block_cast_to_fp8
+from vllm.utils.deep_gemm import (
+ calc_diff,
+ fp8_gemm_nt,
+ get_col_major_tma_aligned_tensor,
+ per_block_cast_to_fp8,
+)
-def benchmark_shape(m: int,
- n: int,
- k: int,
- warmup: int = 100,
- repeat: int = 10000,
- verbose: bool = False) -> dict:
+def benchmark_shape(
+ m: int,
+ n: int,
+ k: int,
+ warmup: int = 100,
+ repeat: int = 10000,
+ verbose: bool = False,
+) -> dict:
"""Benchmark all implementations for a specific (m, n, k) shape."""
if verbose:
print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===")
# Create test tensors
- A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16)
- B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16)
+ A = torch.randn((m, k), device="cuda", dtype=torch.bfloat16)
+ B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
# Reference result in BF16
torch.cuda.synchronize()
@@ -45,34 +50,39 @@ def benchmark_shape(m: int,
# Pre-quantize A for all implementations
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
- C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
+ C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
- A, block_size[1], column_major_scales=True)
+ A, block_size[1], column_major_scales=True
+ )
# === DeepGEMM Implementation ===
def deepgemm_gemm():
- fp8_gemm_nt((A_deepgemm, A_scale_deepgemm),
- (B_deepgemm, B_scale_deepgemm),
- C_deepgemm)
+ fp8_gemm_nt(
+ (A_deepgemm, A_scale_deepgemm), (B_deepgemm, B_scale_deepgemm), C_deepgemm
+ )
return C_deepgemm
# === vLLM Triton Implementation ===
def vllm_triton_gemm():
- return w8a8_block_fp8_matmul(A_vllm,
- B_vllm,
- A_scale_vllm,
- B_scale_vllm,
- block_size,
- output_dtype=torch.bfloat16)
+ return w8a8_triton_block_scaled_mm(
+ A_vllm,
+ B_vllm,
+ A_scale_vllm,
+ B_scale_vllm,
+ block_size,
+ output_dtype=torch.bfloat16,
+ )
# === vLLM CUTLASS Implementation ===
def vllm_cutlass_gemm():
- return ops.cutlass_scaled_mm(A_vllm_cutlass,
- B_vllm.T,
- scale_a=A_scale_vllm_cutlass,
- scale_b=B_scale_vllm.T,
- out_dtype=torch.bfloat16)
+ return ops.cutlass_scaled_mm(
+ A_vllm_cutlass,
+ B_vllm.T,
+ scale_a=A_scale_vllm_cutlass,
+ scale_b=B_scale_vllm.T,
+ out_dtype=torch.bfloat16,
+ )
# Run correctness check first
if verbose:
@@ -89,26 +99,23 @@ def benchmark_shape(m: int,
print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}")
print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}")
print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}")
- print("vLLM Triton vs DeepGEMM difference: "
- f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}")
- print("vLLM CUTLASS vs DeepGEMM difference: "
- f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}")
+ print(
+ "vLLM Triton vs DeepGEMM difference: "
+ f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}"
+ )
+ print(
+ "vLLM CUTLASS vs DeepGEMM difference: "
+ f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}"
+ )
# Benchmark implementations
implementations = {
"DeepGEMM": deepgemm_gemm,
"vLLM Triton": vllm_triton_gemm,
- "vLLM CUTLASS": vllm_cutlass_gemm
+ "vLLM CUTLASS": vllm_cutlass_gemm,
}
- benchmark_results = {
- "shape": {
- "m": m,
- "n": n,
- "k": k
- },
- "implementations": {}
- }
+ benchmark_results = {"shape": {"m": m, "n": n, "k": k}, "implementations": {}}
for name, func in implementations.items():
# Warmup
@@ -136,38 +143,36 @@ def benchmark_shape(m: int,
"tflops": tflops,
"gb_s": gb_s,
"diff": {
- "DeepGEMM":
- 0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm),
- "Reference":
- deepgemm_diff if name == "DeepGEMM" else
- (vllm_triton_diff
- if name == "vLLM Triton" else vllm_cutlass_diff)
- }
+ "DeepGEMM": 0.0
+ if name == "DeepGEMM"
+ else calc_diff(func(), C_deepgemm),
+ "Reference": deepgemm_diff
+ if name == "DeepGEMM"
+ else (vllm_triton_diff if name == "vLLM Triton" else vllm_cutlass_diff),
+ },
}
if verbose:
- print(
- f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s"
- )
+ print(f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s")
# Calculate speedups
baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"]
for name, data in benchmark_results["implementations"].items():
if name != "DeepGEMM":
speedup = baseline / data["time_ms"]
- benchmark_results["implementations"][name][
- "speedup_vs_deepgemm"] = speedup
+ benchmark_results["implementations"][name]["speedup_vs_deepgemm"] = speedup
if verbose:
- print(f"DeepGEMM is {1/speedup:.2f}x "
- f"{'faster' if 1/speedup > 1 else 'slower'} than {name}")
+ print(
+ f"DeepGEMM is {1 / speedup:.2f}x "
+ f"{'faster' if 1 / speedup > 1 else 'slower'} than {name}"
+ )
- vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][
- "time_ms"]
- vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][
- "time_ms"]
+ vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"]["time_ms"]
+ vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"]["time_ms"]
cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time
- benchmark_results["implementations"]["vLLM CUTLASS"][
- "speedup_vs_triton"] = cutlass_vs_triton
+ benchmark_results["implementations"]["vLLM CUTLASS"]["speedup_vs_triton"] = (
+ cutlass_vs_triton
+ )
if verbose:
print(
f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x "
@@ -179,8 +184,7 @@ def benchmark_shape(m: int,
def format_table_row(values, widths):
"""Format a row with specified column widths."""
- return "| " + " | ".join(f"{val:{w}}"
- for val, w in zip(values, widths)) + " |"
+ return "| " + " | ".join(f"{val:{w}}" for val, w in zip(values, widths)) + " |"
def print_table(headers, rows, title=None):
@@ -288,38 +292,50 @@ def run_benchmarks(verbose: bool = False):
for result in all_results:
shape = result["shape"]
impl_data = result["implementations"]["DeepGEMM"]
- deepgemm_rows.append([
- shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
- f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}"
- ])
+ deepgemm_rows.append(
+ [
+ shape["m"],
+ shape["n"],
+ shape["k"],
+ f"{impl_data['time_us']:.1f}",
+ f"{impl_data['tflops']:.1f}",
+ f"{impl_data['gb_s']:.1f}",
+ ]
+ )
- print_table(deepgemm_headers,
- deepgemm_rows,
- title="DeepGEMM Implementation:")
+ print_table(deepgemm_headers, deepgemm_rows, title="DeepGEMM Implementation:")
# Print vLLM Triton table
- triton_headers = [
- "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"
- ]
+ triton_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"]
triton_rows = []
for result in all_results:
shape = result["shape"]
impl_data = result["implementations"]["vLLM Triton"]
speedup = impl_data.get("speedup_vs_deepgemm", 1.0)
- triton_rows.append([
- shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
- f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
- format_speedup(speedup)
- ])
+ triton_rows.append(
+ [
+ shape["m"],
+ shape["n"],
+ shape["k"],
+ f"{impl_data['time_us']:.1f}",
+ f"{impl_data['tflops']:.1f}",
+ f"{impl_data['gb_s']:.1f}",
+ format_speedup(speedup),
+ ]
+ )
- print_table(triton_headers,
- triton_rows,
- title="vLLM Triton Implementation:")
+ print_table(triton_headers, triton_rows, title="vLLM Triton Implementation:")
# Print vLLM CUTLASS table
cutlass_headers = [
- "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM",
- "vs Triton"
+ "m",
+ "n",
+ "k",
+ "Time (μs)",
+ "TFLOPS",
+ "GB/s",
+ "vs DeepGEMM",
+ "vs Triton",
]
cutlass_rows = []
for result in all_results:
@@ -327,28 +343,27 @@ def run_benchmarks(verbose: bool = False):
impl_data = result["implementations"]["vLLM CUTLASS"]
vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0)
vs_triton = impl_data.get("speedup_vs_triton", 1.0)
- cutlass_rows.append([
- shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
- f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
- format_speedup(vs_deepgemm),
- format_speedup(vs_triton)
- ])
+ cutlass_rows.append(
+ [
+ shape["m"],
+ shape["n"],
+ shape["k"],
+ f"{impl_data['time_us']:.1f}",
+ f"{impl_data['tflops']:.1f}",
+ f"{impl_data['gb_s']:.1f}",
+ format_speedup(vs_deepgemm),
+ format_speedup(vs_triton),
+ ]
+ )
- print_table(cutlass_headers,
- cutlass_rows,
- title="vLLM CUTLASS Implementation:")
+ print_table(cutlass_headers, cutlass_rows, title="vLLM CUTLASS Implementation:")
# Calculate and print averages
print("\n===== AVERAGE PERFORMANCE =====")
implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"]
avg_metrics = {
- impl: {
- "tflops": 0,
- "gb_s": 0,
- "time_ms": 0
- }
- for impl in implementations
+ impl: {"tflops": 0, "gb_s": 0, "time_ms": 0} for impl in implementations
}
for result in all_results:
@@ -366,9 +381,9 @@ def run_benchmarks(verbose: bool = False):
avg_tflops = avg_metrics[impl]["tflops"] / num_shapes
avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes
avg_time = avg_metrics[impl]["time_ms"] / num_shapes
- avg_rows.append([
- impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"
- ])
+ avg_rows.append(
+ [impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"]
+ )
print_table(avg_headers, avg_rows)
@@ -376,21 +391,19 @@ def run_benchmarks(verbose: bool = False):
avg_speedups = {
"DeepGEMM vs vLLM Triton": 0,
"DeepGEMM vs vLLM CUTLASS": 0,
- "vLLM CUTLASS vs vLLM Triton": 0
+ "vLLM CUTLASS vs vLLM Triton": 0,
}
for result in all_results:
deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"]
vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"]
- vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][
- "time_ms"]
+ vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"]["time_ms"]
- avg_speedups[
- "DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
- avg_speedups[
- "DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
- avg_speedups[
- "vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time
+ avg_speedups["DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
+ avg_speedups["DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
+ avg_speedups["vLLM CUTLASS vs vLLM Triton"] += (
+ vllm_triton_time / vllm_cutlass_time
+ )
print("\n===== AVERAGE SPEEDUPS =====")
speedup_headers = ["Comparison", "Speedup"]
@@ -408,8 +421,7 @@ def run_benchmarks(verbose: bool = False):
for result in all_results:
for impl in implementations:
- avg_diff[impl] += result["implementations"][impl]["diff"][
- "Reference"]
+ avg_diff[impl] += result["implementations"][impl]["diff"]["Reference"]
diff_headers = ["Implementation", "Avg Diff vs Reference"]
diff_rows = []
diff --git a/benchmarks/kernels/weight_shapes.py b/benchmarks/kernels/weight_shapes.py
index a27f02394afbd..9a057990bda5f 100644
--- a/benchmarks/kernels/weight_shapes.py
+++ b/benchmarks/kernels/weight_shapes.py
@@ -95,4 +95,10 @@ WEIGHT_SHAPES = {
([2048, 2816], 1),
([1408, 2048], 0),
],
+ "CohereLabs/c4ai-command-a-03-2025": [
+ ([12288, 14336], 1),
+ ([12288, 12288], 0),
+ ([12288, 73728], 1),
+ ([36864, 12288], 0),
+ ],
}
diff --git a/benchmarks/multi_turn/README.md b/benchmarks/multi_turn/README.md
index 7adf97bcf5622..f5b5c6c97d484 100644
--- a/benchmarks/multi_turn/README.md
+++ b/benchmarks/multi_turn/README.md
@@ -55,6 +55,107 @@ output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75
----------------------------------------------------------------------------------------------------
```
+### JSON configuration file for synthetic conversation generation
+
+The `--input-file` flag determines the input conversations for the benchmark.
+When the input is a JSON file with the field `"filetype": "generate_conversations"`, the tool generates synthetic multi-turn conversations (user questions and assistant answers).
+
+The file `generate_multi_turn.json` is an example of such a file.
+
+The file must contain the sections `prompt_input` and `prompt_output`; a minimal sketch is shown after the field descriptions below.
+
+The `prompt_input` section must contain `num_turns`, `prefix_num_tokens` and `num_tokens`:
+
+* `num_turns` - Total number of turns in the conversation (user and assistant combined).
+The final value is always rounded to an even number so that each user turn has a reply.
+* `prefix_num_tokens` - Tokens added at the start of the **first user turn** only (unique per conversation).
+* `num_tokens` - Total token length of each **user** message (one turn).
+
+The `prompt_output` section must contain `num_tokens`:
+
+* `num_tokens` - Total token length of each **assistant** message (one turn).
+
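+As a minimal sketch, such a file can be written from Python with the standard `json` module. Only the documented fields are shown and all values are illustrative; the shipped `generate_multi_turn.json` remains the authoritative example. Every numeric field takes a distribution object, described in the next section:
+
+```python
+# Minimal sketch of a "generate_conversations" input file.
+# Only the fields documented above are shown; values are illustrative.
+import json
+
+config = {
+    "filetype": "generate_conversations",
+    "prompt_input": {
+        "num_turns": {"distribution": "uniform", "min": 2, "max": 8},
+        "prefix_num_tokens": {"distribution": "lognormal", "average": 1000, "max": 5000},
+        "num_tokens": {"distribution": "uniform", "min": 100, "max": 200},
+    },
+    "prompt_output": {
+        "num_tokens": {"distribution": "constant", "value": 150},
+    },
+}
+
+with open("my_generate_conversations.json", "w") as f:
+    json.dump(config, f, indent=4)
+```
+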
+### Random distributions for synthetic conversation generation
+
+When creating an input JSON file (such as `generate_multi_turn.json`),
+every numeric field (such as `num_turns` or `num_tokens`) requires a distribution.
+The distribution determines how to randomly sample values for the field.
+
+The available distributions are listed below.
+
+**Note:** The optional `max` field (for lognormal, zipf, and poisson) can be used to cap sampled values at an upper bound.
+This can be used to make sure that the total number of tokens in every request does not exceed `--max-model-len`.
+
+#### constant
+
+```json
+{
+ "distribution": "constant",
+ "value": 500
+}
+```
+
+* `value` - the fixed integer value (always returns the same number).
+
+#### uniform
+
+```json
+{
+ "distribution": "uniform",
+ "min": 12,
+ "max": 18
+}
+```
+
+* `min` - minimum value (inclusive).
+* `max` - maximum value (inclusive); must be greater than or equal to `min`.
+
+#### lognormal
+
+```json
+{
+ "distribution": "lognormal",
+ "average": 1000,
+ "max": 5000
+}
+```
+
+You can parameterize the lognormal distribution in one of two ways:
+
+Using the average and optional median ratio:
+
+* `average` - target average value of the distribution.
+* `median_ratio` - the ratio of the median to the average; controls the skewness. Must be in the range (0, 1).
+
+Using the parameters of the underlying normal distribution:
+
+* `mean` - mean of the underlying normal distribution.
+* `sigma` - standard deviation of the underlying normal distribution.
+
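+To make the first parameterization concrete, here is a short sketch (mirroring the derivation used by the generator, but not its exact code) of how `average` and `median_ratio` map to the `mean`/`sigma` of the underlying normal distribution, using mean(X) = exp(mu + sigma^2 / 2) and median(X) = exp(mu):
+
+```python
+# Sketch of the average/median_ratio parameterization described above.
+import numpy as np
+
+average, median_ratio = 1000, 0.85             # target mean and median/mean ratio
+median = average * median_ratio
+sigma = np.sqrt(2 * np.log(average / median))  # sigma^2 = 2 * ln(mean / median)
+mu = np.log(median)                            # median(X) = exp(mu)
+
+samples = np.random.lognormal(mean=mu, sigma=sigma, size=100_000)
+print(round(samples.mean()))                   # ~1000, up to sampling noise
+```
+
+When `mean` and `sigma` are given directly they are passed to `numpy.random.lognormal` unchanged; with `average`, the tool additionally rescales the samples so that their empirical mean matches the target.
+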
+#### zipf
+
+```json
+{
+ "distribution": "zipf",
+ "alpha": 1.2,
+ "max": 100
+}
+```
+
+* `alpha` - skew parameter (> 1). Larger values produce stronger skew toward smaller integers.
+
+#### poisson
+
+```json
+{
+ "distribution": "poisson",
+ "alpha": 10,
+ "max": 50
+}
+```
+
+* `alpha` - expected value (λ). Also the variance of the distribution.
+
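+To tie the options above together, the sketch below illustrates the sampling semantics of these configurations using the standard NumPy samplers (it is not the benchmark's implementation); the lognormal case follows the sketch in the previous subsection.
+
+```python
+# Illustration only: map a distribution config to a NumPy sampler and
+# apply the optional "max" cap. Values are rounded to integers.
+import numpy as np
+
+def sample(conf: dict, size: int = 1) -> np.ndarray:
+    kind = conf["distribution"]
+    if kind == "constant":
+        out = np.full(size, conf["value"], dtype=float)
+    elif kind == "uniform":
+        out = np.random.randint(conf["min"], conf["max"] + 1, size=size).astype(float)
+    elif kind == "zipf":
+        out = np.random.zipf(conf["alpha"], size=size).astype(float)
+    elif kind == "poisson":
+        out = np.random.poisson(conf["alpha"], size=size).astype(float)
+    else:
+        raise ValueError(f"unsupported distribution: {kind}")
+    if "max" in conf:
+        out = np.minimum(out, conf["max"])
+    return np.round(out).astype(int)
+
+print(sample({"distribution": "poisson", "alpha": 10, "max": 50}, size=5))
+```
+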
## ShareGPT Conversations
To run with the ShareGPT data, download the following ShareGPT dataset:
diff --git a/benchmarks/multi_turn/bench_dataset.py b/benchmarks/multi_turn/bench_dataset.py
index 411b89dd23dc6..67b937930d58c 100644
--- a/benchmarks/multi_turn/bench_dataset.py
+++ b/benchmarks/multi_turn/bench_dataset.py
@@ -99,21 +99,105 @@ class PoissonDistribution(Distribution):
class LognormalDistribution(Distribution):
def __init__(
- self, mean: float, sigma: float, max_val: Optional[int] = None
+ self,
+ mean: Optional[float] = None,
+ sigma: Optional[float] = None,
+ average: Optional[int] = None,
+ median_ratio: Optional[float] = None,
+ max_val: Optional[int] = None,
) -> None:
+ self.average = average
+ self.median_ratio = median_ratio
+ self.max_val = max_val
+
+ if average is not None:
+ if average < 1:
+ raise ValueError("Lognormal average must be positive")
+
+ if mean or sigma:
+ raise ValueError(
+ "When using lognormal average, you can't provide mean/sigma"
+ )
+
+ if self.median_ratio is None:
+ # Default value that provides a relatively wide range of values
+ self.median_ratio = 0.85
+
+ # Calculate mean/sigma of np.random.lognormal based on the average
+ mean, sigma = self._generate_lognormal_by_median(
+ target_average=self.average, median_ratio=self.median_ratio
+ )
+ else:
+ if mean is None or sigma is None:
+ raise ValueError(
+ "Must provide both mean and sigma if average is not used"
+ )
+
+ if mean <= 0 or sigma < 0:
+ raise ValueError(
+ "Lognormal mean must be positive and sigma must be non-negative"
+ )
+
+ # Mean and standard deviation of the underlying normal distribution
+ # Based on numpy.random.lognormal
self.mean = mean
self.sigma = sigma
- self.max_val = max_val
+
+ @staticmethod
+ def _generate_lognormal_by_median(
+ target_average: int, median_ratio: float
+ ) -> tuple[float, float]:
+ """
+ Compute (mu, sigma) for a lognormal distribution given:
+ - a target average (mean of the distribution)
+ - a ratio of median / mean (controls skewness); assumes mean > median
+
+ Background:
+ If Z ~ Normal(mu, sigma^2), then X = exp(Z) ~ LogNormal(mu, sigma).
+ * mean(X) = exp(mu + sigma^2 / 2)
+ * median(X) = exp(mu)
+
+ So:
+ median / mean = exp(mu) / exp(mu + sigma^2 / 2)
+ = exp(-sigma^2 / 2)
+
+ Rearranging:
+ sigma^2 = 2 * ln(mean / median)
+ mu = ln(median)
+
+ This gives a unique (mu, sigma) for any valid mean and median.
+ """
+ # Check input validity: median must be smaller than mean
+ if median_ratio <= 0 or median_ratio >= 1:
+ raise ValueError("median_ratio must be in range (0, 1)")
+
+ target_median = target_average * median_ratio
+
+ # Solve sigma^2 = 2 * ln(mean / median)
+ sigma = np.sqrt(2 * np.log(target_average / target_median))
+ mu = np.log(target_median)
+
+ return mu, sigma
def sample(self, size: int = 1) -> np.ndarray:
samples = np.random.lognormal(mean=self.mean, sigma=self.sigma, size=size)
+
+ if self.average is not None:
+ # Scale to average
+ samples *= self.average / samples.mean()
+
if self.max_val:
samples = np.minimum(samples, self.max_val)
return np.round(samples).astype(int)
def __repr__(self) -> str:
- return f"LognormalDistribution[{self.mean}, {self.sigma}]"
+ if self.average:
+ return (
+ f"LognormalDistribution[{self.average}, "
+ f"{self.median_ratio}, {self.max_val}]"
+ )
+ return f"LognormalDistribution[{self.mean}, {self.sigma}, {self.max_val}]"
class GenConvArgs(NamedTuple):
@@ -173,10 +257,21 @@ def get_random_distribution(
return PoissonDistribution(conf["alpha"], max_val=max_val)
elif distribution == "lognormal":
+ max_val = conf.get("max", None)
+
+ if "average" in conf:
+ # Infer lognormal mean/sigma (numpy) from input average
+ median_ratio = conf.get("median_ratio", None)
+ return LognormalDistribution(
+ average=conf["average"], median_ratio=median_ratio, max_val=max_val
+ )
+
+ # Use mean/sigma directly (for full control over the distribution)
verify_field_exists(conf, "mean", section, subsection)
verify_field_exists(conf, "sigma", section, subsection)
- max_val = conf.get("max", None)
- return LognormalDistribution(conf["mean"], conf["sigma"], max_val=max_val)
+ return LognormalDistribution(
+ mean=conf["mean"], sigma=conf["sigma"], max_val=max_val
+ )
elif distribution == "uniform":
verify_field_exists(conf, "min", section, subsection)
diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
index d23b7b6e4571d..233ed460fc8d5 100644
--- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py
+++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
@@ -13,7 +13,7 @@ from datetime import datetime
from enum import Enum
from http import HTTPStatus
from statistics import mean
-from typing import NamedTuple, Optional, Union
+from typing import NamedTuple, Union
import aiohttp # type: ignore
import numpy as np # type: ignore
@@ -46,9 +46,9 @@ class ConversationSampling(str, Enum):
class ClientArgs(NamedTuple):
seed: int
- max_num_requests: Optional[int]
+ max_num_requests: int | None
skip_first_turn: bool
- max_turns: Optional[int]
+ max_turns: int | None
max_active_conversations: int
verbose: bool
print_content: bool
@@ -109,9 +109,9 @@ class RequestStats(NamedTuple):
class MetricStats:
def __init__(self) -> None:
- self.min: Optional[float] = None
- self.max: Optional[float] = None
- self.avg: Optional[float] = None
+ self.min: float | None = None
+ self.max: float | None = None
+ self.avg: float | None = None
self.sum = 0.0
self.count = 0
@@ -143,7 +143,7 @@ class MovingAverage:
self.index = 0
self.sum = 0.0
self.count = 0
- self.avg: Optional[float] = None
+ self.avg: float | None = None
def update(self, new_value: float) -> None:
if self.count < self.window_size:
@@ -198,14 +198,6 @@ class DebugStats:
self.logger.info("-" * 50)
-# Must support Python 3.8, we can't use str.removeprefix(prefix)
-# introduced in Python 3.9
-def remove_prefix(text: str, prefix: str) -> str:
- if text.startswith(prefix):
- return text[len(prefix) :]
- return text
-
-
def nanosec_to_millisec(value: float) -> float:
return value / 1000000.0
@@ -220,8 +212,8 @@ async def send_request(
chat_url: str,
model: str,
stream: bool = True,
- min_tokens: Optional[int] = None,
- max_tokens: Optional[int] = None,
+ min_tokens: int | None = None,
+ max_tokens: int | None = None,
) -> ServerResponse:
payload = {
"model": model,
@@ -250,9 +242,9 @@ async def send_request(
timeout = aiohttp.ClientTimeout(total=timeout_sec)
valid_response = True
- ttft: Optional[float] = None
+ ttft: float | None = None
chunk_delay: list[int] = []
- latency: Optional[float] = None
+ latency: float | None = None
first_chunk = ""
generated_text = ""
@@ -269,7 +261,7 @@ async def send_request(
if not chunk_bytes:
continue
- chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+ chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
if chunk == "[DONE]":
# End of stream
latency = time.perf_counter_ns() - start_time
@@ -364,7 +356,7 @@ async def send_turn(
req_args: RequestArgs,
verbose: bool,
verify_output: bool,
-) -> Optional[RequestStats]:
+) -> RequestStats | None:
assert messages_to_use > 0
assert messages_to_use <= len(conversation_messages)
@@ -769,7 +761,7 @@ def get_client_config(
"Number of conversations must be equal or larger than the number of clients"
)
- max_req_per_client: Optional[int] = None
+ max_req_per_client: int | None = None
if args.max_num_requests is not None:
# Max number of requests per client
req_per_client = args.max_num_requests // args.num_clients
@@ -962,7 +954,7 @@ async def main_mp(
# At this point all the clients finished,
# collect results (TTFT, TPOT, etc.) from all the clients.
- # This needs to happens before calling join on the clients
+ # This needs to happen before calling join on the clients
# (result_queue should be emptied).
while not result_queue.empty():
client_metrics.append(result_queue.get())
@@ -1032,7 +1024,7 @@ def process_statistics(
warmup_percentages: list[float],
test_params: dict,
verbose: bool,
- gen_conv_args: Optional[GenConvArgs] = None,
+ gen_conv_args: GenConvArgs | None = None,
excel_output: bool = False,
) -> None:
if len(client_metrics) == 0:
diff --git a/benchmarks/multi_turn/generate_multi_turn.json b/benchmarks/multi_turn/generate_multi_turn.json
index 274d03c2bdb2b..03cfc7d63e8aa 100644
--- a/benchmarks/multi_turn/generate_multi_turn.json
+++ b/benchmarks/multi_turn/generate_multi_turn.json
@@ -15,9 +15,8 @@
},
"prefix_num_tokens": {
"distribution": "lognormal",
- "mean": 6,
- "sigma": 4,
- "max": 1500
+ "average": 1000,
+ "max": 5000
},
"num_tokens": {
"distribution": "uniform",
diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml
deleted file mode 100644
index 65b1e09a247e2..0000000000000
--- a/benchmarks/pyproject.toml
+++ /dev/null
@@ -1,49 +0,0 @@
-# This local pyproject file is part of the migration from yapf to ruff format.
-# It uses the same core rules as the main pyproject.toml file, but with the
-# following differences:
-# - ruff line length is overridden to 88
-# - deprecated typing ignores (UP006, UP035) have been removed
-
-[tool.ruff]
-line-length = 88
-
-[tool.ruff.lint.per-file-ignores]
-"vllm/third_party/**" = ["ALL"]
-"vllm/version.py" = ["F401"]
-"vllm/_version.py" = ["ALL"]
-
-[tool.ruff.lint]
-select = [
- # pycodestyle
- "E",
- # Pyflakes
- "F",
- # pyupgrade
- "UP",
- # flake8-bugbear
- "B",
- # flake8-simplify
- "SIM",
- # isort
- "I",
- # flake8-logging-format
- "G",
-]
-ignore = [
- # star imports
- "F405", "F403",
- # lambda expression assignment
- "E731",
- # Loop control variable not used within loop body
- "B007",
- # f-string format
- "UP032",
- # Can remove once 3.10+ is the minimum Python version
- "UP007",
-]
-
-[tool.ruff.lint.isort]
-known-first-party = ["vllm"]
-
-[tool.ruff.format]
-docstring-code-format = true
\ No newline at end of file
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index cc38cd41a5b24..9bac5ea41c8d4 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -1,6 +1,7 @@
include(FetchContent)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -87,6 +88,7 @@ is_avx512_disabled(AVX512_DISABLED)
if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
message(STATUS "Apple Silicon Detected")
+ set(APPLE_SILICON_FOUND TRUE)
set(ENABLE_NUMA OFF)
check_sysctl(hw.optional.neon ASIMD_FOUND)
check_sysctl(hw.optional.arm.FEAT_BF16 ARM_BF16_FOUND)
@@ -99,6 +101,7 @@ else()
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
find_isa(${CPUINFO} "S390" S390_FOUND)
+ find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
endif()
if (AVX512_FOUND AND NOT AVX512_DISABLED)
@@ -175,8 +178,14 @@ elseif (S390_FOUND)
"-mzvector"
"-march=native"
"-mtune=native")
+elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
+ if(RVV_FOUND)
+ message(FATAL_ERROR "RISC-V RVV is not supported yet.")
+ else()
+ list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc")
+ endif()
else()
- message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA or ARMv8 support.")
+ message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
endif()
#
@@ -188,14 +197,25 @@ else()
set(USE_ACL OFF)
endif()
-if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
- FetchContent_Declare(
- oneDNN
- GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
- GIT_TAG v3.9
- GIT_PROGRESS TRUE
- GIT_SHALLOW TRUE
- )
+if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
+ set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.")
+
+ if(FETCHCONTENT_SOURCE_DIR_ONEDNN)
+ message(STATUS "Using oneDNN from specified source directory: ${FETCHCONTENT_SOURCE_DIR_ONEDNN}")
+ FetchContent_Declare(
+ oneDNN
+ SOURCE_DIR ${FETCHCONTENT_SOURCE_DIR_ONEDNN}
+ )
+ else()
+ message(STATUS "Downloading oneDNN from GitHub")
+ FetchContent_Declare(
+ oneDNN
+ GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
+ GIT_TAG v3.9
+ GIT_PROGRESS TRUE
+ GIT_SHALLOW TRUE
+ )
+ endif()
if(USE_ACL)
find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/)
@@ -204,6 +224,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND OR POWER9_FOUND OR POW
endif()
set(ONEDNN_AARCH64_USE_ACL "ON")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
+ add_compile_definitions(VLLM_USE_ACL)
endif()
set(ONEDNN_LIBRARY_TYPE "STATIC")
@@ -256,7 +277,8 @@ set(VLLM_EXT_SRC
"csrc/cpu/layernorm.cpp"
"csrc/cpu/mla_decode.cpp"
"csrc/cpu/pos_encoding.cpp"
- "csrc/cpu/torch_bindings.cpp")
+ "csrc/cpu/torch_bindings.cpp"
+ "csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
if (AVX512_FOUND AND NOT AVX512_DISABLED)
set(VLLM_EXT_SRC
@@ -298,4 +320,4 @@ define_gpu_extension_target(
WITH_SOABI
)
-message(STATUS "Enabling C extension.")
+message(STATUS "Enabling C extension.")
\ No newline at end of file
diff --git a/cmake/external_projects/flashmla.cmake b/cmake/external_projects/flashmla.cmake
index 02224cfe3ee81..c9e7aec880b99 100644
--- a/cmake/external_projects/flashmla.cmake
+++ b/cmake/external_projects/flashmla.cmake
@@ -18,8 +18,8 @@ if(FLASH_MLA_SRC_DIR)
else()
FetchContent_Declare(
flashmla
- GIT_REPOSITORY https://github.com/vllm-project/FlashMLA.git
- GIT_TAG a757314c04eedd166e329e846c820eb1bdd702de
+ GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
+ GIT_TAG 5f65b85703c7ed75fda01e06495077caad207c3f
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
@@ -33,23 +33,64 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
# The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
# Only build FlashMLA kernels if we are building for something compatible with
# sm90a
-cuda_archs_loose_intersection(FLASH_MLA_ARCHS "9.0a" "${CUDA_ARCHS}")
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
+
+set(SUPPORT_ARCHS)
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3)
+ list(APPEND SUPPORT_ARCHS 9.0a)
+endif()
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8)
+ list(APPEND SUPPORT_ARCHS 10.0a)
+endif()
+
+
+cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
+if(FLASH_MLA_ARCHS)
+ set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
+ list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")
+
set(FlashMLA_SOURCES
- ${flashmla_SOURCE_DIR}/csrc/flash_api.cpp
- ${flashmla_SOURCE_DIR}/csrc/kernels/get_mla_metadata.cu
- ${flashmla_SOURCE_DIR}/csrc/kernels/mla_combine.cu
- ${flashmla_SOURCE_DIR}/csrc/kernels/splitkv_mla.cu
- ${flashmla_SOURCE_DIR}/csrc/kernels_fp8/flash_fwd_mla_fp8_sm90.cu)
+ ${flashmla_SOURCE_DIR}/csrc/torch_api.cpp
+ ${flashmla_SOURCE_DIR}/csrc/pybind.cpp
+ ${flashmla_SOURCE_DIR}/csrc/smxx/get_mla_metadata.cu
+ ${flashmla_SOURCE_DIR}/csrc/smxx/mla_combine.cu
+ ${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/splitkv_mla.cu
+ ${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/splitkv_mla.cu
+ ${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/fwd.cu
+ ${flashmla_SOURCE_DIR}/csrc/sm100/decode/sparse_fp8/splitkv_mla.cu
+ ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_fwd_sm100.cu
+ ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_bwd_sm100.cu
+ ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd.cu
+ )
+
+ set(FlashMLA_Extension_SOURCES
+ ${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp
+ ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp
+ ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu
+ )
set(FlashMLA_INCLUDES
+ ${flashmla_SOURCE_DIR}/csrc
+ ${flashmla_SOURCE_DIR}/csrc/sm90
${flashmla_SOURCE_DIR}/csrc/cutlass/include
- ${flashmla_SOURCE_DIR}/csrc)
+ ${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
+ )
+
+ set(FlashMLA_Extension_INCLUDES
+ ${flashmla_SOURCE_DIR}/csrc
+ ${flashmla_SOURCE_DIR}/csrc/sm90
+ ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/
+ ${flashmla_SOURCE_DIR}/csrc/cutlass/include
+ ${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
+ )
set_gencode_flags_for_srcs(
SRCS "${FlashMLA_SOURCES}"
CUDA_ARCHS "${FLASH_MLA_ARCHS}")
+ set_gencode_flags_for_srcs(
+ SRCS "${FlashMLA_Extension_SOURCES}"
+ CUDA_ARCHS "${FLASH_MLA_ARCHS}")
+
define_gpu_extension_target(
_flashmla_C
DESTINATION vllm
@@ -60,8 +101,32 @@ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES}
USE_SABI 3
WITH_SOABI)
+
+ # Keep Stable ABI for the module, but *not* for CUDA/C++ files.
+ # This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
+ target_compile_options(_flashmla_C PRIVATE
+ $<$:-UPy_LIMITED_API>
+ $<$:-UPy_LIMITED_API>)
+
+ define_gpu_extension_target(
+ _flashmla_extension_C
+ DESTINATION vllm
+ LANGUAGE ${VLLM_GPU_LANG}
+ SOURCES ${FlashMLA_Extension_SOURCES}
+ COMPILE_FLAGS ${VLLM_FLASHMLA_GPU_FLAGS}
+ ARCHITECTURES ${VLLM_GPU_ARCHES}
+ INCLUDE_DIRECTORIES ${FlashMLA_Extension_INCLUDES}
+ USE_SABI 3
+ WITH_SOABI)
+
+ # Keep Stable ABI for the module, but *not* for CUDA/C++ files.
+ # This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
+ target_compile_options(_flashmla_extension_C PRIVATE
+ $<$:-UPy_LIMITED_API>
+ $<$:-UPy_LIMITED_API>)
else()
- # Create an empty target for setup.py when not targeting sm90a systems
+ # Create empty targets for setup.py when not targeting sm90a systems
add_custom_target(_flashmla_C)
+ add_custom_target(_flashmla_extension_C)
endif()
diff --git a/cmake/external_projects/qutlass.cmake b/cmake/external_projects/qutlass.cmake
new file mode 100644
index 0000000000000..9aace7693077a
--- /dev/null
+++ b/cmake/external_projects/qutlass.cmake
@@ -0,0 +1,97 @@
+include(FetchContent)
+
+set(CUTLASS_INCLUDE_DIR "${CUTLASS_INCLUDE_DIR}" CACHE PATH "Path to CUTLASS include/ directory")
+
+if(DEFINED ENV{QUTLASS_SRC_DIR})
+ set(QUTLASS_SRC_DIR $ENV{QUTLASS_SRC_DIR})
+endif()
+
+if(QUTLASS_SRC_DIR)
+ FetchContent_Declare(
+ qutlass
+ SOURCE_DIR ${QUTLASS_SRC_DIR}
+ CONFIGURE_COMMAND ""
+ BUILD_COMMAND ""
+ )
+else()
+ FetchContent_Declare(
+ qutlass
+ GIT_REPOSITORY https://github.com/IST-DASLab/qutlass.git
+ GIT_TAG 830d2c4537c7396e14a02a46fbddd18b5d107c65
+ GIT_PROGRESS TRUE
+ CONFIGURE_COMMAND ""
+ BUILD_COMMAND ""
+ )
+ FetchContent_Populate(qutlass)
+ set(qutlass_SOURCE_DIR "${qutlass_SOURCE_DIR}")
+endif()
+
+if(NOT qutlass_SOURCE_DIR)
+ message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
+endif()
+message(STATUS "[QUTLASS] QuTLASS is available at ${qutlass_SOURCE_DIR}")
+
+cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0a" "${CUDA_ARCHS}")
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND QUTLASS_ARCHS)
+
+ if(QUTLASS_ARCHS MATCHES "10\\.0a")
+ set(QUTLASS_TARGET_CC 100)
+ elseif(QUTLASS_ARCHS MATCHES "12\\.0a")
+ set(QUTLASS_TARGET_CC 120)
+ else()
+ message(FATAL_ERROR "[QUTLASS] internal error parsing CUDA_ARCHS='${QUTLASS_ARCHS}'.")
+ endif()
+
+ set(QUTLASS_SOURCES
+ ${qutlass_SOURCE_DIR}/qutlass/csrc/bindings.cpp
+ ${qutlass_SOURCE_DIR}/qutlass/csrc/gemm.cu
+ ${qutlass_SOURCE_DIR}/qutlass/csrc/gemm_ada.cu
+ ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx.cu
+ ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv.cu
+ ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx_sm100.cu
+ ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv_sm100.cu
+ )
+
+ set(QUTLASS_INCLUDES
+ ${qutlass_SOURCE_DIR}
+ ${qutlass_SOURCE_DIR}/qutlass
+ ${qutlass_SOURCE_DIR}/qutlass/csrc/include
+ ${qutlass_SOURCE_DIR}/qutlass/csrc/include/cutlass_extensions
+ )
+
+ if(CUTLASS_INCLUDE_DIR AND EXISTS "${CUTLASS_INCLUDE_DIR}/cutlass/cutlass.h")
+ list(APPEND QUTLASS_INCLUDES "${CUTLASS_INCLUDE_DIR}")
+ elseif(EXISTS "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include/cutlass/cutlass.h")
+ list(APPEND QUTLASS_INCLUDES "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include")
+ message(STATUS "[QUTLASS] Using QuTLASS vendored CUTLASS headers (no vLLM CUTLASS detected).")
+ else()
+ message(FATAL_ERROR "[QUTLASS] CUTLASS headers not found. "
+ "Set -DCUTLASS_INCLUDE_DIR=/path/to/cutlass/include")
+ endif()
+
+ set_gencode_flags_for_srcs(
+ SRCS "${QUTLASS_SOURCES}"
+ CUDA_ARCHS "${QUTLASS_ARCHS}"
+ )
+
+ target_sources(_C PRIVATE ${QUTLASS_SOURCES})
+ target_include_directories(_C PRIVATE ${QUTLASS_INCLUDES})
+ target_compile_definitions(_C PRIVATE
+ QUTLASS_DISABLE_PYBIND=1
+ TARGET_CUDA_ARCH=${QUTLASS_TARGET_CC}
+ )
+
+ set_property(SOURCE ${QUTLASS_SOURCES} APPEND PROPERTY COMPILE_OPTIONS
+ $<$:--expt-relaxed-constexpr --use_fast_math -O3>
+ )
+
+else()
+ if("${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS "12.8")
+ message(STATUS
+ "[QUTLASS] Skipping build: CUDA 12.8 or newer is required (found ${CMAKE_CUDA_COMPILER_VERSION}).")
+ else()
+ message(STATUS
+ "[QUTLASS] Skipping build: no supported arch (12.0a / 10.0a) found in "
+ "CUDA_ARCHS='${CUDA_ARCHS}'.")
+ endif()
+endif()
diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake
index 49defccbb1fa4..d4908772c69ec 100644
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
- GIT_TAG 57b4e68b9f9d94750b46de8f8dbd2bfcc86edd4f
+ GIT_TAG 8f468e7da54a8e2f98abfa7c38636aac91c0cba1
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/cmake/hipify.py b/cmake/hipify.py
index 55d378f5b1113..8504f9defee96 100755
--- a/cmake/hipify.py
+++ b/cmake/hipify.py
@@ -16,7 +16,7 @@ import shutil
from torch.utils.hipify.hipify_python import hipify
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Project directory where all the source + include files live.
@@ -34,15 +34,14 @@ if __name__ == '__main__':
)
# Source files to convert.
- parser.add_argument("sources",
- help="Source files to hipify.",
- nargs="*",
- default=[])
+ parser.add_argument(
+ "sources", help="Source files to hipify.", nargs="*", default=[]
+ )
args = parser.parse_args()
# Limit include scope to project_dir only
- includes = [os.path.join(args.project_dir, '*')]
+ includes = [os.path.join(args.project_dir, "*")]
# Get absolute path for all source files.
extra_files = [os.path.abspath(s) for s in args.sources]
@@ -51,25 +50,31 @@ if __name__ == '__main__':
# The directory might already exist to hold object files so we ignore that.
shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True)
- hipify_result = hipify(project_directory=args.project_dir,
- output_directory=args.output_dir,
- header_include_dirs=[],
- includes=includes,
- extra_files=extra_files,
- show_detailed=True,
- is_pytorch_extension=True,
- hipify_extra_files_only=True)
+ hipify_result = hipify(
+ project_directory=args.project_dir,
+ output_directory=args.output_dir,
+ header_include_dirs=[],
+ includes=includes,
+ extra_files=extra_files,
+ show_detailed=True,
+ is_pytorch_extension=True,
+ hipify_extra_files_only=True,
+ )
hipified_sources = []
for source in args.sources:
s_abs = os.path.abspath(source)
- hipified_s_abs = (hipify_result[s_abs].hipified_path if
- (s_abs in hipify_result
- and hipify_result[s_abs].hipified_path is not None)
- else s_abs)
+ hipified_s_abs = (
+ hipify_result[s_abs].hipified_path
+ if (
+ s_abs in hipify_result
+ and hipify_result[s_abs].hipified_path is not None
+ )
+ else s_abs
+ )
hipified_sources.append(hipified_s_abs)
- assert (len(hipified_sources) == len(args.sources))
+ assert len(hipified_sources) == len(args.sources)
# Print hipified source files.
print("\n".join(hipified_sources))
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 9c0ed1d09572e..f6a0d2b75be1a 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -310,13 +310,13 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
list(REMOVE_DUPLICATES _PTX_ARCHS)
list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS)
- # if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
- # remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS
+ # If x.0a or x.0f is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
+ # remove x.0a or x.0f from SRC_CUDA_ARCHS and add x.0a or x.0f to _CUDA_ARCHS
set(_CUDA_ARCHS)
foreach(_arch ${_SRC_CUDA_ARCHS})
- if(_arch MATCHES "\\a$")
+ if(_arch MATCHES "[af]$")
list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
- string(REPLACE "a" "" _base "${_arch}")
+ string(REGEX REPLACE "[af]$" "" _base "${_arch}")
if ("${_base}" IN_LIST TGT_CUDA_ARCHS)
list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}")
list(APPEND _CUDA_ARCHS "${_arch}")
@@ -480,7 +480,6 @@ function (define_gpu_extension_target GPU_MOD_NAME)
${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}")
endif()
- set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17)
target_compile_options(${GPU_MOD_NAME} PRIVATE
$<$:${GPU_COMPILE_FLAGS}>)
diff --git a/csrc/attention/attention_kernels.cuh b/csrc/attention/attention_kernels.cuh
index 57382c1ddc65b..052ff168cec4f 100644
--- a/csrc/attention/attention_kernels.cuh
+++ b/csrc/attention/attention_kernels.cuh
@@ -28,10 +28,10 @@
#ifdef USE_ROCM
#include
- #include "../quantization/fp8/amd/quant_utils.cuh"
+ #include "../quantization/w8a8/fp8/amd/quant_utils.cuh"
typedef __hip_bfloat16 __nv_bfloat16;
#else
- #include "../quantization/fp8/nvidia/quant_utils.cuh"
+ #include "../quantization/w8a8/fp8/nvidia/quant_utils.cuh"
#endif
#define MAX(a, b) ((a) > (b) ? (a) : (b))
diff --git a/csrc/attention/mla/cutlass_mla_entry.cu b/csrc/attention/mla/cutlass_mla_entry.cu
deleted file mode 100644
index 0319d1daf302f..0000000000000
--- a/csrc/attention/mla/cutlass_mla_entry.cu
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-
-#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA
-void cutlass_mla_decode_sm100a(torch::Tensor const& out,
- torch::Tensor const& q_nope,
- torch::Tensor const& q_pe,
- torch::Tensor const& kv_c_and_k_pe_cache,
- torch::Tensor const& seq_lens,
- torch::Tensor const& page_table, double scale);
-#endif
-
-void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope,
- torch::Tensor const& q_pe,
- torch::Tensor const& kv_c_and_k_pe_cache,
- torch::Tensor const& seq_lens,
- torch::Tensor const& page_table, double scale) {
-#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA
- return cutlass_mla_decode_sm100a(out, q_nope, q_pe, kv_c_and_k_pe_cache,
- seq_lens, page_table, scale);
-#endif
- TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled cutlass MLA");
-}
diff --git a/csrc/attention/mla/cutlass_mla_kernels.cu b/csrc/attention/mla/cutlass_mla_kernels.cu
deleted file mode 100644
index 9d05d910dd81f..0000000000000
--- a/csrc/attention/mla/cutlass_mla_kernels.cu
+++ /dev/null
@@ -1,225 +0,0 @@
-/*
- * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-
-#include
-#include
-
-#include "cute/tensor.hpp"
-
-#include "cutlass/cutlass.h"
-#include "cutlass/kernel_hardware_info.h"
-
-#include "cutlass_extensions/common.hpp"
-
-#include "device/sm100_mla.hpp"
-#include "kernel/sm100_mla_tile_scheduler.hpp"
-
-using namespace cute;
-using namespace cutlass::fmha::kernel;
-
-template
-struct MlaSm100 {
- using Element = T;
- using ElementAcc = float;
- using ElementOut = T;
-
- using TileShape = Shape<_128, _128, Shape<_512, _64>>;
- using TileShapeH = cute::tuple_element_t<0, TileShape>;
- using TileShapeD = cute::tuple_element_t<2, TileShape>;
-
- // H K (D_latent D_rope) B
- using ProblemShape = cute::tuple;
-
- using StrideQ = cute::tuple; // H D B
- using StrideK = cute::tuple; // K D B
- using StrideO = StrideK; // H D B
- using StrideLSE = cute::tuple<_1, int>; // H B
-
- using TileScheduler =
- std::conditional_t;
-
- using FmhaKernel =
- cutlass::fmha::kernel::Sm100FmhaMlaKernelTmaWarpspecialized<
- TileShape, Element, ElementAcc, ElementOut, ElementAcc, TileScheduler,
- /*kIsCpAsync=*/true>;
- using Fmha = cutlass::fmha::device::MLA;
-};
-
-template
-typename T::Fmha::Arguments args_from_options(
- at::Tensor const& out, at::Tensor const& q_nope, at::Tensor const& q_pe,
- at::Tensor const& kv_c_and_k_pe_cache, at::Tensor const& seq_lens,
- at::Tensor const& page_table, double scale) {
- cutlass::KernelHardwareInfo hw_info;
- hw_info.device_id = q_nope.device().index();
- hw_info.sm_count =
- cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
- hw_info.device_id);
-
- int batches = q_nope.sizes()[0];
- int page_count_per_seq = page_table.sizes()[1];
- int page_count_total = kv_c_and_k_pe_cache.sizes()[0];
- int page_size = kv_c_and_k_pe_cache.sizes()[1];
- int max_seq_len = page_size * page_count_per_seq;
- using TileShapeH = typename T::TileShapeH;
- using TileShapeD = typename T::TileShapeD;
- auto problem_shape =
- cute::make_tuple(TileShapeH{}, max_seq_len, TileShapeD{}, batches);
-
- auto [H, K, D, B] = problem_shape;
- auto [D_latent, D_rope] = D;
-
- using StrideQ = typename T::StrideQ;
- using StrideK = typename T::StrideK;
- using StrideO = typename T::StrideO;
- using StrideLSE = typename T::StrideLSE;
-
- StrideQ stride_Q_latent = cute::make_tuple(
- static_cast(D_latent), _1{}, static_cast(H * D_latent));
- StrideQ stride_Q_rope = cute::make_tuple(static_cast(D_rope), _1{},
- static_cast(H * D_rope));
- StrideK stride_C =
- cute::make_tuple(static_cast(D_latent + D_rope), _1{},
- static_cast(page_size * (D_latent + D_rope)));
- StrideLSE stride_PT = cute::make_stride(_1{}, page_count_per_seq);
- StrideLSE stride_LSE = cute::make_tuple(_1{}, static_cast(H));
- StrideO stride_O = cute::make_tuple(static_cast(D_latent), _1{},
- static_cast(H * D_latent));
-
- using Element = typename T::Element;
- using ElementOut = typename T::ElementOut;
- using ElementAcc = typename T::ElementAcc;
- auto Q_latent_ptr = static_cast(q_nope.data_ptr());
- auto Q_rope_ptr = static_cast(q_pe.data_ptr());
- auto C_ptr = static_cast(kv_c_and_k_pe_cache.data_ptr());
- auto scale_f = static_cast(scale);
- typename T::Fmha::Arguments arguments{
- problem_shape,
- {scale_f, Q_latent_ptr, stride_Q_latent, Q_rope_ptr, stride_Q_rope, C_ptr,
- stride_C, C_ptr + D_latent, stride_C,
- static_cast(seq_lens.data_ptr()),
- static_cast(page_table.data_ptr()), stride_PT, page_count_total,
- page_size},
- {static_cast(out.data_ptr()), stride_O,
- static_cast(nullptr), stride_LSE},
- hw_info,
- 1, // split_kv
- nullptr, // is_var_split_kv
- };
- // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
- // split_kv automatically based on batch size and sequence length to balance
- // workload across available SMs. Consider using var_split_kv for manual
- // control if needed.
- T::Fmha::set_split_kv(arguments);
- return arguments;
-}
-
-template
-void runMla(at::Tensor const& out, at::Tensor const& q_nope,
- at::Tensor const& q_pe, at::Tensor const& kv_c_and_k_pe_cache,
- at::Tensor const& seq_lens, at::Tensor const& page_table,
- float scale, cudaStream_t stream) {
- using MlaSm100Type = MlaSm100;
- typename MlaSm100Type::Fmha fmha;
- auto arguments = args_from_options(
- out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, scale);
- size_t workspace_size = MlaSm100Type::Fmha::get_workspace_size(arguments);
- auto const workspace_options =
- torch::TensorOptions().dtype(torch::kUInt8).device(q_nope.device());
- auto workspace = torch::empty(workspace_size, workspace_options);
-
- CUTLASS_CHECK(fmha.can_implement(arguments));
-
- CUTLASS_CHECK(fmha.initialize(arguments, workspace.data_ptr(), stream));
-
- CUTLASS_CHECK(fmha.run(arguments, workspace.data_ptr(), stream));
-}
-
-void cutlass_mla_decode_sm100a(torch::Tensor const& out,
- torch::Tensor const& q_nope,
- torch::Tensor const& q_pe,
- torch::Tensor const& kv_c_and_k_pe_cache,
- torch::Tensor const& seq_lens,
- torch::Tensor const& page_table, double scale) {
- TORCH_CHECK(q_nope.device().is_cuda(), "q_nope must be on CUDA");
- TORCH_CHECK(q_nope.dim() == 3, "q_nope must be a 3D tensor");
- TORCH_CHECK(q_pe.dim() == 3, "q_pe must be a 3D tensor");
- TORCH_CHECK(kv_c_and_k_pe_cache.dim() == 3,
- "kv_c_and_k_pe_cache must be a 3D tensor");
- TORCH_CHECK(seq_lens.dim() == 1, "seq_lens must be a 1D tensor");
- TORCH_CHECK(page_table.dim() == 2, "page_table must be a 2D tensor");
- TORCH_CHECK(out.dim() == 3, "out must be a 3D tensor");
-
- auto B_q_nope = q_nope.size(0);
- auto H_q_nope = q_nope.size(1);
- auto D_q_nope = q_nope.size(2);
- auto B_q_pe = q_pe.size(0);
- auto H_q_pe = q_pe.size(1);
- auto D_q_pe = q_pe.size(2);
- auto B_pt = page_table.size(0);
- auto PAGE_NUM = page_table.size(1);
- auto PAGE_SIZE = kv_c_and_k_pe_cache.size(1);
- auto D_ckv = kv_c_and_k_pe_cache.size(2);
- auto B_o = out.size(0);
- auto H_o = out.size(1);
- auto D_o = out.size(2);
-
- TORCH_CHECK(D_q_nope == 512, "D_q_nope must be equal to 512");
- TORCH_CHECK(D_q_pe == 64, "D_q_pe must be equal to 64");
- TORCH_CHECK(D_ckv == 576, "D_ckv must be equal to 576");
- TORCH_CHECK(H_q_nope == H_q_pe && H_q_nope == H_o && H_o == 128,
- "H_q_nope, H_q_pe, and H_o must be equal to 128");
- TORCH_CHECK(PAGE_SIZE > 0 && (PAGE_SIZE & (PAGE_SIZE - 1)) == 0,
- "PAGE_SIZE must be a power of 2");
- TORCH_CHECK(
- B_q_nope == B_q_pe && B_q_nope == B_pt && B_q_nope == B_o,
- "Batch dims must be same for page_table, q_nope and q_pe, and out");
- TORCH_CHECK(PAGE_NUM % (128 / PAGE_SIZE) == 0,
- "PAGE_NUM must be divisible by 128 / PAGE_SIZE");
- TORCH_CHECK(D_o == 512, "D_o must be equal to 512");
-
- TORCH_CHECK(q_nope.dtype() == at::ScalarType::Half ||
- q_nope.dtype() == at::ScalarType::BFloat16 ||
- q_nope.dtype() == at::ScalarType::Float8_e4m3fn,
- "q_nope must be a half, bfloat16, or float8_e4m3fn tensor");
- TORCH_CHECK(kv_c_and_k_pe_cache.dtype() == q_nope.dtype() &&
- q_nope.dtype() == q_pe.dtype(),
- "kv_c_and_k_pe_cache, q_nope, and q_pe must be the same type");
- TORCH_CHECK(seq_lens.dtype() == torch::kInt32,
- "seq_lens must be a 32-bit integer tensor");
- TORCH_CHECK(page_table.dtype() == torch::kInt32,
- "page_table must be a 32-bit integer tensor");
-
- auto in_dtype = q_nope.dtype();
- const at::cuda::OptionalCUDAGuard device_guard(device_of(q_nope));
- const cudaStream_t stream =
- at::cuda::getCurrentCUDAStream(q_nope.get_device());
- if (in_dtype == at::ScalarType::Half) {
- runMla(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens,
- page_table, scale, stream);
- } else if (in_dtype == at::ScalarType::BFloat16) {
- runMla(out, q_nope, q_pe, kv_c_and_k_pe_cache,
- seq_lens, page_table, scale, stream);
- } else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
- runMla(out, q_nope, q_pe, kv_c_and_k_pe_cache,
- seq_lens, page_table, scale, stream);
- } else {
- TORCH_CHECK(false, "Unsupported input data type of MLA");
- }
-}
diff --git a/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp b/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp
index 95e32559cd540..297d94dcc0631 100644
--- a/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp
+++ b/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp
@@ -133,6 +133,14 @@ public:
// printf(" sm_count = %d\n", sm_count);
int max_splits = ceil_div(K, 128);
max_splits = min(16, max_splits);
+
+ // TODO: This avoids a hang when the batch size is larger than 1 and
+ // there is more than one kv split.
+ // Discuss with NVIDIA how this can be fixed.
+ if (B > 1) {
+ max_splits = min(1, max_splits);
+ }
+
// printf(" max_splits = %d\n", max_splits);
int sms_per_batch = max(1, sm_count / B);
// printf(" sms_per_batch = %d\n", sms_per_batch);
diff --git a/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp b/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp
index 2cbc2379579eb..1f62c37ba4b7f 100644
--- a/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp
+++ b/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp
@@ -580,22 +580,22 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
auto blk_coord = tile_scheduler.get_block_coord();
auto problem_shape = params.problem_shape;
- auto local_split_kv = params.split_kv;
+ auto local_split_kv = params.split_kv;
if (params.mainloop.ptr_seq != nullptr) {
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
- if (params.ptr_split_kv != nullptr) {
+ if (params.ptr_split_kv != nullptr) {
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
}
}
- if (local_split_kv <= get<3>(blk_coord))
- continue;
+ if (local_split_kv <= get<3>(blk_coord))
+ continue;
load_page_table(
blk_coord,
problem_shape,
params.mainloop,
shared_storage.tensors,
pipeline_page_table, pipeline_pt_producer_state,
- local_split_kv
+ local_split_kv
);
}
}
@@ -604,15 +604,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
CUTLASS_PRAGMA_NO_UNROLL
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
auto blk_coord = tile_scheduler.get_block_coord();
- auto problem_shape = params.problem_shape;
- auto local_split_kv = params.split_kv;
+ auto problem_shape = params.problem_shape;
+ auto local_split_kv = params.split_kv;
if (params.mainloop.ptr_seq != nullptr) {
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
- if (params.ptr_split_kv != nullptr) {
+ if (params.ptr_split_kv != nullptr) {
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
}
}
- if (local_split_kv <= get<3>(blk_coord))
+ if (local_split_kv <= get<3>(blk_coord))
continue;
load_cpasync(
blk_coord,
@@ -621,7 +621,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
params.mainloop_params,
shared_storage.tensors,
pipeline_load_qk, pipeline_load_qk_producer_state,
- local_split_kv,
+ local_split_kv,
/* must be shared pipe */
pipeline_page_table, pipeline_pt_consumer_state
);
@@ -633,15 +633,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
CUTLASS_PRAGMA_NO_UNROLL
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
auto blk_coord = tile_scheduler.get_block_coord();
- auto problem_shape = params.problem_shape;
- auto local_split_kv = params.split_kv;
+ auto problem_shape = params.problem_shape;
+ auto local_split_kv = params.split_kv;
if (params.mainloop.ptr_seq != nullptr) {
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
- if (params.ptr_split_kv != nullptr) {
- local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
- }
+ if (params.ptr_split_kv != nullptr) {
+ local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
+ }
}
- if (local_split_kv <= get<3>(blk_coord))
+ if (local_split_kv <= get<3>(blk_coord))
continue;
 load_tma</* paged= */ true>(
blk_coord,
@@ -651,7 +651,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
shared_storage.tensors,
pipeline_load_qk, pipeline_load_qk_producer_state,
pipeline_load_qk, pipeline_load_qk_producer_state,
- local_split_kv
+ local_split_kv
);
cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait();
}
@@ -660,15 +660,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
CUTLASS_PRAGMA_NO_UNROLL
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
auto blk_coord = tile_scheduler.get_block_coord();
- auto problem_shape = params.problem_shape;
- auto local_split_kv = params.split_kv;
+ auto problem_shape = params.problem_shape;
+ auto local_split_kv = params.split_kv;
if (params.mainloop.ptr_seq != nullptr) {
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
- if (params.ptr_split_kv != nullptr) {
+ if (params.ptr_split_kv != nullptr) {
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
- }
+ }
}
- if (local_split_kv <= get<3>(blk_coord))
+ if (local_split_kv <= get<3>(blk_coord))
continue;
load_tma(
blk_coord,
@@ -678,7 +678,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
shared_storage.tensors,
pipeline_load_qk, pipeline_load_qk_producer_state,
pipeline_load_qk, pipeline_load_qk_producer_state,
- local_split_kv
+ local_split_kv
);
cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait();
}
@@ -694,14 +694,14 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
auto blk_coord = tile_scheduler.get_block_coord();
auto problem_shape = params.problem_shape;
- auto local_split_kv = params.split_kv;
+ auto local_split_kv = params.split_kv;
if (params.mainloop.ptr_seq != nullptr) {
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
if (params.ptr_split_kv != nullptr) {
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
}
}
- if (local_split_kv <= get<3>(blk_coord))
+ if (local_split_kv <= get<3>(blk_coord))
continue;
mma(blk_coord,
problem_shape,
@@ -711,7 +711,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
pipeline_mma_s, pipeline_mma_s_producer_state,
pipeline_p_mma, pipeline_p_mma_consumer_state,
pipeline_mma_o, pipeline_mma_o_producer_state,
- local_split_kv
+ local_split_kv
);
}
}
@@ -726,15 +726,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
auto blk_coord = tile_scheduler.get_block_coord();
auto problem_shape = params.problem_shape;
- auto split_kv = params.split_kv;
- auto local_split_kv = split_kv;
+ auto split_kv = params.split_kv;
+ auto local_split_kv = split_kv;
if (params.mainloop.ptr_seq != nullptr) {
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
- if (params.ptr_split_kv != nullptr) {
+ if (params.ptr_split_kv != nullptr) {
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
}
}
- if (local_split_kv <= get<3>(blk_coord))
+ if (local_split_kv <= get<3>(blk_coord))
continue;
compute(
blk_coord,
@@ -745,7 +745,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
pipeline_mma_s, pipeline_mma_s_consumer_state,
pipeline_p_mma, pipeline_p_mma_producer_state,
pipeline_mma_o, pipeline_mma_o_consumer_state,
- local_split_kv
+ local_split_kv
);
}
@@ -1900,7 +1900,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
cutlass::arch::NamedBarrier(
(kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp,
kNamedBarrierEpilogue
- ).arrive();
+ ).arrive_and_wait();
return;
}
diff --git a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
index 6dd6f269f3dc9..d1874515cc8fd 100644
--- a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
+++ b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
@@ -36,12 +36,14 @@ limitations under the License.
#if !defined(CUDA_VERSION) || CUDA_VERSION < 12040
void sm100_cutlass_mla_decode(
torch::Tensor const& out,
+ torch::Tensor const& lse,
torch::Tensor const& q_nope,
torch::Tensor const& q_pe,
torch::Tensor const& kv_c_and_k_pe_cache,
torch::Tensor const& seq_lens,
torch::Tensor const& page_table,
torch::Tensor const& workspace,
+ double sm_scale,
int64_t num_kv_splits) {
TORCH_CHECK(false, "CUDA version must be >= 12.4 for cutlass_mla_decode");
}
@@ -64,11 +66,11 @@ struct IsPersistent {
static const bool value = v;
};
-template >
+template >
struct MlaSm100 {
using Element = T;
using ElementAcc = float;
- using ElementOut = T;
+ using ElementOut = TOut;
using TileShape = Shape<_128, _128, Shape<_512, _64>>;
using TileShapeH = cute::tuple_element_t<0, TileShape>;
@@ -99,6 +101,7 @@ struct MlaSm100 {
template
typename T::Fmha::Arguments args_from_options(
at::Tensor const& out,
+ at::Tensor const& lse,
at::Tensor const& q_nope,
at::Tensor const& q_pe,
at::Tensor const& kv_c_and_k_pe_cache,
@@ -162,7 +165,10 @@ typename T::Fmha::Arguments args_from_options(
stride_PT,
page_count_total,
page_size},
- {static_cast(out.data_ptr()), stride_O, static_cast(nullptr), stride_LSE},
+ {static_cast(out.data_ptr()),
+ stride_O,
+ static_cast(lse.defined() ? lse.data_ptr() : nullptr),
+ stride_LSE},
hw_info,
// TODO(trevor-m): Change split_kv back to -1 when
// https://github.com/NVIDIA/cutlass/issues/2274 is fixed. Split_kv=1 will
@@ -178,9 +184,10 @@ typename T::Fmha::Arguments args_from_options(
return arguments;
}
-template
+template
void runMla(
at::Tensor const& out,
+ at::Tensor const& lse,
at::Tensor const& q_nope,
at::Tensor const& q_pe,
at::Tensor const& kv_c_and_k_pe_cache,
@@ -190,9 +197,9 @@ void runMla(
double sm_scale,
int64_t num_kv_splits,
cudaStream_t stream) {
- using MlaSm100Type = MlaSm100;
+ using MlaSm100Type = MlaSm100;
typename MlaSm100Type::Fmha fmha;
- auto arguments = args_from_options(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits);
+ auto arguments = args_from_options(out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits);
CUTLASS_CHECK(fmha.can_implement(arguments));
@@ -214,6 +221,7 @@ void runMla(
void sm100_cutlass_mla_decode(
torch::Tensor const& out,
+ torch::Tensor const& lse,
torch::Tensor const& q_nope,
torch::Tensor const& q_pe,
torch::Tensor const& kv_c_and_k_pe_cache,
@@ -233,14 +241,14 @@ void sm100_cutlass_mla_decode(
DISPATCH_BOOL(page_size == 128, IsPaged128, [&] {
DISPATCH_BOOL(num_kv_splits <= 1, NotManualSplitKV, [&] {
if (in_dtype == at::ScalarType::Half) {
- runMla>(
- out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+ runMla>(
+ out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
} else if (in_dtype == at::ScalarType::BFloat16) {
- runMla>(
- out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+ runMla>(
+ out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
} else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
- runMla>(
- out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+ runMla>(
+ out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
} else {
TORCH_CHECK(false, "Unsupported input data type of MLA");
}
@@ -253,7 +261,7 @@ void sm100_cutlass_mla_decode(
int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_batches, int64_t sm_count, int64_t num_kv_splits) {
// Workspace size depends on ElementAcc and ElementLSE (same as ElementAcc)
// which are float, so Element type here doesn't matter.
- using MlaSm100Type = MlaSm100;
+ using MlaSm100Type = MlaSm100;
// Get split kv. Requires problem shape and sm_count only.
typename MlaSm100Type::Fmha::Arguments arguments;
diff --git a/csrc/cache.h b/csrc/cache.h
index fb0c353b96137..b162a4a2bc31f 100644
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -47,4 +47,28 @@ void gather_and_maybe_dequant_cache(
torch::Tensor const& cu_seq_lens, // [BATCH+1]
int64_t batch_size, const std::string& kv_cache_dtype,
torch::Tensor const& scale,
- std::optional seq_starts = std::nullopt);
\ No newline at end of file
+ std::optional seq_starts = std::nullopt);
+
+// TODO(hc): cp_gather_cache needs to support scaled kv cache in the future.
+void cp_gather_cache(
+ torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
+ torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
+ torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
+ torch::Tensor const& cu_seq_lens, // [BATCH+1]
+ int64_t batch_size, std::optional seq_starts = std::nullopt);
+
+// Indexer K quantization and cache function
+void indexer_k_quant_and_cache(
+ torch::Tensor& k, // [num_tokens, head_dim]
+ torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]
+ torch::Tensor& slot_mapping, // [num_tokens]
+ int64_t quant_block_size, // quantization block size
+ const std::string& scale_fmt);
+
+// Extract function to gather quantized K cache
+void cp_gather_indexer_k_quant_cache(
+ const torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]
+ torch::Tensor& dst_k, // [num_tokens, head_dim]
+ torch::Tensor& dst_scale, // [num_tokens, head_dim / quant_block_size * 4]
+ const torch::Tensor& block_table, // [batch_size, num_blocks]
+ const torch::Tensor& cu_seq_lens); // [batch_size + 1]
\ No newline at end of file
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index b3a985c2d5bbb..0aa0dc14c7480 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -1,6 +1,7 @@
#include
#include
#include
+#include
#include "cuda_utils.h"
#include "cuda_compat.h"
@@ -8,15 +9,14 @@
#include "quantization/vectorization_utils.cuh"
#ifdef USE_ROCM
- #include "quantization/fp8/amd/quant_utils.cuh"
+ #include "quantization/w8a8/fp8/amd/quant_utils.cuh"
#else
- #include "quantization/fp8/nvidia/quant_utils.cuh"
+ #include "quantization/w8a8/fp8/nvidia/quant_utils.cuh"
#endif
#include
#include
-#include