diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py
index 68aff793ae6aa..76f6d7aeca0d8 100644
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@@ -5,11 +5,11 @@ import os
import sys
import zipfile
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
-# Note that we have 400 MiB quota, please use it wisely.
-# See https://github.com/pypi/support/issues/3792 .
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
+# Note that we have 800 MiB quota, please use it wisely.
+# See https://github.com/pypi/support/issues/6326 .
# Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
def print_top_10_largest_files(zip_file):
diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
index 50431d0cd4c5e..5ea5a50a258a4 100644
--- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
@@ -218,7 +218,7 @@ if __name__ == "__main__":
"--xaxis",
type=str,
default="# of max concurrency.",
- help="column name to use as X Axis in comparision graph",
+ help="column name to use as X Axis in comparison graph",
)
args = parser.parse_args()
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
index 2d88a0b30c4f8..f758097e098e4 100644
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
@@ -1,6 +1,6 @@
[
{
- "test_name": "serving_llama8B_tp1_sharegpt",
+ "test_name": "serving_llama8B_bf16_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
@@ -32,7 +32,7 @@
}
},
{
- "test_name": "serving_llama8B_tp2_sharegpt",
+ "test_name": "serving_llama8B_bf16_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
@@ -64,7 +64,7 @@
}
},
{
- "test_name": "serving_llama8B_tp4_sharegpt",
+ "test_name": "serving_llama8B_bf16_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
@@ -96,7 +96,7 @@
}
},
{
- "test_name": "serving_llama8B_tp1_random_128_128",
+ "test_name": "serving_llama8B_bf16_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
@@ -131,7 +131,7 @@
}
},
{
- "test_name": "serving_llama8B_tp2_random_128_128",
+ "test_name": "serving_llama8B_bf16_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
@@ -166,7 +166,7 @@
}
},
{
- "test_name": "serving_llama8B_tp4_random_128_128",
+ "test_name": "serving_llama8B_bf16_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
@@ -198,5 +198,413 @@
"random-output-len": 128,
"num_prompts": 1000
}
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp1_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp2_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp4_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 4,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp1_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp2_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp4_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 4,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp1_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp2_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp4_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 4,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp1_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp2_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp4_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 4,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
}
]
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
index 823abbaa99f86..ce396d6e54f27 100644
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
@@ -1,6 +1,6 @@
[
{
- "test_name": "serving_llama8B_pp1_sharegpt",
+ "test_name": "serving_llama8B_bf16_pp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
@@ -32,7 +32,39 @@
}
},
{
- "test_name": "serving_llama8B_pp3_sharegpt",
+ "test_name": "serving_llama8B_bf16_tp2_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_bf16_pp3_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
@@ -64,7 +96,7 @@
}
},
{
- "test_name": "serving_llama8B_tp2pp3_sharegpt",
+ "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
@@ -97,7 +129,7 @@
}
},
{
- "test_name": "serving_llama8B_pp1_random_128_128",
+ "test_name": "serving_llama8B_bf16_pp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
@@ -132,7 +164,42 @@
}
},
{
- "test_name": "serving_llama8B_pp3_random_128_128",
+ "test_name": "serving_llama8B_bf16_tp2_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_bf16_pp3_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
@@ -167,7 +234,7 @@
}
},
{
- "test_name": "serving_llama8B_tp2pp3_random_128_128",
+ "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
@@ -201,5 +268,553 @@
"ignore-eos": "",
"num_prompts": 1000
}
+ },
+ {
+ "test_name": "serving_llama8B_int8_pp1_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "pipeline_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp2_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_pp3_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 2,
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_pp1_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "pipeline_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp2_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_pp3_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 2,
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_pp1_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "pipeline_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp2_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_pp3_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 2,
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_pp1_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "pipeline_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp2_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_pp3_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 2,
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
}
]
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 92a1bcada3879..a1de41652c9a6 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,21 +1,24 @@
steps:
- # aarch64 + CUDA builds
- - label: "Build arm64 wheel - CUDA 12.8"
- id: build-wheel-arm64-cuda-12-8
+ # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
+ - label: "Build arm64 wheel - CUDA 12.9"
+ id: build-wheel-arm64-cuda-12-9
agents:
queue: arm64_cpu_queue_postmerge
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- # x86 + CUDA builds
+ - block: "Build CUDA 12.8 wheel"
+ key: block-build-cu128-wheel
+
- label: "Build wheel - CUDA 12.8"
+ depends_on: block-build-cu128-wheel
id: build-wheel-cuda-12-8
agents:
queue: cpu_queue_postmerge
@@ -44,18 +47,14 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
- # However, this block can be uncommented to save some compute hours.
- # - block: "Build CUDA 11.8 wheel"
- # key: block-build-cu118-wheel
-
- - label: "Build wheel - CUDA 11.8"
- # depends_on: block-build-cu118-wheel
- id: build-wheel-cuda-11-8
+ # x86 + CUDA builds
+ - label: "Build wheel - CUDA 12.9"
+ depends_on: ~
+ id: build-wheel-cuda-12-9
agents:
queue: cpu_queue_postmerge
commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
@@ -75,6 +74,7 @@ steps:
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+ # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
- label: "Build release image (arm64)"
depends_on: ~
id: build-release-image-arm64
@@ -82,7 +82,7 @@ steps:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# Add job to create multi-arch manifest
@@ -103,7 +103,7 @@ steps:
- create-multi-arch-manifest
- build-wheel-cuda-12-8
- build-wheel-cuda-12-6
- - build-wheel-cuda-11-8
+ - build-wheel-cuda-12-9
id: annotate-release-workflow
agents:
queue: cpu_queue_postmerge
@@ -150,18 +150,24 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- - block: "Build Neuron release image"
- key: block-neuron-release-image-build
- depends_on: ~
-
- - label: "Build and publish Neuron release image"
- depends_on: block-neuron-release-image-build
+ - label: "Build and publish nightly multi-arch image to DockerHub"
+ depends_on:
+ - create-multi-arch-manifest
+ if: build.env("NIGHTLY") == "1"
agents:
- queue: neuron-postmerge
+ queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
- - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
- - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
+ - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+ - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
+ - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+ - "docker push vllm/vllm-openai:nightly"
+ - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+ # Clean up old nightly builds (keep only last 14)
+ - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
+ plugins:
+ - docker-login#v3.0.0:
+ username: vllmbot
+ password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
diff --git a/.buildkite/scripts/cleanup-nightly-builds.sh b/.buildkite/scripts/cleanup-nightly-builds.sh
new file mode 100755
index 0000000000000..1a82f7d085233
--- /dev/null
+++ b/.buildkite/scripts/cleanup-nightly-builds.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+
+set -ex
+
+# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
+# This script uses the DockerHub API to list and delete old tags with the "nightly-" prefix
+
+# DockerHub API endpoint for vllm/vllm-openai repository
+REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
+
+# Get DockerHub token from environment
+if [ -z "$DOCKERHUB_TOKEN" ]; then
+ echo "Error: DOCKERHUB_TOKEN environment variable is not set"
+ exit 1
+fi
+
+# Function to get all tags from DockerHub
+get_all_tags() {
+ local page=1
+ local all_tags=""
+
+ while true; do
+ local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
+ "$REPO_API_URL?page=$page&page_size=100")
+
+ # Get both last_updated timestamp and tag name, separated by |
+ local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
+
+ if [ -z "$tags" ]; then
+ break
+ fi
+
+ all_tags="$all_tags$tags"$'\n'
+ page=$((page + 1))
+ done
+
+ # Sort by timestamp (newest first) and extract just the tag names
+ echo "$all_tags" | sort -r | cut -d'|' -f2
+}
+
+delete_tag() {
+ local tag_name="$1"
+ echo "Deleting tag: $tag_name"
+
+ local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
+ local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")
+
+ if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
+ echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
+ else
+ echo "Successfully deleted tag: $tag_name"
+ fi
+}
+
+# Get all nightly- prefixed tags, sorted by last_updated timestamp (newest first)
+echo "Fetching all tags from DockerHub..."
+all_tags=$(get_all_tags)
+
+if [ -z "$all_tags" ]; then
+ echo "No tags found to clean up"
+ exit 0
+fi
+
+# Count total tags
+total_tags=$(echo "$all_tags" | wc -l)
+echo "Found $total_tags tags"
+
+# Keep only the last 14 builds (including the current one)
+tags_to_keep=14
+tags_to_delete=$((total_tags - tags_to_keep))
+
+if [ $tags_to_delete -le 0 ]; then
+ echo "No tags need to be deleted (only $total_tags tags found, keeping $tags_to_keep)"
+ exit 0
+fi
+
+echo "Will delete $tags_to_delete old tags, keeping the newest $tags_to_keep"
+
+# Get tags to delete (skip the first $tags_to_keep tags)
+tags_to_delete_list=$(echo "$all_tags" | tail -n +$((tags_to_keep + 1)))
+
+if [ -z "$tags_to_delete_list" ]; then
+ echo "No tags to delete"
+ exit 0
+fi
+
+# Delete old tags
+echo "Deleting old tags..."
+while IFS= read -r tag; do
+ if [ -n "$tag" ]; then
+ delete_tag "$tag"
+ # Add a small delay to avoid rate limiting
+ sleep 1
+ fi
+done <<< "$tags_to_delete_list"
+
+echo "Cleanup completed successfully"
diff --git a/.buildkite/scripts/hardware_ci/run-neuron-test.sh b/.buildkite/scripts/hardware_ci/run-neuron-test.sh
deleted file mode 100644
index a397457c83261..0000000000000
--- a/.buildkite/scripts/hardware_ci/run-neuron-test.sh
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-
-# This script build the Neuron docker image and run the API server inside the container.
-# It serves a sanity check for compilation and basic model usage.
-set -e
-set -v
-
-image_name="neuron/vllm-ci"
-container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
-
-HF_CACHE="$(realpath ~)/huggingface"
-mkdir -p "${HF_CACHE}"
-HF_MOUNT="/root/.cache/huggingface"
-HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
-
-NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
-mkdir -p "${NEURON_COMPILE_CACHE_URL}"
-NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
-
-# Try building the docker image
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
-
-# prune old image and containers to save disk space, and only once a day
-# by using a timestamp file in tmp.
-if [ -f /tmp/neuron-docker-build-timestamp ]; then
- last_build=$(cat /tmp/neuron-docker-build-timestamp)
- current_time=$(date +%s)
- if [ $((current_time - last_build)) -gt 86400 ]; then
- # Remove dangling images (those that are not tagged and not used by any container)
- docker image prune -f
- # Remove unused volumes / force the system prune for old images as well.
- docker volume prune -f && docker system prune -f
- echo "$current_time" > /tmp/neuron-docker-build-timestamp
- fi
-else
- date "+%s" > /tmp/neuron-docker-build-timestamp
-fi
-
-docker build -t "${image_name}" -f docker/Dockerfile.neuron .
-
-# Setup cleanup
-remove_docker_container() {
- docker image rm -f "${image_name}" || true;
-}
-trap remove_docker_container EXIT
-
-# Run the image
-docker run --rm -it --device=/dev/neuron0 --network bridge \
- -v "${HF_CACHE}:${HF_MOUNT}" \
- -e "HF_HOME=${HF_MOUNT}" \
- -e "HF_TOKEN=${HF_TOKEN}" \
- -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
- -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
- --name "${container_name}" \
- ${image_name} \
- /bin/bash -c "
- set -e; # Exit on first error
- python3 /workspace/vllm/examples/offline_inference/neuron.py;
- python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
- for f in /workspace/vllm/tests/neuron/2_core/*.py; do
- echo \"Running test file: \$f\";
- python3 -m pytest \$f -v --capture=tee-sys;
- done
- "
\ No newline at end of file
diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
index 73f3e63fbf5f6..8c9b00990e995 100644
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -30,10 +30,12 @@ docker run \
bash -c '
set -e
echo $ZE_AFFINITY_MASK
- VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
- VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
- VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
- VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+ pip install tblib==3.1.0
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+ VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
cd tests
pytest -v -s v1/core
pytest -v -s v1/engine
diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh
index 745f285c008ad..43aa8c47be299 100644
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@@ -58,14 +58,15 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-if [[ $normal_wheel == *"cu118"* ]]; then
- # if $normal_wheel matches cu118, do not upload the index.html
- echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu126"* ]]; then
+if [[ $normal_wheel == *"cu126"* ]]; then
# if $normal_wheel matches cu126, do not upload the index.html
echo "Skipping index files for cu126 wheels"
+elif [[ $normal_wheel == *"cu128"* ]]; then
+ # if $normal_wheel matches cu128, do not upload the index.html
+ echo "Skipping index files for cu128 wheels"
else
- # only upload index.html for cu128 wheels (default wheels)
+ # only upload index.html for cu129 wheels (default wheels) as it
+ # is available on both x86 and arm64
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
fi
@@ -74,14 +75,15 @@ fi
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
-if [[ $normal_wheel == *"cu118"* ]]; then
- # if $normal_wheel matches cu118, do not upload the index.html
- echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu126"* ]]; then
+if [[ $normal_wheel == *"cu126"* ]]; then
# if $normal_wheel matches cu126, do not upload the index.html
echo "Skipping index files for cu126 wheels"
+elif [[ $normal_wheel == *"cu128"* ]]; then
+ # if $normal_wheel matches cu128, do not upload the index.html
+ echo "Skipping index files for cu128 wheels"
else
- # only upload index.html for cu128 wheels (default wheels)
+ # only upload index.html for cu129 wheels (default wheels) as it
+ # is available on both x86 and arm64
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
fi
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 482808cd07e8c..adb5c862eecd9 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -41,7 +41,8 @@ steps:
commands:
- bash standalone_tests/pytorch_nightly_dependency.sh
-- label: Async Engine, Inputs, Utils, Worker Test # 24min
+- label: Async Engine, Inputs, Utils, Worker Test # 36min
+ timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@@ -53,6 +54,7 @@ steps:
- tests/utils_
- tests/worker
- tests/standalone_tests/lazy_imports.py
+ - tests/transformers_utils
commands:
- python3 standalone_tests/lazy_imports.py
- pytest -v -s mq_llm_engine # MQLLMEngine
@@ -62,8 +64,10 @@ steps:
- pytest -v -s multimodal
- pytest -v -s utils_ # Utils
- pytest -v -s worker # Worker
+ - pytest -v -s transformers_utils # transformers_utils
-- label: Python-only Installation Test
+- label: Python-only Installation Test # 10min
+ timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- tests/standalone_tests/python_only_compile.sh
@@ -71,7 +75,8 @@ steps:
commands:
- bash standalone_tests/python_only_compile.sh
-- label: Basic Correctness Test # 30min
+- label: Basic Correctness Test # 20min
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
fast_check: true
torch_nightly: true
@@ -88,7 +93,8 @@ steps:
- pytest -v -s basic_correctness/test_cpu_offload.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
-- label: Core Test # 10min
+- label: Core Test # 22min
+ timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
fast_check: true
source_file_dependencies:
@@ -98,7 +104,19 @@ steps:
commands:
- pytest -v -s core
-- label: Entrypoints Test (LLM) # 40min
+- label: Entrypoints Unit Tests # 5min
+ timeout_in_minutes: 10
+ working_dir: "/vllm-workspace/tests"
+ fast_check: true
+ source_file_dependencies:
+ - vllm/entrypoints
+ - tests/entrypoints/
+ commands:
+ - pytest -v -s entrypoints/openai/tool_parsers
+ - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
+
+- label: Entrypoints Integration Test (LLM) # 30min
+ timeout_in_minutes: 40
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
fast_check: true
@@ -114,7 +132,8 @@ steps:
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
-- label: Entrypoints Test (API Server) # 40min
+- label: Entrypoints Integration Test (API Server) # 100min
+ timeout_in_minutes: 130
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
fast_check: true
@@ -126,10 +145,24 @@ steps:
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
- - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py
+ - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
- pytest -v -s entrypoints/test_chat_utils.py
-- label: Distributed Tests (4 GPUs) # 10min
+- label: Entrypoints Integration Test (Pooling)
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ working_dir: "/vllm-workspace/tests"
+ fast_check: true
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/pooling
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s entrypoints/pooling
+
+- label: Distributed Tests (4 GPUs) # 35min
+ timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 4
@@ -172,7 +205,8 @@ steps:
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- popd
-- label: EPLB Algorithm Test
+- label: EPLB Algorithm Test # 5min
+ timeout_in_minutes: 15
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/distributed/eplb
@@ -181,6 +215,7 @@ steps:
- pytest -v -s distributed/test_eplb_algo.py
- label: EPLB Execution Test # 5min
+ timeout_in_minutes: 15
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
@@ -189,13 +224,14 @@ steps:
commands:
- pytest -v -s distributed/test_eplb_execute.py
-- label: Metrics, Tracing Test # 10min
+- label: Metrics, Tracing Test # 12min
+ timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
num_gpus: 2
source_file_dependencies:
- vllm/
- tests/metrics
- - tests/tracing
+ - tests/v1/tracing
commands:
- pytest -v -s metrics
- "pip install \
@@ -208,7 +244,8 @@ steps:
##### fast check tests #####
##### 1 GPU test #####
-- label: Regression Test # 5min
+- label: Regression Test # 7min
+ timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@@ -218,7 +255,8 @@ steps:
- pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
-- label: Engine Test # 10min
+- label: Engine Test # 25min
+ timeout_in_minutes: 40
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@@ -233,7 +271,8 @@ steps:
# OOM in the CI unless we run this separately
- pytest -v -s tokenization
-- label: V1 Test e2e + engine
+- label: V1 Test e2e + engine # 30min
+ timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@@ -244,7 +283,8 @@ steps:
- pytest -v -s v1/e2e
- pytest -v -s v1/engine
-- label: V1 Test entrypoints
+- label: V1 Test entrypoints # 35min
+ timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@@ -252,7 +292,8 @@ steps:
commands:
- pytest -v -s v1/entrypoints
-- label: V1 Test others
+- label: V1 Test others # 42min
+ timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@@ -276,7 +317,8 @@ steps:
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
-- label: Examples Test # 25min
+- label: Examples Test # 30min
+ timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
@@ -294,14 +336,14 @@ steps:
- python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- - python3 offline_inference/encoder_decoder.py
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
-- label: Platform Tests (CUDA)
+- label: Platform Tests (CUDA) # 4min
+ timeout_in_minutes: 15
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@@ -309,7 +351,8 @@ steps:
commands:
- pytest -v -s cuda/test_cuda_context.py
-- label: Samplers Test # 36min
+- label: Samplers Test # 56min
+ timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/model_executor/layers
@@ -320,15 +363,23 @@ steps:
- pytest -v -s samplers
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
-- label: LoRA Test %N # 15min each
+- label: LoRA Test %N # 20min each
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/lora
- tests/lora
- command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py
+ commands:
+ - pytest -v -s lora \
+ --shard-id=$$BUILDKITE_PARALLEL_JOB \
+ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+ --ignore=lora/test_chatglm3_tp.py \
+ --ignore=lora/test_llama_tp.py \
+ --ignore=lora/test_llm_with_multi_loras.py
parallelism: 4
-- label: PyTorch Compilation Unit Tests
+- label: PyTorch Compilation Unit Tests # 15min
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@@ -344,7 +395,8 @@ steps:
- pytest -v -s compile/test_fusion_all_reduce.py
- pytest -v -s compile/test_decorator.py
-- label: PyTorch Fullgraph Smoke Test # 9min
+- label: PyTorch Fullgraph Smoke Test # 15min
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@@ -352,13 +404,10 @@ steps:
- tests/compile
commands:
- pytest -v -s compile/test_basic_correctness.py
- # these tests need to be separated, cannot combine
- - pytest -v -s compile/piecewise/test_simple.py
- - pytest -v -s compile/piecewise/test_toy_llama.py
- - pytest -v -s compile/piecewise/test_full_cudagraph.py
- - pytest -v -s compile/piecewise/test_multiple_graphs.py
+ - pytest -v -s compile/piecewise/
-- label: PyTorch Fullgraph Test # 18min
+- label: PyTorch Fullgraph Test # 20min
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@@ -367,7 +416,8 @@ steps:
commands:
- pytest -v -s compile/test_full_graph.py
-- label: Kernels Core Operation Test
+- label: Kernels Core Operation Test # 48min
+ timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/
@@ -375,7 +425,8 @@ steps:
commands:
- pytest -v -s kernels/core
-- label: Kernels Attention Test %N
+- label: Kernels Attention Test %N # 23min
+ timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/attention/
@@ -386,7 +437,8 @@ steps:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
-- label: Kernels Quantization Test %N
+- label: Kernels Quantization Test %N # 64min
+ timeout_in_minutes: 90
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/quantization/
@@ -396,7 +448,8 @@ steps:
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
-- label: Kernels MoE Test %N
+- label: Kernels MoE Test %N # 40min
+ timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/quantization/cutlass_w8a8/moe/
@@ -408,7 +461,8 @@ steps:
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
-- label: Kernels Mamba Test
+- label: Kernels Mamba Test # 31min
+ timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/mamba/
@@ -416,7 +470,8 @@ steps:
commands:
- pytest -v -s kernels/mamba
-- label: Tensorizer Test # 11min
+- label: Tensorizer Test # 14min
+ timeout_in_minutes: 25
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/model_executor/model_loader
@@ -428,7 +483,8 @@ steps:
- pytest -v -s tensorizer_loader
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
-- label: Model Executor Test
+- label: Model Executor Test # 7min
+ timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/model_executor
@@ -438,7 +494,8 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s model_executor
-- label: Benchmarks # 9min
+- label: Benchmarks # 11min
+ timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/.buildkite"
source_file_dependencies:
@@ -446,7 +503,8 @@ steps:
commands:
- bash scripts/run-benchmarks.sh
-- label: Benchmarks CLI Test # 10min
+- label: Benchmarks CLI Test # 7min
+ timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@@ -454,7 +512,8 @@ steps:
commands:
- pytest -v -s benchmarks/
-- label: Quantization Test
+- label: Quantization Test # 70min
+ timeout_in_minutes: 90
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/
@@ -463,10 +522,15 @@ steps:
commands:
# temporary install here since we need nightly, will move to requirements/test.in
# after torchao 0.12 release, and pin a working version of torchao nightly here
+
+ # since torchao nightly is only compatible with torch nightly currently
+ # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
+ # we can only upgrade after this is resolved
- pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- label: LM Eval Small Models # 53min
+ timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/
@@ -474,7 +538,8 @@ steps:
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
-- label: OpenAI API correctness
+- label: OpenAI API correctness # 22min
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/
@@ -483,7 +548,8 @@ steps:
commands: # LMEval+Transcription WER check
- pytest -s entrypoints/openai/correctness/
-- label: Encoder Decoder tests # 5min
+- label: Encoder Decoder tests # 12min
+ timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@@ -491,7 +557,8 @@ steps:
commands:
- pytest -v -s encoder_decoder
-- label: OpenAI-Compatible Tool Use # 20 min
+- label: OpenAI-Compatible Tool Use # 23 min
+ timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
fast_check: false
source_file_dependencies:
@@ -504,30 +571,82 @@ steps:
##### models test #####
-- label: Basic Models Test # 24min
+- label: Basic Models Tests (Initialization)
+ timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- - tests/models
+ - tests/models/test_initialization.py
commands:
- - pytest -v -s models/test_transformers.py
- - pytest -v -s models/test_registry.py
- - pytest -v -s models/test_utils.py
- - pytest -v -s models/test_vision.py
- - pytest -v -s models/test_initialization.py
+ # Run a subset of model initialization tests
+ - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
-- label: Language Models Test (Standard)
+- label: Basic Models Tests (Extra Initialization) %N
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/model_executor/models/
+ - tests/models/test_initialization.py
+ commands:
+ # Only when vLLM model source is modified - test initialization of a large
+ # subset of supported models (the complement of the small subset in the above
+  # test). Also run if the model initialization test file is modified
+ - pytest -v -s models/test_initialization.py \
+ -k 'not test_can_initialize_small_subset' \
+ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+ --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Basic Models Tests (Other)
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_transformers.py
+ - tests/models/test_registry.py
+ - tests/models/test_utils.py
+ - tests/models/test_vision.py
+ commands:
+ - pytest -v -s models/test_transformers.py \
+ models/test_registry.py \
+ models/test_utils.py \
+ models/test_vision.py
+
+- label: Language Models Tests (Standard)
+ timeout_in_minutes: 25
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/language
commands:
+ # Test standard language models, excluding a subset of slow tests
- pip freeze | grep -E 'torch'
- - pytest -v -s models/language -m core_model
+ - pytest -v -s models/language -m 'core_model and (not slow_test)'
-- label: Language Models Test (Hybrid) # 35 min
+- label: Language Models Tests (Extra Standard) %N
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/model_executor/models/
+ - tests/models/language/pooling/test_embedding.py
+ - tests/models/language/generation/test_common.py
+ - tests/models/language/pooling/test_classification.py
+ commands:
+  # Shard the slow subset of standard language model tests. Only run when model
+ # source is modified, or when specified test files are modified
+ - pip freeze | grep -E 'torch'
+ - pytest -v -s models/language -m 'core_model and slow_test' \
+ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+ --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Language Models Tests (Hybrid) %N
+ timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@@ -538,9 +657,15 @@ steps:
# Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- - pytest -v -s models/language/generation -m hybrid_model
+ # Shard hybrid language model tests
+ - pytest -v -s models/language/generation \
+ -m hybrid_model \
+ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+ --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
-- label: Language Models Test (Extended Generation) # 1hr20min
+- label: Language Models Test (Extended Generation) # 80min
+ timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
@@ -551,7 +676,18 @@ steps:
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
+- label: Language Models Test (PPL)
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/generation_ppl_test
+ commands:
+ - pytest -v -s models/language/generation_ppl_test
+
- label: Language Models Test (Extended Pooling) # 36min
+ timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
@@ -560,16 +696,27 @@ steps:
commands:
- pytest -v -s models/language/pooling -m 'not core_model'
-- label: Multi-Modal Processor Test
+- label: Language Models Test (MTEB)
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/pooling_mteb_test
+ commands:
+ - pytest -v -s models/language/pooling_mteb_test
+
+- label: Multi-Modal Processor Test # 44min
+ timeout_in_minutes: 60
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
- - pytest -v -s models/multimodal/processing/test_tensor_schema.py
+ - pytest -v -s models/multimodal/processing
-- label: Multi-Modal Models Test (Standard)
+- label: Multi-Modal Models Test (Standard) # 60min
+ timeout_in_minutes: 80
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@@ -579,7 +726,7 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pip freeze | grep -E 'torch'
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
+ - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Models Test (Extended) 1
mirror_hardwares: [amdexperimental]
@@ -611,7 +758,8 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
-- label: Quantized Models Test
+- label: Quantized Models Test # 45 min
+ timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/model_executor/layers/quantization
@@ -641,7 +789,8 @@ steps:
- python3 examples/offline_inference/audio_language.py --model-type whisper
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
-- label: Blackwell Test
+- label: Blackwell Test # 38 min
+ timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
gpu: b200
# optional: true
@@ -663,7 +812,8 @@ steps:
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
- - pytest -v -s tests/kernels/test_cutlass_mla_decode.py
+ - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
+ - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
# Quantization
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
@@ -683,6 +833,7 @@ steps:
##### multi gpus test #####
- label: Distributed Comm Ops Test # 7min
+ timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
@@ -692,8 +843,11 @@ steps:
commands:
- pytest -v -s distributed/test_comm_ops.py
- pytest -v -s distributed/test_shm_broadcast.py
+ - pytest -v -s distributed/test_shm_buffer.py
+ - pytest -v -s distributed/test_shm_storage.py
- label: 2 Node Tests (4 GPUs in total) # 16min
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
@@ -717,7 +871,8 @@ steps:
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
-- label: Distributed Tests (2 GPUs) # 40min
+- label: Distributed Tests (2 GPUs) # 110min
+ timeout_in_minutes: 150
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
@@ -748,7 +903,8 @@ steps:
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
- - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
+ - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
+ - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
# test sequence parallel
- pytest -v -s distributed/test_sequence_parallel.py
# this test fails consistently.
@@ -758,6 +914,7 @@ steps:
- pytest -v -s models/multimodal/generation/test_maverick.py
- label: Plugin Tests (2 GPUs) # 40min
+ timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
@@ -770,6 +927,11 @@ steps:
- pytest -v -s plugins_tests/test_platform_plugins.py
- pip uninstall vllm_add_dummy_platform -y
# end platform plugin tests
+ # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
+ - pip install -e ./plugins/prithvi_io_processor_plugin
+ - pytest -v -s plugins_tests/test_io_processor_plugins.py
+ - pip uninstall prithvi_io_processor_plugin -y
+ # end io_processor plugins test
# other tests continue here:
- pytest -v -s plugins_tests/test_scheduler_plugins.py
- pip install -e ./plugins/vllm_add_dummy_model
@@ -778,7 +940,8 @@ steps:
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
-- label: Pipeline Parallelism Test # 45min
+- label: Pipeline + Context Parallelism Test # 45min
+ timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 4
@@ -791,8 +954,10 @@ steps:
commands:
- pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py
+ # - pytest -v -s distributed/test_context_parallel.py # TODO: enable it on Hopper runners or add triton MLA support
-- label: LoRA TP Test (Distributed)
+- label: LoRA TP Test (Distributed) # 17 min
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
num_gpus: 4
source_file_dependencies:
@@ -810,9 +975,10 @@ steps:
- label: Weight Loading Multiple GPU Test # 33min
+ timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
- num_gpus: 2
+ num_gpus: 2
optional: true
source_file_dependencies:
- vllm/
diff --git a/.github/.bc-linter.yml b/.github/.bc-linter.yml
new file mode 100644
index 0000000000000..443dfa45af22c
--- /dev/null
+++ b/.github/.bc-linter.yml
@@ -0,0 +1,24 @@
+# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
+version: 1
+paths:
+# We temporarily disable globally, and will only enable with `annotations.include`
+# include:
+# - "vllm/v1/attention/*.py"
+# - "vllm/v1/core/*.py"
+exclude:
+ - "**/*.py"
+
+scan:
+ functions: true # check free functions and methods
+ classes: true # check classes/dataclasses
+ public_only: true # ignore names starting with "_" at any level
+
+annotations:
+  include: # decorators that force-include a symbol
+ - name: "bc_linter_include" # matched by simple name or dotted suffix
+ propagate_to_members: false # for classes, include methods/inner classes
+  exclude: # decorators that force-exclude a symbol
+ - name: "bc_linter_skip" # matched by simple name or dotted suffix
+ propagate_to_members: true # for classes, exclude methods/inner classes
+
+excluded_violations: [] # e.g. ["ParameterRenamed", "FieldTypeChanged"]
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index c087fd555c661..846b68054c0a1 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -5,18 +5,21 @@
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
+/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
/vllm/model_executor/layers/mamba @tdoublep
-/vllm/multimodal @DarkLight1337 @ywang96
+/vllm/model_executor/model_loader @22quinn
+/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
+/vllm/v1/sample @22quinn @houseroad
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
-/vllm/reasoning @aarnphm
-/vllm/entrypoints @aarnphm
+/vllm/reasoning @aarnphm @chaunceyjiang
+/vllm/entrypoints @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
+/vllm/distributed/kv_transfer @NickLucche
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig changes can have a large user-facing impact,
@@ -25,8 +28,11 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/structured_output @mgoin @russellb @aarnphm
+/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
+/vllm/v1/spec_decode @benchislett @luccafong
/vllm/v1/attention/backends/triton_attn.py @tdoublep
+/vllm/v1/core @heheda12345
+/vllm/v1/kv_cache_interface.py @heheda12345
# Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo
@@ -34,18 +40,20 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
/tests/models @DarkLight1337 @ywang96
-/tests/multimodal @DarkLight1337 @ywang96
+/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
/tests/prefix_caching @comaniac @KuntaiDu
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
+/tests/v1/core @heheda12345
/tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/lora @jeejeelee
/tests/models/language/generation/test_hybrid.py @tdoublep
+/tests/v1/kv_connector/nixl_integration @NickLucche
# Docs
/docs @hmellor
@@ -67,6 +75,9 @@ mkdocs.yaml @hmellor
/vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow
/vllm/model_executor/models/qwen* @sighingnow
+# MTP-specific files
+/vllm/model_executor/models/deepseek_mtp.py @luccafong
+
# Mistral-specific files
/vllm/model_executor/models/mistral*.py @patrickvonplaten
/vllm/model_executor/models/mixtral*.py @patrickvonplaten
@@ -86,3 +97,8 @@ mkdocs.yaml @hmellor
/vllm/attention/ops/rocm*.py @gshtras
/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
+# TPU
+/vllm/v1/worker/tpu* @NickLucche
+/vllm/platforms/tpu.py @NickLucche
+/vllm/v1/sample/tpu @NickLucche
+/tests/v1/tpu @NickLucche
\ No newline at end of file
diff --git a/.github/mergify.yml b/.github/mergify.yml
index 495d207d44260..f2dd2e06214ae 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -124,9 +124,16 @@ pull_request_rules:
- or:
- files~=^examples/.*gpt[-_]?oss.*\.py
- files~=^tests/.*gpt[-_]?oss.*\.py
+ - files~=^tests/entrypoints/openai/test_response_api_with_harmony.py
+ - files~=^tests/entrypoints/test_context.py
- files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
- files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
+ - files~=^vllm/entrypoints/harmony_utils.py
+ - files~=^vllm/entrypoints/tool_server.py
+ - files~=^vllm/entrypoints/tool.py
+ - files~=^vllm/entrypoints/context.py
- title~=(?i)gpt[-_]?oss
+ - title~=(?i)harmony
actions:
label:
add:
@@ -273,6 +280,20 @@ pull_request_rules:
users:
- "sangstar"
+- name: assign reviewer for modelopt changes
+ conditions:
+ - or:
+ - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
+ - files~=^vllm/model_executor/layers/quantization/__init__\.py$
+ - files~=^tests/models/quantization/test_modelopt\.py$
+ - files~=^tests/quantization/test_modelopt\.py$
+ - files~=^tests/models/quantization/test_nvfp4\.py$
+ - files~=^docs/features/quantization/modelopt\.md$
+ actions:
+ assign:
+ users:
+ - "Edwardf0t1"
+
- name: remove 'needs-rebase' label when conflict is resolved
conditions:
- -conflict
diff --git a/.github/workflows/add_label_automerge.yml b/.github/workflows/add_label_automerge.yml
index 315042fbf5cf4..d8bbedef3174b 100644
--- a/.github/workflows/add_label_automerge.yml
+++ b/.github/workflows/add_label_automerge.yml
@@ -10,7 +10,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Add label
- uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
github.rest.issues.addLabels({
diff --git a/.github/workflows/bc-lint.yml b/.github/workflows/bc-lint.yml
new file mode 100644
index 0000000000000..823695a921321
--- /dev/null
+++ b/.github/workflows/bc-lint.yml
@@ -0,0 +1,29 @@
+name: BC Lint
+
+on:
+ pull_request:
+ types:
+ - opened
+ - synchronize
+ - reopened
+ - labeled
+ - unlabeled
+
+jobs:
+ bc_lint:
+ if: github.repository_owner == 'vllm-project'
+ runs-on: ubuntu-latest
+ steps:
+ - name: Run BC Lint Action
+ uses: pytorch/test-infra/.github/actions/bc-lint@main
+ with:
+ repo: ${{ github.event.pull_request.head.repo.full_name }}
+ base_sha: ${{ github.event.pull_request.base.sha }}
+ head_sha: ${{ github.event.pull_request.head.sha }}
+ suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }}
+ docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter'
+ config_dir: .github
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
+ cancel-in-progress: true
diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml
index d5c6b8d43a6ef..c3e132a536a42 100644
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@@ -16,7 +16,7 @@ jobs:
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python
- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+ uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: '3.12'
diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml
index e0ab3872d8fa3..c2b17abe811cd 100644
--- a/.github/workflows/issue_autolabel.yml
+++ b/.github/workflows/issue_autolabel.yml
@@ -13,7 +13,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Label issues based on keywords
- uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
// Configuration: Add new labels and keywords here
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index 195579f206a2f..e21d13b8161f3 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -17,7 +17,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+ - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: "3.12"
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml
index 1ee605dc7bb0d..8884359fa0ce4 100644
--- a/.github/workflows/reminder_comment.yml
+++ b/.github/workflows/reminder_comment.yml
@@ -9,7 +9,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Remind to run full CI on PR
- uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
try {
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index 656f3d3fa7bc4..82844810a633a 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -13,7 +13,7 @@ jobs:
actions: write
runs-on: ubuntu-latest
steps:
- - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
+ - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
with:
# Increasing this value ensures that changes to this workflow
# propagate to all issues and PRs in days rather than months
diff --git a/.gitignore b/.gitignore
index 465935d488f84..b1df673e83ca8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,7 +4,7 @@
# vllm-flash-attn built from source
vllm/vllm_flash_attn/*
-# triton jit
+# triton jit
.triton
# Byte-compiled / optimized / DLL files
@@ -177,6 +177,14 @@ cython_debug/
# VSCode
.vscode/
+# Claude
+CLAUDE.md
+.claude/
+
+# Codex
+AGENTS.md
+.codex/
+
# DS Store
.DS_Store
@@ -209,4 +217,4 @@ shellcheck*/
csrc/moe/marlin_moe_wna16/kernel_*
# Ignore ep_kernels_workspace folder
-ep_kernels_workspace/
\ No newline at end of file
+ep_kernels_workspace/
diff --git a/.yapfignore b/.yapfignore
index 2d6dcf8380cac..38158259032a6 100644
--- a/.yapfignore
+++ b/.yapfignore
@@ -1 +1,2 @@
collect_env.py
+vllm/model_executor/layers/fla/ops/*.py
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3f1f9a781a07a..8df349ce14fda 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,6 +13,10 @@ cmake_minimum_required(VERSION 3.26)
# cmake --install . --component _C
project(vllm_extensions LANGUAGES CXX)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
diff --git a/MANIFEST.in b/MANIFEST.in
index 82fd22b845f09..fb3cccbb4a9c1 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -2,7 +2,6 @@ include LICENSE
include requirements/common.txt
include requirements/cuda.txt
include requirements/rocm.txt
-include requirements/neuron.txt
include requirements/cpu.txt
include CMakeLists.txt
diff --git a/README.md b/README.md
index 8812aac4ea266..0c6e5aa6b31d2 100644
--- a/README.md
+++ b/README.md
@@ -14,19 +14,24 @@ Easy, fast, and cheap LLM serving for everyone
| Documentation | Blog | Paper | Twitter/X | User Forum | Developer Slack |
+---
+Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundation.org/pytorch-conference/) and [Ray Summit, November 3-5](https://www.anyscale.com/ray-summit/2025) in San Francisco for our latest updates on vLLM and to meet the vLLM team! Register now for the largest vLLM community events of the year!
+
---
*Latest News* 🔥
+- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
+- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
-- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
-- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
Previous News
+- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
+- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
@@ -76,7 +81,7 @@ vLLM is flexible and easy to use with:
- Tensor, pipeline, data and expert parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
-- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
+- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
- Prefix caching support
- Multi-LoRA support
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 38072152b653b..ee172642033de 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -95,6 +95,24 @@ become available.
✅ |
lmms-lab/LLaVA-OneVision-Data, Aeala/ShareGPT_Vicuna_unfiltered |
+
+ | HuggingFace-MTBench |
+ ✅ |
+ ✅ |
+ philschmid/mt-bench |
+
+
+ | HuggingFace-Blazedit |
+ ✅ |
+ ✅ |
+ vdaita/edit_5k_char, vdaita/edit_10k_char |
+
+
+ | Spec Bench |
+ ✅ |
+ ✅ |
+ wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl |
+
| Custom |
✅ |
@@ -110,7 +128,12 @@ become available.
🚧: to be supported
-**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
+**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`.
+For local `dataset-path`, please set `hf-name` to its Hugging Face ID like
+
+```bash
+--dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat
+```
## 🚀 Example - Online Benchmark
@@ -234,6 +257,43 @@ vllm bench serve \
--num-prompts 2048
```
+### Spec Bench Benchmark with Speculative Decoding
+
+``` bash
+VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
+ --speculative-config $'{"method": "ngram",
+ "num_speculative_tokens": 5, "prompt_lookup_max": 5,
+ "prompt_lookup_min": 2}'
+```
+
+[SpecBench dataset](https://github.com/hemingkx/Spec-Bench)
+
+Run all categories:
+
+``` bash
+# Download the dataset using:
+# wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl
+
+vllm bench serve \
+ --model meta-llama/Meta-Llama-3-8B-Instruct \
+ --dataset-name spec_bench \
+ --dataset-path "/data/spec_bench/question.jsonl" \
+ --num-prompts -1
+```
+
+Available categories include `[writing, roleplay, reasoning, math, coding, extraction, stem, humanities, translation, summarization, qa, math_reasoning, rag]`.
+
+Run only a specific category like "summarization":
+
+``` bash
+vllm bench serve \
+ --model meta-llama/Meta-Llama-3-8B-Instruct \
+ --dataset-name spec_bench \
+ --dataset-path "/data/spec_bench/question.jsonl" \
+  --num-prompts -1 \
+ --spec-bench-category "summarization"
+```
+
### Other HuggingFaceDataset Examples
```bash
@@ -290,6 +350,18 @@ vllm bench serve \
--num-prompts 80
```
+`vdaita/edit_5k_char` or `vdaita/edit_10k_char`:
+
+``` bash
+vllm bench serve \
+ --model Qwen/QwQ-32B \
+ --dataset-name hf \
+ --dataset-path vdaita/edit_5k_char \
+ --num-prompts 90 \
+ --blazedit-min-distance 0.01 \
+ --blazedit-max-distance 0.99
+```
+
### Running With Sampling Parameters
When using OpenAI-compatible backends such as `vllm`, optional sampling
@@ -689,7 +761,7 @@ python -m vllm.entrypoints.openai.api_server \
Send requests with images:
```bash
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
--backend openai-chat \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dataset-name sharegpt \
@@ -716,7 +788,7 @@ python -m vllm.entrypoints.openai.api_server \
Send requests with videos:
```bash
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
--backend openai-chat \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dataset-name sharegpt \
diff --git a/benchmarks/auto_tune/README.md b/benchmarks/auto_tune/README.md
index 9aad51df6e003..3aa988aac2548 100644
--- a/benchmarks/auto_tune/README.md
+++ b/benchmarks/auto_tune/README.md
@@ -31,6 +31,12 @@ cd vllm
You must set the following variables at the top of the script before execution.
+ Note: You can also override the default values below via environment variables when running the script.
+
+```bash
+MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LEN=128 OUTPUT_LEN=2048 MAX_MODEL_LEN=2300 MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 NUM_SEQS_LIST="128 256" NUM_BATCHED_TOKENS_LIST="1024 2048 4096" VLLM_LOGGING_LEVEL=DEBUG bash auto_tune.sh
+```
+
| Variable | Description | Example Value |
| --- | --- | --- |
| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh
index 82c20ffa6554c..ed3679b66f805 100644
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@@ -5,25 +5,41 @@
TAG=$(date +"%Y_%m_%d_%H_%M")
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-BASE="$SCRIPT_DIR/../../.."
-MODEL="meta-llama/Llama-3.1-8B-Instruct"
-SYSTEM="TPU"
-TP=1
-DOWNLOAD_DIR=""
-INPUT_LEN=4000
-OUTPUT_LEN=16
-MAX_MODEL_LEN=4096
-MIN_CACHE_HIT_PCT=0
-MAX_LATENCY_ALLOWED_MS=100000000000
-NUM_SEQS_LIST="128 256"
-NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
+VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO}
+BASE=${BASE:-"$SCRIPT_DIR/../../.."}
+MODEL=${MODEL:-"meta-llama/Llama-3.1-8B-Instruct"}
+SYSTEM=${SYSTEM:-"TPU"}
+TP=${TP:-1}
+DOWNLOAD_DIR=${DOWNLOAD_DIR:-""}
+INPUT_LEN=${INPUT_LEN:-4000}
+OUTPUT_LEN=${OUTPUT_LEN:-16}
+MAX_MODEL_LEN=${MAX_MODEL_LEN:-4096}
+MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
+MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
+NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
+NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"
PROFILE_PATH="$LOG_FOLDER/profile"
-echo "result file: $RESULT"
-echo "model: $MODEL"
+echo "====================== AUTO TUNE PARAMETERS ===================="
+echo "SCRIPT_DIR=$SCRIPT_DIR"
+echo "BASE=$BASE"
+echo "MODEL=$MODEL"
+echo "SYSTEM=$SYSTEM"
+echo "TP=$TP"
+echo "DOWNLOAD_DIR=$DOWNLOAD_DIR"
+echo "INPUT_LEN=$INPUT_LEN"
+echo "OUTPUT_LEN=$OUTPUT_LEN"
+echo "MAX_MODEL_LEN=$MAX_MODEL_LEN"
+echo "MIN_CACHE_HIT_PCT=$MIN_CACHE_HIT_PCT"
+echo "MAX_LATENCY_ALLOWED_MS=$MAX_LATENCY_ALLOWED_MS"
+echo "NUM_SEQS_LIST=$NUM_SEQS_LIST"
+echo "NUM_BATCHED_TOKENS_LIST=$NUM_BATCHED_TOKENS_LIST"
+echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL"
+echo "RESULT_FILE=$RESULT"
+echo "====================== AUTO TUNE PARAMETERS ===================="
rm -rf $LOG_FOLDER
rm -rf $PROFILE_PATH
@@ -213,7 +229,7 @@ run_benchmark() {
pkill -if vllm
sleep 10
- printf '=%.0s' $(seq 1 20)
+ echo "===================="
return 0
}
diff --git a/benchmarks/benchmark_block_pool.py b/benchmarks/benchmark_block_pool.py
index fd363c2ad0514..eae8d9927ea39 100644
--- a/benchmarks/benchmark_block_pool.py
+++ b/benchmarks/benchmark_block_pool.py
@@ -57,7 +57,7 @@ def invoke_main() -> None:
"--num-iteration",
type=int,
default=1000,
- help="Number of iterations to run to stablize final data readings",
+ help="Number of iterations to run to stabilize final data readings",
)
parser.add_argument(
"--allocate-blocks",
diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
index 2ea4f9ccaff2b..64ffa62c04d85 100644
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -403,7 +403,7 @@ class RandomDataset(BenchmarkDataset):
# [6880, 6881] -> ['Ġcalls', 'here'] ->
# [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
# To avoid uncontrolled change of the prompt length,
- # the encoded sequence is truncated before being decode again.
+ # the encoded sequence is truncated before being decoded again.
total_input_len = prefix_len + int(input_lens[i])
re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
:total_input_len
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index d8b960edaa468..a7892f3f71243 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -1,191 +1,17 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Benchmark the latency of processing a single batch of requests."""
-
-import argparse
-import dataclasses
-import json
-import os
-import time
-from typing import Any, Optional
-
-import numpy as np
-from tqdm import tqdm
-from typing_extensions import deprecated
-
-import vllm.envs as envs
-from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
-from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import EngineArgs
-from vllm.inputs import PromptType
-from vllm.sampling_params import BeamSearchParams
-from vllm.utils import FlexibleArgumentParser
-
-
-def save_to_pytorch_benchmark_format(
- args: argparse.Namespace, results: dict[str, Any]
-) -> None:
- pt_records = convert_to_pytorch_benchmark_format(
- args=args,
- metrics={"latency": results["latencies"]},
- extra_info={k: results[k] for k in ["avg_latency", "percentiles"]},
- )
- if pt_records:
- pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
- write_to_json(pt_file, pt_records)
-
-
-@deprecated(
- "benchmark_latency.py is deprecated and will be removed in a "
- "future version. Please use 'vllm bench latency' instead.",
-)
-def main(args: argparse.Namespace):
- print(args)
-
- engine_args = EngineArgs.from_cli_args(args)
-
- # NOTE(woosuk): If the request cannot be processed in a single batch,
- # the engine will automatically process the request in multiple batches.
- llm = LLM(**dataclasses.asdict(engine_args))
- assert llm.llm_engine.model_config.max_model_len >= (
- args.input_len + args.output_len
- ), (
- "Please ensure that max_model_len is greater than"
- " the sum of input_len and output_len."
- )
-
- sampling_params = SamplingParams(
- n=args.n,
- temperature=1.0,
- top_p=1.0,
- ignore_eos=True,
- max_tokens=args.output_len,
- detokenize=not args.disable_detokenize,
- )
- print(sampling_params)
- dummy_prompt_token_ids = np.random.randint(
- 10000, size=(args.batch_size, args.input_len)
- )
- dummy_prompts: list[PromptType] = [
- {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
- ]
-
- def llm_generate():
- if not args.use_beam_search:
- llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
- else:
- llm.beam_search(
- dummy_prompts,
- BeamSearchParams(
- beam_width=args.n,
- max_tokens=args.output_len,
- ignore_eos=True,
- ),
- )
-
- def run_to_completion(profile_dir: Optional[str] = None):
- if profile_dir:
- llm.start_profile()
- llm_generate()
- llm.stop_profile()
- else:
- start_time = time.perf_counter()
- llm_generate()
- end_time = time.perf_counter()
- latency = end_time - start_time
- return latency
-
- print("Warming up...")
- for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
- run_to_completion(profile_dir=None)
-
- if args.profile:
- profile_dir = envs.VLLM_TORCH_PROFILER_DIR
- print(f"Profiling (results will be saved to '{profile_dir}')...")
- run_to_completion(profile_dir=profile_dir)
- return
-
- # Benchmark.
- latencies = []
- for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
- latencies.append(run_to_completion(profile_dir=None))
- latencies = np.array(latencies)
- percentages = [10, 25, 50, 75, 90, 99]
- percentiles = np.percentile(latencies, percentages)
- print(f"Avg latency: {np.mean(latencies)} seconds")
- for percentage, percentile in zip(percentages, percentiles):
- print(f"{percentage}% percentile latency: {percentile} seconds")
-
- # Output JSON results if specified
- if args.output_json:
- results = {
- "avg_latency": np.mean(latencies),
- "latencies": latencies.tolist(),
- "percentiles": dict(zip(percentages, percentiles.tolist())),
- }
- with open(args.output_json, "w") as f:
- json.dump(results, f, indent=4)
- save_to_pytorch_benchmark_format(args, results)
-
-
-def create_argument_parser():
- parser = FlexibleArgumentParser(
- description="Benchmark the latency of processing a single batch of "
- "requests till completion."
- )
- parser.add_argument("--input-len", type=int, default=32)
- parser.add_argument("--output-len", type=int, default=128)
- parser.add_argument("--batch-size", type=int, default=8)
- parser.add_argument(
- "--n",
- type=int,
- default=1,
- help="Number of generated sequences per prompt.",
- )
- parser.add_argument("--use-beam-search", action="store_true")
- parser.add_argument(
- "--num-iters-warmup",
- type=int,
- default=10,
- help="Number of iterations to run for warmup.",
- )
- parser.add_argument(
- "--num-iters", type=int, default=30, help="Number of iterations to run."
- )
- parser.add_argument(
- "--profile",
- action="store_true",
- help="profile the generation process of a single batch",
- )
- parser.add_argument(
- "--output-json",
- type=str,
- default=None,
- help="Path to save the latency results in JSON format.",
- )
- parser.add_argument(
- "--disable-detokenize",
- action="store_true",
- help=(
- "Do not detokenize responses (i.e. do not include "
- "detokenization time in the latency measurement)"
- ),
- )
-
- parser = EngineArgs.add_cli_args(parser)
- # V1 enables prefix caching by default which skews the latency
- # numbers. We need to disable prefix caching by default.
- parser.set_defaults(enable_prefix_caching=False)
-
- return parser
-
+import sys
if __name__ == "__main__":
- parser = create_argument_parser()
- args = parser.parse_args()
- if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
- raise OSError(
- "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
- "Please set it to a valid path to use torch profiler."
- )
- main(args)
+ print("""DEPRECATED: This script has been moved to the vLLM CLI.
+
+Please use the following command instead:
+ vllm bench latency
+
+For help with the new command, run:
+ vllm bench latency --help
+
+Alternatively, you can run the new command directly with:
+ python -m vllm.entrypoints.cli.main bench latency --help
+""")
+ sys.exit(1)
diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py
index c60040d05ab7a..11833fa1b3c8b 100644
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@@ -77,7 +77,7 @@ def invoke_main() -> None:
"--num-iteration",
type=int,
default=100,
- help="Number of iterations to run to stablize final data readings",
+ help="Number of iterations to run to stabilize final data readings",
)
parser.add_argument(
"--num-req", type=int, default=128, help="Number of requests in the batch"
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 02f5f585c0c16..76cf51498020b 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -1,1324 +1,17 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-r"""Benchmark online serving throughput.
-
-On the server side, run one of the following commands:
- vLLM OpenAI API server
- vllm serve \
- --swap-space 16
-
-On the client side, run:
- python benchmarks/benchmark_serving.py \
- --backend \
- --model \
- --dataset-name sharegpt \
- --dataset-path \
- --request-rate \ # By default is inf
- --num-prompts # By default is 1000
-
- when using tgi backend, add
- --endpoint /generate_stream
- to the end of the command above.
-"""
-
-import argparse
-import asyncio
-import gc
-import json
-import os
-import random
-import time
-import warnings
-from collections.abc import Iterable
-from dataclasses import dataclass
-from datetime import datetime
-from typing import Any, Literal, Optional
-
-import numpy as np
-from tqdm.asyncio import tqdm
-from transformers import PreTrainedTokenizerBase
-from typing_extensions import deprecated
-
-from backend_request_func import (
- ASYNC_REQUEST_FUNCS,
- OPENAI_COMPATIBLE_BACKENDS,
- RequestFuncInput,
- RequestFuncOutput,
-)
-
-try:
- from vllm.transformers_utils.tokenizer import get_tokenizer
-except ImportError:
- from backend_request_func import get_tokenizer
-
-try:
- from vllm.utils import FlexibleArgumentParser
-except ImportError:
- from argparse import ArgumentParser as FlexibleArgumentParser
-
-from benchmark_dataset import (
- AIMODataset,
- ASRDataset,
- BurstGPTDataset,
- ConversationDataset,
- CustomDataset,
- HuggingFaceDataset,
- InstructCoderDataset,
- MTBenchDataset,
- NextEditPredictionDataset,
- RandomDataset,
- SampleRequest,
- ShareGPTDataset,
- SonnetDataset,
- VisionArenaDataset,
-)
-from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
-from vllm.benchmarks.serve import get_request
-
-MILLISECONDS_TO_SECONDS_CONVERSION = 1000
-
-
-@dataclass
-class BenchmarkMetrics:
- completed: int
- total_input: int
- total_output: int
- request_throughput: float
- request_goodput: float
- output_throughput: float
- total_token_throughput: float
- mean_ttft_ms: float
- median_ttft_ms: float
- std_ttft_ms: float
- percentiles_ttft_ms: list[tuple[float, float]]
- mean_tpot_ms: float
- median_tpot_ms: float
- std_tpot_ms: float
- percentiles_tpot_ms: list[tuple[float, float]]
- mean_itl_ms: float
- median_itl_ms: float
- std_itl_ms: float
- percentiles_itl_ms: list[tuple[float, float]]
- # E2EL stands for end-to-end latency per request.
- # It is the time taken on the client side from sending
- # a request to receiving a complete response.
- mean_e2el_ms: float
- median_e2el_ms: float
- std_e2el_ms: float
- percentiles_e2el_ms: list[tuple[float, float]]
-
-
-def calculate_metrics(
- input_requests: list[SampleRequest],
- outputs: list[RequestFuncOutput],
- dur_s: float,
- tokenizer: PreTrainedTokenizerBase,
- selected_percentile_metrics: list[str],
- selected_percentiles: list[float],
- goodput_config_dict: dict[str, float],
-) -> tuple[BenchmarkMetrics, list[int]]:
- actual_output_lens: list[int] = []
- total_input = 0
- completed = 0
- good_completed = 0
- itls: list[float] = []
- tpots: list[float] = []
- all_tpots: list[float] = []
- ttfts: list[float] = []
- e2els: list[float] = []
- for i in range(len(outputs)):
- if outputs[i].success:
- output_len = outputs[i].output_tokens
-
- if not output_len:
- # We use the tokenizer to count the number of output tokens
- # for some serving backends instead of looking at
- # len(outputs[i].itl) since multiple output tokens may be
- # bundled together
- # Note : this may inflate the output token count slightly
- output_len = len(
- tokenizer(
- outputs[i].generated_text, add_special_tokens=False
- ).input_ids
- )
- actual_output_lens.append(output_len)
- total_input += input_requests[i].prompt_len
- tpot = 0
- if output_len > 1:
- latency_minus_ttft = outputs[i].latency - outputs[i].ttft
- tpot = latency_minus_ttft / (output_len - 1)
- tpots.append(tpot)
- # Note: if output_len <= 1, we regard tpot as 0 for goodput
- all_tpots.append(tpot)
- itls += outputs[i].itl
- ttfts.append(outputs[i].ttft)
- e2els.append(outputs[i].latency)
- completed += 1
- else:
- actual_output_lens.append(0)
-
- if goodput_config_dict:
- valid_metrics = []
- slo_values = []
-
- if "ttft" in goodput_config_dict:
- valid_metrics.append(ttfts)
- slo_values.append(
- goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION
- )
- if "tpot" in goodput_config_dict:
- valid_metrics.append(all_tpots)
- slo_values.append(
- goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION
- )
- if "e2el" in goodput_config_dict:
- valid_metrics.append(e2els)
- slo_values.append(
- goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION
- )
-
- for req_metric in zip(*valid_metrics):
- is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
- if is_good_req:
- good_completed += 1
-
- if completed == 0:
- warnings.warn(
- "All requests failed. This is likely due to a misconfiguration "
- "on the benchmark arguments.",
- stacklevel=2,
- )
- metrics = BenchmarkMetrics(
- completed=completed,
- total_input=total_input,
- total_output=sum(actual_output_lens),
- request_throughput=completed / dur_s,
- request_goodput=good_completed / dur_s,
- output_throughput=sum(actual_output_lens) / dur_s,
- total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
- mean_ttft_ms=np.mean(ttfts or 0)
- * 1000, # ttfts is empty if streaming is not supported by backend
- std_ttft_ms=np.std(ttfts or 0) * 1000,
- median_ttft_ms=np.median(ttfts or 0) * 1000,
- percentiles_ttft_ms=[
- (p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles
- ],
- mean_tpot_ms=np.mean(tpots or 0) * 1000,
- std_tpot_ms=np.std(tpots or 0) * 1000,
- median_tpot_ms=np.median(tpots or 0) * 1000,
- percentiles_tpot_ms=[
- (p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles
- ],
- mean_itl_ms=np.mean(itls or 0) * 1000,
- std_itl_ms=np.std(itls or 0) * 1000,
- median_itl_ms=np.median(itls or 0) * 1000,
- percentiles_itl_ms=[
- (p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles
- ],
- mean_e2el_ms=np.mean(e2els or 0) * 1000,
- std_e2el_ms=np.std(e2els or 0) * 1000,
- median_e2el_ms=np.median(e2els or 0) * 1000,
- percentiles_e2el_ms=[
- (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles
- ],
- )
-
- return metrics, actual_output_lens
-
-
-async def benchmark(
- backend: str,
- api_url: str,
- base_url: str,
- model_id: str,
- model_name: str,
- tokenizer: PreTrainedTokenizerBase,
- input_requests: list[SampleRequest],
- logprobs: Optional[int],
- request_rate: float,
- burstiness: float,
- disable_tqdm: bool,
- profile: bool,
- selected_percentile_metrics: list[str],
- selected_percentiles: list[float],
- ignore_eos: bool,
- goodput_config_dict: dict[str, float],
- max_concurrency: Optional[int],
- lora_modules: Optional[Iterable[str]],
- extra_body: Optional[dict],
- ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
- ramp_up_start_rps: Optional[int] = None,
- ramp_up_end_rps: Optional[int] = None,
-):
- if backend in ASYNC_REQUEST_FUNCS:
- request_func = ASYNC_REQUEST_FUNCS[backend]
- else:
- raise ValueError(f"Unknown backend: {backend}")
-
- print("Starting initial single prompt test run...")
- test_prompt, test_prompt_len, test_output_len, test_mm_content = (
- input_requests[0].prompt,
- input_requests[0].prompt_len,
- input_requests[0].expected_output_len,
- input_requests[0].multi_modal_data,
- )
-
- assert (
- test_mm_content is None
- or isinstance(test_mm_content, dict)
- or (
- isinstance(test_mm_content, list)
- and all(isinstance(item, dict) for item in test_mm_content)
- )
- ), "multi_modal_data must be a dict or list[dict]"
- test_input = RequestFuncInput(
- model=model_id,
- model_name=model_name,
- prompt=test_prompt,
- api_url=api_url,
- prompt_len=test_prompt_len,
- output_len=test_output_len,
- logprobs=logprobs,
- multi_modal_content=test_mm_content,
- ignore_eos=ignore_eos,
- extra_body=extra_body,
- )
-
- test_output = await request_func(request_func_input=test_input)
- if not test_output.success:
- raise ValueError(
- "Initial test run failed - Please make sure benchmark arguments "
- f"are correctly specified. Error: {test_output.error}"
- )
- else:
- print("Initial test run completed. Starting main benchmark run...")
-
- if lora_modules:
- # For each input request, choose a LoRA module at random.
- lora_modules = iter(
- [random.choice(lora_modules) for _ in range(len(input_requests))]
- )
-
- if profile:
- print("Starting profiler...")
- profile_input = RequestFuncInput(
- model=model_id,
- model_name=model_name,
- prompt=test_prompt,
- api_url=base_url + "/start_profile",
- prompt_len=test_prompt_len,
- output_len=test_output_len,
- logprobs=logprobs,
- multi_modal_content=test_mm_content,
- ignore_eos=ignore_eos,
- extra_body=extra_body,
- )
- profile_output = await request_func(request_func_input=profile_input)
- if profile_output.success:
- print("Profiler started")
-
- distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution"
-
- if ramp_up_strategy is not None:
- print(
- f"Traffic ramp-up strategy: {ramp_up_strategy}. Will increase "
- f"RPS from {ramp_up_start_rps} to {ramp_up_end_rps} RPS over "
- "the duration of the benchmark."
- )
- else:
- print(f"Traffic request rate: {request_rate} RPS.")
-
- print(f"Burstiness factor: {burstiness} ({distribution})")
- print(f"Maximum request concurrency: {max_concurrency}")
-
- pbar = None if disable_tqdm else tqdm(total=len(input_requests))
-
- # This can be used once the minimum Python version is 3.10 or higher,
- # and it will simplify the code in limited_request_func.
- # semaphore = (asyncio.Semaphore(max_concurrency)
- # if max_concurrency else contextlib.nullcontext())
- semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
-
- async def limited_request_func(request_func_input, pbar):
- if semaphore is None:
- return await request_func(request_func_input=request_func_input, pbar=pbar)
- async with semaphore:
- return await request_func(request_func_input=request_func_input, pbar=pbar)
-
- benchmark_start_time = time.perf_counter()
- tasks: list[asyncio.Task] = []
-
- rps_change_events = []
- last_int_rps = -1
- if ramp_up_strategy is not None and ramp_up_start_rps is not None:
- last_int_rps = ramp_up_start_rps
- rps_change_events.append(
- {
- "rps": last_int_rps,
- "timestamp": datetime.now().isoformat(),
- }
- )
-
- async for request, current_request_rate in get_request(
- input_requests,
- request_rate,
- burstiness,
- ramp_up_strategy,
- ramp_up_start_rps,
- ramp_up_end_rps,
- ):
- if ramp_up_strategy is not None:
- current_int_rps = int(current_request_rate)
- if current_int_rps > last_int_rps:
- timestamp = datetime.now().isoformat()
- for rps_val in range(last_int_rps + 1, current_int_rps + 1):
- rps_change_events.append({"rps": rps_val, "timestamp": timestamp})
- last_int_rps = current_int_rps
-
- prompt, prompt_len, output_len, mm_content, request_id = (
- request.prompt,
- request.prompt_len,
- request.expected_output_len,
- request.multi_modal_data,
- request.request_id,
- )
- req_model_id, req_model_name = model_id, model_name
- if lora_modules:
- req_lora_module = next(lora_modules)
- req_model_id, req_model_name = req_lora_module, req_lora_module
-
- request_func_input = RequestFuncInput(
- model=req_model_id,
- model_name=req_model_name,
- prompt=prompt,
- api_url=api_url,
- prompt_len=prompt_len,
- output_len=output_len,
- logprobs=logprobs,
- multi_modal_content=mm_content,
- ignore_eos=ignore_eos,
- extra_body=extra_body,
- request_id=request_id,
- )
- task = limited_request_func(request_func_input=request_func_input, pbar=pbar)
- tasks.append(asyncio.create_task(task))
- outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
-
- if pbar is not None:
- pbar.close()
-
- benchmark_duration = time.perf_counter() - benchmark_start_time
-
- metrics, actual_output_lens = calculate_metrics(
- input_requests=input_requests,
- outputs=outputs,
- dur_s=benchmark_duration,
- tokenizer=tokenizer,
- selected_percentile_metrics=selected_percentile_metrics,
- selected_percentiles=selected_percentiles,
- goodput_config_dict=goodput_config_dict,
- )
-
- print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
- print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
- if max_concurrency is not None:
- print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency))
- if request_rate != float("inf"):
- print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate))
- print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
- print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
- print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
- print(
- "{:<40} {:<10.2f}".format(
- "Request throughput (req/s):", metrics.request_throughput
- )
- )
- if goodput_config_dict:
- print(
- "{:<40} {:<10.2f}".format(
- "Request goodput (req/s):", metrics.request_goodput
- )
- )
- print(
- "{:<40} {:<10.2f}".format(
- "Output token throughput (tok/s):", metrics.output_throughput
- )
- )
- print(
- "{:<40} {:<10.2f}".format(
- "Total Token throughput (tok/s):", metrics.total_token_throughput
- )
- )
-
- result = {
- "duration": benchmark_duration,
- "completed": metrics.completed,
- "total_input_tokens": metrics.total_input,
- "total_output_tokens": metrics.total_output,
- "request_throughput": metrics.request_throughput,
- "request_goodput": metrics.request_goodput if goodput_config_dict else None,
- "output_throughput": metrics.output_throughput,
- "total_token_throughput": metrics.total_token_throughput,
- "input_lens": [output.prompt_len for output in outputs],
- "output_lens": actual_output_lens,
- "ttfts": [output.ttft for output in outputs],
- "itls": [output.itl for output in outputs],
- "generated_texts": [output.generated_text for output in outputs],
- "errors": [output.error for output in outputs],
- }
-
- if rps_change_events:
- result["rps_change_events"] = rps_change_events
-
- def process_one_metric(
- # E.g., "ttft"
- metric_attribute_name: str,
- # E.g., "TTFT"
- metric_name: str,
- # E.g., "Time to First Token"
- metric_header: str,
- ):
- # This function prints and adds statistics of the specified
- # metric.
- if metric_attribute_name not in selected_percentile_metrics:
- return
- print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
- print(
- "{:<40} {:<10.2f}".format(
- f"Mean {metric_name} (ms):",
- getattr(metrics, f"mean_{metric_attribute_name}_ms"),
- )
- )
- print(
- "{:<40} {:<10.2f}".format(
- f"Median {metric_name} (ms):",
- getattr(metrics, f"median_{metric_attribute_name}_ms"),
- )
- )
- result[f"mean_{metric_attribute_name}_ms"] = getattr(
- metrics, f"mean_{metric_attribute_name}_ms"
- )
- result[f"median_{metric_attribute_name}_ms"] = getattr(
- metrics, f"median_{metric_attribute_name}_ms"
- )
- result[f"std_{metric_attribute_name}_ms"] = getattr(
- metrics, f"std_{metric_attribute_name}_ms"
- )
- for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
- p_word = str(int(p)) if int(p) == p else str(p)
- print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
- result[f"p{p_word}_{metric_attribute_name}_ms"] = value
-
- process_one_metric("ttft", "TTFT", "Time to First Token")
- process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
- process_one_metric("itl", "ITL", "Inter-token Latency")
- process_one_metric("e2el", "E2EL", "End-to-end Latency")
-
- print("=" * 50)
-
- if profile:
- print("Stopping profiler...")
- profile_input = RequestFuncInput(
- model=model_id,
- prompt=test_prompt,
- api_url=base_url + "/stop_profile",
- prompt_len=test_prompt_len,
- output_len=test_output_len,
- logprobs=logprobs,
- )
- profile_output = await request_func(request_func_input=profile_input)
- if profile_output.success:
- print("Profiler stopped")
-
- return result
-
-
-def check_goodput_args(args):
- # Check and parse goodput arguments
- goodput_config_dict = {}
- VALID_NAMES = ["ttft", "tpot", "e2el"]
- if args.goodput:
- goodput_config_dict = parse_goodput(args.goodput)
- for slo_name, slo_val in goodput_config_dict.items():
- if slo_name not in VALID_NAMES:
- raise ValueError(
- f"Invalid metric name found, {slo_name}: {slo_val}. "
- "The service level objective name should be one of "
- f"{str(VALID_NAMES)}. "
- )
- if slo_val < 0:
- raise ValueError(
- f"Invalid value found, {slo_name}: {slo_val}. "
- "The service level objective value should be "
- "non-negative."
- )
- return goodput_config_dict
-
-
-def parse_goodput(slo_pairs):
- goodput_config_dict = {}
- try:
- for slo_pair in slo_pairs:
- slo_name, slo_val = slo_pair.split(":")
- goodput_config_dict[slo_name] = float(slo_val)
- except ValueError as err:
- raise argparse.ArgumentTypeError(
- "Invalid format found for service level objectives. "
- 'Specify service level objectives for goodput as "KEY:VALUE" '
- "pairs, where the key is a metric name, and the value is a "
- "number in milliseconds."
- ) from err
- return goodput_config_dict
-
-
-def save_to_pytorch_benchmark_format(
- args: argparse.Namespace, results: dict[str, Any], file_name: str
-) -> None:
- metrics = [
- "median_ttft_ms",
- "mean_ttft_ms",
- "std_ttft_ms",
- "p99_ttft_ms",
- "mean_tpot_ms",
- "median_tpot_ms",
- "std_tpot_ms",
- "p99_tpot_ms",
- "median_itl_ms",
- "mean_itl_ms",
- "std_itl_ms",
- "p99_itl_ms",
- ]
- # These raw data might be useful, but they are rather big. They can be added
- # later if needed
- ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
- pt_records = convert_to_pytorch_benchmark_format(
- args=args,
- metrics={k: [results[k]] for k in metrics},
- extra_info={
- k: results[k]
- for k in results
- if k not in metrics and k not in ignored_metrics
- },
- )
- if pt_records:
- # Don't use json suffix here as we don't want CI to pick it up
- pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json"
- write_to_json(pt_file, pt_records)
-
-
-@deprecated(
- "benchmark_serving.py is deprecated and will be removed in a future "
- "version. Please use 'vllm bench serve' instead.",
-)
-def main(args: argparse.Namespace):
- print(args)
- random.seed(args.seed)
- np.random.seed(args.seed)
-
- backend = args.backend
- model_id = args.model
- model_name = args.served_model_name
- tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
- tokenizer_mode = args.tokenizer_mode
-
- # Validate ramp-up arguments
- if args.ramp_up_strategy is not None:
- if args.request_rate != float("inf"):
- raise ValueError(
- "When using ramp-up, do not specify --request-rate. "
- "The request rate will be controlled by ramp-up parameters. "
- "Please remove the --request-rate argument."
- )
- if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None:
- raise ValueError(
- "When using --ramp-up-strategy, both --ramp-up-start-rps and "
- "--ramp-up-end-rps must be specified"
- )
- if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0:
- raise ValueError("Ramp-up start and end RPS must be non-negative")
- if args.ramp_up_start_rps > args.ramp_up_end_rps:
- raise ValueError("Ramp-up start RPS must be less than end RPS")
- if args.ramp_up_strategy == "exponential" and args.ramp_up_start_rps == 0:
- raise ValueError("For exponential ramp-up, the start RPS cannot be 0.")
-
- if args.base_url is not None:
- api_url = f"{args.base_url}{args.endpoint}"
- base_url = f"{args.base_url}"
- else:
- api_url = f"http://{args.host}:{args.port}{args.endpoint}"
- base_url = f"http://{args.host}:{args.port}"
-
- tokenizer = get_tokenizer(
- tokenizer_id,
- tokenizer_mode=tokenizer_mode,
- trust_remote_code=args.trust_remote_code,
- )
-
- if args.dataset_name is None:
- raise ValueError(
- "Please specify '--dataset-name' and the corresponding "
- "'--dataset-path' if required."
- )
-
- if args.dataset_name == "custom":
- dataset = CustomDataset(dataset_path=args.dataset_path)
- input_requests = dataset.sample(
- num_requests=args.num_prompts,
- tokenizer=tokenizer,
- output_len=args.custom_output_len,
- skip_chat_template=args.custom_skip_chat_template,
- request_id_prefix=args.request_id_prefix,
- )
-
- elif args.dataset_name == "sonnet":
- dataset = SonnetDataset(dataset_path=args.dataset_path)
- # For the "sonnet" dataset, formatting depends on the backend.
- if args.backend == "openai-chat":
- input_requests = dataset.sample(
- num_requests=args.num_prompts,
- input_len=args.sonnet_input_len,
- output_len=args.sonnet_output_len,
- prefix_len=args.sonnet_prefix_len,
- tokenizer=tokenizer,
- return_prompt_formatted=False,
- request_id_prefix=args.request_id_prefix,
- )
- else:
- assert tokenizer.chat_template or tokenizer.default_chat_template, (
- "Tokenizer/model must have chat template for sonnet dataset."
- )
- input_requests = dataset.sample(
- num_requests=args.num_prompts,
- input_len=args.sonnet_input_len,
- output_len=args.sonnet_output_len,
- prefix_len=args.sonnet_prefix_len,
- tokenizer=tokenizer,
- return_prompt_formatted=True,
- request_id_prefix=args.request_id_prefix,
- )
-
- elif args.dataset_name == "hf":
- # all following datasets are implemented from the
- # HuggingFaceDataset base class
- if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
- dataset_class = VisionArenaDataset
- args.hf_split = "train"
- args.hf_subset = None
- elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
- dataset_class = InstructCoderDataset
- args.hf_split = "train"
- elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS:
- dataset_class = MTBenchDataset
- args.hf_split = "train"
- elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
- dataset_class = ConversationDataset
- elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
- dataset_class = AIMODataset
- args.hf_split = "train"
- elif args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS: # noqa: E501
- dataset_class = NextEditPredictionDataset
- args.hf_split = "train"
- elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS:
- dataset_class = ASRDataset
- args.hf_split = "train"
- else:
- supported_datasets = set(
- [
- dataset_name
- for cls in HuggingFaceDataset.__subclasses__()
- for dataset_name in cls.SUPPORTED_DATASET_PATHS
- ]
- )
- raise ValueError(
- f"Unsupported dataset path: {args.dataset_path}. "
- "Huggingface dataset only supports dataset_path"
- f" from one of following: {supported_datasets}. "
- "Please consider contributing if you would "
- "like to add support for additional dataset formats."
- )
-
- if dataset_class.IS_MULTIMODAL and backend not in [
- "openai-chat",
- "openai-audio",
- ]:
- # multi-modal benchmark is only available on OpenAI Chat backend.
- raise ValueError(
- "Multi-modal content is only supported on 'openai-chat' and "
- "'openai-audio' backend."
- )
- input_requests = dataset_class(
- dataset_path=args.dataset_path,
- dataset_subset=args.hf_subset,
- dataset_split=args.hf_split,
- random_seed=args.seed,
- no_stream=args.no_stream,
- ).sample(
- num_requests=args.num_prompts,
- tokenizer=tokenizer,
- output_len=args.hf_output_len,
- request_id_prefix=args.request_id_prefix,
- )
-
- else:
- # For datasets that follow a similar structure, use a mapping.
- dataset_mapping = {
- "sharegpt": lambda: ShareGPTDataset(
- random_seed=args.seed, dataset_path=args.dataset_path
- ).sample(
- tokenizer=tokenizer,
- num_requests=args.num_prompts,
- output_len=args.sharegpt_output_len,
- request_id_prefix=args.request_id_prefix,
- ),
- "burstgpt": lambda: BurstGPTDataset(
- random_seed=args.seed, dataset_path=args.dataset_path
- ).sample(
- tokenizer=tokenizer,
- num_requests=args.num_prompts,
- request_id_prefix=args.request_id_prefix,
- ),
- "random": lambda: RandomDataset(dataset_path=args.dataset_path).sample(
- tokenizer=tokenizer,
- num_requests=args.num_prompts,
- prefix_len=args.random_prefix_len,
- input_len=args.random_input_len,
- output_len=args.random_output_len,
- range_ratio=args.random_range_ratio,
- request_id_prefix=args.request_id_prefix,
- ),
- }
-
- try:
- input_requests = dataset_mapping[args.dataset_name]()
- except KeyError as err:
- raise ValueError(f"Unknown dataset: {args.dataset_name}") from err
- goodput_config_dict = check_goodput_args(args)
-
- # Collect the sampling parameters.
- sampling_params = {
- k: v
- for k, v in {
- "top_p": args.top_p,
- "top_k": args.top_k,
- "min_p": args.min_p,
- "temperature": args.temperature,
- }.items()
- if v is not None
- }
-
- # Sampling parameters are only supported by openai-compatible backend.
- if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
- raise ValueError(
- "Sampling parameters are only supported by openai-compatible backends."
- )
-
- if "temperature" not in sampling_params:
- sampling_params["temperature"] = 0.0 # Default to greedy decoding.
-
- if args.backend == "llama.cpp":
- # Disable prompt caching in llama.cpp backend
- sampling_params["cache_prompt"] = False
-
- # Avoid GC processing "static" data - reduce pause times.
- gc.collect()
- gc.freeze()
-
- benchmark_result = asyncio.run(
- benchmark(
- backend=backend,
- api_url=api_url,
- base_url=base_url,
- model_id=model_id,
- model_name=model_name,
- tokenizer=tokenizer,
- input_requests=input_requests,
- logprobs=args.logprobs,
- request_rate=args.request_rate,
- burstiness=args.burstiness,
- disable_tqdm=args.disable_tqdm,
- profile=args.profile,
- selected_percentile_metrics=args.percentile_metrics.split(","),
- selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
- ignore_eos=args.ignore_eos,
- goodput_config_dict=goodput_config_dict,
- max_concurrency=args.max_concurrency,
- lora_modules=args.lora_modules,
- extra_body=sampling_params,
- ramp_up_strategy=args.ramp_up_strategy,
- ramp_up_start_rps=args.ramp_up_start_rps,
- ramp_up_end_rps=args.ramp_up_end_rps,
- )
- )
-
- # Save config and results to json
- if args.save_result or args.append_result:
- result_json: dict[str, Any] = {}
-
- # Setup
- current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
- result_json["date"] = current_dt
- result_json["backend"] = backend
- result_json["model_id"] = model_id
- result_json["tokenizer_id"] = tokenizer_id
- result_json["num_prompts"] = args.num_prompts
-
- # Metadata
- if args.metadata:
- for item in args.metadata:
- if "=" in item:
- kvstring = item.split("=")
- result_json[kvstring[0].strip()] = kvstring[1].strip()
- else:
- raise ValueError(
- "Invalid metadata format. Please use KEY=VALUE format."
- )
- # Traffic
- result_json["request_rate"] = (
- args.request_rate if args.request_rate < float("inf") else "inf"
- )
- result_json["burstiness"] = args.burstiness
- result_json["max_concurrency"] = args.max_concurrency
-
- if args.ramp_up_strategy is not None:
- result_json["ramp_up_strategy"] = args.ramp_up_strategy
- result_json["ramp_up_start_rps"] = args.ramp_up_start_rps
- result_json["ramp_up_end_rps"] = args.ramp_up_end_rps
-
- # Merge with benchmark result
- result_json = {**result_json, **benchmark_result}
-
- if not args.save_detailed:
- # Remove fields with too many data points
- for field in [
- "input_lens",
- "output_lens",
- "ttfts",
- "itls",
- "generated_texts",
- "errors",
- ]:
- if field in result_json:
- del result_json[field]
- if field in benchmark_result:
- del benchmark_result[field]
-
- # Save to file
- base_model_id = model_id.split("/")[-1]
- max_concurrency_str = (
- f"-concurrency{args.max_concurrency}"
- if args.max_concurrency is not None
- else ""
- )
- if args.ramp_up_strategy is not None:
- file_name = f"{backend}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa
- else:
- file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa
- if args.result_filename:
- file_name = args.result_filename
- if args.result_dir:
- os.makedirs(args.result_dir, exist_ok=True)
- file_name = os.path.join(args.result_dir, file_name)
- with open(
- file_name, mode="a+" if args.append_result else "w", encoding="utf-8"
- ) as outfile:
- # Append a newline.
- if args.append_result and outfile.tell() != 0:
- outfile.write("\n")
- json.dump(result_json, outfile)
- save_to_pytorch_benchmark_format(args, result_json, file_name)
-
-
-def create_argument_parser():
- parser = FlexibleArgumentParser(
- description="Benchmark the online serving throughput."
- )
- parser.add_argument(
- "--backend",
- type=str,
- default="vllm",
- choices=list(ASYNC_REQUEST_FUNCS.keys()),
- )
- parser.add_argument(
- "--base-url",
- type=str,
- default=None,
- help="Server or API base url if not using http host and port.",
- )
- # Use 127.0.0.1 here instead of localhost to force the use of ipv4
- parser.add_argument("--host", type=str, default="127.0.0.1")
- parser.add_argument("--port", type=int, default=8000)
- parser.add_argument(
- "--endpoint",
- type=str,
- default="/v1/completions",
- help="API endpoint.",
- )
- parser.add_argument(
- "--dataset-name",
- type=str,
- default="sharegpt",
- choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"],
- help="Name of the dataset to benchmark on.",
- )
- parser.add_argument(
- "--dataset-path",
- type=str,
- default=None,
- help="Path to the sharegpt/sonnet dataset. "
- "Or the huggingface dataset ID if using HF dataset.",
- )
- parser.add_argument(
- "--no-stream",
- action="store_true",
- help="Do not load the dataset in streaming mode.",
- )
- parser.add_argument(
- "--max-concurrency",
- type=int,
- default=None,
- help="Maximum number of concurrent requests. This can be used "
- "to help simulate an environment where a higher level component "
- "is enforcing a maximum number of concurrent requests. While the "
- "--request-rate argument controls the rate at which requests are "
- "initiated, this argument will control how many are actually allowed "
- "to execute at a time. This means that when used in combination, the "
- "actual request rate may be lower than specified with --request-rate, "
- "if the server is not processing requests fast enough to keep up.",
- )
-
- parser.add_argument(
- "--model",
- type=str,
- required=True,
- help="Name of the model.",
- )
- parser.add_argument(
- "--tokenizer",
- type=str,
- help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
- )
- parser.add_argument("--use-beam-search", action="store_true")
- parser.add_argument(
- "--num-prompts",
- type=int,
- default=1000,
- help="Number of prompts to process.",
- )
- parser.add_argument(
- "--logprobs",
- type=int,
- default=None,
- help=(
- "Number of logprobs-per-token to compute & return as part of "
- "the request. If unspecified, then either (1) if beam search "
- "is disabled, no logprobs are computed & a single dummy "
- "logprob is returned for each token; or (2) if beam search "
- "is enabled 1 logprob per token is computed"
- ),
- )
- parser.add_argument(
- "--request-rate",
- type=float,
- default=float("inf"),
- help="Number of requests per second. If this is inf, "
- "then all the requests are sent at time 0. "
- "Otherwise, we use Poisson process or gamma distribution "
- "to synthesize the request arrival times.",
- )
- parser.add_argument(
- "--burstiness",
- type=float,
- default=1.0,
- help="Burstiness factor of the request generation. "
- "Only take effect when request_rate is not inf. "
- "Default value is 1, which follows Poisson process. "
- "Otherwise, the request intervals follow a gamma distribution. "
- "A lower burstiness value (0 < burstiness < 1) results in more "
- "bursty requests. A higher burstiness value (burstiness > 1) "
- "results in a more uniform arrival of requests.",
- )
- parser.add_argument("--seed", type=int, default=0)
- parser.add_argument(
- "--trust-remote-code",
- action="store_true",
- help="Trust remote code from huggingface",
- )
- parser.add_argument(
- "--disable-tqdm",
- action="store_true",
- help="Specify to disable tqdm progress bar.",
- )
- parser.add_argument(
- "--profile",
- action="store_true",
- help="Use Torch Profiler. The endpoint must be launched with "
- "VLLM_TORCH_PROFILER_DIR to enable profiler.",
- )
- parser.add_argument(
- "--save-result",
- action="store_true",
- help="Specify to save benchmark results to a json file",
- )
- parser.add_argument(
- "--save-detailed",
- action="store_true",
- help="When saving the results, whether to include per request "
- "information such as response, error, ttfs, tpots, etc.",
- )
- parser.add_argument(
- "--append-result",
- action="store_true",
- help="Append the benchmark result to the existing json file.",
- )
- parser.add_argument(
- "--metadata",
- metavar="KEY=VALUE",
- nargs="*",
- help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) "
- "for metadata of this run to be saved in the result JSON file "
- "for record keeping purposes.",
- )
- parser.add_argument(
- "--result-dir",
- type=str,
- default=None,
- help="Specify directory to save benchmark json results."
- "If not specified, results are saved in the current directory.",
- )
- parser.add_argument(
- "--result-filename",
- type=str,
- default=None,
- help="Specify the filename to save benchmark json results."
- "If not specified, results will be saved in "
- "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
- " format.",
- )
- parser.add_argument(
- "--ignore-eos",
- action="store_true",
- help="Set ignore_eos flag when sending the benchmark request."
- "Warning: ignore_eos is not supported in deepspeed_mii and tgi.",
- )
- parser.add_argument(
- "--percentile-metrics",
- type=str,
- default="ttft,tpot,itl",
- help="Comma-separated list of selected metrics to report percentils. "
- "This argument specifies the metrics to report percentiles. "
- 'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
- 'Default value is "ttft,tpot,itl".',
- )
- parser.add_argument(
- "--metric-percentiles",
- type=str,
- default="99",
- help="Comma-separated list of percentiles for selected metrics. "
- 'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". '
- 'Default value is "99". '
- 'Use "--percentile-metrics" to select metrics.',
- )
- parser.add_argument(
- "--goodput",
- nargs="+",
- required=False,
- help='Specify service level objectives for goodput as "KEY:VALUE" '
- "pairs, where the key is a metric name, and the value is in "
- 'milliseconds. Multiple "KEY:VALUE" pairs can be provided, '
- "separated by spaces. Allowed request level metric names are "
- '"ttft", "tpot", "e2el". For more context on the definition of '
- "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
- "and the blog: https://hao-ai-lab.github.io/blogs/distserve",
- )
- parser.add_argument(
- "--request-id-prefix",
- type=str,
- required=False,
- default="benchmark-serving",
- help="Specify the prefix of request id.",
- )
-
- # group for dataset specific arguments
- custom_group = parser.add_argument_group("custom dataset options")
- custom_group.add_argument(
- "--custom-output-len",
- type=int,
- default=256,
- help="Number of output tokens per request, used only for custom dataset.",
- )
- custom_group.add_argument(
- "--custom-skip-chat-template",
- action="store_true",
- help="Skip applying chat template to prompt, used only for custom dataset.",
- )
-
- sonnet_group = parser.add_argument_group("sonnet dataset options")
- sonnet_group.add_argument(
- "--sonnet-input-len",
- type=int,
- default=550,
- help="Number of input tokens per request, used only for sonnet dataset.",
- )
- sonnet_group.add_argument(
- "--sonnet-output-len",
- type=int,
- default=150,
- help="Number of output tokens per request, used only for sonnet dataset.",
- )
- sonnet_group.add_argument(
- "--sonnet-prefix-len",
- type=int,
- default=200,
- help="Number of prefix tokens per request, used only for sonnet dataset.",
- )
-
- sharegpt_group = parser.add_argument_group("sharegpt dataset options")
- sharegpt_group.add_argument(
- "--sharegpt-output-len",
- type=int,
- default=None,
- help="Output length for each request. Overrides the output length "
- "from the ShareGPT dataset.",
- )
-
- random_group = parser.add_argument_group("random dataset options")
- random_group.add_argument(
- "--random-input-len",
- type=int,
- default=1024,
- help="Number of input tokens per request, used only for random sampling.",
- )
- random_group.add_argument(
- "--random-output-len",
- type=int,
- default=128,
- help="Number of output tokens per request, used only for random sampling.",
- )
- random_group.add_argument(
- "--random-range-ratio",
- type=float,
- default=0.0,
- help="Range ratio for sampling input/output length, "
- "used only for random sampling. Must be in the range [0, 1) to define "
- "a symmetric sampling range"
- "[length * (1 - range_ratio), length * (1 + range_ratio)].",
- )
- random_group.add_argument(
- "--random-prefix-len",
- type=int,
- default=0,
- help=(
- "Number of fixed prefix tokens before the random context "
- "in a request. "
- "The total input length is the sum of `random-prefix-len` and "
- "a random "
- "context length sampled from [input_len * (1 - range_ratio), "
- "input_len * (1 + range_ratio)]."
- ),
- )
-
- hf_group = parser.add_argument_group("hf dataset options")
- hf_group.add_argument(
- "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
- )
- hf_group.add_argument(
- "--hf-split", type=str, default=None, help="Split of the HF dataset."
- )
- hf_group.add_argument(
- "--hf-output-len",
- type=int,
- default=None,
- help="Output length for each request. Overrides the output lengths "
- "from the sampled HF dataset.",
- )
-
- sampling_group = parser.add_argument_group("sampling parameters")
- sampling_group.add_argument(
- "--top-p",
- type=float,
- default=None,
- help="Top-p sampling parameter. Only has effect on openai-compatible backends.",
- )
- sampling_group.add_argument(
- "--top-k",
- type=int,
- default=None,
- help="Top-k sampling parameter. Only has effect on openai-compatible backends.",
- )
- sampling_group.add_argument(
- "--min-p",
- type=float,
- default=None,
- help="Min-p sampling parameter. Only has effect on openai-compatible backends.",
- )
- sampling_group.add_argument(
- "--temperature",
- type=float,
- default=None,
- help="Temperature sampling parameter. Only has effect on "
- "openai-compatible backends. If not specified, default to greedy "
- "decoding (i.e. temperature==0.0).",
- )
-
- parser.add_argument(
- "--tokenizer-mode",
- type=str,
- default="auto",
- choices=["auto", "slow", "mistral", "custom"],
- help='The tokenizer mode.\n\n* "auto" will use the '
- 'fast tokenizer if available.\n* "slow" will '
- "always use the slow tokenizer. \n* "
- '"mistral" will always use the `mistral_common` tokenizer. \n*'
- '"custom" will use --tokenizer to select the preregistered tokenizer.',
- )
-
- parser.add_argument(
- "--served-model-name",
- type=str,
- default=None,
- help="The model name used in the API. "
- "If not specified, the model name will be the "
- "same as the ``--model`` argument. ",
- )
-
- parser.add_argument(
- "--lora-modules",
- nargs="+",
- default=None,
- help="A subset of LoRA module names passed in when "
- "launching the server. For each request, the "
- "script chooses a LoRA module at random.",
- )
-
- parser.add_argument(
- "--ramp-up-strategy",
- type=str,
- default=None,
- choices=["linear", "exponential"],
- help="The ramp-up strategy. This would be used to "
- "ramp up the request rate from initial RPS to final "
- "RPS rate (specified by --ramp-up-start-rps and --ramp-up-end-rps). "
- "over the duration of the benchmark.",
- )
- parser.add_argument(
- "--ramp-up-start-rps",
- type=int,
- default=None,
- help="The starting request rate for ramp-up (RPS). "
- "Needs to be specified when --ramp-up-strategy is used.",
- )
- parser.add_argument(
- "--ramp-up-end-rps",
- type=int,
- default=None,
- help="The ending request rate for ramp-up (RPS). "
- "Needs to be specified when --ramp-up-strategy is used.",
- )
-
- return parser
-
+import sys
if __name__ == "__main__":
- parser = create_argument_parser()
- args = parser.parse_args()
- main(args)
+ print("""DEPRECATED: This script has been moved to the vLLM CLI.
+
+Please use the following command instead:
+ vllm bench serve
+
+For help with the new command, run:
+ vllm bench serve --help
+
+Alternatively, you can run the new command directly with:
+ python -m vllm.entrypoints.cli.main bench serve --help
+""")
+ sys.exit(1)
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index ca6843a72aa36..4aae755eb4e44 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -998,7 +998,7 @@ def create_argument_parser():
"--percentile-metrics",
type=str,
default="ttft,tpot,itl",
- help="Comma-separated list of selected metrics to report percentils. "
+ help="Comma-separated list of selected metrics to report percentiles. "
"This argument specifies the metrics to report percentiles. "
'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
'Default value is "ttft,tpot,itl".',
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 6b24b8c8f3c67..b6dc0918fd4d1 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -1,741 +1,17 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Benchmark offline inference throughput."""
-
-import argparse
-import dataclasses
-import json
-import os
-import random
-import time
-import warnings
-from typing import Any, Optional, Union
-
-import torch
-import uvloop
-from tqdm import tqdm
-from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
-from typing_extensions import deprecated
-
-from benchmark_dataset import (
- AIMODataset,
- BurstGPTDataset,
- ConversationDataset,
- InstructCoderDataset,
- RandomDataset,
- SampleRequest,
- ShareGPTDataset,
- SonnetDataset,
- VisionArenaDataset,
-)
-from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
-from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
-from vllm.entrypoints.openai.api_server import (
- build_async_engine_client_from_engine_args,
-)
-from vllm.inputs import TextPrompt, TokensPrompt
-from vllm.lora.request import LoRARequest
-from vllm.outputs import RequestOutput
-from vllm.sampling_params import BeamSearchParams
-from vllm.utils import FlexibleArgumentParser, merge_async_iterators
-
-
-def run_vllm(
- requests: list[SampleRequest],
- n: int,
- engine_args: EngineArgs,
- disable_detokenize: bool = False,
-) -> tuple[float, Optional[list[RequestOutput]]]:
- from vllm import LLM, SamplingParams
-
- llm = LLM(**dataclasses.asdict(engine_args))
- assert all(
- llm.llm_engine.model_config.max_model_len
- >= (request.prompt_len + request.expected_output_len)
- for request in requests
- ), (
- "Please ensure that max_model_len is greater than the sum of"
- " prompt_len and expected_output_len for all requests."
- )
- # Add the requests to the engine.
- prompts: list[Union[TextPrompt, TokensPrompt]] = []
- sampling_params: list[SamplingParams] = []
- for request in requests:
- prompts.append(
- TokensPrompt(
- prompt_token_ids=request.prompt["prompt_token_ids"],
- multi_modal_data=request.multi_modal_data,
- )
- if "prompt_token_ids" in request.prompt
- else TextPrompt(
- prompt=request.prompt, multi_modal_data=request.multi_modal_data
- )
- )
- sampling_params.append(
- SamplingParams(
- n=n,
- temperature=1.0,
- top_p=1.0,
- ignore_eos=True,
- max_tokens=request.expected_output_len,
- detokenize=not disable_detokenize,
- )
- )
- lora_requests: Optional[list[LoRARequest]] = None
- if engine_args.enable_lora:
- lora_requests = [request.lora_request for request in requests]
-
- use_beam_search = False
-
- outputs = None
- if not use_beam_search:
- start = time.perf_counter()
- outputs = llm.generate(
- prompts, sampling_params, lora_request=lora_requests, use_tqdm=True
- )
- end = time.perf_counter()
- else:
- assert lora_requests is None, "BeamSearch API does not support LoRA"
- # output_len should be the same for all requests.
- output_len = requests[0].expected_output_len
- for request in requests:
- assert request.expected_output_len == output_len
- start = time.perf_counter()
- llm.beam_search(
- prompts,
- BeamSearchParams(
- beam_width=n,
- max_tokens=output_len,
- ignore_eos=True,
- ),
- )
- end = time.perf_counter()
- return end - start, outputs
-
-
-def run_vllm_chat(
- requests: list[SampleRequest],
- n: int,
- engine_args: EngineArgs,
- disable_detokenize: bool = False,
-) -> tuple[float, list[RequestOutput]]:
- """
- Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
- multimodal models as it properly handles multimodal inputs and chat
- formatting. For non-multimodal models, use run_vllm() instead.
- """
- from vllm import LLM, SamplingParams
-
- llm = LLM(**dataclasses.asdict(engine_args))
-
- assert all(
- llm.llm_engine.model_config.max_model_len
- >= (request.prompt_len + request.expected_output_len)
- for request in requests
- ), (
- "Please ensure that max_model_len is greater than the sum of "
- "prompt_len and expected_output_len for all requests."
- )
-
- prompts = []
- sampling_params: list[SamplingParams] = []
- for request in requests:
- prompts.append(request.prompt)
- sampling_params.append(
- SamplingParams(
- n=n,
- temperature=1.0,
- top_p=1.0,
- ignore_eos=True,
- max_tokens=request.expected_output_len,
- detokenize=not disable_detokenize,
- )
- )
- start = time.perf_counter()
- outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
- end = time.perf_counter()
- return end - start, outputs
-
-
-async def run_vllm_async(
- requests: list[SampleRequest],
- n: int,
- engine_args: AsyncEngineArgs,
- disable_frontend_multiprocessing: bool = False,
- disable_detokenize: bool = False,
-) -> float:
- from vllm import SamplingParams
-
- async with build_async_engine_client_from_engine_args(
- engine_args,
- disable_frontend_multiprocessing=disable_frontend_multiprocessing,
- ) as llm:
- model_config = await llm.get_model_config()
- assert all(
- model_config.max_model_len
- >= (request.prompt_len + request.expected_output_len)
- for request in requests
- ), (
- "Please ensure that max_model_len is greater than the sum of"
- " prompt_len and expected_output_len for all requests."
- )
-
- # Add the requests to the engine.
- prompts: list[Union[TextPrompt, TokensPrompt]] = []
- sampling_params: list[SamplingParams] = []
- lora_requests: list[Optional[LoRARequest]] = []
- for request in requests:
- prompts.append(
- TokensPrompt(
- prompt_token_ids=request.prompt["prompt_token_ids"],
- multi_modal_data=request.multi_modal_data,
- )
- if "prompt_token_ids" in request.prompt
- else TextPrompt(
- prompt=request.prompt, multi_modal_data=request.multi_modal_data
- )
- )
- sampling_params.append(
- SamplingParams(
- n=n,
- temperature=1.0,
- top_p=1.0,
- ignore_eos=True,
- max_tokens=request.expected_output_len,
- detokenize=not disable_detokenize,
- )
- )
- lora_requests.append(request.lora_request)
-
- generators = []
- start = time.perf_counter()
- for i, (prompt, sp, lr) in enumerate(
- zip(prompts, sampling_params, lora_requests)
- ):
- generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}")
- generators.append(generator)
- all_gens = merge_async_iterators(*generators)
- async for i, res in all_gens:
- pass
- end = time.perf_counter()
- return end - start
-
-
-def run_hf(
- requests: list[SampleRequest],
- model: str,
- tokenizer: PreTrainedTokenizerBase,
- n: int,
- max_batch_size: int,
- trust_remote_code: bool,
- disable_detokenize: bool = False,
-) -> float:
- llm = AutoModelForCausalLM.from_pretrained(
- model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
- )
- if llm.config.model_type == "llama":
- # To enable padding in the HF backend.
- tokenizer.pad_token = tokenizer.eos_token
- llm = llm.cuda()
-
- pbar = tqdm(total=len(requests))
- start = time.perf_counter()
- batch: list[str] = []
- max_prompt_len = 0
- max_output_len = 0
- for i in range(len(requests)):
- prompt = requests[i].prompt
- prompt_len = requests[i].prompt_len
- output_len = requests[i].expected_output_len
- # Add the prompt to the batch.
- batch.append(prompt)
- max_prompt_len = max(max_prompt_len, prompt_len)
- max_output_len = max(max_output_len, output_len)
- if len(batch) < max_batch_size and i != len(requests) - 1:
- # Check if we can add more requests to the batch.
- next_prompt_len = requests[i + 1].prompt_len
- next_output_len = requests[i + 1].expected_output_len
- if (
- max(max_prompt_len, next_prompt_len)
- + max(max_output_len, next_output_len)
- ) <= 2048:
- # We can add more requests to the batch.
- continue
-
- # Generate the sequences.
- input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids
- llm_outputs = llm.generate(
- input_ids=input_ids.cuda(),
- do_sample=True,
- num_return_sequences=n,
- temperature=1.0,
- top_p=1.0,
- use_cache=True,
- max_new_tokens=max_output_len,
- )
- if not disable_detokenize:
- # Include the decoding time.
- tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
- pbar.update(len(batch))
-
- # Clear the batch.
- batch = []
- max_prompt_len = 0
- max_output_len = 0
- end = time.perf_counter()
- return end - start
-
-
-def run_mii(
- requests: list[SampleRequest],
- model: str,
- tensor_parallel_size: int,
- output_len: int,
-) -> float:
- from mii import client, serve
-
- llm = serve(model, tensor_parallel=tensor_parallel_size)
- prompts = [request.prompt for request in requests]
-
- start = time.perf_counter()
- llm.generate(prompts, max_new_tokens=output_len)
- end = time.perf_counter()
- client = client(model)
- client.terminate_server()
- return end - start
-
-
-def save_to_pytorch_benchmark_format(
- args: argparse.Namespace, results: dict[str, Any]
-) -> None:
- pt_records = convert_to_pytorch_benchmark_format(
- args=args,
- metrics={
- "requests_per_second": [results["requests_per_second"]],
- "tokens_per_second": [results["tokens_per_second"]],
- },
- extra_info={
- k: results[k] for k in ["elapsed_time", "num_requests", "total_num_tokens"]
- },
- )
- if pt_records:
- # Don't use json suffix here as we don't want CI to pick it up
- pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
- write_to_json(pt_file, pt_records)
-
-
-def get_requests(args, tokenizer):
- # Common parameters for all dataset types.
- common_kwargs = {
- "dataset_path": args.dataset_path,
- "random_seed": args.seed,
- }
- sample_kwargs = {
- "tokenizer": tokenizer,
- "lora_path": args.lora_path,
- "max_loras": args.max_loras,
- "num_requests": args.num_prompts,
- "input_len": args.input_len,
- "output_len": args.output_len,
- }
-
- if args.dataset_path is None or args.dataset_name == "random":
- sample_kwargs["range_ratio"] = args.random_range_ratio
- sample_kwargs["prefix_len"] = args.prefix_len
- dataset_cls = RandomDataset
- elif args.dataset_name == "sharegpt":
- dataset_cls = ShareGPTDataset
- if args.backend == "vllm-chat":
- sample_kwargs["enable_multimodal_chat"] = True
- elif args.dataset_name == "sonnet":
- assert tokenizer.chat_template or tokenizer.default_chat_template, (
- "Tokenizer/model must have chat template for sonnet dataset."
- )
- dataset_cls = SonnetDataset
- sample_kwargs["prefix_len"] = args.prefix_len
- sample_kwargs["return_prompt_formatted"] = True
- elif args.dataset_name == "burstgpt":
- dataset_cls = BurstGPTDataset
- elif args.dataset_name == "hf":
- common_kwargs["no_stream"] = args.no_stream
- if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
- dataset_cls = VisionArenaDataset
- common_kwargs["dataset_subset"] = None
- common_kwargs["dataset_split"] = "train"
- sample_kwargs["enable_multimodal_chat"] = True
- elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
- dataset_cls = InstructCoderDataset
- common_kwargs["dataset_split"] = "train"
- elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
- dataset_cls = ConversationDataset
- common_kwargs["dataset_subset"] = args.hf_subset
- common_kwargs["dataset_split"] = args.hf_split
- sample_kwargs["enable_multimodal_chat"] = True
- elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
- dataset_cls = AIMODataset
- common_kwargs["dataset_subset"] = None
- common_kwargs["dataset_split"] = "train"
- else:
- raise ValueError(f"Unknown dataset name: {args.dataset_name}")
- # Remove None values
- sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None}
- return dataset_cls(**common_kwargs).sample(**sample_kwargs)
-
-
-@deprecated(
- "benchmark_throughput.py is deprecated and will be removed in a "
- "future version. Please use 'vllm bench throughput' instead.",
-)
-def main(args: argparse.Namespace):
- if args.seed is None:
- args.seed = 0
- print(args)
- random.seed(args.seed)
- # Sample the requests.
- tokenizer = AutoTokenizer.from_pretrained(
- args.tokenizer, trust_remote_code=args.trust_remote_code
- )
- requests = get_requests(args, tokenizer)
- is_multi_modal = any(request.multi_modal_data is not None for request in requests)
- request_outputs: Optional[list[RequestOutput]] = None
- if args.backend == "vllm":
- if args.async_engine:
- elapsed_time = uvloop.run(
- run_vllm_async(
- requests,
- args.n,
- AsyncEngineArgs.from_cli_args(args),
- args.disable_frontend_multiprocessing,
- args.disable_detokenize,
- )
- )
- else:
- elapsed_time, request_outputs = run_vllm(
- requests,
- args.n,
- EngineArgs.from_cli_args(args),
- args.disable_detokenize,
- )
- elif args.backend == "hf":
- assert args.tensor_parallel_size == 1
- elapsed_time = run_hf(
- requests,
- args.model,
- tokenizer,
- args.n,
- args.hf_max_batch_size,
- args.trust_remote_code,
- args.disable_detokenize,
- )
- elif args.backend == "mii":
- elapsed_time = run_mii(
- requests, args.model, args.tensor_parallel_size, args.output_len
- )
- elif args.backend == "vllm-chat":
- elapsed_time, request_outputs = run_vllm_chat(
- requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize
- )
- else:
- raise ValueError(f"Unknown backend: {args.backend}")
-
- if request_outputs:
- # Note: with the vllm and vllm-chat backends,
- # we have request_outputs, which we use to count tokens.
- total_prompt_tokens = 0
- total_output_tokens = 0
- for ro in request_outputs:
- if not isinstance(ro, RequestOutput):
- continue
- total_prompt_tokens += (
- len(ro.prompt_token_ids) if ro.prompt_token_ids else 0
- )
- total_output_tokens += sum(len(o.token_ids) for o in ro.outputs if o)
- total_num_tokens = total_prompt_tokens + total_output_tokens
- else:
- total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests)
- total_output_tokens = sum(r.expected_output_len for r in requests)
- total_prompt_tokens = total_num_tokens - total_output_tokens
-
- if is_multi_modal and args.backend != "vllm-chat":
- print(
- "\033[91mWARNING\033[0m: Multi-modal request with "
- f"{args.backend} backend detected. The "
- "following metrics are not accurate because image tokens are not"
- " counted. See vllm-project/vllm/issues/9778 for details."
- )
- # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
- # vllm-chat backend counts the image tokens now
-
- print(
- f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
- f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
- f"{total_output_tokens / elapsed_time:.2f} output tokens/s"
- )
- print(f"Total num prompt tokens: {total_prompt_tokens}")
- print(f"Total num output tokens: {total_output_tokens}")
-
- # Output JSON results if specified
- if args.output_json:
- results = {
- "elapsed_time": elapsed_time,
- "num_requests": len(requests),
- "total_num_tokens": total_num_tokens,
- "requests_per_second": len(requests) / elapsed_time,
- "tokens_per_second": total_num_tokens / elapsed_time,
- }
- with open(args.output_json, "w") as f:
- json.dump(results, f, indent=4)
- save_to_pytorch_benchmark_format(args, results)
-
-
-def validate_args(args):
- """
- Validate command-line arguments.
- """
-
- # === Deprecation and Defaulting ===
- if args.dataset is not None:
- warnings.warn(
- "The '--dataset' argument will be deprecated in the next release. "
- "Please use '--dataset-name' and '--dataset-path' instead.",
- stacklevel=2,
- )
- args.dataset_path = args.dataset
-
- if not getattr(args, "tokenizer", None):
- args.tokenizer = args.model
-
- # === Backend Validation ===
- valid_backends = {"vllm", "hf", "mii", "vllm-chat"}
- if args.backend not in valid_backends:
- raise ValueError(f"Unsupported backend: {args.backend}")
-
- # === Dataset Configuration ===
- if not args.dataset and not args.dataset_path:
- print("When dataset path is not set, it will default to random dataset")
- args.dataset_name = "random"
- if args.input_len is None:
- raise ValueError("input_len must be provided for a random dataset")
-
- # === Dataset Name Specific Checks ===
- # --hf-subset and --hf-split: only used
- # when dataset_name is 'hf'
- if args.dataset_name != "hf" and (
- getattr(args, "hf_subset", None) is not None
- or getattr(args, "hf_split", None) is not None
- ):
- warnings.warn(
- "--hf-subset and --hf-split will be ignored \
- since --dataset-name is not 'hf'.",
- stacklevel=2,
- )
- elif args.dataset_name == "hf":
- if args.dataset_path in (
- VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
- | ConversationDataset.SUPPORTED_DATASET_PATHS
- ):
- assert args.backend == "vllm-chat", (
- f"{args.dataset_path} needs to use vllm-chat as the backend."
- ) # noqa: E501
- elif args.dataset_path in (
- InstructCoderDataset.SUPPORTED_DATASET_PATHS
- | AIMODataset.SUPPORTED_DATASET_PATHS
- ):
- assert args.backend == "vllm", (
- f"{args.dataset_path} needs to use vllm as the backend."
- ) # noqa: E501
- else:
- raise ValueError(f"{args.dataset_path} is not supported by hf dataset.")
-
- # --random-range-ratio: only used when dataset_name is 'random'
- if args.dataset_name != "random" and args.random_range_ratio is not None:
- warnings.warn(
- "--random-range-ratio will be ignored since \
- --dataset-name is not 'random'.",
- stacklevel=2,
- )
-
- # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
- # set.
- if (
- args.dataset_name not in {"random", "sonnet", None}
- and args.prefix_len is not None
- ):
- warnings.warn(
- "--prefix-len will be ignored since --dataset-name\
- is not 'random', 'sonnet', or not set.",
- stacklevel=2,
- )
-
- # === LoRA Settings ===
- if getattr(args, "enable_lora", False) and args.backend != "vllm":
- raise ValueError("LoRA benchmarking is only supported for vLLM backend")
- if getattr(args, "enable_lora", False) and args.lora_path is None:
- raise ValueError("LoRA path must be provided when enable_lora is True")
-
- # === Backend-specific Validations ===
- if args.backend == "hf" and args.hf_max_batch_size is None:
- raise ValueError("HF max batch size is required for HF backend")
- if args.backend != "hf" and args.hf_max_batch_size is not None:
- raise ValueError("HF max batch size is only for HF backend.")
-
- if (
- args.backend in {"hf", "mii"}
- and getattr(args, "quantization", None) is not None
- ):
- raise ValueError("Quantization is only for vLLM backend.")
-
- if args.backend == "mii" and args.dtype != "auto":
- raise ValueError("dtype must be auto for MII backend.")
- if args.backend == "mii" and args.n != 1:
- raise ValueError("n must be 1 for MII backend.")
- if args.backend == "mii" and args.tokenizer != args.model:
- raise ValueError("Tokenizer must be the same as the model for MII backend.")
-
- # --data-parallel is not supported currently.
- # https://github.com/vllm-project/vllm/issues/16222
- if args.data_parallel_size > 1:
- raise ValueError(
- "Data parallel is not supported in offline benchmark, "
- "please use benchmark serving instead"
- )
-
-
-def create_argument_parser():
- parser = FlexibleArgumentParser(description="Benchmark the throughput.")
- parser.add_argument(
- "--backend",
- type=str,
- choices=["vllm", "hf", "mii", "vllm-chat"],
- default="vllm",
- )
- parser.add_argument(
- "--dataset-name",
- type=str,
- choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
- help="Name of the dataset to benchmark on.",
- default="sharegpt",
- )
- parser.add_argument(
- "--no-stream",
- action="store_true",
- help="Do not load the dataset in streaming mode.",
- )
- parser.add_argument(
- "--dataset",
- type=str,
- default=None,
- help="Path to the ShareGPT dataset, will be deprecated in\
- the next release. The dataset is expected to "
- "be a json in form of list[dict[..., conversations: "
- "list[dict[..., value: ]]]]",
- )
- parser.add_argument(
- "--dataset-path", type=str, default=None, help="Path to the dataset"
- )
- parser.add_argument(
- "--input-len",
- type=int,
- default=None,
- help="Input prompt length for each request",
- )
- parser.add_argument(
- "--output-len",
- type=int,
- default=None,
- help="Output length for each request. Overrides the "
- "output length from the dataset.",
- )
- parser.add_argument(
- "--n", type=int, default=1, help="Number of generated sequences per prompt."
- )
- parser.add_argument(
- "--num-prompts", type=int, default=1000, help="Number of prompts to process."
- )
- parser.add_argument(
- "--hf-max-batch-size",
- type=int,
- default=None,
- help="Maximum batch size for HF backend.",
- )
- parser.add_argument(
- "--output-json",
- type=str,
- default=None,
- help="Path to save the throughput results in JSON format.",
- )
- parser.add_argument(
- "--async-engine",
- action="store_true",
- default=False,
- help="Use vLLM async engine rather than LLM class.",
- )
- parser.add_argument(
- "--disable-frontend-multiprocessing",
- action="store_true",
- default=False,
- help="Disable decoupled async engine frontend.",
- )
- parser.add_argument(
- "--disable-detokenize",
- action="store_true",
- help=(
- "Do not detokenize the response (i.e. do not include "
- "detokenization time in the measurement)"
- ),
- )
- # LoRA
- parser.add_argument(
- "--lora-path",
- type=str,
- default=None,
- help="Path to the LoRA adapters to use. This can be an absolute path, "
- "a relative path, or a Hugging Face model identifier.",
- )
- parser.add_argument(
- "--prefix-len",
- type=int,
- default=None,
- help=f"Number of prefix tokens to be used in RandomDataset "
- "and SonnetDataset. For RandomDataset, the total input "
- "length is the sum of prefix-len (default: "
- f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length "
- "sampled from [input_len * (1 - range_ratio), "
- "input_len * (1 + range_ratio)]. For SonnetDataset, "
- f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) "
- "controls how much of the input is fixed lines versus "
- "random lines, but the total input length remains approximately "
- "input_len tokens.",
- )
- # random dataset
- parser.add_argument(
- "--random-range-ratio",
- type=float,
- default=None,
- help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) "
- "for sampling input/output length, "
- "used only for RandomDataset. Must be in the range [0, 1) to "
- "define a symmetric sampling range "
- "[length * (1 - range_ratio), length * (1 + range_ratio)].",
- )
-
- # hf dtaset
- parser.add_argument(
- "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
- )
- parser.add_argument(
- "--hf-split", type=str, default=None, help="Split of the HF dataset."
- )
-
- parser = AsyncEngineArgs.add_cli_args(parser)
-
- return parser
-
+import sys
if __name__ == "__main__":
- parser = create_argument_parser()
- args = parser.parse_args()
- if args.tokenizer is None:
- args.tokenizer = args.model
- validate_args(args)
- main(args)
+ print("""DEPRECATED: This script has been moved to the vLLM CLI.
+
+Please use the following command instead:
+ vllm bench throughput
+
+For help with the new command, run:
+ vllm bench throughput --help
+
+Alternatively, you can run the new command directly with:
+ python -m vllm.entrypoints.cli.main bench throughput --help
+""")
+ sys.exit(1)
diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
index 92f97ffabea2a..2c72941cf7e51 100644
--- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@@ -62,7 +62,7 @@ benchmark() {
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
- '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
CUDA_VISIBLE_DEVICES=1 python3 \
@@ -72,7 +72,7 @@ benchmark() {
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
- '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
wait_for_server 8100
wait_for_server 8200
diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
index af2bcba3ea57a..0bbf7cd2b1c81 100644
--- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@@ -69,7 +69,7 @@ launch_disagg_prefill() {
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
- '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
@@ -78,7 +78,7 @@ launch_disagg_prefill() {
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
- '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
wait_for_server 8100
wait_for_server 8200
diff --git a/benchmarks/kernels/bench_block_fp8_gemm.py b/benchmarks/kernels/bench_block_fp8_gemm.py
index 9663503e9baa0..f1e504499eaf6 100644
--- a/benchmarks/kernels/bench_block_fp8_gemm.py
+++ b/benchmarks/kernels/bench_block_fp8_gemm.py
@@ -4,7 +4,10 @@
import torch
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
- w8a8_block_fp8_matmul,
+ apply_w8a8_block_fp8_linear,
+)
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+ CUTLASS_BLOCK_FP8_SUPPORTED,
)
from vllm.platforms import current_platform
from vllm.triton_utils import triton as vllm_triton
@@ -29,7 +32,7 @@ DEEPSEEK_V3_SHAPES = [
]
-def build_w8a8_block_fp8_runner(M, N, K, block_size, device):
+def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
"""Build runner function for w8a8 block fp8 matmul."""
factor_for_scale = 1e-2
@@ -37,37 +40,54 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device):
fp8_max, fp8_min = fp8_info.max, fp8_info.min
# Create random FP8 tensors
- A_fp32 = (torch.rand(M, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
- A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+ A_ref = (torch.rand(M, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
- B_fp32 = (torch.rand(N, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
- B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+ B_ref = (torch.rand(N, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
+ B = B_ref.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
# Create scales
block_n, block_k = block_size[0], block_size[1]
n_tiles = (N + block_n - 1) // block_n
k_tiles = (K + block_k - 1) // block_k
- As = torch.rand(M, k_tiles, dtype=torch.float32, device=device) * factor_for_scale
Bs = (
torch.rand(n_tiles, k_tiles, dtype=torch.float32, device=device)
* factor_for_scale
)
+ # SM90 CUTLASS requires row-major format for scales
+ if use_cutlass and current_platform.is_device_capability(90):
+ Bs = Bs.T.contiguous()
+
def run():
- return w8a8_block_fp8_matmul(A, B, As, Bs, block_size, torch.bfloat16)
+ if use_cutlass:
+ return apply_w8a8_block_fp8_linear(
+ A_ref, B, block_size, Bs, cutlass_block_fp8_supported=True
+ )
+ else:
+ return apply_w8a8_block_fp8_linear(
+ A_ref, B, block_size, Bs, cutlass_block_fp8_supported=False
+ )
return run
+# Determine available providers
+available_providers = ["torch-bf16", "w8a8-block-fp8-triton"]
+plot_title = "BF16 vs W8A8 Block FP8 GEMMs"
+
+if CUTLASS_BLOCK_FP8_SUPPORTED:
+ available_providers.append("w8a8-block-fp8-cutlass")
+
+
@vllm_triton.testing.perf_report(
vllm_triton.testing.Benchmark(
x_names=["batch_size"],
x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
x_log=False,
line_arg="provider",
- line_vals=["torch-bf16", "w8a8-block-fp8"],
- line_names=["torch-bf16", "w8a8-block-fp8"],
+ line_vals=available_providers,
+ line_names=available_providers,
ylabel="TFLOP/s (larger is better)",
plot_name="BF16 vs W8A8 Block FP8 GEMMs",
args={},
@@ -85,11 +105,22 @@ def benchmark_tflops(batch_size, provider, N, K, block_size=(128, 128)):
ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
)
- else: # w8a8-block-fp8
- run_w8a8 = build_w8a8_block_fp8_runner(M, N, K, block_size, device)
- ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
- lambda: run_w8a8(), quantiles=quantiles
+ elif provider == "w8a8-block-fp8-triton":
+ run_w8a8_triton = build_w8a8_block_fp8_runner(
+ M, N, K, block_size, device, use_cutlass=False
)
+ ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
+ lambda: run_w8a8_triton(), quantiles=quantiles
+ )
+ elif provider == "w8a8-block-fp8-cutlass":
+ run_w8a8_cutlass = build_w8a8_block_fp8_runner(
+ M, N, K, block_size, device, use_cutlass=True
+ )
+ ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
+ lambda: run_w8a8_cutlass(), quantiles=quantiles
+ )
+ else:
+ raise ValueError(f"Unknown provider: {provider}")
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
diff --git a/benchmarks/kernels/benchmark_activation.py b/benchmarks/kernels/benchmark_activation.py
new file mode 100644
index 0000000000000..93edbcc9391fc
--- /dev/null
+++ b/benchmarks/kernels/benchmark_activation.py
@@ -0,0 +1,104 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# benchmark custom activation op performance
+import itertools
+
+import torch
+
+import vllm.model_executor.layers.activation  # noqa: F401
+from vllm.model_executor.custom_op import CustomOp
+from vllm.platforms import current_platform
+from vllm.triton_utils import triton
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+
+batch_size_range = [1, 16, 32, 64, 128]
+seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
+intermediate_size = [3072, 9728, 12288]
+configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size))
+
+
+def benchmark_activation(
+ batch_size: int,
+ seq_len: int,
+ intermediate_size: int,
+ provider: str,
+ func_name: str,
+ dtype: torch.dtype,
+):
+ device = "cuda"
+ num_tokens = batch_size * seq_len
+ dim = intermediate_size
+ current_platform.seed_everything(42)
+ torch.set_default_device(device)
+
+ if func_name == "gelu_and_mul":
+ layer = CustomOp.op_registry[func_name](approximate="none")
+ elif func_name == "gelu_and_mul_tanh":
+ layer = CustomOp.op_registry["gelu_and_mul"](approximate="tanh")
+ elif func_name == "fatrelu_and_mul":
+ threshold = 0.5
+ layer = CustomOp.op_registry[func_name](threshold)
+ else:
+ layer = CustomOp.op_registry[func_name]()
+
+ x = torch.randn(num_tokens, dim, dtype=dtype, device=device)
+ compiled_layer = torch.compile(layer.forward_native)
+
+ if provider == "custom":
+ fn = lambda: layer(x)
+ elif provider == "compiled":
+ fn = lambda: compiled_layer(x)
+
+ ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+ fn, quantiles=[0.5, 0.2, 0.8]
+ )
+ return ms, max_ms, min_ms
+
+
+if __name__ == "__main__":
+ parser = FlexibleArgumentParser(description="Benchmark the custom activation op.")
+ parser.add_argument(
+ "--func-name",
+ type=str,
+ choices=[
+ "mul_and_silu",
+ "silu_and_mul",
+ "gelu_and_mul",
+ "gelu_and_mul_tanh",
+ "fatrelu_and_mul",
+ "swigluoai_and_mul",
+ "gelu_new",
+ "gelu_fast",
+ "quick_gelu",
+ ],
+ default="silu_and_mul",
+ )
+ parser.add_argument(
+ "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
+ )
+ args = parser.parse_args()
+ assert args
+
+ func_name = args.func_name
+ dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
+
+ perf_report = triton.testing.perf_report(
+ triton.testing.Benchmark(
+ x_names=["batch_size", "seq_len", "intermediate_size"],
+ x_vals=configs,
+ line_arg="provider",
+ line_vals=["custom", "compiled"],
+ line_names=["Custom OP", "Compiled"],
+ styles=[("blue", "-"), ("green", "-")],
+ ylabel="ms",
+ plot_name=f"{func_name}-op-performance",
+ args={},
+ )
+ )
+
+ perf_report(
+ lambda batch_size, seq_len, intermediate_size, provider: benchmark_activation(
+ batch_size, seq_len, intermediate_size, provider, func_name, dtype
+ )
+ ).run(print_data=True)
diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py
new file mode 100644
index 0000000000000..a61c17edc1e28
--- /dev/null
+++ b/benchmarks/kernels/benchmark_device_communicators.py
@@ -0,0 +1,486 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Benchmark script for device communicators:
+CustomAllreduce (oneshot, twoshot), PyNcclCommunicator,
+and SymmMemCommunicator (multimem, two-shot).
+
+Usage:
+    torchrun --nproc_per_node=<num_gpus> benchmark_device_communicators.py [options]
+
+Example:
+ torchrun --nproc_per_node=2 benchmark_device_communicators.py
+ --sequence-lengths 512 1024 2048 --num-warmup 10 --num-trials 100
+"""
+
+import json
+import os
+import time
+from contextlib import nullcontext
+from typing import Callable, Optional
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
+from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
+from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
+from vllm.logger import init_logger
+from vllm.utils import FlexibleArgumentParser
+
+logger = init_logger(__name__)
+
+# Default sequence lengths to benchmark
+DEFAULT_SEQUENCE_LENGTHS = [128, 512, 1024, 2048, 4096, 8192]
+
+# Fixed hidden size and dtype for all benchmarks
+HIDDEN_SIZE = 8192
+BENCHMARK_DTYPE = torch.bfloat16
+
+# CUDA graph settings
+CUDA_GRAPH_CAPTURE_CYCLES = 10
+
+
+class CommunicatorBenchmark:
+ """Benchmark class for testing device communicators."""
+
+ def __init__(
+ self,
+ rank: int,
+ world_size: int,
+ device: torch.device,
+ cpu_group: ProcessGroup,
+ sequence_lengths: list[int],
+ ):
+ self.rank = rank
+ self.world_size = world_size
+ self.device = device
+ self.cpu_group = cpu_group
+
+ # Calculate max_size_override based on largest sequence length
+ max_seq_len = max(sequence_lengths)
+ max_tensor_elements = max_seq_len * HIDDEN_SIZE
+ self.max_size_override = max_tensor_elements * BENCHMARK_DTYPE.itemsize + 1
+
+ # Initialize communicators
+ self.custom_allreduce = None
+ self.pynccl_comm = None
+ self.symm_mem_comm = None
+ self.symm_mem_comm_multimem = None
+ self.symm_mem_comm_two_shot = None
+
+ self._init_communicators()
+
+ def _init_communicators(self):
+ """Initialize all available communicators."""
+ try:
+ self.custom_allreduce = CustomAllreduce(
+ group=self.cpu_group,
+ device=self.device,
+ max_size=self.max_size_override,
+ )
+ if not self.custom_allreduce.disabled:
+ logger.info("Rank %s: CustomAllreduce initialized", self.rank)
+ else:
+ logger.info("Rank %s: CustomAllreduce disabled", self.rank)
+ except Exception as e:
+ logger.warning(
+ "Rank %s: Failed to initialize CustomAllreduce: %s", self.rank, e
+ )
+ self.custom_allreduce = None
+
+ try:
+ self.pynccl_comm = PyNcclCommunicator(
+ group=self.cpu_group, device=self.device
+ )
+ if not self.pynccl_comm.disabled:
+ logger.info("Rank %s: PyNcclCommunicator initialized", self.rank)
+ else:
+ logger.info("Rank %s: PyNcclCommunicator disabled", self.rank)
+ self.pynccl_comm = None
+ except Exception as e:
+ logger.warning(
+ "Rank %s: Failed to initialize PyNcclCommunicator: %s", self.rank, e
+ )
+ self.pynccl_comm = None
+
+ # Initialize variants for SymmMemCommunicator
+ try:
+ self.symm_mem_comm_multimem = SymmMemCommunicator(
+ group=self.cpu_group,
+ device=self.device,
+ force_multimem=True,
+ max_size_override=self.max_size_override,
+ )
+ if not self.symm_mem_comm_multimem.disabled:
+ logger.info(
+ "Rank %s: SymmMemCommunicator (multimem) initialized", self.rank
+ )
+ else:
+ self.symm_mem_comm_multimem = None
+ except Exception as e:
+ logger.warning(
+ "Rank %s: Failed to initialize SymmMemCommunicator (multimem): %s",
+ self.rank,
+ e,
+ )
+ self.symm_mem_comm_multimem = None
+
+ try:
+ self.symm_mem_comm_two_shot = SymmMemCommunicator(
+ group=self.cpu_group,
+ device=self.device,
+ force_multimem=False,
+ max_size_override=self.max_size_override,
+ )
+ if not self.symm_mem_comm_two_shot.disabled:
+ logger.info(
+ "Rank %s: SymmMemCommunicator (two_shot) initialized", self.rank
+ )
+ else:
+ self.symm_mem_comm_two_shot = None
+ except Exception as e:
+ logger.warning(
+ "Rank %s: Failed to initialize SymmMemCommunicator (two_shot): %s",
+ self.rank,
+ e,
+ )
+ self.symm_mem_comm_two_shot = None
+
+ def benchmark_allreduce(
+ self, sequence_length: int, num_warmup: int, num_trials: int
+ ) -> dict[str, float]:
+ """Benchmark allreduce operations for all available communicators."""
+
+ results = {}
+
+ # Define communicators with their benchmark functions
+ communicators = []
+
+ if self.custom_allreduce is not None:
+ comm = self.custom_allreduce
+ # CustomAllreduce one-shot
+ communicators.append(
+ (
+ "ca_1stage",
+ lambda t, c=comm: c.custom_all_reduce(t),
+ lambda t, c=comm: c.should_custom_ar(t),
+ comm.capture(),
+ "1stage", # env variable value
+ )
+ )
+ # CustomAllreduce two-shot
+ communicators.append(
+ (
+ "ca_2stage",
+ lambda t, c=comm: c.custom_all_reduce(t),
+ lambda t, c=comm: c.should_custom_ar(t),
+ comm.capture(),
+ "2stage", # env variable value
+ )
+ )
+
+ if self.pynccl_comm is not None:
+ comm = self.pynccl_comm
+ communicators.append(
+ (
+ "pynccl",
+ lambda t, c=comm: c.all_reduce(t),
+ lambda t: True, # Always available if initialized
+ nullcontext(),
+ None, # no env variable needed
+ )
+ )
+
+ if self.symm_mem_comm_multimem is not None:
+ comm = self.symm_mem_comm_multimem
+ communicators.append(
+ (
+ "symm_mem_multimem",
+ lambda t, c=comm: c.all_reduce(t),
+ lambda t, c=comm: c.should_use_symm_mem(t),
+ nullcontext(),
+ None, # no env variable needed
+ )
+ )
+
+ if self.symm_mem_comm_two_shot is not None:
+ comm = self.symm_mem_comm_two_shot
+ communicators.append(
+ (
+ "symm_mem_two_shot",
+ lambda t, c=comm: c.all_reduce(t),
+ lambda t, c=comm: c.should_use_symm_mem(t),
+ nullcontext(),
+ None, # no env variable needed
+ )
+ )
+
+ # Benchmark each communicator
+ for name, allreduce_fn, should_use_fn, context, env_var in communicators:
+ # Set environment variable if needed
+ if env_var is not None:
+ os.environ["VLLM_CUSTOM_ALLREDUCE_ALGO"] = env_var
+ else:
+ # Clear the environment variable to avoid interference
+ os.environ.pop("VLLM_CUSTOM_ALLREDUCE_ALGO", None)
+
+ latency = self.benchmark_allreduce_single(
+ sequence_length,
+ allreduce_fn,
+ should_use_fn,
+ context,
+ num_warmup,
+ num_trials,
+ )
+ if latency is not None:
+ results[name] = latency
+
+ return results
+
+ def benchmark_allreduce_single(
+ self,
+ sequence_length: int,
+ allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
+ should_use_fn: Callable[[torch.Tensor], bool],
+ context,
+ num_warmup: int,
+ num_trials: int,
+ ) -> Optional[float]:
+ """Benchmark method with CUDA graph optimization."""
+ try:
+ # Create test tensor (2D: sequence_length x hidden_size)
+ tensor = torch.randn(
+ sequence_length, HIDDEN_SIZE, dtype=BENCHMARK_DTYPE, device=self.device
+ )
+ if not should_use_fn(tensor):
+ return None
+
+ torch.cuda.synchronize()
+ stream = torch.cuda.Stream()
+ with torch.cuda.stream(stream):
+ graph_input = tensor.clone()
+
+ # Warmup before capture
+ for _ in range(3):
+ allreduce_fn(graph_input)
+
+ # Capture the graph using context manager
+ with context:
+ graph = torch.cuda.CUDAGraph()
+ with torch.cuda.graph(graph):
+ for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
+ allreduce_fn(graph_input)
+
+ torch.cuda.synchronize()
+ for _ in range(num_warmup):
+ graph.replay()
+ torch.cuda.synchronize()
+
+ torch.cuda.synchronize()
+ start_time = time.perf_counter()
+
+ for _ in range(num_trials):
+ graph.replay()
+ torch.cuda.synchronize()
+
+ end_time = time.perf_counter()
+
+ # Convert to ms and divide by CUDA_GRAPH_CAPTURE_CYCLES
+ return (
+ (end_time - start_time) / num_trials / CUDA_GRAPH_CAPTURE_CYCLES * 1000
+ )
+
+ except Exception as e:
+ logger.error("CUDA graph benchmark failed: %s", e)
+ raise RuntimeError(
+ f"CUDA graph benchmark failed for communicator: {e}"
+ ) from e
+
+
+def _calculate_speedup_info(comm_results: dict[str, float]) -> str:
+ """Calculate speedup information for a single tensor size."""
+ if not comm_results:
+ return "N/A"
+
+ # Find the fastest communicator
+ fastest_comm = min(comm_results.keys(), key=lambda k: comm_results[k])
+ fastest_time = comm_results[fastest_comm]
+
+ # Calculate speedup vs PyNccl if available
+ if "pynccl" in comm_results:
+ pynccl_time = comm_results["pynccl"]
+ speedup = pynccl_time / fastest_time
+ return f"{fastest_comm} ({speedup:.2f}x)"
+ else:
+ return f"{fastest_comm} (N/A)"
+
+
+def print_results(
+ results: dict[str, dict[str, float]], sequence_lengths: list[int], world_size: int
+):
+ """Print benchmark results in a formatted table."""
+
+ print(f"\n{'=' * 130}")
+ print("Device Communicator Benchmark Results")
+ print(
+ f"World Size: {world_size}, Data Type: {BENCHMARK_DTYPE}, "
+ f"Hidden Size: {HIDDEN_SIZE}"
+ )
+ print(f"{'=' * 130}")
+
+ # Get all communicator names
+ all_comms = set()
+ for size_results in results.values():
+ all_comms.update(size_results.keys())
+
+ all_comms = sorted(list(all_comms))
+
+ # Print header
+ header = f"{'Tensor Shape':<20}{'Tensor Size':<15}"
+ for comm in all_comms:
+ header += f"{comm:<20}"
+ header += f"{'Best (Speedup vs PyNccl)':<30}"
+ print(header)
+ print("-" * len(header))
+
+ # Print results for each sequence length
+ for seq_len in sequence_lengths:
+ if seq_len in results:
+ # Calculate tensor size in elements and bytes
+ tensor_elements = seq_len * HIDDEN_SIZE
+ tensor_bytes = tensor_elements * BENCHMARK_DTYPE.itemsize
+
+ # Format tensor size (MB)
+ tensor_size_mb = tensor_bytes / (1024 * 1024)
+ tensor_size_str = f"{tensor_size_mb:.2f} MB"
+
+ # Format tensor shape
+ tensor_shape = f"({seq_len}, {HIDDEN_SIZE})"
+
+ row = f"{tensor_shape:<20}{tensor_size_str:<15}"
+ for comm in all_comms:
+ if comm in results[seq_len]:
+ row += f"{results[seq_len][comm]:<20.3f}"
+ else:
+ row += f"{'N/A':<20}"
+
+ # Calculate speedup information
+ speedup_info = _calculate_speedup_info(results[seq_len])
+ row += f"{speedup_info:<30}"
+
+ print(row)
+
+ print(f"{'=' * 130}")
+ print("All times are in milliseconds (ms) per allreduce operation")
+ print("Speedup column shows: fastest_algorithm (speedup_vs_pynccl)")
+
+
+def main():
+ parser = FlexibleArgumentParser(description="Benchmark device communicators")
+
+ parser.add_argument(
+ "--sequence-lengths",
+ type=int,
+ nargs="+",
+ default=DEFAULT_SEQUENCE_LENGTHS,
+ help="Sequence lengths to benchmark (tensor shape: seq_len x hidden_size)",
+ )
+
+ parser.add_argument(
+ "--num-warmup", type=int, default=5, help="Number of warmup iterations"
+ )
+
+ parser.add_argument(
+ "--num-trials", type=int, default=50, help="Number of benchmark trials"
+ )
+
+ parser.add_argument("--output-json", type=str, help="Output results to JSON file")
+
+ args = parser.parse_args()
+
+ # Initialize distributed
+ if not dist.is_initialized():
+ dist.init_process_group(backend="gloo")
+ rank = dist.get_rank()
+ world_size = dist.get_world_size()
+
+ # Set device
+ device = torch.device(f"cuda:{rank}")
+ torch.cuda.set_device(device)
+
+ # Get CPU process group
+ cpu_group = dist.new_group(backend="gloo")
+
+ # Disable USE_SYMM_MEM to avoid affecting the max_sizes
+ # in symm_mem and custom_all_reduce for benchmark
+ os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"
+
+ # Initialize benchmark
+ benchmark = CommunicatorBenchmark(
+ rank, world_size, device, cpu_group, args.sequence_lengths
+ )
+
+ # Run benchmarks
+ all_results = {}
+
+ for seq_len in args.sequence_lengths:
+ if rank == 0:
+ logger.info(
+ "Benchmarking sequence length: %s (tensor shape: %s x %s)",
+ seq_len,
+ seq_len,
+ HIDDEN_SIZE,
+ )
+
+ results = benchmark.benchmark_allreduce(
+ sequence_length=seq_len,
+ num_warmup=args.num_warmup,
+ num_trials=args.num_trials,
+ )
+
+ all_results[seq_len] = results
+
+ # Synchronize between ranks
+ dist.barrier()
+
+ # Print results (only rank 0)
+ if rank == 0:
+ print_results(all_results, args.sequence_lengths, world_size)
+
+ # Save to JSON if requested
+ if args.output_json:
+ # Add speedup information to results
+ enhanced_results = {}
+ for seq_len, comm_results in all_results.items():
+ enhanced_results[seq_len] = {
+ "timings": comm_results,
+ "speedup_info": _calculate_speedup_info(comm_results),
+ }
+
+ output_data = {
+ "world_size": world_size,
+ "dtype": str(BENCHMARK_DTYPE),
+ "hidden_size": HIDDEN_SIZE,
+ "sequence_lengths": args.sequence_lengths,
+ "num_warmup": args.num_warmup,
+ "num_trials": args.num_trials,
+ "cuda_graph_capture_cycles": CUDA_GRAPH_CAPTURE_CYCLES,
+ "results": enhanced_results,
+ }
+
+ with open(args.output_json, "w") as f:
+ json.dump(output_data, f, indent=2)
+
+ logger.info("Results saved to %s", args.output_json)
+
+ # Cleanup
+ if cpu_group != dist.group.WORLD:
+ dist.destroy_process_group(cpu_group)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py
index 3d38d4b3534e8..89309c79f0991 100644
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -637,7 +637,7 @@ def bench_optype(
# Clear LoRA optimization hash-maps.
_LORA_A_PTR_DICT.clear()
_LORA_B_PTR_DICT.clear()
- # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are setup
+ # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
for kwargs in kwargs_list:
op_type.bench_fn()(**kwargs)
torch.cuda.synchronize()
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 710d30adfd846..94f3f1ae11f27 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -594,7 +594,11 @@ def main(args: argparse.Namespace):
E = config.n_routed_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
- elif config.architectures[0] in ("Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"):
+ elif config.architectures[0] in (
+ "Qwen2MoeForCausalLM",
+ "Qwen3MoeForCausalLM",
+ "Qwen3NextForCausalLM",
+ ):
E = config.num_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
@@ -678,7 +682,11 @@ def main(args: argparse.Namespace):
is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
search_space = get_configs_compute_bound(is_fp16, block_quant_shape)
print(f"Start tuning over {len(search_space)} configurations...")
-
+ if use_deep_gemm:
+ raise ValueError(
+ "Tuning with --use-deep-gemm is not supported as it only tunes Triton "
+ "kernels. Please remove the flag."
+ )
start = time.time()
configs = _distribute(
"tune",
diff --git a/benchmarks/kernels/benchmark_polynorm.py b/benchmarks/kernels/benchmark_polynorm.py
new file mode 100644
index 0000000000000..9ac8f5e6594e4
--- /dev/null
+++ b/benchmarks/kernels/benchmark_polynorm.py
@@ -0,0 +1,155 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import itertools
+
+import torch
+
+from vllm import _custom_ops as vllm_ops
+from vllm.triton_utils import triton
+
+
+def polynorm_naive(
+ x: torch.Tensor,
+ weight: torch.Tensor,
+ bias: torch.Tensor,
+ eps: float = 1e-6,
+):
+ orig_shape = x.shape
+ x = x.view(-1, x.shape[-1])
+
+ def norm(x, eps: float):
+ return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
+
+ x = x.float()
+ return (
+ (
+ weight[0] * norm(x**3, eps)
+ + weight[1] * norm(x**2, eps)
+ + weight[2] * norm(x, eps)
+ + bias
+ )
+ .to(weight.dtype)
+ .view(orig_shape)
+ )
+
+
+def polynorm_vllm(
+ x: torch.Tensor,
+ weight: torch.Tensor,
+ bias: torch.Tensor,
+ eps: float = 1e-6,
+):
+ orig_shape = x.shape
+ x = x.view(-1, x.shape[-1])
+
+ out = torch.empty_like(x)
+ vllm_ops.poly_norm(out, x, weight, bias, eps)
+ output = out
+
+ output = output.view(orig_shape)
+ return output
+
+
+def calculate_diff(batch_size, seq_len, hidden_dim):
+ dtype = torch.bfloat16
+ x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
+ weight = torch.ones(3, dtype=dtype, device="cuda")
+ bias = torch.ones(1, dtype=dtype, device="cuda")
+
+ output_naive = polynorm_naive(x, weight, bias)
+ output_vllm = polynorm_vllm(x, weight, bias)
+
+ if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2):
+ print("✅ All implementations match")
+ else:
+ print("❌ Implementations differ")
+
+
+batch_size_range = [2**i for i in range(0, 7, 2)]
+seq_length_range = [2**i for i in range(6, 11, 1)]
+dim_range = [2048, 4096]
+configs = list(itertools.product(dim_range, batch_size_range, seq_length_range))
+
+
+def get_benchmark():
+ @triton.testing.perf_report(
+ triton.testing.Benchmark(
+ x_names=["dim", "batch_size", "seq_len"],
+ x_vals=[list(_) for _ in configs],
+ line_arg="provider",
+ line_vals=["naive", "vllm"],
+ line_names=["Naive", "vLLM"],
+ styles=[("blue", "-"), ("red", "-")],
+ ylabel="us",
+ plot_name="polynorm-perf",
+ args={},
+ )
+ )
+ def benchmark(dim, batch_size, seq_len, provider):
+ dtype = torch.bfloat16
+ hidden_dim = dim * 4
+
+ x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
+ weight = torch.ones(3, dtype=dtype, device="cuda")
+ bias = torch.ones(1, dtype=dtype, device="cuda")
+
+ quantiles = [0.5, 0.2, 0.8]
+
+ if provider == "naive":
+ ms, min_ms, max_ms = triton.testing.do_bench(
+ lambda: polynorm_naive(x, weight, bias),
+ quantiles=quantiles,
+ )
+ else:
+ ms, min_ms, max_ms = triton.testing.do_bench(
+ lambda: polynorm_vllm(x, weight, bias),
+ quantiles=quantiles,
+ )
+
+ return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+ return benchmark
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--batch-size",
+ type=int,
+ default=4,
+ help="Batch size",
+ )
+ parser.add_argument(
+ "--seq-len",
+ type=int,
+ default=128,
+ help="Sequence length",
+ )
+ parser.add_argument(
+ "--hidden-dim",
+ type=int,
+ default=8192,
+ help="Intermediate size of MLP",
+ )
+ parser.add_argument(
+ "--save-path",
+ type=str,
+ default="./configs/polnorm/",
+ help="Path to save polnorm benchmark results",
+ )
+
+ args = parser.parse_args()
+
+ # Run correctness test
+ calculate_diff(
+ batch_size=args.batch_size,
+ seq_len=args.seq_len,
+ hidden_dim=args.hidden_dim,
+ )
+
+ benchmark = get_benchmark()
+ # Run performance benchmark
+ benchmark.run(print_data=True, save_path=args.save_path)
diff --git a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
index 0650cbf3cc18e..c7a4066b39d70 100644
--- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
@@ -1,77 +1,675 @@
-#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import time
+from collections.abc import Callable
+import matplotlib.pyplot as plt
+import numpy as np
import torch
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
- silu_mul_fp8_quant_deep_gemm,
+ silu_mul_fp8_quant_deep_gemm_cuda,
)
from vllm.platforms import current_platform
+from vllm.triton_utils import tl, triton
+from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
-def benchmark(E, T, H, G=128, runs=50):
- current_platform.seed_everything(42)
- y = torch.randn((E, T, 2 * H), dtype=torch.bfloat16, device="cuda")
- tokens_per_expert = torch.randint(
- T // 2, T, size=(E,), dtype=torch.int32, device="cuda"
+@triton.jit
+def _silu_mul_fp8_quant_deep_gemm(
+ # Pointers ------------------------------------------------------------
+ input_ptr, # 16-bit activations (E, T, 2*H)
+ y_q_ptr, # fp8 quantized activations (E, T, H)
+ y_s_ptr, # 16-bit scales (E, T, G)
+ counts_ptr, # int32 num tokens per expert (E)
+ # Sizes ---------------------------------------------------------------
+ H: tl.constexpr, # hidden dimension (per output)
+ GROUP_SIZE: tl.constexpr, # elements per group (usually 128)
+ # Strides for input (elements) ---------------------------------------
+ stride_i_e,
+ stride_i_t,
+ stride_i_h,
+ # Strides for y_q (elements) -----------------------------------------
+ stride_yq_e,
+ stride_yq_t,
+ stride_yq_h,
+ # Strides for y_s (elements) -----------------------------------------
+ stride_ys_e,
+ stride_ys_t,
+ stride_ys_g,
+ # Stride for counts (elements)
+ stride_counts_e,
+ # Numeric params ------------------------------------------------------
+ eps: tl.constexpr,
+ fp8_min: tl.constexpr,
+ fp8_max: tl.constexpr,
+ use_ue8m0: tl.constexpr,
+ # Meta ---------------------------------------------------------------
+ BLOCK: tl.constexpr,
+ NUM_STAGES: tl.constexpr,
+):
+ G = H // GROUP_SIZE
+
+ # map program id -> (e, g)
+ pid = tl.program_id(0)
+ e = pid // G
+ g = pid % G
+
+ e = e.to(tl.int64)
+ g = g.to(tl.int64)
+
+ # number of valid tokens for this expert
+ n_tokens = tl.load(counts_ptr + e * stride_counts_e).to(tl.int64)
+
+ cols = tl.arange(0, BLOCK).to(tl.int64)
+ mask = cols < BLOCK
+
+ base_input_offset = e * stride_i_e + g * GROUP_SIZE * stride_i_h
+ base_gate_offset = base_input_offset + cols * stride_i_h
+ base_up_offset = base_input_offset + H * stride_i_h + cols * stride_i_h
+ base_yq_offset = e * stride_yq_e + g * GROUP_SIZE * stride_yq_h + cols * stride_yq_h
+ base_ys_offset = e * stride_ys_e + g * stride_ys_g
+
+ for t in tl.range(0, n_tokens, num_stages=NUM_STAGES):
+ gate = tl.load(
+ input_ptr + base_gate_offset + t * stride_i_t, mask=mask, other=0.0
+ ).to(tl.float32)
+ up = tl.load(input_ptr + base_up_offset + t * stride_i_t, mask=mask, other=0.0)
+
+ gate = gate * (1.0 / (1.0 + tl.exp(-gate)))
+ y = gate * up
+
+ y_s = tl.maximum(tl.max(tl.abs(y)), eps) / fp8_max
+ if use_ue8m0:
+ y_s = tl.exp2(tl.ceil(tl.log2(y_s)))
+
+ y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)
+
+ tl.store(y_q_ptr + base_yq_offset + t * stride_yq_t, y_q, mask=mask)
+ tl.store(y_s_ptr + base_ys_offset + t * stride_ys_t, y_s)
+
+
+def silu_mul_fp8_quant_deep_gemm_triton(
+ y: torch.Tensor, # (E, T, 2*H)
+ tokens_per_expert: torch.Tensor, # (E,) number of valid tokens per expert
+ num_parallel_tokens,
+ group_size: int = 128,
+ eps: float = 1e-10,
+) -> tuple[torch.Tensor, torch.Tensor]:
+ """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales
+
+ y has shape (E, T, 2*H). The first half of the last dimension is
+ silu-activated, multiplied by the second half, then quantized into FP8.
+
+ Returns `(y_q, y_s)` where
+ * `y_q`: FP8 tensor, shape (E, T, H), same layout as y[..., :H]
+ * `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T)
+ """
+ assert y.ndim == 3, "y must be (E, T, 2*H)"
+ E, T, H2 = y.shape
+ assert H2 % 2 == 0, "last dim of y must be even (2*H)"
+ H = H2 // 2
+ G = (H + group_size - 1) // group_size
+ assert H % group_size == 0, "H must be divisible by group_size"
+ assert tokens_per_expert.ndim == 1 and tokens_per_expert.shape[0] == E, (
+ "tokens_per_expert must be shape (E,)"
+ )
+ tokens_per_expert = tokens_per_expert.to(device=y.device, dtype=torch.int32)
+
+ # allocate outputs
+ fp8_dtype = torch.float8_e4m3fn
+ y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device)
+
+ # strides (elements)
+ stride_i_e, stride_i_t, stride_i_h = y.stride()
+ stride_yq_e, stride_yq_t, stride_yq_h = y_q.stride()
+
+ # desired scale strides (elements): (T*G, 1, T)
+ stride_ys_e = T * G
+ stride_ys_t = 1
+ stride_ys_g = T
+ y_s = torch.empty_strided(
+ (E, T, G),
+ (stride_ys_e, stride_ys_t, stride_ys_g),
+ dtype=torch.float32,
+ device=y.device,
)
+ stride_cnt_e = tokens_per_expert.stride()[0]
+
+ # Static grid over experts and H-groups.
+ # A loop inside the kernel handles the token dim
+ grid = (E * G,)
+
+ f_info = torch.finfo(fp8_dtype)
+ fp8_max = f_info.max
+ fp8_min = f_info.min
+
+ _silu_mul_fp8_quant_deep_gemm[grid](
+ y,
+ y_q,
+ y_s,
+ tokens_per_expert,
+ H,
+ group_size,
+ stride_i_e,
+ stride_i_t,
+ stride_i_h,
+ stride_yq_e,
+ stride_yq_t,
+ stride_yq_h,
+ stride_ys_e,
+ stride_ys_t,
+ stride_ys_g,
+ stride_cnt_e,
+ eps,
+ fp8_min,
+ fp8_max,
+ is_deep_gemm_e8m0_used(),
+ BLOCK=group_size,
+ NUM_STAGES=4,
+ num_warps=1,
+ )
+
+ return y_q, y_s
+
+
+# Token-count generation strategies to benchmark
+strategies = ["uniform", "max_t", "first_t"]
+
+
+def benchmark(
+ kernel: Callable,
+ E: int,
+ T: int,
+ H: int,
+ total_tokens: int,
+ num_parallel_tokens: int = 64,
+ G: int = 128,
+ runs: int = 200,
+ num_warmups: int = 20,
+ gen_strategy: str = "default",
+ iterations_per_run: int = 20,
+):
+ def generate_data(seed_offset=0):
+ """Generate input data with given seed offset"""
+ current_platform.seed_everything(42 + seed_offset)
+ y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()
+
+ if gen_strategy == "uniform":
+ r = torch.rand(size=(E,), device="cuda")
+ r /= r.sum()
+ r *= total_tokens
+ tokens_per_expert = r.int()
+ tokens_per_expert = torch.minimum(
+ tokens_per_expert,
+ torch.ones((E,), device=r.device, dtype=torch.int) * T,
+ )
+ elif gen_strategy == "max_t":
+ tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda")
+ tokens_per_expert.fill_(total_tokens / E)
+ elif gen_strategy == "first_t":
+ tokens_per_expert = torch.zeros(size=(E,), dtype=torch.int32, device="cuda")
+ tokens_per_expert[0] = min(T, total_tokens)
+ else:
+ raise ValueError(f"Unknown generation strategy: {gen_strategy}")
+ return y, tokens_per_expert
+
+ dataset_count = 4
+ # Pre-generate different input matrices for each iteration to avoid cache effects
+ data_sets = [generate_data(i) for i in range(dataset_count)]
+
# Warmup
- for _ in range(10):
- silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert, group_size=G)
- torch.cuda.synchronize()
+ y, tokens_per_expert = data_sets[0]
+ for _ in range(num_warmups):
+ kernel(
+ y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G
+ )
+ torch.cuda.synchronize()
+
+ start_event = torch.cuda.Event(enable_timing=True)
+ end_event = torch.cuda.Event(enable_timing=True)
# Benchmark
- torch.cuda.synchronize()
- start = time.perf_counter()
+ latencies: list[float] = []
for _ in range(runs):
- silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert, group_size=G)
- torch.cuda.synchronize()
+ torch.cuda.synchronize()
- avg_time = (time.perf_counter() - start) / runs * 1000
+ start_event.record()
+ for i in range(iterations_per_run):
+ y, tokens_per_expert = data_sets[i % dataset_count]
+ kernel(
+ y,
+ tokens_per_expert,
+ num_parallel_tokens=num_parallel_tokens,
+ group_size=G,
+ )
+ end_event.record()
+ end_event.synchronize()
- # Calculate actual work done (only count valid tokens)
+ total_time_ms = start_event.elapsed_time(end_event)
+ per_iter_time_ms = total_time_ms / iterations_per_run
+ latencies.append(per_iter_time_ms)
+
+ # Use median instead of average for better outlier handling
+ median_time_ms = np.median(latencies)
+ median_time_s = median_time_ms / 1000
+
+ # Calculate actual work done (using first dataset for consistency)
+ _, tokens_per_expert = data_sets[0]
actual_tokens = tokens_per_expert.sum().item()
actual_elements = actual_tokens * H
# GFLOPS: operations per element = exp + 3 muls + 1 div + quantization ops ≈ 8 ops
ops_per_element = 8
total_ops = actual_elements * ops_per_element
- gflops = total_ops / (avg_time / 1000) / 1e9
+ gflops = total_ops / median_time_s / 1e9
# Memory bandwidth: bfloat16 inputs (2 bytes), fp8 output (1 byte), scales (4 bytes)
input_bytes = actual_tokens * 2 * H * 2 # 2*H bfloat16 inputs
output_bytes = actual_tokens * H * 1 # H fp8 outputs
scale_bytes = actual_tokens * (H // G) * 4 # scales in float32
total_bytes = input_bytes + output_bytes + scale_bytes
- memory_bw = total_bytes / (avg_time / 1000) / 1e9
+ memory_bw = total_bytes / median_time_s / 1e9
- return avg_time, gflops, memory_bw
+ HOPPER_BANDWIDTH_TBPS = 3.35
+ return (
+ median_time_ms,
+ gflops,
+ memory_bw,
+ (memory_bw / (HOPPER_BANDWIDTH_TBPS * 1024)) * 100,
+ )
+def create_comparison_plot(
+ ratio, cuda_times, baseline_times, config_labels, strategy_name, id
+):
+ """Create a comparison plot for a specific generation strategy"""
+ fig, ax = plt.subplots(1, 1, figsize=(16, 6))
+
+ # Configure x-axis positions
+ x = np.arange(len(config_labels))
+ width = 0.35
+
+    # Memory-bandwidth utilization bars (higher is better)
+ ax.bar(
+ x - width / 2, cuda_times, width, label="CUDA Kernel", alpha=0.8, color="blue"
+ )
+ ax.bar(
+ x + width / 2,
+ baseline_times,
+ width,
+ label="Baseline",
+ alpha=0.8,
+ color="orange",
+ )
+
+ # Add speedup labels over each bar pair
+ for i in range(len(x)):
+ speedup = ratio[i]
+ max_height = max(cuda_times[i], baseline_times[i])
+ ax.text(
+ x[i],
+ max_height + max_height * 0.02,
+ f"{speedup:.2f}x",
+ ha="center",
+ va="bottom",
+ fontweight="bold",
+ fontsize=9,
+ )
+
+ ax.set_xlabel("Configuration")
+ ax.set_ylabel("% Utilization")
+ ax.set_title(
+ f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)"
+ )
+ ax.set_xticks(x)
+ ax.set_xticklabels(config_labels, rotation=45, ha="right")
+ ax.legend()
+ ax.grid(True, alpha=0.3)
+
+ plt.tight_layout()
+ return fig, ax
+
+
+def create_combined_plot(all_results):
+ """Create a combined plot with all strategies in one PNG"""
+ num_strategies = len(all_results)
+ fig, axes = plt.subplots(num_strategies, 1, figsize=(20, 6 * num_strategies))
+
+ if num_strategies == 1:
+ axes = [axes]
+
+ for idx, (
+ strategy_name,
+ ratio,
+ cuda_times,
+ baseline_times,
+ config_labels,
+ ) in enumerate(all_results):
+ ax = axes[idx]
+
+ # Configure x-axis positions
+ x = np.arange(len(config_labels))
+ width = 0.35
+
+        # Memory-bandwidth utilization bars (higher is better)
+ ax.bar(
+ x - width / 2,
+ cuda_times,
+ width,
+ label="CUDA Kernel",
+ alpha=0.8,
+ color="blue",
+ )
+ ax.bar(
+ x + width / 2,
+ baseline_times,
+ width,
+ label="Baseline",
+ alpha=0.8,
+ color="orange",
+ )
+
+ # Add speedup labels over each bar pair
+ for i in range(len(x)):
+ speedup = ratio[i]
+ max_height = max(cuda_times[i], baseline_times[i])
+ ax.text(
+ x[i],
+ max_height + max_height * 0.02,
+ f"{speedup:.2f}x",
+ ha="center",
+ va="bottom",
+ fontweight="bold",
+ fontsize=9,
+ )
+
+ ax.set_xlabel("Configuration")
+ ax.set_ylabel("% Utilization")
+ ax.set_title(
+ f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)"
+ )
+ ax.set_xticks(x)
+ ax.set_xticklabels(config_labels, rotation=45, ha="right")
+ ax.legend()
+ ax.grid(True, alpha=0.3)
+
+ plt.tight_layout()
+ filename = "../../silu_bench/silu_benchmark_combined.png"
+ plt.savefig(filename, dpi=300, bbox_inches="tight")
+ plt.show()
+
+ return filename
+
+
+outer_dim = 7168
configs = [
- (8, 32, 1024),
- (16, 64, 2048),
- (32, 128, 4096),
# DeepSeekV3 Configs
- (256, 16, 7168),
- (256, 32, 7168),
- (256, 64, 7168),
- (256, 128, 7168),
- (256, 256, 7168),
- (256, 512, 7168),
+ (8, 1024, 7168),
+ # DeepSeekV3 Configs
+ (32, 1024, 7168),
+ # DeepSeekV3 Configs
(256, 1024, 7168),
]
-print(f"GPU: {torch.cuda.get_device_name()}")
-print(f"{'Config':<20} {'Time(ms)':<10} {'GFLOPS':<10} {'GB/s':<10}")
-print("-" * 50)
+runs = 100
+num_warmups = 20
-for E, T, H in configs:
- try:
- time_ms, gflops, gbps = benchmark(E, T, H)
- print(f"E={E:3d},T={T:4d},H={H:4d} {time_ms:8.3f} {gflops:8.1f} {gbps:8.1f}")
- except Exception:
- print(f"E={E:3d},T={T:4d},H={H:4d} FAILED")
+strategy_descriptions = {
+ "uniform": "Uniform Random",
+ "max_t": "Even Assignment",
+ "first_t": "experts[0] = T, experts[1:] = 0",
+}
+
+print(f"GPU: {torch.cuda.get_device_name()}")
+print(f"Testing strategies: {', '.join(strategies)}")
+print(f"Configurations: {len(configs)} configs")
+
+all_results = []
+
+# Run benchmarks for each strategy
+for id, strategy in enumerate(strategies):
+ print(f"\n{'=' * 60}")
+ print(f"Testing strategy: {strategy_descriptions[strategy]}")
+ print(f"{'=' * 60}")
+
+ # Collect benchmark data for both algorithms
+ config_labels = []
+ config_x_axis = []
+ all_cuda_results = []
+ all_baseline_results = []
+ all_ratios = []
+
+ for E, T, H in configs:
+ total_tokens_config = [8 * E, 16 * E, 32 * E, 64 * E, 128 * E, 256 * E]
+ config_x_axis.append(total_tokens_config)
+
+ cuda_results = []
+ baseline_results = []
+ ratios = []
+
+ for total_tokens in total_tokens_config:
+ config_label = f"E={E},T={T},H={H},TT={total_tokens}"
+ config_labels.append(config_label)
+
+ # CUDA kernel results
+ time_ms_cuda, gflops, gbps, perc = benchmark(
+ silu_mul_fp8_quant_deep_gemm_cuda,
+ E,
+ T,
+ H,
+ total_tokens,
+ runs=runs,
+ num_warmups=num_warmups,
+ gen_strategy=strategy,
+ )
+ cuda_results.append((time_ms_cuda, gflops, gbps, perc))
+
+ # Baseline results
+ time_ms_triton, gflops, gbps, perc = benchmark(
+ silu_mul_fp8_quant_deep_gemm_triton,
+ E,
+ T,
+ H,
+ total_tokens,
+ runs=runs,
+ num_warmups=num_warmups,
+ gen_strategy=strategy,
+ )
+ baseline_results.append((time_ms_triton, gflops, gbps, perc))
+ ratios.append(time_ms_triton / time_ms_cuda)
+
+ print(f"Completed: {config_label}")
+ all_cuda_results.append(cuda_results)
+ all_baseline_results.append(baseline_results)
+ all_ratios.append(ratios)
+
+ # Store results for combined plotting
+ all_results.append(
+ (
+ strategy_descriptions[strategy],
+ all_ratios,
+ all_cuda_results,
+ all_baseline_results,
+ config_labels,
+ config_x_axis,
+ )
+ )
+
+ # Print summary table for this strategy
+ print(f"\nSummary Table - {strategy_descriptions[strategy]}:")
+ print(f"{'Config':<20} {'CUDA Time(ms)':<12} {'Base Time(ms)':<12} {'Speedup':<8}")
+ print("-" * 60)
+
+ for i, (E, T, H) in enumerate(configs):
+ speedup = baseline_results[i][0] / cuda_results[i][0]
+ config_label = f"E={E:3d},T={T:4d},H={H:4d}"
+ print(
+ f"{config_label:<20} {cuda_results[i][0]:8.5f} "
+ f"{baseline_results[i][0]:8.5f} {speedup:6.2f}x"
+ )
+
+
+def create_total_tokens_plot(all_results):
+ num_strategies = len(all_results)
+ num_configs = len(configs)
+
+ # Create side-by-side subplots: 2 columns for speedup and bandwidth percentage
+ fig, axs = plt.subplots(
+ num_strategies, num_configs * 2, figsize=(28, 6 * num_strategies)
+ )
+
+ # Add main title to the entire figure
+ fig.suptitle(
+ "Performance Analysis: Speedup vs Bandwidth Utilization (Triton & CUDA)",
+ fontsize=16,
+ fontweight="bold",
+ y=0.98,
+ )
+
+ # Handle single strategy case
+ if num_strategies == 1:
+ axs = axs.reshape(1, -1)
+
+ # Handle single config case
+ if num_configs == 1:
+ axs = axs.reshape(-1, 2)
+
+ for strategy_idx, result in enumerate(all_results):
+ (
+ strategy_name,
+ all_ratios,
+ all_cuda_results,
+ all_baseline_results,
+ config_labels,
+ config_x_axis,
+ ) = result
+
+ for config_idx in range(num_configs):
+ # Speedup plot (left column)
+ ax_speedup = axs[strategy_idx, config_idx * 2]
+ # Bandwidth plot (right column)
+ ax_bandwidth = axs[strategy_idx, config_idx * 2 + 1]
+
+ E, T, H = configs[config_idx]
+ ratios = all_ratios[config_idx]
+ total_tokens_values = config_x_axis[config_idx]
+
+ # Extract CUDA and Triton bandwidth percentages
+ cuda_bandwidth_percentages = [
+ result[3] for result in all_cuda_results[config_idx]
+ ]
+ triton_bandwidth_percentages = [
+ result[3] for result in all_baseline_results[config_idx]
+ ]
+
+ # Plot speedup ratios vs total tokens (left plot)
+ ax_speedup.plot(
+ total_tokens_values, ratios, "bo-", linewidth=3, markersize=8
+ )
+ ax_speedup.set_title(
+ f"{strategy_name}\nSpeedup (CUDA/Triton)\nE={E}, T={T}, H={H}",
+ fontsize=12,
+ fontweight="bold",
+ )
+ ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
+ ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11)
+ ax_speedup.grid(True, alpha=0.3)
+
+ ax_bandwidth.plot(
+ total_tokens_values,
+ cuda_bandwidth_percentages,
+ "ro-",
+ linewidth=3,
+ markersize=8,
+ label="CUDA",
+ )
+ ax_bandwidth.plot(
+ total_tokens_values,
+ triton_bandwidth_percentages,
+ "go-",
+ linewidth=3,
+ markersize=8,
+ label="Triton",
+ )
+ ax_bandwidth.set_title(
+ f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}",
+ fontsize=12,
+ fontweight="bold",
+ )
+ ax_bandwidth.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
+ ax_bandwidth.set_ylabel(
+ "% of Peak Bandwidth", fontweight="bold", fontsize=11
+ )
+ ax_bandwidth.legend(prop={"weight": "bold"})
+ ax_bandwidth.grid(True, alpha=0.3)
+
+ # Format x-axis labels for both plots
+ for ax in [ax_speedup, ax_bandwidth]:
+ ax.set_xticks(total_tokens_values)
+ ax.set_xticklabels(
+ [
+ f"{tt // 1000}K" if tt >= 1000 else str(tt)
+ for tt in total_tokens_values
+ ],
+ fontweight="bold",
+ )
+ # Make tick labels bold
+ for label in ax.get_xticklabels() + ax.get_yticklabels():
+ label.set_fontweight("bold")
+
+ # Add value labels on speedup points
+ for x, y in zip(total_tokens_values, ratios):
+ ax_speedup.annotate(
+ f"{y:.2f}x",
+ (x, y),
+ textcoords="offset points",
+ xytext=(0, 12),
+ ha="center",
+ fontsize=10,
+ fontweight="bold",
+ bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7),
+ )
+
+ # Add value labels on CUDA bandwidth points
+ for x, y in zip(total_tokens_values, cuda_bandwidth_percentages):
+ ax_bandwidth.annotate(
+ f"{y:.1f}%",
+ (x, y),
+ textcoords="offset points",
+ xytext=(0, 12),
+ ha="center",
+ fontsize=9,
+ fontweight="bold",
+ bbox=dict(boxstyle="round,pad=0.2", facecolor="red", alpha=0.3),
+ )
+
+ # Add value labels on Triton bandwidth points
+ for x, y in zip(total_tokens_values, triton_bandwidth_percentages):
+ ax_bandwidth.annotate(
+ f"{y:.1f}%",
+ (x, y),
+ textcoords="offset points",
+ xytext=(0, -15),
+ ha="center",
+ fontsize=9,
+ fontweight="bold",
+ bbox=dict(boxstyle="round,pad=0.2", facecolor="green", alpha=0.3),
+ )
+
+ plt.tight_layout()
+ plt.subplots_adjust(top=0.93) # Make room for main title
+ filename = "silu_benchmark_total_tokens.png"
+ plt.savefig(filename, dpi=300, bbox_inches="tight")
+ plt.show()
+
+ return filename
+
+
+# Create combined plot with all strategies
+combined_plot_filename = create_total_tokens_plot(all_results)
+
+print(f"\n{'=' * 60}")
+print("Benchmark Complete!")
+print(f"Generated combined plot: {combined_plot_filename}")
+print(f"{'=' * 60}")
diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
index 603ce5ecf0d2c..6ddab46214577 100644
--- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
@@ -259,6 +259,7 @@ if __name__ == "__main__":
# (q_quant_dtype, kv_quant_dtype, o_quant_dtype)
(None, None, None),
(None, FP8_DTYPE, None),
+ (FP8_DTYPE, FP8_DTYPE, None),
(FP8_DTYPE, FP8_DTYPE, FP8_DTYPE),
(FP8_DTYPE, FP8_DTYPE, FP4_DTYPE),
]
diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
index 40903c6c3444f..131df74c7de1b 100644
--- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
@@ -274,6 +274,7 @@ if __name__ == "__main__":
quant_dtypes = [
# (q_quant_dtype, kv_quant_dtype, o_quant_dtype)
(None, None, None),
+ (FP8_DTYPE, FP8_DTYPE, None),
(FP8_DTYPE, FP8_DTYPE, FP8_DTYPE),
(FP8_DTYPE, FP8_DTYPE, FP4_DTYPE),
]
diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
index 98bde9d83c82d..df2b713e46dc4 100644
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@@ -56,7 +56,7 @@ def w8a8_block_matmul(
Bs: The per-block quantization scale for `B`.
block_size: The block size for per-block quantization.
It should be 2-dim, e.g., [128, 128].
- output_dytpe: The dtype of the returned tensor.
+ output_dtype: The dtype of the returned tensor.
Returns:
torch.Tensor: The result of matmul.
diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
index d23b7b6e4571d..66d85eaf51312 100644
--- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py
+++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
@@ -962,7 +962,7 @@ async def main_mp(
# At this point all the clients finished,
# collect results (TTFT, TPOT, etc.) from all the clients.
- # This needs to happens before calling join on the clients
+ # This needs to happen before calling join on the clients
# (result_queue should be emptied).
while not result_queue.empty():
client_metrics.append(result_queue.get())
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index 52bfd82c7fcfe..06494463223bd 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -88,6 +88,7 @@ is_avx512_disabled(AVX512_DISABLED)
if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
message(STATUS "Apple Silicon Detected")
+ set(APPLE_SILICON_FOUND TRUE)
set(ENABLE_NUMA OFF)
check_sysctl(hw.optional.neon ASIMD_FOUND)
check_sysctl(hw.optional.arm.FEAT_BF16 ARM_BF16_FOUND)
@@ -189,7 +190,7 @@ else()
set(USE_ACL OFF)
endif()
-if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
+if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake
index 49defccbb1fa4..3d32121f13ac2 100644
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
- GIT_TAG 57b4e68b9f9d94750b46de8f8dbd2bfcc86edd4f
+ GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 9c0ed1d09572e..8558976e2c392 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -480,7 +480,6 @@ function (define_gpu_extension_target GPU_MOD_NAME)
${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}")
endif()
- set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17)
target_compile_options(${GPU_MOD_NAME} PRIVATE
$<$:${GPU_COMPILE_FLAGS}>)
diff --git a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
index 6dd6f269f3dc9..d1874515cc8fd 100644
--- a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
+++ b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
@@ -36,12 +36,14 @@ limitations under the License.
#if !defined(CUDA_VERSION) || CUDA_VERSION < 12040
void sm100_cutlass_mla_decode(
torch::Tensor const& out,
+ torch::Tensor const& lse,
torch::Tensor const& q_nope,
torch::Tensor const& q_pe,
torch::Tensor const& kv_c_and_k_pe_cache,
torch::Tensor const& seq_lens,
torch::Tensor const& page_table,
torch::Tensor const& workspace,
+ double sm_scale,
int64_t num_kv_splits) {
TORCH_CHECK(false, "CUDA version must be >= 12.4 for cutlass_mla_decode");
}
@@ -64,11 +66,11 @@ struct IsPersistent {
static const bool value = v;
};
-template >
+template >
struct MlaSm100 {
using Element = T;
using ElementAcc = float;
- using ElementOut = T;
+ using ElementOut = TOut;
using TileShape = Shape<_128, _128, Shape<_512, _64>>;
using TileShapeH = cute::tuple_element_t<0, TileShape>;
@@ -99,6 +101,7 @@ struct MlaSm100 {
template
typename T::Fmha::Arguments args_from_options(
at::Tensor const& out,
+ at::Tensor const& lse,
at::Tensor const& q_nope,
at::Tensor const& q_pe,
at::Tensor const& kv_c_and_k_pe_cache,
@@ -162,7 +165,10 @@ typename T::Fmha::Arguments args_from_options(
stride_PT,
page_count_total,
page_size},
- {static_cast(out.data_ptr()), stride_O, static_cast(nullptr), stride_LSE},
+ {static_cast(out.data_ptr()),
+ stride_O,
+ static_cast(lse.defined() ? lse.data_ptr() : nullptr),
+ stride_LSE},
hw_info,
// TODO(trevor-m): Change split_kv back to -1 when
// https://github.com/NVIDIA/cutlass/issues/2274 is fixed. Split_kv=1 will
@@ -178,9 +184,10 @@ typename T::Fmha::Arguments args_from_options(
return arguments;
}
-template
+template
void runMla(
at::Tensor const& out,
+ at::Tensor const& lse,
at::Tensor const& q_nope,
at::Tensor const& q_pe,
at::Tensor const& kv_c_and_k_pe_cache,
@@ -190,9 +197,9 @@ void runMla(
double sm_scale,
int64_t num_kv_splits,
cudaStream_t stream) {
- using MlaSm100Type = MlaSm100;
+ using MlaSm100Type = MlaSm100;
typename MlaSm100Type::Fmha fmha;
- auto arguments = args_from_options(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits);
+ auto arguments = args_from_options(out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits);
CUTLASS_CHECK(fmha.can_implement(arguments));
@@ -214,6 +221,7 @@ void runMla(
void sm100_cutlass_mla_decode(
torch::Tensor const& out,
+ torch::Tensor const& lse,
torch::Tensor const& q_nope,
torch::Tensor const& q_pe,
torch::Tensor const& kv_c_and_k_pe_cache,
@@ -233,14 +241,14 @@ void sm100_cutlass_mla_decode(
DISPATCH_BOOL(page_size == 128, IsPaged128, [&] {
DISPATCH_BOOL(num_kv_splits <= 1, NotManualSplitKV, [&] {
if (in_dtype == at::ScalarType::Half) {
- runMla>(
- out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+ runMla>(
+ out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
} else if (in_dtype == at::ScalarType::BFloat16) {
- runMla>(
- out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+ runMla>(
+ out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
} else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
- runMla>(
- out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+ runMla>(
+ out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
} else {
TORCH_CHECK(false, "Unsupported input data type of MLA");
}
@@ -253,7 +261,7 @@ void sm100_cutlass_mla_decode(
int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_batches, int64_t sm_count, int64_t num_kv_splits) {
// Workspace size depends on ElementAcc and ElementLSE (same as ElementAcc)
// which are float, so Element type here doesn't matter.
- using MlaSm100Type = MlaSm100;
+ using MlaSm100Type = MlaSm100;
// Get split kv. Requires problem shape and sm_count only.
typename MlaSm100Type::Fmha::Arguments arguments;
diff --git a/csrc/cache.h b/csrc/cache.h
index e8e069aefd9c5..fd230bec27fca 100644
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -36,13 +36,6 @@ void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
const std::string& kv_cache_dtype,
torch::Tensor& scale);
-void cp_fused_concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
- torch::Tensor& cp_local_token_select_indices,
- torch::Tensor& kv_cache,
- torch::Tensor& slot_mapping,
- const std::string& kv_cache_dtype,
- torch::Tensor& scale);
-
// Just for unittest
void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
const double scale, const std::string& kv_cache_dtype);
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index fbb022464ef27..80b4c47c55476 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -396,51 +396,6 @@ __global__ void concat_and_cache_mla_kernel(
copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
}
-template
-__global__ void cp_fused_concat_and_cache_mla_kernel(
- const scalar_t* __restrict__ kv_c, // [num_full_tokens, kv_lora_rank]
- const scalar_t* __restrict__ k_pe, // [num_full_tokens, pe_dim]
- const int64_t* __restrict__ cp_local_token_select_indices, // [num_tokens]
- cache_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank
- // + pe_dim)]
- const int64_t* __restrict__ slot_mapping, // [num_tokens]
- const int block_stride, //
- const int entry_stride, //
- const int kv_c_stride, //
- const int k_pe_stride, //
- const int kv_lora_rank, //
- const int pe_dim, //
- const int block_size, //
- const float* scale //
-) {
- const int64_t token_idx = cp_local_token_select_indices[blockIdx.x];
- const int64_t slot_idx = slot_mapping[blockIdx.x];
- // NOTE: slot_idx can be -1 if the token is padded
- if (slot_idx < 0) {
- return;
- }
- const int64_t block_idx = slot_idx / block_size;
- const int64_t block_offset = slot_idx % block_size;
-
- auto copy = [&](const scalar_t* __restrict__ src, cache_t* __restrict__ dst,
- int src_stride, int dst_stride, int size, int offset) {
- for (int i = threadIdx.x; i < size; i += blockDim.x) {
- const int64_t src_idx = token_idx * src_stride + i;
- const int64_t dst_idx =
- block_idx * block_stride + block_offset * entry_stride + i + offset;
- if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
- dst[dst_idx] = src[src_idx];
- } else {
- dst[dst_idx] =
- fp8::scaled_convert(src[src_idx], *scale);
- }
- }
- };
-
- copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0);
- copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
-}
-
} // namespace vllm
// KV_T is the data type of key and value tensors.
@@ -554,20 +509,6 @@ void reshape_and_cache_flash(
kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \
reinterpret_cast(scale.data_ptr()));
-// KV_T is the data type of key and value tensors.
-// CACHE_T is the stored data type of kv-cache.
-// KV_DTYPE is the real data type of kv-cache.
-#define CALL_CP_FUSED_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \
- vllm::cp_fused_concat_and_cache_mla_kernel \
- <<>>( \
- reinterpret_cast(kv_c.data_ptr()), \
- reinterpret_cast(k_pe.data_ptr()), \
- cp_local_token_select_indices.data_ptr(), \
- reinterpret_cast(kv_cache.data_ptr()), \
- slot_mapping.data_ptr(), block_stride, entry_stride, \
- kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \
- reinterpret_cast(scale.data_ptr()));
-
void concat_and_cache_mla(
torch::Tensor& kv_c, // [num_tokens, kv_lora_rank]
torch::Tensor& k_pe, // [num_tokens, pe_dim]
@@ -606,50 +547,6 @@ void concat_and_cache_mla(
CALL_CONCAT_AND_CACHE_MLA);
}
-// Note(hc): cp_fused_concat_and_cache_mla fuses the following three kernel
-// calls into one:
-// k_c_normed.index_select(0, cp_local_token_select_indices) + \
-// k_pe.squeeze(1).index_select(0, cp_local_token_select_indices) + \
-// concat_and_cache_mla.
-void cp_fused_concat_and_cache_mla(
- torch::Tensor& kv_c, // [num_total_tokens, kv_lora_rank]
- torch::Tensor& k_pe, // [num_total_tokens, pe_dim]
- torch::Tensor& cp_local_token_select_indices, // [num_tokens]
- torch::Tensor& kv_cache, // [num_blocks, block_size, (kv_lora_rank +
- // pe_dim)]
- torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens]
- const std::string& kv_cache_dtype, torch::Tensor& scale) {
- // NOTE(woosuk): In vLLM V1, key.size(0) can be different from
- // slot_mapping.size(0) because of padding for CUDA graphs.
- // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
- // both include padding.
- // In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0)
- // since key includes padding for CUDA graphs, while slot_mapping does not.
- // In this case, slot_mapping.size(0) represents the actual number of tokens
- // before padding.
- // For compatibility with both cases, we use slot_mapping.size(0) as the
- // number of tokens.
- int num_tokens = slot_mapping.size(0);
- int kv_lora_rank = kv_c.size(1);
- int pe_dim = k_pe.size(1);
- int block_size = kv_cache.size(1);
-
- TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
-
- int kv_c_stride = kv_c.stride(0);
- int k_pe_stride = k_pe.stride(0);
- int block_stride = kv_cache.stride(0);
- int entry_stride = kv_cache.stride(1);
-
- dim3 grid(num_tokens);
- dim3 block(std::min(kv_lora_rank, 512));
- const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c));
- const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
- DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
- CALL_CP_FUSED_CONCAT_AND_CACHE_MLA);
-}
-
namespace vllm {
template
diff --git a/csrc/cpu/cpu_types_vxe.hpp b/csrc/cpu/cpu_types_vxe.hpp
index ab8cbbbf4ec4f..51bca37e699b9 100644
--- a/csrc/cpu/cpu_types_vxe.hpp
+++ b/csrc/cpu/cpu_types_vxe.hpp
@@ -12,7 +12,7 @@ namespace vec_op {
#define vec_sub(a, b) ((a) - (b))
#define vec_mul(a, b) ((a) * (b))
#define vec_div(a, b) ((a) / (b))
-#define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebaic
+#define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebraic
#define vec_sl(a, b) ((a) << (b)) // Vector Shift Left
// FIXME: FP16 is not fully supported in Torch-CPU
diff --git a/csrc/cpu/dnnl_helper.cpp b/csrc/cpu/dnnl_helper.cpp
index f3f00edb36068..6def0e061fa96 100644
--- a/csrc/cpu/dnnl_helper.cpp
+++ b/csrc/cpu/dnnl_helper.cpp
@@ -22,6 +22,23 @@ void release_dnnl_matmul_handler(int64_t handler) {
delete ptr;
}
+DNNLScratchPadManager::DNNLScratchPadManager() : size_(0), ptr_(nullptr) {
+ this->realloc(allocation_unit * 128);
+}
+
+void DNNLScratchPadManager::realloc(size_t new_size) {
+ new_size = round(new_size);
+ if (new_size > size_) {
+ ptr_ = std::aligned_alloc(64, new_size);
+ size_ = new_size;
+ }
+}
+
+DNNLScratchPadManager* DNNLScratchPadManager::get_dnnl_scratchpad_manager() {
+ static DNNLScratchPadManager manager;
+ return &manager;
+}
+
template
class DNNLPrimitiveCache {
public:
@@ -166,6 +183,23 @@ struct hash {
hash()(static_cast(val.bias_type));
}
};
+
+template <>
+struct hash {
+ size_t operator()(
+ const MatMulPrimitiveHandler::ClassMatmulCacheKey& val) const {
+ return hash()(val.b_n_size) ^ hash()(val.b_k_size);
+ }
+};
+
+template <>
+struct hash {
+ size_t operator()(const MatMulPrimitiveHandler::MSizeCacheKey& val) const {
+ return hash()(val.a_m_size) ^
+ hash()(val.a_m_stride) ^ hash()(val.use_bias) ^
+ hash()(static_cast(val.bias_type));
+ }
+};
} // namespace std
bool operator==(const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& l,
@@ -181,6 +215,17 @@ bool operator==(const W8A8MatMulPrimitiveHandler::MSizeCacheKey& l,
l.bias_type == r.bias_type;
}
+bool operator==(const MatMulPrimitiveHandler::ClassMatmulCacheKey& l,
+ const MatMulPrimitiveHandler::ClassMatmulCacheKey& r) {
+ return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size;
+}
+
+bool operator==(const MatMulPrimitiveHandler::MSizeCacheKey& l,
+ const MatMulPrimitiveHandler::MSizeCacheKey& r) {
+ return l.a_m_size == r.a_m_size && l.a_m_stride == r.a_m_stride &&
+ l.use_bias == r.use_bias && l.bias_type == r.bias_type;
+}
+
static std::shared_ptr
get_w8a8_class_primitive_cache(
const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& key,
@@ -239,6 +284,11 @@ void W8A8MatMulPrimitiveHandler::execute(ExecArgs& args) {
}
dnnl::matmul matmul = get_matmul_cache(args);
+
+ auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(5);
+ scratchpad_storage->set_data_handle(
+ DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data());
+
matmul.execute(default_stream(), memory_cache_);
default_stream().wait();
}
@@ -257,6 +307,8 @@ dnnl::matmul W8A8MatMulPrimitiveHandler::get_matmul_cache(
return m_size_cache_->get_or_create(key, [&]() {
dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
+ auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager();
+ manager->realloc(desc.scratchpad_desc().get_size());
return dnnl::matmul(desc);
});
}
@@ -300,6 +352,11 @@ void W8A8MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) {
dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
default_engine(), nullptr);
set_runtime_memory_ptr(4, memory_cache_[DNNL_ARG_BIAS].get());
+
+ memory_cache_[DNNL_ARG_SCRATCHPAD] =
+ dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
+ default_engine(), nullptr);
+ set_runtime_memory_ptr(5, memory_cache_[DNNL_ARG_SCRATCHPAD].get());
}
dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc(
@@ -319,6 +376,9 @@ dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc(
dnnl::memory::format_tag::ab);
dnnl::primitive_attr attr;
+
+ attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
+
// For PER_TOKEN, scales will be applied in outside epilogue
if (a_qs_ == QuantizationStrategy::PER_TENSOR) {
attr.set_scales_mask(DNNL_ARG_SRC, 0);
@@ -344,3 +404,120 @@ dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc(
attr);
}
}
+
+MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
+ : DNNLMatMulPrimitiveHandler(
+ static_cast(args), args.ab_type),
+ m_size_cache_(nullptr) {
+ assert(ab_type_ == dnnl::memory::data_type::f32 ||
+ ab_type_ == dnnl::memory::data_type::bf16 ||
+ ab_type_ == dnnl::memory::data_type::f16);
+ prepack_weight(args.b_ptr,
+ create_primitive_desc(
+ MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
+ .a_m_stride = DNNL_RUNTIME_DIM_VAL,
+ .use_bias = false,
+ .bias_type = dnnl::memory::data_type::undef},
+ true)
+ .weights_desc());
+ init_runtime_memory_cache(args);
+}
+
+static std::shared_ptr
+get_matul_class_primitive_cache(
+ const MatMulPrimitiveHandler::ClassMatmulCacheKey& key,
+ int64_t cache_size) {
+ static MatMulPrimitiveHandler::ClassMatmulCache cache(128);
+ assert(cache_size > 0);
+ return cache.get_or_create(key, [&]() {
+ return std::make_shared(cache_size);
+ });
+}
+
+void MatMulPrimitiveHandler::execute(ExecArgs& args) {
+ auto&& [a_storage, a_mem_desc] = get_runtime_memory_ptr(0);
+ auto&& [c_storage, c_mem_desc] = get_runtime_memory_ptr(1);
+ a_storage->set_data_handle((void*)args.a_ptr);
+ a_mem_desc->dims[0] = args.a_m_size;
+ a_mem_desc->format_desc.blocking.strides[0] = args.a_m_stride;
+ c_storage->set_data_handle((void*)args.c_ptr);
+ c_mem_desc->dims[0] = args.a_m_size;
+
+ if (args.use_bias) {
+ auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(2);
+ bias_storage->set_data_handle((void*)args.bias_ptr);
+ }
+
+ dnnl::matmul matmul = get_matmul_cache(args);
+
+ auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3);
+ scratchpad_storage->set_data_handle(
+ DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data());
+
+ matmul.execute(default_stream(), memory_cache_);
+ default_stream().wait();
+}
+
+dnnl::matmul MatMulPrimitiveHandler::get_matmul_cache(
+ const MSizeCacheKey& key) {
+ if (m_size_cache_.get() == nullptr) {
+ ClassMatmulCacheKey key = {.b_n_size = b_n_size_, .b_k_size = b_k_size_};
+ m_size_cache_ = get_matul_class_primitive_cache(key, primitive_cache_size_);
+ }
+ return m_size_cache_->get_or_create(key, [&]() {
+ dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
+ auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager();
+ manager->realloc(desc.scratchpad_desc().get_size());
+ return dnnl::matmul(desc);
+ });
+}
+
+dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(
+ const MSizeCacheKey& key, bool first_time) {
+ dnnl::memory::desc a_md;
+ dnnl::memory::desc b_md;
+ if (first_time) {
+ a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_,
+ dnnl::memory::format_tag::ab);
+ b_md = dnnl::memory::desc({b_k_size_, b_n_size_}, b_type_,
+ dnnl::memory::format_tag::any);
+ } else {
+ a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_,
+ {key.a_m_stride, 1});
+ b_md = b_target_mem_desc_;
+ }
+ dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_,
+ dnnl::memory::format_tag::ab);
+
+ dnnl::primitive_attr attr;
+ attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
+
+ if (key.use_bias) {
+ dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1});
+ return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md,
+ c_md, attr);
+ } else {
+ return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
+ attr);
+ }
+}
+
+void MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) {
+ memory_cache_[DNNL_ARG_SRC] = dnnl::memory(
+ {{1, b_k_size_}, b_type_, {b_k_size_, 1}}, default_engine(), nullptr);
+ set_runtime_memory_ptr(0, memory_cache_[DNNL_ARG_SRC].get());
+ memory_cache_[DNNL_ARG_DST] =
+ dnnl::memory({{1, b_n_size_}, c_type_, dnnl::memory::format_tag::ab},
+ default_engine(), nullptr);
+ set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get());
+
+ memory_cache_[DNNL_ARG_BIAS] =
+ dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
+ default_engine(), nullptr);
+ set_runtime_memory_ptr(2, memory_cache_[DNNL_ARG_BIAS].get());
+
+ memory_cache_[DNNL_ARG_SCRATCHPAD] =
+ dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
+ default_engine(), nullptr);
+ set_runtime_memory_ptr(3, memory_cache_[DNNL_ARG_SCRATCHPAD].get());
+}
diff --git a/csrc/cpu/dnnl_helper.h b/csrc/cpu/dnnl_helper.h
index 54ceefced9e98..ad6773d2b9fd6 100644
--- a/csrc/cpu/dnnl_helper.h
+++ b/csrc/cpu/dnnl_helper.h
@@ -59,6 +59,30 @@ constexpr inline dnnl::memory::data_type get_dnnl_type() {
return DNNLType>::type;
}
+class DNNLScratchPadManager {
+ public:
+ static constexpr size_t allocation_unit = 4 * 1024 * 1024; // 4KB
+
+ static DNNLScratchPadManager* get_dnnl_scratchpad_manager();
+
+ DNNLScratchPadManager();
+
+ template
+ T* get_data() {
+ return reinterpret_cast(ptr_);
+ }
+
+ static size_t round(size_t size) {
+ return ((size + allocation_unit - 1) / allocation_unit) * allocation_unit;
+ }
+
+ void realloc(size_t new_size);
+
+ private:
+ size_t size_;
+ void* ptr_;
+};
+
class DNNLMatMulPrimitiveHandler {
public:
virtual ~DNNLMatMulPrimitiveHandler() = default;
@@ -166,4 +190,54 @@ class W8A8MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler {
std::shared_ptr m_size_cache_;
};
+class MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler {
+ public:
+ struct Args : public DNNLMatMulPrimitiveHandler::Args {
+ dnnl::memory::data_type ab_type;
+ };
+
+ struct ClassMatmulCacheKey {
+ dnnl_dim_t b_n_size;
+ dnnl_dim_t b_k_size;
+
+ friend bool operator==(const ClassMatmulCacheKey& l,
+ const ClassMatmulCacheKey& r);
+ };
+
+ struct MSizeCacheKey {
+ dnnl_dim_t a_m_size;
+ dnnl_dim_t a_m_stride;
+ bool use_bias;
+ dnnl::memory::data_type bias_type;
+
+ friend bool operator==(const MSizeCacheKey& l, const MSizeCacheKey& r);
+ };
+
+ using MSizeCache = DNNLPrimitiveCache;
+ using ClassMatmulCache =
+ DNNLPrimitiveCache>;
+
+ struct ExecArgs : public MSizeCacheKey {
+ const void* a_ptr;
+ const void* bias_ptr;
+ void* c_ptr;
+ };
+
+ public:
+ MatMulPrimitiveHandler(const Args& args);
+
+ void execute(ExecArgs& args);
+
+ private:
+ dnnl::matmul::primitive_desc create_primitive_desc(const MSizeCacheKey& key,
+ bool first_time);
+
+ void init_runtime_memory_cache(const Args& args);
+
+ dnnl::matmul get_matmul_cache(const MSizeCacheKey& key);
+
+ private:
+ std::shared_ptr m_size_cache_;
+};
+
#endif
diff --git a/csrc/cpu/dnnl_kernels.cpp b/csrc/cpu/dnnl_kernels.cpp
index acc3b9ecde143..9a3af4ac9d8a6 100644
--- a/csrc/cpu/dnnl_kernels.cpp
+++ b/csrc/cpu/dnnl_kernels.cpp
@@ -145,7 +145,8 @@ void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
}
}
- float scale_val, azp_val;
+ float scale_val;
+ float azp_val = 0.0f;
if constexpr (AZP) {
float max_scalar = max_value.reduce_max();
float min_scalar = min_value.reduce_min();
@@ -379,6 +380,7 @@ void onednn_scaled_mm(
exec_args.a_ptr = a.data_ptr();
exec_args.a_m_size = a.size(0);
exec_args.bias_ptr = nullptr;
+ exec_args.bias_type = get_dnnl_type();
exec_args.use_bias = false;
exec_args.a_scales_ptr = nullptr;
exec_args.a_zero_points_ptr = nullptr;
@@ -492,3 +494,56 @@ void dynamic_scaled_int8_quant(
}
});
}
+
+int64_t create_onednn_mm_handler(const torch::Tensor& b,
+ int64_t primitive_cache_size) {
+ TORCH_CHECK(b.dim() == 2);
+
+ MatMulPrimitiveHandler::Args args;
+ args.primitive_cache_size = primitive_cache_size;
+
+ args.b_k_size = b.size(0);
+ args.b_k_stride = b.stride(0);
+ args.b_n_size = b.size(1);
+ args.b_n_stride = b.stride(1);
+ args.b_ptr = b.data_ptr();
+
+ VLLM_DISPATCH_FLOATING_TYPES(b.scalar_type(), "create_onednn_mm_handler",
+ [&] {
+ args.c_type = get_dnnl_type();
+ args.ab_type = get_dnnl_type();
+ });
+
+ return reinterpret_cast(new MatMulPrimitiveHandler(args));
+}
+
+void onednn_mm(torch::Tensor& c, // [M, OC], row-major
+ const torch::Tensor& a, // [M, IC], row-major
+ const std::optional& bias, int64_t handler) {
+ CPU_KERNEL_GUARD_IN(onednn_mm)
+ TORCH_CHECK(a.dim() == 2);
+ TORCH_CHECK(a.stride(-1) == 1);
+ TORCH_CHECK(c.is_contiguous());
+ MatMulPrimitiveHandler* ptr =
+ reinterpret_cast(handler);
+
+ MatMulPrimitiveHandler::ExecArgs exec_args;
+ exec_args.a_m_size = a.size(0);
+ exec_args.a_m_stride = a.stride(0);
+
+ VLLM_DISPATCH_FLOATING_TYPES(a.scalar_type(), "onednn_mm", [&] {
+ if (bias.has_value()) {
+ exec_args.use_bias = true;
+ exec_args.bias_type = get_dnnl_type();
+ exec_args.bias_ptr = bias->data_ptr();
+ } else {
+ exec_args.use_bias = false;
+ exec_args.bias_type = get_dnnl_type();
+ exec_args.bias_ptr = nullptr;
+ }
+ exec_args.a_ptr = a.data_ptr();
+ exec_args.c_ptr = c.data_ptr();
+
+ ptr->execute(exec_args);
+ });
+}
diff --git a/csrc/cpu/sgl-kernels/moe.cpp b/csrc/cpu/sgl-kernels/moe.cpp
index beeccff783ea0..94b24c2f13a06 100644
--- a/csrc/cpu/sgl-kernels/moe.cpp
+++ b/csrc/cpu/sgl-kernels/moe.cpp
@@ -215,7 +215,7 @@ int moe_align_block_size(
offsets[mb + 1] = sorted_id_size(sorted_ids + mb * BLOCK_M);
}
});
- // TODO: do we need to vecterize this ?
+ // TODO: do we need to vectorize this ?
for (int mb = 0; mb < num_token_blocks; ++mb) {
offsets[mb + 1] += offsets[mb];
}
diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp
index c9f426bdf618a..98c3ebc5a75f8 100644
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -21,6 +21,12 @@ void onednn_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
const std::optional& bias,
int64_t handler);
+int64_t create_onednn_mm_handler(const torch::Tensor& b,
+ int64_t primitive_cache_size);
+
+void onednn_mm(torch::Tensor& c, const torch::Tensor& a,
+ const std::optional& bias, int64_t handler);
+
void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
torch::Tensor& kv_cache, double scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens);
@@ -153,6 +159,18 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def("release_dnnl_matmul_handler(int handler) -> ()",
&release_dnnl_matmul_handler);
+ // Create oneDNN GEMM handler
+ ops.def(
+ "create_onednn_mm_handler(Tensor b, int "
+ "primitive_cache_size) -> int",
+ &create_onednn_mm_handler);
+
+ // oneDNN GEMM
+ ops.def(
+ "onednn_mm(Tensor! c, Tensor a, Tensor? bias, "
+ "int handler) -> ()");
+ ops.impl("onednn_mm", torch::kCPU, &onednn_mm);
+
// Create oneDNN W8A8 handler
ops.def(
"create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType "
diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh
index 44709b4597765..58926f6429dd3 100644
--- a/csrc/custom_all_reduce.cuh
+++ b/csrc/custom_all_reduce.cuh
@@ -15,6 +15,8 @@ typedef __hip_bfloat16 nv_bfloat16;
#include