Address conflict

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
This commit is contained in:
Jee Jee Li 2025-12-03 06:10:36 +00:00
commit 83556e9d85
783 changed files with 23991 additions and 13021 deletions

View File

@ -108,6 +108,65 @@ The number of this test is less stable compared to the delay and latency benchma
WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`. WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
#### Default Parameters Field
We can specify default parameters in a JSON field with key `defaults`. Parameters defined in the field are applied globally to all serving tests, and can be overridden in test case fields. Here is an example:
<details>
<summary> An Example of default parameters field </summary>
```json
{
"defaults": {
"qps_list": [
"inf"
],
"server_environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
},
"server_parameters": {
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"block_size": 128,
"disable_log_stats": "",
"load_format": "dummy"
},
"client_parameters": {
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"num_prompts": 200,
"ignore-eos": ""
}
},
"tests": [
{
"test_name": "serving_llama3B_tp2_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 2,
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
}
},
{
"test_name": "serving_qwen3_tp4_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-14B",
"tensor_parallel_size": 4,
},
"client_parameters": {
"model": "Qwen/Qwen3-14B",
}
}
]
}
```
</details>
### Visualizing the results ### Visualizing the results
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results. The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.

View File

@ -110,7 +110,8 @@ json2envs() {
wait_for_server() { wait_for_server() {
# wait for vllm server to start # wait for vllm server to start
# return 1 if vllm server crashes # return 1 if vllm server crashes
timeout 1200 bash -c ' local timeout_val="1200"
timeout "$timeout_val" bash -c '
until curl -X POST localhost:8000/v1/completions; do until curl -X POST localhost:8000/v1/completions; do
sleep 1 sleep 1
done' && return 0 || return 1 done' && return 0 || return 1
@ -316,12 +317,44 @@ run_throughput_tests() {
run_serving_tests() { run_serving_tests() {
# run serving tests using `vllm bench serve` command # run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases # $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }
local serving_test_file local serving_test_file
serving_test_file=$1 serving_test_file=$1
# Iterate over serving tests # Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do jq -c '
if type == "array" then
# Plain format: test cases array
.[]
elif (type == "object" and has("tests")) then
# merge the default parameters into each test case
. as $root
| ($root.defaults // {}) as $d
| ($root.tests // [])[]
# default qps / max_concurrency from defaults if missing
| .qps_list = (.qps_list // $d.qps_list)
| .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list)
# merge envs / params: test overrides defaults
| .server_environment_variables =
(($d.server_environment_variables // {}) + (.server_environment_variables // {}))
| .server_parameters =
(($d.server_parameters // {}) + (.server_parameters // {}))
| .client_parameters =
(($d.client_parameters // {}) + (.client_parameters // {}))
else
error("Unsupported serving test file format: must be array or object with .tests")
end
' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it. # get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name') test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^serving_ ]]; then if [[ ! "$test_name" =~ ^serving_ ]]; then
@ -335,20 +368,25 @@ run_serving_tests() {
continue continue
fi fi
# get client and server arguments # get client and server arguments (after merging the default parameters)
server_params=$(echo "$params" | jq -r '.server_parameters') server_params=$(echo "$params" | jq -r '.server_parameters')
server_envs=$(echo "$params" | jq -r '.server_environment_variables') server_envs=$(echo "$params" | jq -r '.server_environment_variables')
client_params=$(echo "$params" | jq -r '.client_parameters') client_params=$(echo "$params" | jq -r '.client_parameters')
server_args=$(json2args "$server_params") server_args=$(json2args "$server_params")
server_envs=$(json2envs "$server_envs") server_envs=$(json2envs "$server_envs")
client_args=$(json2args "$client_params") client_args=$(json2args "$client_params")
# qps_list
qps_list=$(echo "$params" | jq -r '.qps_list') qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list" echo "Running over qps list $qps_list"
# max_concurrency_list (fallback to num_prompts if missing)
max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list') max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
num_prompts=$(echo "$client_params" | jq -r '.num_prompts') num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
max_concurrency_list="[$num_prompts]" max_concurrency_list="[$num_prompts]"
fi fi
max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh') max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
echo "Running over max concurrency list $max_concurrency_list" echo "Running over max concurrency list $max_concurrency_list"

View File

@ -1,610 +0,0 @@
[
{
"test_name": "serving_llama8B_bf16_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_bf16_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_bf16_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
}
]

View File

@ -1,276 +1,246 @@
[ {
{ "defaults": {
"test_name": "serving_llama8B_tp1_sharegpt", "qps_list": [
"qps_list": [1, 4, 16, "inf"], "inf"
"max_concurrency_list": [32], ],
"server_environment_variables": { "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"VLLM_RPC_TIMEOUT": 100000, "server_environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_RPC_TIMEOUT": 100000,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_CPU_SGL_KERNEL": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_KVCACHE_SPACE": 40 "VLLM_CPU_SGL_KERNEL": 1,
}, "VLLM_CPU_KVCACHE_SPACE": 40
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 32
}
}, },
{ "server_parameters": {
"test_name": "serving_llama8B_tp2_sharegpt", "model": "meta-llama/Llama-3.1-8B-Instruct",
"qps_list": [1, 4, 16, "inf"], "tensor_parallel_size": 1,
"max_concurrency_list": [32], "dtype": "bfloat16",
"server_environment_variables": { "distributed_executor_backend": "mp",
"VLLM_RPC_TIMEOUT": 100000, "block_size": 128,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "trust_remote_code": "",
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "disable_log_stats": "",
"VLLM_CPU_SGL_KERNEL": 1, "enforce_eager": "",
"VLLM_CPU_KVCACHE_SPACE": 40 "max_num_batched_tokens": 2048,
}, "max_num_seqs": 256,
"server_parameters": { "load_format": "dummy"
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 32
}
}, },
{ "client_parameters": {
"test_name": "serving_llama8B_tp1_random_128_128", "model": "meta-llama/Llama-3.1-8B-Instruct",
"qps_list": [1, 4, 16, "inf"], "backend": "vllm",
"max_concurrency_list": [32], "ignore-eos": "",
"server_environment_variables": { "num_prompts": 200
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp2_random_128_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp2_random_128_2048",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 32
}
} }
] },
"tests": [
{
"test_name": "serving_llama8B_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp1_random_128_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_128_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_128_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_granite2B_tp1_random_128_128",
"server_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen1.7B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-1.7B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-1.7B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen4B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-4B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-4B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen8B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-8B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_glm9B_tp1_random_128_128",
"server_parameters": {
"model": "zai-org/glm-4-9b-hf",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "zai-org/glm-4-9b-hf",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_gemma7B_tp1_random_128_128",
"server_parameters": {
"model": "google/gemma-7b",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "google/gemma-7b",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
}
]
}

View File

@ -8,7 +8,7 @@ steps:
commands: commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh" - "bash .buildkite/scripts/upload-wheels.sh"
@ -30,19 +30,6 @@ steps:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
# x86 + CUDA builds # x86 + CUDA builds
- label: "Build wheel - CUDA 12.8"
depends_on: ~
id: build-wheel-cuda-12-8
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 12.9" - label: "Build wheel - CUDA 12.9"
depends_on: ~ depends_on: ~
id: build-wheel-cuda-12-9 id: build-wheel-cuda-12-9
@ -109,7 +96,6 @@ steps:
- label: "Annotate release workflow" - label: "Annotate release workflow"
depends_on: depends_on:
- create-multi-arch-manifest - create-multi-arch-manifest
- build-wheel-cuda-12-8
id: annotate-release-workflow id: annotate-release-workflow
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge

View File

@ -0,0 +1,369 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# do not complain about line length (for docstring)
# ruff: noqa: E501
import argparse
import json
import re
import sys
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any
from urllib.parse import quote
if not sys.version_info >= (3, 12):
raise RuntimeError("This script requires Python 3.12 or higher.")
# HTML template for every generated index page; "{items}" is replaced with
# one "<a ...>" row per entry. The meta tag advertises the simple-repository
# API version so pip treats these pages as a package index.
INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
<html>
<meta name="pypi:repository-version" content="1.0">
<body>
{items}
</body>
</html>
"""
@dataclass
class WheelFileInfo:
package_name: str
version: str
build_tag: str | None
python_tag: str
abi_tag: str
platform_tag: str
variant: str | None
filename: str
def parse_from_filename(file: str) -> WheelFileInfo:
"""
Parse wheel file name to extract metadata.
The format of wheel names:
{package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl
All versions could contain a variant like '+cu129' or '.cpu' or `.rocm` (or not).
Example:
vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl
vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl
vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl
vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl
"""
wheel_file_re = re.compile(
r"^(?P<package_name>.+)-(?P<version>[^-]+?)(-(?P<build_tag>[^-]+))?-(?P<python_tag>[^-]+)-(?P<abi_tag>[^-]+)-(?P<platform_tag>[^-]+)\.whl$"
)
match = wheel_file_re.match(file)
if not match:
raise ValueError(f"Invalid wheel file name: {file}")
package_name = match.group("package_name")
version = match.group("version")
build_tag = match.group("build_tag")
python_tag = match.group("python_tag")
abi_tag = match.group("abi_tag")
platform_tag = match.group("platform_tag")
# extract variant from version
variant = None
if "dev" in version:
ver_after_dev = version.split("dev")[-1]
if "." in ver_after_dev:
variant = ver_after_dev.split(".")[-1]
version = version.removesuffix("." + variant)
else:
if "+" in version:
version, variant = version.split("+")
return WheelFileInfo(
package_name=package_name,
version=version,
build_tag=build_tag,
python_tag=python_tag,
abi_tag=abi_tag,
platform_tag=platform_tag,
variant=variant,
filename=file,
)
def generate_project_list(subdir_names: list[str]) -> str:
    """
    Generate the project-list HTML page linking to each project & variant
    sub-directory, sorted by name.
    """
    anchors = [
        f' <a href="{entry}/">{entry}/</a><br/>'
        for entry in (raw.strip("/").strip(".") for raw in sorted(subdir_names))
    ]
    return INDEX_HTML_TEMPLATE.format(items="\n".join(anchors))
def generate_package_index_and_metadata(
    wheel_files: list[WheelFileInfo], wheel_base_dir: Path, index_base_dir: Path
) -> tuple[str, str]:
    """
    Generate the index HTML for one package, linking to the actual wheel
    files, together with a machine-readable JSON metadata dump.

    Returns:
        (index_html, metadata_json) strings.
    """
    anchors: list[str] = []
    entries: list[dict] = []
    # Links are relative so the index works wherever it is hosted, as long as
    # it sits next to the wheel directory (walk_up=True needs Python 3.12+).
    link_prefix = wheel_base_dir.relative_to(index_base_dir, walk_up=True)
    for info in sorted(wheel_files, key=lambda w: w.filename):
        # handle with '+' in URL, and avoid double-encoding '/' and already-encoded '%2B'
        # NOTE: this is AWS S3 specific behavior!
        quoted_path = quote((link_prefix / info.filename).as_posix(), safe=":%/")
        anchors.append(f' <a href="{quoted_path}">{info.filename}</a><br/>')
        entry = asdict(info)
        entry["path"] = quoted_path
        entries.append(entry)
    return (
        INDEX_HTML_TEMPLATE.format(items="\n".join(anchors)),
        json.dumps(entries, indent=2),
    )
def generate_index_and_metadata(
    whl_files: list[str],
    wheel_base_dir: Path,
    index_base_dir: Path,
    default_variant: str | None = None,
    alias_to_default: str | None = None,
):
    """
    Generate index for all wheel files.

    Args:
        whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`).
        wheel_base_dir (Path): Base directory for wheel files.
        index_base_dir (Path): Base directory to store index files.
        default_variant (str | None): The default variant name, if any.
        alias_to_default (str | None): Alias variant name for the default variant, if any.

    First, parse all wheel files to extract metadata.
    We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
    The index for the default variant (if any) is generated in the root index directory.

    If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
    is purely a copy of the corresponding variant index, with only the links adjusted.
    Otherwise, all wheels without variant suffixes are treated as the default variant.

    If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
    as the default variant index, but the links are adjusted accordingly.

    Index directory structure:

        index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
            index.html     # project list, linking to "vllm/" and other packages, and all variant sub-directories
            vllm/
                index.html     # package index, pointing to actual files in wheel_base_dir (relative path)
                metadata.json  # machine-readable metadata for all wheels in this package
            cpu/           # cpu variant sub-directory
                index.html
                vllm/
                    index.html
                    metadata.json
            cu129/         # cu129 is actually the alias to default variant
                index.html
                vllm/
                    index.html
                    metadata.json
            cu130/         # cu130 variant sub-directory
                index.html
                vllm/
                    index.html
                    metadata.json
            ...

    metadata.json stores a dump of all wheel files' metadata in a machine-readable format:

        [
            {
                "package_name": "vllm",
                "version": "0.10.2rc2",
                "build_tag": null,
                "python_tag": "cp38",
                "abi_tag": "abi3",
                "platform_tag": "manylinux2014_aarch64",
                "variant": "cu129",
                "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl",
                "path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl"  # to be concatenated with the directory URL and URL-encoded
            },
            ...
        ]
    """
    parsed_files = [parse_from_filename(f) for f in whl_files]
    if not parsed_files:
        print("No wheel files found, skipping index generation.")
        return

    # Group by variant; wheels without a variant suffix fall into "default".
    variant_to_files: dict[str, list[WheelFileInfo]] = {}
    for file in parsed_files:
        variant = file.variant or "default"
        if variant not in variant_to_files:
            variant_to_files[variant] = []
        variant_to_files[variant].append(file)
    print(f"Found variants: {list(variant_to_files.keys())}")

    # sanity check for default variant
    # NOTE(review): `default_variant` is only validated here — nothing below
    # uses it to build the root index as the docstring suggests; confirm intent.
    if default_variant:
        if "default" in variant_to_files:
            raise ValueError(
                "All wheel files must have variant suffixes when `default_variant` is specified."
            )
        if default_variant not in variant_to_files:
            raise ValueError(
                f"Default variant '{default_variant}' not found among wheel files."
            )
    if alias_to_default:
        if "default" not in variant_to_files:
            # e.g. only some wheels are uploaded to S3 currently
            print(
                "[WARN] Alias to default variant specified, but no default variant found."
            )
        elif alias_to_default in variant_to_files:
            raise ValueError(
                f"Alias variant name '{alias_to_default}' already exists among wheel files."
            )
        else:
            # The alias shares the default variant's file list; links are
            # recomputed per-directory below, so a shallow copy suffices.
            variant_to_files[alias_to_default] = variant_to_files["default"].copy()
            print(f"Alias variant '{alias_to_default}' created for default variant.")

    # Generate index for each variant
    subdir_names = set()
    for variant, files in variant_to_files.items():
        if variant == "default":
            # The default variant's indices live directly in the root dir.
            variant_dir = index_base_dir
        else:
            variant_dir = index_base_dir / variant
            subdir_names.add(variant)
        variant_dir.mkdir(parents=True, exist_ok=True)
        # gather all package names in this variant
        packages = set(f.package_name for f in files)
        if variant == "default":
            # these packages should also appear in the "project list"
            # generate after all variants are processed
            subdir_names = subdir_names.union(packages)
        else:
            # generate project list for this variant directly
            project_list_str = generate_project_list(sorted(packages))
            with open(variant_dir / "index.html", "w") as f:
                f.write(project_list_str)
        for package in packages:
            # filter files belonging to this package only
            package_files = [f for f in files if f.package_name == package]
            package_dir = variant_dir / package
            package_dir.mkdir(parents=True, exist_ok=True)
            index_str, metadata_str = generate_package_index_and_metadata(
                package_files, wheel_base_dir, package_dir
            )
            with open(package_dir / "index.html", "w") as f:
                f.write(index_str)
            with open(package_dir / "metadata.json", "w") as f:
                f.write(metadata_str)

    # Generate top-level project list index (root packages + variant subdirs).
    project_list_str = generate_project_list(sorted(subdir_names))
    with open(index_base_dir / "index.html", "w") as f:
        f.write(project_list_str)
if __name__ == "__main__":
    """
    Arguments:
        --version <version> : version string for the current build (e.g., commit hash)
        --current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
        --output-dir <output_directory> : directory to store generated index files
        --alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
    """
    parser = argparse.ArgumentParser(
        description="Process nightly build wheel files to generate indices."
    )
    parser.add_argument(
        "--version",
        type=str,
        required=True,
        help="Version string for the current build (e.g., commit hash)",
    )
    parser.add_argument(
        "--current-objects",
        type=str,
        required=True,
        help="Path to JSON file containing current S3 objects listing in this version directory",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="Directory to store generated index files",
    )
    parser.add_argument(
        "--alias-to-default",
        type=str,
        default=None,
        help="Alias variant name for the default variant",
    )
    args = parser.parse_args()

    version = args.version
    # The version is used as a path component locally and in S3 keys below;
    # a separator would silently change the layout, so reject it up front.
    if "/" in version or "\\" in version:
        raise ValueError("Version string must not contain slashes.")

    current_objects_path = Path(args.current_objects)
    output_dir = Path(args.output_dir)
    # mkdir with exist_ok=True is idempotent — no need for a racy exists() pre-check.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Read current objects JSON
    with open(current_objects_path) as f:
        current_objects: dict[str, list[dict[str, Any]]] = json.load(f)
    # current_objects looks like from list_objects_v2 S3 API:
    """
    "Contents": [
        {
            "Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl",
            "LastModified": "2025-11-28T14:00:32+00:00",
            "ETag": "\"37a38339c7cdb61ca737021b968075df-52\"",
            "ChecksumAlgorithm": [
                "CRC64NVME"
            ],
            "ChecksumType": "FULL_OBJECT",
            "Size": 435649349,
            "StorageClass": "STANDARD"
        },
        ...
    ]
    """

    # Extract wheel file keys; only the bare filename matters for indexing.
    wheel_files = []
    for item in current_objects.get("Contents", []):
        key: str = item["Key"]
        if key.endswith(".whl"):
            wheel_files.append(key.split("/")[-1])  # only the filename is used
    print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")

    # Generate index and metadata, assuming wheels and indices are stored as:
    #   s3://vllm-wheels/{version}/<wheel files>
    #   s3://vllm-wheels/<anything>/<index files>
    # output_dir is already a Path, so derive the sibling wheel dir directly.
    wheel_base_dir = output_dir.parent / version
    index_base_dir = output_dir
    generate_index_and_metadata(
        whl_files=wheel_files,
        wheel_base_dir=wheel_base_dir,
        index_base_dir=index_base_dir,
        default_variant=None,
        alias_to_default=args.alias_to_default,
    )
    print(f"Successfully generated index and metadata in {output_dir}")

View File

@ -7,53 +7,51 @@ set -ex
# allow to bind to different cores # allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-0-16} CORE_RANGE=${CORE_RANGE:-0-16}
OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16} OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}
NUMA_NODE=${NUMA_NODE:-0}
export CMAKE_BUILD_PARALLEL_LEVEL=32 export CMAKE_BUILD_PARALLEL_LEVEL=16
# Setup cleanup # Setup cleanup
remove_docker_container() { remove_docker_container() {
set -e; set -e;
docker rm -f cpu-test-"$NUMA_NODE" || true; docker rm -f cpu-test || true;
} }
trap remove_docker_container EXIT trap remove_docker_container EXIT
remove_docker_container remove_docker_container
# Try building the docker image # Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu . docker build --tag cpu-test --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel. # Run the image
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" docker run -itd --cpuset-cpus="$CORE_RANGE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test cpu-test
function cpu_tests() { function cpu_tests() {
set -e set -e
export NUMA_NODE=$2
docker exec cpu-test-"$NUMA_NODE" bash -c " docker exec cpu-test bash -c "
set -e set -e
pip list" pip list"
# offline inference # offline inference
docker exec cpu-test-"$NUMA_NODE" bash -c " docker exec cpu-test bash -c "
set -e set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run kernel tests # Run kernel tests
docker exec cpu-test-"$NUMA_NODE" bash -c " docker exec cpu-test bash -c "
set -e set -e
pytest -x -v -s tests/kernels/test_onednn.py pytest -x -v -s tests/kernels/test_onednn.py
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py" pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"
# basic online serving # basic online serving
docker exec cpu-test-"$NUMA_NODE" bash -c ' docker exec cpu-test bash -c '
set -e set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve meta-llama/Llama-3.2-3B-Instruct --max-model-len 2048 & VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3-0.6B --max-model-len 2048 &
server_pid=$! server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \ vllm bench serve \
--backend vllm \ --backend vllm \
--dataset-name random \ --dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \ --model Qwen/Qwen3-0.6B \
--num-prompts 20 \ --num-prompts 20 \
--endpoint /v1/completions --endpoint /v1/completions
kill -s SIGTERM $server_pid &' kill -s SIGTERM $server_pid &'
@ -61,4 +59,4 @@ function cpu_tests() {
# All of CPU tests are expected to be finished less than 40 mins. # All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests export -f cpu_tests
timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" timeout 2h bash -c cpu_tests

View File

@ -21,8 +21,8 @@ trap remove_docker_container EXIT
remove_docker_container remove_docker_container
# Try building the docker image # Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu . numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu . numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel. # Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"

View File

@ -35,7 +35,7 @@ docker run \
echo $ZE_AFFINITY_MASK echo $ZE_AFFINITY_MASK
pip install tblib==3.1.0 pip install tblib==3.1.0
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager

View File

@ -2,6 +2,28 @@
set -ex set -ex
# ======== part 0: setup ========
BUCKET="vllm-wheels"
INDICES_OUTPUT_DIR="indices"
DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py
PYTHON=${PYTHON_PROG:=python3} # try to read from env var, otherwise use python3
SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
# detect if python3.10+ is available
has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)")
if [[ "$has_new_python" -eq 0 ]]; then
# use new python from docker
docker pull python:3-slim
PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3"
fi
echo "Using python interpreter: $PYTHON"
echo "Python version: $($PYTHON --version)"
# ========= part 1: collect, rename & upload the wheel ==========
# Assume wheels are in artifacts/dist/*.whl # Assume wheels are in artifacts/dist/*.whl
wheel_files=(artifacts/dist/*.whl) wheel_files=(artifacts/dist/*.whl)
@ -10,74 +32,69 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}" echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
exit 1 exit 1
fi fi
# Get the single wheel file
wheel="${wheel_files[0]}" wheel="${wheel_files[0]}"
# Detect architecture and rename 'linux' to appropriate manylinux version # current build image uses ubuntu 20.04, which corresponds to manylinux_2_31
arch=$(uname -m) # refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
if [[ $arch == "x86_64" ]]; then manylinux_version="manylinux_2_31"
manylinux_version="manylinux1"
elif [[ $arch == "aarch64" ]]; then
manylinux_version="manylinux2014"
else
echo "Warning: Unknown architecture $arch, using manylinux1 as default"
manylinux_version="manylinux1"
fi
# Rename 'linux' to the appropriate manylinux version in the wheel filename # Rename 'linux' to the appropriate manylinux version in the wheel filename
if [[ "$wheel" != *"linux"* ]]; then
echo "Error: Wheel filename does not contain 'linux': $wheel"
exit 1
fi
new_wheel="${wheel/linux/$manylinux_version}" new_wheel="${wheel/linux/$manylinux_version}"
mv -- "$wheel" "$new_wheel" mv -- "$wheel" "$new_wheel"
wheel="$new_wheel" wheel="$new_wheel"
echo "Renamed wheel to: $wheel"
# Extract the version from the wheel # Extract the version from the wheel
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
echo "Version: $version" echo "Version in wheel: $version"
pure_version="${version%%+*}"
echo "Pure version (without variant): $pure_version"
normal_wheel="$wheel" # Save the original wheel filename # copy wheel to its own bucket
aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
# If the version contains "dev", rename it to v1.0.0.dev for consistency # ========= part 2: generate and upload indices ==========
if [[ $version == *dev* ]]; then # generate indices for all existing wheels in the commit directory
suffix="${version##*.}" # this script might be run multiple times if there are multiple variants being built
if [[ $suffix == cu* ]]; then # so we need to guarantee there is little chance for "TOCTOU" issues
new_version="1.0.0.dev+${suffix}" # i.e., one process is generating indices while another is uploading a new wheel
else # so we need to ensure no time-consuming operations happen below
new_version="1.0.0.dev"
fi
new_wheel="${wheel/$version/$new_version}"
# use cp to keep both files in the artifacts directory
cp -- "$wheel" "$new_wheel"
wheel="$new_wheel"
version="$new_version"
fi
# Upload the wheel to S3 # list all wheels in the commit directory
python3 .buildkite/generate_index.py --wheel "$normal_wheel" echo "Existing wheels on S3:"
aws s3 ls "$S3_COMMIT_PREFIX"
obj_json="objects.json"
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
mkdir -p "$INDICES_OUTPUT_DIR"
# generate index for this commit # call script to generate indicies for all existing wheels
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" # this indices have relative paths that could work as long as it is next to the wheel directory in s3
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" # i.e., the wheels are always in s3://vllm-wheels/<commit>/
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
if [[ $normal_wheel == *"cu129"* ]]; then if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
# only upload index.html for cu129 wheels (default wheels) as it alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
# is available on both x86 and arm64
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
else else
echo "Skipping index files for non-cu129 wheels" alias_arg=""
fi fi
# generate index for nightly $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
if [[ $normal_wheel == *"cu129"* ]]; then # copy indices to /<commit>/ unconditionally
# only upload index.html for cu129 wheels (default wheels) as it echo "Uploading indices to $S3_COMMIT_PREFIX"
# is available on both x86 and arm64 aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
else # copy to /nightly/ only if it is on the main branch and not a PR
echo "Skipping index files for non-cu129 wheels" if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
echo "Uploading indices to overwrite /nightly/"
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
fi fi
aws s3 cp "$wheel" "s3://vllm-wheels/$version/" # copy to /<pure_version>/ only if it does not have "dev" in the version
aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html" if [[ "$version" != *"dev"* ]]; then
echo "Uploading indices to overwrite /$pure_version/"
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
fi

View File

@ -39,9 +39,9 @@ steps:
# if this test fails, it means the nightly torch version is not compatible with some # if this test fails, it means the nightly torch version is not compatible with some
# of the dependencies. Please check the error message and add the package to whitelist # of the dependencies. Please check the error message and add the package to whitelist
# in /vllm/tools/pre_commit/generate_nightly_torch_test.py # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking grade: Blocking
soft_fail: true soft_fail: true
source_file_dependencies: source_file_dependencies:
- requirements/nightly_torch_test.txt - requirements/nightly_torch_test.txt
@ -50,9 +50,9 @@ steps:
- label: Async Engine, Inputs, Utils, Worker Test # 10min - label: Async Engine, Inputs, Utils, Worker Test # 10min
timeout_in_minutes: 15 timeout_in_minutes: 15
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking grade: Blocking
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/multimodal - tests/multimodal
@ -61,17 +61,18 @@ steps:
- pytest -v -s -m 'not cpu_test' multimodal - pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_ - pytest -v -s utils_
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins - label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
timeout_in_minutes: 10 timeout_in_minutes: 20
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking grade: Blocking
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/test_inputs.py - tests/test_inputs.py
- tests/test_outputs.py - tests/test_outputs.py
- tests/multimodal - tests/multimodal
- tests/standalone_tests/lazy_imports.py - tests/standalone_tests/lazy_imports.py
- tests/tokenizers_
- tests/transformers_utils - tests/transformers_utils
- tests/config - tests/config
no_gpu: true no_gpu: true
@ -80,6 +81,7 @@ steps:
- pytest -v -s test_inputs.py - pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py - pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal - pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s tokenizers_
- pytest -v -s transformers_utils - pytest -v -s transformers_utils
- pytest -v -s config - pytest -v -s config
@ -113,9 +115,9 @@ steps:
- pytest -v -s basic_correctness/test_cpu_offload.py - pytest -v -s basic_correctness/test_cpu_offload.py
- label: Entrypoints Unit Tests # 5min - label: Entrypoints Unit Tests # 5min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking grade: Blocking
timeout_in_minutes: 10 timeout_in_minutes: 10
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
fast_check: true fast_check: true
@ -212,6 +214,7 @@ steps:
# test with internal dp # test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
@ -250,9 +253,9 @@ steps:
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
- label: EPLB Algorithm Test # 5min - label: EPLB Algorithm Test # 5min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking grade: Blocking
timeout_in_minutes: 15 timeout_in_minutes: 15
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
source_file_dependencies: source_file_dependencies:
@ -308,23 +311,20 @@ steps:
- pytest -v -s test_regression.py - pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional working_dir: "/vllm-workspace/tests" # optional
- label: Engine Test # 25min - label: Engine Test # 9min
timeout_in_minutes: 40 timeout_in_minutes: 15
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/engine - tests/engine
- tests/tokenization
- tests/test_sequence - tests/test_sequence
- tests/test_config - tests/test_config
- tests/test_logger - tests/test_logger
- tests/test_vllm_port - tests/test_vllm_port
commands: commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
# OOM in the CI unless we run this separately
- pytest -v -s tokenization
- label: V1 Test e2e + engine # 30min - label: V1 Test e2e + engine # 30min
timeout_in_minutes: 45 timeout_in_minutes: 45
@ -342,9 +342,9 @@ steps:
- label: V1 Test entrypoints # 35min - label: V1 Test entrypoints # 35min
timeout_in_minutes: 50 timeout_in_minutes: 50
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking grade: Blocking
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/v1 - tests/v1
@ -392,6 +392,20 @@ steps:
commands: commands:
- pytest -v -s v1/attention - pytest -v -s v1/attention
- label: Batch Invariance Tests (H100) # 10min
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
timeout_in_minutes: 25
gpu: h100
source_file_dependencies:
- vllm/
- tests/v1/determinism/
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pip install pytest-timeout pytest-forked
- pytest -v -s v1/determinism/test_batch_invariance.py
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
- label: V1 Test attention (B200) # 10min - label: V1 Test attention (B200) # 10min
timeout_in_minutes: 30 timeout_in_minutes: 30
gpu: b200 gpu: b200
@ -402,9 +416,9 @@ steps:
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
- label: V1 Test others (CPU) # 5 mins - label: V1 Test others (CPU) # 5 mins
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking grade: Blocking
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/v1 - tests/v1
@ -496,7 +510,7 @@ steps:
- label: PyTorch Compilation Unit Tests # 15min - label: PyTorch Compilation Unit Tests # 15min
timeout_in_minutes: 30 timeout_in_minutes: 30
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
torch_nightly: true torch_nightly: true
@ -513,7 +527,7 @@ steps:
- label: PyTorch Fullgraph Smoke Test # 15min - label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30 timeout_in_minutes: 30
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
torch_nightly: true torch_nightly: true
@ -569,7 +583,7 @@ steps:
- label: Kernels Attention Test %N # 23min - label: Kernels Attention Test %N # 23min
timeout_in_minutes: 35 timeout_in_minutes: 35
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_8 agent_pool: mi325_8
# grade: Blocking # grade: Blocking
source_file_dependencies: source_file_dependencies:
@ -596,7 +610,7 @@ steps:
- label: Kernels MoE Test %N # 40min - label: Kernels MoE Test %N # 40min
timeout_in_minutes: 60 timeout_in_minutes: 60
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_8 agent_pool: mi325_8
# grade: Blocking # grade: Blocking
source_file_dependencies: source_file_dependencies:
@ -623,6 +637,26 @@ steps:
commands: commands:
- pytest -v -s kernels/mamba - pytest -v -s kernels/mamba
- label: Kernels DeepGEMM Test (H100) # Nvidia-centric
# Not replicating for CUTLAS & CuTe
timeout_in_minutes: 45
gpu: h100
num_gpus: 1
source_file_dependencies:
- tools/install_deepgemm.sh
- vllm/utils/deep_gemm.py
- vllm/model_executor/layers/fused_moe
- vllm/model_executor/layers/quantization
- tests/kernels/quantization/test_block_fp8.py
- tests/kernels/moe/test_deepgemm.py
- tests/kernels/moe/test_batched_deepgemm.py
- tests/kernels/attention/test_deepgemm_attention.py
commands:
- pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
- pytest -v -s kernels/moe/test_deepgemm.py
- pytest -v -s kernels/moe/test_batched_deepgemm.py
- pytest -v -s kernels/attention/test_deepgemm_attention.py
- label: Model Executor Test # 23min - label: Model Executor Test # 23min
timeout_in_minutes: 35 timeout_in_minutes: 35
torch_nightly: true torch_nightly: true
@ -681,6 +715,7 @@ steps:
# we can only upgrade after this is resolved # we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment # TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0 - uv pip install --system torchao==0.13.0
- uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: LM Eval Small Models # 15min - label: LM Eval Small Models # 15min
@ -900,6 +935,18 @@ steps:
commands: commands:
- pytest -v -s models/language/pooling_mteb_test - pytest -v -s models/language/pooling_mteb_test
- label: Multi-Modal Processor Test (CPU)
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
source_file_dependencies:
- vllm/
- tests/models/multimodal
no_gpu: true
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
- label: Multi-Modal Processor Test # 44min - label: Multi-Modal Processor Test # 44min
timeout_in_minutes: 60 timeout_in_minutes: 60
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
@ -1056,6 +1103,7 @@ steps:
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py - pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
- label: Blackwell Fusion and Compile Tests # 30 min - label: Blackwell Fusion and Compile Tests # 30 min
timeout_in_minutes: 40 timeout_in_minutes: 40
@ -1065,11 +1113,19 @@ steps:
- csrc/quantization/fp4/ - csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py - vllm/v1/attention/backends/flashinfer.py
- vllm/v1/worker/
- vllm/v1/cudagraph_dispatcher.py
- vllm/compilation/ - vllm/compilation/
# can affect pattern matching # can affect pattern matching
- vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py - vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py - vllm/model_executor/layers/quantization/input_quant_fp8.py
- vllm/model_executor/layers/fused_moe/layer.py
- tests/compile/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py
- tests/compile/distributed/test_fusions_e2e.py
- tests/compile/fullgraph/test_full_graph.py
commands: commands:
- nvidia-smi - nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py - pytest -v -s tests/compile/test_fusion_attn.py
@ -1080,7 +1136,7 @@ steps:
# Wrap with quotes to escape yaml # Wrap with quotes to escape yaml
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- pytest -v -s tests/compile/distributed/test_full_graph.py::test_fp8_kv_scale_compile - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
- label: Blackwell Fusion E2E Tests # 30 min - label: Blackwell Fusion E2E Tests # 30 min
timeout_in_minutes: 40 timeout_in_minutes: 40
@ -1102,7 +1158,7 @@ steps:
commands: commands:
- nvidia-smi - nvidia-smi
# Run all e2e fusion tests # Run all e2e fusion tests
- pytest -v -s tests/compile/test_fusions_e2e.py - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
- label: ROCm GPT-OSS Eval - label: ROCm GPT-OSS Eval
timeout_in_minutes: 60 timeout_in_minutes: 60
@ -1217,6 +1273,7 @@ steps:
- tests/v1/worker/test_worker_memory_snapshot.py - tests/v1/worker/test_worker_memory_snapshot.py
commands: commands:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_collective_rpc.py
@ -1252,7 +1309,7 @@ steps:
- label: Plugin Tests (2 GPUs) # 40min - label: Plugin Tests (2 GPUs) # 40min
timeout_in_minutes: 60 timeout_in_minutes: 60
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_2 agent_pool: mi325_2
# grade: Blocking # grade: Blocking
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
@ -1328,7 +1385,7 @@ steps:
- label: Weight Loading Multiple GPU Test # 33min - label: Weight Loading Multiple GPU Test # 33min
timeout_in_minutes: 45 timeout_in_minutes: 45
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_2 agent_pool: mi325_2
# grade: Blocking # grade: Blocking
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
@ -1428,14 +1485,14 @@ steps:
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
num_gpus: 2 num_gpus: 2
commands: commands:
- pytest -v -s tests/compile/distributed/test_async_tp.py - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
#- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- pytest -v -s tests/compile/distributed/test_sequence_parallel.py - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - HIP_VISIBLE_DEVICES=0,1 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test ##### ##### B200 test #####
@ -1465,7 +1522,7 @@ steps:
- bash .buildkite/scripts/run-prime-rl-test.sh - bash .buildkite/scripts/run-prime-rl-test.sh
- label: DeepSeek V2-Lite Accuracy - label: DeepSeek V2-Lite Accuracy
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4 agent_pool: mi325_4
# grade: Blocking # grade: Blocking
timeout_in_minutes: 60 timeout_in_minutes: 60
@ -1476,8 +1533,8 @@ steps:
commands: commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
- label: Qwen3-30B-A3B-FP8-block Accuracy - label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4 agent_pool: mi325_4
# grade: Blocking # grade: Blocking
timeout_in_minutes: 60 timeout_in_minutes: 60
@ -1487,3 +1544,12 @@ steps:
working_dir: "/vllm-workspace" working_dir: "/vllm-workspace"
commands: commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
timeout_in_minutes: 60
gpu: b200
optional: true
num_gpus: 2
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1

View File

@ -57,14 +57,15 @@ steps:
- pytest -v -s -m 'not cpu_test' multimodal - pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_ - pytest -v -s utils_
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins - label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
timeout_in_minutes: 10 timeout_in_minutes: 20
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/test_inputs.py - tests/test_inputs.py
- tests/test_outputs.py - tests/test_outputs.py
- tests/multimodal - tests/multimodal
- tests/standalone_tests/lazy_imports.py - tests/standalone_tests/lazy_imports.py
- tests/tokenizers_
- tests/transformers_utils - tests/transformers_utils
- tests/config - tests/config
no_gpu: true no_gpu: true
@ -73,6 +74,7 @@ steps:
- pytest -v -s test_inputs.py - pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py - pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal - pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s tokenizers_
- pytest -v -s transformers_utils - pytest -v -s transformers_utils
- pytest -v -s config - pytest -v -s config
@ -192,6 +194,7 @@ steps:
# test with internal dp # test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
@ -275,21 +278,18 @@ steps:
- pytest -v -s test_regression.py - pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional working_dir: "/vllm-workspace/tests" # optional
- label: Engine Test # 25min - label: Engine Test # 9min
timeout_in_minutes: 40 timeout_in_minutes: 15
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/engine - tests/engine
- tests/tokenization
- tests/test_sequence - tests/test_sequence
- tests/test_config - tests/test_config
- tests/test_logger - tests/test_logger
- tests/test_vllm_port - tests/test_vllm_port
commands: commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
# OOM in the CI unless we run this separately
- pytest -v -s tokenization
- label: V1 Test e2e + engine # 30min - label: V1 Test e2e + engine # 30min
timeout_in_minutes: 45 timeout_in_minutes: 45
@ -390,20 +390,24 @@ steps:
- examples/ - examples/
commands: commands:
- pip install tensorizer # for tensorizer test - pip install tensorizer # for tensorizer test
# for basic
- python3 offline_inference/basic/chat.py
- python3 offline_inference/basic/generate.py --model facebook/opt-125m - python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 offline_inference/basic/chat.py
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py - python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py - python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py - python3 offline_inference/basic/score.py
# for multi-modal models
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
# for pooling models
- python3 pooling/pooling/vision_language_pooling.py --seed 0
# for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@ -631,6 +635,7 @@ steps:
# we can only upgrade after this is resolved # we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment # TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
- uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: LM Eval Small Models # 53min - label: LM Eval Small Models # 53min
@ -818,14 +823,24 @@ steps:
commands: commands:
- pytest -v -s models/language/pooling_mteb_test - pytest -v -s models/language/pooling_mteb_test
- label: Multi-Modal Processor Test # 44min - label: Multi-Modal Processor Test (CPU)
timeout_in_minutes: 60
source_file_dependencies:
- vllm/
- tests/models/multimodal
no_gpu: true
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
- label: Multi-Modal Processor Test
timeout_in_minutes: 60 timeout_in_minutes: 60
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/multimodal - tests/models/multimodal
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing - pytest -v -s models/multimodal/processing/test_tensor_schema.py
- label: Multi-Modal Models Test (Standard) # 60min - label: Multi-Modal Models Test (Standard) # 60min
timeout_in_minutes: 80 timeout_in_minutes: 80
@ -902,11 +917,12 @@ steps:
- label: Transformers Nightly Models Test - label: Transformers Nightly Models Test
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
optional: true optional: true
soft_fail: true
commands: commands:
- pip install --upgrade git+https://github.com/huggingface/transformers - pip install --upgrade git+https://github.com/huggingface/transformers
- pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)' - pytest -v -s tests/models/test_initialization.py
- pytest -v -s tests/models/test_transformers.py - pytest -v -s tests/models/test_transformers.py
# - pytest -v -s tests/models/multimodal/processing/ - pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py - pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py - python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
@ -1116,6 +1132,7 @@ steps:
# https://github.com/NVIDIA/nccl/issues/1838 # https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0 - export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_collective_rpc.py
@ -1299,11 +1316,11 @@ steps:
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
num_gpus: 2 num_gpus: 2
commands: commands:
- pytest -v -s tests/compile/distributed/test_async_tp.py - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- pytest -v -s tests/distributed/test_sequence_parallel.py - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py

3
.github/CODEOWNERS vendored
View File

@ -146,9 +146,10 @@ mkdocs.yaml @hmellor
/requirements/kv_connectors.txt @NickLucche /requirements/kv_connectors.txt @NickLucche
# Pooling models # Pooling models
/examples/*/pooling/ @noooop /examples/pooling @noooop
/tests/models/*/pooling* @noooop /tests/models/*/pooling* @noooop
/tests/entrypoints/pooling @noooop /tests/entrypoints/pooling @noooop
/vllm/entrypoints/pooling @noooop
/vllm/config/pooler.py @noooop /vllm/config/pooler.py @noooop
/vllm/pooling_params.py @noooop /vllm/pooling_params.py @noooop
/vllm/model_executor/layers/pooler.py @noooop /vllm/model_executor/layers/pooler.py @noooop

View File

@ -13,10 +13,10 @@ jobs:
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Set up Python - name: Set up Python
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with: with:
python-version: '3.12' python-version: '3.12'

View File

@ -105,6 +105,31 @@ jobs:
} }
], ],
}, },
cpu: {
// Keyword search - matches whole words only (with word boundaries)
keywords: [
{
term: "CPU Backend",
searchIn: "title"
},
{
term: "x86",
searchIn: "title"
},
{
term: "ARM",
searchIn: "title"
},
{
term: "Apple Silicon",
searchIn: "title"
},
{
term: "IBM Z",
searchIn: "title"
},
],
},
// Add more label configurations here as needed // Add more label configurations here as needed
// example: { // example: {
// keywords: [...], // keywords: [...],

View File

@ -12,7 +12,7 @@ jobs:
timeout-minutes: 30 timeout-minutes: 30
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v6
- uses: astral-sh/setup-uv@v7 - uses: astral-sh/setup-uv@v7
with: with:

View File

@ -16,8 +16,8 @@ jobs:
pre-commit: pre-commit:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with: with:
python-version: "3.12" python-version: "3.12"
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"

View File

@ -136,7 +136,7 @@ elseif(HIP_FOUND)
# ROCm 5.X and 6.X # ROCm 5.X and 6.X
if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM}) Torch_VERSION VERSION_LESS ${TORCH_SUPPORTED_VERSION_ROCM})
message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} " message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
"expected for ROCm build, saw ${Torch_VERSION} instead.") "expected for ROCm build, saw ${Torch_VERSION} instead.")
endif() endif()
@ -354,8 +354,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Only build Marlin kernels if we are building for at least some compatible archs. # Only build Marlin kernels if we are building for at least some compatible archs.
# Keep building Marlin for 9.0 as there are some group sizes and shapes that # Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet. # are not supported by Machete yet.
# 9.0 for latest bf16 atomicAdd PTX
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}") # marlin arches for fp16 output
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
# marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
# marlin arches for fp8 input
# - sm80 doesn't support fp8 computation
# - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
# so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
if (MARLIN_ARCHS) if (MARLIN_ARCHS)
# #
@ -365,16 +374,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(MARLIN_GEN_SCRIPT set(MARLIN_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py) ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH) file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}") message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}") message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH} if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH}) OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
execute_process( execute_process(
COMMAND ${CMAKE_COMMAND} -E env COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=$PYTHONPATH PYTHONPATH=$PYTHONPATH
${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
RESULT_VARIABLE marlin_generation_result RESULT_VARIABLE marlin_generation_result
OUTPUT_VARIABLE marlin_generation_result OUTPUT_VARIABLE marlin_generation_result
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
@ -387,15 +398,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"\nCheck the log for details: " "\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log") "${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
else() else()
set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH} set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
CACHE STRING "Last run Marlin generate script hash" FORCE) CACHE STRING "Last run Marlin generate script hash and arch" FORCE)
message(STATUS "Marlin generation completed successfully.") message(STATUS "Marlin generation completed successfully.")
endif() endif()
else() else()
message(STATUS "Marlin generation script has not changed, skipping generation.") message(STATUS "Marlin generation script has not changed, skipping generation.")
endif() endif()
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu") file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}" SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
CUDA_ARCHS "${MARLIN_ARCHS}") CUDA_ARCHS "${MARLIN_ARCHS}")
@ -403,12 +414,34 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC} set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif() endif()
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC}) list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
set_source_files_properties(${MARLIN_TEMPLATE_BF16_KERNEL_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif()
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
if (MARLIN_FP8_ARCHS)
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
CUDA_ARCHS "${MARLIN_FP8_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
set_source_files_properties(${MARLIN_TEMPLATE_FP8_KERNEL_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif()
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_FP8_KERNEL_SRC})
endif()
set(MARLIN_SRCS set(MARLIN_SRCS
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu" "csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu") "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
@ -604,12 +637,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(SRCS set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu" "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
"csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu" "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu") "csrc/quantization/fp4/nvfp4_experts_quant.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu"
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
CUDA_ARCHS "${FP4_ARCHS}") CUDA_ARCHS "${FP4_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}") list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1") list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM120=1")
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}") message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
else() else()
message(STATUS "Not building NVFP4 as no compatible archs were found.") message(STATUS "Not building NVFP4 as no compatible archs were found.")
@ -938,8 +974,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
CUDA_ARCHS "${CUDA_ARCHS}") CUDA_ARCHS "${CUDA_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}") list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
# 9.0 for latest bf16 atomicAdd PTX # moe marlin arches
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}") # note that we always set `use_atomic_add=False` for moe marlin now,
# so we don't need 9.0 for bf16 atomicAdd PTX
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
# moe marlin arches for fp8 input
# - sm80 doesn't support fp8 computation
# - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
# so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS) if (MARLIN_MOE_ARCHS)
# #
@ -949,16 +992,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(MOE_MARLIN_GEN_SCRIPT set(MOE_MARLIN_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py) ${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH) file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MOE_MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}") message(STATUS "Marlin MOE generation script hash with arch: ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}") message(STATUS "Last run Marlin MOE generate script hash with arch: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH}) OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
execute_process( execute_process(
COMMAND ${CMAKE_COMMAND} -E env COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=$PYTHONPATH PYTHONPATH=$PYTHONPATH
${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
RESULT_VARIABLE moe_marlin_generation_result RESULT_VARIABLE moe_marlin_generation_result
OUTPUT_VARIABLE moe_marlin_generation_output OUTPUT_VARIABLE moe_marlin_generation_output
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
@ -971,7 +1016,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"\nCheck the log for details: " "\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log") "${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
else() else()
set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH} set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
CACHE STRING "Last run Marlin MOE generate script hash" FORCE) CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
message(STATUS "Marlin MOE generation completed successfully.") message(STATUS "Marlin MOE generation completed successfully.")
endif() endif()
@ -979,16 +1024,28 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Marlin MOE generation script has not changed, skipping generation.") message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
endif() endif()
file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu") file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu")
list(APPEND MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/ops.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${MOE_WNAA16_MARLIN_SRC}" SRCS "${MARLIN_MOE_SRC}"
CUDA_ARCHS "${MARLIN_MOE_ARCHS}") CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
set_source_files_properties(${MOE_WNAA16_MARLIN_SRC} set_source_files_properties(${MARLIN_MOE_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif() endif()
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC}) if (MARLIN_MOE_FP8_ARCHS)
file(GLOB MARLIN_MOE_FP8_SRC "csrc/moe/marlin_moe_wna16/sm89_kernel_*.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_MOE_FP8_SRC}"
CUDA_ARCHS "${MARLIN_MOE_FP8_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
set_source_files_properties(${MARLIN_MOE_FP8_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif()
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_FP8_SRC})
endif()
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}") message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
else() else()

View File

@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
*Latest News* 🔥 *Latest News* 🔥
- [2025/11] We hosted [vLLM Bangkok Meetup](https://luma.com/v0f647nv). We explored vLLM and LMCache inference and low-resource language adaptation with speakers from Embedded LLM, AMD, and Red Hat. Please find the meetup slides [here](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing).
- [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI) - [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
- [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link). - [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
- [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6). - [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).

View File

@ -83,7 +83,7 @@ MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
``` ```
#### 2. Maximize Throughput with a Latency Requirement ### 2. Maximize Throughput with a Latency Requirement
- **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms. - **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms.
- **Configuration**: - **Configuration**:
@ -96,7 +96,7 @@ MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=500 MAX_LATENCY_ALLOWED_MS=500
``` ```
#### 3. Maximize Throughput with Prefix Caching and Latency Requirements ### 3. Maximize Throughput with Prefix Caching and Latency Requirements
- **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms. - **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms.
- **Configuration**: - **Configuration**:

View File

@ -620,7 +620,7 @@ def get_tokenizer(
kwargs["use_fast"] = False kwargs["use_fast"] = False
if tokenizer_mode == "mistral": if tokenizer_mode == "mistral":
try: try:
from vllm.transformers_utils.tokenizer import MistralTokenizer from vllm.tokenizers import MistralTokenizer
except ImportError as e: except ImportError as e:
raise ImportError( raise ImportError(
"MistralTokenizer requires vllm package.\n" "MistralTokenizer requires vllm package.\n"

View File

@ -108,7 +108,10 @@ def benchmark_batched_propose(args):
device_config=DeviceConfig(device=current_platform.device_type), device_config=DeviceConfig(device=current_platform.device_type),
parallel_config=ParallelConfig(), parallel_config=ParallelConfig(),
load_config=LoadConfig(), load_config=LoadConfig(),
scheduler_config=SchedulerConfig(), scheduler_config=SchedulerConfig(
max_model_len=model_config.max_model_len,
is_encoder_decoder=model_config.is_encoder_decoder,
),
) )
# monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group # monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group

View File

@ -40,7 +40,7 @@ from vllm.engine.arg_utils import EngineArgs
from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.argparse_utils import FlexibleArgumentParser
try: try:
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.tokenizers import get_tokenizer
except ImportError: except ImportError:
from backend_request_func import get_tokenizer from backend_request_func import get_tokenizer

View File

@ -46,7 +46,7 @@ from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase from transformers import PreTrainedTokenizerBase
try: try:
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.tokenizers import get_tokenizer
except ImportError: except ImportError:
from backend_request_func import get_tokenizer from backend_request_func import get_tokenizer

View File

@ -237,6 +237,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
b_q_weight=w_q, b_q_weight=w_q,
b_bias=None, b_bias=None,
b_scales=w_s, b_scales=w_s,
a_scales=None,
global_scale=None, global_scale=None,
b_zeros=w_zp, b_zeros=w_zp,
g_idx=g_idx, g_idx=g_idx,

View File

@ -263,7 +263,7 @@ def bench_run(
results.append( results.append(
benchmark.Timer( benchmark.Timer(
stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501 stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
globals=globals, globals=globals,
label=label, label=label,
sub_label=sub_label, sub_label=sub_label,
@ -273,7 +273,7 @@ def bench_run(
results.append( results.append(
benchmark.Timer( benchmark.Timer(
stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501 stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
globals=globals, globals=globals,
label=label, label=label,
sub_label=sub_label, sub_label=sub_label,

View File

@ -495,7 +495,13 @@ function (define_extension_target MOD_NAME)
set(SOABI_KEYWORD "") set(SOABI_KEYWORD "")
endif() endif()
if (ARG_USE_SABI) run_python(IS_FREETHREADED_PYTHON
"import sysconfig; print(1 if sysconfig.get_config_var(\"Py_GIL_DISABLED\") else 0)"
"Failed to determine whether interpreter is free-threaded")
# Free-threaded Python doesn't yet support the stable ABI (see PEP 803/809),
# so avoid using the stable ABI under free-threading only.
if (ARG_USE_SABI AND NOT IS_FREETHREADED_PYTHON)
Python_add_library(${MOD_NAME} MODULE USE_SABI ${ARG_USE_SABI} ${SOABI_KEYWORD} "${ARG_SOURCES}") Python_add_library(${MOD_NAME} MODULE USE_SABI ${ARG_USE_SABI} ${SOABI_KEYWORD} "${ARG_SOURCES}")
else() else()
Python_add_library(${MOD_NAME} MODULE ${SOABI_KEYWORD} "${ARG_SOURCES}") Python_add_library(${MOD_NAME} MODULE ${SOABI_KEYWORD} "${ARG_SOURCES}")

View File

@ -16,7 +16,8 @@ __global__ void merge_attn_states_kernel(
scalar_t* output, float* output_lse, const scalar_t* prefix_output, scalar_t* output, float* output_lse, const scalar_t* prefix_output,
const float* prefix_lse, const scalar_t* suffix_output, const float* prefix_lse, const scalar_t* suffix_output,
const float* suffix_lse, const uint num_tokens, const uint num_heads, const float* suffix_lse, const uint num_tokens, const uint num_heads,
const uint head_size) { const uint head_size, const uint prefix_head_stride,
const uint output_head_stride) {
using pack_128b_t = uint4; using pack_128b_t = uint4;
const uint pack_size = 16 / sizeof(scalar_t); const uint pack_size = 16 / sizeof(scalar_t);
const uint threads_per_head = head_size / pack_size; const uint threads_per_head = head_size / pack_size;
@ -34,11 +35,13 @@ __global__ void merge_attn_states_kernel(
const uint head_idx = token_head_idx % num_heads; const uint head_idx = token_head_idx % num_heads;
const uint pack_offset = pack_idx * pack_size; // (0~15)*8, etc. const uint pack_offset = pack_idx * pack_size; // (0~15)*8, etc.
const uint head_offset = const uint src_head_offset = token_idx * num_heads * prefix_head_stride +
token_idx * num_heads * head_size + head_idx * head_size; head_idx * prefix_head_stride;
const scalar_t* prefix_head_ptr = prefix_output + head_offset; const uint dst_head_offset = token_idx * num_heads * output_head_stride +
const scalar_t* suffix_head_ptr = suffix_output + head_offset; head_idx * output_head_stride;
scalar_t* output_head_ptr = output + head_offset; const scalar_t* prefix_head_ptr = prefix_output + src_head_offset;
const scalar_t* suffix_head_ptr = suffix_output + src_head_offset;
scalar_t* output_head_ptr = output + dst_head_offset;
float p_lse = prefix_lse[head_idx * num_tokens + token_idx]; float p_lse = prefix_lse[head_idx * num_tokens + token_idx];
float s_lse = suffix_lse[head_idx * num_tokens + token_idx]; float s_lse = suffix_lse[head_idx * num_tokens + token_idx];
@ -140,7 +143,7 @@ __global__ void merge_attn_states_kernel(
reinterpret_cast<float*>(prefix_lse.data_ptr()), \ reinterpret_cast<float*>(prefix_lse.data_ptr()), \
reinterpret_cast<scalar_t*>(suffix_output.data_ptr()), \ reinterpret_cast<scalar_t*>(suffix_output.data_ptr()), \
reinterpret_cast<float*>(suffix_lse.data_ptr()), num_tokens, \ reinterpret_cast<float*>(suffix_lse.data_ptr()), num_tokens, \
num_heads, head_size); \ num_heads, head_size, prefix_head_stride, output_head_stride); \
} }
/*@brief Merges the attention states from prefix and suffix /*@brief Merges the attention states from prefix and suffix
@ -166,17 +169,11 @@ void merge_attn_states_launcher(torch::Tensor& output,
const uint num_tokens = output.size(0); const uint num_tokens = output.size(0);
const uint num_heads = output.size(1); const uint num_heads = output.size(1);
const uint head_size = output.size(2); const uint head_size = output.size(2);
const uint prefix_head_stride = prefix_output.stride(1);
const uint output_head_stride = output.stride(1);
const uint pack_size = 16 / sizeof(scalar_t); const uint pack_size = 16 / sizeof(scalar_t);
TORCH_CHECK(head_size % pack_size == 0, TORCH_CHECK(head_size % pack_size == 0,
"headsize must be multiple of pack_size:", pack_size); "headsize must be multiple of pack_size:", pack_size);
TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1,
"output heads must be contiguous in memory");
TORCH_CHECK(
prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1,
"prefix_output heads must be contiguous in memory");
TORCH_CHECK(
suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1,
"suffix_output heads must be contiguous in memory");
float* output_lse_ptr = nullptr; float* output_lse_ptr = nullptr;
if (output_lse.has_value()) { if (output_lse.has_value()) {
output_lse_ptr = output_lse.value().data_ptr<float>(); output_lse_ptr = output_lse.value().data_ptr<float>();

View File

@ -51,12 +51,13 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
if (node_id != -1) { if (node_id != -1) {
node_ids.insert(node_id); node_ids.insert(node_id);
} }
TORCH_WARN(node_id == mem_node_id, "CPU ", cpu_id, " is on NUMA node ", if (node_id != mem_node_id) {
node_id, ", but CPU ", omp_cpu_ids.front(), TORCH_WARN("CPU ", cpu_id, " is on NUMA node ", node_id, ", but CPU ",
" is on NUMA node ", mem_node_id, omp_cpu_ids.front(), " is on NUMA node ", mem_node_id,
". All CPUs should be on the same NUMA node for optimal " ". All CPUs should be on the same NUMA node for optimal "
"performance. Memory will be bound to NUMA node ", "performance. Memory will be bound to NUMA node ",
mem_node_id, "."); mem_node_id, ".");
}
} }
// Concatenate all node_ids into a single comma-separated string // Concatenate all node_ids into a single comma-separated string
if (!node_ids.empty()) { if (!node_ids.empty()) {

View File

@ -93,16 +93,16 @@ torch::Tensor dynamic_4bit_int_moe_cpu(
} }
auto Y_all = at::empty({offsets[E], H}, x_c.options()); auto Y_all = at::empty({offsets[E], H}, x_c.options());
at::parallel_for(0, E, 1, [&](int64_t e_begin, int64_t e_end) { at::parallel_for(0, offsets[E], 0, [&](int64_t idx_begin, int64_t idx_end) {
c10::InferenceMode guard; c10::InferenceMode guard;
for (int64_t e = e_begin; e < e_end; ++e) { for (int64_t e = 0; e < E; ++e) {
const int64_t te = counts[e]; int64_t start = std::max(offsets[e], idx_begin);
if (te == 0) { int64_t end = std::min(offsets[e + 1], idx_end);
int64_t te = end - start;
if (te <= 0) {
continue; continue;
} }
const int64_t start = offsets[e];
auto x_e = X_all.narrow(/*dim=*/0, /*start=*/start, /*length=*/te); auto x_e = X_all.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);
auto w13_e = w13_packed.select(/*dim=*/0, e); auto w13_e = w13_packed.select(/*dim=*/0, e);

View File

@ -1 +1,2 @@
kernel_*.cu sm*_kernel_*.cu
kernel_selector.h

View File

@ -4,134 +4,282 @@ import glob
import itertools import itertools
import os import os
import subprocess import subprocess
import sys
import jinja2 import jinja2
FILE_HEAD = """ ARCHS = []
// auto generated by generate.py SUPPORT_FP8 = False
// clang-format off for arch in sys.argv[1].split(","):
arch = arch[: arch.index(".") + 2].replace(".", "")
arch = int(arch)
# only SM89 and SM120 fully support
# mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32.
# SM90 and SM100 can use this PTX, but its simulated
# with FP16 MMA, so it cannot achieve any acceleration.
if arch in [89, 120]:
SUPPORT_FP8 = True
FILE_HEAD_COMMENT = """
// auto generated by generate_kernels.py
// clang-format off
""".lstrip()
FILE_HEAD = (
FILE_HEAD_COMMENT
+ """
#include "kernel.h" #include "kernel.h"
#include "marlin_template.h" #include "marlin_template.h"
namespace MARLIN_NAMESPACE_NAME { namespace MARLIN_NAMESPACE_NAME {
""".strip() """
)
TEMPLATE = ( TEMPLATE = (
"template __global__ void Marlin<" "template __global__ void Marlin<"
"{{scalar_t}}, " "{{a_type_id}}, "
"{{w_type_id}}, " "{{b_type_id}}, "
"{{c_type_id}}, "
"{{s_type_id}}, " "{{s_type_id}}, "
"{{threads}}, " "{{threads}}, "
"{{thread_m_blocks}}, " "{{thread_m_blocks}}, "
"{{thread_n_blocks}}, " "{{thread_n_blocks}}, "
"{{thread_k_blocks}}, " "{{thread_k_blocks}}, "
"{{'true' if m_block_size_8 else 'false'}}, " "{{m_block_size_8}}, "
"{{stages}}, " "{{stages}}, "
"{{group_blocks}}, " "{{group_blocks}}, "
"{{'true' if is_zp_float else 'false'}}>" "{{is_zp_float}}>"
"( MARLIN_KERNEL_PARAMS );" "( MARLIN_KERNEL_PARAMS );"
) )
# int8 with zero point case (vllm::kU8) is also supported,
# we don't add it to reduce wheel size.
SCALAR_TYPES = [
"vllm::kU4",
"vllm::kU4B8",
"vllm::kU8B128",
"vllm::kFE4M3fn",
"vllm::kFE2M1f",
]
THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)] THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]
THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4] THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]
# group_blocks:
# = 0 : act order case QUANT_CONFIGS = [
# = -1 : channelwise quantization # AWQ-INT4
# > 0 : group_size=16*group_blocks {
GROUP_BLOCKS = [0, -1, 1, 2, 4, 8] "b_type": "kU4",
DTYPES = ["fp16", "bf16"] "thread_configs": THREAD_CONFIGS,
"thread_m_blocks": THREAD_M_BLOCKS,
"group_blocks": [-1, 2, 4, 8],
},
# GPTQ-INT4
{
"b_type": "kU4B8",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": THREAD_M_BLOCKS,
"group_blocks": [-1, 0, 2, 4, 8],
},
# AWQ-INT8
{
"b_type": "kU8B128",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": THREAD_M_BLOCKS,
"group_blocks": [-1, 0, 2, 4, 8],
},
# FP8
{
"b_type": "kFE4M3fn",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": THREAD_M_BLOCKS,
"group_blocks": [-1, 8],
},
# NVFP4
{
"b_type": "kFE2M1f",
"s_type": "kFE4M3fn",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": THREAD_M_BLOCKS,
"group_blocks": [1],
},
# MXFP4
{
"a_type": ["kBFloat16"],
"b_type": "kFE2M1f",
"s_type": "kFE8M0fnu",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": THREAD_M_BLOCKS,
"group_blocks": [2],
},
# AWQ-INT4 with INT8 activation
{
"a_type": ["kS8"],
"b_type": "kU4",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": [1, 2, 3, 4],
"group_blocks": [-1, 2, 4, 8],
},
# GPTQ-INT4 with INT8 activation
{
"a_type": ["kS8"],
"b_type": "kU4B8",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": [1, 2, 3, 4],
"group_blocks": [-1, 2, 4, 8],
},
# GPTQ-INT4 with FP8 activation
{
"a_type": ["kFE4M3fn"],
"b_type": "kU4B8",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": [1, 2, 3, 4],
"group_blocks": [-1, 2, 4, 8],
},
# AWQ-INT4 with FP8 activation
{
"a_type": ["kFE4M3fn"],
"b_type": "kU4",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": [1, 2, 3, 4],
"group_blocks": [-1, 2, 4, 8],
},
# MXFP4 with FP8 activation
{
"a_type": ["kFE4M3fn"],
"b_type": "kFE2M1f",
"c_type": ["kBFloat16"],
"s_type": "kFE8M0fnu",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": [1, 2, 3, 4],
"group_blocks": [2],
},
]
def remove_old_kernels(): def remove_old_kernels():
for filename in glob.glob(os.path.dirname(__file__) + "/kernel_*.cu"): for filename in glob.glob(os.path.dirname(__file__) + "/*kernel_*.cu"):
subprocess.call(["rm", "-f", filename]) subprocess.call(["rm", "-f", filename])
filename = os.path.dirname(__file__) + "/kernel_selector.h"
subprocess.call(["rm", "-f", filename])
def generate_new_kernels(): def generate_new_kernels():
for scalar_type, dtype in itertools.product(SCALAR_TYPES, DTYPES): result_dict = {}
for quant_config in QUANT_CONFIGS:
c_types = quant_config.get("c_type", ["kFloat16", "kBFloat16"])
a_types = quant_config.get("a_type", ["kFloat16", "kBFloat16"])
b_type = quant_config["b_type"]
all_group_blocks = quant_config["group_blocks"]
all_m_blocks = quant_config["thread_m_blocks"]
all_thread_configs = quant_config["thread_configs"]
for a_type, c_type in itertools.product(a_types, c_types):
if not SUPPORT_FP8 and a_type == "kFE4M3fn":
continue
if "16" in a_type and "16" in c_type and a_type != c_type:
continue
s_type = quant_config.get("s_type", c_type)
if (a_type, b_type, c_type) not in result_dict:
result_dict[(a_type, b_type, c_type)] = []
for group_blocks, m_blocks, thread_configs in itertools.product(
all_group_blocks, all_m_blocks, all_thread_configs
):
thread_k, thread_n, threads = thread_configs
if threads == 256:
# for small batch (m_blocks == 1),
# we only need (128, 128, 256)
# for large batch (m_blocks > 1),
# we only need (64, 256, 256)
if m_blocks <= 1 and (thread_k, thread_n) != (128, 128):
continue
if m_blocks > 1 and (thread_k, thread_n) != (64, 256):
continue
config = {
"threads": threads,
"s_type": s_type,
"thread_m_blocks": max(m_blocks, 1),
"thread_k_blocks": thread_k // 16,
"thread_n_blocks": thread_n // 16,
"m_block_size_8": "true" if m_blocks == 0.5 else "false",
"stages": "pipe_stages",
"group_blocks": group_blocks,
"is_zp_float": "false",
}
result_dict[(a_type, b_type, c_type)].append(config)
kernel_selector_str = FILE_HEAD_COMMENT
for (a_type, b_type, c_type), config_list in result_dict.items():
all_template_str_list = [] all_template_str_list = []
for config in config_list:
for group_blocks, m_blocks, thread_configs in itertools.product( s_type = config["s_type"]
GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS
):
# act order case only support gptq-int4 and gptq-int8
if group_blocks == 0 and scalar_type not in [
"vllm::kU4B8",
"vllm::kU8B128",
]:
continue
if thread_configs[2] == 256:
# for small batch (m_blocks == 1), we only need (128, 128, 256)
# for large batch (m_blocks > 1), we only need (64, 256, 256)
if m_blocks <= 1 and thread_configs[0] != 128:
continue
if m_blocks > 1 and thread_configs[0] != 64:
continue
# we only support channelwise quantization and group_size == 128
# for fp8
if scalar_type == "vllm::kFE4M3fn" and group_blocks not in [-1, 8]:
continue
# nvfp4 only supports group_size == 16
# mxfp4 only supports group_size == 32
if scalar_type == "vllm::kFE2M1f" and group_blocks not in [1, 2]:
continue
# other quantization methods don't support group_size = 16
if scalar_type != "vllm::kFE2M1f" and group_blocks == 1:
continue
k_blocks = thread_configs[0] // 16
n_blocks = thread_configs[1] // 16
threads = thread_configs[2]
c_dtype = "half" if dtype == "fp16" else "nv_bfloat16"
if scalar_type == "vllm::kFE2M1f" and group_blocks == 1:
s_type = "vllm::kFE4M3fn"
elif scalar_type == "vllm::kFE2M1f" and group_blocks == 2:
s_type = "vllm::kFE8M0fnu"
if dtype == "fp16":
# we cannot safely dequantize e8m0 to fp16, so skip this
continue
elif dtype == "fp16":
s_type = "vllm::kFloat16"
elif dtype == "bf16":
s_type = "vllm::kBFloat16"
template_str = jinja2.Template(TEMPLATE).render( template_str = jinja2.Template(TEMPLATE).render(
scalar_t=c_dtype, a_type_id=f"vllm::{a_type}.id()",
w_type_id=scalar_type + ".id()", b_type_id=f"vllm::{b_type}.id()",
s_type_id=s_type + ".id()", c_type_id=f"vllm::{c_type}.id()",
threads=threads, s_type_id=f"vllm::{s_type}.id()",
thread_m_blocks=max(m_blocks, 1), **config,
thread_n_blocks=n_blocks, )
thread_k_blocks=k_blocks, all_template_str_list.append(template_str)
m_block_size_8=m_blocks == 0.5,
stages="pipe_stages", conditions = [
group_blocks=group_blocks, f"a_type == vllm::{a_type}",
is_zp_float=False, f"b_type == vllm::{b_type}",
f"c_type == vllm::{c_type}",
f"s_type == vllm::{s_type}",
f"threads == {config['threads']}",
f"thread_m_blocks == {config['thread_m_blocks']}",
f"thread_n_blocks == {config['thread_n_blocks']}",
f"thread_k_blocks == {config['thread_k_blocks']}",
f"m_block_size_8 == {config['m_block_size_8']}",
f"group_blocks == {config['group_blocks']}",
f"is_zp_float == {config['is_zp_float']}",
]
conditions = " && ".join(conditions)
if kernel_selector_str == FILE_HEAD_COMMENT:
kernel_selector_str += f"if ({conditions})\n kernel = "
else:
kernel_selector_str += f"else if ({conditions})\n kernel = "
kernel_template2 = (
"Marlin<{{a_type_id}}, {{b_type_id}}, {{c_type_id}}, "
"{{s_type_id}}, {{threads}}, {{thread_m_blocks}}, "
"{{thread_n_blocks}}, {{thread_k_blocks}}, "
"{{m_block_size_8}}, {{stages}}, {{group_blocks}}, "
"{{is_zp_float}}>;"
) )
all_template_str_list.append(template_str) kernel_selector_str += (
jinja2.Template(kernel_template2).render(
a_type_id=f"vllm::{a_type}.id()",
b_type_id=f"vllm::{b_type}.id()",
c_type_id=f"vllm::{c_type}.id()",
s_type_id=f"vllm::{s_type}.id()",
**config,
)
+ "\n"
)
file_content = FILE_HEAD + "\n\n" file_content = FILE_HEAD + "\n\n"
file_content += "\n\n".join(all_template_str_list) + "\n\n}\n" file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
filename = f"kernel_{dtype}_{scalar_type[6:].lower()}.cu" if a_type == "kFE4M3fn":
filename = f"sm89_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
else:
filename = f"sm80_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
filename = filename.lower()
with open(os.path.join(os.path.dirname(__file__), filename), "w") as f: with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
f.write(file_content) f.write(file_content)
if not SUPPORT_FP8 and kernel_selector_str != FILE_HEAD_COMMENT:
kernel_selector_str += (
"else if (a_type == vllm::kFE4M3fn)\n"
" TORCH_CHECK(false, "
'"marlin kernel with fp8 activation is not built.");'
)
with open(os.path.join(os.path.dirname(__file__), "kernel_selector.h"), "w") as f:
f.write(kernel_selector_str)
if __name__ == "__main__": if __name__ == "__main__":
remove_old_kernels() remove_old_kernels()

View File

@ -11,8 +11,9 @@
const int4 *__restrict__ A, const int4 *__restrict__ B, \ const int4 *__restrict__ A, const int4 *__restrict__ B, \
int4 *__restrict__ C, int4 *__restrict__ C_tmp, \ int4 *__restrict__ C, int4 *__restrict__ C_tmp, \
const int4 *__restrict__ b_bias_ptr, \ const int4 *__restrict__ b_bias_ptr, \
const float *__restrict__ a_scales_ptr, \
const int4 *__restrict__ scales_ptr, \ const int4 *__restrict__ scales_ptr, \
const uint16_t *__restrict__ scale2_ptr, \ const uint16_t *__restrict__ global_scale_ptr, \
const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx, \ const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx, \
const int32_t *__restrict__ sorted_token_ids_ptr, \ const int32_t *__restrict__ sorted_token_ids_ptr, \
const int32_t *__restrict__ expert_ids_ptr, \ const int32_t *__restrict__ expert_ids_ptr, \
@ -20,12 +21,13 @@
const float *__restrict__ topk_weights_ptr, int top_k, \ const float *__restrict__ topk_weights_ptr, int top_k, \
bool mul_topk_weights, bool is_ep, int num_groups, int prob_m, \ bool mul_topk_weights, bool is_ep, int num_groups, int prob_m, \
int prob_n, int prob_k, int *locks, bool has_bias, bool use_atomic_add, \ int prob_n, int prob_k, int *locks, bool has_bias, bool use_atomic_add, \
bool use_fp32_reduce, int max_shared_mem bool use_fp32_reduce
namespace MARLIN_NAMESPACE_NAME { namespace MARLIN_NAMESPACE_NAME {
template <typename scalar_t, // compute dtype, half or nv_float16 template <const vllm::ScalarTypeId a_type_id, // A ScalarType id
const vllm::ScalarTypeId w_type_id, // weight ScalarType id const vllm::ScalarTypeId b_type_id, // B ScalarType id
const vllm::ScalarTypeId s_type_id, // weight scale ScalarType id const vllm::ScalarTypeId c_type_id, // C ScalarType id
const vllm::ScalarTypeId s_type_id, // B_SCALE ScalarType id
const int threads, // number of threads in a threadblock const int threads, // number of threads in a threadblock
const int thread_m_blocks, // number of 16x16 blocks in the m const int thread_m_blocks, // number of 16x16 blocks in the m
// dimension (batchsize) of the // dimension (batchsize) of the

File diff suppressed because it is too large Load Diff

View File

@ -37,39 +37,6 @@ __global__ void MarlinDefault(MARLIN_KERNEL_PARAMS){};
using MarlinFuncPtr = void (*)(MARLIN_KERNEL_PARAMS); using MarlinFuncPtr = void (*)(MARLIN_KERNEL_PARAMS);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
template <int moe_block_size>
__global__ void permute_cols_kernel(
int4 const* __restrict__ a_int4_ptr, int const* __restrict__ perm_int_ptr,
int4* __restrict__ out_int4_ptr,
const int32_t* __restrict__ sorted_token_ids_ptr,
const int32_t* __restrict__ expert_ids_ptr,
const int32_t* __restrict__ num_tokens_past_padded_ptr, int size_m,
int size_k, int top_k) {};
} // namespace marlin
torch::Tensor moe_wna16_marlin_gemm(
torch::Tensor& a, std::optional<torch::Tensor> c_or_none,
torch::Tensor& b_q_weight,
std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales,
std::optional<torch::Tensor> const& b_zeros_or_none,
std::optional<torch::Tensor> const& g_idx_or_none,
std::optional<torch::Tensor> const& perm_or_none, torch::Tensor& workspace,
torch::Tensor& sorted_token_ids, torch::Tensor& expert_ids,
torch::Tensor& num_tokens_past_padded, torch::Tensor& topk_weights,
int64_t moe_block_size, int64_t top_k, bool mul_topk_weights, bool is_ep,
vllm::ScalarTypeId const& b_q_type_id, int64_t size_m, int64_t size_n,
int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce,
bool is_zp_float) {
TORCH_CHECK_NOT_IMPLEMENTED(false,
"marlin_gemm(..) requires CUDA_ARCH >= 8.0");
return torch::empty({1, 1});
}
#else
// For a given "a" of size [M,K] performs a permutation of the K columns based // For a given "a" of size [M,K] performs a permutation of the K columns based
// on the given "perm" indices. // on the given "perm" indices.
template <int moe_block_size> template <int moe_block_size>
@ -207,7 +174,7 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
int thread_m_blocks, int prob_m, int prob_n, int thread_m_blocks, int prob_m, int prob_n,
int prob_k, int num_bits, int group_size, int prob_k, int num_bits, int group_size,
bool has_act_order, bool is_k_full, int has_zp, bool has_act_order, bool is_k_full, int has_zp,
int is_zp_float) { int is_zp_float, bool is_a_8bit) {
int pack_factor = 32 / num_bits; int pack_factor = 32 / num_bits;
// Get B size // Get B size
@ -217,8 +184,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
// shm size for block_sorted_ids/rd_block_sorted_ids/block_topk_weights // shm size for block_sorted_ids/rd_block_sorted_ids/block_topk_weights
// both of them requires tb_m * 4 bytes (tb_m * int32 or tb_m * float32) // both of them requires tb_m * 4 bytes (tb_m * int32 or tb_m * float32)
int sh_block_meta_size = tb_m * 4; int sh_block_meta_size = tb_m * 16;
int sh_a_size = pipe_stages * (tb_m * tb_k) * 2; int sh_a_size = pipe_stages * (tb_m * tb_k) * (is_a_8bit ? 1 : 2);
int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4; int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4;
int sh_red_size = tb_m * (tb_n + 8) * 2; int sh_red_size = tb_m * (tb_n + 8) * 2;
int sh_bias_size = tb_n * 2; int sh_bias_size = tb_n * 2;
@ -250,7 +217,7 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8,
int thread_m_blocks, int prob_m, int prob_n, int prob_k, int thread_m_blocks, int prob_m, int prob_n, int prob_k,
int num_bits, int group_size, bool has_act_order, int num_bits, int group_size, bool has_act_order,
bool is_k_full, int has_zp, int is_zp_float, bool is_k_full, int has_zp, int is_zp_float,
int max_shared_mem) { int max_shared_mem, bool is_a_8bit) {
// Sanity // Sanity
if (th_config.thread_k == -1 || th_config.thread_n == -1 || if (th_config.thread_k == -1 || th_config.thread_n == -1 ||
th_config.num_threads == -1) { th_config.num_threads == -1) {
@ -273,188 +240,34 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8,
} }
// Check that pipeline fits into cache // Check that pipeline fits into cache
int cache_size = get_kernel_cache_size( int cache_size =
th_config, m_block_size_8, thread_m_blocks, prob_m, prob_n, prob_k, get_kernel_cache_size(th_config, m_block_size_8, thread_m_blocks, prob_m,
num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float); prob_n, prob_k, num_bits, group_size, has_act_order,
return cache_size + 512 <= max_shared_mem; is_k_full, has_zp, is_zp_float, is_a_8bit);
return cache_size <= max_shared_mem;
} }
#define _GET_IF(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ MarlinFuncPtr get_marlin_kernel(
M_BLOCK_SIZE_8, GROUP_BLOCKS, NUM_THREADS, IS_ZP_FLOAT) \ const vllm::ScalarType a_type, const vllm::ScalarType b_type,
else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS && \ const vllm::ScalarType c_type, const vllm::ScalarType s_type,
thread_n_blocks == THREAD_N_BLOCKS && \ int thread_m_blocks, int thread_n_blocks, int thread_k_blocks,
thread_k_blocks == THREAD_K_BLOCKS && \ bool m_block_size_8, bool has_act_order, bool has_zp, int group_blocks,
m_block_size_8 == M_BLOCK_SIZE_8 && \ int threads, bool is_zp_float) {
group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS && \ int num_bits = b_type.size_bits();
is_zp_float == IS_ZP_FLOAT) { \
constexpr auto S_TYPE = \
W_TYPE == vllm::kFE2M1f \
? (GROUP_BLOCKS == 1 ? vllm::kFE4M3fn : vllm::kFE8M0fnu) \
: (std::is_same<scalar_t, half>::value ? vllm::kFloat16 \
: vllm::kBFloat16); \
kernel = Marlin<scalar_t, W_TYPE.id(), S_TYPE.id(), NUM_THREADS, \
THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
M_BLOCK_SIZE_8, pipe_stages, GROUP_BLOCKS, IS_ZP_FLOAT>; \
}
// COMMON: cases for (group_blocks in [-1, 2, 4, 8] and is_zp_float == false)
// this is the most common cases
// BIGGROUP: cases for big group size (group_blocks in [-1, 8])
// FZP: cases for float-zero-point (is_zp_float = true)
// ACT: cases for act order case (group_blocks == 0)
// FP4: cases for nvfp4(e2m1) (group_blocks == 1)
#define COMMON_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 4, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 8, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)
#define COMMON_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \
\
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \
\
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)
#define COMMON_GET_IF(W_TYPE) \
COMMON_GET_IF_M1(W_TYPE, 8, 8, 256) \
COMMON_GET_IF_M1(W_TYPE, 8, 4, 128) \
COMMON_GET_IF_M234(W_TYPE, 16, 4, 256) \
COMMON_GET_IF_M234(W_TYPE, 8, 4, 128)
#define BIGGROUP_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 8, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)
#define BIGGROUP_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)
#define BIGGROUP_GET_IF(W_TYPE) \
BIGGROUP_GET_IF_M1(W_TYPE, 8, 8, 256) \
BIGGROUP_GET_IF_M1(W_TYPE, 8, 4, 128) \
BIGGROUP_GET_IF_M234(W_TYPE, 16, 4, 256) \
BIGGROUP_GET_IF_M234(W_TYPE, 8, 4, 128)
#define NVFP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false)
#define NVFP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false)
#define NVFP4_GET_IF(W_TYPE) \
NVFP4_GET_IF_M1(W_TYPE, 8, 8, 256) \
NVFP4_GET_IF_M1(W_TYPE, 8, 4, 128) \
NVFP4_GET_IF_M234(W_TYPE, 16, 4, 256) \
NVFP4_GET_IF_M234(W_TYPE, 8, 4, 128)
#define MXFP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false)
#define MXFP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false)
#define MXFP4_GET_IF(W_TYPE) \
MXFP4_GET_IF_M1(W_TYPE, 8, 8, 256) \
MXFP4_GET_IF_M1(W_TYPE, 8, 4, 128) \
MXFP4_GET_IF_M234(W_TYPE, 16, 4, 256) \
MXFP4_GET_IF_M234(W_TYPE, 8, 4, 128)
// We currently have 4-bit models only with group_blocks == 4
#define FZP_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 4, NUM_THREADS, true) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true)
#define FZP_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true)
#define FZP_GET_IF(W_TYPE) \
FZP_GET_IF_M1(W_TYPE, 8, 8, 256) \
FZP_GET_IF_M1(W_TYPE, 8, 4, 128) \
FZP_GET_IF_M234(W_TYPE, 16, 4, 256) \
FZP_GET_IF_M234(W_TYPE, 8, 4, 128)
// We currently have 4-bit models only with group_blocks == 4
#define ACT_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false)
#define ACT_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false)
#define ACT_GET_IF(W_TYPE) \
ACT_GET_IF_M1(W_TYPE, 8, 8, 256) \
ACT_GET_IF_M1(W_TYPE, 8, 4, 128) \
ACT_GET_IF_M234(W_TYPE, 16, 4, 256) \
ACT_GET_IF_M234(W_TYPE, 8, 4, 128)
template <typename scalar_t>
MarlinFuncPtr get_marlin_kernel(const vllm::ScalarType q_type,
int thread_m_blocks, int thread_n_blocks,
int thread_k_blocks, bool m_block_size_8,
bool has_act_order, bool has_zp,
int group_blocks, int num_threads,
bool is_zp_float) {
int num_bits = q_type.size_bits();
auto kernel = MarlinDefault; auto kernel = MarlinDefault;
if (false) {
}
COMMON_GET_IF(vllm::kU4) #include "kernel_selector.h"
COMMON_GET_IF(vllm::kU4B8)
COMMON_GET_IF(vllm::kU8B128)
NVFP4_GET_IF(vllm::kFE2M1f)
BIGGROUP_GET_IF(vllm::kFE4M3fn)
ACT_GET_IF(vllm::kU4B8)
ACT_GET_IF(vllm::kU8B128)
if (std::is_same<scalar_t, nv_bfloat16>::value) {
if (false) {
}
MXFP4_GET_IF(vllm::kFE2M1f)
}
return kernel; return kernel;
} }
template <typename scalar_t> exec_config_t determine_exec_config(
exec_config_t determine_exec_config(const vllm::ScalarType& q_type, int prob_m, const vllm::ScalarType& a_type, const vllm::ScalarType& b_type,
int prob_n, int prob_k, int thread_m_blocks, const vllm::ScalarType& c_type, const vllm::ScalarType& s_type, int prob_m,
bool m_block_size_8, int num_bits, int prob_n, int prob_k, int num_experts, int top_k, int thread_m_blocks,
int group_size, bool has_act_order, bool m_block_size_8, int num_bits, int group_size, bool has_act_order,
bool is_k_full, bool has_zp, bool is_k_full, bool has_zp, bool is_zp_float, int max_shared_mem, int sms,
bool is_zp_float, int max_shared_mem) { bool is_a_8bit) {
exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}}; exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}};
thread_config_t* thread_configs = thread_m_blocks > 1 thread_config_t* thread_configs = thread_m_blocks > 1
? large_batch_thread_configs ? large_batch_thread_configs
@ -471,73 +284,69 @@ exec_config_t determine_exec_config(const vllm::ScalarType& q_type, int prob_m,
if (!is_valid_config(th_config, m_block_size_8, thread_m_blocks, prob_m, if (!is_valid_config(th_config, m_block_size_8, thread_m_blocks, prob_m,
prob_n, prob_k, num_bits, group_size, has_act_order, prob_n, prob_k, num_bits, group_size, has_act_order,
is_k_full, has_zp, is_zp_float, max_shared_mem)) { is_k_full, has_zp, is_zp_float, max_shared_mem - 512,
is_a_8bit)) {
continue; continue;
} }
int cache_size = get_kernel_cache_size( int cache_size = get_kernel_cache_size(
th_config, m_block_size_8, thread_m_blocks, prob_m, prob_n, prob_k, th_config, m_block_size_8, thread_m_blocks, prob_m, prob_n, prob_k,
num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float); num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float,
is_a_8bit);
int group_blocks = 0; int group_blocks = 0;
if (!has_act_order) { if (!has_act_order) {
group_blocks = group_size == -1 ? -1 : (group_size / 16); group_blocks = group_size == -1 ? -1 : (group_size / 16);
} }
auto kernel = get_marlin_kernel<scalar_t>( auto kernel =
q_type, thread_m_blocks, th_config.thread_n / 16, get_marlin_kernel(a_type, b_type, c_type, s_type, thread_m_blocks,
th_config.thread_k / 16, m_block_size_8, has_act_order, has_zp, th_config.thread_n / 16, th_config.thread_k / 16,
group_blocks, th_config.num_threads, is_zp_float); m_block_size_8, has_act_order, has_zp, group_blocks,
th_config.num_threads, is_zp_float);
if (kernel == MarlinDefault) continue; if (kernel == MarlinDefault) continue;
if (thread_m_blocks > 1) { cudaFuncAttributes attr;
exec_cfg = {1, th_config}; cudaFuncGetAttributes(&attr, kernel);
break; int reg_size = max(attr.numRegs, 1) * th_config.num_threads * 4;
} else { int allow_count = min(device_max_reg_size / reg_size,
cudaFuncAttributes attr; max_shared_mem / (cache_size + 1536));
cudaFuncGetAttributes(&attr, kernel); if (thread_m_blocks == 1)
int reg_size = max(attr.numRegs, 1) * th_config.num_threads * 4;
int allow_count = min(device_max_reg_size / reg_size,
max_shared_mem / (cache_size + 1024));
allow_count = max(min(allow_count, 4), 1); allow_count = max(min(allow_count, 4), 1);
if (allow_count > count) { else
count = allow_count; allow_count = max(min(allow_count, 2), 1);
exec_cfg = {count, th_config};
}; if (prob_n / th_config.thread_n * prob_m * top_k * 4 < sms * allow_count) {
allow_count =
max(prob_n / th_config.thread_n * prob_m * top_k * 4 / sms, 1);
} }
if (allow_count > count) {
count = allow_count;
exec_cfg = {count, th_config};
};
} }
return exec_cfg; return exec_cfg;
} }
template <typename scalar_t>
void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias, void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
void* s, void* s2, void* zp, void* g_idx, void* perm, void* a_s, void* b_s, void* g_s, void* zp, void* g_idx,
void* a_tmp, void* sorted_token_ids, void* expert_ids, void* perm, void* a_tmp, void* sorted_token_ids,
void* num_tokens_past_padded, void* topk_weights, void* expert_ids, void* num_tokens_past_padded,
int moe_block_size, int top_k, bool mul_topk_weights, bool is_ep, void* topk_weights, int moe_block_size, int num_experts,
int prob_m, int prob_n, int prob_k, void* workspace, int top_k, bool mul_topk_weights, bool is_ep, int prob_m,
vllm::ScalarType const& q_type, bool has_bias, int prob_n, int prob_k, void* workspace,
bool has_act_order, bool is_k_full, bool has_zp, int num_groups, vllm::ScalarType const& a_type, vllm::ScalarType const& b_type,
int group_size, int dev, cudaStream_t stream, int thread_k, vllm::ScalarType const& c_type, vllm::ScalarType const& s_type,
int thread_n, int sms, bool use_atomic_add, bool use_fp32_reduce, bool has_bias, bool has_act_order, bool is_k_full, bool has_zp,
bool is_zp_float) { int num_groups, int group_size, int dev, cudaStream_t stream,
int thread_k, int thread_n, int sms, int blocks_per_sm,
bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) {
int thread_m_blocks = div_ceil(moe_block_size, 16); int thread_m_blocks = div_ceil(moe_block_size, 16);
bool m_block_size_8 = moe_block_size == 8; bool m_block_size_8 = moe_block_size == 8;
bool is_a_8bit = a_type.size_bits() == 8;
if (has_zp) {
TORCH_CHECK(
q_type == vllm::kU4 || q_type == vllm::kU8,
"q_type must be u4 or u8 when has_zp = True. Got = ", q_type.str());
} else {
TORCH_CHECK(
q_type == vllm::kU4B8 || q_type == vllm::kU8B128 ||
q_type == vllm::kFE4M3fn || q_type == vllm::kFE2M1f,
"q_type must be uint4b8, uint8b128, float8_e4m3fn or float4_e2m1f when "
"has_zp = False. Got = ",
q_type.str());
}
TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m, TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
", ", prob_n, ", ", prob_k, "]"); ", ", prob_n, ", ", prob_k, "]");
@ -563,14 +372,15 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
} }
} }
int num_bits = q_type.size_bits(); int num_bits = b_type.size_bits();
const int4* A_ptr = (const int4*)A; const int4* A_ptr = (const int4*)A;
const int4* B_ptr = (const int4*)B; const int4* B_ptr = (const int4*)B;
int4* C_ptr = (int4*)C; int4* C_ptr = (int4*)C;
int4* C_tmp_ptr = (int4*)C_tmp; int4* C_tmp_ptr = (int4*)C_tmp;
const int4* bias_ptr = (const int4*)b_bias; const int4* bias_ptr = (const int4*)b_bias;
const int4* s_ptr = (const int4*)s; const float* a_s_ptr = (const float*)a_s;
const uint16_t* s2_ptr = (const uint16_t*)s2; const int4* b_s_ptr = (const int4*)b_s;
const uint16_t* g_s_ptr = (const uint16_t*)g_s;
const int4* zp_ptr = (const int4*)zp; const int4* zp_ptr = (const int4*)zp;
const int* g_idx_ptr = (const int*)g_idx; const int* g_idx_ptr = (const int*)g_idx;
const int* perm_ptr = (const int*)perm; const int* perm_ptr = (const int*)perm;
@ -618,22 +428,41 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
TORCH_CHECK(max_shared_mem > 0); TORCH_CHECK(max_shared_mem > 0);
int major_capability, minor_capability;
cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor,
dev);
cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor,
dev);
TORCH_CHECK(major_capability * 10 + minor_capability >= 80,
"marlin kernel only support Ampere or newer GPUs.");
if (a_type == vllm::kFE4M3fn) {
TORCH_CHECK(major_capability * 10 + minor_capability >= 89,
"FP8 only support Ada Lovelace or newer GPUs.");
TORCH_CHECK(
major_capability * 10 + minor_capability == 89 ||
major_capability * 10 + minor_capability == 120,
"Marlin W4A8-FP8 only support SM89 or SM120 device (It is slower than "
"Marlin W4A16 on other devices).");
}
// Set thread config // Set thread config
exec_config_t exec_cfg; exec_config_t exec_cfg;
thread_config_t thread_tfg; thread_config_t thread_tfg;
if (thread_k != -1 && thread_n != -1) { if (thread_k != -1 && thread_n != -1) {
thread_tfg = thread_config_t{thread_k, thread_n, default_threads}; thread_tfg = thread_config_t{thread_k, thread_n, thread_k * thread_n / 64};
exec_cfg = exec_config_t{1, thread_tfg}; if (blocks_per_sm == -1) blocks_per_sm = 1;
exec_cfg = exec_config_t{blocks_per_sm, thread_tfg};
TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n, TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n,
" is not divisible by thread_n = ", thread_n); " is not divisible by thread_n = ", thread_n);
TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k, TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k,
" is not divisible by thread_k = ", thread_k); " is not divisible by thread_k = ", thread_k);
} else { } else {
// Auto config // Auto config
exec_cfg = determine_exec_config<scalar_t>( exec_cfg = determine_exec_config(
q_type, prob_m, prob_n, prob_k, thread_m_blocks, m_block_size_8, a_type, b_type, c_type, s_type, prob_m, prob_n, prob_k, num_experts,
num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float, top_k, thread_m_blocks, m_block_size_8, num_bits, group_size,
max_shared_mem); has_act_order, is_k_full, has_zp, is_zp_float, max_shared_mem, sms,
is_a_8bit);
thread_tfg = exec_cfg.tb_cfg; thread_tfg = exec_cfg.tb_cfg;
} }
@ -647,22 +476,29 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
int thread_k_blocks = thread_k / 16; int thread_k_blocks = thread_k / 16;
int thread_n_blocks = thread_n / 16; int thread_n_blocks = thread_n / 16;
TORCH_CHECK( TORCH_CHECK(is_valid_config(thread_tfg, m_block_size_8, thread_m_blocks,
is_valid_config(thread_tfg, m_block_size_8, thread_m_blocks, prob_m, prob_m, prob_n, prob_k, num_bits, group_size,
prob_n, prob_k, num_bits, group_size, has_act_order, has_act_order, is_k_full, has_zp, is_zp_float,
is_k_full, has_zp, is_zp_float, max_shared_mem), max_shared_mem, is_a_8bit),
"Invalid thread config: thread_m_blocks = ", thread_m_blocks, "Invalid thread config: thread_m_blocks = ", thread_m_blocks,
", thread_k = ", thread_tfg.thread_k, ", thread_k = ", thread_tfg.thread_k,
", thread_n = ", thread_tfg.thread_n, ", thread_n = ", thread_tfg.thread_n,
", num_threads = ", thread_tfg.num_threads, " for MKN = [", prob_m, ", ", ", num_threads = ", thread_tfg.num_threads, " for MKN = [",
prob_k, ", ", prob_n, "] and num_bits = ", num_bits, prob_m, ", ", prob_k, ", ", prob_n, "] and num_bits = ", num_bits,
", group_size = ", group_size, ", has_act_order = ", has_act_order, ", group_size = ", group_size,
", is_k_full = ", is_k_full, ", has_zp = ", has_zp, ", has_act_order = ", has_act_order, ", is_k_full = ", is_k_full,
", is_zp_float = ", is_zp_float, ", max_shared_mem = ", max_shared_mem); ", has_zp = ", has_zp, ", is_zp_float = ", is_zp_float,
", max_shared_mem = ", max_shared_mem);
auto kernel = get_marlin_kernel<scalar_t>( int sh_cache_size =
q_type, thread_m_blocks, thread_n_blocks, thread_k_blocks, m_block_size_8, get_kernel_cache_size(thread_tfg, m_block_size_8, thread_m_blocks, prob_m,
has_act_order, has_zp, group_blocks, num_threads, is_zp_float); prob_n, prob_k, num_bits, group_size, has_act_order,
is_k_full, has_zp, is_zp_float, is_a_8bit);
auto kernel = get_marlin_kernel(
a_type, b_type, c_type, s_type, thread_m_blocks, thread_n_blocks,
thread_k_blocks, m_block_size_8, has_act_order, has_zp, group_blocks,
num_threads, is_zp_float);
if (kernel == MarlinDefault) { if (kernel == MarlinDefault) {
TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n, TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n,
@ -679,19 +515,20 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
// avoid ">>>" being formatted to "> > >" // avoid ">>>" being formatted to "> > >"
// clang-format off // clang-format off
kernel<<<blocks, num_threads, max_shared_mem, stream>>>( kernel<<<blocks, num_threads, max_shared_mem, stream>>>(
A_ptr, B_ptr, C_ptr, C_tmp_ptr, bias_ptr, s_ptr, s2_ptr, zp_ptr, g_idx_ptr, A_ptr, B_ptr, C_ptr, C_tmp_ptr, bias_ptr, a_s_ptr, b_s_ptr, g_s_ptr, zp_ptr, g_idx_ptr,
sorted_token_ids_ptr, expert_ids_ptr, num_tokens_past_padded_ptr, sorted_token_ids_ptr, expert_ids_ptr, num_tokens_past_padded_ptr,
topk_weights_ptr, top_k, mul_topk_weights, is_ep, num_groups, prob_m, topk_weights_ptr, top_k, mul_topk_weights, is_ep, num_groups, prob_m,
prob_n, prob_k, locks, has_bias, use_atomic_add, use_fp32_reduce, max_shared_mem); prob_n, prob_k, locks, has_bias, use_atomic_add, use_fp32_reduce);
// clang-format on // clang-format on
} }
} // namespace MARLIN_NAMESPACE_NAME } // namespace MARLIN_NAMESPACE_NAME
torch::Tensor moe_wna16_marlin_gemm( torch::Tensor moe_wna16_marlin_gemm(
torch::Tensor& a, std::optional<torch::Tensor> const& c_or_none, torch::Tensor& a, std::optional<torch::Tensor> c_or_none,
torch::Tensor& b_q_weight, torch::Tensor& b_q_weight,
std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales, std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales,
std::optional<torch::Tensor> const& a_scales_or_none,
std::optional<torch::Tensor> const& global_scale_or_none, std::optional<torch::Tensor> const& global_scale_or_none,
std::optional<torch::Tensor> const& b_zeros_or_none, std::optional<torch::Tensor> const& b_zeros_or_none,
std::optional<torch::Tensor> const& g_idx_or_none, std::optional<torch::Tensor> const& g_idx_or_none,
@ -699,11 +536,70 @@ torch::Tensor moe_wna16_marlin_gemm(
torch::Tensor& sorted_token_ids, torch::Tensor& expert_ids, torch::Tensor& sorted_token_ids, torch::Tensor& expert_ids,
torch::Tensor& num_tokens_past_padded, torch::Tensor& topk_weights, torch::Tensor& num_tokens_past_padded, torch::Tensor& topk_weights,
int64_t moe_block_size, int64_t top_k, bool mul_topk_weights, bool is_ep, int64_t moe_block_size, int64_t top_k, bool mul_topk_weights, bool is_ep,
vllm::ScalarTypeId const& b_q_type_id, int64_t size_m, int64_t size_n, vllm::ScalarTypeId const& b_type_id, int64_t size_m, int64_t size_n,
int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce, int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce,
bool is_zp_float) { bool is_zp_float, int64_t thread_k, int64_t thread_n,
vllm::ScalarType const b_q_type = vllm::ScalarType::from_id(b_q_type_id); int64_t blocks_per_sm) {
int pack_factor = 32 / b_q_type.size_bits(); vllm::ScalarTypeId a_type_id, c_type_id, s_type_id;
auto c_dtype = a.dtype();
if (a.scalar_type() == at::ScalarType::Half) {
a_type_id = vllm::kFloat16.id();
c_type_id = vllm::kFloat16.id();
} else if (a.scalar_type() == at::ScalarType::BFloat16) {
a_type_id = vllm::kBFloat16.id();
c_type_id = vllm::kBFloat16.id();
} else {
c_dtype = b_scales.dtype();
if (b_scales.scalar_type() == at::ScalarType::Half) {
c_type_id = vllm::kFloat16.id();
} else if (b_scales.scalar_type() == at::ScalarType::BFloat16) {
c_type_id = vllm::kBFloat16.id();
} else {
c_type_id = vllm::kBFloat16.id();
TORCH_CHECK(c_or_none.has_value(), "c must be passed for W4A8-FP4");
torch::Tensor c = c_or_none.value();
c_dtype = c.dtype();
if (c.scalar_type() == at::ScalarType::Half) {
c_type_id = vllm::kFloat16.id();
} else if (c.scalar_type() == at::ScalarType::BFloat16) {
c_type_id = vllm::kBFloat16.id();
} else {
TORCH_CHECK(false, "unsupported c dtype");
}
}
if (a.scalar_type() == at::ScalarType::Float8_e4m3fn) {
a_type_id = vllm::kFE4M3fn.id();
} else if (a.scalar_type() == at::ScalarType::Char) {
a_type_id = vllm::kS8.id();
} else {
TORCH_CHECK(false, "unsupported `a` scalar_type");
}
}
s_type_id = c_type_id;
if (b_type_id == vllm::kFE2M1f.id()) {
if (b_scales.scalar_type() == at::ScalarType::Float8_e4m3fn) {
s_type_id = vllm::kFE4M3fn.id();
} else if (b_scales.scalar_type() == at::ScalarType::Float8_e8m0fnu) {
s_type_id = vllm::kFE8M0fnu.id();
} else {
TORCH_CHECK(false,
"When b_type = float4_e2m1f, b_scale scalar type must be",
"float8_e4m3fn (for NVFP4) or float8_e8m0fnu (for MXFP4).");
}
}
vllm::ScalarType a_type = vllm::ScalarType::from_id(a_type_id);
vllm::ScalarType b_type = vllm::ScalarType::from_id(b_type_id);
vllm::ScalarType c_type = vllm::ScalarType::from_id(c_type_id);
vllm::ScalarType s_type = vllm::ScalarType::from_id(s_type_id);
int pack_factor = 32 / b_type.size_bits();
int num_experts = b_q_weight.size(0);
if (moe_block_size != 8) { if (moe_block_size != 8) {
TORCH_CHECK(moe_block_size % 16 == 0, TORCH_CHECK(moe_block_size % 16 == 0,
@ -745,19 +641,27 @@ torch::Tensor moe_wna16_marlin_gemm(
TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU"); TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU");
TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous"); TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous");
// thread_k: `k` size of a thread_tile in `weights` (can usually be left as torch::Tensor a_scales;
// auto -1) auto options = torch::TensorOptions().dtype(c_dtype).device(a.device());
int thread_k = -1; auto options_fp32 =
// thread_n: `n` size of a thread_tile in `weights` (can usually be left as torch::TensorOptions().dtype(at::kFloat).device(a.device());
// auto -1)
int thread_n = -1; if (a_scales_or_none.has_value()) {
a_scales = a_scales_or_none.value();
TORCH_CHECK(a_type.size_bits() == 8,
"a_scales can only be used for 8bit activation.");
} else {
a_scales = torch::empty({0}, options_fp32);
TORCH_CHECK(a_type.size_bits() != 8,
"the a_scales parameter must be passed for 8bit activation.");
}
// sms: number of SMs to use for the kernel // sms: number of SMs to use for the kernel
int sms = -1; int sms = -1;
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device()); cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device());
// Alloc buffers // Alloc buffers
const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
torch::Tensor c; torch::Tensor c;
if (c_or_none.has_value()) { if (c_or_none.has_value()) {
c = c_or_none.value(); c = c_or_none.value();
@ -774,8 +678,6 @@ torch::Tensor moe_wna16_marlin_gemm(
// Alloc C tmp buffer that is going to be used for the global reduce // Alloc C tmp buffer that is going to be used for the global reduce
torch::Tensor c_tmp; torch::Tensor c_tmp;
auto options_fp32 =
torch::TensorOptions().dtype(at::kFloat).device(a.device());
if (use_fp32_reduce && !use_atomic_add) { if (use_fp32_reduce && !use_atomic_add) {
// max num of threadblocks is sms * 4 // max num of threadblocks is sms * 4
long max_c_tmp_size = min( long max_c_tmp_size = min(
@ -846,11 +748,11 @@ torch::Tensor moe_wna16_marlin_gemm(
torch::Tensor global_scale; torch::Tensor global_scale;
if (global_scale_or_none.has_value()) { if (global_scale_or_none.has_value()) {
global_scale = global_scale_or_none.value(); global_scale = global_scale_or_none.value();
TORCH_CHECK(b_q_type == vllm::kFE2M1f && group_size == 16, TORCH_CHECK(b_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn,
"global_scale can only be used for nvfp4 format."); "global_scale can only be used for nvfp4 format.");
} else { } else {
global_scale = torch::empty({0}, options); global_scale = torch::empty({0}, options);
TORCH_CHECK(!(b_q_type == vllm::kFE2M1f && group_size == 16), TORCH_CHECK(!(b_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn),
"the global_scale parameter must be passed for nvfp4 format."); "the global_scale parameter must be passed for nvfp4 format.");
} }
@ -877,15 +779,15 @@ torch::Tensor moe_wna16_marlin_gemm(
bool has_zp = b_zeros.size(-1) > 0; bool has_zp = b_zeros.size(-1) > 0;
if (has_zp) { if (has_zp) {
TORCH_CHECK( TORCH_CHECK(
b_q_type == vllm::kU4 || b_q_type == vllm::kU8, b_type == vllm::kU4 || b_type == vllm::kU8,
"b_q_type must be u4 or u8 when has_zp = True. Got = ", b_q_type.str()); "b_type must be u4 or u8 when has_zp = True. Got = ", b_type.str());
} else { } else {
TORCH_CHECK(b_q_type == vllm::kU4B8 || b_q_type == vllm::kU8B128 || TORCH_CHECK(b_type == vllm::kU4B8 || b_type == vllm::kU8B128 ||
b_q_type == vllm::kFE4M3fn || b_q_type == vllm::kFE2M1f, b_type == vllm::kS4 || b_type == vllm::kS8 ||
"b_q_type must be uint4b8, uint8b128, float8_e4m3fn or " b_type == vllm::kFE4M3fn || b_type == vllm::kFE2M1f,
"float4_e2m1f when " "b_type must be uint4b8, uint8b128, int4, int8, "
"has_zp = False. Got = ", "float8_e4m3fn or float4_e2m1f when has_zp = False. Got = ",
b_q_type.str()); b_type.str());
} }
if (has_zp && is_zp_float) { if (has_zp && is_zp_float) {
@ -929,71 +831,33 @@ torch::Tensor moe_wna16_marlin_gemm(
" is below min_workspace_size = ", min_workspace_size); " is below min_workspace_size = ", min_workspace_size);
int dev = a.get_device(); int dev = a.get_device();
if (a.scalar_type() == at::ScalarType::Half) {
void* scales_ptr;
if (b_q_type == vllm::kFE2M1f) {
if (group_size == 16)
scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
else if (group_size == 32)
scales_ptr = b_scales.data_ptr<at::Float8_e8m0fnu>();
else
TORCH_CHECK(false,
"float4_e2m1f only supports group_size == 16 (NVFP4) ",
"and group_size == 32 (MXFP4)");
} else {
scales_ptr = b_scales.data_ptr<at::Half>();
}
MARLIN_NAMESPACE_NAME::marlin_mm<half>( TORCH_CHECK(a_scales.scalar_type() == at::ScalarType::Float,
a.data_ptr<at::Half>(), b_q_weight.data_ptr(), c.data_ptr<at::Half>(), "scalar type of a_scales must be float");
c_tmp.data_ptr<float>(), b_bias.data_ptr<at::Half>(), scales_ptr, TORCH_CHECK(global_scale.scalar_type() == c.scalar_type(),
global_scale.data_ptr<at::Half>(), b_zeros.data_ptr(), g_idx.data_ptr(), "scalar type of global_scale must be the same with c");
perm.data_ptr(), a_tmp.data_ptr<at::Half>(), if (a_type.size_bits() == 16) {
sorted_token_ids.data_ptr(), expert_ids.data_ptr(), TORCH_CHECK(
num_tokens_past_padded.data_ptr(), topk_weights.data_ptr(), a.scalar_type() == c.scalar_type(),
moe_block_size, top_k, mul_topk_weights, is_ep, size_m, size_n, size_k, "scalar type of a must be the same with c for 16 bit activation");
workspace.data_ptr(), b_q_type, has_bias, has_act_order, is_k_full,
has_zp, num_groups, group_size, dev,
at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
use_atomic_add, use_fp32_reduce, is_zp_float);
} else if (a.scalar_type() == at::ScalarType::BFloat16) {
void* scales_ptr;
if (b_q_type == vllm::kFE2M1f) {
if (group_size == 16)
scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
else if (group_size == 32)
scales_ptr = b_scales.data_ptr<at::Float8_e8m0fnu>();
else
TORCH_CHECK(false,
"float4_e2m1f only supports group_size == 16 (NVFP4) ",
"and group_size == 32 (MXFP4)");
} else {
scales_ptr = b_scales.data_ptr<at::BFloat16>();
}
MARLIN_NAMESPACE_NAME::marlin_mm<nv_bfloat16>(
a.data_ptr<at::BFloat16>(), b_q_weight.data_ptr(),
c.data_ptr<at::BFloat16>(), c_tmp.data_ptr<float>(),
b_bias.data_ptr<at::BFloat16>(), scales_ptr,
global_scale.data_ptr<at::BFloat16>(), b_zeros.data_ptr(),
g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr<at::BFloat16>(),
sorted_token_ids.data_ptr(), expert_ids.data_ptr(),
num_tokens_past_padded.data_ptr(), topk_weights.data_ptr(),
moe_block_size, top_k, mul_topk_weights, is_ep, size_m, size_n, size_k,
workspace.data_ptr(), b_q_type, has_bias, has_act_order, is_k_full,
has_zp, num_groups, group_size, dev,
at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
use_atomic_add, use_fp32_reduce, is_zp_float);
} else {
TORCH_CHECK(false,
"moe_wna16_marlin_gemm only supports bfloat16 and float16");
} }
MARLIN_NAMESPACE_NAME::marlin_mm(
a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), c_tmp.data_ptr(),
b_bias.data_ptr(), a_scales.data_ptr(), b_scales.data_ptr(),
global_scale.data_ptr(), b_zeros.data_ptr(), g_idx.data_ptr(),
perm.data_ptr(), a_tmp.data_ptr(), sorted_token_ids.data_ptr(),
expert_ids.data_ptr(), num_tokens_past_padded.data_ptr(),
topk_weights.data_ptr(), moe_block_size, num_experts, top_k,
mul_topk_weights, is_ep, size_m, size_n, size_k, workspace.data_ptr(),
a_type, b_type, c_type, s_type, has_bias, has_act_order, is_k_full,
has_zp, num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
thread_k, thread_n, sms, blocks_per_sm, use_atomic_add, use_fp32_reduce,
is_zp_float);
return c; return c;
} }
#endif
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
m.impl("moe_wna16_marlin_gemm", &moe_wna16_marlin_gemm); m.impl("moe_wna16_marlin_gemm", &moe_wna16_marlin_gemm);
} }

View File

@ -63,16 +63,18 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
m.def( m.def(
"moe_wna16_marlin_gemm(Tensor! a, Tensor? c_or_none," "moe_wna16_marlin_gemm(Tensor! a, Tensor? c_or_none,"
"Tensor! b_q_weight, Tensor? b_bias_or_none," "Tensor! b_q_weight, Tensor? b_bias_or_none,"
"Tensor! b_scales, Tensor? global_scale, Tensor? " "Tensor! b_scales, Tensor? a_scales, Tensor? global_scale, Tensor? "
"b_zeros_or_none," "b_zeros_or_none,"
"Tensor? g_idx_or_none, Tensor? perm_or_none, Tensor! workspace," "Tensor? g_idx_or_none, Tensor? perm_or_none, Tensor! workspace,"
"Tensor sorted_token_ids," "Tensor sorted_token_ids,"
"Tensor! expert_ids, Tensor! num_tokens_past_padded," "Tensor! expert_ids, Tensor! num_tokens_past_padded,"
"Tensor! topk_weights, int moe_block_size, int top_k, " "Tensor! topk_weights, int moe_block_size, int top_k, "
"bool mul_topk_weights, bool is_ep, int b_q_type_id," "bool mul_topk_weights, bool is_ep, int b_type_id,"
"int size_m, int size_n, int size_k," "int size_m, int size_n, int size_k,"
"bool is_full_k, bool use_atomic_add," "bool is_full_k, bool use_atomic_add,"
"bool use_fp32_reduce, bool is_zp_float) -> Tensor"); "bool use_fp32_reduce, bool is_zp_float,"
"int thread_k, int thread_n, int blocks_per_sm) -> Tensor");
m.def( m.def(
"marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, " "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "
"Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! " "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! "

View File

@ -52,14 +52,13 @@ void paged_attention_v2(
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
const int64_t blocksparse_head_sliding_step); const int64_t blocksparse_head_sliding_step);
#ifndef USE_ROCM
void merge_attn_states(torch::Tensor& output, void merge_attn_states(torch::Tensor& output,
std::optional<torch::Tensor> output_lse, std::optional<torch::Tensor> output_lse,
const torch::Tensor& prefix_output, const torch::Tensor& prefix_output,
const torch::Tensor& prefix_lse, const torch::Tensor& prefix_lse,
const torch::Tensor& suffix_output, const torch::Tensor& suffix_output,
const torch::Tensor& suffix_lse); const torch::Tensor& suffix_lse);
#ifndef USE_ROCM
void convert_vertical_slash_indexes( void convert_vertical_slash_indexes(
torch::Tensor& block_count, // [BATCH, N_HEADS, NUM_ROWS] torch::Tensor& block_count, // [BATCH, N_HEADS, NUM_ROWS]
torch::Tensor& block_offset, // [BATCH, N_HEADS, NUM_ROWS, NNZ_S] torch::Tensor& block_offset, // [BATCH, N_HEADS, NUM_ROWS, NNZ_S]

View File

@ -22,6 +22,7 @@
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h> #include <c10/cuda/CUDAStream.h>
#include "cutlass_extensions/common.hpp"
#include "cute/tensor.hpp" #include "cute/tensor.hpp"
#include "cutlass/tensor_ref.h" #include "cutlass/tensor_ref.h"
@ -173,7 +174,7 @@ void run_get_group_gemm_starts(
} }
template <typename OutType> template <typename OutType>
void run_fp4_blockwise_scaled_group_mm( void run_fp4_blockwise_scaled_group_mm_sm100(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b, torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales, const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
const torch::Tensor& alphas, const torch::Tensor& problem_sizes, const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
@ -343,17 +344,225 @@ void run_fp4_blockwise_scaled_group_mm(
auto can_implement_status = gemm_op.can_implement(args); auto can_implement_status = gemm_op.can_implement(args);
TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess, TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
"Failed to implement GEMM"); "Failed to implement GEMM: status=", (int)can_implement_status);
// Run the GEMM // Run the GEMM
auto status = gemm_op.initialize(args, workspace.data_ptr()); auto status = gemm_op.initialize(args, workspace.data_ptr());
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM"); TORCH_CHECK(status == cutlass::Status::kSuccess,
"Failed to initialize GEMM: status=", (int)status,
" workspace_size=", workspace_size, " num_experts=", num_experts,
" M=", M, " N=", N, " K=", K);
status = gemm_op.run(args, workspace.data_ptr(), stream); status = gemm_op.run(args, workspace.data_ptr(), stream);
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM"); TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
} }
void run_fp4_blockwise_scaled_group_mm_sm120(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets, int M,
int N, int K) {
using ProblemShape =
cutlass::gemm::GroupProblemShape<Shape<int32_t, int32_t, int32_t>>;
using ElementType = cutlass::float_e2m1_t;
using ElementSFType = cutlass::float_ue4m3_t;
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
// NOTE: For SM120 it seems templating the output type is not supported and
// we need to hardcode the output type to bfloat16
using ElementC = cutlass::bfloat16_t;
using ElementD = ElementC;
using ElementAccumulator = float;
// Layout definitions
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::RowMajor;
using LayoutD = LayoutC;
// Alignment constraints
static constexpr int AlignmentA = 32;
static constexpr int AlignmentB = 32;
static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
// Architecture definitions
using ArchTag = cutlass::arch::Sm120;
using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp;
using ClusterShape = Shape<_1, _1, _1>;
using MmaTileShape = Shape<_128, _128, _128>;
using FusionOperation = cutlass::epilogue::fusion::LinearCombination<
ElementD, ElementAccumulator, ElementC, ElementAccumulator>;
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag, OperatorClass, MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
ElementAccumulator, ElementC, LayoutC*, AlignmentC, ElementD,
LayoutD*, AlignmentD,
cutlass::epilogue::collective::EpilogueScheduleAuto,
FusionOperation>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass, ElementA, LayoutA*, AlignmentA, ElementB,
LayoutB*, AlignmentB, ElementAccumulator, MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto>::CollectiveOp;
using GemmKernel =
cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloop,
CollectiveEpilogue>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
using StrideA = typename Gemm::GemmKernel::InternalStrideA;
using StrideB = typename Gemm::GemmKernel::InternalStrideB;
using StrideC = typename Gemm::GemmKernel::InternalStrideC;
using StrideD = typename Gemm::GemmKernel::InternalStrideD;
using LayoutSFA =
typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFA;
using LayoutSFB =
typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFB;
using ScaleConfig =
typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape;
int num_experts = static_cast<int>(expert_offsets.size(0));
auto options_int =
torch::TensorOptions().dtype(torch::kInt64).device(a.device());
torch::Tensor a_ptrs = torch::empty(num_experts, options_int);
torch::Tensor b_ptrs = torch::empty(num_experts, options_int);
torch::Tensor out_ptrs = torch::empty(num_experts, options_int);
torch::Tensor a_scales_ptrs = torch::empty(num_experts, options_int);
torch::Tensor b_scales_ptrs = torch::empty(num_experts, options_int);
torch::Tensor alpha_ptrs = torch::empty(num_experts, options_int);
torch::Tensor layout_sfa = torch::empty({num_experts, 5}, options_int);
torch::Tensor layout_sfb = torch::empty({num_experts, 5}, options_int);
torch::Tensor c_strides1 =
torch::full({num_experts}, output.stride(0), options_int);
torch::Tensor a_strides1 =
torch::full({num_experts}, a.stride(0) * 2, options_int);
torch::Tensor b_strides1 =
torch::full({num_experts}, b.stride(1) * 2, options_int);
run_get_group_gemm_starts<LayoutSFA, LayoutSFB, ScaleConfig>(
a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, alpha_ptrs,
layout_sfa, layout_sfb, a, b, output, a_blockscale, b_blockscales, alphas,
expert_offsets, sf_offsets, problem_sizes, M, N, K);
// Create an instance of the GEMM
Gemm gemm_op;
// Initialize problem_sizes_as_shapes correctly
UnderlyingProblemShape* problem_sizes_as_shapes =
static_cast<UnderlyingProblemShape*>(problem_sizes.data_ptr());
// Set the Scheduler info
cutlass::KernelHardwareInfo hw_info;
using RasterOrderOptions = cutlass::gemm::kernel::detail::RasterOrderOptions;
typename Gemm::GemmKernel::TileSchedulerArguments scheduler;
scheduler.raster_order = RasterOrderOptions::AlongM;
hw_info.device_id = a.get_device();
static std::unordered_map<int, int> cached_sm_counts;
if (cached_sm_counts.find(hw_info.device_id) == cached_sm_counts.end()) {
cached_sm_counts[hw_info.device_id] =
cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
hw_info.device_id);
}
hw_info.sm_count = min(cached_sm_counts[hw_info.device_id], INT_MAX);
// Mainloop Arguments
typename GemmKernel::MainloopArguments mainloop_args{
static_cast<const ElementType**>(a_ptrs.data_ptr()),
static_cast<StrideA*>(a_strides1.data_ptr()),
static_cast<const ElementType**>(b_ptrs.data_ptr()),
static_cast<StrideB*>(b_strides1.data_ptr()),
static_cast<const ElementSFType**>(a_scales_ptrs.data_ptr()),
reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()),
static_cast<const ElementSFType**>(b_scales_ptrs.data_ptr()),
reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr())};
// Epilogue Arguments
typename GemmKernel::EpilogueArguments epilogue_args{
{}, // epilogue.thread
nullptr,
static_cast<StrideC*>(c_strides1.data_ptr()),
static_cast<ElementD**>(out_ptrs.data_ptr()),
static_cast<StrideC*>(c_strides1.data_ptr())};
auto& fusion_args = epilogue_args.thread;
fusion_args.alpha_ptr_array =
reinterpret_cast<float**>(alpha_ptrs.data_ptr());
fusion_args.dAlpha = {_0{}, _0{}, 1};
fusion_args.beta = 0.0f;
// Gemm Arguments
typename GemmKernel::Arguments args{
cutlass::gemm::GemmUniversalMode::kGrouped,
{num_experts, problem_sizes_as_shapes, nullptr},
mainloop_args,
epilogue_args,
hw_info,
scheduler};
size_t workspace_size = Gemm::get_workspace_size(args);
auto const workspace_options =
torch::TensorOptions().dtype(torch::kUInt8).device(a.device());
auto workspace = torch::empty(workspace_size, workspace_options);
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(a.get_device());
auto can_implement_status = gemm_op.can_implement(args);
TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
"Failed to implement GEMM: status=", (int)can_implement_status);
// Run the GEMM
auto status = gemm_op.initialize(args, workspace.data_ptr());
TORCH_CHECK(status == cutlass::Status::kSuccess,
"Failed to initialize GEMM: status=", (int)status,
" workspace_size=", workspace_size, " num_experts=", num_experts,
" M=", M, " N=", N, " K=", K);
status = gemm_op.run(args, workspace.data_ptr(), stream);
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
}
template <typename OutType>
void run_fp4_blockwise_scaled_group_mm(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets, int M,
int N, int K) {
int32_t version_num = get_sm_version_num();
#if defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120
if (version_num >= 120 && version_num < 130) {
run_fp4_blockwise_scaled_group_mm_sm120(
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
expert_offsets, sf_offsets, M, N, K);
return;
}
#endif
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 #if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
if (version_num >= 100 && version_num < 120) {
run_fp4_blockwise_scaled_group_mm_sm100<OutType>(
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
expert_offsets, sf_offsets, M, N, K);
return;
}
#endif
TORCH_CHECK_NOT_IMPLEMENTED(
false,
"No compiled cutlass_fp4_group_mm kernel for CUDA device capability: ",
version_num, ". Required capability: 100 or 120");
}
#if (defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100) || \
(defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120)
constexpr auto FLOAT4_E2M1X2 = at::ScalarType::Byte; constexpr auto FLOAT4_E2M1X2 = at::ScalarType::Byte;
constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn; constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn;
#endif #endif
@ -374,7 +583,8 @@ void cutlass_fp4_group_mm(
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales, const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
const torch::Tensor& alphas, const torch::Tensor& problem_sizes, const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets) { const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets) {
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 #if (defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100) || \
(defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120)
// Input validation // Input validation
CHECK_INPUT(a, FLOAT4_E2M1X2, "a"); CHECK_INPUT(a, FLOAT4_E2M1X2, "a");
CHECK_INPUT(b, FLOAT4_E2M1X2, "b"); CHECK_INPUT(b, FLOAT4_E2M1X2, "b");
@ -408,6 +618,14 @@ void cutlass_fp4_group_mm(
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes, output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
expert_offsets, sf_offsets, M, N, K); expert_offsets, sf_offsets, M, N, K);
} else { } else {
#if defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120
int32_t version_num = get_sm_version_num();
if (version_num >= 120 && version_num < 130) {
TORCH_CHECK_NOT_IMPLEMENTED(
false, "SM120 NVFP4 MOE only supports bfloat16 output, got: ",
output.scalar_type());
}
#endif
run_fp4_blockwise_scaled_group_mm<cutlass::half_t>( run_fp4_blockwise_scaled_group_mm<cutlass::half_t>(
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes, output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
expert_offsets, sf_offsets, M, N, K); expert_offsets, sf_offsets, M, N, K);
@ -416,8 +634,8 @@ void cutlass_fp4_group_mm(
TORCH_CHECK_NOT_IMPLEMENTED( TORCH_CHECK_NOT_IMPLEMENTED(
false, false,
"No compiled cutlass_fp4_group_mm kernel, vLLM must " "No compiled cutlass_fp4_group_mm kernel, vLLM must "
"be compiled with ENABLE_NVFP4_SM100 for SM100+ and CUDA " "be compiled with ENABLE_NVFP4_SM100 or ENABLE_NVFP4_SM120 for SM100/120 "
"12.8 or above."); "and CUDA 12.8 or above.");
#endif #endif
} }

View File

@ -307,7 +307,7 @@ constexpr auto FLOAT = at::ScalarType::Float;
constexpr auto INT = at::ScalarType::Int; constexpr auto INT = at::ScalarType::Int;
constexpr auto UINT8 = at::ScalarType::Byte; constexpr auto UINT8 = at::ScalarType::Byte;
void scaled_fp4_experts_quant_sm100a( void scaled_fp4_experts_quant_sm1xxa(
torch::Tensor& output, torch::Tensor& output_scale, torch::Tensor& output, torch::Tensor& output_scale,
torch::Tensor const& input, torch::Tensor const& input_global_scale, torch::Tensor const& input, torch::Tensor const& input_global_scale,
torch::Tensor const& input_offset_by_experts, torch::Tensor const& input_offset_by_experts,

View File

@ -24,8 +24,9 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
torch::Tensor const& input_sf); torch::Tensor const& input_sf);
#endif #endif
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
void scaled_fp4_experts_quant_sm100a( (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
void scaled_fp4_experts_quant_sm1xxa(
torch::Tensor& output, torch::Tensor& output_scale, torch::Tensor& output, torch::Tensor& output_scale,
torch::Tensor const& input, torch::Tensor const& input_global_scale, torch::Tensor const& input, torch::Tensor const& input_global_scale,
torch::Tensor const& input_offset_by_experts, torch::Tensor const& input_offset_by_experts,
@ -54,8 +55,9 @@ void scaled_fp4_experts_quant(
torch::Tensor const& input, torch::Tensor const& input_global_scale, torch::Tensor const& input, torch::Tensor const& input_global_scale,
torch::Tensor const& input_offset_by_experts, torch::Tensor const& input_offset_by_experts,
torch::Tensor const& output_scale_offset_by_experts) { torch::Tensor const& output_scale_offset_by_experts) {
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
return scaled_fp4_experts_quant_sm100a( (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
return scaled_fp4_experts_quant_sm1xxa(
output, output_scale, input, input_global_scale, input_offset_by_experts, output, output_scale, input, input_global_scale, input_offset_by_experts,
output_scale_offset_by_experts); output_scale_offset_by_experts);
#endif #endif

View File

@ -15,6 +15,8 @@
*/ */
#include <torch/all.h> #include <torch/all.h>
#include <c10/cuda/CUDAGuard.h>
#include "cutlass_extensions/common.hpp"
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 #if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
void cutlass_scaled_fp4_mm_sm100a(torch::Tensor& D, torch::Tensor const& A, void cutlass_scaled_fp4_mm_sm100a(torch::Tensor& D, torch::Tensor const& A,
@ -32,23 +34,34 @@ void cutlass_scaled_fp4_mm_sm120a(torch::Tensor& D, torch::Tensor const& A,
torch::Tensor const& alpha); torch::Tensor const& alpha);
#endif #endif
void cutlass_scaled_fp4_mm(torch::Tensor& D, torch::Tensor const& A, void cutlass_scaled_fp4_mm(torch::Tensor& D, const torch::Tensor& A,
torch::Tensor const& B, torch::Tensor const& A_sf, const torch::Tensor& B, const torch::Tensor& A_sf,
torch::Tensor const& B_sf, const torch::Tensor& B_sf,
torch::Tensor const& alpha) { const torch::Tensor& alpha) {
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 // Make sure were on As device.
return cutlass_scaled_fp4_mm_sm100a(D, A, B, A_sf, B_sf, alpha); const c10::cuda::OptionalCUDAGuard device_guard(device_of(A));
#elif defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120 const int32_t sm = get_sm_version_num();
return cutlass_scaled_fp4_mm_sm120a(D, A, B, A_sf, B_sf, alpha);
#if defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100
if (sm >= 100 && sm < 120) {
cutlass_scaled_fp4_mm_sm100a(D, A, B, A_sf, B_sf, alpha);
return;
}
#endif #endif
TORCH_CHECK_NOT_IMPLEMENTED(false,
"No compiled nvfp4 mm kernel, vLLM should " #if defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120
"be compiled using CUDA 12.8 and target " if (sm >= 120 && sm < 130) {
"compute capability 100 or above."); cutlass_scaled_fp4_mm_sm120a(D, A, B, A_sf, B_sf, alpha);
return;
}
#endif
TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 mm kernel for SM ", sm,
". Recompile with CUDA >= 12.8 and CC >= 100.");
} }
bool cutlass_scaled_mm_supports_fp4(int64_t cuda_device_capability) { bool cutlass_scaled_mm_supports_fp4(int64_t cuda_device_capability) {
int runtimeVersion; int runtimeVersion;
cudaRuntimeGetVersion(&runtimeVersion); cudaRuntimeGetVersion(&runtimeVersion);
return cuda_device_capability >= 100 && runtimeVersion >= 12080; return cuda_device_capability >= 100 && runtimeVersion >= 12080;
} }

View File

@ -437,10 +437,10 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
for (int n_idx = 0; n_idx < WARP_NITER; ++n_idx) { for (int n_idx = 0; n_idx < WARP_NITER; ++n_idx) {
#pragma unroll #pragma unroll
for (int k_idx = 0; k_idx < 2; ++k_idx) { for (int k_idx = 0; k_idx < 2; ++k_idx) {
FType low16 = FType low16 = MarlinScalarType2<FType>::float2num(
ScalarType<FType>::float2num(C_frag[m_idx][n_idx][k_idx * 2]); C_frag[m_idx][n_idx][k_idx * 2]);
FType high16 = FType high16 = MarlinScalarType2<FType>::float2num(
ScalarType<FType>::float2num(C_frag[m_idx][n_idx][k_idx * 2 + 1]); C_frag[m_idx][n_idx][k_idx * 2 + 1]);
uint32_t tmp = (reinterpret_cast<uint32_t&>(low16) & 0xffff) | uint32_t tmp = (reinterpret_cast<uint32_t&>(low16) & 0xffff) |
(reinterpret_cast<uint32_t&>(high16) << 16); (reinterpret_cast<uint32_t&>(high16) << 16);
int sts_offset = int sts_offset =

View File

@ -8,7 +8,7 @@
#include <cuda_bf16.h> #include <cuda_bf16.h>
#include <iostream> #include <iostream>
#include "../gptq_marlin/marlin_dtypes.cuh" #include "../gptq_marlin/marlin_dtypes.cuh"
using marlin::ScalarType; using marlin::MarlinScalarType2;
namespace allspark { namespace allspark {
@ -72,10 +72,10 @@ __global__ void f16_gemm_splitk_reduce_kernel(const FType* C_split, FType* C,
int n_mat = N_MATRIX > 0 ? N_MATRIX : (int)n_matrix; int n_mat = N_MATRIX > 0 ? N_MATRIX : (int)n_matrix;
for (int i = 0; i < n_mat; ++i) { for (int i = 0; i < n_mat; ++i) {
sum += ScalarType<FType>::num2float(C_split[idx + i * matrix_size]); sum += MarlinScalarType2<FType>::num2float(C_split[idx + i * matrix_size]);
} }
C[idx] = ScalarType<FType>::float2num(sum); C[idx] = MarlinScalarType2<FType>::float2num(sum);
} }
template <typename FType> template <typename FType>

View File

@ -1 +1,2 @@
kernel_*.cu sm*_kernel_*.cu
kernel_selector.h

View File

@ -4,14 +4,16 @@
namespace marlin { namespace marlin {
template <int const num_threads, int const num_bits> template <int const num_threads, int const num_bits, bool is_a_8bit>
__global__ void awq_marlin_repack_kernel( __global__ void awq_marlin_repack_kernel(
uint32_t const* __restrict__ b_q_weight_ptr, uint32_t* __restrict__ out_ptr, uint32_t const* __restrict__ b_q_weight_ptr, uint32_t* __restrict__ out_ptr,
int size_k, int size_n) { int size_k, int size_n) {
constexpr int pack_factor = 32 / num_bits; constexpr int pack_factor = 32 / num_bits;
int k_tiles = size_k / tile_k_size; constexpr int target_tile_n_size = tile_n_size / (is_a_8bit ? 2 : 1);
int n_tiles = size_n / tile_n_size; constexpr int target_tile_k_size = tile_k_size * (is_a_8bit ? 2 : 1);
int k_tiles = size_k / target_tile_k_size;
int n_tiles = size_n / target_tile_n_size;
int block_k_tiles = div_ceil(k_tiles, gridDim.x); int block_k_tiles = div_ceil(k_tiles, gridDim.x);
auto start_k_tile = blockIdx.x * block_k_tiles; auto start_k_tile = blockIdx.x * block_k_tiles;
@ -33,10 +35,10 @@ __global__ void awq_marlin_repack_kernel(
extern __shared__ int4 sh[]; extern __shared__ int4 sh[];
constexpr int tile_n_ints = tile_n_size / pack_factor; constexpr int tile_n_ints = target_tile_n_size / pack_factor;
constexpr int stage_n_threads = tile_n_ints / 4; constexpr int stage_n_threads = tile_n_ints / 4;
constexpr int stage_k_threads = tile_k_size; constexpr int stage_k_threads = target_tile_k_size;
constexpr int stage_size = stage_k_threads * stage_n_threads; constexpr int stage_size = stage_k_threads * stage_n_threads;
auto fetch_to_shared = [&](int pipe, int k_tile_id, int n_tile_id) { auto fetch_to_shared = [&](int pipe, int k_tile_id, int n_tile_id) {
@ -45,7 +47,7 @@ __global__ void awq_marlin_repack_kernel(
return; return;
} }
int first_n = n_tile_id * tile_n_size; int first_n = n_tile_id * target_tile_n_size;
int first_n_packed = first_n / pack_factor; int first_n_packed = first_n / pack_factor;
int4* sh_ptr = sh + stage_size * pipe; int4* sh_ptr = sh + stage_size * pipe;
@ -54,7 +56,7 @@ __global__ void awq_marlin_repack_kernel(
auto k_id = threadIdx.x / stage_n_threads; auto k_id = threadIdx.x / stage_n_threads;
auto n_id = threadIdx.x % stage_n_threads; auto n_id = threadIdx.x % stage_n_threads;
int first_k = k_tile_id * tile_k_size; int first_k = k_tile_id * target_tile_k_size;
cp_async4(&sh_ptr[k_id * stage_n_threads + n_id], cp_async4(&sh_ptr[k_id * stage_n_threads + n_id],
reinterpret_cast<int4 const*>( reinterpret_cast<int4 const*>(
@ -78,11 +80,11 @@ __global__ void awq_marlin_repack_kernel(
} }
int tc_col = th_id / 4; int tc_col = th_id / 4;
int tc_row = (th_id % 4) * 2; int tc_row = (th_id % 4) * (is_a_8bit ? 4 : 2);
constexpr int tc_offsets[4] = {0, 1, 8, 9}; constexpr int tc_offsets[4] = {0, 1, 8, 9};
int cur_n = warp_id * 16 + tc_col; int cur_n = (warp_id / (is_a_8bit ? 2 : 1)) * 16 + tc_col;
int cur_n_packed = cur_n / pack_factor; int cur_n_packed = cur_n / pack_factor;
int cur_n_pos = cur_n % pack_factor; int cur_n_pos = cur_n % pack_factor;
@ -105,23 +107,50 @@ __global__ void awq_marlin_repack_kernel(
uint32_t vals[8]; uint32_t vals[8];
#pragma unroll #pragma unroll
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
int cur_elem = tc_row + tc_offsets[i]; if constexpr (is_a_8bit) {
int cur_elem = tc_row + i;
int packed_src_0 = sh_stage_int_ptr[cur_n_packed + sh_stride * cur_elem]; int packed_src_0 =
int packed_src_1 = sh_stage_int_ptr[cur_n_packed + (8 / pack_factor) + sh_stage_int_ptr[cur_n_packed + (8 / pack_factor) * (warp_id % 2) +
sh_stride * cur_elem]; sh_stride * cur_elem];
int packed_src_1 =
sh_stage_int_ptr[cur_n_packed + (8 / pack_factor) * (warp_id % 2) +
sh_stride * (cur_elem + 16)];
vals[i] = (packed_src_0 >> (cur_n_pos_unpacked * num_bits)) & mask; vals[i] = (packed_src_0 >> (cur_n_pos_unpacked * num_bits)) & mask;
vals[4 + i] = (packed_src_1 >> (cur_n_pos_unpacked * num_bits)) & mask; vals[4 + i] = (packed_src_1 >> (cur_n_pos_unpacked * num_bits)) & mask;
} else {
int cur_elem = tc_row + tc_offsets[i];
int packed_src_0 =
sh_stage_int_ptr[cur_n_packed + sh_stride * cur_elem];
int packed_src_1 = sh_stage_int_ptr[cur_n_packed + (8 / pack_factor) +
sh_stride * cur_elem];
vals[i] = (packed_src_0 >> (cur_n_pos_unpacked * num_bits)) & mask;
vals[4 + i] = (packed_src_1 >> (cur_n_pos_unpacked * num_bits)) & mask;
}
} }
constexpr int tile_size = tile_k_size * tile_n_size / pack_factor; constexpr int tile_size =
target_tile_k_size * target_tile_n_size / pack_factor;
int out_offset = (k_tile_id * n_tiles + n_tile_id) * tile_size; int out_offset = (k_tile_id * n_tiles + n_tile_id) * tile_size;
// Result of: // Result of:
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
if constexpr (num_bits == 4) { if constexpr (!is_a_8bit && num_bits == 4) {
constexpr int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
uint32_t res = 0;
#pragma unroll
for (int i = 0; i < 8; i++) {
res |= vals[pack_idx[i]] << (i * 4);
}
out_ptr[out_offset + th_id * 4 + warp_id] = res;
} else if constexpr (is_a_8bit && num_bits == 4) {
int pack_idx[8] = {0, 4, 1, 5, 2, 6, 3, 7};
uint32_t res = 0; uint32_t res = 0;
#pragma unroll #pragma unroll
@ -138,8 +167,9 @@ __global__ void awq_marlin_repack_kernel(
uint32_t res2 = 0; uint32_t res2 = 0;
#pragma unroll #pragma unroll
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
res1 |= vals[pack_idx[i]] << (i * 8); const int ii = is_a_8bit ? i : pack_idx[i];
res2 |= vals[4 + pack_idx[i]] << (i * 8); res1 |= vals[ii] << (i * 8);
res2 |= vals[4 + ii] << (i * 8);
} }
out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 0] = res1; out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 0] = res1;
@ -176,18 +206,21 @@ __global__ void awq_marlin_repack_kernel(
} // namespace marlin } // namespace marlin
#define CALL_IF(NUM_BITS) \ #define CALL_IF(NUM_BITS, IS_A_8BIT) \
else if (num_bits == NUM_BITS) { \ else if (num_bits == NUM_BITS && is_a_8bit == IS_A_8BIT) { \
cudaFuncSetAttribute( \ cudaFuncSetAttribute( \
marlin::awq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS>, \ marlin::awq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS, \
cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \ IS_A_8BIT>, \
marlin::awq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS> \ cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \
<<<blocks, marlin::repack_threads, max_shared_mem, stream>>>( \ marlin::awq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS, \
b_q_weight_ptr, out_ptr, size_k, size_n); \ IS_A_8BIT> \
<<<blocks, marlin::repack_threads, max_shared_mem, stream>>>( \
b_q_weight_ptr, out_ptr, size_k, size_n); \
} }
torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k, torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
int64_t size_n, int64_t num_bits) { int64_t size_n, int64_t num_bits,
bool is_a_8bit) {
// Verify compatibility with marlin tile of 16x64 // Verify compatibility with marlin tile of 16x64
TORCH_CHECK(size_k % marlin::tile_k_size == 0, "size_k = ", size_k, TORCH_CHECK(size_k % marlin::tile_k_size == 0, "size_k = ", size_k,
" is not divisible by tile_k_size = ", marlin::tile_k_size); " is not divisible by tile_k_size = ", marlin::tile_k_size);
@ -238,10 +271,13 @@ torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
if (false) { if (false) {
} }
CALL_IF(4) CALL_IF(4, false)
CALL_IF(8) CALL_IF(8, false)
CALL_IF(4, true)
CALL_IF(8, true)
else { else {
TORCH_CHECK(false, "Unsupported repack config: num_bits = ", num_bits); TORCH_CHECK(false, "Unsupported repack config: num_bits = ", num_bits,
", is_a_8bit = ", is_a_8bit);
} }
return out; return out;

View File

@ -470,6 +470,50 @@ __device__ inline void dequant<nv_bfloat162, vllm::kFE2M1f.id(), false>(
frag_b[0] = __hmul2(frag_b[0], bias_reg); frag_b[0] = __hmul2(frag_b[0], bias_reg);
} }
template <>
__device__ inline void dequant<__nv_fp8x4_e4m3, vllm::kFE2M1f.id(), true>(
int q, __nv_fp8x4_e4m3* frag_b) {
// Constants for FP4 (E2M1) and FP16 formats
constexpr int FP4_EXPONENT = 2, FP8_EXPONENT = 4;
constexpr int RIGHT_SHIFT = FP8_EXPONENT - FP4_EXPONENT;
constexpr int MASK = 0x70707070;
// Extract and shift FP4 values to FP16 format
int Out1 = (q & 0x80808080) | ((q & MASK) >> RIGHT_SHIFT);
q <<= 4;
int Out2 = (q & 0x80808080) | ((q & MASK) >> RIGHT_SHIFT);
// Note1: reverse indexing is intentional because weights are permuted
// Note2: when dequant to 8bit type, we write to `frag_b[2]` instead of
// `frag_b[1]` to fit the layout of tensorcore
frag_b[1] = *reinterpret_cast<const __nv_fp8x4_e4m3*>(&Out1);
frag_b[0] = *reinterpret_cast<const __nv_fp8x4_e4m3*>(&Out2);
}
template <>
__device__ inline void dequant<int32_t, vllm::kU4B8.id(), true>(
int q, int32_t* frag_b) {
constexpr int repeated_zp = 0x08080808;
constexpr int MASK = 0x80808080;
frag_b[0] = ((q & 0x0F0F0F0F | MASK) - repeated_zp) ^ MASK;
q >>= 4;
frag_b[1] = ((q & 0x0F0F0F0F | MASK) - repeated_zp) ^ MASK;
}
template <>
__device__ inline void dequant<__nv_fp8x4_e4m3, vllm::kU4B8.id(), true>(
int q, __nv_fp8x4_e4m3* frag_b) {
int s = q & 0x08080808;
int Out1 = ((q & 0x07070707) | (s << 4)) + (s >> 3);
q >>= 4;
s = q & 0x08080808;
int Out2 = ((q & 0x07070707) | (s << 4)) + (s >> 3);
frag_b[0] = *reinterpret_cast<const __nv_fp8x4_e4m3*>(&Out1);
frag_b[1] = *reinterpret_cast<const __nv_fp8x4_e4m3*>(&Out2);
}
template <typename scalar_t2, vllm::ScalarTypeId s_type_id> template <typename scalar_t2, vllm::ScalarTypeId s_type_id>
__device__ inline void dequant_fp8_scales(int q, scalar_t2* frag_b); __device__ inline void dequant_fp8_scales(int q, scalar_t2* frag_b);
@ -515,6 +559,49 @@ __device__ inline void dequant_fp8_scales<nv_bfloat162, vllm::kFE8M0fnu.id()>(
// Note: reverse indexing is intentional because weights are permuted // Note: reverse indexing is intentional because weights are permuted
frag_b[1] = *reinterpret_cast<const nv_bfloat162*>(&Out1); frag_b[1] = *reinterpret_cast<const nv_bfloat162*>(&Out1);
frag_b[0] = *reinterpret_cast<const nv_bfloat162*>(&Out2); frag_b[0] = *reinterpret_cast<const nv_bfloat162*>(&Out2);
};
// subtract zero point in quanted format and then dequant
template <typename scalar_t2, vllm::ScalarTypeId w_type_id,
bool skip_flop = false>
__device__ inline void sub_zp_and_dequant(int q, scalar_t2* frag_b, int zp);
template <>
__device__ inline void sub_zp_and_dequant<int32_t, vllm::kU4.id(), true>(
int q, int32_t* frag_b, int zp) {
// INT4 with zp -> INT8
// see https://github.com/vllm-project/vllm/pull/24722
int repeated_zp = 0x01010101 * zp;
int MASK = 0x80808080;
frag_b[0] = ((q & 0x0F0F0F0F | MASK) - repeated_zp) ^ MASK;
q >>= 4;
frag_b[1] = ((q & 0x0F0F0F0F | MASK) - repeated_zp) ^ MASK;
}
template <>
__device__ inline void sub_zp_and_dequant<__nv_fp8x4_e4m3, vllm::kU4.id(),
true>(int q, __nv_fp8x4_e4m3* frag_b,
int zp) {
// INT4 with zp -> FP8
// see https://github.com/vllm-project/vllm/pull/24722
uint32_t u_q = *reinterpret_cast<uint32_t*>(&q);
uint32_t u_zp = *reinterpret_cast<uint32_t*>(&zp);
uint32_t u_zp1 = u_zp + 1;
uint32_t repeated_zp = 0x01010101 * u_zp;
uint32_t q0, s;
q0 = (u_q & 0x0F0F0F0F) | 0x70707070;
s = (q0 + repeated_zp) & 0x80808080;
uint32_t Out1 = (q0 + (s >> 7) * u_zp1) & 0x0F0F0F0F | s;
u_q >>= 4;
q0 = (u_q & 0x0F0F0F0F) | 0x70707070;
s = (q0 + repeated_zp) & 0x80808080;
uint32_t Out2 = (q0 + (s >> 7) * u_zp1) & 0x0F0F0F0F | s;
frag_b[0] = *reinterpret_cast<const __nv_fp8x4_e4m3*>(&Out1);
frag_b[1] = *reinterpret_cast<const __nv_fp8x4_e4m3*>(&Out2);
} }
#endif #endif

View File

@ -4,141 +4,292 @@ import glob
import itertools import itertools
import os import os
import subprocess import subprocess
import sys
import jinja2 import jinja2
FILE_HEAD = """ ARCHS = []
// auto generated by generate.py SUPPORT_FP8 = False
// clang-format off for arch in sys.argv[1].split(","):
arch = arch[: arch.index(".") + 2].replace(".", "")
arch = int(arch)
# only SM89 and SM120 fully support
# mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32.
# SM90 and SM100 can use this PTX, but its simulated
# with FP16 MMA, so it cannot achieve any acceleration.
if arch in [89, 120]:
SUPPORT_FP8 = True
FILE_HEAD_COMMENT = """
// auto generated by generate_kernels.py
// clang-format off
""".lstrip()
FILE_HEAD = (
FILE_HEAD_COMMENT
+ """
#include "kernel.h" #include "kernel.h"
#include "marlin_template.h" #include "marlin_template.h"
namespace MARLIN_NAMESPACE_NAME { namespace MARLIN_NAMESPACE_NAME {
""".strip() """
)
TEMPLATE = ( TEMPLATE = (
"template __global__ void Marlin<" "template __global__ void Marlin<"
"{{scalar_t}}, " "{{a_type_id}}, "
"{{w_type_id}}, " "{{b_type_id}}, "
"{{c_type_id}}, "
"{{s_type_id}}, " "{{s_type_id}}, "
"{{threads}}, " "{{threads}}, "
"{{thread_m_blocks}}, " "{{thread_m_blocks}}, "
"{{thread_n_blocks}}, " "{{thread_n_blocks}}, "
"{{thread_k_blocks}}, " "{{thread_k_blocks}}, "
"{{'true' if m_block_size_8 else 'false'}}, " "{{m_block_size_8}}, "
"{{stages}}, " "{{stages}}, "
"{{group_blocks}}, " "{{group_blocks}}, "
"{{'true' if is_zp_float else 'false'}}>" "{{is_zp_float}}>"
"( MARLIN_KERNEL_PARAMS );" "( MARLIN_KERNEL_PARAMS );"
) )
# int8 with zero point case (vllm::kU8) is also supported,
# we don't add it to reduce wheel size.
SCALAR_TYPES = [
"vllm::kU4",
"vllm::kU4B8",
"vllm::kU8B128",
"vllm::kFE4M3fn",
"vllm::kFE2M1f",
]
THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128), (128, 64, 128)] THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128), (128, 64, 128)]
THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4] THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]
# group_blocks:
# = 0 : act order case QUANT_CONFIGS = [
# = -1 : channelwise quantization # AWQ-INT4
# > 0 : group_size=16*group_blocks {
GROUP_BLOCKS = [0, 1, -1, 2, 4, 8] "b_type": "kU4",
DTYPES = ["fp16", "bf16"] "thread_configs": THREAD_CONFIGS,
"thread_m_blocks": THREAD_M_BLOCKS,
"group_blocks": [-1, 2, 4, 8],
},
# HQQ
{
"a_type": ["kFloat16"],
"b_type": "kU4",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": THREAD_M_BLOCKS,
"group_blocks": [4],
"is_zp_float": True,
},
# GPTQ-INT4
{
"b_type": "kU4B8",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": THREAD_M_BLOCKS,
"group_blocks": [-1, 0, 2, 4, 8],
},
# GPTQ-INT8
{
"b_type": "kU8B128",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": THREAD_M_BLOCKS,
"group_blocks": [-1, 0, 2, 4, 8],
},
# FP8
{
"b_type": "kFE4M3fn",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": THREAD_M_BLOCKS,
"group_blocks": [-1, 8],
},
# NVFP4
{
"b_type": "kFE2M1f",
"s_type": "kFE4M3fn",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": THREAD_M_BLOCKS,
"group_blocks": [1],
},
# MXFP4
{
"a_type": ["kBFloat16"],
"b_type": "kFE2M1f",
"s_type": "kFE8M0fnu",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": THREAD_M_BLOCKS,
"group_blocks": [2],
},
# AWQ-INT4 with INT8 activation
{
"a_type": ["kS8"],
"b_type": "kU4",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": [1, 2, 3, 4],
"group_blocks": [-1, 2, 4, 8],
},
# GPTQ-INT4 with INT8 activation
{
"a_type": ["kS8"],
"b_type": "kU4B8",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": [1, 2, 3, 4],
"group_blocks": [-1, 2, 4, 8],
},
# GPTQ-INT4 with FP8 activation
{
"a_type": ["kFE4M3fn"],
"b_type": "kU4B8",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": [1, 2, 3, 4],
"group_blocks": [-1, 2, 4, 8],
},
# AWQ-INT4 with FP8 activation
{
"a_type": ["kFE4M3fn"],
"b_type": "kU4",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": [1, 2, 3, 4],
"group_blocks": [-1, 2, 4, 8],
},
# MXFP4 with FP8 activation
{
"a_type": ["kFE4M3fn"],
"b_type": "kFE2M1f",
"c_type": ["kBFloat16"],
"s_type": "kFE8M0fnu",
"thread_configs": THREAD_CONFIGS,
"thread_m_blocks": [1, 2, 3, 4],
"group_blocks": [2],
},
]
def remove_old_kernels(): def remove_old_kernels():
for filename in glob.glob(os.path.dirname(__file__) + "/kernel_*.cu"): for filename in glob.glob(os.path.dirname(__file__) + "/*kernel_*.cu"):
subprocess.call(["rm", "-f", filename]) subprocess.call(["rm", "-f", filename])
filename = os.path.dirname(__file__) + "/kernel_selector.h"
subprocess.call(["rm", "-f", filename])
def generate_new_kernels(): def generate_new_kernels():
for scalar_type, dtype in itertools.product(SCALAR_TYPES, DTYPES): result_dict = {}
for quant_config in QUANT_CONFIGS:
c_types = quant_config.get("c_type", ["kFloat16", "kBFloat16"])
a_types = quant_config.get("a_type", ["kFloat16", "kBFloat16"])
b_type = quant_config["b_type"]
is_zp_float = quant_config.get("is_zp_float", False)
all_group_blocks = quant_config["group_blocks"]
all_m_blocks = quant_config["thread_m_blocks"]
all_thread_configs = quant_config["thread_configs"]
for a_type, c_type in itertools.product(a_types, c_types):
if not SUPPORT_FP8 and a_type == "kFE4M3fn":
continue
if "16" in a_type and "16" in c_type and a_type != c_type:
continue
s_type = quant_config.get("s_type", c_type)
if (a_type, b_type, c_type) not in result_dict:
result_dict[(a_type, b_type, c_type)] = []
for group_blocks, m_blocks, thread_configs in itertools.product(
all_group_blocks, all_m_blocks, all_thread_configs
):
thread_k, thread_n, threads = thread_configs
if threads == 256:
# for small batch (m_blocks == 1),
# we only need (128, 128, 256)
# for large batch (m_blocks > 1),
# we only need (64, 256, 256)
if m_blocks <= 1 and (thread_k, thread_n) != (128, 128):
continue
if m_blocks > 1 and (thread_k, thread_n) != (64, 256):
continue
config = {
"threads": threads,
"s_type": s_type,
"thread_m_blocks": max(m_blocks, 1),
"thread_k_blocks": thread_k // 16,
"thread_n_blocks": thread_n // 16,
"m_block_size_8": "true" if m_blocks == 0.5 else "false",
"stages": "pipe_stages",
"group_blocks": group_blocks,
"is_zp_float": "true" if is_zp_float else "false",
}
result_dict[(a_type, b_type, c_type)].append(config)
kernel_selector_str = FILE_HEAD_COMMENT
for (a_type, b_type, c_type), config_list in result_dict.items():
all_template_str_list = [] all_template_str_list = []
for config in config_list:
s_type = config["s_type"]
template_str = jinja2.Template(TEMPLATE).render(
a_type_id=f"vllm::{a_type}.id()",
b_type_id=f"vllm::{b_type}.id()",
c_type_id=f"vllm::{c_type}.id()",
s_type_id=f"vllm::{s_type}.id()",
**config,
)
all_template_str_list.append(template_str)
for group_blocks, m_blocks, thread_configs in itertools.product( conditions = [
GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS f"a_type == vllm::{a_type}",
): f"b_type == vllm::{b_type}",
# act order case only support gptq-int4 and gptq-int8 f"c_type == vllm::{c_type}",
if group_blocks == 0 and scalar_type not in [ f"s_type == vllm::{s_type}",
"vllm::kU4B8", f"threads == {config['threads']}",
"vllm::kU8B128", f"thread_m_blocks == {config['thread_m_blocks']}",
]: f"thread_n_blocks == {config['thread_n_blocks']}",
continue f"thread_k_blocks == {config['thread_k_blocks']}",
if thread_configs[2] == 256: f"m_block_size_8 == {config['m_block_size_8']}",
# for small batch (m_blocks == 1), we only need (128, 128, 256) f"group_blocks == {config['group_blocks']}",
# for large batch (m_blocks > 1), we only need (64, 256, 256) f"is_zp_float == {config['is_zp_float']}",
if m_blocks <= 1 and thread_configs[0] != 128: ]
continue conditions = " && ".join(conditions)
if m_blocks > 1 and thread_configs[0] != 64:
continue
# we only support channelwise quantization and group_size == 128 if kernel_selector_str == FILE_HEAD_COMMENT:
# for fp8 kernel_selector_str += f"if ({conditions})\n kernel = "
if scalar_type == "vllm::kFE4M3fn" and group_blocks not in [-1, 8]: else:
continue kernel_selector_str += f"else if ({conditions})\n kernel = "
# nvfp4 only supports group_size == 16
# mxfp4 only supports group_size == 32
if scalar_type == "vllm::kFE2M1f" and group_blocks not in [1, 2]:
continue
# other quantization methods don't support group_size = 16
if scalar_type != "vllm::kFE2M1f" and group_blocks == 1:
continue
k_blocks = thread_configs[0] // 16 kernel_template2 = (
n_blocks = thread_configs[1] // 16 "Marlin<{{a_type_id}}, {{b_type_id}}, {{c_type_id}}, "
threads = thread_configs[2] "{{s_type_id}}, {{threads}}, {{thread_m_blocks}}, "
"{{thread_n_blocks}}, {{thread_k_blocks}}, "
"{{m_block_size_8}}, {{stages}}, {{group_blocks}}, "
"{{is_zp_float}}>;"
)
c_dtype = "half" if dtype == "fp16" else "nv_bfloat16" kernel_selector_str += (
jinja2.Template(kernel_template2).render(
is_zp_float_list = [False] a_type_id=f"vllm::{a_type}.id()",
if dtype == "fp16" and scalar_type == "vllm::kU4" and group_blocks == 4: b_type_id=f"vllm::{b_type}.id()",
# HQQ (is_zp_float = true) only supports c_type_id=f"vllm::{c_type}.id()",
# 4bit quantization and fp16 s_type_id=f"vllm::{s_type}.id()",
is_zp_float_list.append(True) **config,
if scalar_type == "vllm::kFE2M1f" and group_blocks == 1:
s_type = "vllm::kFE4M3fn"
elif scalar_type == "vllm::kFE2M1f" and group_blocks == 2:
s_type = "vllm::kFE8M0fnu"
if dtype == "fp16":
# we cannot safely dequantize e8m0 to fp16, so skip this
continue
elif dtype == "fp16":
s_type = "vllm::kFloat16"
elif dtype == "bf16":
s_type = "vllm::kBFloat16"
for is_zp_float in is_zp_float_list:
template_str = jinja2.Template(TEMPLATE).render(
scalar_t=c_dtype,
w_type_id=scalar_type + ".id()",
s_type_id=s_type + ".id()",
threads=threads,
thread_m_blocks=max(m_blocks, 1),
thread_n_blocks=n_blocks,
thread_k_blocks=k_blocks,
m_block_size_8=m_blocks == 0.5,
stages="pipe_stages",
group_blocks=group_blocks,
is_zp_float=is_zp_float,
) )
+ "\n"
all_template_str_list.append(template_str) )
file_content = FILE_HEAD + "\n\n" file_content = FILE_HEAD + "\n\n"
file_content += "\n\n".join(all_template_str_list) + "\n\n}\n" file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
filename = f"kernel_{dtype}_{scalar_type[6:].lower()}.cu" if a_type == "kFE4M3fn":
filename = f"sm89_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
else:
filename = f"sm80_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
filename = filename.lower()
with open(os.path.join(os.path.dirname(__file__), filename), "w") as f: with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
f.write(file_content) f.write(file_content)
if not SUPPORT_FP8 and kernel_selector_str != FILE_HEAD_COMMENT:
kernel_selector_str += (
"else if (a_type == vllm::kFE4M3fn)\n"
" TORCH_CHECK(false, "
'"marlin kernel with fp8 activation is not built.");'
)
with open(os.path.join(os.path.dirname(__file__), "kernel_selector.h"), "w") as f:
f.write(kernel_selector_str)
if __name__ == "__main__": if __name__ == "__main__":
remove_old_kernels() remove_old_kernels()

View File

@ -53,7 +53,7 @@ torch::Tensor gptq_marlin_gemm(
std::optional<torch::Tensor> const& b_zeros_or_none, std::optional<torch::Tensor> const& b_zeros_or_none,
std::optional<torch::Tensor> const& g_idx_or_none, std::optional<torch::Tensor> const& g_idx_or_none,
std::optional<torch::Tensor> const& perm_or_none, torch::Tensor& workspace, std::optional<torch::Tensor> const& perm_or_none, torch::Tensor& workspace,
vllm::ScalarTypeId const& b_q_type_id, int64_t size_m, int64_t size_n, vllm::ScalarTypeId const& b_type_id, int64_t size_m, int64_t size_n,
int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce, int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce,
bool is_zp_float) { bool is_zp_float) {
TORCH_CHECK_NOT_IMPLEMENTED(false, TORCH_CHECK_NOT_IMPLEMENTED(false,
@ -243,204 +243,29 @@ bool is_valid_config(thread_config_t const& th_config, int thread_m_blocks,
int cache_size = get_kernel_cache_size( int cache_size = get_kernel_cache_size(
th_config, thread_m_blocks, prob_m, prob_n, prob_k, num_bits, group_size, th_config, thread_m_blocks, prob_m, prob_n, prob_k, num_bits, group_size,
has_act_order, is_k_full, has_zp, is_zp_float); has_act_order, is_k_full, has_zp, is_zp_float);
return cache_size + 512 <= max_shared_mem; return cache_size <= max_shared_mem;
} }
#define _GET_IF(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ MarlinFuncPtr get_marlin_kernel(
M_BLOCK_SIZE_8, GROUP_BLOCKS, NUM_THREADS, IS_ZP_FLOAT) \ const vllm::ScalarType a_type, const vllm::ScalarType b_type,
else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS && \ const vllm::ScalarType c_type, const vllm::ScalarType s_type,
thread_n_blocks == THREAD_N_BLOCKS && \ int thread_m_blocks, int thread_n_blocks, int thread_k_blocks,
thread_k_blocks == THREAD_K_BLOCKS && \ bool m_block_size_8, bool has_act_order, bool has_zp, int group_blocks,
m_block_size_8 == M_BLOCK_SIZE_8 && \ int threads, bool is_zp_float) {
group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS && \ int num_bits = b_type.size_bits();
is_zp_float == IS_ZP_FLOAT) { \
constexpr auto S_TYPE = \
W_TYPE == vllm::kFE2M1f \
? (GROUP_BLOCKS == 1 ? vllm::kFE4M3fn : vllm::kFE8M0fnu) \
: (std::is_same<scalar_t, half>::value ? vllm::kFloat16 \
: vllm::kBFloat16); \
kernel = Marlin<scalar_t, W_TYPE.id(), S_TYPE.id(), NUM_THREADS, \
THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
M_BLOCK_SIZE_8, pipe_stages, GROUP_BLOCKS, IS_ZP_FLOAT>; \
}
// COMMON: cases for (group_blocks in [-1, 2, 4, 8] and is_zp_float == false)
// this is the most common cases
// BIGGROUP: cases for big group size (group_blocks in [-1, 8])
// FZP: cases for float-zero-point (is_zp_float = true)
// ACT: cases for act order case (group_blocks == 0)
// FP4: cases for nvfp4(e2m1) (group_blocks == 1)
#define COMMON_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 4, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 8, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)
#define COMMON_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \
\
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \
\
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)
#define COMMON_GET_IF(W_TYPE) \
COMMON_GET_IF_M1(W_TYPE, 8, 8, 256) \
COMMON_GET_IF_M1(W_TYPE, 8, 4, 128) \
COMMON_GET_IF_M1(W_TYPE, 4, 8, 128) \
COMMON_GET_IF_M234(W_TYPE, 16, 4, 256) \
COMMON_GET_IF_M234(W_TYPE, 8, 4, 128) \
COMMON_GET_IF_M234(W_TYPE, 4, 8, 128)
#define BIGGROUP_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 8, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)
#define BIGGROUP_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)
#define BIGGROUP_GET_IF(W_TYPE) \
BIGGROUP_GET_IF_M1(W_TYPE, 8, 8, 256) \
BIGGROUP_GET_IF_M1(W_TYPE, 8, 4, 128) \
BIGGROUP_GET_IF_M1(W_TYPE, 4, 8, 128) \
BIGGROUP_GET_IF_M234(W_TYPE, 16, 4, 256) \
BIGGROUP_GET_IF_M234(W_TYPE, 8, 4, 128) \
BIGGROUP_GET_IF_M234(W_TYPE, 4, 8, 128)
#define NVFP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false)
#define NVFP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false)
#define NVFP4_GET_IF(W_TYPE) \
NVFP4_GET_IF_M1(W_TYPE, 8, 8, 256) \
NVFP4_GET_IF_M1(W_TYPE, 8, 4, 128) \
NVFP4_GET_IF_M1(W_TYPE, 4, 8, 128) \
NVFP4_GET_IF_M234(W_TYPE, 16, 4, 256) \
NVFP4_GET_IF_M234(W_TYPE, 8, 4, 128) \
NVFP4_GET_IF_M234(W_TYPE, 4, 8, 128)
#define MXFP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false)
#define MXFP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false)
#define MXFP4_GET_IF(W_TYPE) \
MXFP4_GET_IF_M1(W_TYPE, 8, 8, 256) \
MXFP4_GET_IF_M1(W_TYPE, 8, 4, 128) \
MXFP4_GET_IF_M1(W_TYPE, 4, 8, 128) \
MXFP4_GET_IF_M234(W_TYPE, 16, 4, 256) \
MXFP4_GET_IF_M234(W_TYPE, 8, 4, 128) \
MXFP4_GET_IF_M234(W_TYPE, 4, 8, 128)
// We currently have 4-bit models only with group_blocks == 4
#define FZP_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 4, NUM_THREADS, true) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true)
#define FZP_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true)
#define FZP_GET_IF(W_TYPE) \
FZP_GET_IF_M1(W_TYPE, 8, 8, 256) \
FZP_GET_IF_M1(W_TYPE, 8, 4, 128) \
FZP_GET_IF_M1(W_TYPE, 4, 8, 128) \
FZP_GET_IF_M234(W_TYPE, 16, 4, 256) \
FZP_GET_IF_M234(W_TYPE, 8, 4, 128) \
FZP_GET_IF_M234(W_TYPE, 4, 8, 128)
// We currently have 4-bit models only with group_blocks == 4
#define ACT_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false)
#define ACT_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false)
#define ACT_GET_IF(W_TYPE) \
ACT_GET_IF_M1(W_TYPE, 8, 8, 256) \
ACT_GET_IF_M1(W_TYPE, 8, 4, 128) \
ACT_GET_IF_M1(W_TYPE, 4, 8, 128) \
ACT_GET_IF_M234(W_TYPE, 16, 4, 256) \
ACT_GET_IF_M234(W_TYPE, 8, 4, 128) \
ACT_GET_IF_M234(W_TYPE, 4, 8, 128)
template <typename scalar_t>
MarlinFuncPtr get_marlin_kernel(const vllm::ScalarType q_type,
int thread_m_blocks, int thread_n_blocks,
int thread_k_blocks, bool m_block_size_8,
bool has_act_order, bool has_zp,
int group_blocks, int num_threads,
bool is_zp_float) {
int num_bits = q_type.size_bits();
auto kernel = MarlinDefault; auto kernel = MarlinDefault;
if (false) {
}
COMMON_GET_IF(vllm::kU4) #include "kernel_selector.h"
COMMON_GET_IF(vllm::kU4B8)
COMMON_GET_IF(vllm::kU8B128)
NVFP4_GET_IF(vllm::kFE2M1f)
BIGGROUP_GET_IF(vllm::kFE4M3fn)
ACT_GET_IF(vllm::kU4B8)
ACT_GET_IF(vllm::kU8B128)
if (std::is_same<scalar_t, half>::value) {
if (false) {
}
FZP_GET_IF(vllm::kU4)
}
if (std::is_same<scalar_t, nv_bfloat16>::value) {
if (false) {
}
MXFP4_GET_IF(vllm::kFE2M1f)
}
return kernel; return kernel;
} }
template <typename scalar_t> exec_config_t determine_exec_config(
exec_config_t determine_exec_config(const vllm::ScalarType& q_type, int prob_m, const vllm::ScalarType& a_type, const vllm::ScalarType& b_type,
int prob_n, int prob_k, int thread_m_blocks, const vllm::ScalarType& c_type, const vllm::ScalarType& s_type, int prob_m,
bool m_block_size_8, int num_bits, int prob_n, int prob_k, int thread_m_blocks, bool m_block_size_8,
int group_size, bool has_act_order, int num_bits, int group_size, bool has_act_order, bool is_k_full,
bool is_k_full, bool has_zp, bool has_zp, bool is_zp_float, int max_shared_mem, int sms) {
bool is_zp_float, int max_shared_mem,
int sms) {
exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}}; exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}};
thread_config_t* thread_configs = thread_m_blocks > 1 thread_config_t* thread_configs = thread_m_blocks > 1
? large_batch_thread_configs ? large_batch_thread_configs
@ -455,7 +280,7 @@ exec_config_t determine_exec_config(const vllm::ScalarType& q_type, int prob_m,
if (!is_valid_config(th_config, thread_m_blocks, prob_m, prob_n, prob_k, if (!is_valid_config(th_config, thread_m_blocks, prob_m, prob_n, prob_k,
num_bits, group_size, has_act_order, is_k_full, has_zp, num_bits, group_size, has_act_order, is_k_full, has_zp,
is_zp_float, max_shared_mem)) { is_zp_float, max_shared_mem - 512)) {
continue; continue;
} }
@ -468,10 +293,11 @@ exec_config_t determine_exec_config(const vllm::ScalarType& q_type, int prob_m,
group_blocks = group_size == -1 ? -1 : group_size / 16; group_blocks = group_size == -1 ? -1 : group_size / 16;
} }
auto kernel = get_marlin_kernel<scalar_t>( auto kernel =
q_type, thread_m_blocks, th_config.thread_n / 16, get_marlin_kernel(a_type, b_type, c_type, s_type, thread_m_blocks,
th_config.thread_k / 16, m_block_size_8, has_act_order, has_zp, th_config.thread_n / 16, th_config.thread_k / 16,
group_blocks, th_config.num_threads, is_zp_float); m_block_size_8, has_act_order, has_zp, group_blocks,
th_config.num_threads, is_zp_float);
if (kernel == MarlinDefault) continue; if (kernel == MarlinDefault) continue;
@ -485,28 +311,16 @@ exec_config_t determine_exec_config(const vllm::ScalarType& q_type, int prob_m,
return exec_cfg; return exec_cfg;
} }
template <typename scalar_t>
void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias, void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
void* s, void* s2, void* zp, void* g_idx, void* perm, void* a_s, void* b_s, void* g_s, void* zp, void* g_idx,
void* a_tmp, int prob_m, int prob_n, int prob_k, int lda, void* perm, void* a_tmp, int prob_m, int prob_n, int prob_k,
void* workspace, vllm::ScalarType const& q_type, bool has_bias, int lda, void* workspace, vllm::ScalarType const& a_type,
vllm::ScalarType const& b_type, vllm::ScalarType const& c_type,
vllm::ScalarType const& s_type, bool has_bias,
bool has_act_order, bool is_k_full, bool has_zp, int num_groups, bool has_act_order, bool is_k_full, bool has_zp, int num_groups,
int group_size, int dev, cudaStream_t stream, int thread_k_init, int group_size, int dev, cudaStream_t stream, int thread_k_init,
int thread_n_init, int sms, bool use_atomic_add, int thread_n_init, int sms, bool use_atomic_add,
bool use_fp32_reduce, bool is_zp_float) { bool use_fp32_reduce, bool is_zp_float) {
if (has_zp) {
TORCH_CHECK(
q_type == vllm::kU4 || q_type == vllm::kU8,
"q_type must be u4 or u8 when has_zp = True. Got = ", q_type.str());
} else {
TORCH_CHECK(
q_type == vllm::kU4B8 || q_type == vllm::kU8B128 ||
q_type == vllm::kFE4M3fn || q_type == vllm::kFE2M1f,
"q_type must be uint4b8, uint8b128, float8_e4m3fn or float4_e2m1f when "
"has_zp = False. Got = ",
q_type.str());
}
TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m, TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
", ", prob_n, ", ", prob_k, "]"); ", ", prob_n, ", ", prob_k, "]");
@ -531,19 +345,21 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
} }
} }
int num_bits = q_type.size_bits(); int num_bits = b_type.size_bits();
const int4* A_ptr = (const int4*)A; const int4* A_ptr = (const int4*)A;
const int4* B_ptr = (const int4*)B; const int4* B_ptr = (const int4*)B;
int4* C_ptr = (int4*)C; int4* C_ptr = (int4*)C;
int4* C_tmp_ptr = (int4*)C_tmp; int4* C_tmp_ptr = (int4*)C_tmp;
const int4* bias_ptr = (const int4*)b_bias; const int4* bias_ptr = (const int4*)b_bias;
const int4* s_ptr = (const int4*)s; const float* a_s_ptr = (const float*)a_s;
const uint16_t* s2_ptr = (const uint16_t*)s2; const int4* b_s_ptr = (const int4*)b_s;
const uint16_t* g_s_ptr = (const uint16_t*)g_s;
const int4* zp_ptr = (const int4*)zp; const int4* zp_ptr = (const int4*)zp;
const int* g_idx_ptr = (const int*)g_idx; const int* g_idx_ptr = (const int*)g_idx;
const int* perm_ptr = (const int*)perm; const int* perm_ptr = (const int*)perm;
int4* a_tmp_ptr = (int4*)a_tmp; int4* a_tmp_ptr = (int4*)a_tmp;
int* locks = (int*)workspace; int* locks = (int*)workspace;
if (has_act_order) { if (has_act_order) {
@ -568,6 +384,21 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
TORCH_CHECK(max_shared_mem > 0); TORCH_CHECK(max_shared_mem > 0);
int major_capability, minor_capability;
cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor,
dev);
cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor,
dev);
TORCH_CHECK(major_capability * 10 + minor_capability >= 80,
"marlin kernel only support Ampere or newer GPUs.");
if (a_type == vllm::kFE4M3fn) {
TORCH_CHECK(
major_capability * 10 + minor_capability == 89 ||
major_capability * 10 + minor_capability == 120,
"Marlin W4A8-FP8 only support SM89 or SM120 device (It is slower than "
"Marlin W4A16 on other devices).");
}
int max_par = 16; int max_par = 16;
if (prob_n <= 4096) max_par = 16 * 8; if (prob_n <= 4096) max_par = 16 * 8;
int max_shared_mem_new = max_shared_mem; int max_shared_mem_new = max_shared_mem;
@ -583,7 +414,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
int thread_n = thread_n_init; int thread_n = thread_n_init;
int thread_m_blocks = min(div_ceil(prob_m_split, 16), max_thread_m_blocks); int thread_m_blocks = min(div_ceil(prob_m_split, 16), max_thread_m_blocks);
int m_block_size_8 = prob_m_split <= 8; int m_block_size_8 = prob_m_split <= 8 && a_type.size_bits() == 16;
// Set thread config // Set thread config
exec_config_t exec_cfg; exec_config_t exec_cfg;
@ -597,11 +428,25 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
" is not divisible by thread_k = ", thread_k); " is not divisible by thread_k = ", thread_k);
} else { } else {
// Auto config // Auto config
exec_cfg = determine_exec_config<scalar_t>( exec_cfg = determine_exec_config(
q_type, prob_m_split, prob_n, prob_k, thread_m_blocks, m_block_size_8, a_type, b_type, c_type, s_type, prob_m_split, prob_n, prob_k,
num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float, thread_m_blocks, m_block_size_8, num_bits, group_size, has_act_order,
max_shared_mem, sms); is_k_full, has_zp, is_zp_float, max_shared_mem, sms);
thread_tfg = exec_cfg.tb_cfg; thread_tfg = exec_cfg.tb_cfg;
if (thread_tfg.thread_n != -1) {
if (prob_n / thread_tfg.thread_n *
div_ceil(prob_m_split, thread_m_blocks * 16) * 4 <=
sms) {
if (is_valid_config({128, 64, 128}, thread_m_blocks, prob_m_split,
prob_n, prob_k, num_bits, group_size,
has_act_order, is_k_full, has_zp, is_zp_float,
max_shared_mem_new)) {
thread_tfg = {128, 64, 128};
exec_cfg = {1, thread_tfg};
}
}
}
if (thread_tfg.thread_k == -1 && max_thread_m_blocks > 1) { if (thread_tfg.thread_k == -1 && max_thread_m_blocks > 1) {
max_thread_m_blocks--; max_thread_m_blocks--;
continue; continue;
@ -632,10 +477,10 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
", has_zp = ", has_zp, ", is_zp_float = ", is_zp_float, ", has_zp = ", has_zp, ", is_zp_float = ", is_zp_float,
", max_shared_mem_new = ", max_shared_mem_new); ", max_shared_mem_new = ", max_shared_mem_new);
auto kernel = get_marlin_kernel<scalar_t>( auto kernel = get_marlin_kernel(
q_type, thread_m_blocks, thread_n_blocks, thread_k_blocks, a_type, b_type, c_type, s_type, thread_m_blocks, thread_n_blocks,
m_block_size_8, has_act_order, has_zp, group_blocks, num_threads, thread_k_blocks, m_block_size_8, has_act_order, has_zp, group_blocks,
is_zp_float); num_threads, is_zp_float);
if (kernel == MarlinDefault) { if (kernel == MarlinDefault) {
TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n, TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n,
@ -657,13 +502,15 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
// avoid ">>>" being formatted to "> > >" // avoid ">>>" being formatted to "> > >"
// clang-format off // clang-format off
kernel<<<blocks, num_threads, max_shared_mem_new, stream>>>( kernel<<<blocks, num_threads, max_shared_mem_new, stream>>>(
A_ptr, B_ptr, C_ptr, C_tmp_ptr, bias_ptr, s_ptr, s2_ptr, zp_ptr, A_ptr, B_ptr, C_ptr, C_tmp_ptr, bias_ptr, a_s_ptr, b_s_ptr, g_s_ptr, zp_ptr,
g_idx_ptr, num_groups, g_idx_ptr, num_groups,
prob_m_split, prob_n, prob_k, lda, locks, has_bias, part_use_atomic_add, prob_m_split, prob_n, prob_k, lda, locks, has_bias, part_use_atomic_add,
use_fp32_reduce, max_shared_mem_new); use_fp32_reduce, max_shared_mem_new);
// clang-format on // clang-format on
A_ptr += prob_m_split * (lda / 8); bool is_a_8bit = a_type.size_bits() == 8;
A_ptr += prob_m_split * (lda / (is_a_8bit ? 16 : 8));
a_s_ptr += prob_m_split;
C_ptr += prob_m_split * (prob_n / 8); C_ptr += prob_m_split * (prob_n / 8);
rest_m -= prob_m_split; rest_m -= prob_m_split;
} }
@ -675,15 +522,73 @@ torch::Tensor gptq_marlin_gemm(
torch::Tensor& a, std::optional<torch::Tensor> c_or_none, torch::Tensor& a, std::optional<torch::Tensor> c_or_none,
torch::Tensor& b_q_weight, torch::Tensor& b_q_weight,
std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales, std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales,
std::optional<torch::Tensor> const& a_scales_or_none,
std::optional<torch::Tensor> const& global_scale_or_none, std::optional<torch::Tensor> const& global_scale_or_none,
std::optional<torch::Tensor> const& b_zeros_or_none, std::optional<torch::Tensor> const& b_zeros_or_none,
std::optional<torch::Tensor> const& g_idx_or_none, std::optional<torch::Tensor> const& g_idx_or_none,
std::optional<torch::Tensor> const& perm_or_none, torch::Tensor& workspace, std::optional<torch::Tensor> const& perm_or_none, torch::Tensor& workspace,
vllm::ScalarTypeId const& b_q_type_id, int64_t size_m, int64_t size_n, vllm::ScalarTypeId const& b_type_id, int64_t size_m, int64_t size_n,
int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce, int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce,
bool is_zp_float) { bool is_zp_float) {
vllm::ScalarType const b_q_type = vllm::ScalarType::from_id(b_q_type_id); vllm::ScalarTypeId a_type_id, c_type_id, s_type_id;
int pack_factor = 32 / b_q_type.size_bits();
auto c_dtype = a.dtype();
if (a.scalar_type() == at::ScalarType::Half) {
a_type_id = vllm::kFloat16.id();
c_type_id = vllm::kFloat16.id();
} else if (a.scalar_type() == at::ScalarType::BFloat16) {
a_type_id = vllm::kBFloat16.id();
c_type_id = vllm::kBFloat16.id();
} else {
c_dtype = b_scales.dtype();
if (b_scales.scalar_type() == at::ScalarType::Half) {
c_type_id = vllm::kFloat16.id();
} else if (b_scales.scalar_type() == at::ScalarType::BFloat16) {
c_type_id = vllm::kBFloat16.id();
} else {
c_type_id = vllm::kBFloat16.id();
TORCH_CHECK(c_or_none.has_value(), "c must be passed for W4A8-FP4");
torch::Tensor c = c_or_none.value();
c_dtype = c.dtype();
if (c.scalar_type() == at::ScalarType::Half) {
c_type_id = vllm::kFloat16.id();
} else if (c.scalar_type() == at::ScalarType::BFloat16) {
c_type_id = vllm::kBFloat16.id();
} else {
TORCH_CHECK(false, "unsupported c dtype");
}
}
if (a.scalar_type() == at::ScalarType::Float8_e4m3fn) {
a_type_id = vllm::kFE4M3fn.id();
} else if (a.scalar_type() == at::ScalarType::Char) {
a_type_id = vllm::kS8.id();
} else {
TORCH_CHECK(false, "unsupported `a` scalar_type");
}
}
s_type_id = c_type_id;
if (b_type_id == vllm::kFE2M1f.id()) {
if (b_scales.scalar_type() == at::ScalarType::Float8_e4m3fn) {
s_type_id = vllm::kFE4M3fn.id();
} else if (b_scales.scalar_type() == at::ScalarType::Float8_e8m0fnu) {
s_type_id = vllm::kFE8M0fnu.id();
} else {
TORCH_CHECK(false,
"When b_type = float4_e2m1f, b_scale scalar type must be",
"float8_e4m3fn (for NVFP4) or float8_e8m0fnu (for MXFP4).");
}
}
vllm::ScalarType a_type = vllm::ScalarType::from_id(a_type_id);
vllm::ScalarType b_type = vllm::ScalarType::from_id(b_type_id);
vllm::ScalarType c_type = vllm::ScalarType::from_id(c_type_id);
vllm::ScalarType s_type = vllm::ScalarType::from_id(s_type_id);
int pack_factor = 32 / b_type.size_bits();
// Verify A // Verify A
TORCH_CHECK(a.size(0) == size_m, "Shape mismatch: a.size(0) = ", a.size(0), TORCH_CHECK(a.size(0) == size_m, "Shape mismatch: a.size(0) = ", a.size(0),
@ -721,6 +626,21 @@ torch::Tensor gptq_marlin_gemm(
TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU"); TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU");
TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous"); TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous");
torch::Tensor a_scales;
auto options = torch::TensorOptions().dtype(c_dtype).device(a.device());
auto options_fp32 =
torch::TensorOptions().dtype(at::kFloat).device(a.device());
if (a_scales_or_none.has_value()) {
a_scales = a_scales_or_none.value();
TORCH_CHECK(a_type.size_bits() == 8,
"a_scales can only be used for 8bit activation.");
} else {
a_scales = torch::empty({0}, options_fp32);
TORCH_CHECK(a_type.size_bits() != 8,
"the a_scales parameter must be passed for 8bit activation.");
}
// thread_k: `k` size of a thread_tile in `weights` (can usually be left as // thread_k: `k` size of a thread_tile in `weights` (can usually be left as
// auto -1) // auto -1)
int thread_k = -1; int thread_k = -1;
@ -733,7 +653,6 @@ torch::Tensor gptq_marlin_gemm(
// Alloc buffers // Alloc buffers
const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
torch::Tensor c; torch::Tensor c;
if (c_or_none.has_value()) { if (c_or_none.has_value()) {
c = c_or_none.value(); c = c_or_none.value();
@ -750,8 +669,6 @@ torch::Tensor gptq_marlin_gemm(
// Alloc C tmp buffer that is going to be used for the global reduce // Alloc C tmp buffer that is going to be used for the global reduce
torch::Tensor c_tmp; torch::Tensor c_tmp;
auto options_fp32 =
torch::TensorOptions().dtype(at::kFloat).device(a.device());
if (use_fp32_reduce) { if (use_fp32_reduce) {
int max_m_block_size = (size_m + 16 - 1) / 16 * 16; int max_m_block_size = (size_m + 16 - 1) / 16 * 16;
max_m_block_size = min(max_m_block_size, 64); max_m_block_size = min(max_m_block_size, 64);
@ -821,11 +738,11 @@ torch::Tensor gptq_marlin_gemm(
torch::Tensor global_scale; torch::Tensor global_scale;
if (global_scale_or_none.has_value()) { if (global_scale_or_none.has_value()) {
global_scale = global_scale_or_none.value(); global_scale = global_scale_or_none.value();
TORCH_CHECK(b_q_type == vllm::kFE2M1f && group_size == 16, TORCH_CHECK(b_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn,
"global_scale can only be used for nvfp4 format."); "global_scale can only be used for nvfp4 format.");
} else { } else {
global_scale = torch::empty({0}, options); global_scale = torch::empty({0}, options);
TORCH_CHECK(!(b_q_type == vllm::kFE2M1f && group_size == 16), TORCH_CHECK(!(b_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn),
"the global_scale parameter must be passed for nvfp4 format."); "the global_scale parameter must be passed for nvfp4 format.");
} }
@ -852,15 +769,15 @@ torch::Tensor gptq_marlin_gemm(
bool has_zp = b_zeros.size(-1) > 0; bool has_zp = b_zeros.size(-1) > 0;
if (has_zp) { if (has_zp) {
TORCH_CHECK( TORCH_CHECK(
b_q_type == vllm::kU4 || b_q_type == vllm::kU8, b_type == vllm::kU4 || b_type == vllm::kU8,
"b_q_type must be u4 or u8 when has_zp = True. Got = ", b_q_type.str()); "b_type must be u4 or u8 when has_zp = True. Got = ", b_type.str());
} else { } else {
TORCH_CHECK(b_q_type == vllm::kU4B8 || b_q_type == vllm::kU8B128 || TORCH_CHECK(b_type == vllm::kU4B8 || b_type == vllm::kU8B128 ||
b_q_type == vllm::kFE4M3fn || b_q_type == vllm::kFE2M1f, b_type == vllm::kS4 || b_type == vllm::kS8 ||
"b_q_type must be uint4b8, uint8b128, float8_e4m3fn or " b_type == vllm::kFE4M3fn || b_type == vllm::kFE2M1f,
"float4_e2m1f when " "b_type must be uint4b8, uint8b128, int4, int8, "
"has_zp = False. Got = ", "float8_e4m3fn or float4_e2m1f when has_zp = False. Got = ",
b_q_type.str()); b_type.str());
} }
if (has_zp && is_zp_float) { if (has_zp && is_zp_float) {
@ -902,59 +819,27 @@ torch::Tensor gptq_marlin_gemm(
" is below min_workspace_size = ", min_workspace_size); " is below min_workspace_size = ", min_workspace_size);
int dev = a.get_device(); int dev = a.get_device();
if (a.scalar_type() == at::ScalarType::Half) {
void* scales_ptr;
if (b_q_type == vllm::kFE2M1f) {
if (group_size == 16)
scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
else if (group_size == 32)
scales_ptr = b_scales.data_ptr<at::Float8_e8m0fnu>();
else
TORCH_CHECK(false,
"float4_e2m1f only supports group_size == 16 (NVFP4) ",
"and group_size == 32 (MXFP4)");
} else {
scales_ptr = b_scales.data_ptr<at::Half>();
}
marlin::marlin_mm<half>( TORCH_CHECK(a_scales.scalar_type() == at::ScalarType::Float,
a.data_ptr<at::Half>(), b_q_weight.data_ptr(), c.data_ptr<at::Half>(), "scalar type of a_scales must be float");
c_tmp.data_ptr<float>(), b_bias.data_ptr<at::Half>(), scales_ptr, TORCH_CHECK(global_scale.scalar_type() == c.scalar_type(),
global_scale.data_ptr<at::Half>(), b_zeros.data_ptr(), g_idx.data_ptr(), "scalar type of global_scale must be the same with c");
perm.data_ptr(), a_tmp.data_ptr<at::Half>(), size_m, size_n, size_k, if (a_type.size_bits() == 16) {
a.stride(0), workspace.data_ptr(), b_q_type, has_bias, has_act_order, TORCH_CHECK(
is_k_full, has_zp, num_groups, group_size, dev, a.scalar_type() == c.scalar_type(),
at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, "scalar type of a must be the same with c for 16 bit activation");
use_atomic_add, use_fp32_reduce, is_zp_float);
} else if (a.scalar_type() == at::ScalarType::BFloat16) {
void* scales_ptr;
if (b_q_type == vllm::kFE2M1f) {
if (group_size == 16)
scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
else if (group_size == 32)
scales_ptr = b_scales.data_ptr<at::Float8_e8m0fnu>();
else
TORCH_CHECK(false,
"float4_e2m1f only supports group_size == 16 (NVFP4) ",
"and group_size == 32 (MXFP4)");
} else {
scales_ptr = b_scales.data_ptr<at::BFloat16>();
}
marlin::marlin_mm<nv_bfloat16>(
a.data_ptr<at::BFloat16>(), b_q_weight.data_ptr(),
c.data_ptr<at::BFloat16>(), c_tmp.data_ptr<float>(),
b_bias.data_ptr<at::BFloat16>(), scales_ptr,
global_scale.data_ptr<at::BFloat16>(), b_zeros.data_ptr(),
g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr<at::BFloat16>(),
size_m, size_n, size_k, a.stride(0), workspace.data_ptr(), b_q_type,
has_bias, has_act_order, is_k_full, has_zp, num_groups, group_size, dev,
at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
use_atomic_add, use_fp32_reduce, is_zp_float);
} else {
TORCH_CHECK(false, "gpt_marlin_gemm only supports bfloat16 and float16");
} }
marlin::marlin_mm(
a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), c_tmp.data_ptr(),
b_bias.data_ptr(), a_scales.data_ptr(), b_scales.data_ptr(),
global_scale.data_ptr(), b_zeros.data_ptr(), g_idx.data_ptr(),
perm.data_ptr(), a_tmp.data_ptr(), size_m, size_n, size_k, a.stride(0),
workspace.data_ptr(), a_type, b_type, c_type, s_type, has_bias,
has_act_order, is_k_full, has_zp, num_groups, group_size, dev,
at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
use_atomic_add, use_fp32_reduce, is_zp_float);
return c; return c;
} }

View File

@ -4,15 +4,18 @@
namespace marlin { namespace marlin {
template <int const num_threads, int const num_bits, bool const has_perm> template <int const num_threads, int const num_bits, bool const has_perm,
bool is_a_8bit>
__global__ void gptq_marlin_repack_kernel( __global__ void gptq_marlin_repack_kernel(
uint32_t const* __restrict__ b_q_weight_ptr, uint32_t const* __restrict__ b_q_weight_ptr,
uint32_t const* __restrict__ perm_ptr, uint32_t* __restrict__ out_ptr, uint32_t const* __restrict__ perm_ptr, uint32_t* __restrict__ out_ptr,
int size_k, int size_n) { int size_k, int size_n) {
constexpr int pack_factor = 32 / num_bits; constexpr int pack_factor = 32 / num_bits;
int k_tiles = size_k / tile_k_size; constexpr int target_tile_n_size = tile_n_size / (is_a_8bit ? 2 : 1);
int n_tiles = size_n / tile_n_size; constexpr int target_tile_k_size = tile_k_size * (is_a_8bit ? 2 : 1);
int k_tiles = size_k / target_tile_k_size;
int n_tiles = size_n / target_tile_n_size;
int block_k_tiles = div_ceil(k_tiles, gridDim.x); int block_k_tiles = div_ceil(k_tiles, gridDim.x);
auto start_k_tile = blockIdx.x * block_k_tiles; auto start_k_tile = blockIdx.x * block_k_tiles;
@ -34,7 +37,7 @@ __global__ void gptq_marlin_repack_kernel(
extern __shared__ int4 sh[]; extern __shared__ int4 sh[];
constexpr int perm_size = tile_k_size / 4; constexpr int perm_size = target_tile_k_size / 4;
int4* sh_perm_ptr = sh; int4* sh_perm_ptr = sh;
int4* sh_pipe_ptr = sh_perm_ptr; int4* sh_pipe_ptr = sh_perm_ptr;
@ -42,14 +45,14 @@ __global__ void gptq_marlin_repack_kernel(
sh_pipe_ptr += perm_size; sh_pipe_ptr += perm_size;
} }
constexpr int tile_ints = tile_k_size / pack_factor; constexpr int tile_ints = target_tile_k_size / pack_factor;
constexpr int stage_n_threads = tile_n_size / 4; constexpr int stage_n_threads = target_tile_n_size / 4;
constexpr int stage_k_threads = has_perm ? tile_k_size : tile_ints; constexpr int stage_k_threads = has_perm ? target_tile_k_size : tile_ints;
constexpr int stage_size = stage_k_threads * stage_n_threads; constexpr int stage_size = stage_k_threads * stage_n_threads;
auto load_perm_to_shared = [&](int k_tile_id) { auto load_perm_to_shared = [&](int k_tile_id) {
int first_k_int4 = (k_tile_id * tile_k_size) / 4; int first_k_int4 = (k_tile_id * target_tile_k_size) / 4;
int4 const* perm_int4_ptr = reinterpret_cast<int4 const*>(perm_ptr); int4 const* perm_int4_ptr = reinterpret_cast<int4 const*>(perm_ptr);
@ -65,7 +68,7 @@ __global__ void gptq_marlin_repack_kernel(
return; return;
} }
int first_n = n_tile_id * tile_n_size; int first_n = n_tile_id * target_tile_n_size;
int4* sh_ptr = sh_pipe_ptr + stage_size * pipe; int4* sh_ptr = sh_pipe_ptr + stage_size * pipe;
@ -91,7 +94,7 @@ __global__ void gptq_marlin_repack_kernel(
auto k_id = threadIdx.x / stage_n_threads; auto k_id = threadIdx.x / stage_n_threads;
auto n_id = threadIdx.x % stage_n_threads; auto n_id = threadIdx.x % stage_n_threads;
int first_k = k_tile_id * tile_k_size; int first_k = k_tile_id * target_tile_k_size;
int first_k_packed = first_k / pack_factor; int first_k_packed = first_k / pack_factor;
cp_async4(&sh_ptr[k_id * stage_n_threads + n_id], cp_async4(&sh_ptr[k_id * stage_n_threads + n_id],
@ -117,13 +120,13 @@ __global__ void gptq_marlin_repack_kernel(
} }
int tc_col = th_id / 4; int tc_col = th_id / 4;
int tc_row = (th_id % 4) * 2; int tc_row = (th_id % 4) * (is_a_8bit ? 4 : 2);
constexpr int tc_offsets[4] = {0, 1, 8, 9}; constexpr int tc_offsets[4] = {0, 1, 8, 9};
int cur_n = warp_id * 16 + tc_col; int cur_n = (warp_id / (is_a_8bit ? 2 : 1)) * 16 + tc_col;
constexpr int sh_stride = 64; constexpr int sh_stride = target_tile_n_size;
constexpr uint32_t mask = (1 << num_bits) - 1; constexpr uint32_t mask = (1 << num_bits) - 1;
int4* sh_stage_ptr = sh_pipe_ptr + stage_size * pipe; int4* sh_stage_ptr = sh_pipe_ptr + stage_size * pipe;
@ -134,6 +137,7 @@ __global__ void gptq_marlin_repack_kernel(
uint32_t vals[8]; uint32_t vals[8];
if constexpr (has_perm) { if constexpr (has_perm) {
static_assert(!is_a_8bit);
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
int k_idx = tc_row + tc_offsets[i]; int k_idx = tc_row + tc_offsets[i];
@ -156,28 +160,49 @@ __global__ void gptq_marlin_repack_kernel(
#pragma unroll #pragma unroll
for (int i = 0; i < tile_ints; i++) { for (int i = 0; i < tile_ints; i++) {
b1_vals[i] = sh_stage_int_ptr[cur_n + sh_stride * i]; if constexpr (is_a_8bit) {
b2_vals[i] = sh_stage_int_ptr[cur_n + 8 + sh_stride * i]; b1_vals[i] =
sh_stage_int_ptr[cur_n + sh_stride * i + (warp_id % 2) * 8];
} else {
b1_vals[i] = sh_stage_int_ptr[cur_n + sh_stride * i];
b2_vals[i] = sh_stage_int_ptr[cur_n + 8 + sh_stride * i];
}
} }
#pragma unroll #pragma unroll
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
int cur_elem = tc_row + tc_offsets[i]; int cur_elem = tc_row + (is_a_8bit ? i : tc_offsets[i]);
int cur_int = cur_elem / pack_factor; int cur_int = cur_elem / pack_factor;
int cur_pos = cur_elem % pack_factor; int cur_pos = cur_elem % pack_factor;
vals[i] = (b1_vals[cur_int] >> (cur_pos * num_bits)) & mask; vals[i] = (b1_vals[cur_int] >> (cur_pos * num_bits)) & mask;
vals[4 + i] = (b2_vals[cur_int] >> (cur_pos * num_bits)) & mask; if constexpr (is_a_8bit)
vals[4 + i] =
(b1_vals[cur_int + tile_ints / 2] >> (cur_pos * num_bits)) & mask;
else
vals[4 + i] = (b2_vals[cur_int] >> (cur_pos * num_bits)) & mask;
} }
} }
constexpr int tile_size = tile_k_size * tile_n_size / pack_factor; constexpr int tile_size =
target_tile_k_size * target_tile_n_size / pack_factor;
int out_offset = (k_tile_id * n_tiles + n_tile_id) * tile_size; int out_offset = (k_tile_id * n_tiles + n_tile_id) * tile_size;
// Result of: // Result of:
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
if constexpr (num_bits == 4) { if constexpr (!is_a_8bit && num_bits == 4) {
constexpr int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
uint32_t res = 0;
#pragma unroll
for (int i = 0; i < 8; i++) {
res |= vals[pack_idx[i]] << (i * 4);
}
out_ptr[out_offset + th_id * 4 + warp_id] = res;
} else if constexpr (is_a_8bit && num_bits == 4) {
int pack_idx[8] = {0, 4, 1, 5, 2, 6, 3, 7};
uint32_t res = 0; uint32_t res = 0;
#pragma unroll #pragma unroll
@ -194,8 +219,9 @@ __global__ void gptq_marlin_repack_kernel(
uint32_t res2 = 0; uint32_t res2 = 0;
#pragma unroll #pragma unroll
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
res1 |= vals[pack_idx[i]] << (i * 8); const int ii = is_a_8bit ? i : pack_idx[i];
res2 |= vals[4 + pack_idx[i]] << (i * 8); res1 |= vals[ii] << (i * 8);
res2 |= vals[4 + ii] << (i * 8);
} }
out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 0] = res1; out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 0] = res1;
@ -236,21 +262,22 @@ __global__ void gptq_marlin_repack_kernel(
} // namespace marlin } // namespace marlin
#define CALL_IF(NUM_BITS, HAS_PERM) \ #define CALL_IF(NUM_BITS, HAS_PERM, IS_A_8BIT) \
else if (num_bits == NUM_BITS && has_perm == HAS_PERM) { \ else if (num_bits == NUM_BITS && has_perm == HAS_PERM && \
is_a_8bit == IS_A_8BIT) { \
cudaFuncSetAttribute( \ cudaFuncSetAttribute( \
marlin::gptq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS, \ marlin::gptq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS, \
HAS_PERM>, \ HAS_PERM, IS_A_8BIT>, \
cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \ cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \
marlin::gptq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS, \ marlin::gptq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS, \
HAS_PERM> \ HAS_PERM, IS_A_8BIT> \
<<<blocks, marlin::repack_threads, max_shared_mem, stream>>>( \ <<<blocks, marlin::repack_threads, max_shared_mem, stream>>>( \
b_q_weight_ptr, perm_ptr, out_ptr, size_k, size_n); \ b_q_weight_ptr, perm_ptr, out_ptr, size_k, size_n); \
} }
torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm, torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
int64_t size_k, int64_t size_n, int64_t size_k, int64_t size_n,
int64_t num_bits) { int64_t num_bits, bool is_a_8bit) {
// Verify compatibility with marlin tile of 16x64 // Verify compatibility with marlin tile of 16x64
TORCH_CHECK(size_k % marlin::tile_k_size == 0, "size_k = ", size_k, TORCH_CHECK(size_k % marlin::tile_k_size == 0, "size_k = ", size_k,
" is not divisible by tile_k_size = ", marlin::tile_k_size); " is not divisible by tile_k_size = ", marlin::tile_k_size);
@ -309,13 +336,17 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
if (false) { if (false) {
} }
CALL_IF(4, false) CALL_IF(4, false, false)
CALL_IF(4, true) CALL_IF(4, true, false)
CALL_IF(8, false) CALL_IF(8, false, false)
CALL_IF(8, true) CALL_IF(8, true, false)
CALL_IF(4, false, true)
CALL_IF(8, false, true)
else { else {
TORCH_CHECK(false, "Unsupported repack config: num_bits = ", num_bits, TORCH_CHECK(false, "Unsupported repack config: num_bits = ", num_bits,
", has_perm = ", has_perm); ", has_perm = ", has_perm, ", is_a_8bit = ", is_a_8bit);
} }
return out; return out;

View File

@ -11,17 +11,19 @@
const int4 *__restrict__ A, const int4 *__restrict__ B, \ const int4 *__restrict__ A, const int4 *__restrict__ B, \
int4 *__restrict__ C, int4 *__restrict__ C_tmp, \ int4 *__restrict__ C, int4 *__restrict__ C_tmp, \
const int4 *__restrict__ b_bias_ptr, \ const int4 *__restrict__ b_bias_ptr, \
const float *__restrict__ a_scales_ptr, \
const int4 *__restrict__ scales_ptr, \ const int4 *__restrict__ scales_ptr, \
const uint16_t *__restrict__ scale2_ptr, \ const uint16_t *__restrict__ global_scale_ptr, \
const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx, \ const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx, \
int num_groups, int prob_m, int prob_n, int prob_k, int lda, int *locks, \ int num_groups, int prob_m, int prob_n, int prob_k, int lda, int *locks, \
bool has_bias, bool use_atomic_add, bool use_fp32_reduce, \ bool has_bias, bool use_atomic_add, bool use_fp32_reduce, \
int max_shared_mem int max_shared_mem
namespace MARLIN_NAMESPACE_NAME { namespace MARLIN_NAMESPACE_NAME {
template <typename scalar_t, // compute dtype, half or nv_float16 template <const vllm::ScalarTypeId a_type_id, // A ScalarType id
const vllm::ScalarTypeId w_type_id, // weight ScalarType id const vllm::ScalarTypeId b_type_id, // B ScalarType id
const vllm::ScalarTypeId s_type_id, // weight ScalarType id const vllm::ScalarTypeId c_type_id, // C ScalarType id
const vllm::ScalarTypeId s_type_id, // B_SCALE ScalarType id
const int threads, // number of threads in a threadblock const int threads, // number of threads in a threadblock
const int thread_m_blocks, // number of 16x16 blocks in the m const int thread_m_blocks, // number of 16x16 blocks in the m
// dimension (batchsize) of the // dimension (batchsize) of the

View File

@ -55,6 +55,45 @@ constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }
// No support for async // No support for async
#else #else
__device__ inline void cp_async1_ca_pred(void* smem_ptr, const void* glob_ptr,
bool pred = true) {
const int BYTES = 4;
uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %0, 0;\n"
" @p cp.async.ca.shared.global [%1], [%2], %3;\n"
"}\n" ::"r"((int)pred),
"r"(smem), "l"(glob_ptr), "n"(BYTES));
}
__device__ inline void cp_async2_ca_pred(void* smem_ptr, const void* glob_ptr,
bool pred = true) {
const int BYTES = 8;
uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %0, 0;\n"
" @p cp.async.ca.shared.global [%1], [%2], %3;\n"
"}\n" ::"r"((int)pred),
"r"(smem), "l"(glob_ptr), "n"(BYTES));
}
__device__ inline void cp_async4_ca_pred(void* smem_ptr, const void* glob_ptr,
bool pred = true) {
const int BYTES = 16;
uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %0, 0;\n"
" @p cp.async.ca.shared.global [%1], [%2], %3;\n"
"}\n" ::"r"((int)pred),
"r"(smem), "l"(glob_ptr), "n"(BYTES));
}
__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr, __device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr,
bool pred = true) { bool pred = true) {
const int BYTES = 16; const int BYTES = 16;

View File

@ -2,8 +2,10 @@
#ifndef _data_types_cuh #ifndef _data_types_cuh
#define _data_types_cuh #define _data_types_cuh
#include "marlin.cuh" #include "marlin.cuh"
#include "core/scalar_type.hpp"
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include <cuda_bf16.h> #include <cuda_bf16.h>
#include <cuda_fp8.h>
#ifndef MARLIN_NAMESPACE_NAME #ifndef MARLIN_NAMESPACE_NAME
#define MARLIN_NAMESPACE_NAME marlin #define MARLIN_NAMESPACE_NAME marlin
@ -11,14 +13,16 @@
namespace MARLIN_NAMESPACE_NAME { namespace MARLIN_NAMESPACE_NAME {
template <typename scalar_t> template <long scalar_type_id>
class ScalarType {}; class MarlinScalarType {};
template <> template <>
class ScalarType<half> { class MarlinScalarType<vllm::kFloat16.id()> {
public: public:
using scalar_t = half; using scalar_t = half;
using scalar_t2 = half2; using scalar_t2 = half2;
using scalar_t4 = half2;
using scalar_32bit_t = half2;
// Matrix fragments for tensor core instructions; their precise layout is // Matrix fragments for tensor core instructions; their precise layout is
// documented here: // documented here:
@ -27,6 +31,7 @@ class ScalarType<half> {
using FragB = Vec<half2, 2>; using FragB = Vec<half2, 2>;
using FragC = Vec<float, 4>; using FragC = Vec<float, 4>;
using FragS = Vec<half2, 1>; using FragS = Vec<half2, 1>;
using FragS0 = Vec<__nv_fp8x2_e4m3, 1>;
using FragZP = Vec<half2, 4>; using FragZP = Vec<half2, 4>;
static __device__ float inline num2float(const half x) { static __device__ float inline num2float(const half x) {
@ -44,18 +49,25 @@ class ScalarType<half> {
static __host__ __device__ half inline float2num(const float x) { static __host__ __device__ half inline float2num(const float x) {
return __float2half(x); return __float2half(x);
} }
static __host__ __device__ float2 inline num22float2(const half2 x) {
return __half22float2(x);
}
}; };
template <> template <>
class ScalarType<nv_bfloat16> { class MarlinScalarType<vllm::kBFloat16.id()> {
public: public:
using scalar_t = nv_bfloat16; using scalar_t = nv_bfloat16;
using scalar_t2 = nv_bfloat162; using scalar_t2 = nv_bfloat162;
using scalar_t4 = nv_bfloat162;
using scalar_32bit_t = nv_bfloat162;
using FragA = Vec<nv_bfloat162, 4>; using FragA = Vec<nv_bfloat162, 4>;
using FragB = Vec<nv_bfloat162, 2>; using FragB = Vec<nv_bfloat162, 2>;
using FragC = Vec<float, 4>; using FragC = Vec<float, 4>;
using FragS = Vec<nv_bfloat162, 1>; using FragS = Vec<nv_bfloat162, 1>;
using FragS0 = Vec<__nv_fp8x2_e4m3, 1>;
using FragZP = Vec<nv_bfloat162, 4>; using FragZP = Vec<nv_bfloat162, 4>;
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
@ -75,9 +87,63 @@ class ScalarType<nv_bfloat16> {
static __host__ __device__ nv_bfloat16 inline float2num(const float x) { static __host__ __device__ nv_bfloat16 inline float2num(const float x) {
return __float2bfloat16(x); return __float2bfloat16(x);
} }
static __host__ __device__ float2 inline num22float2(const nv_bfloat162 x) {
return __bfloat1622float2(x);
}
#endif #endif
}; };
template <>
class MarlinScalarType<vllm::kFE4M3fn.id()> {
public:
using scalar_t = __nv_fp8_e4m3;
using scalar_t2 = __nv_fp8x2_e4m3;
using scalar_t4 = __nv_fp8x4_e4m3;
using scalar_32bit_t = __nv_fp8x4_e4m3;
using FragA = Vec<__nv_fp8x4_e4m3, 4>;
using FragB = Vec<__nv_fp8x4_e4m3, 2>;
using FragC = Vec<float, 4>;
using FragZP = Vec<__nv_fp8x2_e4m3, 4>;
static __host__ __device__
float2 inline num22float2(const __nv_fp8x2_e4m3 x) {
return (float2)x;
}
};
template <>
class MarlinScalarType<vllm::kS8.id()> {
public:
using scalar_t = int8_t;
using scalar_t2 = int16_t;
using scalar_t4 = int32_t;
using scalar_32bit_t = int32_t;
using FragA = Vec<int32_t, 4>;
using FragB = Vec<int32_t, 2>;
using FragC = Vec<float, 4>;
using FragZP = Vec<int16_t, 4>;
};
template <typename scalar_t>
class MarlinScalarType2 {};
template <>
class MarlinScalarType2<half> : public MarlinScalarType<vllm::kFloat16.id()> {};
template <>
class MarlinScalarType2<nv_bfloat16>
: public MarlinScalarType<vllm::kBFloat16.id()> {};
template <>
class MarlinScalarType2<__nv_fp8_e4m3>
: public MarlinScalarType<vllm::kFE4M3fn.id()> {};
template <>
class MarlinScalarType2<int8_t> : public MarlinScalarType<vllm::kS8.id()> {};
} // namespace MARLIN_NAMESPACE_NAME } // namespace MARLIN_NAMESPACE_NAME
#endif #endif

View File

@ -0,0 +1,106 @@
#include "marlin.cuh"
#include "core/registration.h"
// for only non-zp format (like gptq)
__global__ void marlin_int4_fp8_preprocess_kernel_without_zp(
// qweight: (size_k * size_n // 8,)
const int32_t* __restrict__ qweight,
// output: same shape with qweight
int32_t* __restrict__ output) {
int32_t val = qweight[blockIdx.x * 32 + threadIdx.x];
int32_t new_val = 0;
#pragma unroll
for (int32_t i = 0; i < 8; i++) {
int32_t single_val = val & 0xF;
single_val = single_val >= 8 ? single_val - 8 : 15 - single_val;
new_val |= single_val << (i * 4);
val >>= 4;
}
output[blockIdx.x * 32 + threadIdx.x] = new_val;
}
// for awq format only (with zp and with awq weight layout)
__global__ void marlin_int4_fp8_preprocess_kernel_awq(
// AWQ qweight: (size_k, size_n // 8)
const int32_t* __restrict__ qweight,
// output: same shape with qweight
int32_t* __restrict__ output,
// AWQ zeros: (size_k // group_size, size_n // 8)
const int32_t* __restrict__ qzeros, int32_t size_n, int32_t size_k,
int32_t group_size) {
int32_t val =
qweight[(blockIdx.x * 32 + threadIdx.x) * size_n / 8 + blockIdx.y];
int32_t zero =
qzeros[(blockIdx.x * 32 + threadIdx.x) / group_size * size_n / 8 +
blockIdx.y];
int32_t new_val = 0;
#pragma unroll
for (int32_t i = 0; i < 8; i++) {
int32_t single_val = val & 0xF;
int32_t single_zero = zero & 0xF;
single_val =
single_val >= single_zero ? single_val - single_zero : 15 - single_val;
new_val |= single_val << (i * 4);
val >>= 4;
zero >>= 4;
}
output[(blockIdx.x * 32 + threadIdx.x) * size_n / 8 + blockIdx.y] = new_val;
}
torch::Tensor marlin_int4_fp8_preprocess(
torch::Tensor& qweight, std::optional<torch::Tensor> qzeros_or_none,
bool inplace) {
TORCH_CHECK(qweight.device().is_cuda(), "qweight is not on GPU");
TORCH_CHECK(qweight.scalar_type() == at::ScalarType::Int,
"qweight.dtype != torch.int32");
const at::cuda::OptionalCUDAGuard device_guard(device_of(qweight));
torch::Tensor output = inplace ? qweight : torch::empty_like(qweight);
if (!qzeros_or_none.has_value()) {
TORCH_CHECK(qweight.numel() * 8 % 256 == 0,
"qweight.numel() * 8 % 256 != 0");
int blocks = qweight.numel() * 8 / 256;
marlin_int4_fp8_preprocess_kernel_without_zp<<<blocks, 32>>>(
(const int32_t*)qweight.data_ptr(), (int32_t*)output.data_ptr());
} else {
int32_t size_k = qweight.size(0);
int32_t size_n = qweight.size(1) * 8;
torch::Tensor qzeros = qzeros_or_none.value();
TORCH_CHECK(size_k % 32 == 0, "size_k % 32 != 0");
TORCH_CHECK(qzeros.device().is_cuda(), "qzeros is not on GPU");
TORCH_CHECK(qzeros.scalar_type() == at::ScalarType::Int,
"qweight.dtype != torch.int32");
TORCH_CHECK(device_of(qweight) == device_of(qzeros),
"qzeros is not on the same device with qweight");
int32_t group_size = qweight.size(0) / qzeros.size(0);
TORCH_CHECK(qweight.size(1) == qzeros.size(1),
"qweight.size(1) != qzeros.size(1)");
TORCH_CHECK(qweight.size(0) % qzeros.size(0) == 0,
"qweight.size(0) % qzeros.size(0) != 0");
TORCH_CHECK(group_size % 8 == 0, "group_size % 8 != 0");
dim3 blocks(size_k / 32, size_n / 8);
marlin_int4_fp8_preprocess_kernel_awq<<<blocks, 32>>>(
(const int32_t*)qweight.data_ptr(), (int32_t*)output.data_ptr(),
(const int32_t*)qzeros.data_ptr(), size_n, size_k, group_size);
}
return output;
}
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
m.impl("marlin_int4_fp8_preprocess", &marlin_int4_fp8_preprocess);
}

File diff suppressed because it is too large Load Diff

View File

@ -67,9 +67,9 @@ void cutlass_scaled_mm_sm100(torch::Tensor& c, torch::Tensor const& a,
std::optional<torch::Tensor> const& bias); std::optional<torch::Tensor> const& bias);
#endif #endif
#if defined(ENABLE_SCALED_MM_SM90) && ENABLE_SCALED_MM_SM90 || \ #if (defined(ENABLE_CUTLASS_MOE_SM90) && ENABLE_CUTLASS_MOE_SM90) || \
defined(ENABLE_SCALED_MM_SM100) && ENABLE_SCALED_MM_SM100 || \ (defined(ENABLE_CUTLASS_MOE_SM100) && ENABLE_CUTLASS_MOE_SM100) || \
defined(ENABLE_SCALED_MM_SM120) && ENABLE_SCALED_MM_SM120 (defined(ENABLE_CUTLASS_MOE_SM120) && ENABLE_CUTLASS_MOE_SM120)
void get_cutlass_moe_mm_data_caller( void get_cutlass_moe_mm_data_caller(
const torch::Tensor& topk_ids, torch::Tensor& expert_offsets, const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
@ -284,8 +284,9 @@ void get_cutlass_moe_mm_data(
// This function currently gets compiled only if we have a valid cutlass moe // This function currently gets compiled only if we have a valid cutlass moe
// mm to run it for. // mm to run it for.
int32_t version_num = get_sm_version_num(); int32_t version_num = get_sm_version_num();
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \ #if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1, get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1,
problem_sizes2, input_permutation, problem_sizes2, input_permutation,
output_permutation, num_experts, n, k, output_permutation, num_experts, n, k,
@ -296,7 +297,7 @@ void get_cutlass_moe_mm_data(
false, false,
"No compiled get_cutlass_moe_mm_data: no cutlass_scaled_mm kernel for " "No compiled get_cutlass_moe_mm_data: no cutlass_scaled_mm kernel for "
"CUDA device capability: ", "CUDA device capability: ",
version_num, ". Required capability: 90 or 100"); version_num, ". Required capability: 90, 100, or 120");
} }
void get_cutlass_moe_mm_problem_sizes( void get_cutlass_moe_mm_problem_sizes(
@ -304,8 +305,9 @@ void get_cutlass_moe_mm_problem_sizes(
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets) { const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets) {
int32_t version_num = get_sm_version_num(); int32_t version_num = get_sm_version_num();
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \ #if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1, get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1,
problem_sizes2, num_experts, n, k, problem_sizes2, num_experts, n, k,
blockscale_offsets); blockscale_offsets);
@ -315,7 +317,7 @@ void get_cutlass_moe_mm_problem_sizes(
false, false,
"No compiled get_cutlass_moe_mm_problem_sizes: no cutlass_scaled_mm " "No compiled get_cutlass_moe_mm_problem_sizes: no cutlass_scaled_mm "
"kernel for CUDA device capability: ", "kernel for CUDA device capability: ",
version_num, ". Required capability: 90 or 100"); version_num, ". Required capability: 90, 100, or 120");
} }
void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets, void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
@ -328,8 +330,9 @@ void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
// This function currently gets compiled only if we have a valid cutlass moe // This function currently gets compiled only if we have a valid cutlass moe
// mm to run it for. // mm to run it for.
int32_t version_num = get_sm_version_num(); int32_t version_num = get_sm_version_num();
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \ #if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
get_cutlass_pplx_moe_mm_data_caller(expert_offsets, problem_sizes1, get_cutlass_pplx_moe_mm_data_caller(expert_offsets, problem_sizes1,
problem_sizes2, expert_num_tokens, problem_sizes2, expert_num_tokens,
num_local_experts, padded_m, n, k); num_local_experts, padded_m, n, k);
@ -339,7 +342,7 @@ void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
false, false,
"No compiled get_cutlass_pplx_moe_mm_data: no cutlass_scaled_mm kernel " "No compiled get_cutlass_pplx_moe_mm_data: no cutlass_scaled_mm kernel "
"for CUDA device capability: ", "for CUDA device capability: ",
version_num, ". Required capability: 90 or 100"); version_num, ". Required capability: 90, 100, or 120");
} }
void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a, void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,

View File

@ -63,7 +63,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" int blocksparse_head_sliding_step) -> ()"); " int blocksparse_head_sliding_step) -> ()");
ops.impl("paged_attention_v2", torch::kCUDA, &paged_attention_v2); ops.impl("paged_attention_v2", torch::kCUDA, &paged_attention_v2);
#ifndef USE_ROCM
// Merge attn states // Merge attn states
// Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005 // Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005
// can be used to combine partial attention results (in the split-KV case) // can be used to combine partial attention results (in the split-KV case)
@ -76,7 +75,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor suffix_output," " Tensor suffix_output,"
" Tensor suffix_lse) -> ()"); " Tensor suffix_lse) -> ()");
ops.impl("merge_attn_states", torch::kCUDA, &merge_attn_states); ops.impl("merge_attn_states", torch::kCUDA, &merge_attn_states);
#ifndef USE_ROCM
ops.def( ops.def(
"convert_vertical_slash_indexes(" "convert_vertical_slash_indexes("
" Tensor! block_count, Tensor! block_offset, " " Tensor! block_count, Tensor! block_offset, "
@ -299,9 +298,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// gptq_marlin Optimized Quantized GEMM for GPTQ. // gptq_marlin Optimized Quantized GEMM for GPTQ.
ops.def( ops.def(
"gptq_marlin_gemm(Tensor a, Tensor? c_or_none, Tensor b_q_weight, " "gptq_marlin_gemm(Tensor a, Tensor? c_or_none, Tensor b_q_weight, "
"Tensor? b_bias_or_none," "Tensor? b_bias_or_none,Tensor b_scales, "
"Tensor b_scales, Tensor? global_scale, Tensor? b_zeros_or_none, Tensor? " "Tensor? a_scales, Tensor? global_scale, Tensor? b_zeros_or_none, "
"g_idx_or_none, Tensor? perm_or_none, Tensor workspace, int b_q_type, " "Tensor? "
"g_idx_or_none, Tensor? perm_or_none, Tensor workspace, int b_type_id, "
"SymInt size_m, SymInt size_n, SymInt size_k, bool is_k_full, " "SymInt size_m, SymInt size_n, SymInt size_k, bool is_k_full, "
"bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) -> Tensor"); "bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) -> Tensor");
// conditionally compiled so impl registration is in source file // conditionally compiled so impl registration is in source file
@ -309,13 +309,19 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// gptq_marlin repack from GPTQ. // gptq_marlin repack from GPTQ.
ops.def( ops.def(
"gptq_marlin_repack(Tensor b_q_weight, Tensor perm, " "gptq_marlin_repack(Tensor b_q_weight, Tensor perm, "
"SymInt size_k, SymInt size_n, int num_bits) -> Tensor"); "SymInt size_k, SymInt size_n, int num_bits, bool is_a_8bit) -> Tensor");
// conditionally compiled so impl registrations are in source file // conditionally compiled so impl registrations are in source file
// awq_marlin repack from AWQ. // awq_marlin repack from AWQ.
ops.def( ops.def(
"awq_marlin_repack(Tensor b_q_weight, SymInt size_k, " "awq_marlin_repack(Tensor b_q_weight, SymInt size_k, "
"SymInt size_n, int num_bits) -> Tensor"); "SymInt size_n, int num_bits, bool is_a_8bit) -> Tensor");
// conditionally compiled so impl registrations are in source file
// preprocess W-int4A-fp8 weight for marlin kernel
ops.def(
"marlin_int4_fp8_preprocess(Tensor qweight, "
"Tensor? qzeros_or_none, bool inplace) -> Tensor");
// conditionally compiled so impl registrations are in source file // conditionally compiled so impl registrations are in source file
// CUTLASS w4a8 GEMM // CUTLASS w4a8 GEMM

View File

@ -244,9 +244,15 @@ RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
# Install EP kernels(pplx-kernels and DeepEP) # Install EP kernels(pplx-kernels and DeepEP)
ARG PPLX_COMMIT_HASH
ARG DEEPEP_COMMIT_HASH
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
export TORCH_CUDA_ARCH_LIST='9.0a 10.0a' && \ export TORCH_CUDA_ARCH_LIST='9.0a 10.0a' && \
/tmp/install_python_libraries.sh /tmp/ep_kernels_workspace wheel && \ /tmp/install_python_libraries.sh \
--workspace /tmp/ep_kernels_workspace \
--mode wheel \
${PPLX_COMMIT_HASH:+--pplx-ref "$PPLX_COMMIT_HASH"} \
${DEEPEP_COMMIT_HASH:+--deepep-ref "$DEEPEP_COMMIT_HASH"} && \
find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete
# Check the size of the wheel if RUN_WHEEL_CHECK is true # Check the size of the wheel if RUN_WHEEL_CHECK is true
@ -358,7 +364,12 @@ RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && \
cuda-cudart-${CUDA_VERSION_DASH} \ cuda-cudart-${CUDA_VERSION_DASH} \
cuda-nvrtc-${CUDA_VERSION_DASH} \ cuda-nvrtc-${CUDA_VERSION_DASH} \
cuda-cuobjdump-${CUDA_VERSION_DASH} \ cuda-cuobjdump-${CUDA_VERSION_DASH} \
libcublas-${CUDA_VERSION_DASH} && \ # https://github.com/vllm-project/vllm/issues/29590
libcurand-dev-${CUDA_VERSION_DASH} \
libcublas-${CUDA_VERSION_DASH} \
# Fixes nccl_allocator requiring nccl.h at runtime
# https://github.com/vllm-project/vllm/blob/1336a1ea244fa8bfd7e72751cabbdb5b68a0c11a/vllm/distributed/device_communicators/pynccl_allocator.py#L22
libnccl-dev && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
ARG PIP_INDEX_URL UV_INDEX_URL ARG PIP_INDEX_URL UV_INDEX_URL
@ -392,8 +403,8 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
# Install FlashInfer pre-compiled kernel cache and binaries # Install FlashInfer pre-compiled kernel cache and binaries
# https://docs.flashinfer.ai/installation.html # https://docs.flashinfer.ai/installation.html
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system flashinfer-cubin==0.5.2 \ uv pip install --system flashinfer-cubin==0.5.3 \
&& uv pip install --system flashinfer-jit-cache==0.5.2 \ && uv pip install --system flashinfer-jit-cache==0.5.3 \
--extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \ --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
&& flashinfer show-config && flashinfer show-config

View File

@ -132,7 +132,7 @@ RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
esac; \ esac; \
}; \ }; \
remove_packages_not_supported_on_aarch64 && \ remove_packages_not_supported_on_aarch64 && \
sed -i 's/^torch==.*/torch==2.8.0/g' requirements/cpu-test.in && \ sed -i 's/^torch==.*/torch==2.9.1/g' requirements/cpu-test.in && \
sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \ sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \ sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu

View File

@ -65,6 +65,7 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1
# ----------------------- # -----------------------
# Test vLLM image # Test vLLM image
@ -88,10 +89,22 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
# install development dependencies (for testing) # install development dependencies (for testing)
RUN cd /vllm-workspace \ RUN cd /vllm-workspace \
&& rm -rf vllm \
&& python3 -m pip install -e tests/vllm_test_utils \ && python3 -m pip install -e tests/vllm_test_utils \
&& python3 -m pip install pytest-shard && python3 -m pip install pytest-shard
# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER=1
# Copy in the v1 package (for python-only install test group)
COPY --from=export_vllm /vllm_v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
# Source code is used in the `python_only_compile.sh` test
# We hide it inside `src/` so that this source code
# will not be imported by other tests
RUN mkdir src && mv vllm src/vllm
# ----------------------- # -----------------------
# Final vLLM image # Final vLLM image
FROM base AS final FROM base AS final

View File

@ -5,6 +5,8 @@ ARG PYTORCH_BRANCH="1c57644d"
ARG PYTORCH_VISION_BRANCH="v0.23.0" ARG PYTORCH_VISION_BRANCH="v0.23.0"
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git" ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG PYTORCH_AUDIO_BRANCH="v2.9.0"
ARG PYTORCH_AUDIO_REPO="https://github.com/pytorch/audio.git"
ARG FA_BRANCH="0e60e394" ARG FA_BRANCH="0e60e394"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="59bd8ff2" ARG AITER_BRANCH="59bd8ff2"
@ -23,6 +25,7 @@ ENV AITER_ROCM_ARCH=gfx942;gfx950
ENV HSA_NO_SCRATCH_RECLAIM=1 ENV HSA_NO_SCRATCH_RECLAIM=1
ARG PYTHON_VERSION=3.12 ARG PYTHON_VERSION=3.12
ENV PYTHON_VERSION=${PYTHON_VERSION}
RUN mkdir -p /app RUN mkdir -p /app
WORKDIR /app WORKDIR /app
@ -45,6 +48,7 @@ RUN apt-get update -y \
&& python3 --version && python3 -m pip --version && python3 --version && python3 -m pip --version
RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython
RUN apt-get update && apt-get install -y libjpeg-dev libsox-dev libsox-fmt-all sox && rm -rf /var/lib/apt/lists/*
FROM base AS build_triton FROM base AS build_triton
ARG TRITON_BRANCH ARG TRITON_BRANCH
@ -66,11 +70,14 @@ RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install
FROM base AS build_pytorch FROM base AS build_pytorch
ARG PYTORCH_BRANCH ARG PYTORCH_BRANCH
ARG PYTORCH_VISION_BRANCH ARG PYTORCH_VISION_BRANCH
ARG PYTORCH_AUDIO_BRANCH
ARG PYTORCH_REPO ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO ARG PYTORCH_VISION_REPO
ARG PYTORCH_AUDIO_REPO
RUN git clone ${PYTORCH_REPO} pytorch RUN git clone ${PYTORCH_REPO} pytorch
RUN cd pytorch && git checkout ${PYTORCH_BRANCH} && \ RUN cd pytorch && git checkout ${PYTORCH_BRANCH} \
pip install -r requirements.txt && git submodule update --init --recursive \ && pip install -r requirements.txt && git submodule update --init --recursive \
&& python3 tools/amd_build/build_amd.py \ && python3 tools/amd_build/build_amd.py \
&& CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \ && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \
&& pip install dist/*.whl && pip install dist/*.whl
@ -78,8 +85,15 @@ RUN git clone ${PYTORCH_VISION_REPO} vision
RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \ RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
&& python3 setup.py bdist_wheel --dist-dir=dist \ && python3 setup.py bdist_wheel --dist-dir=dist \
&& pip install dist/*.whl && pip install dist/*.whl
RUN git clone ${PYTORCH_AUDIO_REPO} audio
RUN cd audio && git checkout ${PYTORCH_AUDIO_BRANCH} \
&& git submodule update --init --recursive \
&& pip install -r requirements.txt \
&& python3 setup.py bdist_wheel --dist-dir=dist \
&& pip install dist/*.whl
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \ RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
&& cp /app/vision/dist/*.whl /app/install && cp /app/vision/dist/*.whl /app/install \
&& cp /app/audio/dist/*.whl /app/install
FROM base AS build_fa FROM base AS build_fa
ARG FA_BRANCH ARG FA_BRANCH
@ -130,6 +144,8 @@ ARG PYTORCH_BRANCH
ARG PYTORCH_VISION_BRANCH ARG PYTORCH_VISION_BRANCH
ARG PYTORCH_REPO ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO ARG PYTORCH_VISION_REPO
ARG PYTORCH_AUDIO_BRANCH
ARG PYTORCH_AUDIO_REPO
ARG FA_BRANCH ARG FA_BRANCH
ARG FA_REPO ARG FA_REPO
ARG AITER_BRANCH ARG AITER_BRANCH
@ -141,7 +157,9 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "PYTORCH_VISION_BRANCH: ${PYTORCH_VISION_BRANCH}" >> /app/versions.txt \ && echo "PYTORCH_VISION_BRANCH: ${PYTORCH_VISION_BRANCH}" >> /app/versions.txt \
&& echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \ && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
&& echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \ && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
&& echo "PYTORCH_AUDIO_BRANCH: ${PYTORCH_AUDIO_BRANCH}" >> /app/versions.txt \
&& echo "PYTORCH_AUDIO_REPO: ${PYTORCH_AUDIO_REPO}" >> /app/versions.txt \
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \ && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \ && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt

View File

@ -5,11 +5,7 @@ nav:
- Getting Started: - Getting Started:
- getting_started/quickstart.md - getting_started/quickstart.md
- getting_started/installation - getting_started/installation
- Examples: - Examples: examples
- examples/README.md
- Offline Inference: examples/offline_inference
- Online Serving: examples/online_serving
- Others: examples/others
- General: - General:
- usage/v1_guide.md - usage/v1_guide.md
- usage/* - usage/*
@ -52,6 +48,11 @@ nav:
- Plugins: - Plugins:
- design/*plugin*.md - design/*plugin*.md
- design/* - design/*
- Benchmarking:
- benchmarking/README.md
- benchmarking/cli.md
- benchmarking/sweeps.md
- benchmarking/dashboard.md
- API Reference: - API Reference:
- api/README.md - api/README.md
- api/vllm - api/vllm

View File

@ -0,0 +1,7 @@
# Benchmark Suites
vLLM provides comprehensive benchmarking tools for performance testing and evaluation:
- **[Benchmark CLI](./cli.md)**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing.
- **[Parameter Sweeps](./sweeps.md)**: Automate `vllm bench` runs for multiple configurations, useful for [optimization and tuning](../configuration/optimization.md).
- **[Performance Dashboard](./dashboard.md)**: Automated CI that publishes benchmarks on each commit.

View File

@ -1,22 +1,10 @@
--- # Benchmark CLI
toc_depth: 4
---
# Benchmark Suites This section guides you through running benchmark tests with the extensive datasets supported on vLLM.
vLLM provides comprehensive benchmarking tools for performance testing and evaluation: It's a living document, updated as new features and datasets become available.
- **[Benchmark CLI](#benchmark-cli)**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing ## Dataset Overview
- **[Parameter sweeps](#parameter-sweeps)**: Automate `vllm bench` runs for multiple configurations
- **[Performance benchmarks](#performance-benchmarks)**: Automated CI benchmarks for development
## Benchmark CLI
This section guides you through running benchmark tests with the extensive
datasets supported on vLLM. It's a living document, updated as new features and datasets
become available.
### Dataset Overview
<style> <style>
th { th {
@ -59,9 +47,9 @@ Legend:
--dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat --dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat
``` ```
### Examples ## Examples
#### 🚀 Online Benchmark ### 🚀 Online Benchmark
<details class="admonition abstract" markdown="1"> <details class="admonition abstract" markdown="1">
<summary>Show more</summary> <summary>Show more</summary>
@ -112,7 +100,7 @@ P99 ITL (ms): 8.39
================================================== ==================================================
``` ```
##### Custom Dataset #### Custom Dataset
If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
@ -145,7 +133,7 @@ vllm bench serve --port 9001 --save-result --save-detailed \
You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`. You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
##### VisionArena Benchmark for Vision Language Models #### VisionArena Benchmark for Vision Language Models
```bash ```bash
# need a model with vision capability here # need a model with vision capability here
@ -163,7 +151,7 @@ vllm bench serve \
--num-prompts 1000 --num-prompts 1000
``` ```
##### InstructCoder Benchmark with Speculative Decoding #### InstructCoder Benchmark with Speculative Decoding
``` bash ``` bash
vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
@ -180,7 +168,7 @@ vllm bench serve \
--num-prompts 2048 --num-prompts 2048
``` ```
##### Spec Bench Benchmark with Speculative Decoding #### Spec Bench Benchmark with Speculative Decoding
``` bash ``` bash
vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
@ -217,7 +205,7 @@ vllm bench serve \
--spec-bench-category "summarization" --spec-bench-category "summarization"
``` ```
##### Other HuggingFaceDataset Examples #### Other HuggingFaceDataset Examples
```bash ```bash
vllm serve Qwen/Qwen2-VL-7B-Instruct vllm serve Qwen/Qwen2-VL-7B-Instruct
@ -283,7 +271,7 @@ vllm bench serve \
--blazedit-max-distance 0.99 --blazedit-max-distance 0.99
``` ```
##### Running With Sampling Parameters #### Running With Sampling Parameters
When using OpenAI-compatible backends such as `vllm`, optional sampling When using OpenAI-compatible backends such as `vllm`, optional sampling
parameters can be specified. Example client command: parameters can be specified. Example client command:
@ -301,7 +289,7 @@ vllm bench serve \
--num-prompts 10 --num-prompts 10
``` ```
##### Running With Ramp-Up Request Rate #### Running With Ramp-Up Request Rate
The benchmark tool also supports ramping up the request rate over the The benchmark tool also supports ramping up the request rate over the
duration of the benchmark run. This can be useful for stress testing the duration of the benchmark run. This can be useful for stress testing the
@ -318,11 +306,11 @@ The following arguments can be used to control the ramp-up:
- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark. - `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
- `--ramp-up-end-rps`: The request rate at the end of the benchmark. - `--ramp-up-end-rps`: The request rate at the end of the benchmark.
##### Load Pattern Configuration #### Load Pattern Configuration
vLLM's benchmark serving script provides sophisticated load pattern simulation capabilities through three key parameters that control request generation and concurrency behavior: vLLM's benchmark serving script provides sophisticated load pattern simulation capabilities through three key parameters that control request generation and concurrency behavior:
###### Load Pattern Control Parameters ##### Load Pattern Control Parameters
- `--request-rate`: Controls the target request generation rate (requests per second). Set to `inf` for maximum throughput testing or finite values for controlled load simulation. - `--request-rate`: Controls the target request generation rate (requests per second). Set to `inf` for maximum throughput testing or finite values for controlled load simulation.
- `--burstiness`: Controls traffic variability using a Gamma distribution (range: > 0). Lower values create bursty traffic, higher values create uniform traffic. - `--burstiness`: Controls traffic variability using a Gamma distribution (range: > 0). Lower values create bursty traffic, higher values create uniform traffic.
@ -387,7 +375,7 @@ Using KV cache metrics for load pattern configuration:
</details> </details>
#### 📈 Offline Throughput Benchmark ### 📈 Offline Throughput Benchmark
<details class="admonition abstract" markdown="1"> <details class="admonition abstract" markdown="1">
<summary>Show more</summary> <summary>Show more</summary>
@ -408,7 +396,7 @@ Total num prompt tokens: 5014
Total num output tokens: 1500 Total num output tokens: 1500
``` ```
##### VisionArena Benchmark for Vision Language Models #### VisionArena Benchmark for Vision Language Models
```bash ```bash
vllm bench throughput \ vllm bench throughput \
@ -428,7 +416,7 @@ Total num prompt tokens: 14527
Total num output tokens: 1280 Total num output tokens: 1280
``` ```
##### InstructCoder Benchmark with Speculative Decoding #### InstructCoder Benchmark with Speculative Decoding
``` bash ``` bash
VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_WORKER_MULTIPROC_METHOD=spawn \
@ -451,7 +439,7 @@ Total num prompt tokens: 261136
Total num output tokens: 204800 Total num output tokens: 204800
``` ```
##### Other HuggingFaceDataset Examples #### Other HuggingFaceDataset Examples
`lmms-lab/LLaVA-OneVision-Data`: `lmms-lab/LLaVA-OneVision-Data`:
@ -509,20 +497,20 @@ vllm bench throughput \
</details> </details>
#### 🛠️ Structured Output Benchmark ### 🛠️ Structured Output Benchmark
<details class="admonition abstract" markdown="1"> <details class="admonition abstract" markdown="1">
<summary>Show more</summary> <summary>Show more</summary>
Benchmark the performance of structured output generation (JSON, grammar, regex). Benchmark the performance of structured output generation (JSON, grammar, regex).
##### Server Setup #### Server Setup
```bash ```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B vllm serve NousResearch/Hermes-3-Llama-3.1-8B
``` ```
##### JSON Schema Benchmark #### JSON Schema Benchmark
```bash ```bash
python3 benchmarks/benchmark_serving_structured_output.py \ python3 benchmarks/benchmark_serving_structured_output.py \
@ -534,7 +522,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \
--num-prompts 1000 --num-prompts 1000
``` ```
##### Grammar-based Generation Benchmark #### Grammar-based Generation Benchmark
```bash ```bash
python3 benchmarks/benchmark_serving_structured_output.py \ python3 benchmarks/benchmark_serving_structured_output.py \
@ -546,7 +534,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \
--num-prompts 1000 --num-prompts 1000
``` ```
##### Regex-based Generation Benchmark #### Regex-based Generation Benchmark
```bash ```bash
python3 benchmarks/benchmark_serving_structured_output.py \ python3 benchmarks/benchmark_serving_structured_output.py \
@ -557,7 +545,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \
--num-prompts 1000 --num-prompts 1000
``` ```
##### Choice-based Generation Benchmark #### Choice-based Generation Benchmark
```bash ```bash
python3 benchmarks/benchmark_serving_structured_output.py \ python3 benchmarks/benchmark_serving_structured_output.py \
@ -568,7 +556,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \
--num-prompts 1000 --num-prompts 1000
``` ```
##### XGrammar Benchmark Dataset #### XGrammar Benchmark Dataset
```bash ```bash
python3 benchmarks/benchmark_serving_structured_output.py \ python3 benchmarks/benchmark_serving_structured_output.py \
@ -581,14 +569,14 @@ python3 benchmarks/benchmark_serving_structured_output.py \
</details> </details>
#### 📚 Long Document QA Benchmark ### 📚 Long Document QA Benchmark
<details class="admonition abstract" markdown="1"> <details class="admonition abstract" markdown="1">
<summary>Show more</summary> <summary>Show more</summary>
Benchmark the performance of long document question-answering with prefix caching. Benchmark the performance of long document question-answering with prefix caching.
##### Basic Long Document QA Test #### Basic Long Document QA Test
```bash ```bash
python3 benchmarks/benchmark_long_document_qa_throughput.py \ python3 benchmarks/benchmark_long_document_qa_throughput.py \
@ -600,7 +588,7 @@ python3 benchmarks/benchmark_long_document_qa_throughput.py \
--repeat-count 5 --repeat-count 5
``` ```
##### Different Repeat Modes #### Different Repeat Modes
```bash ```bash
# Random mode (default) - shuffle prompts randomly # Random mode (default) - shuffle prompts randomly
@ -633,14 +621,14 @@ python3 benchmarks/benchmark_long_document_qa_throughput.py \
</details> </details>
#### 🗂️ Prefix Caching Benchmark ### 🗂️ Prefix Caching Benchmark
<details class="admonition abstract" markdown="1"> <details class="admonition abstract" markdown="1">
<summary>Show more</summary> <summary>Show more</summary>
Benchmark the efficiency of automatic prefix caching. Benchmark the efficiency of automatic prefix caching.
##### Fixed Prompt with Prefix Caching #### Fixed Prompt with Prefix Caching
```bash ```bash
python3 benchmarks/benchmark_prefix_caching.py \ python3 benchmarks/benchmark_prefix_caching.py \
@ -651,7 +639,7 @@ python3 benchmarks/benchmark_prefix_caching.py \
--input-length-range 128:256 --input-length-range 128:256
``` ```
##### ShareGPT Dataset with Prefix Caching #### ShareGPT Dataset with Prefix Caching
```bash ```bash
# download dataset # download dataset
@ -682,14 +670,14 @@ vllm bench serve \
</details> </details>
#### ⚡ Request Prioritization Benchmark ### ⚡ Request Prioritization Benchmark
<details class="admonition abstract" markdown="1"> <details class="admonition abstract" markdown="1">
<summary>Show more</summary> <summary>Show more</summary>
Benchmark the performance of request prioritization in vLLM. Benchmark the performance of request prioritization in vLLM.
##### Basic Prioritization Test #### Basic Prioritization Test
```bash ```bash
python3 benchmarks/benchmark_prioritization.py \ python3 benchmarks/benchmark_prioritization.py \
@ -700,7 +688,7 @@ python3 benchmarks/benchmark_prioritization.py \
--scheduling-policy priority --scheduling-policy priority
``` ```
##### Multiple Sequences per Prompt #### Multiple Sequences per Prompt
```bash ```bash
python3 benchmarks/benchmark_prioritization.py \ python3 benchmarks/benchmark_prioritization.py \
@ -714,14 +702,14 @@ python3 benchmarks/benchmark_prioritization.py \
</details> </details>
#### 👁️ Multi-Modal Benchmark ### 👁️ Multi-Modal Benchmark
<details class="admonition abstract" markdown="1"> <details class="admonition abstract" markdown="1">
<summary>Show more</summary> <summary>Show more</summary>
Benchmark the performance of multi-modal requests in vLLM. Benchmark the performance of multi-modal requests in vLLM.
##### Images (ShareGPT4V) #### Images (ShareGPT4V)
Start vLLM: Start vLLM:
@ -747,7 +735,7 @@ vllm bench serve \
--endpoint /v1/chat/completions --endpoint /v1/chat/completions
``` ```
##### Videos (ShareGPT4Video) #### Videos (ShareGPT4Video)
Start vLLM: Start vLLM:
@ -773,7 +761,7 @@ vllm bench serve \
--endpoint /v1/chat/completions --endpoint /v1/chat/completions
``` ```
##### Synthetic Random Images (random-mm) #### Synthetic Random Images (random-mm)
Generate synthetic image inputs alongside random text prompts to stress-test vision models without external datasets. Generate synthetic image inputs alongside random text prompts to stress-test vision models without external datasets.
@ -846,14 +834,14 @@ This should be seen as an edge case, and if this behavior can be avoided by sett
</details> </details>
#### Embedding Benchmark ### Embedding Benchmark
Benchmark the performance of embedding requests in vLLM. Benchmark the performance of embedding requests in vLLM.
<details class="admonition abstract" markdown="1"> <details class="admonition abstract" markdown="1">
<summary>Show more</summary> <summary>Show more</summary>
##### Text Embeddings #### Text Embeddings
Unlike generative models which use Completions API or Chat Completions API, Unlike generative models which use Completions API or Chat Completions API,
you should set `--backend openai-embeddings` and `--endpoint /v1/embeddings` to use the Embeddings API. you should set `--backend openai-embeddings` and `--endpoint /v1/embeddings` to use the Embeddings API.
@ -879,7 +867,7 @@ vllm bench serve \
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json
``` ```
##### Multi-modal Embeddings #### Multi-modal Embeddings
Unlike generative models which use Completions API or Chat Completions API, Unlike generative models which use Completions API or Chat Completions API,
you should set `--endpoint /v1/embeddings` to use the Embeddings API. The backend to use depends on the model: you should set `--endpoint /v1/embeddings` to use the Embeddings API. The backend to use depends on the model:
@ -944,7 +932,7 @@ vllm bench serve \
</details> </details>
#### Reranker Benchmark ### Reranker Benchmark
Benchmark the performance of rerank requests in vLLM. Benchmark the performance of rerank requests in vLLM.
@ -988,222 +976,3 @@ to account for the extra prompt which is the query. The token accounting to repo
throughput numbers correctly is also adjusted. throughput numbers correctly is also adjusted.
</details> </details>
## Parameter Sweeps
### Online Benchmark
[`vllm/benchmarks/sweep/serve.py`](../../vllm/benchmarks/sweep/serve.py) automatically starts `vllm serve` and runs `vllm bench serve` to evaluate vLLM over multiple configurations.
Follow these steps to run the script:
1. Construct the base command to `vllm serve`, and pass it to the `--serve-cmd` option.
2. Construct the base command to `vllm bench serve`, and pass it to the `--bench-cmd` option.
3. (Optional) If you would like to vary the settings of `vllm serve`, create a new JSON file and populate it with the parameter combinations you want to test. Pass the file path to `--serve-params`.
- Example: Tuning `--max-num-seqs` and `--max-num-batched-tokens`:
```json
[
{
"max_num_seqs": 32,
"max_num_batched_tokens": 1024
},
{
"max_num_seqs": 64,
"max_num_batched_tokens": 1024
},
{
"max_num_seqs": 64,
"max_num_batched_tokens": 2048
},
{
"max_num_seqs": 128,
"max_num_batched_tokens": 2048
},
{
"max_num_seqs": 128,
"max_num_batched_tokens": 4096
},
{
"max_num_seqs": 256,
"max_num_batched_tokens": 4096
}
]
```
4. (Optional) If you would like to vary the settings of `vllm bench serve`, create a new JSON file and populate it with the parameter combinations you want to test. Pass the file path to `--bench-params`.
- Example: Using different input/output lengths for random dataset:
```json
[
{
"random_input_len": 128,
"random_output_len": 32
},
{
"random_input_len": 256,
"random_output_len": 64
},
{
"random_input_len": 512,
"random_output_len": 128
}
]
```
5. Determine where you want to save the results, and pass that to `--output-dir`.
Example command:
```bash
vllm bench sweep serve \
--serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
--bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
--serve-params benchmarks/serve_hparams.json \
--bench-params benchmarks/bench_hparams.json \
-o benchmarks/results
```
!!! important
If both `--serve-params` and `--bench-params` are passed, the script will iterate over the Cartesian product between them.
You can use `--dry-run` to preview the commands to be run.
We only start the server once for each `--serve-params`, and keep it running for multiple `--bench-params`.
Between each benchmark run, we call the `/reset_prefix_cache` and `/reset_mm_cache` endpoints to get a clean slate for the next run.
In case you are using a custom `--serve-cmd`, you can override the commands used for resetting the state by setting `--after-bench-cmd`.
!!! note
By default, each parameter combination is run 3 times to make the results more reliable. You can adjust the number of runs by setting `--num-runs`.
!!! tip
You can use the `--resume` option to continue the parameter sweep if one of the runs failed.
### SLA Auto-Tuner
[`vllm/benchmarks/sweep/serve_sla.py`](../../vllm/benchmarks/sweep/serve_sla.py) is a wrapper over [`vllm/benchmarks/sweep/serve.py`](../../vllm/benchmarks/sweep/serve.py) that tunes either the request rate or concurrency (choose using `--sla-variable`) in order to satisfy the SLA constraints given by `--sla-params`.
For example, to ensure E2E latency within different target values for 99% of requests:
```json
[
{
"p99_e2el_ms": "<=200"
},
{
"p99_e2el_ms": "<=500"
},
{
"p99_e2el_ms": "<=1000"
},
{
"p99_e2el_ms": "<=2000"
}
]
```
Example command:
```bash
vllm bench sweep serve_sla \
--serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
--bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
--serve-params benchmarks/serve_hparams.json \
--bench-params benchmarks/bench_hparams.json \
--sla-params benchmarks/sla_hparams.json \
--sla-variable max_concurrency \
-o benchmarks/results
```
The algorithm for adjusting the SLA variable is as follows:
1. Run the benchmark with infinite QPS, and use the corresponding metrics to determine the initial value of the variable.
- For example, the initial request rate is set to the concurrency under infinite QPS.
2. If the SLA is still satisfied, keep doubling the value until the SLA is no longer satisfied. This gives a relatively narrow window that contains the point where the SLA is barely satisfied.
3. Apply binary search over the window to find the maximum value that still satisfies the SLA.
!!! important
SLA tuning is applied over each combination of `--serve-params`, `--bench-params`, and `--sla-params`.
For a given combination of `--serve-params` and `--bench-params`, we share the benchmark results across `--sla-params` to avoid rerunning benchmarks with the same SLA variable value.
### Visualizer
[`vllm/benchmarks/sweep/plot.py`](../../vllm/benchmarks/sweep/plot.py) can be used to plot performance curves from parameter sweep results.
Example command:
```bash
vllm bench sweep plot benchmarks/results/<timestamp> \
--var-x max_concurrency \
--row-by random_input_len \
--col-by random_output_len \
--curve-by api_server_count,max_num_batched_tokens \
--filter-by 'max_concurrency<=1024'
```
!!! tip
You can use `--dry-run` to preview the figures to be plotted.
## Performance Benchmarks
The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM.
### Manually Trigger the benchmark
Use [vllm-ci-test-repo images](https://gallery.ecr.aws/q9t5s3a7/vllm-ci-test-repo) with vLLM benchmark suite.
For CPU environment, please use the image with "-cpu" postfix.
Here is an example docker run command for a CPU environment.
```bash
docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface -e HF_TOKEN='' --shm-size=16g --name vllm-cpu-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:1da94e673c257373280026f75ceb4effac80e892-cpu
```
Then, run the following command inside the docker container.
```bash
bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
```
When run, the benchmark script generates results under the **benchmark/results** folder, along with the benchmark_results.md and benchmark_results.json files.
#### Runtime environment variables
- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
- `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file).
- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
For more results visualization, check the [visualizing the results](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md#visualizing-the-results).
The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md).
### Continuous Benchmarking
The continuous benchmarking provides automated performance monitoring for vLLM across different models and GPU devices. This helps track vLLM's performance characteristics over time and identify any performance regressions or improvements.
#### How It Works
The continuous benchmarking is triggered via a [GitHub workflow CI](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) in the PyTorch infrastructure repository, which runs automatically every 4 hours. The workflow executes three types of performance tests:
- **Serving tests**: Measure request handling and API performance
- **Throughput tests**: Evaluate token generation rates
- **Latency tests**: Assess response time characteristics
#### Benchmark Configuration
The benchmarking currently runs on a predefined set of models configured in the [vllm-benchmarks directory](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks). To add new models for benchmarking:
1. Navigate to the appropriate GPU directory in the benchmarks configuration
2. Add your model specifications to the corresponding configuration files
3. The new models will be included in the next scheduled benchmark run
#### Viewing Results
All continuous benchmarking results are automatically published to the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).

View File

@ -0,0 +1,58 @@
# Performance Dashboard
The performance dashboard is used to confirm whether new changes improve/degrade performance under various workloads.
It is updated by triggering benchmark runs on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM.
The results are automatically published to the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
## Manually Trigger the benchmark
Use [vllm-ci-test-repo images](https://gallery.ecr.aws/q9t5s3a7/vllm-ci-test-repo) with vLLM benchmark suite.
For CPU environment, please use the image with "-cpu" postfix.
Here is an example docker run command for a CPU environment.
```bash
docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface -e HF_TOKEN='' --shm-size=16g --name vllm-cpu-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:1da94e673c257373280026f75ceb4effac80e892-cpu
```
Then, run the following command inside the docker container.
```bash
bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
```
When run, the benchmark script generates results under the **benchmark/results** folder, along with the benchmark_results.md and benchmark_results.json files.
### Runtime environment variables
- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
- `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file).
- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
For more results visualization, check the [visualizing the results](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md#visualizing-the-results).
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md).
## Continuous Benchmarking
The continuous benchmarking provides automated performance monitoring for vLLM across different models and GPU devices. This helps track vLLM's performance characteristics over time and identify any performance regressions or improvements.
### How It Works
The continuous benchmarking is triggered via a [GitHub workflow CI](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) in the PyTorch infrastructure repository, which runs automatically every 4 hours. The workflow executes three types of performance tests:
- **Serving tests**: Measure request handling and API performance
- **Throughput tests**: Evaluate token generation rates
- **Latency tests**: Assess response time characteristics
### Benchmark Configuration
The benchmarking currently runs on a predefined set of models configured in the [vllm-benchmarks directory](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks). To add new models for benchmarking:
1. Navigate to the appropriate GPU directory in the benchmarks configuration
2. Add your model specifications to the corresponding configuration files
3. The new models will be included in the next scheduled benchmark run

178
docs/benchmarking/sweeps.md Normal file
View File

@ -0,0 +1,178 @@
# Parameter Sweeps
## Online Benchmark
### Basic
`vllm bench sweep serve` automatically starts `vllm serve` and runs `vllm bench serve` to evaluate vLLM over multiple configurations.
Follow these steps to run the script:
1. Construct the base command to `vllm serve`, and pass it to the `--serve-cmd` option.
2. Construct the base command to `vllm bench serve`, and pass it to the `--bench-cmd` option.
3. (Optional) If you would like to vary the settings of `vllm serve`, create a new JSON file and populate it with the parameter combinations you want to test. Pass the file path to `--serve-params`.
- Example: Tuning `--max-num-seqs` and `--max-num-batched-tokens`:
```json
[
{
"max_num_seqs": 32,
"max_num_batched_tokens": 1024
},
{
"max_num_seqs": 64,
"max_num_batched_tokens": 1024
},
{
"max_num_seqs": 64,
"max_num_batched_tokens": 2048
},
{
"max_num_seqs": 128,
"max_num_batched_tokens": 2048
},
{
"max_num_seqs": 128,
"max_num_batched_tokens": 4096
},
{
"max_num_seqs": 256,
"max_num_batched_tokens": 4096
}
]
```
4. (Optional) If you would like to vary the settings of `vllm bench serve`, create a new JSON file and populate it with the parameter combinations you want to test. Pass the file path to `--bench-params`.
- Example: Using different input/output lengths for random dataset:
```json
[
{
"random_input_len": 128,
"random_output_len": 32
},
{
"random_input_len": 256,
"random_output_len": 64
},
{
"random_input_len": 512,
"random_output_len": 128
}
]
```
5. Determine where you want to save the results, and pass that to `--output-dir`.
Example command:
```bash
vllm bench sweep serve \
--serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
--bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
--serve-params benchmarks/serve_hparams.json \
--bench-params benchmarks/bench_hparams.json \
-o benchmarks/results
```
!!! important
If both `--serve-params` and `--bench-params` are passed, the script will iterate over the Cartesian product between them.
You can use `--dry-run` to preview the commands to be run.
We only start the server once for each `--serve-params`, and keep it running for multiple `--bench-params`.
Between each benchmark run, we call the `/reset_prefix_cache` and `/reset_mm_cache` endpoints to get a clean slate for the next run.
In case you are using a custom `--serve-cmd`, you can override the commands used for resetting the state by setting `--after-bench-cmd`.
!!! note
By default, each parameter combination is run 3 times to make the results more reliable. You can adjust the number of runs by setting `--num-runs`.
!!! tip
You can use the `--resume` option to continue the parameter sweep if one of the runs failed.
### SLA auto-tuner
`vllm bench sweep serve_sla` is a wrapper over `vllm bench sweep serve` that tunes either the request rate or concurrency (choose using `--sla-variable`) in order to satisfy the SLA constraints given by `--sla-params`.
For example, to ensure E2E latency within different target values for 99% of requests:
```json
[
{
"p99_e2el_ms": "<=200"
},
{
"p99_e2el_ms": "<=500"
},
{
"p99_e2el_ms": "<=1000"
},
{
"p99_e2el_ms": "<=2000"
}
]
```
Example command:
```bash
vllm bench sweep serve_sla \
--serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
--bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
--serve-params benchmarks/serve_hparams.json \
--bench-params benchmarks/bench_hparams.json \
--sla-params benchmarks/sla_hparams.json \
--sla-variable max_concurrency \
-o benchmarks/results
```
The algorithm for adjusting the SLA variable is as follows:
1. Run the benchmark with infinite QPS, and use the corresponding metrics to determine the initial value of the variable.
- For example, the initial request rate is set to the concurrency under infinite QPS.
2. If the SLA is still satisfied, keep doubling the value until the SLA is no longer satisfied. This gives a relatively narrow window that contains the point where the SLA is barely satisfied.
3. Apply binary search over the window to find the maximum value that still satisfies the SLA.
!!! important
SLA tuning is applied over each combination of `--serve-params`, `--bench-params`, and `--sla-params`.
For a given combination of `--serve-params` and `--bench-params`, we share the benchmark results across `--sla-params` to avoid rerunning benchmarks with the same SLA variable value.
## Visualization
### Basic
`vllm bench sweep plot` can be used to plot performance curves from parameter sweep results.
Example command:
```bash
vllm bench sweep plot benchmarks/results/<timestamp> \
--var-x max_concurrency \
--row-by random_input_len \
--col-by random_output_len \
--curve-by api_server_count,max_num_batched_tokens \
--filter-by 'max_concurrency<=1024'
```
!!! tip
You can use `--dry-run` to preview the figures to be plotted.
### Pareto chart
`vllm bench sweep plot_pareto` helps pick configurations that balance per-user and per-GPU throughput.
Higher concurrency or batch size can raise GPU efficiency (per-GPU) but can add per-user latency, while lower concurrency improves the per-user rate but underutilizes GPUs. The Pareto frontier shows the best achievable pairs across your runs.
- x-axis: tokens/s/user = `output_throughput` ÷ concurrency (`--user-count-var`, default `max_concurrency`, fallback `max_concurrent_requests`).
- y-axis: tokens/s/GPU = `output_throughput` ÷ GPU count (`--gpu-count-var` if set; otherwise gpu_count is TP×PP×DP).
- Output: a single figure at `OUTPUT_DIR/pareto/PARETO.png`.
- Show the configuration used in each data point with `--label-by` (default: `max_concurrency,gpu_count`).
Example:
```bash
vllm bench sweep plot_pareto benchmarks/results/<timestamp> \
--label-by max_concurrency,tensor_parallel_size,pipeline_parallel_size
```

View File

@ -0,0 +1,9 @@
# vllm bench sweep plot_pareto
## JSON CLI Arguments
--8<-- "docs/cli/json_tip.inc.md"
## Arguments
--8<-- "docs/argparse/bench_sweep_plot_pareto.inc.md"

View File

@ -10,6 +10,7 @@ Stay tuned for upcoming meetups! Follow us on [Twitter/X](https://x.com/vllm_pro
Below you'll find slides and recordings from our previous meetups: Below you'll find slides and recordings from our previous meetups:
- [vLLM Bangkok Meetup](https://luma.com/v0f647nv), November 21st 2025. [[Slides]](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing)
- [vLLM Zurich Meetup](https://luma.com/0gls27kb), November 6th 2025. [[Slides]](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) [[Recording]](https://www.youtube.com/watch?v=6m6ZE6yVEDI) - [vLLM Zurich Meetup](https://luma.com/0gls27kb), November 6th 2025. [[Slides]](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) [[Recording]](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
- [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w), November 1st 2025. [[Slides]](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link) - [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w), November 1st 2025. [[Slides]](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link)
- [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg), October 25th 2025. [[Slides]](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6) - [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg), October 25th 2025. [[Slides]](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6)

View File

@ -29,7 +29,7 @@ The initialization code should look like this:
```python ```python
from torch import nn from torch import nn
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.attention import Attention from vllm.attention.layer import Attention
class MyAttention(nn.Module): class MyAttention(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str): def __init__(self, vllm_config: VllmConfig, prefix: str):
@ -113,8 +113,6 @@ See [this page](registration.md) for instructions on how to register your new mo
### How to support models with interleaving sliding windows? ### How to support models with interleaving sliding windows?
For models with interleaving sliding windows (e.g. `google/gemma-2-2b-it` and `mistralai/Ministral-8B-Instruct-2410`), the scheduler will treat the model as a full-attention model, i.e., kv-cache of all tokens will not be dropped. This is to make sure prefix caching works with these models. Sliding window only appears as a parameter to the attention kernel computation.
To support a model with interleaving sliding windows, we need to take care of the following details: To support a model with interleaving sliding windows, we need to take care of the following details:
- Make sure the model's `config.json` contains `layer_types`. - Make sure the model's `config.json` contains `layer_types`.

View File

@ -11,6 +11,8 @@ We support tracing vLLM workers using the `torch.profiler` module. You can enabl
- `VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1` to record memory, off by default - `VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1` to record memory, off by default
- `VLLM_TORCH_PROFILER_WITH_STACK=1` to enable recording stack information, on by default - `VLLM_TORCH_PROFILER_WITH_STACK=1` to enable recording stack information, on by default
- `VLLM_TORCH_PROFILER_WITH_FLOPS=1` to enable recording FLOPs, off by default - `VLLM_TORCH_PROFILER_WITH_FLOPS=1` to enable recording FLOPs, off by default
- `VLLM_TORCH_PROFILER_USE_GZIP=0` to disable gzip-compressing profiling files, on by default
- `VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL=0` to disable dumping and printing the aggregated CUDA self time table, on by default
The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set. The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set.

View File

@ -84,12 +84,14 @@ See the following figures for a quick comparison between the previous and curren
```python ```python
class BatchDescriptor(NamedTuple): class BatchDescriptor(NamedTuple):
num_tokens: int num_tokens: int
uniform_decode: bool = False num_reqs: int
uniform: bool = False
has_lora: bool = False
``` ```
where `num_tokens` can be the padded token length, and `uniform_decode` is determined by if `max_query_len` of a batch is equal to the desired `max_query_len` of a uniform_decode, and the num_scheduled_tokens is divisible by that desired `max_query_len`. where `num_tokens` can be the padded token length, and `uniform` indicates if all the requests have the same query lengths. Many attention backends only support full cudagraphs when the batches are uniform; pure decode batches are uniform but may not be query length 1 (i.e. `num_tokens == num_reqs`), this occurs in the validation pass of spec-decode where "decode" batches will have a query length of `1+num_spec_tokens`.
The goal of this structure is to uniquely identify a (padded) batch with minimal possible items corresponding to a CUDA Graphs item. We are safe to exclude items like `uniform_query_len` because it is a constant at runtime for a certain setup currently. For example, it should be either `1` for a commonly pure decode or `1+num_spec_tokens` for a validation phase of speculative decode. The goal of this structure is to uniquely identify a (padded) batch with minimal possible items corresponding to a CUDA Graphs item.
!!! note !!! note
The prototype of `BatchDescriptor` may be extended for more general situations in the future, e.g., include more items, like `uniform_query_len` to support multiple different uniform decode lengths settings (<https://github.com/vllm-project/vllm/pull/23679>), or other modifications needed to support CUDA Graphs for models whose inputs are not necessarily token length aware (for example, some multi-modal inputs). The prototype of `BatchDescriptor` may be extended for more general situations in the future, e.g., include more items, like `uniform_query_len` to support multiple different uniform decode lengths settings (<https://github.com/vllm-project/vllm/pull/23679>), or other modifications needed to support CUDA Graphs for models whose inputs are not necessarily token length aware (for example, some multi-modal inputs).

View File

@ -8,9 +8,9 @@ TL;DR:
| Online Flag | Offline Flag | Result | | Online Flag | Offline Flag | Result |
|----------|----------|-------------| |----------|----------|-------------|
| --enforce-eager | enforce_eager=True | Turn off torch.compile and CUDAGraphs | | --enforce-eager | enforce_eager=True | Turn off torch.compile and CUDAGraphs |
| -O.mode=0 | mode=CompilationMode.NONE | Turn off torch.compile only | | -cc.mode=0 | mode=CompilationMode.NONE | Turn off torch.compile only |
| -O.cudagraph_mode=NONE | compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE) | Turn off CUDAGraphs only | | -cc.cudagraph_mode=NONE | compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE) | Turn off CUDAGraphs only |
| -O.backend=eager | compilation_config=CompilationConfig(backend='eager') | Turn off TorchInductor | | -cc.backend=eager | compilation_config=CompilationConfig(backend='eager') | Turn off TorchInductor |
## vLLM-torch.compile overview ## vLLM-torch.compile overview
@ -86,11 +86,11 @@ LLM(model, enforce_eager=True)
``` ```
To turn off just torch.compile, pass `mode = NONE` to the compilation config. To turn off just torch.compile, pass `mode = NONE` to the compilation config.
(`-O` is short for `--compilation_config`): (`-cc` is short for `--compilation_config`; `-O.*` dotted syntax is deprecated):
```sh ```sh
# Online # Online
vllm serve -O.mode=0 vllm serve -cc.mode=0
``` ```
```py ```py
@ -103,7 +103,7 @@ To turn off just CUDAGraphs, pass `cudagraph_mode = NONE`:
```sh ```sh
# Online # Online
vllm serve -O.cudagraph_mode=NONE vllm serve -cc.cudagraph_mode=NONE
``` ```
```py ```py
@ -183,10 +183,10 @@ help debug the issue:
```sh ```sh
# Online - using unbacked mode # Online - using unbacked mode
vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=unbacked vllm serve meta-llama/Llama-3.2-1B -cc.dynamic_shapes_config.type=unbacked
# Online - using backed_size_oblivious mode # Online - using backed_size_oblivious mode
vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=backed_size_oblivious vllm serve meta-llama/Llama-3.2-1B -cc.dynamic_shapes_config.type=backed_size_oblivious
``` ```
```py ```py
@ -233,7 +233,7 @@ to the compilation config:
```sh ```sh
# online # online
vllm serve -O.backend=eager vllm serve -cc.backend=eager
``` ```
```py ```py
@ -252,7 +252,7 @@ You can also use `TORCH_LOGS=output_code <command>` to print the Inductor output
### Editable TorchInductor code ### Editable TorchInductor code
You can edit the TorchInductor code that gets run by setting `VLLM_COMPILE_CACHE_SAVE_FORMAT=unpacked` You can edit the TorchInductor code that gets run by setting `VLLM_COMPILE_CACHE_SAVE_FORMAT=unpacked`
or passing `-O.compile_cache_save_format=unpacked`. The default is `binary`, which means it is not editable. or passing `-cc.compile_cache_save_format=unpacked`. The default is `binary`, which means it is not editable.
This is a useful technique: you can put breakpoints (e.g. `torch.distributed.breakpoint()`) This is a useful technique: you can put breakpoints (e.g. `torch.distributed.breakpoint()`)
and print statements in the output code. and print statements in the output code.
@ -299,7 +299,7 @@ To turn off just CUDAGraphs, pass `cudagraph_mode = NONE`:
```sh ```sh
# Online # Online
vllm serve -O.cudagraph_mode=NONE vllm serve -cc.cudagraph_mode=NONE
``` ```
```py ```py

View File

@ -21,7 +21,7 @@ Let's say we want to serve the popular Qwen model by running `vllm serve Qwen/Qw
Beyond that, there are two more things vLLM depends on Hugging Face for. Beyond that, there are two more things vLLM depends on Hugging Face for.
1. **Tokenizer**: vLLM uses the tokenizer from Hugging Face to tokenize the input text. The tokenizer is loaded using [AutoTokenizer.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained) with the `model` argument as the model name and the `--revision` argument as the revision. It is also possible to use a tokenizer from another model by specifying the `--tokenizer` argument in the `vllm serve` command. Other relevant arguments are `--tokenizer-revision` and `--tokenizer-mode`. Please check Hugging Face's documentation for the meaning of these arguments. This part of the logic can be found in the [get_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L87) function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in [get_cached_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L24). 1. **Tokenizer**: vLLM uses the tokenizer from Hugging Face to tokenize the input text. The tokenizer is loaded using [AutoTokenizer.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained) with the `model` argument as the model name and the `--revision` argument as the revision. It is also possible to use a tokenizer from another model by specifying the `--tokenizer` argument in the `vllm serve` command. Other relevant arguments are `--tokenizer-revision` and `--tokenizer-mode`. Please check Hugging Face's documentation for the meaning of these arguments. This part of the logic can be found in the [get_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L87) function. 
After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in [vllm.tokenizers.hf.get_cached_tokenizer][].
2. **Model weight**: vLLM downloads the model weight from the Hugging Face model hub using the `model` argument as the model name and the `--revision` argument as the revision. vLLM provides the argument `--load-format` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass `--load-format dummy` to skip downloading the weights. 2. **Model weight**: vLLM downloads the model weight from the Hugging Face model hub using the `model` argument as the model name and the `--revision` argument as the revision. vLLM provides the argument `--load-format` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass `--load-format dummy` to skip downloading the weights.
- It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the [documentation](https://huggingface.co/docs/safetensors/en/index) for more information on the safetensors format. This part of the logic can be found [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/model_executor/model_loader/loader.py#L385). Please note that: - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the [documentation](https://huggingface.co/docs/safetensors/en/index) for more information on the safetensors format. This part of the logic can be found [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/model_executor/model_loader/loader.py#L385). Please note that:

View File

@ -77,9 +77,9 @@ The `parse_request` method is used for validating the user prompt and converting
The `pre_process*` methods take the validated plugin input to generate vLLM's model prompts for regular inference. The `pre_process*` methods take the validated plugin input to generate vLLM's model prompts for regular inference.
The `post_process*` methods take `PoolingRequestOutput` objects as input and generate a custom plugin output. The `post_process*` methods take `PoolingRequestOutput` objects as input and generate a custom plugin output.
The `validate_or_generate_params` method is used for validating with the plugin any `SamplingParameters`/`PoolingParameters` received with the user request, or to generate new ones if none are specified. The function always returns the validated/generated parameters. The `validate_or_generate_params` method is used for validating with the plugin any `SamplingParameters`/`PoolingParameters` received with the user request, or to generate new ones if none are specified. The function always returns the validated/generated parameters.
The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/pooling` serving endpoint is available here [vllm/entrypoints/openai/serving_pooling.py](../../vllm/entrypoints/openai/serving_pooling.py). The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/pooling` serving endpoint is available here [vllm/entrypoints/openai/serving_pooling.py](../../vllm/entrypoints/pooling/pooling/serving.py).
An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). Please, also refer to our online ([examples/online_serving/pooling/prithvi_geospatial_mae.py](../../examples/online_serving/pooling/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py)) inference examples. An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). Please, also refer to our online ([examples/pooling/plugin/prithvi_geospatial_mae_client.py](../../examples/pooling/plugin/prithvi_geospatial_mae_client.py)) and offline ([examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py](../../examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py)) inference examples.
## Using an IO Processor plugin ## Using an IO Processor plugin

View File

@ -263,6 +263,29 @@ record:
- End-to-end latency - the interval between frontend `arrival_time` - End-to-end latency - the interval between frontend `arrival_time`
and the frontend receiving the final token. and the frontend receiving the final token.
### KV Cache Residency Metrics
We also emit a set of histograms that describe how long sampled KV cache
blocks stay resident and how often they are reused. Sampling
(`--kv-cache-metrics-sample`) keeps the overhead tiny; when a block is
chosen we record:
- `lifetime` — allocation ⟶ eviction
- `idle before eviction` — last touch ⟶ eviction
- `reuse gaps` — the pauses between touches when the block gets reused
Those map directly to the Prometheus metrics:
- `vllm:kv_block_lifetime_seconds` — how long each sampled block exists.
- `vllm:kv_block_idle_before_evict_seconds` — idle tail after the final access.
- `vllm:kv_block_reuse_gap_seconds` — time between consecutive touches.
The engine core only ships raw eviction events via `SchedulerStats`; the
frontend drains them, turns them into Prometheus observations, and also
exposes the same data through `LLM.get_metrics()` when logging is on.
Looking at lifetime and idle time on one chart makes it easy to spot
stranded cache or workloads that pin prompts for a long decode.
### Metrics Publishing - Logging ### Metrics Publishing - Logging
The `LoggingStatLogger` metrics publisher outputs a log `INFO` message The `LoggingStatLogger` metrics publisher outputs a log `INFO` message

View File

@ -60,7 +60,7 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes.
- [`ModelOptFp8MoEMethod`][vllm.model_executor.layers.quantization.modelopt.ModelOptFp8MoEMethod] - [`ModelOptFp8MoEMethod`][vllm.model_executor.layers.quantization.modelopt.ModelOptFp8MoEMethod]
- [`Fp8MoEMethod`][vllm.model_executor.layers.quantization.fp8.Fp8MoEMethod] - [`Fp8MoEMethod`][vllm.model_executor.layers.quantization.fp8.Fp8MoEMethod]
- [`CompressedTensorsW4A4MoeMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW4A4MoeMethod] - [`CompressedTensorsW4A4Nvfp4MoEMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW4A4Nvfp4MoEMethod]
- [`CompressedTensorsW8A8Fp8MoEMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW8A8Fp8MoEMethod] - [`CompressedTensorsW8A8Fp8MoEMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW8A8Fp8MoEMethod]
- [`Mxfp4MoEMethod`][vllm.model_executor.layers.quantization.mxfp4.Mxfp4MoEMethod] - [`Mxfp4MoEMethod`][vllm.model_executor.layers.quantization.mxfp4.Mxfp4MoEMethod]
- [`UnquantizedFusedMoEMethod`][vllm.model_executor.layers.fused_moe.layer.UnquantizedFusedMoEMethod] - [`UnquantizedFusedMoEMethod`][vllm.model_executor.layers.fused_moe.layer.UnquantizedFusedMoEMethod]

View File

@ -0,0 +1,69 @@
<!-- markdownlint-disable -->
# Optimization Levels
## Overview
vLLM now supports optimization levels (`-O0`, `-O1`, `-O2`, `-O3`). Optimization levels provide an intuitive mechanism for users to trade startup time for performance. Higher levels have better performance but worse startup time. These optimization levels have associated defaults to help users get desired out of the box performance. Importantly, defaults set by optimization levels are purely defaults; explicit user settings will not be overwritten.
## Level Summaries and Usage Examples
```bash
# CLI usage
python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O0
# Python API usage
from vllm.entrypoints.llm import LLM
llm = LLM(
model="RedHatAI/Llama-3.2-1B-FP8",
optimization_level=0
)
```
#### `-O1`: Quick Optimizations
- **Startup**: Moderate startup time
- **Performance**: Inductor compilation, CUDAGraphMode.PIECEWISE
- **Use case**: Balance for most development scenarios
```bash
# CLI usage
python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O1
# Python API usage
from vllm.entrypoints.llm import LLM
llm = LLM(
model="RedHatAI/Llama-3.2-1B-FP8",
optimization_level=1
)
```
#### `-O2`: Full Optimizations (Default)
- **Startup**: Longer startup time
- **Performance**: `-O1` + CUDAGraphMode.FULL_AND_PIECEWISE
- **Use case**: Production workloads where performance is important. This is the default use case. It is also very similar to the previous default. The primary difference is that noop & fusion flags are enabled.
```bash
# CLI usage (default, so optional)
python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O2
# Python API usage
from vllm.entrypoints.llm import LLM
llm = LLM(
model="RedHatAI/Llama-3.2-1B-FP8",
optimization_level=2 # This is the default
)
```
#### `-O3`: Full Optimization
Still in development. Infrastructure has been added to prevent changing the API in a future
release. Currently behaves the same as `-O2`.
## Troubleshooting
### Common Issues
1. **Startup Time Too Long**: Use `-O0` or `-O1` for faster startup
2. **Compilation Errors**: Use `debug_dump_path` for additional debugging information
3. **Performance Issues**: Ensure using `-O2` for production

View File

@ -86,7 +86,7 @@ Every plugin has three parts:
}, },
... ...
) )
``` ```
Please make sure `vllm_add_dummy_platform:register` is a callable function and returns the platform class's fully qualified name. for example: Please make sure `vllm_add_dummy_platform:register` is a callable function and returns the platform class's fully qualified name. for example:

View File

@ -117,7 +117,7 @@ vllm serve meta-llama/Llama-3.2-1B \
# Alternative: Using dot notation (simpler for single values) # Alternative: Using dot notation (simpler for single values)
vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=unbacked vllm serve meta-llama/Llama-3.2-1B -cc.dynamic_shapes_config.type=unbacked
``` ```
#### Choosing the Right Mode #### Choosing the Right Mode

View File

@ -216,14 +216,13 @@ You can add a new `ReasoningParser` similar to [vllm/reasoning/deepseek_r1_reaso
# import the required packages # import the required packages
from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
DeltaMessage)
# define a reasoning parser and register it to vllm # define a reasoning parser and register it to vllm
# the name list in register_module can be used # the name list in register_module can be used
# in --reasoning-parser. # in --reasoning-parser.
class ExampleParser(ReasoningParser): class ExampleParser(ReasoningParser):
def __init__(self, tokenizer: AnyTokenizer): def __init__(self, tokenizer: TokenizerLike):
super().__init__(tokenizer) super().__init__(tokenizer)
def extract_reasoning_streaming( def extract_reasoning_streaming(

View File

@ -371,7 +371,8 @@ Olmo 3 models output tool calls in a format that is very similar to the one expe
Supported models: Supported models:
* TODO (will be updated after Olmo 3 release) * `allenai/Olmo-3-7B-Instruct`
* `allenai/Olmo-3-32B-Think`
Flags: `--tool-call-parser olmo3` Flags: `--tool-call-parser olmo3`
@ -421,7 +422,7 @@ Here is a summary of a plugin file:
# in --tool-call-parser. you can define as many # in --tool-call-parser. you can define as many
# tool parsers as you want here. # tool parsers as you want here.
class ExampleToolParser(ToolParser): class ExampleToolParser(ToolParser):
def __init__(self, tokenizer: AnyTokenizer): def __init__(self, tokenizer: TokenizerLike):
super().__init__(tokenizer) super().__init__(tokenizer)
# adjust request. e.g.: set skip special tokens # adjust request. e.g.: set skip special tokens

View File

@ -46,10 +46,23 @@ vLLM is a Python library that supports the following CPU variants. Select your C
### Pre-built wheels ### Pre-built wheels
Currently, there are no pre-built CPU wheels. Please refer to the instructions for [pre-built wheels on GPU](./gpu.md#pre-built-wheels).
When specifying the index URL, please make sure to use the `cpu` variant subdirectory.
For example, the nightly build index is: `https://wheels.vllm.ai/nightly/cpu/`.
### Build wheel from source ### Build wheel from source
#### Set up using Python-only build (without compilation) {#python-only-build}
Please refer to the instructions for [Python-only build on GPU](./gpu.md#python-only-build), and replace the build commands with:
```bash
VLLM_USE_PRECOMPILED=1 VLLM_PRECOMPILED_WHEEL_VARIANT=cpu VLLM_TARGET_DEVICE=cpu uv pip install --editable .
```
#### Full build (with compilation) {#full-build}
=== "Intel/AMD x86" === "Intel/AMD x86"
--8<-- "docs/getting_started/installation/cpu.x86.inc.md:build-wheel-from-source" --8<-- "docs/getting_started/installation/cpu.x86.inc.md:build-wheel-from-source"
@ -125,6 +138,35 @@ vllm serve facebook/opt-125m --dtype=bfloat16
Note, it is recommended to manually reserve 1 CPU for vLLM front-end process when `world_size == 1`. Note, it is recommended to manually reserve 1 CPU for vLLM front-end process when `world_size == 1`.
### What are supported models on CPU?
For the full and up-to-date list of models validated on CPU platforms, please see the official documentation: [Supported Models on CPU](https://docs.vllm.ai/en/latest/models/hardware_supported_models/cpu)
### How to find benchmark configuration examples for supported CPU models?
For any model listed under [Supported Models on CPU](https://docs.vllm.ai/en/latest/models/hardware_supported_models/cpu), optimized runtime configurations are provided in the vLLM Benchmark Suites CPU test cases, defined in [cpu test cases](https://github.com/vllm-project/vllm/blob/main/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json).
For details on how these optimized configurations are determined, see: [performance-benchmark-details](https://github.com/vllm-project/vllm/tree/main/.buildkite/performance-benchmarks#performance-benchmark-details).
To benchmark the supported models using these optimized settings, follow the steps in [running vLLM Benchmark Suite manually](https://docs.vllm.ai/en/latest/contributing/benchmarks/#manually-trigger-the-benchmark) and run the Benchmark Suite on a CPU environment.
Below is an example command to benchmark all CPU-supported models using optimized configurations.
```bash
ON_CPU=1 bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
```
The benchmark results will be saved in `./benchmark/results/`.
In the directory, the generated `.commands` files contain all example commands for the benchmark.
We recommend configuring tensor-parallel-size to match the number of NUMA nodes on your system. Note that the current release does not support tensor-parallel-size=6.
To determine the number of NUMA nodes available, use the following command:
```bash
lscpu | grep "NUMA node(s):" | awk '{print $3}'
```
For performance reference, users may also consult the [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm&deviceName=cpu)
, which publishes default-model CPU results produced using the same Benchmark Suite.
### How to decide `VLLM_CPU_OMP_THREADS_BIND`? ### How to decide `VLLM_CPU_OMP_THREADS_BIND`?
- Default `auto` thread-binding is recommended for most cases. Ideally, each OpenMP thread will be bound to a dedicated physical core respectively, threads of each rank will be bound to the same NUMA node respectively, and 1 CPU per rank will be reserved for other vLLM components when `world_size > 1`. If you have any performance problems or unexpected binding behaviours, please try to bind threads as following. - Default `auto` thread-binding is recommended for most cases. Ideally, each OpenMP thread will be bound to a dedicated physical core respectively, threads of each rank will be bound to the same NUMA node respectively, and 1 CPU per rank will be reserved for other vLLM components when `world_size > 1`. If you have any performance problems or unexpected binding behaviours, please try to bind threads as following.

View File

@ -26,42 +26,49 @@ uv pip install vllm --torch-backend=auto
??? console "pip" ??? console "pip"
```bash ```bash
# Install vLLM with CUDA 12.8. # Install vLLM with CUDA 12.9.
pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128 pip install vllm --extra-index-url https://download.pytorch.org/whl/cu129
``` ```
We recommend leveraging `uv` to [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu126`), set `--torch-backend=cu126` (or `UV_TORCH_BACKEND=cu126`). If this doesn't work, try running `uv self update` to update `uv` first. We recommend leveraging `uv` to [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu128`), set `--torch-backend=cu128` (or `UV_TORCH_BACKEND=cu128`). If this doesn't work, try running `uv self update` to update `uv` first.
!!! note !!! note
NVIDIA Blackwell GPUs (B200, GB200) require a minimum of CUDA 12.8, so make sure you are installing PyTorch wheels with at least that version. PyTorch itself offers a [dedicated interface](https://pytorch.org/get-started/locally/) to determine the appropriate pip command to run for a given target configuration. NVIDIA Blackwell GPUs (B200, GB200) require a minimum of CUDA 12.8, so make sure you are installing PyTorch wheels with at least that version. PyTorch itself offers a [dedicated interface](https://pytorch.org/get-started/locally/) to determine the appropriate pip command to run for a given target configuration.
As of now, vLLM's binaries are compiled with CUDA 12.8 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.6, 11.8, and public PyTorch release versions: As of now, vLLM's binaries are compiled with CUDA 12.9 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.8, 13.0, and public PyTorch release versions:
```bash ```bash
# Install vLLM with a specific CUDA version (e.g., 11.8 or 12.6). # Install vLLM with a specific CUDA version (e.g., 13.0).
export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//') export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
export CUDA_VERSION=118 # or 126 export CUDA_VERSION=130 # or other
uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION} uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION}
``` ```
#### Install the latest code #### Install the latest code
LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on an x86 platform with CUDA 12 for every commit since `v0.5.3`. LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for every commit since `v0.5.3` on <https://wheels.vllm.ai/nightly>. There are multiple indices that could be used:
* `https://wheels.vllm.ai/nightly`: the default variant (CUDA with version specified in `VLLM_MAIN_CUDA_VERSION`) built with the last commit on the `main` branch. Currently it is CUDA 12.9.
* `https://wheels.vllm.ai/nightly/<variant>`: all other variants. Now this includes `cu130`, and `cpu`. The default variant (`cu129`) also has a subdirectory to keep consistency.
To install from nightly index, run:
```bash ```bash
uv pip install -U vllm \ uv pip install -U vllm \
--torch-backend=auto \ --torch-backend=auto \
--extra-index-url https://wheels.vllm.ai/nightly --extra-index-url https://wheels.vllm.ai/nightly # add variant subdirectory here if needed
``` ```
??? console "pip" !!! warning "`pip` caveat"
```bash
pip install -U vllm \
--pre \
--extra-index-url https://wheels.vllm.ai/nightly
```
`--pre` is required for `pip` to consider pre-released versions. Using `pip` to install from nightly indices is _not supported_, because `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. In contrast, `uv` gives the extra index [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes).
If you insist on using `pip`, you have to specify the full URL of the wheel file (which can be obtained from the web page).
```bash
pip install -U https://wheels.vllm.ai/nightly/vllm-0.11.2.dev399%2Bg3c7461c18-cp38-abi3-manylinux_2_31_x86_64.whl # current nightly build (the filename will change!)
pip install -U https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-0.11.2.dev399%2Bg3c7461c18-cp38-abi3-manylinux_2_31_x86_64.whl # from specific commit
```
##### Install specific revisions ##### Install specific revisions
@ -71,33 +78,13 @@ If you want to access the wheels for previous commits (e.g. to bisect the behavi
export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch
uv pip install vllm \ uv pip install vllm \
--torch-backend=auto \ --torch-backend=auto \
--extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} # add variant subdirectory here if needed
``` ```
The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
??? note "pip"
If you want to access the wheels for previous commits (e.g. to bisect the behavior change,
performance regression), due to the limitation of `pip`, you have to specify the full URL of the
wheel file by embedding the commit hash in the URL:
```bash
export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
```
Note that the wheels are built with Python 3.8 ABI (see [PEP
425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible
with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a
placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in
the wheel metadata (the wheels listed in the extra index url have correct versions). Although we
don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the
wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
# --8<-- [end:pre-built-wheels] # --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source] # --8<-- [start:build-wheel-from-source]
#### Set up using Python-only build (without compilation) #### Set up using Python-only build (without compilation) {#python-only-build}
If you only need to change Python code, you can build and install vLLM without compilation. Using `uv pip`'s [`--editable` flag](https://docs.astral.sh/uv/pip/packages/#editable-packages), changes you make to the code will be reflected when you run vLLM: If you only need to change Python code, you can build and install vLLM without compilation. Using `uv pip`'s [`--editable` flag](https://docs.astral.sh/uv/pip/packages/#editable-packages), changes you make to the code will be reflected when you run vLLM:
@ -121,18 +108,24 @@ This command will do the following:
In case you see an error about wheel not found when running the above command, it might be because the commit you based on in the main branch was just merged and the wheel is being built. In this case, you can wait for around an hour to try again, or manually assign the previous commit in the installation using the `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable. In case you see an error about wheel not found when running the above command, it might be because the commit you based on in the main branch was just merged and the wheel is being built. In this case, you can wait for around an hour to try again, or manually assign the previous commit in the installation using the `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable.
```bash ```bash
export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch export VLLM_PRECOMPILED_WHEEL_COMMIT=$(git rev-parse HEAD~1) # or earlier commit on main
export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl export VLLM_USE_PRECOMPILED=1
uv pip install --editable . uv pip install --editable .
``` ```
There are more environment variables to control the behavior of Python-only build:
* `VLLM_PRECOMPILED_WHEEL_LOCATION`: specify the exact wheel URL or local file path of a pre-compiled wheel to use. All other logic to find the wheel will be skipped.
* `VLLM_PRECOMPILED_WHEEL_COMMIT`: override the commit hash to download the pre-compiled wheel. It can be `nightly` to use the last **already built** commit on the main branch.
* `VLLM_PRECOMPILED_WHEEL_VARIANT`: specify the variant subdirectory to use on the nightly index, e.g., `cu129`, `cpu`. If not specified, the CUDA variant with `VLLM_MAIN_CUDA_VERSION` will be tried, then fallback to the default variant on the remote index.
You can find more information about vLLM's wheels in [Install the latest code](#install-the-latest-code). You can find more information about vLLM's wheels in [Install the latest code](#install-the-latest-code).
!!! note !!! note
There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [Install the latest code](#install-the-latest-code) for instructions on how to install a specified wheel. It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [Install the latest code](#install-the-latest-code) for instructions on how to install a specified wheel.
#### Full build (with compilation) #### Full build (with compilation) {#full-build}
If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes: If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes:

View File

@ -52,7 +52,7 @@ vLLM is a Python library that supports the following GPU variants. Select your G
--8<-- "docs/getting_started/installation/gpu.xpu.inc.md:set-up-using-python" --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:set-up-using-python"
### Pre-built wheels ### Pre-built wheels {#pre-built-wheels}
=== "NVIDIA CUDA" === "NVIDIA CUDA"

View File

@ -94,6 +94,9 @@ def auto_mock(module_name: str, attr: str, max_mocks: int = 100):
bench_latency = auto_mock("vllm.benchmarks", "latency") bench_latency = auto_mock("vllm.benchmarks", "latency")
bench_serve = auto_mock("vllm.benchmarks", "serve") bench_serve = auto_mock("vllm.benchmarks", "serve")
bench_sweep_plot = auto_mock("vllm.benchmarks.sweep.plot", "SweepPlotArgs") bench_sweep_plot = auto_mock("vllm.benchmarks.sweep.plot", "SweepPlotArgs")
bench_sweep_plot_pareto = auto_mock(
"vllm.benchmarks.sweep.plot_pareto", "SweepPlotParetoArgs"
)
bench_sweep_serve = auto_mock("vllm.benchmarks.sweep.serve", "SweepServeArgs") bench_sweep_serve = auto_mock("vllm.benchmarks.sweep.serve", "SweepServeArgs")
bench_sweep_serve_sla = auto_mock( bench_sweep_serve_sla = auto_mock(
"vllm.benchmarks.sweep.serve_sla", "SweepServeSLAArgs" "vllm.benchmarks.sweep.serve_sla", "SweepServeSLAArgs"
@ -221,6 +224,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
"bench_latency": create_parser(bench_latency.add_cli_args), "bench_latency": create_parser(bench_latency.add_cli_args),
"bench_serve": create_parser(bench_serve.add_cli_args), "bench_serve": create_parser(bench_serve.add_cli_args),
"bench_sweep_plot": create_parser(bench_sweep_plot.add_cli_args), "bench_sweep_plot": create_parser(bench_sweep_plot.add_cli_args),
"bench_sweep_plot_pareto": create_parser(bench_sweep_plot_pareto.add_cli_args),
"bench_sweep_serve": create_parser(bench_sweep_serve.add_cli_args), "bench_sweep_serve": create_parser(bench_sweep_serve.add_cli_args),
"bench_sweep_serve_sla": create_parser(bench_sweep_serve_sla.add_cli_args), "bench_sweep_serve_sla": create_parser(bench_sweep_serve_sla.add_cli_args),
"bench_throughput": create_parser(bench_throughput.add_cli_args), "bench_throughput": create_parser(bench_throughput.add_cli_args),

View File

@ -2,7 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools import itertools
import logging import logging
from dataclasses import dataclass, field from dataclasses import dataclass
from functools import cached_property
from pathlib import Path from pathlib import Path
from typing import Literal from typing import Literal
@ -16,13 +17,18 @@ EXAMPLE_DIR = ROOT_DIR / "examples"
EXAMPLE_DOC_DIR = ROOT_DIR / "docs/examples" EXAMPLE_DOC_DIR = ROOT_DIR / "docs/examples"
def fix_case(text: str) -> str: def title(text: str) -> str:
# Default title case
text = text.replace("_", " ").replace("/", " - ").title()
# Custom substitutions
subs = { subs = {
"io": "IO",
"api": "API", "api": "API",
"cli": "CLI", "cli": "CLI",
"cpu": "CPU", "cpu": "CPU",
"llm": "LLM", "llm": "LLM",
"mae": "MAE", "mae": "MAE",
"ner": "NER",
"tpu": "TPU", "tpu": "TPU",
"gguf": "GGUF", "gguf": "GGUF",
"lora": "LoRA", "lora": "LoRA",
@ -48,71 +54,65 @@ class Example:
Attributes: Attributes:
path (Path): The path to the main directory or file. path (Path): The path to the main directory or file.
category (str): The category of the document. category (str): The category of the document.
main_file (Path): The main file in the directory.
other_files (list[Path]): list of other files in the directory. Properties::
title (str): The title of the document. main_file() -> Path | None: Determines the main file in the given path.
other_files() -> list[Path]: Determines other files in the directory excluding
the main file.
title() -> str: Determines the title of the document.
Methods: Methods:
__post_init__(): Initializes the main_file, other_files, and title attributes.
determine_main_file() -> Path: Determines the main file in the given path.
determine_other_files() -> list[Path]: Determines other files in the directory excluding the main file.
determine_title() -> str: Determines the title of the document.
generate() -> str: Generates the documentation content. generate() -> str: Generates the documentation content.
""" # noqa: E501 """
path: Path path: Path
category: str = None category: str
main_file: Path = field(init=False)
other_files: list[Path] = field(init=False)
title: str = field(init=False)
def __post_init__(self): @cached_property
self.main_file = self.determine_main_file() def main_file(self) -> Path | None:
self.other_files = self.determine_other_files() """Determines the main file in the given path.
self.title = self.determine_title()
@property If path is a file, it returns the path itself. If path is a directory, it
def is_code(self) -> bool: searches for Markdown files (*.md) in the directory and returns the first one
return self.main_file.suffix != ".md" found. If no Markdown files are found, it returns None."""
# Single file example
if self.path.is_file():
return self.path
# Multi file example with a README
if md_paths := list(self.path.glob("*.md")):
return md_paths[0]
# Multi file example without a README
return None
def determine_main_file(self) -> Path: @cached_property
""" def other_files(self) -> list[Path]:
Determines the main file in the given path. """Determine other files in the directory excluding the main file.
If the path is a file, it returns the path itself. Otherwise, it searches
for Markdown files (*.md) in the directory and returns the first one found.
Returns:
Path: The main file path, either the original path if it's a file or the first
Markdown file found in the directory.
Raises:
IndexError: If no Markdown files are found in the directory.
""" # noqa: E501
return self.path if self.path.is_file() else list(self.path.glob("*.md")).pop()
def determine_other_files(self) -> list[Path]: If path is a file, it returns an empty list. Otherwise, it returns every file
""" in the directory except the main file in a list."""
Determine other files in the directory excluding the main file. # Single file example
This method checks if the given path is a file. If it is, it returns an empty list.
Otherwise, it recursively searches through the directory and returns a list of all
files that are not the main file.
Returns:
list[Path]: A list of Path objects representing the other files in the directory.
""" # noqa: E501
if self.path.is_file(): if self.path.is_file():
return [] return []
# Multi file example
is_other_file = lambda file: file.is_file() and file != self.main_file is_other_file = lambda file: file.is_file() and file != self.main_file
return [file for file in self.path.rglob("*") if is_other_file(file)] return sorted(file for file in self.path.rglob("*") if is_other_file(file))
def determine_title(self) -> str: @cached_property
if not self.is_code: def is_code(self) -> bool:
# Specify encoding for building on Windows return self.main_file is not None and self.main_file.suffix != ".md"
with open(self.main_file, encoding="utf-8") as f:
first_line = f.readline().strip() @cached_property
match = re.match(r"^#\s+(?P<title>.+)$", first_line) def title(self) -> str:
if match: # Generate title from filename if no main md file found
return match.group("title") if self.main_file is None or self.is_code:
return fix_case(self.path.stem.replace("_", " ").title()) return title(self.path.stem)
# Specify encoding for building on Windows
with open(self.main_file, encoding="utf-8") as f:
first_line = f.readline().strip()
match = re.match(r"^#\s+(?P<title>.+)$", first_line)
if match:
return match.group("title")
raise ValueError(f"Title not found in {self.main_file}")
def fix_relative_links(self, content: str) -> str: def fix_relative_links(self, content: str) -> str:
""" """
@ -156,24 +156,35 @@ class Example:
# included files containing code fences too # included files containing code fences too
code_fence = "``````" code_fence = "``````"
if self.is_code: if self.main_file is not None:
content += ( # Single file example or multi file example with a README
f"{code_fence}{self.main_file.suffix[1:]}\n" if self.is_code:
f'--8<-- "{self.main_file}"\n' content += (
f"{code_fence}\n" f"{code_fence}{self.main_file.suffix[1:]}\n"
) f'--8<-- "{self.main_file}"\n'
f"{code_fence}\n"
)
else:
with open(self.main_file, encoding="utf-8") as f:
# Skip the title from md snippets as it's been included above
main_content = f.readlines()[1:]
content += self.fix_relative_links("".join(main_content))
content += "\n"
else: else:
with open(self.main_file) as f: # Multi file example without a README
# Skip the title from md snippets as it's been included above for file in self.other_files:
main_content = f.readlines()[1:] file_title = title(str(file.relative_to(self.path).with_suffix("")))
content += self.fix_relative_links("".join(main_content)) content += f"## {file_title}\n\n"
content += "\n" content += (
f'{code_fence}{file.suffix[1:]}\n--8<-- "{file}"\n{code_fence}\n\n'
)
return content
if not self.other_files: if not self.other_files:
return content return content
content += "## Example materials\n\n" content += "## Example materials\n\n"
for file in sorted(self.other_files): for file in self.other_files:
content += f'??? abstract "{file.relative_to(self.path)}"\n' content += f'??? abstract "{file.relative_to(self.path)}"\n'
if file.suffix != ".md": if file.suffix != ".md":
content += f" {code_fence}{file.suffix[1:]}\n" content += f" {code_fence}{file.suffix[1:]}\n"
@ -200,11 +211,13 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
glob_patterns = ["*.py", "*.md", "*.sh"] glob_patterns = ["*.py", "*.md", "*.sh"]
# Find categorised examples # Find categorised examples
for category in categories: for category in categories:
logger.info("Processing category: %s", category.stem)
globs = [category.glob(pattern) for pattern in glob_patterns] globs = [category.glob(pattern) for pattern in glob_patterns]
for path in itertools.chain(*globs): for path in itertools.chain(*globs):
examples.append(Example(path, category.stem)) examples.append(Example(path, category.stem))
# Find examples in subdirectories # Find examples in subdirectories
for path in category.glob("*/*.md"): globs = [category.glob(f"*/{pattern}") for pattern in glob_patterns]
for path in itertools.chain(*globs):
examples.append(Example(path.parent, category.stem)) examples.append(Example(path.parent, category.stem))
# Generate the example documentation # Generate the example documentation
@ -217,3 +230,4 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
with open(doc_path, "w+", encoding="utf-8") as f: with open(doc_path, "w+", encoding="utf-8") as f:
f.write(example.generate()) f.write(example.generate())
logger.debug("Example generated: %s", doc_path.relative_to(ROOT_DIR)) logger.debug("Example generated: %s", doc_path.relative_to(ROOT_DIR))
logger.info("Total examples generated: %d", len(examples))

View File

@ -1,25 +1,33 @@
# CPU - Intel® Xeon® # CPU - Intel® Xeon®
## Validated Hardware
| Hardware |
| ----------------------------------------- |
| [Intel® Xeon® 6 Processors](https://www.intel.com/content/www/us/en/products/details/processors/xeon.html) |
| [Intel® Xeon® 5 Processors](https://www.intel.com/content/www/us/en/products/docs/processors/xeon/5th-gen-xeon-scalable-processors.html) |
## Supported Models ## Supported Models
### Text-only Language Models ### Text-only Language Models
| Model | Architecture | Supported | | Model | Architecture | Supported |
|--------------------------------------|-------------------------------------------|-----------| |--------------------------------------|-------------------------------------------|-----------|
| meta-llama/Llama-3.1 / 3.3 | LlamaForCausalLM | ✅ | | meta-llama/Llama-3.1-8B-Instruct | LlamaForCausalLM | ✅ |
| meta-llama/Llama-4-Scout | Llama4ForConditionalGeneration | ✅ | | meta-llama/Llama-3.2-3B-Instruct | LlamaForCausalLM | ✅ |
| meta-llama/Llama-4-Maverick | Llama4ForConditionalGeneration | ✅ | | ibm-granite/granite-3.2-2b-instruct | GraniteForCausalLM | ✅ |
| ibm-granite/granite (Granite-MOE) | GraniteMoeForCausalLM | ✅ | | Qwen/Qwen3-1.7B | Qwen3ForCausalLM | ✅ |
| Qwen/Qwen3 | Qwen3ForCausalLM | ✅ | | Qwen/Qwen3-4B | Qwen3ForCausalLM | ✅ |
| zai-org/GLM-4.5 | GLMForCausalLM | ✅ | | Qwen/Qwen3-8B | Qwen3ForCausalLM | ✅ |
| google/gemma | GemmaForCausalLM | ✅ | | zai-org/glm-4-9b-hf | GLMForCausalLM | ✅ |
| google/gemma-7b | GemmaForCausalLM | ✅ |
### Multimodal Language Models ### Multimodal Language Models
| Model | Architecture | Supported | | Model | Architecture | Supported |
|--------------------------------------|-------------------------------------------|-----------| |--------------------------------------|-------------------------------------------|-----------|
| Qwen/Qwen2.5-VL | Qwen2VLForConditionalGeneration | ✅ | | Qwen/Qwen2.5-VL-7B-Instruct | Qwen2VLForConditionalGeneration | ✅ |
| openai/whisper | WhisperForConditionalGeneration | ✅ | | openai/whisper-large-v3 | WhisperForConditionalGeneration | ✅ |
✅ Runs and optimized. ✅ Runs and optimized.
🟨 Runs and correct but not optimized to green yet. 🟨 Runs and correct but not optimized to green yet.

View File

@ -0,0 +1,65 @@
# XPU - Intel® GPUs
## Validated Hardware
| Hardware |
| ----------------------------------------- |
| [Intel® Arc™ Pro B-Series Graphics](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/workstations/b-series/overview.html) |
## Supported Models
### Text-only Language Models
| Model | Architecture | FP16 | Dynamic FP8 | MXFP4 |
| ----------------------------------------- | ---------------------------------------------------- | ---- | ----------- | ----- |
| openai/gpt-oss-20b | GPTForCausalLM | | | ✅ |
| openai/gpt-oss-120b | GPTForCausalLM | | | ✅ |
| deepseek-ai/DeepSeek-R1-Distill-Llama-8B | LlamaForCausalLM | ✅ | ✅ | |
| deepseek-ai/DeepSeek-R1-Distill-Qwen-14B | QwenForCausalLM | ✅ | ✅ | |
| deepseek-ai/DeepSeek-R1-Distill-Qwen-32B | QwenForCausalLM | ✅ | ✅ | |
| deepseek-ai/DeepSeek-R1-Distill-Llama-70B | LlamaForCausalLM | ✅ | ✅ | |
| Qwen/Qwen2.5-72B-Instruct | Qwen2ForCausalLM | ✅ | ✅ | |
| Qwen/Qwen3-14B | Qwen3ForCausalLM | ✅ | ✅ | |
| Qwen/Qwen3-32B | Qwen3ForCausalLM | ✅ | ✅ | |
| Qwen/Qwen3-30B-A3B | Qwen3ForCausalLM | ✅ | ✅ | |
| Qwen/Qwen3-30B-A3B-GPTQ-Int4 | Qwen3ForCausalLM | ✅ | ✅ | |
| Qwen/Qwen3-coder-30B-A3B-Instruct | Qwen3ForCausalLM | ✅ | ✅ | |
| Qwen/QwQ-32B | QwenForCausalLM | ✅ | ✅ | |
| deepseek-ai/DeepSeek-V2-Lite | DeepSeekForCausalLM | ✅ | ✅ | |
| meta-llama/Llama-3.1-8B-Instruct | LlamaForCausalLM | ✅ | ✅ | |
| baichuan-inc/Baichuan2-13B-Chat | BaichuanForCausalLM | ✅ | ✅ | |
| THUDM/GLM-4-9B-chat | GLMForCausalLM | ✅ | ✅ | |
| THUDM/CodeGeex4-All-9B | CodeGeexForCausalLM | ✅ | ✅ | |
| chuhac/TeleChat2-35B | LlamaForCausalLM (TeleChat2 based on Llama arch) | ✅ | ✅ | |
| 01-ai/Yi1.5-34B-Chat | YiForCausalLM | ✅ | ✅ | |
| THUDM/CodeGeex4-All-9B | CodeGeexForCausalLM | ✅ | ✅ | |
| deepseek-ai/DeepSeek-Coder-33B-base | DeepSeekCoderForCausalLM | ✅ | ✅ | |
| baichuan-inc/Baichuan2-13B-Chat | BaichuanForCausalLM | ✅ | ✅ | |
| meta-llama/Llama-2-13b-chat-hf | LlamaForCausalLM | ✅ | ✅ | |
| THUDM/CodeGeex4-All-9B | CodeGeexForCausalLM | ✅ | ✅ | |
| Qwen/Qwen1.5-14B-Chat | QwenForCausalLM | ✅ | ✅ | |
| Qwen/Qwen1.5-32B-Chat | QwenForCausalLM | ✅ | ✅ | |
### Multimodal Language Models
| Model | Architecture | FP16 | Dynamic FP8 | MXFP4 |
| ---------------------------- | -------------------------------- | ---- | ----------- | ----- |
| OpenGVLab/InternVL3_5-8B | InternVLForConditionalGeneration | ✅ | ✅ | |
| OpenGVLab/InternVL3_5-14B | InternVLForConditionalGeneration | ✅ | ✅ | |
| OpenGVLab/InternVL3_5-38B | InternVLForConditionalGeneration | ✅ | ✅ | |
| Qwen/Qwen2-VL-7B-Instruct | Qwen2VLForConditionalGeneration | ✅ | ✅ | |
| Qwen/Qwen2.5-VL-72B-Instruct | Qwen2VLForConditionalGeneration | ✅ | ✅ | |
| Qwen/Qwen2.5-VL-32B-Instruct | Qwen2VLForConditionalGeneration | ✅ | ✅ | |
| THUDM/GLM-4v-9B | GLM4vForConditionalGeneration | ✅ | ✅ | |
| openbmb/MiniCPM-V-4 | MiniCPMVForConditionalGeneration | ✅ | ✅ | |
### Embedding and Reranker Language Models
| Model | Architecture | FP16 | Dynamic FP8 | MXFP4 |
| ----------------------- | ------------------------------ | ---- | ----------- | ----- |
| Qwen/Qwen3-Embedding-8B | Qwen3ForTextEmbedding | ✅ | ✅ | |
| Qwen/Qwen3-Reranker-8B | Qwen3ForSequenceClassification | ✅ | ✅ | |
✅ Runs and optimized.
🟨 Runs and correct but not optimized to green yet.
❌ Does not pass accuracy test or does not run.

View File

@ -1,15 +1,15 @@
# Pooling Models # Pooling Models
vLLM also supports pooling models, such as embedding, classification and reward models. vLLM also supports pooling models, such as embedding, classification, and reward models.
In vLLM, pooling models implement the [VllmModelForPooling][vllm.model_executor.models.VllmModelForPooling] interface. In vLLM, pooling models implement the [VllmModelForPooling][vllm.model_executor.models.VllmModelForPooling] interface.
These models use a [Pooler][vllm.model_executor.layers.pooler.Pooler] to extract the final hidden states of the input These models use a [Pooler][vllm.model_executor.layers.pooler.Pooler] to extract the final hidden states of the input
before returning them. before returning them.
!!! note !!! note
We currently support pooling models primarily as a matter of convenience. This is not guaranteed to have any performance improvement over using HF Transformers / Sentence Transformers directly. We currently support pooling models primarily for convenience. This is not guaranteed to provide any performance improvements over using Hugging Face Transformers or Sentence Transformers directly.
We are now planning to optimize pooling models in vLLM. Please comment on <https://github.com/vllm-project/vllm/issues/21796> if you have any suggestions! We plan to optimize pooling models in vLLM. Please comment on <https://github.com/vllm-project/vllm/issues/21796> if you have any suggestions!
## Configuration ## Configuration
@ -19,7 +19,7 @@ Run a model in pooling mode via the option `--runner pooling`.
!!! tip !!! tip
There is no need to set this option in the vast majority of cases as vLLM can automatically There is no need to set this option in the vast majority of cases as vLLM can automatically
detect the model runner to use via `--runner auto`. detect the appropriate model runner via `--runner auto`.
### Model Conversion ### Model Conversion
@ -78,7 +78,7 @@ When loading [Sentence Transformers](https://huggingface.co/sentence-transformer
its Sentence Transformers configuration file (`modules.json`) takes priority over the model's defaults. its Sentence Transformers configuration file (`modules.json`) takes priority over the model's defaults.
You can further customize this via the `--pooler-config` option, You can further customize this via the `--pooler-config` option,
which takes priority over both the model's and Sentence Transformers's defaults. which takes priority over both the model's and Sentence Transformers' defaults.
## Offline Inference ## Offline Inference
@ -168,11 +168,11 @@ The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM.
- For embeddings, use `LLM.embed(...)` or `pooling_task="embed"`. - For embeddings, use `LLM.embed(...)` or `pooling_task="embed"`.
- For classification logits, use `LLM.classify(...)` or `pooling_task="classify"`. - For classification logits, use `LLM.classify(...)` or `pooling_task="classify"`.
- For similarity scores, use `LLM.score(...)`. - For similarity scores, use `LLM.score(...)`.
- For rewards, use `LLM.reward(...)` or `pooling_task="token_classify"`. - For rewards, use `LLM.reward(...)` or `pooling_task="token_classify"`.
- For token classification, use `pooling_task="token_classify"`. - For token classification, use `pooling_task="token_classify"`.
- For multi-vector retrieval, use `pooling_task="token_embed"` - For multi-vector retrieval, use `pooling_task="token_embed"`.
- For IO Processor Plugins , use `pooling_task="plugin"` - For IO Processor Plugins, use `pooling_task="plugin"`.
```python ```python
from vllm import LLM from vllm import LLM
@ -194,15 +194,15 @@ Our [OpenAI-Compatible Server](../serving/openai_compatible_server.md) provides
- [Pooling API](../serving/openai_compatible_server.md#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models. - [Pooling API](../serving/openai_compatible_server.md#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models.
!!! note !!! note
Please use one of the more specific methods or set the task directly when using [Pooling API](../serving/openai_compatible_server.md#pooling-api) api.: Please use one of the more specific endpoints or set the task directly when using the [Pooling API](../serving/openai_compatible_server.md#pooling-api):
- For embeddings, use [Embeddings API](../serving/openai_compatible_server.md#embeddings-api) or `"task":"embed"`. - For embeddings, use [Embeddings API](../serving/openai_compatible_server.md#embeddings-api) or `"task":"embed"`.
- For classification logits, use [Classification API](../serving/openai_compatible_server.md#classification-api) or `task":"classify"`. - For classification logits, use [Classification API](../serving/openai_compatible_server.md#classification-api) or `"task":"classify"`.
- For similarity scores, use [Score API](../serving/openai_compatible_server.md#score-api). - For similarity scores, use [Score API](../serving/openai_compatible_server.md#score-api).
- For rewards, `task":"token_classify"`. - For rewards, use `"task":"token_classify"`.
- For token classification, use `task":"token_classify"`. - For token classification, use `"task":"token_classify"`.
- For multi-vector retrieval, use `task":"token_embed"` - For multi-vector retrieval, use `"task":"token_embed"`.
- For IO Processor Plugins , use `task":"plugin"` - For IO Processor Plugins, use `"task":"plugin"`.
```python ```python
# start a supported embeddings model server with `vllm serve`, e.g. # start a supported embeddings model server with `vllm serve`, e.g.
@ -232,7 +232,7 @@ for output in response.json()["data"]:
## Matryoshka Embeddings ## Matryoshka Embeddings
[Matryoshka Embeddings](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings) or [Matryoshka Representation Learning (MRL)](https://arxiv.org/abs/2205.13147) is a technique used in training embedding models. It allows user to trade off between performance and cost. [Matryoshka Embeddings](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings) or [Matryoshka Representation Learning (MRL)](https://arxiv.org/abs/2205.13147) is a technique used in training embedding models. It allows users to trade off between performance and cost.
!!! warning !!! warning
Not all embedding models are trained using Matryoshka Representation Learning. To avoid misuse of the `dimensions` parameter, vLLM returns an error for requests that attempt to change the output dimension of models that do not support Matryoshka Embeddings. Not all embedding models are trained using Matryoshka Representation Learning. To avoid misuse of the `dimensions` parameter, vLLM returns an error for requests that attempt to change the output dimension of models that do not support Matryoshka Embeddings.
@ -245,9 +245,9 @@ for output in response.json()["data"]:
### Manually enable Matryoshka Embeddings ### Manually enable Matryoshka Embeddings
There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, if `is_matryoshka` is `True` in `config.json,` it is allowed to change the output to arbitrary dimensions. Using `matryoshka_dimensions` can control the allowed output dimensions. There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, if `is_matryoshka` is `True` in `config.json`, you can change the output dimension to arbitrary values. Use `matryoshka_dimensions` to control the allowed output dimensions.
For models that support Matryoshka Embeddings but not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}`, `hf_overrides={"matryoshka_dimensions": [<allowed output dimensions>]}` (offline) or `--hf-overrides '{"is_matryoshka": true}'`, `--hf-overrides '{"matryoshka_dimensions": [<allowed output dimensions>]}'`(online). For models that support Matryoshka Embeddings but are not recognized by vLLM, manually override the config using `hf_overrides={"is_matryoshka": True}` or `hf_overrides={"matryoshka_dimensions": [<allowed output dimensions>]}` (offline), or `--hf-overrides '{"is_matryoshka": true}'` or `--hf-overrides '{"matryoshka_dimensions": [<allowed output dimensions>]}'` (online).
Here is an example to serve a model with Matryoshka Embeddings enabled. Here is an example to serve a model with Matryoshka Embeddings enabled.
@ -274,11 +274,11 @@ outputs = llm.embed(
print(outputs[0].outputs) print(outputs[0].outputs)
``` ```
A code example can be found here: [examples/offline_inference/pooling/embed_matryoshka_fy.py](../../examples/offline_inference/pooling/embed_matryoshka_fy.py) A code example can be found here: [examples/pooling/embed/embed_matryoshka_fy.py](../../examples/pooling/embed/embed_matryoshka_fy.py)
### Online Inference ### Online Inference
Use the following command to start vllm server. Use the following command to start the vLLM server.
```bash ```bash
vllm serve jinaai/jina-embeddings-v3 --trust-remote-code vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
@ -304,17 +304,17 @@ Expected output:
{"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}} {"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}}
``` ```
An OpenAI client example can be found here: [examples/online_serving/pooling/openai_embedding_matryoshka_fy.py](../../examples/online_serving/pooling/openai_embedding_matryoshka_fy.py) An OpenAI client example can be found here: [examples/pooling/embed/openai_embedding_matryoshka_fy.py](../../examples/pooling/embed/openai_embedding_matryoshka_fy.py)
## Deprecated Features ## Deprecated Features
### Encode task ### Encode task
We have split the `encode` task into two more specific token wise tasks: `token_embed` and `token_classify`: We have split the `encode` task into two more specific token-wise tasks: `token_embed` and `token_classify`:
- `token_embed` is the same as embed, using normalize as activation. - `token_embed` is the same as `embed`, using normalization as the activation.
- `token_classify` is the same as classify, default using softmax as activation. - `token_classify` is the same as `classify`, by default using softmax as the activation.
### Remove softmax from PoolingParams ### Remove softmax from PoolingParams
We are going to remove `softmax` and `activation` from `PoolingParams`. Instead, you should set `use_activation`, since we actually allow `classify` and `token_classify` to use any activation function. We are going to remove `softmax` and `activation` from `PoolingParams`. Instead, use `use_activation`, since we allow `classify` and `token_classify` to use any activation function.

View File

@ -417,7 +417,8 @@ th {
| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ | | `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ |
| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ | | `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ |
| `MiniMaxM2ForCausalLM` | MiniMax-M2 |`MiniMaxAI/MiniMax-M2`, etc. | | ✅︎ | | `MiniMaxM2ForCausalLM` | MiniMax-M2 |`MiniMaxAI/MiniMax-M2`, etc. | | ✅︎ |
| `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ | | `MistralForCausalLM` | Ministral-3, Mistral, Mistral-Instruct | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ |
| `MistralLarge3ForCausalLM` | Mistral-Large-3-675B-Base-2512, Mistral-Large-3-675B-Instruct-2512 | `mistralai/Mistral-Large-3-675B-Base-2512`, `mistralai/Mistral-Large-3-675B-Instruct-2512`, etc. | ✅︎ | ✅︎ |
| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ | | `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ |
| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ | | `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ |
| `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ | | `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ |
@ -479,6 +480,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|--------------|--------|-------------------|----------------------|---------------------------| |--------------|--------|-------------------|----------------------|---------------------------|
| `BertModel`<sup>C</sup> | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | | `BertModel`<sup>C</sup> | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | |
| `BertSpladeSparseEmbeddingModel` | SPLADE | `naver/splade-v3` | | |
| `Gemma2Model`<sup>C</sup> | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ | | `Gemma2Model`<sup>C</sup> | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ |
| `Gemma3TextModel`<sup>C</sup> | Gemma 3-based | `google/embeddinggemma-300m`, etc. | ✅︎ | ✅︎ | | `Gemma3TextModel`<sup>C</sup> | Gemma 3-based | `google/embeddinggemma-300m`, etc. | ✅︎ | ✅︎ |
| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | | `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ |
@ -566,7 +568,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
``` ```
!!! note !!! note
Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/offline_inference/pooling/qwen3_reranker.py](../../examples/offline_inference/pooling/qwen3_reranker.py). Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/pooling/score/qwen3_reranker.py](../../examples/pooling/score/qwen3_reranker.py).
```bash ```bash
vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
@ -604,7 +606,7 @@ These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode)
| `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | | | `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | |
!!! note !!! note
Named Entity Recognition (NER) usage, please refer to [examples/offline_inference/pooling/ner.py](../../examples/offline_inference/pooling/ner.py), [examples/online_serving/pooling/ner_client.py](../../examples/online_serving/pooling/ner_client.py). Named Entity Recognition (NER) usage, please refer to [examples/pooling/token_classify/ner.py](../../examples/pooling/token_classify/ner.py), [examples/pooling/token_classify/ner_client.py](../../examples/pooling/token_classify/ner_client.py).
## List of Multimodal Language Models ## List of Multimodal Language Models
@ -710,7 +712,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | | `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ |
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | | `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ |
| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ | | `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ |
| `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ | | `PixtralForConditionalGeneration` | Ministral 3 (Mistral format), Mistral 3 (Mistral format), Mistral Large 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Mistral-Large-3-675B-Instruct-2512`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ |
| `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | | `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ |
| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | | `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ |
| `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | | `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ |
@ -725,6 +727,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `Step3VLForConditionalGeneration` | Step3-VL | T + I<sup>+</sup> | `stepfun-ai/step3` | | ✅︎ | | `Step3VLForConditionalGeneration` | Step3-VL | T + I<sup>+</sup> | `stepfun-ai/step3` | | ✅︎ |
| `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-search/Tarsier-7b`, `omni-search/Tarsier-34b` | | ✅︎ | | `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-search/Tarsier-7b`, `omni-search/Tarsier-34b` | | ✅︎ |
| `Tarsier2ForConditionalGeneration`<sup>^</sup> | Tarsier2 | T + I<sup>E+</sup> + V<sup>E+</sup> | `omni-research/Tarsier2-Recap-7b`, `omni-research/Tarsier2-7b-0115` | | ✅︎ | | `Tarsier2ForConditionalGeneration`<sup>^</sup> | Tarsier2 | T + I<sup>E+</sup> + V<sup>E+</sup> | `omni-research/Tarsier2-Recap-7b`, `omni-research/Tarsier2-7b-0115` | | ✅︎ |
| `UltravoxModel` | Ultravox | T + A<sup>E+</sup> | `fixie-ai/ultravox-v0_5-llama-3_2-1b` | ✅︎ | ✅︎ |
Some models are supported only via the [Transformers modeling backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers modeling backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it! Some models are supported only via the [Transformers modeling backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers modeling backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!

View File

@ -49,7 +49,8 @@ We currently support the following OpenAI APIs:
- *Note: `suffix` parameter is not supported.* - *Note: `suffix` parameter is not supported.*
- [Chat Completions API](#chat-api) (`/v1/chat/completions`) - [Chat Completions API](#chat-api) (`/v1/chat/completions`)
- Only applicable to [text generation models](../models/generative_models.md) with a [chat template](../serving/openai_compatible_server.md#chat-template). - Only applicable to [text generation models](../models/generative_models.md) with a [chat template](../serving/openai_compatible_server.md#chat-template).
- *Note: `parallel_tool_calls` and `user` parameters are ignored.* - *Note: `user` parameter is ignored.*
- *Note:* Setting the `parallel_tool_calls` parameter to `false` ensures vLLM only returns zero or one tool call per request. Setting it to `true` (the default) allows returning more than one tool call per request. There is no guarantee more than one tool call will be returned if this is set to `true`, as that behavior is model dependent and not all models are designed to support parallel tool calls.
- [Embeddings API](#embeddings-api) (`/v1/embeddings`) - [Embeddings API](#embeddings-api) (`/v1/embeddings`)
- Only applicable to [embedding models](../models/pooling_models.md). - Only applicable to [embedding models](../models/pooling_models.md).
- [Transcriptions API](#transcriptions-api) (`/v1/audio/transcriptions`) - [Transcriptions API](#transcriptions-api) (`/v1/audio/transcriptions`)
@ -233,7 +234,7 @@ The following extra parameters are supported:
Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings); Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings);
you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it. you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
Code example: [examples/online_serving/pooling/openai_embedding_client.py](../../examples/online_serving/pooling/openai_embedding_client.py) Code example: [examples/pooling/embed/openai_embedding_client.py](../../examples/pooling/embed/openai_embedding_client.py)
If the model has a [chat template](../serving/openai_compatible_server.md#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](#chat-api)) If the model has a [chat template](../serving/openai_compatible_server.md#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](#chat-api))
which will be treated as a single prompt to the model. Here is a convenience function for calling the API while retaining OpenAI's type annotations: which will be treated as a single prompt to the model. Here is a convenience function for calling the API while retaining OpenAI's type annotations:
@ -334,7 +335,7 @@ and passing a list of `messages` in the request. Refer to the examples below for
`MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code
example below for details. example below for details.
Full example: [examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py](../../examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py) Full example: [examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py](../../examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py)
#### Extra parameters #### Extra parameters
@ -350,7 +351,7 @@ The following extra parameters are supported by default:
??? code ??? code
```python ```python
--8<-- "vllm/entrypoints/openai/protocol.py:embedding-extra-params" --8<-- "vllm/entrypoints/pooling/embed/protocol.py:embedding-extra-params"
``` ```
For chat-like input (i.e. if `messages` is passed), these extra parameters are supported instead: For chat-like input (i.e. if `messages` is passed), these extra parameters are supported instead:
@ -358,7 +359,7 @@ For chat-like input (i.e. if `messages` is passed), these extra parameters are s
??? code ??? code
```python ```python
--8<-- "vllm/entrypoints/openai/protocol.py:chat-embedding-extra-params" --8<-- "vllm/entrypoints/pooling/embed/protocol.py:chat-embedding-extra-params"
``` ```
### Transcriptions API ### Transcriptions API
@ -455,6 +456,7 @@ For `verbose_json` response format:
] ]
} }
``` ```
Currently, the `verbose_json` response format doesn't support `avg_logprob`, `compression_ratio`, or `no_speech_prob`.
#### Extra Parameters #### Extra Parameters
@ -514,7 +516,7 @@ Our Pooling API encodes input prompts using a [pooling model](../models/pooling_
The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats. The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats.
Code example: [examples/online_serving/pooling/openai_pooling_client.py](../../examples/online_serving/pooling/openai_pooling_client.py) Code example: [examples/pooling/pooling/openai_pooling_client.py](../../examples/pooling/pooling/openai_pooling_client.py)
### Classification API ### Classification API
@ -522,7 +524,7 @@ Our Classification API directly supports Hugging Face sequence-classification mo
We automatically wrap any other transformer via `as_seq_cls_model()`, which pools on the last token, attaches a `RowParallelLinear` head, and applies a softmax to produce per-class probabilities. We automatically wrap any other transformer via `as_seq_cls_model()`, which pools on the last token, attaches a `RowParallelLinear` head, and applies a softmax to produce per-class probabilities.
Code example: [examples/online_serving/pooling/openai_classification_client.py](../../examples/online_serving/pooling/openai_classification_client.py) Code example: [examples/pooling/classify/openai_classification_client.py](../../examples/pooling/classify/openai_classification_client.py)
#### Example Requests #### Example Requests
@ -628,7 +630,7 @@ The following [pooling parameters][vllm.PoolingParams] are supported.
The following extra parameters are supported: The following extra parameters are supported:
```python ```python
--8<-- "vllm/entrypoints/openai/protocol.py:classification-extra-params" --8<-- "vllm/entrypoints/pooling/classify/protocol.py:classification-extra-params"
``` ```
### Score API ### Score API
@ -638,7 +640,7 @@ Usually, the score for a sentence pair refers to the similarity between two sent
You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
Code example: [examples/online_serving/pooling/openai_cross_encoder_score.py](../../examples/online_serving/pooling/openai_cross_encoder_score.py) Code example: [examples/pooling/score/openai_cross_encoder_score.py](../../examples/pooling/score/openai_cross_encoder_score.py)
#### Single inference #### Single inference
@ -819,7 +821,7 @@ You can pass multi-modal inputs to scoring models by passing `content` including
print("Scoring output:", response_json["data"][0]["score"]) print("Scoring output:", response_json["data"][0]["score"])
print("Scoring output:", response_json["data"][1]["score"]) print("Scoring output:", response_json["data"][1]["score"])
``` ```
Full example: [examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py](../../examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py) Full example: [examples/pooling/score/openai_cross_encoder_score_for_multimodal.py](../../examples/pooling/score/openai_cross_encoder_score_for_multimodal.py)
#### Extra parameters #### Extra parameters
@ -833,7 +835,7 @@ The following [pooling parameters][vllm.PoolingParams] are supported.
The following extra parameters are supported: The following extra parameters are supported:
```python ```python
--8<-- "vllm/entrypoints/openai/protocol.py:score-extra-params" --8<-- "vllm/entrypoints/pooling/score/protocol.py:score-extra-params"
``` ```
### Re-rank API ### Re-rank API
@ -849,7 +851,7 @@ endpoints are compatible with both [Jina AI's re-rank API interface](https://jin
[Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with [Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with
popular open-source tools. popular open-source tools.
Code example: [examples/online_serving/pooling/jinaai_rerank_client.py](../../examples/online_serving/pooling/jinaai_rerank_client.py) Code example: [examples/pooling/score/jinaai_rerank_client.py](../../examples/pooling/score/jinaai_rerank_client.py)
#### Example Request #### Example Request
@ -914,7 +916,7 @@ The following [pooling parameters][vllm.PoolingParams] are supported.
The following extra parameters are supported: The following extra parameters are supported:
```python ```python
--8<-- "vllm/entrypoints/openai/protocol.py:rerank-extra-params" --8<-- "vllm/entrypoints/pooling/score/protocol.py:rerank-extra-params"
``` ```
## Ray Serve LLM ## Ray Serve LLM

15
examples/offline_inference/audio_language.py Normal file → Executable file
View File

@ -425,6 +425,13 @@ def parse_args():
default=None, default=None,
help="Set the seed when initializing `vllm.LLM`.", help="Set the seed when initializing `vllm.LLM`.",
) )
parser.add_argument(
"--tensor-parallel-size",
"-tp",
type=int,
default=None,
help="Tensor parallel size to override the model's default setting. ",
)
return parser.parse_args() return parser.parse_args()
@ -434,6 +441,12 @@ def main(args):
if model not in model_example_map: if model not in model_example_map:
raise ValueError(f"Model type {model} is not supported.") raise ValueError(f"Model type {model} is not supported.")
if args.tensor_parallel_size is not None and args.tensor_parallel_size < 1:
raise ValueError(
f"tensor_parallel_size must be a positive integer, "
f"got {args.tensor_parallel_size}"
)
audio_count = args.num_audios audio_count = args.num_audios
req_data = model_example_map[model]( req_data = model_example_map[model](
question_per_audio_count[audio_count], audio_count question_per_audio_count[audio_count], audio_count
@ -446,6 +459,8 @@ def main(args):
) )
engine_args = asdict(req_data.engine_args) | {"seed": args.seed} engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
if args.tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = args.tensor_parallel_size
llm = LLM(**engine_args) llm = LLM(**engine_args)
# We set temperature to 0.2 so that outputs can be different # We set temperature to 0.2 so that outputs can be different

View File

@ -0,0 +1,98 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file demonstrates preempt requests when using the `LLMEngine`
for processing prompts with various sampling parameters.
"""
import argparse
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.utils.argparse_utils import FlexibleArgumentParser
def create_test_prompts() -> list[tuple[str, SamplingParams]]:
    """Build the (prompt, SamplingParams) pairs exercised by this demo.

    The first two entries are deliberately identical long prompts so the
    engine has enough queued work to demonstrate request preemption.
    """
    long_prompt = "A robot may not injure a human being " * 50
    # Two identical greedy requests with per-token logprobs.
    prompts: list[tuple[str, SamplingParams]] = [
        (
            long_prompt,
            SamplingParams(
                temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=16
            ),
        )
        for _ in range(2)
    ]
    # A short sampled request with top-k and a presence penalty.
    prompts.append(
        (
            "To be or not to be,",
            SamplingParams(
                temperature=0.8, top_k=5, presence_penalty=0.2, max_tokens=128
            ),
        )
    )
    # A two-sequence (n=2) nucleus-sampling request with a frequency penalty.
    prompts.append(
        (
            "What is the meaning of life?",
            SamplingParams(
                n=2, temperature=0.8, top_p=0.95, frequency_penalty=0.1, max_tokens=128
            ),
        )
    )
    return prompts
def process_requests(engine: LLMEngine, test_prompts: list[tuple[str, SamplingParams]]):
    """Continuously feed prompts into the engine and drain its outputs.

    One prompt is submitted per step; at step 10 the prefix cache is reset
    (including running requests) to demonstrate preemption behavior.

    Args:
        engine: The LLMEngine to drive.
        test_prompts: (prompt, sampling_params) pairs; consumed from the front.
    """
    # Hoisted out of the loop: re-importing on every iteration was wasteful.
    import os

    request_id = 0
    print("-" * 50)
    step_id = 0
    while test_prompts or engine.has_unfinished_requests():
        print("-" * 50)
        print(f"Step {step_id} (pid={os.getpid()})")
        if test_prompts:
            # Submit at most one new request per engine step.
            prompt, sampling_params = test_prompts.pop(0)
            engine.add_request(str(request_id), prompt, sampling_params)
            request_id += 1
        if step_id == 10:
            # Force preemption of in-flight requests to showcase recovery.
            print(f"Resetting prefix cache at {step_id}")
            engine.reset_prefix_cache(reset_running_requests=True)
        request_outputs: list[RequestOutput] = engine.step()
        for request_output in request_outputs:
            if request_output.finished:
                print("-" * 50)
                print(request_output)
                print("-" * 50)
        step_id += 1
def initialize_engine(args: argparse.Namespace) -> LLMEngine:
    """Create an LLMEngine from parsed command-line arguments."""
    return LLMEngine.from_engine_args(EngineArgs.from_cli_args(args))
def parse_args():
    """Parse the engine's CLI arguments for this demo."""
    arg_parser = FlexibleArgumentParser(
        description="Demo on using the LLMEngine class directly"
    )
    # add_cli_args returns the same parser, so the call can be chained.
    return EngineArgs.add_cli_args(arg_parser).parse_args()
def main(args: argparse.Namespace):
    """Entry point: build the engine and run the demo prompts through it."""
    # Arguments are evaluated left to right: engine first, then prompts.
    process_requests(initialize_engine(args), create_test_prompts())
if __name__ == "__main__":
    # Parse CLI arguments and run the demo.
    main(parse_args())

View File

@ -46,7 +46,6 @@ def create_test_prompts(
logprobs=1, logprobs=1,
prompt_logprobs=1, prompt_logprobs=1,
max_tokens=128, max_tokens=128,
stop_token_ids=[32003],
), ),
LoRARequest("sql-lora", 1, lora_path), LoRARequest("sql-lora", 1, lora_path),
), ),
@ -57,7 +56,6 @@ def create_test_prompts(
logprobs=1, logprobs=1,
prompt_logprobs=1, prompt_logprobs=1,
max_tokens=128, max_tokens=128,
stop_token_ids=[32003],
), ),
LoRARequest("sql-lora2", 2, lora_path), LoRARequest("sql-lora2", 2, lora_path),
), ),
@ -98,7 +96,7 @@ def initialize_engine() -> LLMEngine:
# use the same rank, it is recommended to set this as low as possible. # use the same rank, it is recommended to set this as low as possible.
# max_cpu_loras: controls the size of the CPU LoRA cache. # max_cpu_loras: controls the size of the CPU LoRA cache.
engine_args = EngineArgs( engine_args = EngineArgs(
model="meta-llama/Llama-2-7b-hf", model="meta-llama/Llama-3.2-3B-Instruct",
enable_lora=True, enable_lora=True,
max_loras=1, max_loras=1,
max_lora_rank=8, max_lora_rank=8,
@ -111,7 +109,7 @@ def initialize_engine() -> LLMEngine:
def main(): def main():
"""Main function that sets up and runs the prompt processing.""" """Main function that sets up and runs the prompt processing."""
engine = initialize_engine() engine = initialize_engine()
lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") lora_path = snapshot_download(repo_id="jeeejeee/llama32-3b-text2sql-spider")
test_prompts = create_test_prompts(lora_path) test_prompts = create_test_prompts(lora_path)
process_requests(engine, test_prompts) process_requests(engine, test_prompts)

View File

@ -1,57 +0,0 @@
# Pooling models
## Convert llm model to seq cls
```bash
# for BAAI/bge-reranker-v2-gemma
# Caution: "Yes" and "yes" are two different tokens
python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name BAAI/bge-reranker-v2-gemma --classifier_from_tokens '["Yes"]' --method no_post_processing --path ./bge-reranker-v2-gemma-seq-cls
# for mxbai-rerank-v2
python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name mixedbread-ai/mxbai-rerank-base-v2 --classifier_from_tokens '["0", "1"]' --method from_2_way_softmax --path ./mxbai-rerank-base-v2-seq-cls
# for Qwen3-Reranker
python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name Qwen/Qwen3-Reranker-0.6B --classifier_from_tokens '["no", "yes"]' --method from_2_way_softmax --path ./Qwen3-Reranker-0.6B-seq-cls
```
## Embed jina_embeddings_v3 usage
Only text matching task is supported for now. See <https://github.com/vllm-project/vllm/pull/16120>
```bash
python examples/offline_inference/pooling/embed_jina_embeddings_v3.py
```
## Embed matryoshka dimensions usage
```bash
python examples/offline_inference/pooling/embed_matryoshka_fy.py
```
## Multi vector retrieval usage
```bash
python examples/offline_inference/pooling/multi_vector_retrieval.py
```
## Named Entity Recognition (NER) usage
```bash
python examples/offline_inference/pooling/ner.py
```
## Prithvi Geospatial MAE usage
```bash
python examples/offline_inference/pooling/prithvi_geospatial_mae.py
```
## IO Processor Plugins for Prithvi Geospatial MAE
```bash
python examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py
```
## Qwen3 reranker usage
```bash
python examples/offline_inference/pooling/qwen3_reranker.py
```

View File

@ -133,7 +133,7 @@ def main(args):
tensor_parallel_size=args.tp, tensor_parallel_size=args.tp,
enable_chunked_prefill=args.enable_chunked_prefill, enable_chunked_prefill=args.enable_chunked_prefill,
enforce_eager=args.enforce_eager, enforce_eager=args.enforce_eager,
gpu_memory_utilization=0.8, gpu_memory_utilization=0.9,
speculative_config=speculative_config, speculative_config=speculative_config,
disable_log_stats=False, disable_log_stats=False,
max_model_len=args.max_model_len, max_model_len=args.max_model_len,

20
examples/offline_inference/vision_language.py Normal file → Executable file
View File

@ -1801,7 +1801,10 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}, hf_overrides={
"architectures": ["Tarsier2ForConditionalGeneration"],
"model_type": "tarsier2",
},
limit_mm_per_prompt={modality: 1}, limit_mm_per_prompt={modality: 1},
) )
@ -2064,6 +2067,13 @@ def parse_args():
help="If True, will send all requests in a second batch with empty mm " help="If True, will send all requests in a second batch with empty mm "
"data to verify cache hits with UUIDs.", "data to verify cache hits with UUIDs.",
) )
parser.add_argument(
"--tensor-parallel-size",
"-tp",
type=int,
default=None,
help="Tensor parallel size to override the model's default setting. ",
)
return parser.parse_args() return parser.parse_args()
@ -2072,6 +2082,12 @@ def main(args):
if model not in model_example_map: if model not in model_example_map:
raise ValueError(f"Model type {model} is not supported.") raise ValueError(f"Model type {model} is not supported.")
if args.tensor_parallel_size is not None and args.tensor_parallel_size < 1:
raise ValueError(
f"tensor_parallel_size must be a positive integer, "
f"got {args.tensor_parallel_size}"
)
modality = args.modality modality = args.modality
mm_input = get_multi_modal_input(args) mm_input = get_multi_modal_input(args)
data = mm_input["data"] data = mm_input["data"]
@ -2089,6 +2105,8 @@ def main(args):
"seed": args.seed, "seed": args.seed,
"mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4, "mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4,
} }
if args.tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = args.tensor_parallel_size
llm = LLM(**engine_args) llm = LLM(**engine_args)
# Don't want to check the flag multiple times, so just hijack `prompts`. # Don't want to check the flag multiple times, so just hijack `prompts`.

View File

@ -1222,7 +1222,10 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
trust_remote_code=True, trust_remote_code=True,
max_model_len=32768, max_model_len=32768,
limit_mm_per_prompt={"image": len(image_urls)}, limit_mm_per_prompt={"image": len(image_urls)},
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}, hf_overrides={
"architectures": ["Tarsier2ForConditionalGeneration"],
"model_type": "tarsier2",
},
) )
prompt = ( prompt = (
@ -1352,10 +1355,18 @@ model_example_map = {
} }
def run_generate(model, question: str, image_urls: list[str], seed: int | None): def run_generate(
model,
question: str,
image_urls: list[str],
seed: int | None,
tensor_parallel_size: int | None,
):
req_data = model_example_map[model](question, image_urls) req_data = model_example_map[model](question, image_urls)
engine_args = asdict(req_data.engine_args) | {"seed": args.seed} engine_args = asdict(req_data.engine_args) | {"seed": seed}
if tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = tensor_parallel_size
llm = LLM(**engine_args) llm = LLM(**engine_args)
sampling_params = SamplingParams( sampling_params = SamplingParams(
@ -1378,7 +1389,13 @@ def run_generate(model, question: str, image_urls: list[str], seed: int | None):
print("-" * 50) print("-" * 50)
def run_chat(model: str, question: str, image_urls: list[str], seed: int | None): def run_chat(
model: str,
question: str,
image_urls: list[str],
seed: int | None,
tensor_parallel_size: int | None,
):
req_data = model_example_map[model](question, image_urls) req_data = model_example_map[model](question, image_urls)
# Disable other modalities to save memory # Disable other modalities to save memory
@ -1388,6 +1405,8 @@ def run_chat(model: str, question: str, image_urls: list[str], seed: int | None)
) )
engine_args = asdict(req_data.engine_args) | {"seed": seed} engine_args = asdict(req_data.engine_args) | {"seed": seed}
if tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = tensor_parallel_size
llm = LLM(**engine_args) llm = LLM(**engine_args)
sampling_params = ( sampling_params = (
@ -1463,6 +1482,13 @@ def parse_args():
default=2, default=2,
help="Number of images to use for the demo.", help="Number of images to use for the demo.",
) )
parser.add_argument(
"--tensor-parallel-size",
"-tp",
type=int,
default=None,
help="Tensor parallel size to override the model's default setting. ",
)
return parser.parse_args() return parser.parse_args()
@ -1470,13 +1496,20 @@ def main(args: Namespace):
model = args.model_type model = args.model_type
method = args.method method = args.method
seed = args.seed seed = args.seed
tensor_parallel_size = args.tensor_parallel_size
if tensor_parallel_size is not None and tensor_parallel_size < 1:
raise ValueError(
f"tensor_parallel_size must be a positive integer, "
f"got {tensor_parallel_size}"
)
image_urls = IMAGE_URLS[: args.num_images] image_urls = IMAGE_URLS[: args.num_images]
if method == "generate": if method == "generate":
run_generate(model, QUESTION, image_urls, seed) run_generate(model, QUESTION, image_urls, seed, tensor_parallel_size)
elif method == "chat": elif method == "chat":
run_chat(model, QUESTION, image_urls, seed) run_chat(model, QUESTION, image_urls, seed, tensor_parallel_size)
else: else:
raise ValueError(f"Invalid method: {method}") raise ValueError(f"Invalid method: {method}")

View File

@ -0,0 +1,184 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Example demonstrating MCP (Model Context Protocol) tools with the Responses API.
This example shows how to use MCP tools with different allowed_tools configurations:
1. No filter (allows all tools from the MCP server)
2. Wildcard "*" (explicitly allows all tools)
3. Specific tool names (filters to only those tools)
Set up this example by starting a vLLM OpenAI-compatible server with MCP tools enabled.
For example:
vllm serve openai/gpt-oss-20b --enforce-eager --tool-server demo
Environment variables:
- VLLM_ENABLE_RESPONSES_API_STORE=1
- VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS=code_interpreter,container
- VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS=1
"""
from openai import OpenAI
from utils import get_first_model
def example_no_filter():
    """Show that omitting ``allowed_tools`` exposes every tool on the server."""
    banner = "=" * 60
    print(banner)
    print("Example 1: No allowed_tools filter (allows all tools)")
    print(banner)

    client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="empty")
    model = get_first_model(client)

    # No allowed_tools specified - all tools are available
    mcp_tool = {
        "type": "mcp",
        "server_label": "code_interpreter",
        "server_url": "http://localhost:8888",
    }
    response = client.responses.create(
        model=model,
        input="Execute this code: print('Hello from Python!')",
        instructions="Use the Python tool to execute code.",
        tools=[mcp_tool],
    )

    print(f"Status: {response.status}")
    print(f"Output: {response.output_text}")
    print()
def example_wildcard():
    """Show that ``allowed_tools=['*']`` is an explicit opt-in to every tool."""
    banner = "=" * 60
    print(banner)
    print("Example 2: allowed_tools=['*'] (select all tools)")
    print(banner)

    client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="empty")
    model = get_first_model(client)

    # Using "*" to explicitly allow all tools from this MCP server
    # This is equivalent to not specifying allowed_tools
    mcp_tool = {
        "type": "mcp",
        "server_label": "code_interpreter",
        "server_url": "http://localhost:8888",
        "allowed_tools": ["*"],
    }
    response = client.responses.create(
        model=model,
        input="Execute this code: print('Hello from Python with wildcard!')",
        instructions="Use the Python tool to execute code.",
        tools=[mcp_tool],
    )

    print(f"Status: {response.status}")
    print(f"Output: {response.output_text}")
    print()
def example_specific_tools():
    """Show filtering an MCP server down to an explicit sub-tool list.

    Note: This example uses 'web_search_preview' (browser) which has multiple
    sub-tools: 'search', 'open', 'find'. The code_interpreter (python) doesn't
    have sub-tools, so filtering doesn't apply there.
    """
    banner = "=" * 60
    print(banner)
    print("Example 3: allowed_tools=['search'] (filter browser to specific tools)")
    print(banner)

    client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="empty")
    model = get_first_model(client)

    # Browser has tools: 'search', 'open', 'find'
    # Only allow 'search' - blocks 'open' and 'find'
    mcp_tool = {
        "type": "mcp",
        "server_label": "web_search_preview",
        "server_url": "http://localhost:8888",
        "allowed_tools": ["search"],
    }
    response = client.responses.create(
        model=model,
        input="Search for 'Python programming tutorials'",
        instructions="Use the browser tool to search.",
        tools=[mcp_tool],
    )

    print(f"Status: {response.status}")
    print(f"Output: {response.output_text}")
    print()
def example_object_format():
    """Show the object form of ``allowed_tools`` (``tool_names`` plus flags)."""
    banner = "=" * 60
    print(banner)
    print("Example 4: allowed_tools with object format")
    print(banner)

    client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="empty")
    model = get_first_model(client)

    # Object format with tool_names field
    # Can also include read_only and other fields
    # Browser has tools: 'search', 'open', 'find'
    allowed = {
        "tool_names": [
            "search",
            "open",
        ],  # Allow search and open, block find
        "read_only": False,
    }
    mcp_tool = {
        "type": "mcp",
        "server_label": "web_search_preview",
        "server_url": "http://localhost:8888",
        "allowed_tools": allowed,
    }
    response = client.responses.create(
        model=model,
        input="Search for 'machine learning' and open the first result",
        instructions="Use the browser tool.",
        tools=[mcp_tool],
    )

    print(f"Status: {response.status}")
    print(f"Output: {response.output_text}")
    print()
def main():
    """Run all examples."""
    banner = "=" * 60
    print("\n" + banner)
    print("MCP Tools with allowed_tools Examples")
    print(banner + "\n")

    # Each demo exercises a different allowed_tools configuration, in order.
    for demo in (
        example_no_filter,
        example_wildcard,
        example_specific_tools,
        example_object_format,
    ):
        demo()

    print(banner)
    print("Summary:")
    print(" - No filter or '*' → All tools available from server")
    print(" - Specific list → Only those sub-tools available")
    print(" - Object format → More control with tool_names field")
    print("")
    print("Note: allowed_tools filters SUB-TOOLS within an MCP server:")
    print(" - code_interpreter (python): No sub-tools to filter")
    print(" - web_search_preview (browser): Has 'search', 'open', 'find'")
    print(banner)
# Entry point: run all four examples when executed as a script
# (importing this module triggers no requests).
if __name__ == "__main__":
    main()

View File

@ -1,97 +0,0 @@
# Pooling models
## Cohere rerank usage
```bash
# vllm serve BAAI/bge-reranker-base
python examples/online_serving/pooling/cohere_rerank_client.py
```
## Embedding requests base64 encoding_format usage
```bash
# vllm serve intfloat/e5-small
python examples/online_serving/pooling/embedding_requests_base64_client.py
```
## Embedding requests bytes encoding_format usage
```bash
# vllm serve intfloat/e5-small
python examples/online_serving/pooling/embedding_requests_bytes_client.py
```
## Jinaai rerank usage
```bash
# vllm serve BAAI/bge-reranker-base
python examples/online_serving/pooling/jinaai_rerank_client.py
```
## Multi vector retrieval usage
```bash
# vllm serve BAAI/bge-m3
python examples/online_serving/pooling/multi_vector_retrieval_client.py
```
## Named Entity Recognition (NER) usage
```bash
# vllm serve boltuix/NeuroBERT-NER
python examples/online_serving/pooling/ner_client.py
```
## OpenAI chat embedding for multimodal usage
```bash
python examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py
```
## OpenAI classification usage
```bash
# vllm serve jason9693/Qwen2.5-1.5B-apeach
python examples/online_serving/pooling/openai_classification_client.py
```
## OpenAI cross_encoder score usage
```bash
# vllm serve BAAI/bge-reranker-v2-m3
python examples/online_serving/pooling/openai_cross_encoder_score.py
```
## OpenAI cross_encoder score for multimodal usage
```bash
# vllm serve jinaai/jina-reranker-m0
python examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py
```
## OpenAI embedding usage
```bash
# vllm serve intfloat/e5-small
python examples/online_serving/pooling/openai_embedding_client.py
```
## OpenAI embedding matryoshka dimensions usage
```bash
# vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
python examples/online_serving/pooling/openai_embedding_matryoshka_fy.py
```
## OpenAI pooling usage
```bash
# vllm serve internlm/internlm2-1_8b-reward --trust-remote-code
python examples/online_serving/pooling/openai_pooling_client.py
```
## Online Prithvi Geospatial MAE usage
```bash
python examples/online_serving/pooling/prithvi_geospatial_mae.py
```

Some files were not shown because too many files have changed in this diff Show More