Merge branch 'main' into rename_file_info_to_pkg/file

This commit is contained in:
Ning Xie 2025-12-03 10:00:17 +08:00 committed by GitHub
commit 658f6a6da5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
219 changed files with 4102 additions and 2992 deletions

View File

@ -108,6 +108,65 @@ The number of this test is less stable compared to the delay and latency benchma
WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
#### Default Parameters Field
We can specify default parameters in a JSON field with key `defaults`. Parameters defined in the field are applied globally to all serving tests, and can be overridden in test case fields. Here is an example:
<details>
<summary> An Example of default parameters field </summary>
```json
{
"defaults": {
"qps_list": [
"inf"
],
"server_environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
},
"server_parameters": {
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"block_size": 128,
"disable_log_stats": "",
"load_format": "dummy"
},
"client_parameters": {
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"num_prompts": 200,
"ignore-eos": ""
}
},
"tests": [
{
"test_name": "serving_llama3B_tp2_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 2,
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
}
},
{
"test_name": "serving_qwen3_tp4_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-14B",
"tensor_parallel_size": 4,
},
"client_parameters": {
"model": "Qwen/Qwen3-14B",
}
},
]
}
```
</details>
### Visualizing the results
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.

View File

@ -110,7 +110,8 @@ json2envs() {
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
timeout 1200 bash -c '
local timeout_val="1200"
timeout "$timeout_val" bash -c '
until curl -X POST localhost:8000/v1/completions; do
sleep 1
done' && return 0 || return 1
@ -316,12 +317,44 @@ run_throughput_tests() {
run_serving_tests() {
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
jq -c '
if type == "array" then
# Plain format: test cases array
.[]
elif (type == "object" and has("tests")) then
# merge the default parameters into each test cases
. as $root
| ($root.defaults // {}) as $d
| ($root.tests // [])[]
# default qps / max_concurrency from defaults if missing
| .qps_list = (.qps_list // $d.qps_list)
| .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list)
# merge envs / params: test overrides defaults
| .server_environment_variables =
(($d.server_environment_variables // {}) + (.server_environment_variables // {}))
| .server_parameters =
(($d.server_parameters // {}) + (.server_parameters // {}))
| .client_parameters =
(($d.client_parameters // {}) + (.client_parameters // {}))
else
error("Unsupported serving test file format: must be array or object with .tests")
end
' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^serving_ ]]; then
@ -335,20 +368,25 @@ run_serving_tests() {
continue
fi
# get client and server arguments
# get client and server arguments (after merged the default parameters)
server_params=$(echo "$params" | jq -r '.server_parameters')
server_envs=$(echo "$params" | jq -r '.server_environment_variables')
client_params=$(echo "$params" | jq -r '.client_parameters')
server_args=$(json2args "$server_params")
server_envs=$(json2envs "$server_envs")
client_args=$(json2args "$client_params")
# qps_list
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# max_concurrency_list (fallback to num_prompts if missing)
max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
max_concurrency_list="[$num_prompts]"
num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
max_concurrency_list="[$num_prompts]"
fi
max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
echo "Running over max concurrency list $max_concurrency_list"

View File

@ -1,610 +0,0 @@
[
{
"test_name": "serving_llama8B_bf16_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_bf16_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_bf16_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
}
]

View File

@ -1,276 +1,246 @@
[
{
"test_name": "serving_llama8B_tp1_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 32
}
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 32
}
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
{
"test_name": "serving_llama8B_tp1_random_128_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp2_random_128_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp2_random_128_2048",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 32
}
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"ignore-eos": "",
"num_prompts": 200
}
]
},
"tests": [
{
"test_name": "serving_llama8B_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp1_random_128_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_128_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_128_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_granite2B_tp1_random_128_128",
"server_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen1.7B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-1.7B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-1.7B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen4B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-4B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-4B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen8B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-8B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_glm9B_tp1_random_128_128",
"server_parameters": {
"model": "zai-org/glm-4-9b-hf",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "zai-org/glm-4-9b-hf",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_gemma7B_tp1_random_128_128",
"server_parameters": {
"model": "google/gemma-7b",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "google/gemma-7b",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
}
]
}

View File

@ -8,7 +8,7 @@ steps:
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
@ -30,19 +30,6 @@ steps:
DOCKER_BUILDKIT: "1"
# x86 + CUDA builds
- label: "Build wheel - CUDA 12.8"
depends_on: ~
id: build-wheel-cuda-12-8
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 12.9"
depends_on: ~
id: build-wheel-cuda-12-9
@ -109,7 +96,6 @@ steps:
- label: "Annotate release workflow"
depends_on:
- create-multi-arch-manifest
- build-wheel-cuda-12-8
id: annotate-release-workflow
agents:
queue: cpu_queue_postmerge

View File

@ -0,0 +1,369 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# do not complain about line length (for docstring)
# ruff: noqa: E501
import argparse
import json
import re
import sys
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any
from urllib.parse import quote
if not sys.version_info >= (3, 12):
raise RuntimeError("This script requires Python 3.12 or higher.")
INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
<html>
<meta name="pypi:repository-version" content="1.0">
<body>
{items}
</body>
</html>
"""
@dataclass
class WheelFileInfo:
package_name: str
version: str
build_tag: str | None
python_tag: str
abi_tag: str
platform_tag: str
variant: str | None
filename: str
def parse_from_filename(file: str) -> WheelFileInfo:
"""
Parse wheel file name to extract metadata.
The format of wheel names:
{package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl
All versions could contain a variant like '+cu129' or '.cpu' or `.rocm` (or not).
Example:
vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl
vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl
vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl
vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl
"""
wheel_file_re = re.compile(
r"^(?P<package_name>.+)-(?P<version>[^-]+?)(-(?P<build_tag>[^-]+))?-(?P<python_tag>[^-]+)-(?P<abi_tag>[^-]+)-(?P<platform_tag>[^-]+)\.whl$"
)
match = wheel_file_re.match(file)
if not match:
raise ValueError(f"Invalid wheel file name: {file}")
package_name = match.group("package_name")
version = match.group("version")
build_tag = match.group("build_tag")
python_tag = match.group("python_tag")
abi_tag = match.group("abi_tag")
platform_tag = match.group("platform_tag")
# extract variant from version
variant = None
if "dev" in version:
ver_after_dev = version.split("dev")[-1]
if "." in ver_after_dev:
variant = ver_after_dev.split(".")[-1]
version = version.removesuffix("." + variant)
else:
if "+" in version:
version, variant = version.split("+")
return WheelFileInfo(
package_name=package_name,
version=version,
build_tag=build_tag,
python_tag=python_tag,
abi_tag=abi_tag,
platform_tag=platform_tag,
variant=variant,
filename=file,
)
def generate_project_list(subdir_names: list[str]) -> str:
"""
Generate project list HTML content linking to each project & variant sub-directory.
"""
href_tags = []
for name in sorted(subdir_names):
name = name.strip("/").strip(".")
href_tags.append(f' <a href="{name}/">{name}/</a><br/>')
return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
def generate_package_index_and_metadata(
wheel_files: list[WheelFileInfo], wheel_base_dir: Path, index_base_dir: Path
) -> tuple[str, str]:
"""
Generate package index HTML content for a specific package, linking to actual wheel files.
"""
href_tags = []
metadata = []
for file in sorted(wheel_files, key=lambda x: x.filename):
relative_path = (
wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename
)
# handle with '+' in URL, and avoid double-encoding '/' and already-encoded '%2B'
# NOTE: this is AWS S3 specific behavior!
file_path_quoted = quote(relative_path.as_posix(), safe=":%/")
href_tags.append(f' <a href="{file_path_quoted}">{file.filename}</a><br/>')
file_meta = asdict(file)
file_meta["path"] = file_path_quoted
metadata.append(file_meta)
index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
metadata_str = json.dumps(metadata, indent=2)
return index_str, metadata_str
def generate_index_and_metadata(
whl_files: list[str],
wheel_base_dir: Path,
index_base_dir: Path,
default_variant: str | None = None,
alias_to_default: str | None = None,
):
"""
Generate index for all wheel files.
Args:
whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`).
wheel_base_dir (Path): Base directory for wheel files.
index_base_dir (Path): Base directory to store index files.
default_variant (str | None): The default variant name, if any.
alias_to_default (str | None): Alias variant name for the default variant, if any.
First, parse all wheel files to extract metadata.
We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
The index for the default variant (if any) is generated in the root index directory.
If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
is purely a copy of the corresponding variant index, with only the links adjusted.
Otherwise, all wheels without variant suffixes are treated as the default variant.
If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
as the default variant index, but the links are adjusted accordingly.
Index directory structure:
index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
index.html # project list, linking to "vllm/" and other packages, and all variant sub-directories
vllm/
index.html # package index, pointing to actual files in wheel_base_dir (relative path)
metadata.json # machine-readable metadata for all wheels in this package
cpu/ # cpu variant sub-directory
index.html
vllm/
index.html
metadata.json
cu129/ # cu129 is actually the alias to default variant
index.html
vllm/
index.html
metadata.json
cu130/ # cu130 variant sub-directory
index.html
vllm/
index.html
metadata.json
...
metadata.json stores a dump of all wheel files' metadata in a machine-readable format:
[
{
"package_name": "vllm",
"version": "0.10.2rc2",
"build_tag": null,
"python_tag": "cp38",
"abi_tag": "abi3",
"platform_tag": "manylinux2014_aarch64",
"variant": "cu129",
"filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl",
"path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl" # to be concatenated with the directory URL and URL-encoded
},
...
]
"""
parsed_files = [parse_from_filename(f) for f in whl_files]
if not parsed_files:
print("No wheel files found, skipping index generation.")
return
# Group by variant
variant_to_files: dict[str, list[WheelFileInfo]] = {}
for file in parsed_files:
variant = file.variant or "default"
if variant not in variant_to_files:
variant_to_files[variant] = []
variant_to_files[variant].append(file)
print(f"Found variants: {list(variant_to_files.keys())}")
# sanity check for default variant
if default_variant:
if "default" in variant_to_files:
raise ValueError(
"All wheel files must have variant suffixes when `default_variant` is specified."
)
if default_variant not in variant_to_files:
raise ValueError(
f"Default variant '{default_variant}' not found among wheel files."
)
if alias_to_default:
if "default" not in variant_to_files:
# e.g. only some wheels are uploaded to S3 currently
print(
"[WARN] Alias to default variant specified, but no default variant found."
)
elif alias_to_default in variant_to_files:
raise ValueError(
f"Alias variant name '{alias_to_default}' already exists among wheel files."
)
else:
variant_to_files[alias_to_default] = variant_to_files["default"].copy()
print(f"Alias variant '{alias_to_default}' created for default variant.")
# Generate index for each variant
subdir_names = set()
for variant, files in variant_to_files.items():
if variant == "default":
variant_dir = index_base_dir
else:
variant_dir = index_base_dir / variant
subdir_names.add(variant)
variant_dir.mkdir(parents=True, exist_ok=True)
# gather all package names in this variant
packages = set(f.package_name for f in files)
if variant == "default":
# these packages should also appear in the "project list"
# generate after all variants are processed
subdir_names = subdir_names.union(packages)
else:
# generate project list for this variant directly
project_list_str = generate_project_list(sorted(packages))
with open(variant_dir / "index.html", "w") as f:
f.write(project_list_str)
for package in packages:
# filter files belonging to this package only
package_files = [f for f in files if f.package_name == package]
package_dir = variant_dir / package
package_dir.mkdir(parents=True, exist_ok=True)
index_str, metadata_str = generate_package_index_and_metadata(
package_files, wheel_base_dir, package_dir
)
with open(package_dir / "index.html", "w") as f:
f.write(index_str)
with open(package_dir / "metadata.json", "w") as f:
f.write(metadata_str)
# Generate top-level project list index
project_list_str = generate_project_list(sorted(subdir_names))
with open(index_base_dir / "index.html", "w") as f:
f.write(project_list_str)
if __name__ == "__main__":
"""
Arguments:
--version <version> : version string for the current build (e.g., commit hash)
--current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
--output-dir <output_directory> : directory to store generated index files
--alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
"""
parser = argparse.ArgumentParser(
description="Process nightly build wheel files to generate indices."
)
parser.add_argument(
"--version",
type=str,
required=True,
help="Version string for the current build (e.g., commit hash)",
)
parser.add_argument(
"--current-objects",
type=str,
required=True,
help="Path to JSON file containing current S3 objects listing in this version directory",
)
parser.add_argument(
"--output-dir",
type=str,
required=True,
help="Directory to store generated index files",
)
parser.add_argument(
"--alias-to-default",
type=str,
default=None,
help="Alias variant name for the default variant",
)
args = parser.parse_args()
version = args.version
if "/" in version or "\\" in version:
raise ValueError("Version string must not contain slashes.")
current_objects_path = Path(args.current_objects)
output_dir = Path(args.output_dir)
if not output_dir.exists():
output_dir.mkdir(parents=True, exist_ok=True)
# Read current objects JSON
with open(current_objects_path) as f:
current_objects: dict[str, list[dict[str, Any]]] = json.load(f)
# current_objects looks like from list_objects_v2 S3 API:
"""
"Contents": [
{
"Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl",
"LastModified": "2025-11-28T14:00:32+00:00",
"ETag": "\"37a38339c7cdb61ca737021b968075df-52\"",
"ChecksumAlgorithm": [
"CRC64NVME"
],
"ChecksumType": "FULL_OBJECT",
"Size": 435649349,
"StorageClass": "STANDARD"
},
...
]
"""
# Extract wheel file keys
wheel_files = []
for item in current_objects.get("Contents", []):
key: str = item["Key"]
if key.endswith(".whl"):
wheel_files.append(key.split("/")[-1]) # only the filename is used
print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
# Generate index and metadata, assuming wheels and indices are stored as:
# s3://vllm-wheels/{version}/<wheel files>
# s3://vllm-wheels/<anything>/<index files>
wheel_base_dir = Path(output_dir).parent / version
index_base_dir = Path(output_dir)
generate_index_and_metadata(
whl_files=wheel_files,
wheel_base_dir=wheel_base_dir,
index_base_dir=index_base_dir,
default_variant=None,
alias_to_default=args.alias_to_default,
)
print(f"Successfully generated index and metadata in {output_dir}")

View File

@ -2,6 +2,28 @@
set -ex
# ======== part 0: setup ========
BUCKET="vllm-wheels"
INDICES_OUTPUT_DIR="indices"
DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py
PYTHON=${PYTHON_PROG:=python3} # try to read from env var, otherwise use python3
SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
# detect if python3.10+ is available
has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)")
if [[ "$has_new_python" -eq 0 ]]; then
# use new python from docker
docker pull python:3-slim
PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3"
fi
echo "Using python interpreter: $PYTHON"
echo "Python version: $($PYTHON --version)"
# ========= part 1: collect, rename & upload the wheel ==========
# Assume wheels are in artifacts/dist/*.whl
wheel_files=(artifacts/dist/*.whl)
@ -10,74 +32,69 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
exit 1
fi
# Get the single wheel file
wheel="${wheel_files[0]}"
# Detect architecture and rename 'linux' to appropriate manylinux version
arch=$(uname -m)
if [[ $arch == "x86_64" ]]; then
manylinux_version="manylinux1"
elif [[ $arch == "aarch64" ]]; then
manylinux_version="manylinux2014"
else
echo "Warning: Unknown architecture $arch, using manylinux1 as default"
manylinux_version="manylinux1"
fi
# current build image uses ubuntu 20.04, which corresponds to manylinux_2_31
# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
manylinux_version="manylinux_2_31"
# Rename 'linux' to the appropriate manylinux version in the wheel filename
if [[ "$wheel" != *"linux"* ]]; then
echo "Error: Wheel filename does not contain 'linux': $wheel"
exit 1
fi
new_wheel="${wheel/linux/$manylinux_version}"
mv -- "$wheel" "$new_wheel"
wheel="$new_wheel"
echo "Renamed wheel to: $wheel"
# Extract the version from the wheel
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
echo "Version: $version"
echo "Version in wheel: $version"
pure_version="${version%%+*}"
echo "Pure version (without variant): $pure_version"
normal_wheel="$wheel" # Save the original wheel filename
# copy wheel to its own bucket
aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
# If the version contains "dev", rename it to v1.0.0.dev for consistency
if [[ $version == *dev* ]]; then
suffix="${version##*.}"
if [[ $suffix == cu* ]]; then
new_version="1.0.0.dev+${suffix}"
else
new_version="1.0.0.dev"
fi
new_wheel="${wheel/$version/$new_version}"
# use cp to keep both files in the artifacts directory
cp -- "$wheel" "$new_wheel"
wheel="$new_wheel"
version="$new_version"
fi
# ========= part 2: generate and upload indices ==========
# generate indices for all existing wheels in the commit directory
# this script might be run multiple times if there are multiple variants being built
# so we need to guarantee there is little chance for "TOCTOU" issues
# i.e., one process is generating indices while another is uploading a new wheel
# so we need to ensure no time-consuming operations happen below
# Upload the wheel to S3
python3 .buildkite/generate_index.py --wheel "$normal_wheel"
# list all wheels in the commit directory
echo "Existing wheels on S3:"
aws s3 ls "$S3_COMMIT_PREFIX"
obj_json="objects.json"
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
mkdir -p "$INDICES_OUTPUT_DIR"
# generate index for this commit
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
if [[ $normal_wheel == *"cu129"* ]]; then
# only upload index.html for cu129 wheels (default wheels) as it
# is available on both x86 and arm64
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
# call script to generate indicies for all existing wheels
# this indices have relative paths that could work as long as it is next to the wheel directory in s3
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
else
echo "Skipping index files for non-cu129 wheels"
alias_arg=""
fi
# generate index for nightly
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg
if [[ $normal_wheel == *"cu129"* ]]; then
# only upload index.html for cu129 wheels (default wheels) as it
# is available on both x86 and arm64
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
else
echo "Skipping index files for non-cu129 wheels"
# copy indices to /<commit>/ unconditionally
echo "Uploading indices to $S3_COMMIT_PREFIX"
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
# copy to /nightly/ only if it is on the main branch and not a PR
if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
echo "Uploading indices to overwrite /nightly/"
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
fi
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
# copy to /<pure_version>/ only if it does not have "dev" in the version
if [[ "$version" != *"dev"* ]]; then
echo "Uploading indices to overwrite /$pure_version/"
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
fi

View File

@ -715,6 +715,7 @@ steps:
# we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0
- uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: LM Eval Small Models # 15min
@ -934,6 +935,18 @@ steps:
commands:
- pytest -v -s models/language/pooling_mteb_test
- label: Multi-Modal Processor Test (CPU)
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
source_file_dependencies:
- vllm/
- tests/models/multimodal
no_gpu: true
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
- label: Multi-Modal Processor Test # 44min
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
@ -1472,14 +1485,14 @@ steps:
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- pytest -v -s tests/compile/distributed/test_async_tp.py
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
#- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- pytest -v -s tests/distributed/test_sequence_parallel.py
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- HIP_VISIBLE_DEVICES=0,1 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test #####

View File

@ -215,7 +215,6 @@ steps:
timeout_in_minutes: 10
gpu: h100
num_gpus: 8
optional: true
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- examples/offline_inference/torchrun_dp_example.py
@ -391,20 +390,24 @@ steps:
- examples/
commands:
- pip install tensorizer # for tensorizer test
# for basic
- python3 offline_inference/basic/chat.py
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 offline_inference/basic/chat.py
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
# for multi-modal models
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
# for pooling models
- python3 pooling/pooling/vision_language_pooling.py --seed 0
# for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@ -1370,4 +1373,4 @@ steps:
num_gpus: 2
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1

4
.github/CODEOWNERS vendored
View File

@ -146,10 +146,10 @@ mkdocs.yaml @hmellor
/requirements/kv_connectors.txt @NickLucche
# Pooling models
/examples/*/pooling/ @noooop
/examples/pooling @noooop
/tests/models/*/pooling* @noooop
/tests/entrypoints/pooling @noooop
/vllm/entrypoints/pooling @aarnphm @chaunceyjiang @noooop
/vllm/entrypoints/pooling @noooop
/vllm/config/pooler.py @noooop
/vllm/pooling_params.py @noooop
/vllm/model_executor/layers/pooler.py @noooop

View File

@ -16,7 +16,7 @@ jobs:
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Set up Python
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with:
python-version: '3.12'

View File

@ -17,7 +17,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
- uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with:
python-version: "3.12"
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"

View File

@ -108,7 +108,10 @@ def benchmark_batched_propose(args):
device_config=DeviceConfig(device=current_platform.device_type),
parallel_config=ParallelConfig(),
load_config=LoadConfig(),
scheduler_config=SchedulerConfig(),
scheduler_config=SchedulerConfig(
max_model_len=model_config.max_model_len,
is_encoder_decoder=model_config.is_encoder_decoder,
),
)
# monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group

View File

@ -40,7 +40,7 @@ from vllm.engine.arg_utils import EngineArgs
from vllm.utils.argparse_utils import FlexibleArgumentParser
try:
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.tokenizers import get_tokenizer
except ImportError:
from backend_request_func import get_tokenizer

View File

@ -46,7 +46,7 @@ from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
try:
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.tokenizers import get_tokenizer
except ImportError:
from backend_request_func import get_tokenizer

View File

@ -93,16 +93,16 @@ torch::Tensor dynamic_4bit_int_moe_cpu(
}
auto Y_all = at::empty({offsets[E], H}, x_c.options());
at::parallel_for(0, E, 1, [&](int64_t e_begin, int64_t e_end) {
at::parallel_for(0, offsets[E], 0, [&](int64_t idx_begin, int64_t idx_end) {
c10::InferenceMode guard;
for (int64_t e = e_begin; e < e_end; ++e) {
const int64_t te = counts[e];
if (te == 0) {
for (int64_t e = 0; e < E; ++e) {
int64_t start = std::max(offsets[e], idx_begin);
int64_t end = std::min(offsets[e + 1], idx_end);
int64_t te = end - start;
if (te <= 0) {
continue;
}
const int64_t start = offsets[e];
auto x_e = X_all.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);
auto w13_e = w13_packed.select(/*dim=*/0, e);

View File

@ -364,7 +364,12 @@ RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && \
cuda-cudart-${CUDA_VERSION_DASH} \
cuda-nvrtc-${CUDA_VERSION_DASH} \
cuda-cuobjdump-${CUDA_VERSION_DASH} \
libcublas-${CUDA_VERSION_DASH} && \
# https://github.com/vllm-project/vllm/issues/29590
libcurand-dev-${CUDA_VERSION_DASH} \
libcublas-${CUDA_VERSION_DASH} \
# Fixes nccl_allocator requiring nccl.h at runtime
# https://github.com/vllm-project/vllm/blob/1336a1ea244fa8bfd7e72751cabbdb5b68a0c11a/vllm/distributed/device_communicators/pynccl_allocator.py#L22
libnccl-dev && \
rm -rf /var/lib/apt/lists/*
ARG PIP_INDEX_URL UV_INDEX_URL

View File

@ -5,11 +5,7 @@ nav:
- Getting Started:
- getting_started/quickstart.md
- getting_started/installation
- Examples:
- examples/README.md
- Offline Inference: examples/offline_inference
- Online Serving: examples/online_serving
- Others: examples/others
- Examples: examples
- General:
- usage/v1_guide.md
- usage/*

View File

@ -79,7 +79,7 @@ The `post_process*` methods take `PoolingRequestOutput` objects as input and gen
The `validate_or_generate_params` method is used for validating with the plugin any `SamplingParameters`/`PoolingParameters` received with the user request, or to generate new ones if none are specified. The function always returns the validated/generated parameters.
The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/pooling` serving endpoint is available here [vllm/entrypoints/openai/serving_pooling.py](../../vllm/entrypoints/pooling/pooling/serving.py).
An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). Please, also refer to our online ([examples/online_serving/pooling/prithvi_geospatial_mae.py](../../examples/online_serving/pooling/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py)) inference examples.
An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). Please, also refer to our online ([examples/pooling/plugin/prithvi_geospatial_mae_client.py](../../examples/pooling/plugin/prithvi_geospatial_mae_client.py)) and offline ([examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py](../../examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py)) inference examples.
## Using an IO Processor plugin

View File

@ -46,10 +46,23 @@ vLLM is a Python library that supports the following CPU variants. Select your C
### Pre-built wheels
Currently, there are no pre-built CPU wheels.
Please refer to the instructions for [pre-built wheels on GPU](./gpu.md#pre-built-wheels).
When specifying the index URL, please make sure to use the `cpu` variant subdirectory.
For example, the nightly build index is: `https://wheels.vllm.ai/nightly/cpu/`.
### Build wheel from source
#### Set up using Python-only build (without compilation) {#python-only-build}
Please refer to the instructions for [Python-only build on GPU](./gpu.md#python-only-build), and replace the build commands with:
```bash
VLLM_USE_PRECOMPILED=1 VLLM_PRECOMPILED_WHEEL_VARIANT=cpu VLLM_TARGET_DEVICE=cpu uv pip install --editable .
```
#### Full build (with compilation) {#full-build}
=== "Intel/AMD x86"
--8<-- "docs/getting_started/installation/cpu.x86.inc.md:build-wheel-from-source"
@ -125,6 +138,35 @@ vllm serve facebook/opt-125m --dtype=bfloat16
Note, it is recommended to manually reserve 1 CPU for vLLM front-end process when `world_size == 1`.
### What are supported models on CPU?
For the full and up-to-date list of models validated on CPU platforms, please see the official documentation: [Supported Models on CPU](https://docs.vllm.ai/en/latest/models/hardware_supported_models/cpu)
### How to find benchmark configuration examples for supported CPU models?
For any model listed under [Supported Models on CPU](https://docs.vllm.ai/en/latest/models/hardware_supported_models/cpu), optimized runtime configurations are provided in the vLLM Benchmark Suites CPU test cases, defined in [cpu test cases](https://github.com/vllm-project/vllm/blob/main/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json)
For details on how these optimized configurations are determined, see: [performance-benchmark-details](https://github.com/vllm-project/vllm/tree/main/.buildkite/performance-benchmarks#performance-benchmark-details).
To benchmark the supported models using these optimized settings, follow the steps in [running vLLM Benchmark Suite manually](https://docs.vllm.ai/en/latest/contributing/benchmarks/#manually-trigger-the-benchmark) and run the Benchmark Suite on a CPU environment.
Below is an example command to benchmark all CPU-supported models using optimized configurations.
```bash
ON_CPU=1 bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
```
The benchmark results will be saved in `./benchmark/results/`.
In the directory, the generated `.commands` files contain all example commands for the benchmark.
We recommend configuring tensor-parallel-size to match the number of NUMA nodes on your system. Note that the current release does not support tensor-parallel-size=6.
To determine the number of NUMA nodes available, use the following command:
```bash
lscpu | grep "NUMA node(s):" | awk '{print $3}'
```
For performance reference, users may also consult the [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm&deviceName=cpu)
, which publishes default-model CPU results produced using the same Benchmark Suite.
### How to decide `VLLM_CPU_OMP_THREADS_BIND`?
- Default `auto` thread-binding is recommended for most cases. Ideally, each OpenMP thread will be bound to a dedicated physical core respectively, threads of each rank will be bound to the same NUMA node respectively, and 1 CPU per rank will be reserved for other vLLM components when `world_size > 1`. If you have any performance problems or unexpected binding behaviours, please try to bind threads as following.

View File

@ -26,42 +26,49 @@ uv pip install vllm --torch-backend=auto
??? console "pip"
```bash
# Install vLLM with CUDA 12.8.
pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128
# Install vLLM with CUDA 12.9.
pip install vllm --extra-index-url https://download.pytorch.org/whl/cu129
```
We recommend leveraging `uv` to [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu126`), set `--torch-backend=cu126` (or `UV_TORCH_BACKEND=cu126`). If this doesn't work, try running `uv self update` to update `uv` first.
We recommend leveraging `uv` to [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu128`), set `--torch-backend=cu128` (or `UV_TORCH_BACKEND=cu128`). If this doesn't work, try running `uv self update` to update `uv` first.
!!! note
NVIDIA Blackwell GPUs (B200, GB200) require a minimum of CUDA 12.8, so make sure you are installing PyTorch wheels with at least that version. PyTorch itself offers a [dedicated interface](https://pytorch.org/get-started/locally/) to determine the appropriate pip command to run for a given target configuration.
As of now, vLLM's binaries are compiled with CUDA 12.8 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.6, 11.8, and public PyTorch release versions:
As of now, vLLM's binaries are compiled with CUDA 12.9 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.8, 13.0, and public PyTorch release versions:
```bash
# Install vLLM with a specific CUDA version (e.g., 11.8 or 12.6).
# Install vLLM with a specific CUDA version (e.g., 13.0).
export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
export CUDA_VERSION=118 # or 126
uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION}
export CUDA_VERSION=130 # or other
uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION}
```
#### Install the latest code
LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on an x86 platform with CUDA 12 for every commit since `v0.5.3`.
LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for every commit since `v0.5.3` on <https://wheels.vllm.ai/nightly>. There are multiple indices that could be used:
* `https://wheels.vllm.ai/nightly`: the default variant (CUDA with version specified in `VLLM_MAIN_CUDA_VERSION`) built with the last commit on the `main` branch. Currently it is CUDA 12.9.
* `https://wheels.vllm.ai/nightly/<variant>`: all other variants. Now this includes `cu130`, and `cpu`. The default variant (`cu129`) also has a subdirectory to keep consistency.
To install from nightly index, run:
```bash
uv pip install -U vllm \
--torch-backend=auto \
--extra-index-url https://wheels.vllm.ai/nightly
--extra-index-url https://wheels.vllm.ai/nightly # add variant subdirectory here if needed
```
??? console "pip"
```bash
pip install -U vllm \
--pre \
--extra-index-url https://wheels.vllm.ai/nightly
```
!!! warning "`pip` caveat"
`--pre` is required for `pip` to consider pre-released versions.
Using `pip` to install from nightly indices is _not supported_, because `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. In contrast, `uv` gives the extra index [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes).
If you insist on using `pip`, you have to specify the full URL of the wheel file (which can be obtained from the web page).
```bash
pip install -U https://wheels.vllm.ai/nightly/vllm-0.11.2.dev399%2Bg3c7461c18-cp38-abi3-manylinux_2_31_x86_64.whl # current nightly build (the filename will change!)
pip install -U https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-0.11.2.dev399%2Bg3c7461c18-cp38-abi3-manylinux_2_31_x86_64.whl # from specific commit
```
##### Install specific revisions
@ -71,33 +78,13 @@ If you want to access the wheels for previous commits (e.g. to bisect the behavi
export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch
uv pip install vllm \
--torch-backend=auto \
--extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}
--extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} # add variant subdirectory here if needed
```
The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
??? note "pip"
If you want to access the wheels for previous commits (e.g. to bisect the behavior change,
performance regression), due to the limitation of `pip`, you have to specify the full URL of the
wheel file by embedding the commit hash in the URL:
```bash
export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
```
Note that the wheels are built with Python 3.8 ABI (see [PEP
425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible
with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a
placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in
the wheel metadata (the wheels listed in the extra index url have correct versions). Although we
don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the
wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]
#### Set up using Python-only build (without compilation)
#### Set up using Python-only build (without compilation) {#python-only-build}
If you only need to change Python code, you can build and install vLLM without compilation. Using `uv pip`'s [`--editable` flag](https://docs.astral.sh/uv/pip/packages/#editable-packages), changes you make to the code will be reflected when you run vLLM:
@ -121,18 +108,24 @@ This command will do the following:
In case you see an error about wheel not found when running the above command, it might be because the commit you based on in the main branch was just merged and the wheel is being built. In this case, you can wait for around an hour to try again, or manually assign the previous commit in the installation using the `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable.
```bash
export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch
export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
export VLLM_PRECOMPILED_WHEEL_COMMIT=$(git rev-parse HEAD~1) # or earlier commit on main
export VLLM_USE_PRECOMPILED=1
uv pip install --editable .
```
There are more environment variables to control the behavior of Python-only build:
* `VLLM_PRECOMPILED_WHEEL_LOCATION`: specify the exact wheel URL or local file path of a pre-compiled wheel to use. All other logic to find the wheel will be skipped.
* `VLLM_PRECOMPILED_WHEEL_COMMIT`: override the commit hash to download the pre-compiled wheel. It can be `nightly` to use the last **already built** commit on the main branch.
* `VLLM_PRECOMPILED_WHEEL_VARIANT`: specify the variant subdirectory to use on the nightly index, e.g., `cu129`, `cpu`. If not specified, the CUDA variant with `VLLM_MAIN_CUDA_VERSION` will be tried, then fallback to the default variant on the remote index.
You can find more information about vLLM's wheels in [Install the latest code](#install-the-latest-code).
!!! note
There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [Install the latest code](#install-the-latest-code) for instructions on how to install a specified wheel.
#### Full build (with compilation)
#### Full build (with compilation) {#full-build}
If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes:

View File

@ -52,7 +52,7 @@ vLLM is a Python library that supports the following GPU variants. Select your G
--8<-- "docs/getting_started/installation/gpu.xpu.inc.md:set-up-using-python"
### Pre-built wheels
### Pre-built wheels {#pre-built-wheels}
=== "NVIDIA CUDA"

View File

@ -2,7 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
import logging
from dataclasses import dataclass, field
from dataclasses import dataclass
from functools import cached_property
from pathlib import Path
from typing import Literal
@ -16,13 +17,18 @@ EXAMPLE_DIR = ROOT_DIR / "examples"
EXAMPLE_DOC_DIR = ROOT_DIR / "docs/examples"
def fix_case(text: str) -> str:
def title(text: str) -> str:
# Default title case
text = text.replace("_", " ").replace("/", " - ").title()
# Custom substitutions
subs = {
"io": "IO",
"api": "API",
"cli": "CLI",
"cpu": "CPU",
"llm": "LLM",
"mae": "MAE",
"ner": "NER",
"tpu": "TPU",
"gguf": "GGUF",
"lora": "LoRA",
@ -48,71 +54,65 @@ class Example:
Attributes:
path (Path): The path to the main directory or file.
category (str): The category of the document.
main_file (Path): The main file in the directory.
other_files (list[Path]): list of other files in the directory.
title (str): The title of the document.
Properties::
main_file() -> Path | None: Determines the main file in the given path.
other_files() -> list[Path]: Determines other files in the directory excluding
the main file.
title() -> str: Determines the title of the document.
Methods:
__post_init__(): Initializes the main_file, other_files, and title attributes.
determine_main_file() -> Path: Determines the main file in the given path.
determine_other_files() -> list[Path]: Determines other files in the directory excluding the main file.
determine_title() -> str: Determines the title of the document.
generate() -> str: Generates the documentation content.
""" # noqa: E501
"""
path: Path
category: str = None
main_file: Path = field(init=False)
other_files: list[Path] = field(init=False)
title: str = field(init=False)
category: str
def __post_init__(self):
self.main_file = self.determine_main_file()
self.other_files = self.determine_other_files()
self.title = self.determine_title()
@cached_property
def main_file(self) -> Path | None:
"""Determines the main file in the given path.
@property
def is_code(self) -> bool:
return self.main_file.suffix != ".md"
If path is a file, it returns the path itself. If path is a directory, it
searches for Markdown files (*.md) in the directory and returns the first one
found. If no Markdown files are found, it returns None."""
# Single file example
if self.path.is_file():
return self.path
# Multi file example with a README
if md_paths := list(self.path.glob("*.md")):
return md_paths[0]
# Multi file example without a README
return None
def determine_main_file(self) -> Path:
"""
Determines the main file in the given path.
If the path is a file, it returns the path itself. Otherwise, it searches
for Markdown files (*.md) in the directory and returns the first one found.
Returns:
Path: The main file path, either the original path if it's a file or the first
Markdown file found in the directory.
Raises:
IndexError: If no Markdown files are found in the directory.
""" # noqa: E501
return self.path if self.path.is_file() else list(self.path.glob("*.md")).pop()
@cached_property
def other_files(self) -> list[Path]:
"""Determine other files in the directory excluding the main file.
def determine_other_files(self) -> list[Path]:
"""
Determine other files in the directory excluding the main file.
This method checks if the given path is a file. If it is, it returns an empty list.
Otherwise, it recursively searches through the directory and returns a list of all
files that are not the main file.
Returns:
list[Path]: A list of Path objects representing the other files in the directory.
""" # noqa: E501
If path is a file, it returns an empty list. Otherwise, it returns every file
in the directory except the main file in a list."""
# Single file example
if self.path.is_file():
return []
# Multi file example
is_other_file = lambda file: file.is_file() and file != self.main_file
return [file for file in self.path.rglob("*") if is_other_file(file)]
return sorted(file for file in self.path.rglob("*") if is_other_file(file))
def determine_title(self) -> str:
if not self.is_code:
# Specify encoding for building on Windows
with open(self.main_file, encoding="utf-8") as f:
first_line = f.readline().strip()
match = re.match(r"^#\s+(?P<title>.+)$", first_line)
if match:
return match.group("title")
return fix_case(self.path.stem.replace("_", " ").title())
@cached_property
def is_code(self) -> bool:
return self.main_file is not None and self.main_file.suffix != ".md"
@cached_property
def title(self) -> str:
# Generate title from filename if no main md file found
if self.main_file is None or self.is_code:
return title(self.path.stem)
# Specify encoding for building on Windows
with open(self.main_file, encoding="utf-8") as f:
first_line = f.readline().strip()
match = re.match(r"^#\s+(?P<title>.+)$", first_line)
if match:
return match.group("title")
raise ValueError(f"Title not found in {self.main_file}")
def fix_relative_links(self, content: str) -> str:
"""
@ -156,24 +156,35 @@ class Example:
# included files containing code fences too
code_fence = "``````"
if self.is_code:
content += (
f"{code_fence}{self.main_file.suffix[1:]}\n"
f'--8<-- "{self.main_file}"\n'
f"{code_fence}\n"
)
if self.main_file is not None:
# Single file example or multi file example with a README
if self.is_code:
content += (
f"{code_fence}{self.main_file.suffix[1:]}\n"
f'--8<-- "{self.main_file}"\n'
f"{code_fence}\n"
)
else:
with open(self.main_file, encoding="utf-8") as f:
# Skip the title from md snippets as it's been included above
main_content = f.readlines()[1:]
content += self.fix_relative_links("".join(main_content))
content += "\n"
else:
with open(self.main_file) as f:
# Skip the title from md snippets as it's been included above
main_content = f.readlines()[1:]
content += self.fix_relative_links("".join(main_content))
content += "\n"
# Multi file example without a README
for file in self.other_files:
file_title = title(str(file.relative_to(self.path).with_suffix("")))
content += f"## {file_title}\n\n"
content += (
f'{code_fence}{file.suffix[1:]}\n--8<-- "{file}"\n{code_fence}\n\n'
)
return content
if not self.other_files:
return content
content += "## Example materials\n\n"
for file in sorted(self.other_files):
for file in self.other_files:
content += f'??? abstract "{file.relative_to(self.path)}"\n'
if file.suffix != ".md":
content += f" {code_fence}{file.suffix[1:]}\n"
@ -200,11 +211,13 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
glob_patterns = ["*.py", "*.md", "*.sh"]
# Find categorised examples
for category in categories:
logger.info("Processing category: %s", category.stem)
globs = [category.glob(pattern) for pattern in glob_patterns]
for path in itertools.chain(*globs):
examples.append(Example(path, category.stem))
# Find examples in subdirectories
for path in category.glob("*/*.md"):
globs = [category.glob(f"*/{pattern}") for pattern in glob_patterns]
for path in itertools.chain(*globs):
examples.append(Example(path.parent, category.stem))
# Generate the example documentation
@ -217,3 +230,4 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
with open(doc_path, "w+", encoding="utf-8") as f:
f.write(example.generate())
logger.debug("Example generated: %s", doc_path.relative_to(ROOT_DIR))
logger.info("Total examples generated: %d", len(examples))

View File

@ -274,7 +274,7 @@ outputs = llm.embed(
print(outputs[0].outputs)
```
A code example can be found here: [examples/offline_inference/pooling/embed_matryoshka_fy.py](../../examples/offline_inference/pooling/embed_matryoshka_fy.py)
A code example can be found here: [examples/pooling/embed/embed_matryoshka_fy.py](../../examples/pooling/embed/embed_matryoshka_fy.py)
### Online Inference
@ -304,7 +304,7 @@ Expected output:
{"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}}
```
An OpenAI client example can be found here: [examples/online_serving/pooling/openai_embedding_matryoshka_fy.py](../../examples/online_serving/pooling/openai_embedding_matryoshka_fy.py)
An OpenAI client example can be found here: [examples/pooling/embed/openai_embedding_matryoshka_fy.py](../../examples/pooling/embed/openai_embedding_matryoshka_fy.py)
## Deprecated Features

View File

@ -417,7 +417,8 @@ th {
| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ |
| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ |
| `MiniMaxM2ForCausalLM` | MiniMax-M2 |`MiniMaxAI/MiniMax-M2`, etc. | | ✅︎ |
| `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ |
| `MistralForCausalLM` | Ministral-3, Mistral, Mistral-Instruct | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ |
| `MistralLarge3ForCausalLM` | Mistral-Large-3-675B-Base-2512, Mistral-Large-3-675B-Instruct-2512 | `mistralai/Mistral-Large-3-675B-Base-2512`, `mistralai/Mistral-Large-3-675B-Instruct-2512`, etc. | ✅︎ | ✅︎ |
| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ |
| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ |
| `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ |
@ -567,7 +568,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
```
!!! note
Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/offline_inference/pooling/qwen3_reranker.py](../../examples/offline_inference/pooling/qwen3_reranker.py).
Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/pooling/score/qwen3_reranker.py](../../examples/pooling/score/qwen3_reranker.py).
```bash
vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
@ -605,7 +606,7 @@ These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode)
| `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | |
!!! note
Named Entity Recognition (NER) usage, please refer to [examples/offline_inference/pooling/ner.py](../../examples/offline_inference/pooling/ner.py), [examples/online_serving/pooling/ner_client.py](../../examples/online_serving/pooling/ner_client.py).
Named Entity Recognition (NER) usage, please refer to [examples/pooling/token_classify/ner.py](../../examples/pooling/token_classify/ner.py), [examples/pooling/token_classify/ner_client.py](../../examples/pooling/token_classify/ner_client.py).
## List of Multimodal Language Models
@ -711,7 +712,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ |
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ |
| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ |
| `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ |
| `PixtralForConditionalGeneration` | Ministral 3 (Mistral format), Mistral 3 (Mistral format), Mistral Large 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Mistral-Large-3-675B-Instruct-2512` `mistralai/Pixtral-12B-2409` etc. | | ✅︎ |
| `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ |
| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ |
| `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ |

View File

@ -234,7 +234,7 @@ The following extra parameters are supported:
Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings);
you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
Code example: [examples/online_serving/pooling/openai_embedding_client.py](../../examples/online_serving/pooling/openai_embedding_client.py)
Code example: [examples/pooling/embed/openai_embedding_client.py](../../examples/pooling/embed/openai_embedding_client.py)
If the model has a [chat template](../serving/openai_compatible_server.md#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](#chat-api))
which will be treated as a single prompt to the model. Here is a convenience function for calling the API while retaining OpenAI's type annotations:
@ -335,7 +335,7 @@ and passing a list of `messages` in the request. Refer to the examples below for
`MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code
example below for details.
Full example: [examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py](../../examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py)
Full example: [examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py](../../examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py)
#### Extra parameters
@ -516,7 +516,7 @@ Our Pooling API encodes input prompts using a [pooling model](../models/pooling_
The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats.
Code example: [examples/online_serving/pooling/openai_pooling_client.py](../../examples/online_serving/pooling/openai_pooling_client.py)
Code example: [examples/pooling/pooling/openai_pooling_client.py](../../examples/pooling/pooling/openai_pooling_client.py)
### Classification API
@ -524,7 +524,7 @@ Our Classification API directly supports Hugging Face sequence-classification mo
We automatically wrap any other transformer via `as_seq_cls_model()`, which pools on the last token, attaches a `RowParallelLinear` head, and applies a softmax to produce per-class probabilities.
Code example: [examples/online_serving/pooling/openai_classification_client.py](../../examples/online_serving/pooling/openai_classification_client.py)
Code example: [examples/pooling/classify/openai_classification_client.py](../../examples/pooling/classify/openai_classification_client.py)
#### Example Requests
@ -640,7 +640,7 @@ Usually, the score for a sentence pair refers to the similarity between two sent
You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
Code example: [examples/online_serving/pooling/openai_cross_encoder_score.py](../../examples/online_serving/pooling/openai_cross_encoder_score.py)
Code example: [examples/pooling/score/openai_cross_encoder_score.py](../../examples/pooling/score/openai_cross_encoder_score.py)
#### Single inference
@ -821,7 +821,7 @@ You can pass multi-modal inputs to scoring models by passing `content` including
print("Scoring output:", response_json["data"][0]["score"])
print("Scoring output:", response_json["data"][1]["score"])
```
Full example: [examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py](../../examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py)
Full example: [examples/pooling/score/openai_cross_encoder_score_for_multimodal.py](../../examples/pooling/score/openai_cross_encoder_score_for_multimodal.py)
#### Extra parameters
@ -851,7 +851,7 @@ endpoints are compatible with both [Jina AI's re-rank API interface](https://jin
[Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with
popular open-source tools.
Code example: [examples/online_serving/pooling/jinaai_rerank_client.py](../../examples/online_serving/pooling/jinaai_rerank_client.py)
Code example: [examples/pooling/score/jinaai_rerank_client.py](../../examples/pooling/score/jinaai_rerank_client.py)
#### Example Request

View File

@ -0,0 +1,98 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file demonstrates preempt requests when using the `LLMEngine`
for processing prompts with various sampling parameters.
"""
import argparse
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.utils.argparse_utils import FlexibleArgumentParser
def create_test_prompts() -> list[tuple[str, SamplingParams]]:
"""Create a list of test prompts with their sampling parameters."""
return [
(
"A robot may not injure a human being " * 50,
SamplingParams(
temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=16
),
),
(
"A robot may not injure a human being " * 50,
SamplingParams(
temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=16
),
),
(
"To be or not to be,",
SamplingParams(
temperature=0.8, top_k=5, presence_penalty=0.2, max_tokens=128
),
),
(
"What is the meaning of life?",
SamplingParams(
n=2, temperature=0.8, top_p=0.95, frequency_penalty=0.1, max_tokens=128
),
),
]
def process_requests(engine: LLMEngine, test_prompts: list[tuple[str, SamplingParams]]):
"""Continuously process a list of prompts and handle the outputs."""
request_id = 0
print("-" * 50)
step_id = 0
while test_prompts or engine.has_unfinished_requests():
print("-" * 50)
import os
print(f"Step {step_id} (pid={os.getpid()})")
if test_prompts:
prompt, sampling_params = test_prompts.pop(0)
engine.add_request(str(request_id), prompt, sampling_params)
request_id += 1
if step_id == 10:
print(f"Resetting prefix cache at {step_id}")
engine.reset_prefix_cache(reset_running_requests=True)
request_outputs: list[RequestOutput] = engine.step()
for request_output in request_outputs:
if request_output.finished:
print("-" * 50)
print(request_output)
print("-" * 50)
step_id += 1
def initialize_engine(args: argparse.Namespace) -> LLMEngine:
"""Initialize the LLMEngine from the command line arguments."""
engine_args = EngineArgs.from_cli_args(args)
return LLMEngine.from_engine_args(engine_args)
def parse_args():
parser = FlexibleArgumentParser(
description="Demo on using the LLMEngine class directly"
)
parser = EngineArgs.add_cli_args(parser)
return parser.parse_args()
def main(args: argparse.Namespace):
"""Main function that sets up and runs the prompt processing."""
engine = initialize_engine(args)
test_prompts = create_test_prompts()
process_requests(engine, test_prompts)
if __name__ == "__main__":
args = parse_args()
main(args)

View File

@ -1,57 +0,0 @@
# Pooling models
## Convert llm model to seq cls
```bash
# for BAAI/bge-reranker-v2-gemma
# Caution: "Yes" and "yes" are two different tokens
python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name BAAI/bge-reranker-v2-gemma --classifier_from_tokens '["Yes"]' --method no_post_processing --path ./bge-reranker-v2-gemma-seq-cls
# for mxbai-rerank-v2
python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name mixedbread-ai/mxbai-rerank-base-v2 --classifier_from_tokens '["0", "1"]' --method from_2_way_softmax --path ./mxbai-rerank-base-v2-seq-cls
# for Qwen3-Reranker
python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name Qwen/Qwen3-Reranker-0.6B --classifier_from_tokens '["no", "yes"]' --method from_2_way_softmax --path ./Qwen3-Reranker-0.6B-seq-cls
```
## Embed jina_embeddings_v3 usage
Only text matching task is supported for now. See <https://github.com/vllm-project/vllm/pull/16120>
```bash
python examples/offline_inference/pooling/embed_jina_embeddings_v3.py
```
## Embed matryoshka dimensions usage
```bash
python examples/offline_inference/pooling/embed_matryoshka_fy.py
```
## Multi vector retrieval usage
```bash
python examples/offline_inference/pooling/multi_vector_retrieval.py
```
## Named Entity Recognition (NER) usage
```bash
python examples/offline_inference/pooling/ner.py
```
## Prithvi Geospatial MAE usage
```bash
python examples/offline_inference/pooling/prithvi_geospatial_mae.py
```
## IO Processor Plugins for Prithvi Geospatial MAE
```bash
python examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py
```
## Qwen3 reranker usage
```bash
python examples/offline_inference/pooling/qwen3_reranker.py
```

View File

@ -1801,7 +1801,10 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
hf_overrides={
"architectures": ["Tarsier2ForConditionalGeneration"],
"model_type": "tarsier2",
},
limit_mm_per_prompt={modality: 1},
)

View File

@ -1222,7 +1222,10 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
trust_remote_code=True,
max_model_len=32768,
limit_mm_per_prompt={"image": len(image_urls)},
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
hf_overrides={
"architectures": ["Tarsier2ForConditionalGeneration"],
"model_type": "tarsier2",
},
)
prompt = (

View File

@ -1,97 +0,0 @@
# Pooling models
## Cohere rerank usage
```bash
# vllm serve BAAI/bge-reranker-base
python examples/online_serving/pooling/cohere_rerank_client.py
```
## Embedding requests base64 encoding_format usage
```bash
# vllm serve intfloat/e5-small
python examples/online_serving/pooling/embedding_requests_base64_client.py
```
## Embedding requests bytes encoding_format usage
```bash
# vllm serve intfloat/e5-small
python examples/online_serving/pooling/embedding_requests_bytes_client.py
```
## Jinaai rerank usage
```bash
# vllm serve BAAI/bge-reranker-base
python examples/online_serving/pooling/jinaai_rerank_client.py
```
## Multi vector retrieval usage
```bash
# vllm serve BAAI/bge-m3
python examples/online_serving/pooling/multi_vector_retrieval_client.py
```
## Named Entity Recognition (NER) usage
```bash
# vllm serve boltuix/NeuroBERT-NER
python examples/online_serving/pooling/ner_client.py
```
## OpenAI chat embedding for multimodal usage
```bash
python examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py
```
## OpenAI classification usage
```bash
# vllm serve jason9693/Qwen2.5-1.5B-apeach
python examples/online_serving/pooling/openai_classification_client.py
```
## OpenAI cross_encoder score usage
```bash
# vllm serve BAAI/bge-reranker-v2-m3
python examples/online_serving/pooling/openai_cross_encoder_score.py
```
## OpenAI cross_encoder score for multimodal usage
```bash
# vllm serve jinaai/jina-reranker-m0
python examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py
```
## OpenAI embedding usage
```bash
# vllm serve intfloat/e5-small
python examples/online_serving/pooling/openai_embedding_client.py
```
## OpenAI embedding matryoshka dimensions usage
```bash
# vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
python examples/online_serving/pooling/openai_embedding_matryoshka_fy.py
```
## OpenAI pooling usage
```bash
# vllm serve internlm/internlm2-1_8b-reward --trust-remote-code
python examples/online_serving/pooling/openai_pooling_client.py
```
## Online Prithvi Geospatial MAE usage
```bash
python examples/online_serving/pooling/prithvi_geospatial_mae.py
```

148
setup.py
View File

@ -311,7 +311,7 @@ class precompiled_build_ext(build_ext):
"""Disables extension building when using precompiled binaries."""
def run(self) -> None:
assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
return
def build_extensions(self) -> None:
print("Skipping build_ext: using precompiled extensions.")
@ -322,14 +322,121 @@ class precompiled_wheel_utils:
"""Extracts libraries and other files from an existing wheel."""
@staticmethod
def extract_precompiled_and_patch_package(wheel_url_or_path: str) -> dict:
def fetch_metadata_for_variant(
commit: str, variant: str | None
) -> tuple[list[dict], str]:
"""
Fetches metadata for a specific variant of the precompiled wheel.
"""
variant_dir = f"{variant}/" if variant is not None else ""
repo_url = f"https://wheels.vllm.ai/{commit}/{variant_dir}vllm/"
meta_url = repo_url + "metadata.json"
print(f"Trying to fetch nightly build metadata from {meta_url}")
from urllib.request import urlopen
with urlopen(meta_url) as resp:
# urlopen raises HTTPError on unexpected status code
wheels = json.loads(resp.read().decode("utf-8"))
return wheels, repo_url
@staticmethod
def determine_wheel_url() -> tuple[str, str | None]:
"""
Try to determine the precompiled wheel URL or path to use.
The order of preference is:
1. user-specified wheel location (can be either local or remote, via
VLLM_PRECOMPILED_WHEEL_LOCATION)
2. user-specified variant from nightly repo (current main commit via
VLLM_PRECOMPILED_WHEEL_VARIANT)
3. the variant corresponding to VLLM_MAIN_CUDA_VERSION from nightly repo
4. the default variant from nightly repo (current main commit)
"""
wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
if wheel_location is not None:
print(f"Using user-specified precompiled wheel location: {wheel_location}")
return wheel_location, None
else:
import platform
arch = platform.machine()
# try to fetch the wheel metadata from the nightly wheel repo
main_variant = "cu" + envs.VLLM_MAIN_CUDA_VERSION.replace(".", "")
variant = os.getenv("VLLM_PRECOMPILED_WHEEL_VARIANT", main_variant)
commit = os.getenv(
"VLLM_PRECOMPILED_WHEEL_COMMIT",
precompiled_wheel_utils.get_base_commit_in_main_branch(),
)
print(f"Using precompiled wheel commit {commit} with variant {variant}")
try_default = False
wheels, repo_url, download_filename = None, None, None
try:
wheels, repo_url = precompiled_wheel_utils.fetch_metadata_for_variant(
commit, variant
)
except Exception as e:
logger.warning(
"Failed to fetch precompiled wheel metadata for variant %s: %s",
variant,
e,
)
try_default = True # try outside handler to keep the stacktrace simple
if try_default:
print("Trying the default variant from remote")
wheels, repo_url = precompiled_wheel_utils.fetch_metadata_for_variant(
commit, None
)
# if this also fails, then we have nothing more to try / cache
assert wheels is not None and repo_url is not None, (
"Failed to fetch precompiled wheel metadata"
)
# The metadata.json has the following format:
# see .buildkite/scripts/generate-nightly-index.py for details
"""[{
"package_name": "vllm",
"version": "0.11.2.dev278+gdbc3d9991",
"build_tag": null,
"python_tag": "cp38",
"abi_tag": "abi3",
"platform_tag": "manylinux1_x86_64",
"variant": null,
"filename": "vllm-0.11.2.dev278+gdbc3d9991-cp38-abi3-manylinux1_x86_64.whl",
"path": "../vllm-0.11.2.dev278%2Bgdbc3d9991-cp38-abi3-manylinux1_x86_64.whl"
},
...]"""
from urllib.parse import urljoin
for wheel in wheels:
# TODO: maybe check more compatibility later? (python_tag, abi_tag, etc)
if wheel.get("package_name") == "vllm" and arch in wheel.get(
"platform_tag", ""
):
print(f"Found precompiled wheel metadata: {wheel}")
if "path" not in wheel:
raise ValueError(f"Wheel metadata missing path: {wheel}")
wheel_url = urljoin(repo_url, wheel["path"])
download_filename = wheel.get("filename")
print(f"Using precompiled wheel URL: {wheel_url}")
break
else:
raise ValueError(
f"No precompiled vllm wheel found for architecture {arch} "
f"from repo {repo_url}. All available wheels: {wheels}"
)
return wheel_url, download_filename
@staticmethod
def extract_precompiled_and_patch_package(
wheel_url_or_path: str, download_filename: str | None
) -> dict:
import tempfile
import zipfile
temp_dir = None
try:
if not os.path.isfile(wheel_url_or_path):
wheel_filename = wheel_url_or_path.split("/")[-1]
# use provided filename first, then derive from URL
wheel_filename = download_filename or wheel_url_or_path.split("/")[-1]
temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
wheel_path = os.path.join(temp_dir, wheel_filename)
print(f"Downloading wheel from {wheel_url_or_path} to {wheel_path}")
@ -648,38 +755,13 @@ package_data = {
]
}
# If using precompiled, extract and patch package_data (in advance of setup)
if envs.VLLM_USE_PRECOMPILED:
assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
if wheel_location is not None:
wheel_url = wheel_location
else:
import platform
arch = platform.machine()
if arch == "x86_64":
wheel_tag = "manylinux1_x86_64"
elif arch == "aarch64":
wheel_tag = "manylinux2014_aarch64"
else:
raise ValueError(f"Unsupported architecture: {arch}")
base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl"
nightly_wheel_url = (
f"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl"
)
from urllib.request import urlopen
try:
with urlopen(wheel_url) as resp:
if resp.status != 200:
wheel_url = nightly_wheel_url
except Exception as e:
print(f"[warn] Falling back to nightly wheel: {e}")
wheel_url = nightly_wheel_url
patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(wheel_url)
wheel_url, download_filename = precompiled_wheel_utils.determine_wheel_url()
patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(
wheel_url, download_filename
)
for pkg, files in patch.items():
package_data.setdefault(pkg, []).extend(files)

View File

@ -0,0 +1,257 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import tempfile
from pathlib import Path
import pytest
from vllm.benchmarks.sweep.param_sweep import ParameterSweep, ParameterSweepItem
class TestParameterSweepItem:
"""Test ParameterSweepItem functionality."""
@pytest.mark.parametrize(
"input_dict,expected",
[
(
{"compilation_config.use_inductor_graph_partition": False},
"--compilation-config.use_inductor_graph_partition=false",
),
(
{"compilation_config.use_inductor_graph_partition": True},
"--compilation-config.use_inductor_graph_partition=true",
),
(
{"compilation_config.use_inductor": False},
"--compilation-config.use_inductor=false",
),
(
{"compilation_config.use_inductor": True},
"--compilation-config.use_inductor=true",
),
],
)
def test_nested_boolean_params(self, input_dict, expected):
"""Test that nested boolean params use =true/false syntax."""
item = ParameterSweepItem.from_record(input_dict)
cmd = item.apply_to_cmd(["vllm", "serve", "model"])
assert expected in cmd
@pytest.mark.parametrize(
"input_dict,expected",
[
({"enable_prefix_caching": False}, "--no-enable-prefix-caching"),
({"enable_prefix_caching": True}, "--enable-prefix-caching"),
({"disable_log_stats": False}, "--no-disable-log-stats"),
({"disable_log_stats": True}, "--disable-log-stats"),
],
)
def test_non_nested_boolean_params(self, input_dict, expected):
"""Test that non-nested boolean params use --no- prefix."""
item = ParameterSweepItem.from_record(input_dict)
cmd = item.apply_to_cmd(["vllm", "serve", "model"])
assert expected in cmd
@pytest.mark.parametrize(
"compilation_config",
[
{"cudagraph_mode": "full", "mode": 2, "use_inductor_graph_partition": True},
{
"cudagraph_mode": "piecewise",
"mode": 3,
"use_inductor_graph_partition": False,
},
],
)
def test_nested_dict_value(self, compilation_config):
"""Test that nested dict values are serialized as JSON."""
item = ParameterSweepItem.from_record(
{"compilation_config": compilation_config}
)
cmd = item.apply_to_cmd(["vllm", "serve", "model"])
assert "--compilation-config" in cmd
# The dict should be JSON serialized
idx = cmd.index("--compilation-config")
assert json.loads(cmd[idx + 1]) == compilation_config
@pytest.mark.parametrize(
"input_dict,expected_key,expected_value",
[
({"model": "test-model"}, "--model", "test-model"),
({"max_tokens": 100}, "--max-tokens", "100"),
({"temperature": 0.7}, "--temperature", "0.7"),
],
)
def test_string_and_numeric_values(self, input_dict, expected_key, expected_value):
"""Test that string and numeric values are handled correctly."""
item = ParameterSweepItem.from_record(input_dict)
cmd = item.apply_to_cmd(["vllm", "serve"])
assert expected_key in cmd
assert expected_value in cmd
@pytest.mark.parametrize(
"input_dict,expected_key,key_idx_offset",
[
({"max_tokens": 200}, "--max-tokens", 1),
({"enable_prefix_caching": False}, "--no-enable-prefix-caching", 0),
],
)
def test_replace_existing_parameter(self, input_dict, expected_key, key_idx_offset):
"""Test that existing parameters in cmd are replaced."""
item = ParameterSweepItem.from_record(input_dict)
if key_idx_offset == 1:
# Key-value pair
cmd = item.apply_to_cmd(["vllm", "serve", "--max-tokens", "100", "model"])
assert expected_key in cmd
idx = cmd.index(expected_key)
assert cmd[idx + 1] == "200"
assert "100" not in cmd
else:
# Boolean flag
cmd = item.apply_to_cmd(
["vllm", "serve", "--enable-prefix-caching", "model"]
)
assert expected_key in cmd
assert "--enable-prefix-caching" not in cmd
class TestParameterSweep:
"""Test ParameterSweep functionality."""
def test_from_records_list(self):
"""Test creating ParameterSweep from a list of records."""
records = [
{"max_tokens": 100, "temperature": 0.7},
{"max_tokens": 200, "temperature": 0.9},
]
sweep = ParameterSweep.from_records(records)
assert len(sweep) == 2
assert sweep[0]["max_tokens"] == 100
assert sweep[1]["max_tokens"] == 200
def test_read_from_dict(self):
"""Test creating ParameterSweep from a dict format."""
data = {
"experiment1": {"max_tokens": 100, "temperature": 0.7},
"experiment2": {"max_tokens": 200, "temperature": 0.9},
}
sweep = ParameterSweep.read_from_dict(data)
assert len(sweep) == 2
# Check that items have the _benchmark_name field
names = {item["_benchmark_name"] for item in sweep}
assert names == {"experiment1", "experiment2"}
# Check that parameters are preserved
for item in sweep:
if item["_benchmark_name"] == "experiment1":
assert item["max_tokens"] == 100
assert item["temperature"] == 0.7
elif item["_benchmark_name"] == "experiment2":
assert item["max_tokens"] == 200
assert item["temperature"] == 0.9
def test_read_json_list_format(self):
"""Test reading JSON file with list format."""
records = [
{"max_tokens": 100, "temperature": 0.7},
{"max_tokens": 200, "temperature": 0.9},
]
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
json.dump(records, f)
temp_path = Path(f.name)
try:
sweep = ParameterSweep.read_json(temp_path)
assert len(sweep) == 2
assert sweep[0]["max_tokens"] == 100
assert sweep[1]["max_tokens"] == 200
finally:
temp_path.unlink()
def test_read_json_dict_format(self):
"""Test reading JSON file with dict format."""
data = {
"experiment1": {"max_tokens": 100, "temperature": 0.7},
"experiment2": {"max_tokens": 200, "temperature": 0.9},
}
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
json.dump(data, f)
temp_path = Path(f.name)
try:
sweep = ParameterSweep.read_json(temp_path)
assert len(sweep) == 2
# Check that items have the _benchmark_name field
names = {item["_benchmark_name"] for item in sweep}
assert names == {"experiment1", "experiment2"}
finally:
temp_path.unlink()
def test_unique_benchmark_names_validation(self):
"""Test that duplicate _benchmark_name values raise an error."""
# Test with duplicate names in list format
records = [
{"_benchmark_name": "exp1", "max_tokens": 100},
{"_benchmark_name": "exp1", "max_tokens": 200},
]
with pytest.raises(ValueError, match="Duplicate _benchmark_name values"):
ParameterSweep.from_records(records)
def test_unique_benchmark_names_multiple_duplicates(self):
"""Test validation with multiple duplicate names."""
records = [
{"_benchmark_name": "exp1", "max_tokens": 100},
{"_benchmark_name": "exp1", "max_tokens": 200},
{"_benchmark_name": "exp2", "max_tokens": 300},
{"_benchmark_name": "exp2", "max_tokens": 400},
]
with pytest.raises(ValueError, match="Duplicate _benchmark_name values"):
ParameterSweep.from_records(records)
def test_no_benchmark_names_allowed(self):
"""Test that records without _benchmark_name are allowed."""
records = [
{"max_tokens": 100, "temperature": 0.7},
{"max_tokens": 200, "temperature": 0.9},
]
sweep = ParameterSweep.from_records(records)
assert len(sweep) == 2
def test_mixed_benchmark_names_allowed(self):
"""Test that mixing records with and without _benchmark_name is allowed."""
records = [
{"_benchmark_name": "exp1", "max_tokens": 100},
{"max_tokens": 200, "temperature": 0.9},
]
sweep = ParameterSweep.from_records(records)
assert len(sweep) == 2
class TestParameterSweepItemKeyNormalization:
"""Test key normalization in ParameterSweepItem."""
def test_underscore_to_hyphen_conversion(self):
"""Test that underscores are converted to hyphens in CLI."""
item = ParameterSweepItem.from_record({"max_tokens": 100})
cmd = item.apply_to_cmd(["vllm", "serve"])
assert "--max-tokens" in cmd
def test_nested_key_preserves_suffix(self):
"""Test that nested keys preserve the suffix format."""
# The suffix after the dot should preserve underscores
item = ParameterSweepItem.from_record(
{"compilation_config.some_nested_param": "value"}
)
cmd = item.apply_to_cmd(["vllm", "serve"])
# The prefix (compilation_config) gets converted to hyphens,
# but the suffix (some_nested_param) is preserved
assert any("compilation-config.some_nested_param" in arg for arg in cmd)

View File

@ -0,0 +1,171 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pandas as pd
import pytest
from vllm.benchmarks.sweep.plot import (
PlotEqualTo,
PlotFilterBase,
PlotFilters,
PlotGreaterThan,
PlotGreaterThanOrEqualTo,
PlotLessThan,
PlotLessThanOrEqualTo,
PlotNotEqualTo,
)
class TestPlotFilters:
"""Test PlotFilter functionality including 'inf' edge case."""
def setup_method(self):
"""Create sample DataFrames for testing."""
# DataFrame with numeric values
self.df_numeric = pd.DataFrame(
{
"request_rate": [1.0, 5.0, 10.0, 50.0, 100.0],
"value": [10, 20, 30, 40, 50],
}
)
# DataFrame with float('inf') - note: string "inf" values are coerced
# to float when loading data, so we only test with float('inf')
self.df_inf_float = pd.DataFrame(
{
"request_rate": [1.0, 5.0, 10.0, float("inf"), float("inf")],
"value": [10, 20, 30, 40, 50],
}
)
@pytest.mark.parametrize(
"target,expected_count",
[
("5.0", 1),
("10.0", 1),
("1.0", 1),
],
)
def test_equal_to_numeric(self, target, expected_count):
"""Test PlotEqualTo with numeric values."""
filter_obj = PlotEqualTo("request_rate", target)
result = filter_obj.apply(self.df_numeric)
assert len(result) == expected_count
def test_equal_to_inf_float(self):
"""Test PlotEqualTo with float('inf')."""
filter_obj = PlotEqualTo("request_rate", "inf")
result = filter_obj.apply(self.df_inf_float)
# Should match both float('inf') entries because float('inf') == float('inf')
assert len(result) == 2
@pytest.mark.parametrize(
"target,expected_count",
[
("5.0", 4), # All except 5.0
("1.0", 4), # All except 1.0
],
)
def test_not_equal_to_numeric(self, target, expected_count):
"""Test PlotNotEqualTo with numeric values."""
filter_obj = PlotNotEqualTo("request_rate", target)
result = filter_obj.apply(self.df_numeric)
assert len(result) == expected_count
def test_not_equal_to_inf_float(self):
"""Test PlotNotEqualTo with float('inf')."""
filter_obj = PlotNotEqualTo("request_rate", "inf")
result = filter_obj.apply(self.df_inf_float)
# Should exclude float('inf') entries
assert len(result) == 3
@pytest.mark.parametrize(
"target,expected_count",
[
("10.0", 2), # 1.0, 5.0
("50.0", 3), # 1.0, 5.0, 10.0
("5.0", 1), # 1.0
],
)
def test_less_than(self, target, expected_count):
"""Test PlotLessThan with numeric values."""
filter_obj = PlotLessThan("request_rate", target)
result = filter_obj.apply(self.df_numeric)
assert len(result) == expected_count
@pytest.mark.parametrize(
"target,expected_count",
[
("10.0", 3), # 1.0, 5.0, 10.0
("5.0", 2), # 1.0, 5.0
],
)
def test_less_than_or_equal_to(self, target, expected_count):
"""Test PlotLessThanOrEqualTo with numeric values."""
filter_obj = PlotLessThanOrEqualTo("request_rate", target)
result = filter_obj.apply(self.df_numeric)
assert len(result) == expected_count
@pytest.mark.parametrize(
"target,expected_count",
[
("10.0", 2), # 50.0, 100.0
("5.0", 3), # 10.0, 50.0, 100.0
],
)
def test_greater_than(self, target, expected_count):
"""Test PlotGreaterThan with numeric values."""
filter_obj = PlotGreaterThan("request_rate", target)
result = filter_obj.apply(self.df_numeric)
assert len(result) == expected_count
@pytest.mark.parametrize(
"target,expected_count",
[
("10.0", 3), # 10.0, 50.0, 100.0
("5.0", 4), # 5.0, 10.0, 50.0, 100.0
],
)
def test_greater_than_or_equal_to(self, target, expected_count):
"""Test PlotGreaterThanOrEqualTo with numeric values."""
filter_obj = PlotGreaterThanOrEqualTo("request_rate", target)
result = filter_obj.apply(self.df_numeric)
assert len(result) == expected_count
@pytest.mark.parametrize(
"filter_str,expected_var,expected_target,expected_type",
[
("request_rate==5.0", "request_rate", "5.0", PlotEqualTo),
("request_rate!=10.0", "request_rate", "10.0", PlotNotEqualTo),
("request_rate<50.0", "request_rate", "50.0", PlotLessThan),
("request_rate<=50.0", "request_rate", "50.0", PlotLessThanOrEqualTo),
("request_rate>10.0", "request_rate", "10.0", PlotGreaterThan),
("request_rate>=10.0", "request_rate", "10.0", PlotGreaterThanOrEqualTo),
("request_rate==inf", "request_rate", "inf", PlotEqualTo),
("request_rate!='inf'", "request_rate", "inf", PlotNotEqualTo),
],
)
def test_parse_str(self, filter_str, expected_var, expected_target, expected_type):
"""Test parsing filter strings."""
filter_obj = PlotFilterBase.parse_str(filter_str)
assert isinstance(filter_obj, expected_type)
assert filter_obj.var == expected_var
assert filter_obj.target == expected_target
def test_parse_str_inf_edge_case(self):
"""Test parsing 'inf' string in filter."""
filter_obj = PlotFilterBase.parse_str("request_rate==inf")
assert isinstance(filter_obj, PlotEqualTo)
assert filter_obj.var == "request_rate"
assert filter_obj.target == "inf"
def test_parse_multiple_filters(self):
"""Test parsing multiple filters."""
filters = PlotFilters.parse_str("request_rate>5.0,value<=40")
assert len(filters) == 2
assert isinstance(filters[0], PlotGreaterThan)
assert isinstance(filters[1], PlotLessThanOrEqualTo)
def test_parse_empty_filter(self):
"""Test parsing empty filter string."""
filters = PlotFilters.parse_str("")
assert len(filters) == 0

View File

@ -8,7 +8,7 @@ import torch
from vllm import LLM, SamplingParams
from vllm.config.compilation import CompilationMode, DynamicShapesType
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.tokenizers import get_tokenizer
from vllm.utils.torch_utils import is_torch_equal_or_newer

View File

@ -318,13 +318,18 @@ def test_attention_quant_pattern(
torch.set_default_dtype(dtype)
torch.manual_seed(42)
model_config = ModelConfig(
model=model_name,
max_model_len=2048,
dtype=dtype,
)
vllm_config = VllmConfig(
model_config=ModelConfig(
model=model_name,
max_model_len=2048,
dtype=dtype,
model_config=model_config,
scheduler_config=SchedulerConfig(
max_num_seqs=1024,
max_model_len=model_config.max_model_len,
is_encoder_decoder=model_config.is_encoder_decoder,
),
scheduler_config=SchedulerConfig(max_num_seqs=1024),
compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
custom_ops=custom_ops_list,

View File

@ -59,6 +59,7 @@ from vllm.distributed import (
)
from vllm.logger import init_logger
from vllm.logprobs import Logprob
from vllm.multimodal.base import MediaWithBytes
from vllm.multimodal.utils import fetch_image
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams
@ -1174,6 +1175,7 @@ def caplog_mp_spawn(tmp_path, monkeypatch):
"level": level,
"filename": log_path.as_posix(),
}
config["loggers"]["vllm"]["level"] = level
config_path.write_text(json.dumps(config))
@ -1388,7 +1390,11 @@ class LocalAssetServer:
return f"{self.base_url}/{name}"
def get_image_asset(self, name: str) -> Image.Image:
return fetch_image(self.url_for(name))
image = fetch_image(self.url_for(name))
# Unwrap MediaWithBytes if present
if isinstance(image, MediaWithBytes):
image = image.media
return image
@pytest.fixture(scope="session")

View File

@ -22,7 +22,14 @@ def get_model_args(
"num_speculative_tokens": 1,
"max_model_len": model_max_len,
}
eplb_config = {
"num_redundant_experts": tp_size,
"window_size": 128,
"step_interval": 1024,
"log_balancedness": False,
}
if use_async:
eplb_config["use_async"] = True
model_args = {
"pretrained": model_name,
"dtype": "auto",
@ -31,15 +38,10 @@ def get_model_args(
"gpu_memory_utilization": 0.7,
"speculative_config": speculative_config,
"enable_expert_parallel": True,
"num_redundant_experts": tp_size,
"eplb_window_size": 128,
"eplb_step_interval": 1024,
"eplb_log_balancedness": False,
"eplb_config": eplb_config,
"enable_eplb": True,
"max_model_len": model_max_len,
}
if use_async:
model_args["eplb_config"] = {"use_async": True}
return model_args

View File

@ -6,7 +6,7 @@ import pytest
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.tokenizers import get_tokenizer
from ...models.registry import HF_EXAMPLE_MODELS
from ...utils import VLLM_PATH

View File

@ -14,7 +14,7 @@ from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.tokenizers import get_tokenizer
from vllm.v1.engine.async_llm import AsyncLLM
MODEL_NAME = "openai-community/gpt2"

View File

@ -0,0 +1,87 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import pytest_asyncio
from openai import OpenAI
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-8B"
@pytest.fixture(scope="module")
def server():
args = ["--reasoning-parser", "qwen3", "--max_model_len", "5000"]
env_dict = dict(
VLLM_ENABLE_RESPONSES_API_STORE="1",
VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT="1",
# uncomment for tool calling
# PYTHON_EXECUTION_BACKEND="dangerously_use_uv",
)
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_basic(client: OpenAI, model_name: str):
response = await client.responses.create(
model=model_name,
input="What is 13 * 24?",
)
assert response is not None
print("response: ", response)
assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_reasoning_and_function_items(client: OpenAI, model_name: str):
response = await client.responses.create(
model=model_name,
input=[
{"type": "message", "content": "Hello.", "role": "user"},
{
"type": "reasoning",
"id": "lol",
"content": [
{
"type": "reasoning_text",
"text": "We need to respond: greeting.",
}
],
"summary": [],
},
{
"arguments": '{"location": "Paris", "unit": "celsius"}',
"call_id": "call_5f7b38f3b81e4b8380fd0ba74f3ca3ab",
"name": "get_weather",
"type": "function_call",
"id": "fc_4fe5d6fc5b6c4d6fa5f24cc80aa27f78",
"status": "completed",
},
{
"call_id": "call_5f7b38f3b81e4b8380fd0ba74f3ca3ab",
"id": "fc_4fe5d6fc5b6c4d6fa5f24cc80aa27f78",
"output": "The weather in Paris is 20 Celsius",
"status": "completed",
"type": "function_call_output",
},
],
temperature=0.0,
)
assert response is not None
assert response.status == "completed"
# make sure we get a reasoning and text output
assert response.output[0].type == "reasoning"
assert response.output[1].type == "message"
assert type(response.output[1].content[0].text) is str

View File

@ -3,7 +3,7 @@
import pytest
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.tokenizers import get_tokenizer
from ...utils import RemoteOpenAIServer

View File

@ -7,7 +7,7 @@
import pytest
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.tokenizers import get_tokenizer
from ...utils import RemoteOpenAIServer

View File

@ -14,7 +14,7 @@ from vllm.config.multimodal import MultiModalConfig
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.tokenizers import get_tokenizer
from vllm.v1.engine.async_llm import AsyncLLM
from ...utils import RemoteOpenAIServer

View File

@ -7,7 +7,7 @@ import tempfile
import pytest
from vllm.model_executor.model_loader.weight_utils import download_weights_from_hf
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.tokenizers import get_tokenizer
from ...utils import RemoteOpenAIServer

View File

@ -5,7 +5,7 @@ import pytest
import pytest_asyncio
import requests
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.tokenizers import get_tokenizer
from ...utils import RemoteOpenAIServer

View File

@ -8,6 +8,7 @@ import pytest
import pytest_asyncio
from transformers import AutoProcessor
from vllm.multimodal.base import MediaWithBytes
from vllm.multimodal.utils import encode_image_base64, fetch_image
from ...utils import RemoteOpenAIServer
@ -111,7 +112,11 @@ def get_hf_prompt_tokens(model_name, content, image_url):
"content": f"{placeholder}{content}",
}
]
images = [fetch_image(image_url)]
image = fetch_image(image_url)
# Unwrap MediaWithBytes if present
if isinstance(image, MediaWithBytes):
image = image.media
images = [image]
prompt = processor.tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True

View File

@ -271,7 +271,7 @@ async def test_streaming_product_tool_call():
@pytest.fixture
def qwen_tokenizer() -> TokenizerLike:
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.tokenizers import get_tokenizer
return get_tokenizer("Qwen/Qwen3-32B")

View File

@ -18,7 +18,7 @@ from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
from vllm.platforms import current_platform
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.tokenizers import get_tokenizer
from vllm.utils.serial_utils import (
EMBED_DTYPE_TO_TORCH_DTYPE,
ENDIANNESS,

View File

@ -9,6 +9,7 @@ from transformers import AutoProcessor
from tests.utils import VLLM_PATH, RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.multimodal.base import MediaWithBytes
from vllm.multimodal.utils import encode_image_base64, fetch_image
MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
@ -62,7 +63,11 @@ def get_hf_prompt_tokens(model_name, content, image_url):
placeholder = "<|image_1|> "
prompt = f"{placeholder}{content}"
images = [fetch_image(image_url)]
image = fetch_image(image_url)
# Unwrap MediaWithBytes if present
if isinstance(image, MediaWithBytes):
image = image.media
images = [image]
inputs = processor(prompt, images, return_tensors="pt")
return inputs.input_ids.shape[1]

View File

@ -12,7 +12,7 @@ import torch
from tests.models.utils import check_embeddings_close
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.tokenizers import get_tokenizer
from vllm.utils.serial_utils import (
EMBED_DTYPE_TO_TORCH_DTYPE,
ENDIANNESS,

View File

@ -45,7 +45,10 @@ def basic_server_with_lora(smollm2_lora_files):
"64",
]
envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"}
envs = {
"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True",
"SAGEMAKER_ENABLE_STATEFUL_SESSIONS": "True",
}
with RemoteOpenAIServer(MODEL_NAME_SMOLLM, args, env_dict=envs) as remote_server:
yield remote_server

View File

@ -28,8 +28,7 @@ from vllm.multimodal.utils import (
encode_image_base64,
encode_video_base64,
)
from vllm.tokenizers import MistralTokenizer
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.tokenizers import MistralTokenizer, get_tokenizer
from ..models.registry import HF_EXAMPLE_MODELS
from ..utils import VLLM_PATH

View File

@ -5,6 +5,8 @@ import pytest
from openai.types.responses.response_function_tool_call_output_item import (
ResponseFunctionToolCallOutputItem,
)
from openai.types.responses.response_output_message import ResponseOutputMessage
from openai.types.responses.response_output_text import ResponseOutputText
from openai.types.responses.response_reasoning_item import (
Content,
ResponseReasoningItem,
@ -101,3 +103,22 @@ class TestResponsesUtils:
)
with pytest.raises(ValueError):
construct_chat_message_with_tool_call(item)
output_item = ResponseOutputMessage(
id="msg_bf585bbbe3d500e0",
content=[
ResponseOutputText(
annotations=[],
text="dongyi",
type="output_text",
logprobs=None,
)
],
role="assistant",
status="completed",
type="message",
)
formatted_item = construct_chat_message_with_tool_call(output_item)
assert formatted_item["role"] == "assistant"
assert formatted_item["content"] == "dongyi"

View File

@ -33,14 +33,16 @@ def test_worker_apply_lora(qwen3_lora_files):
lora_requests, lora_mapping
)
model_config = ModelConfig(
MODEL_PATH,
seed=0,
dtype="float16",
max_model_len=127,
enforce_eager=True,
)
vllm_config = VllmConfig(
model_config=ModelConfig(
MODEL_PATH,
seed=0,
dtype="float16",
max_model_len=127,
enforce_eager=True,
),
model_config=model_config,
load_config=LoadConfig(
download_dir=None,
load_format="dummy",
@ -50,7 +52,14 @@ def test_worker_apply_lora(qwen3_lora_files):
tensor_parallel_size=1,
data_parallel_size=1,
),
scheduler_config=SchedulerConfig("generate", 32, 32, 32),
scheduler_config=SchedulerConfig(
max_model_len=model_config.max_model_len,
is_encoder_decoder=model_config.is_encoder_decoder,
runner_type="generate",
max_num_batched_tokens=32,
max_num_seqs=32,
max_num_partial_prefills=32,
),
device_config=DeviceConfig("cuda"),
cache_config=CacheConfig(
block_size=16,

View File

@ -315,3 +315,38 @@ def test_mistral_function_call_nested_json():
assert json.loads(parsed.tool_calls[0].function.arguments) == args_dict
# No additional content outside the tool call should be returned.
assert parsed.content is None
# multiple calls
multiple_args_dict = [
{
"city": "Dallas",
"state": "TX",
"unit": "fahrenheit",
"sub_dict": {"foo": "bar", "inner": {"x": 1, "y": 2}},
},
{},
{"a": 0},
{"a": 1, "b": "c"},
]
names = ["get_current_weather", "get_current_weather_2", "random", "random_2"]
model_output = "".join(
[
f"{parser.bot_token}{name}{json.dumps(args)}"
for name, args in zip(names, multiple_args_dict)
]
)
parsed = parser.extract_tool_calls(model_output, None)
# Assertions: the tool call is detected and the full nested JSON is parsed
# without truncation.
assert parsed.tools_called
assert len(parsed.tool_calls) == len(multiple_args_dict)
for i, tool_call in enumerate(parsed.tool_calls):
assert MistralToolCall.is_valid_id(tool_call.id)
assert tool_call.function.name == names[i]
assert json.loads(tool_call.function.arguments) == multiple_args_dict[i]
# No additional content outside the tool call should be returned.
assert parsed.content is None

View File

@ -22,10 +22,10 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
from vllm.multimodal.cache import MultiModalProcessorOnlyCache
from vllm.multimodal.inputs import MultiModalInputs
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
from vllm.tokenizers import MistralTokenizer
from vllm.transformers_utils.tokenizer import (
from vllm.tokenizers import (
MistralTokenizer,
TokenizerLike,
cached_tokenizer_from_config,
encode_tokens,
)
from ....multimodal.utils import random_audio, random_image, random_video
@ -154,7 +154,7 @@ def get_text_token_prompts(
mm_data: MultiModalDataDict,
):
dummy_inputs = processor.dummy_inputs
tokenizer = processor.info.get_tokenizer()
tokenizer: TokenizerLike = processor.info.get_tokenizer()
model_config = processor.info.ctx.model_config
model_type = model_config.hf_config.model_type
@ -191,10 +191,9 @@ def get_text_token_prompts(
assert isinstance(inputs.prompt, str)
text_prompt = inputs.prompt
token_prompt = encode_tokens(
tokenizer,
token_prompt = tokenizer.encode(
text_prompt,
add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type),
add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type, True),
)
return text_prompt, token_prompt

View File

@ -5,7 +5,6 @@
import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.transformers_utils.tokenizer import encode_tokens
from ....conftest import ImageTestAssets
from ...utils import build_model_context
@ -48,7 +47,7 @@ def test_processor_override(
]
}
if tokenized_prompt:
prompt = encode_tokens(tokenizer, prompt)
prompt = tokenizer.encode(prompt)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
mm_data = processed_inputs["mm_kwargs"].get_data()

View File

@ -31,7 +31,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
from vllm.multimodal.utils import group_mm_kwargs_by_modality
from vllm.platforms import current_platform
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from vllm.tokenizers import cached_tokenizer_from_config
from vllm.utils.collection_utils import is_list_of
from vllm.utils.torch_utils import set_default_torch_dtype

View File

@ -358,6 +358,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True,
),
"MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"),
"MistralLarge3ForCausalLM": _HfExamplesInfo(
"mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4", is_available_online=False
),
"MixtralForCausalLM": _HfExamplesInfo(
"mistralai/Mixtral-8x7B-Instruct-v0.1",
{"tiny": "TitanML/tiny-mixtral"},
@ -770,7 +773,13 @@ _MULTIMODAL_EXAMPLE_MODELS = {
),
"PixtralForConditionalGeneration": _HfExamplesInfo(
"mistralai/Pixtral-12B-2409",
extras={
"mistral-large-3": "mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4",
"ministral-3": "mistralai/Ministral-3-3B-Instruct-2512",
},
tokenizer_mode="mistral",
# TODO: revert once Mistral-Large-3 and Ministral-3 are publicly available.
is_available_online=False,
),
"QwenVLForConditionalGeneration": _HfExamplesInfo(
"Qwen/Qwen-VL",
@ -822,7 +831,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b"),
"Tarsier2ForConditionalGeneration": _HfExamplesInfo(
"omni-research/Tarsier2-Recap-7b",
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
hf_overrides={
"architectures": ["Tarsier2ForConditionalGeneration"],
"model_type": "tarsier2",
},
),
"VoxtralForConditionalGeneration": _HfExamplesInfo(
"mistralai/Voxtral-Mini-3B-2507",
@ -870,6 +882,11 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
use_original_num_layers=True,
max_model_len=10240,
),
"EagleMistralLarge3ForCausalLM": _HfExamplesInfo(
"mistralai/Mistral-Large-3-675B-Instruct-2512",
speculative_model="mistralai/Mistral-Large-3-675B-Instruct-2512-Eagle",
is_available_online=False,
),
"LlamaForCausalLMEagle3": _HfExamplesInfo(
"Qwen/Qwen3-8B",
trust_remote_code=True,

View File

@ -203,7 +203,7 @@ class TestGGUFModelLoader:
@patch("vllm.config.model.get_hf_image_processor_config", return_value=None)
@patch("vllm.config.model.get_config")
@patch("vllm.config.model.is_gguf", return_value=False)
@patch("vllm.transformers_utils.utils.check_gguf_file", return_value=False)
@patch("vllm.transformers_utils.gguf_utils.check_gguf_file", return_value=False)
@patch("os.path.isfile", return_value=False)
def test_prepare_weights_invalid_format(
self,

View File

@ -13,7 +13,7 @@ from transformers import PretrainedConfig
from vllm.config.model import ModelConfig, ModelDType, RunnerOption
from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
from vllm.multimodal.processing import InputProcessingContext
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from vllm.tokenizers import cached_tokenizer_from_config
from .. import ci_envs
from .registry import HF_EXAMPLE_MODELS

View File

@ -6,12 +6,14 @@ from dataclasses import MISSING, Field, asdict, dataclass, field
from unittest.mock import patch
import pytest
from pydantic import ValidationError
from vllm.compilation.backends import VllmBackend
from vllm.config import (
CompilationConfig,
ModelConfig,
PoolerConfig,
SchedulerConfig,
VllmConfig,
update_config,
)
@ -716,7 +718,7 @@ def test_is_chunked_prefill_supported(
):
model_config = ModelConfig(model_id, trust_remote_code=True)
assert model_config.attn_type == expected_attn_type
with caplog_vllm.at_level(level=logging.DEBUG):
with caplog_vllm.at_level(level=logging.DEBUG, logger="vllm"):
assert model_config.is_chunked_prefill_supported == expected_result
assert reason in caplog_vllm.text
@ -835,7 +837,7 @@ def test_is_prefix_caching_supported(
):
model_config = ModelConfig(model_id, trust_remote_code=True)
assert model_config.attn_type == expected_attn_type
with caplog_vllm.at_level(level=logging.DEBUG):
with caplog_vllm.at_level(level=logging.DEBUG, logger="vllm"):
assert model_config.is_prefix_caching_supported == expected_result
assert reason in caplog_vllm.text
@ -1095,3 +1097,14 @@ def test_vllm_config_explicit_overrides():
# Other fields should still use defaults
assert config.compilation_config.mode == CompilationMode.VLLM_COMPILE
assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE
def test_scheduler_config_init():
with pytest.raises(ValidationError):
# Positional InitVars missing
# (InitVars cannot have defaults otherwise they will become attributes)
SchedulerConfig()
with pytest.raises(AttributeError):
# InitVar does not become an attribute
print(SchedulerConfig.default_factory().max_model_len)

View File

@ -7,7 +7,7 @@ from vllm.config import ModelConfig
from vllm.inputs import zip_enc_dec_prompts
from vllm.inputs.parse import parse_raw_prompts
from vllm.inputs.preprocess import InputPreprocessor
from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs
from vllm.tokenizers import init_tokenizer_from_config
pytestmark = pytest.mark.cpu_test
@ -108,7 +108,7 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
)
def test_preprocessor_always_mm_code_path(model_id, prompt):
model_config = ModelConfig(model=model_id)
tokenizer = init_tokenizer_from_configs(model_config)
tokenizer = init_tokenizer_from_config(model_config)
input_preprocessor = InputPreprocessor(model_config, tokenizer)
# HF processor adds sep token

View File

@ -57,7 +57,7 @@ def test_default_vllm_root_logger_configuration(monkeypatch):
_configure_vllm_root_logger()
logger = logging.getLogger("vllm")
assert logger.level == logging.DEBUG
assert logger.level == logging.INFO
assert not logger.propagate
handler = logger.handlers[0]
@ -524,7 +524,7 @@ def mp_function(**kwargs):
def test_caplog_mp_fork(caplog_vllm, caplog_mp_fork):
with caplog_vllm.at_level(logging.DEBUG), caplog_mp_fork():
with caplog_vllm.at_level(logging.DEBUG, logger="vllm"), caplog_mp_fork():
import multiprocessing
ctx = multiprocessing.get_context("fork")

View File

@ -5,8 +5,7 @@ from typing import _get_protocol_attrs # type: ignore
import pytest
from transformers import PreTrainedTokenizerBase
from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.tokenizers import TokenizerLike, get_tokenizer
def _get_missing_attrs(obj: object, target: type):

View File

@ -91,6 +91,118 @@ from vllm.tokenizers.mistral import (
],
),
),
(
{
"messages": [
{
"role": "user",
"content": "What is the current local date and time?",
}
],
"tools": [
{
"type": "function",
"function": {
"description": "Fetch the current local date and time.",
"unsupported_field": False,
"name": "get_current_time",
"parameters": {},
},
},
{
"type": "function",
"function": {
"description": "Fetch the current local date and time.",
"unsupported_field2": False,
"name": "get_current_time",
"parameters": {},
},
},
],
},
(
[
{
"role": "user",
"content": "What is the current local date and time?",
}
],
[
{
"type": "function",
"function": {
"description": "Fetch the current local date and time.",
"name": "get_current_time",
"parameters": {},
},
},
{
"type": "function",
"function": {
"description": "Fetch the current local date and time.",
"name": "get_current_time",
"parameters": {},
},
},
],
),
),
(
{
"messages": [
{
"role": "user",
"content": "What is the current local date and time?",
}
],
"tools": [
{
"type": "function",
"unsupported_field": False,
"function": {
"description": "Fetch the current local date and time.",
"name": "get_current_time",
"parameters": {},
},
},
{
"type": "function",
"unsupported_field2": False,
"function": {
"description": "Fetch the current local date and time 2.",
"name": "get_current_time2",
"parameters": {"a": "1"},
},
},
],
},
(
[
{
"role": "user",
"content": "What is the current local date and time?",
}
],
[
{
"type": "function",
"function": {
"description": "Fetch the current local date and time.",
"name": "get_current_time",
"parameters": {},
},
},
{
"type": "function",
"function": {
"description": "Fetch the current local date and time 2.",
"name": "get_current_time2",
"parameters": {"a": "1"},
},
},
],
),
),
],
)
def test_prepare_apply_chat_template_tools_and_messages(
@ -1108,13 +1220,6 @@ class TestMistralTokenizer:
)
== expected_tokens[mistral_tokenizer.is_tekken]
)
assert (
mistral_tokenizer.decode(
ids[mistral_tokenizer.is_tekken],
skip_special_tokens=skip_special_tokens,
)
== expected_tokens[mistral_tokenizer.is_tekken]
)
def test_decode_empty(
self,
@ -1140,6 +1245,45 @@ class TestMistralTokenizer:
== "<s>"
)
@pytest.mark.parametrize(
"skip_special_tokens,expected_tokens",
(
(
False,
(
["<s>[INST]▁Hello▁world▁![/INST]▁Hello</s>"],
["<s>[INST]Hello world ![/INST]Hello</s>"],
),
),
(True, (["Hello world ! Hello"], ["Hello world !Hello"])),
),
)
def test_batch_decode(
self,
mistral_tokenizer: MistralTokenizer,
skip_special_tokens: bool,
expected_tokens: tuple[str, str],
):
ids = (
[[1, 3, 23325, 2294, 1686, 4, 23325, 2]],
[[1, 3, 22177, 4304, 2662, 4, 22177, 2]],
)
assert (
mistral_tokenizer.batch_decode(
ids[mistral_tokenizer.is_tekken],
skip_special_tokens=skip_special_tokens,
)
== expected_tokens[mistral_tokenizer.is_tekken]
)
def test_batch_decode_empty(
self,
mistral_tokenizer: MistralTokenizer,
):
assert mistral_tokenizer.batch_decode(
[[]],
) == [""]
def test_convert_tokens_to_string(self, mistral_tokenizer: MistralTokenizer):
tokens = (
[

View File

@ -2,8 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path
from vllm.tokenizers import TokenizerLike, TokenizerRegistry
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.tokenizers import TokenizerLike, TokenizerRegistry, get_tokenizer
class TestTokenizer(TokenizerLike):

View File

@ -6,7 +6,7 @@ import pytest
from vllm.entrypoints.openai.tool_parsers.deepseekv31_tool_parser import (
DeepSeekV31ToolParser,
)
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.tokenizers import get_tokenizer
MODEL = "deepseek-ai/DeepSeek-V3.1"

View File

@ -14,9 +14,8 @@ from vllm.entrypoints.openai.protocol import (
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.ernie45_tool_parser import Ernie45ToolParser
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
from vllm.transformers_utils.tokenizer import get_tokenizer
# Use a common model that is likely to be available
MODEL = "baidu/ERNIE-4.5-21B-A3B-Thinking"

View File

@ -10,7 +10,7 @@ from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
from vllm.entrypoints.openai.tool_parsers.glm4_moe_tool_parser import (
Glm4MoeModelToolParser,
)
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.tokenizers import get_tokenizer
pytestmark = pytest.mark.cpu_test

View File

@ -10,9 +10,8 @@ from partial_json_parser.core.options import Allow
from vllm.entrypoints.openai.protocol import DeltaMessage, FunctionCall, ToolCall
from vllm.entrypoints.openai.tool_parsers.jamba_tool_parser import JambaToolParser
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
from vllm.transformers_utils.tokenizer import get_tokenizer
pytestmark = pytest.mark.cpu_test

View File

@ -8,7 +8,7 @@ import pytest
from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
from vllm.entrypoints.openai.tool_parsers.kimi_k2_tool_parser import KimiK2ToolParser
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.tokenizers import get_tokenizer
pytestmark = pytest.mark.cpu_test

Some files were not shown because too many files have changed in this diff Show More