mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-02 03:57:03 +08:00
Merge branch 'main' into rename_file_info_to_pkg/file
This commit is contained in:
commit
658f6a6da5
@ -108,6 +108,65 @@ The number of this test is less stable compared to the delay and latency benchma
|
||||
|
||||
WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
|
||||
|
||||
#### Default Parameters Field
|
||||
|
||||
We can specify default parameters in a JSON field with key `defaults`. Parameters defined in the field are applied globally to all serving tests, and can be overridden in test case fields. Here is an example:
|
||||
|
||||
<details>
|
||||
<summary> An Example of default parameters field </summary>
|
||||
|
||||
```json
|
||||
{
|
||||
"defaults": {
|
||||
"qps_list": [
|
||||
"inf"
|
||||
],
|
||||
"server_environment_variables": {
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
|
||||
},
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"block_size": 128,
|
||||
"disable_log_stats": "",
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"num_prompts": 200,
|
||||
"ignore-eos": ""
|
||||
}
|
||||
},
|
||||
"tests": [
|
||||
{
|
||||
"test_name": "serving_llama3B_tp2_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
||||
"tensor_parallel_size": 2,
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_qwen3_tp4_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "Qwen/Qwen3-14B",
|
||||
"tensor_parallel_size": 4,
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "Qwen/Qwen3-14B",
|
||||
}
|
||||
},
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
### Visualizing the results
|
||||
|
||||
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
|
||||
|
||||
@ -110,7 +110,8 @@ json2envs() {
|
||||
wait_for_server() {
|
||||
# wait for vllm server to start
|
||||
# return 1 if vllm server crashes
|
||||
timeout 1200 bash -c '
|
||||
local timeout_val="1200"
|
||||
timeout "$timeout_val" bash -c '
|
||||
until curl -X POST localhost:8000/v1/completions; do
|
||||
sleep 1
|
||||
done' && return 0 || return 1
|
||||
@ -316,12 +317,44 @@ run_throughput_tests() {
|
||||
run_serving_tests() {
|
||||
# run serving tests using `vllm bench serve` command
|
||||
# $1: a json file specifying serving test cases
|
||||
#
|
||||
# Supported JSON formats:
|
||||
# 1) Plain format: top-level array
|
||||
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
|
||||
#
|
||||
# 2) Default parameters field + plain format tests
|
||||
# {
|
||||
# "defaults": { ... },
|
||||
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
|
||||
# }
|
||||
|
||||
local serving_test_file
|
||||
serving_test_file=$1
|
||||
|
||||
# Iterate over serving tests
|
||||
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
||||
jq -c '
|
||||
if type == "array" then
|
||||
# Plain format: test cases array
|
||||
.[]
|
||||
elif (type == "object" and has("tests")) then
|
||||
# merge the default parameters into each test cases
|
||||
. as $root
|
||||
| ($root.defaults // {}) as $d
|
||||
| ($root.tests // [])[]
|
||||
# default qps / max_concurrency from defaults if missing
|
||||
| .qps_list = (.qps_list // $d.qps_list)
|
||||
| .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list)
|
||||
# merge envs / params: test overrides defaults
|
||||
| .server_environment_variables =
|
||||
(($d.server_environment_variables // {}) + (.server_environment_variables // {}))
|
||||
| .server_parameters =
|
||||
(($d.server_parameters // {}) + (.server_parameters // {}))
|
||||
| .client_parameters =
|
||||
(($d.client_parameters // {}) + (.client_parameters // {}))
|
||||
else
|
||||
error("Unsupported serving test file format: must be array or object with .tests")
|
||||
end
|
||||
' "$serving_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
if [[ ! "$test_name" =~ ^serving_ ]]; then
|
||||
@ -335,20 +368,25 @@ run_serving_tests() {
|
||||
continue
|
||||
fi
|
||||
|
||||
# get client and server arguments
|
||||
# get client and server arguments (after merged the default parameters)
|
||||
server_params=$(echo "$params" | jq -r '.server_parameters')
|
||||
server_envs=$(echo "$params" | jq -r '.server_environment_variables')
|
||||
client_params=$(echo "$params" | jq -r '.client_parameters')
|
||||
|
||||
server_args=$(json2args "$server_params")
|
||||
server_envs=$(json2envs "$server_envs")
|
||||
client_args=$(json2args "$client_params")
|
||||
|
||||
# qps_list
|
||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||
echo "Running over qps list $qps_list"
|
||||
|
||||
# max_concurrency_list (fallback to num_prompts if missing)
|
||||
max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
|
||||
if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
|
||||
num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
|
||||
max_concurrency_list="[$num_prompts]"
|
||||
num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
|
||||
max_concurrency_list="[$num_prompts]"
|
||||
fi
|
||||
max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
|
||||
echo "Running over max concurrency list $max_concurrency_list"
|
||||
|
||||
@ -1,610 +0,0 @@
|
||||
[
|
||||
{
|
||||
"test_name": "serving_llama8B_bf16_tp1_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_bf16_tp2_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_bf16_tp4_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_bf16_tp1_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_bf16_tp2_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_bf16_tp4_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp1_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp2_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp4_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp1_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp2_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp4_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp1_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp2_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp4_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp1_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp2_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp4_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
}
|
||||
]
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,276 +1,246 @@
|
||||
[
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 32
|
||||
}
|
||||
{
|
||||
"defaults": {
|
||||
"qps_list": [
|
||||
"inf"
|
||||
],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_sharegpt",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 32
|
||||
}
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_128_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 32
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_128_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 32
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_128_2048",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 2048,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 32
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_128_2048",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 2048,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 32
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_2048_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 32
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_2048_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 32
|
||||
}
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 200
|
||||
}
|
||||
]
|
||||
},
|
||||
"tests": [
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_sharegpt",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_128_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_128_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_128_2048",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_128_2048",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_128_2048",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_2048_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_2048_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_2048_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama3B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_granite2B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "ibm-granite/granite-3.2-2b-instruct",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "ibm-granite/granite-3.2-2b-instruct",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_qwen1.7B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "Qwen/Qwen3-1.7B",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "Qwen/Qwen3-1.7B",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_qwen4B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "Qwen/Qwen3-4B",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "Qwen/Qwen3-4B",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_qwen8B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "Qwen/Qwen3-8B",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "Qwen/Qwen3-8B",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_glm9B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "zai-org/glm-4-9b-hf",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "zai-org/glm-4-9b-hf",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_gemma7B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "google/gemma-7b",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "google/gemma-7b",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@ -8,7 +8,7 @@ steps:
|
||||
commands:
|
||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
@ -30,19 +30,6 @@ steps:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# x86 + CUDA builds
|
||||
- label: "Build wheel - CUDA 12.8"
|
||||
depends_on: ~
|
||||
id: build-wheel-cuda-12-8
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build wheel - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-wheel-cuda-12-9
|
||||
@ -109,7 +96,6 @@ steps:
|
||||
- label: "Annotate release workflow"
|
||||
depends_on:
|
||||
- create-multi-arch-manifest
|
||||
- build-wheel-cuda-12-8
|
||||
id: annotate-release-workflow
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
|
||||
369
.buildkite/scripts/generate-nightly-index.py
Normal file
369
.buildkite/scripts/generate-nightly-index.py
Normal file
@ -0,0 +1,369 @@
|
||||
#!/usr/bin/env python3
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# do not complain about line length (for docstring)
|
||||
# ruff: noqa: E501
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from dataclasses import asdict, dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from urllib.parse import quote
|
||||
|
||||
if not sys.version_info >= (3, 12):
|
||||
raise RuntimeError("This script requires Python 3.12 or higher.")
|
||||
|
||||
INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
|
||||
<html>
|
||||
<meta name="pypi:repository-version" content="1.0">
|
||||
<body>
|
||||
{items}
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
|
||||
@dataclass
|
||||
class WheelFileInfo:
|
||||
package_name: str
|
||||
version: str
|
||||
build_tag: str | None
|
||||
python_tag: str
|
||||
abi_tag: str
|
||||
platform_tag: str
|
||||
variant: str | None
|
||||
filename: str
|
||||
|
||||
|
||||
def parse_from_filename(file: str) -> WheelFileInfo:
|
||||
"""
|
||||
Parse wheel file name to extract metadata.
|
||||
|
||||
The format of wheel names:
|
||||
{package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl
|
||||
All versions could contain a variant like '+cu129' or '.cpu' or `.rocm` (or not).
|
||||
Example:
|
||||
vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl
|
||||
vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl
|
||||
vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl
|
||||
vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl
|
||||
"""
|
||||
wheel_file_re = re.compile(
|
||||
r"^(?P<package_name>.+)-(?P<version>[^-]+?)(-(?P<build_tag>[^-]+))?-(?P<python_tag>[^-]+)-(?P<abi_tag>[^-]+)-(?P<platform_tag>[^-]+)\.whl$"
|
||||
)
|
||||
match = wheel_file_re.match(file)
|
||||
if not match:
|
||||
raise ValueError(f"Invalid wheel file name: {file}")
|
||||
|
||||
package_name = match.group("package_name")
|
||||
version = match.group("version")
|
||||
build_tag = match.group("build_tag")
|
||||
python_tag = match.group("python_tag")
|
||||
abi_tag = match.group("abi_tag")
|
||||
platform_tag = match.group("platform_tag")
|
||||
|
||||
# extract variant from version
|
||||
variant = None
|
||||
if "dev" in version:
|
||||
ver_after_dev = version.split("dev")[-1]
|
||||
if "." in ver_after_dev:
|
||||
variant = ver_after_dev.split(".")[-1]
|
||||
version = version.removesuffix("." + variant)
|
||||
else:
|
||||
if "+" in version:
|
||||
version, variant = version.split("+")
|
||||
|
||||
return WheelFileInfo(
|
||||
package_name=package_name,
|
||||
version=version,
|
||||
build_tag=build_tag,
|
||||
python_tag=python_tag,
|
||||
abi_tag=abi_tag,
|
||||
platform_tag=platform_tag,
|
||||
variant=variant,
|
||||
filename=file,
|
||||
)
|
||||
|
||||
|
||||
def generate_project_list(subdir_names: list[str]) -> str:
|
||||
"""
|
||||
Generate project list HTML content linking to each project & variant sub-directory.
|
||||
"""
|
||||
href_tags = []
|
||||
for name in sorted(subdir_names):
|
||||
name = name.strip("/").strip(".")
|
||||
href_tags.append(f' <a href="{name}/">{name}/</a><br/>')
|
||||
return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
|
||||
|
||||
|
||||
def generate_package_index_and_metadata(
|
||||
wheel_files: list[WheelFileInfo], wheel_base_dir: Path, index_base_dir: Path
|
||||
) -> tuple[str, str]:
|
||||
"""
|
||||
Generate package index HTML content for a specific package, linking to actual wheel files.
|
||||
"""
|
||||
href_tags = []
|
||||
metadata = []
|
||||
for file in sorted(wheel_files, key=lambda x: x.filename):
|
||||
relative_path = (
|
||||
wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename
|
||||
)
|
||||
# handle with '+' in URL, and avoid double-encoding '/' and already-encoded '%2B'
|
||||
# NOTE: this is AWS S3 specific behavior!
|
||||
file_path_quoted = quote(relative_path.as_posix(), safe=":%/")
|
||||
href_tags.append(f' <a href="{file_path_quoted}">{file.filename}</a><br/>')
|
||||
file_meta = asdict(file)
|
||||
file_meta["path"] = file_path_quoted
|
||||
metadata.append(file_meta)
|
||||
index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
|
||||
metadata_str = json.dumps(metadata, indent=2)
|
||||
return index_str, metadata_str
|
||||
|
||||
|
||||
def generate_index_and_metadata(
|
||||
whl_files: list[str],
|
||||
wheel_base_dir: Path,
|
||||
index_base_dir: Path,
|
||||
default_variant: str | None = None,
|
||||
alias_to_default: str | None = None,
|
||||
):
|
||||
"""
|
||||
Generate index for all wheel files.
|
||||
|
||||
Args:
|
||||
whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`).
|
||||
wheel_base_dir (Path): Base directory for wheel files.
|
||||
index_base_dir (Path): Base directory to store index files.
|
||||
default_variant (str | None): The default variant name, if any.
|
||||
alias_to_default (str | None): Alias variant name for the default variant, if any.
|
||||
|
||||
First, parse all wheel files to extract metadata.
|
||||
We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
|
||||
The index for the default variant (if any) is generated in the root index directory.
|
||||
|
||||
If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
|
||||
is purely a copy of the corresponding variant index, with only the links adjusted.
|
||||
Otherwise, all wheels without variant suffixes are treated as the default variant.
|
||||
|
||||
If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
|
||||
as the default variant index, but the links are adjusted accordingly.
|
||||
|
||||
Index directory structure:
|
||||
index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
|
||||
index.html # project list, linking to "vllm/" and other packages, and all variant sub-directories
|
||||
vllm/
|
||||
index.html # package index, pointing to actual files in wheel_base_dir (relative path)
|
||||
metadata.json # machine-readable metadata for all wheels in this package
|
||||
cpu/ # cpu variant sub-directory
|
||||
index.html
|
||||
vllm/
|
||||
index.html
|
||||
metadata.json
|
||||
cu129/ # cu129 is actually the alias to default variant
|
||||
index.html
|
||||
vllm/
|
||||
index.html
|
||||
metadata.json
|
||||
cu130/ # cu130 variant sub-directory
|
||||
index.html
|
||||
vllm/
|
||||
index.html
|
||||
metadata.json
|
||||
...
|
||||
|
||||
metadata.json stores a dump of all wheel files' metadata in a machine-readable format:
|
||||
[
|
||||
{
|
||||
"package_name": "vllm",
|
||||
"version": "0.10.2rc2",
|
||||
"build_tag": null,
|
||||
"python_tag": "cp38",
|
||||
"abi_tag": "abi3",
|
||||
"platform_tag": "manylinux2014_aarch64",
|
||||
"variant": "cu129",
|
||||
"filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl",
|
||||
"path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl" # to be concatenated with the directory URL and URL-encoded
|
||||
},
|
||||
...
|
||||
]
|
||||
"""
|
||||
|
||||
parsed_files = [parse_from_filename(f) for f in whl_files]
|
||||
|
||||
if not parsed_files:
|
||||
print("No wheel files found, skipping index generation.")
|
||||
return
|
||||
|
||||
# Group by variant
|
||||
variant_to_files: dict[str, list[WheelFileInfo]] = {}
|
||||
for file in parsed_files:
|
||||
variant = file.variant or "default"
|
||||
if variant not in variant_to_files:
|
||||
variant_to_files[variant] = []
|
||||
variant_to_files[variant].append(file)
|
||||
|
||||
print(f"Found variants: {list(variant_to_files.keys())}")
|
||||
|
||||
# sanity check for default variant
|
||||
if default_variant:
|
||||
if "default" in variant_to_files:
|
||||
raise ValueError(
|
||||
"All wheel files must have variant suffixes when `default_variant` is specified."
|
||||
)
|
||||
if default_variant not in variant_to_files:
|
||||
raise ValueError(
|
||||
f"Default variant '{default_variant}' not found among wheel files."
|
||||
)
|
||||
|
||||
if alias_to_default:
|
||||
if "default" not in variant_to_files:
|
||||
# e.g. only some wheels are uploaded to S3 currently
|
||||
print(
|
||||
"[WARN] Alias to default variant specified, but no default variant found."
|
||||
)
|
||||
elif alias_to_default in variant_to_files:
|
||||
raise ValueError(
|
||||
f"Alias variant name '{alias_to_default}' already exists among wheel files."
|
||||
)
|
||||
else:
|
||||
variant_to_files[alias_to_default] = variant_to_files["default"].copy()
|
||||
print(f"Alias variant '{alias_to_default}' created for default variant.")
|
||||
|
||||
# Generate index for each variant
|
||||
subdir_names = set()
|
||||
for variant, files in variant_to_files.items():
|
||||
if variant == "default":
|
||||
variant_dir = index_base_dir
|
||||
else:
|
||||
variant_dir = index_base_dir / variant
|
||||
subdir_names.add(variant)
|
||||
|
||||
variant_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# gather all package names in this variant
|
||||
packages = set(f.package_name for f in files)
|
||||
if variant == "default":
|
||||
# these packages should also appear in the "project list"
|
||||
# generate after all variants are processed
|
||||
subdir_names = subdir_names.union(packages)
|
||||
else:
|
||||
# generate project list for this variant directly
|
||||
project_list_str = generate_project_list(sorted(packages))
|
||||
with open(variant_dir / "index.html", "w") as f:
|
||||
f.write(project_list_str)
|
||||
|
||||
for package in packages:
|
||||
# filter files belonging to this package only
|
||||
package_files = [f for f in files if f.package_name == package]
|
||||
package_dir = variant_dir / package
|
||||
package_dir.mkdir(parents=True, exist_ok=True)
|
||||
index_str, metadata_str = generate_package_index_and_metadata(
|
||||
package_files, wheel_base_dir, package_dir
|
||||
)
|
||||
with open(package_dir / "index.html", "w") as f:
|
||||
f.write(index_str)
|
||||
with open(package_dir / "metadata.json", "w") as f:
|
||||
f.write(metadata_str)
|
||||
|
||||
# Generate top-level project list index
|
||||
project_list_str = generate_project_list(sorted(subdir_names))
|
||||
with open(index_base_dir / "index.html", "w") as f:
|
||||
f.write(project_list_str)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
"""
|
||||
Arguments:
|
||||
--version <version> : version string for the current build (e.g., commit hash)
|
||||
--current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
|
||||
--output-dir <output_directory> : directory to store generated index files
|
||||
--alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
|
||||
"""
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Process nightly build wheel files to generate indices."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--version",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Version string for the current build (e.g., commit hash)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--current-objects",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to JSON file containing current S3 objects listing in this version directory",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-dir",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Directory to store generated index files",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--alias-to-default",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Alias variant name for the default variant",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
version = args.version
|
||||
if "/" in version or "\\" in version:
|
||||
raise ValueError("Version string must not contain slashes.")
|
||||
current_objects_path = Path(args.current_objects)
|
||||
output_dir = Path(args.output_dir)
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Read current objects JSON
|
||||
with open(current_objects_path) as f:
|
||||
current_objects: dict[str, list[dict[str, Any]]] = json.load(f)
|
||||
|
||||
# current_objects looks like from list_objects_v2 S3 API:
|
||||
"""
|
||||
"Contents": [
|
||||
{
|
||||
"Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl",
|
||||
"LastModified": "2025-11-28T14:00:32+00:00",
|
||||
"ETag": "\"37a38339c7cdb61ca737021b968075df-52\"",
|
||||
"ChecksumAlgorithm": [
|
||||
"CRC64NVME"
|
||||
],
|
||||
"ChecksumType": "FULL_OBJECT",
|
||||
"Size": 435649349,
|
||||
"StorageClass": "STANDARD"
|
||||
},
|
||||
...
|
||||
]
|
||||
"""
|
||||
|
||||
# Extract wheel file keys
|
||||
wheel_files = []
|
||||
for item in current_objects.get("Contents", []):
|
||||
key: str = item["Key"]
|
||||
if key.endswith(".whl"):
|
||||
wheel_files.append(key.split("/")[-1]) # only the filename is used
|
||||
|
||||
print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
|
||||
|
||||
# Generate index and metadata, assuming wheels and indices are stored as:
|
||||
# s3://vllm-wheels/{version}/<wheel files>
|
||||
# s3://vllm-wheels/<anything>/<index files>
|
||||
wheel_base_dir = Path(output_dir).parent / version
|
||||
index_base_dir = Path(output_dir)
|
||||
|
||||
generate_index_and_metadata(
|
||||
whl_files=wheel_files,
|
||||
wheel_base_dir=wheel_base_dir,
|
||||
index_base_dir=index_base_dir,
|
||||
default_variant=None,
|
||||
alias_to_default=args.alias_to_default,
|
||||
)
|
||||
print(f"Successfully generated index and metadata in {output_dir}")
|
||||
@ -2,6 +2,28 @@
|
||||
|
||||
set -ex
|
||||
|
||||
# ======== part 0: setup ========
|
||||
|
||||
BUCKET="vllm-wheels"
|
||||
INDICES_OUTPUT_DIR="indices"
|
||||
DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py
|
||||
PYTHON=${PYTHON_PROG:=python3} # try to read from env var, otherwise use python3
|
||||
SUBPATH=$BUILDKITE_COMMIT
|
||||
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
|
||||
|
||||
# detect if python3.10+ is available
|
||||
has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)")
|
||||
if [[ "$has_new_python" -eq 0 ]]; then
|
||||
# use new python from docker
|
||||
docker pull python:3-slim
|
||||
PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3"
|
||||
fi
|
||||
|
||||
echo "Using python interpreter: $PYTHON"
|
||||
echo "Python version: $($PYTHON --version)"
|
||||
|
||||
# ========= part 1: collect, rename & upload the wheel ==========
|
||||
|
||||
# Assume wheels are in artifacts/dist/*.whl
|
||||
wheel_files=(artifacts/dist/*.whl)
|
||||
|
||||
@ -10,74 +32,69 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
|
||||
echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Get the single wheel file
|
||||
wheel="${wheel_files[0]}"
|
||||
|
||||
# Detect architecture and rename 'linux' to appropriate manylinux version
|
||||
arch=$(uname -m)
|
||||
if [[ $arch == "x86_64" ]]; then
|
||||
manylinux_version="manylinux1"
|
||||
elif [[ $arch == "aarch64" ]]; then
|
||||
manylinux_version="manylinux2014"
|
||||
else
|
||||
echo "Warning: Unknown architecture $arch, using manylinux1 as default"
|
||||
manylinux_version="manylinux1"
|
||||
fi
|
||||
# current build image uses ubuntu 20.04, which corresponds to manylinux_2_31
|
||||
# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
|
||||
manylinux_version="manylinux_2_31"
|
||||
|
||||
# Rename 'linux' to the appropriate manylinux version in the wheel filename
|
||||
if [[ "$wheel" != *"linux"* ]]; then
|
||||
echo "Error: Wheel filename does not contain 'linux': $wheel"
|
||||
exit 1
|
||||
fi
|
||||
new_wheel="${wheel/linux/$manylinux_version}"
|
||||
mv -- "$wheel" "$new_wheel"
|
||||
wheel="$new_wheel"
|
||||
echo "Renamed wheel to: $wheel"
|
||||
|
||||
# Extract the version from the wheel
|
||||
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
|
||||
echo "Version: $version"
|
||||
echo "Version in wheel: $version"
|
||||
pure_version="${version%%+*}"
|
||||
echo "Pure version (without variant): $pure_version"
|
||||
|
||||
normal_wheel="$wheel" # Save the original wheel filename
|
||||
# copy wheel to its own bucket
|
||||
aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
|
||||
|
||||
# If the version contains "dev", rename it to v1.0.0.dev for consistency
|
||||
if [[ $version == *dev* ]]; then
|
||||
suffix="${version##*.}"
|
||||
if [[ $suffix == cu* ]]; then
|
||||
new_version="1.0.0.dev+${suffix}"
|
||||
else
|
||||
new_version="1.0.0.dev"
|
||||
fi
|
||||
new_wheel="${wheel/$version/$new_version}"
|
||||
# use cp to keep both files in the artifacts directory
|
||||
cp -- "$wheel" "$new_wheel"
|
||||
wheel="$new_wheel"
|
||||
version="$new_version"
|
||||
fi
|
||||
# ========= part 2: generate and upload indices ==========
|
||||
# generate indices for all existing wheels in the commit directory
|
||||
# this script might be run multiple times if there are multiple variants being built
|
||||
# so we need to guarantee there is little chance for "TOCTOU" issues
|
||||
# i.e., one process is generating indices while another is uploading a new wheel
|
||||
# so we need to ensure no time-consuming operations happen below
|
||||
|
||||
# Upload the wheel to S3
|
||||
python3 .buildkite/generate_index.py --wheel "$normal_wheel"
|
||||
# list all wheels in the commit directory
|
||||
echo "Existing wheels on S3:"
|
||||
aws s3 ls "$S3_COMMIT_PREFIX"
|
||||
obj_json="objects.json"
|
||||
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
|
||||
mkdir -p "$INDICES_OUTPUT_DIR"
|
||||
|
||||
# generate index for this commit
|
||||
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
||||
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
||||
|
||||
if [[ $normal_wheel == *"cu129"* ]]; then
|
||||
# only upload index.html for cu129 wheels (default wheels) as it
|
||||
# is available on both x86 and arm64
|
||||
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
|
||||
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
|
||||
# call script to generate indicies for all existing wheels
|
||||
# this indices have relative paths that could work as long as it is next to the wheel directory in s3
|
||||
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
|
||||
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
|
||||
if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
|
||||
alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
|
||||
else
|
||||
echo "Skipping index files for non-cu129 wheels"
|
||||
alias_arg=""
|
||||
fi
|
||||
|
||||
# generate index for nightly
|
||||
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
|
||||
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
|
||||
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg
|
||||
|
||||
if [[ $normal_wheel == *"cu129"* ]]; then
|
||||
# only upload index.html for cu129 wheels (default wheels) as it
|
||||
# is available on both x86 and arm64
|
||||
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
|
||||
else
|
||||
echo "Skipping index files for non-cu129 wheels"
|
||||
# copy indices to /<commit>/ unconditionally
|
||||
echo "Uploading indices to $S3_COMMIT_PREFIX"
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
|
||||
|
||||
# copy to /nightly/ only if it is on the main branch and not a PR
|
||||
if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
|
||||
echo "Uploading indices to overwrite /nightly/"
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
|
||||
fi
|
||||
|
||||
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
|
||||
aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
|
||||
# copy to /<pure_version>/ only if it does not have "dev" in the version
|
||||
if [[ "$version" != *"dev"* ]]; then
|
||||
echo "Uploading indices to overwrite /$pure_version/"
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
|
||||
fi
|
||||
|
||||
@ -715,6 +715,7 @@ steps:
|
||||
# we can only upgrade after this is resolved
|
||||
# TODO(jerryzh168): resolve the above comment
|
||||
- uv pip install --system torchao==0.13.0
|
||||
- uv pip install --system conch-triton-kernels
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
||||
|
||||
- label: LM Eval Small Models # 15min
|
||||
@ -934,6 +935,18 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s models/language/pooling_mteb_test
|
||||
|
||||
- label: Multi-Modal Processor Test (CPU)
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_1
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
no_gpu: true
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
|
||||
|
||||
- label: Multi-Modal Processor Test # 44min
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
@ -1472,14 +1485,14 @@ steps:
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- pytest -v -s tests/compile/distributed/test_async_tp.py
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
|
||||
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
#- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
|
||||
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||
- pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||
- HIP_VISIBLE_DEVICES=0,1 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
##### B200 test #####
|
||||
|
||||
@ -215,7 +215,6 @@ steps:
|
||||
timeout_in_minutes: 10
|
||||
gpu: h100
|
||||
num_gpus: 8
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- examples/offline_inference/torchrun_dp_example.py
|
||||
@ -391,20 +390,24 @@ steps:
|
||||
- examples/
|
||||
commands:
|
||||
- pip install tensorizer # for tensorizer test
|
||||
# for basic
|
||||
- python3 offline_inference/basic/chat.py
|
||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||
- python3 offline_inference/basic/chat.py
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
- python3 offline_inference/vision_language_pooling.py --seed 0
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
- python3 offline_inference/basic/classify.py
|
||||
- python3 offline_inference/basic/embed.py
|
||||
- python3 offline_inference/basic/score.py
|
||||
# for multi-modal models
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
# for pooling models
|
||||
- python3 pooling/pooling/vision_language_pooling.py --seed 0
|
||||
# for features demo
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
||||
@ -1370,4 +1373,4 @@ steps:
|
||||
num_gpus: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
4
.github/CODEOWNERS
vendored
4
.github/CODEOWNERS
vendored
@ -146,10 +146,10 @@ mkdocs.yaml @hmellor
|
||||
/requirements/kv_connectors.txt @NickLucche
|
||||
|
||||
# Pooling models
|
||||
/examples/*/pooling/ @noooop
|
||||
/examples/pooling @noooop
|
||||
/tests/models/*/pooling* @noooop
|
||||
/tests/entrypoints/pooling @noooop
|
||||
/vllm/entrypoints/pooling @aarnphm @chaunceyjiang @noooop
|
||||
/vllm/entrypoints/pooling @noooop
|
||||
/vllm/config/pooler.py @noooop
|
||||
/vllm/pooling_params.py @noooop
|
||||
/vllm/model_executor/layers/pooler.py @noooop
|
||||
|
||||
2
.github/workflows/cleanup_pr_body.yml
vendored
2
.github/workflows/cleanup_pr_body.yml
vendored
@ -16,7 +16,7 @@ jobs:
|
||||
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
|
||||
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
|
||||
with:
|
||||
python-version: '3.12'
|
||||
|
||||
|
||||
2
.github/workflows/pre-commit.yml
vendored
2
.github/workflows/pre-commit.yml
vendored
@ -17,7 +17,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
|
||||
- uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
|
||||
- uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
|
||||
with:
|
||||
python-version: "3.12"
|
||||
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
|
||||
|
||||
@ -108,7 +108,10 @@ def benchmark_batched_propose(args):
|
||||
device_config=DeviceConfig(device=current_platform.device_type),
|
||||
parallel_config=ParallelConfig(),
|
||||
load_config=LoadConfig(),
|
||||
scheduler_config=SchedulerConfig(),
|
||||
scheduler_config=SchedulerConfig(
|
||||
max_model_len=model_config.max_model_len,
|
||||
is_encoder_decoder=model_config.is_encoder_decoder,
|
||||
),
|
||||
)
|
||||
|
||||
# monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
|
||||
|
||||
@ -40,7 +40,7 @@ from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
try:
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
except ImportError:
|
||||
from backend_request_func import get_tokenizer
|
||||
|
||||
|
||||
@ -46,7 +46,7 @@ from tqdm.asyncio import tqdm
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
try:
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
except ImportError:
|
||||
from backend_request_func import get_tokenizer
|
||||
|
||||
|
||||
@ -93,16 +93,16 @@ torch::Tensor dynamic_4bit_int_moe_cpu(
|
||||
}
|
||||
auto Y_all = at::empty({offsets[E], H}, x_c.options());
|
||||
|
||||
at::parallel_for(0, E, 1, [&](int64_t e_begin, int64_t e_end) {
|
||||
at::parallel_for(0, offsets[E], 0, [&](int64_t idx_begin, int64_t idx_end) {
|
||||
c10::InferenceMode guard;
|
||||
for (int64_t e = e_begin; e < e_end; ++e) {
|
||||
const int64_t te = counts[e];
|
||||
if (te == 0) {
|
||||
for (int64_t e = 0; e < E; ++e) {
|
||||
int64_t start = std::max(offsets[e], idx_begin);
|
||||
int64_t end = std::min(offsets[e + 1], idx_end);
|
||||
int64_t te = end - start;
|
||||
if (te <= 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const int64_t start = offsets[e];
|
||||
|
||||
auto x_e = X_all.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);
|
||||
|
||||
auto w13_e = w13_packed.select(/*dim=*/0, e);
|
||||
|
||||
@ -364,7 +364,12 @@ RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && \
|
||||
cuda-cudart-${CUDA_VERSION_DASH} \
|
||||
cuda-nvrtc-${CUDA_VERSION_DASH} \
|
||||
cuda-cuobjdump-${CUDA_VERSION_DASH} \
|
||||
libcublas-${CUDA_VERSION_DASH} && \
|
||||
# https://github.com/vllm-project/vllm/issues/29590
|
||||
libcurand-dev-${CUDA_VERSION_DASH} \
|
||||
libcublas-${CUDA_VERSION_DASH} \
|
||||
# Fixes nccl_allocator requiring nccl.h at runtime
|
||||
# https://github.com/vllm-project/vllm/blob/1336a1ea244fa8bfd7e72751cabbdb5b68a0c11a/vllm/distributed/device_communicators/pynccl_allocator.py#L22
|
||||
libnccl-dev && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
ARG PIP_INDEX_URL UV_INDEX_URL
|
||||
|
||||
@ -5,11 +5,7 @@ nav:
|
||||
- Getting Started:
|
||||
- getting_started/quickstart.md
|
||||
- getting_started/installation
|
||||
- Examples:
|
||||
- examples/README.md
|
||||
- Offline Inference: examples/offline_inference
|
||||
- Online Serving: examples/online_serving
|
||||
- Others: examples/others
|
||||
- Examples: examples
|
||||
- General:
|
||||
- usage/v1_guide.md
|
||||
- usage/*
|
||||
|
||||
@ -79,7 +79,7 @@ The `post_process*` methods take `PoolingRequestOutput` objects as input and gen
|
||||
The `validate_or_generate_params` method is used for validating with the plugin any `SamplingParameters`/`PoolingParameters` received with the user request, or to generate new ones if none are specified. The function always returns the validated/generated parameters.
|
||||
The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/pooling` serving endpoint is available here [vllm/entrypoints/openai/serving_pooling.py](../../vllm/entrypoints/pooling/pooling/serving.py).
|
||||
|
||||
An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). Please, also refer to our online ([examples/online_serving/pooling/prithvi_geospatial_mae.py](../../examples/online_serving/pooling/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py)) inference examples.
|
||||
An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). Please, also refer to our online ([examples/pooling/plugin/prithvi_geospatial_mae_client.py](../../examples/pooling/plugin/prithvi_geospatial_mae_client.py)) and offline ([examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py](../../examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py)) inference examples.
|
||||
|
||||
## Using an IO Processor plugin
|
||||
|
||||
|
||||
@ -46,10 +46,23 @@ vLLM is a Python library that supports the following CPU variants. Select your C
|
||||
|
||||
### Pre-built wheels
|
||||
|
||||
Currently, there are no pre-built CPU wheels.
|
||||
Please refer to the instructions for [pre-built wheels on GPU](./gpu.md#pre-built-wheels).
|
||||
|
||||
When specifying the index URL, please make sure to use the `cpu` variant subdirectory.
|
||||
For example, the nightly build index is: `https://wheels.vllm.ai/nightly/cpu/`.
|
||||
|
||||
### Build wheel from source
|
||||
|
||||
#### Set up using Python-only build (without compilation) {#python-only-build}
|
||||
|
||||
Please refer to the instructions for [Python-only build on GPU](./gpu.md#python-only-build), and replace the build commands with:
|
||||
|
||||
```bash
|
||||
VLLM_USE_PRECOMPILED=1 VLLM_PRECOMPILED_WHEEL_VARIANT=cpu VLLM_TARGET_DEVICE=cpu uv pip install --editable .
|
||||
```
|
||||
|
||||
#### Full build (with compilation) {#full-build}
|
||||
|
||||
=== "Intel/AMD x86"
|
||||
|
||||
--8<-- "docs/getting_started/installation/cpu.x86.inc.md:build-wheel-from-source"
|
||||
@ -125,6 +138,35 @@ vllm serve facebook/opt-125m --dtype=bfloat16
|
||||
|
||||
Note, it is recommended to manually reserve 1 CPU for vLLM front-end process when `world_size == 1`.
|
||||
|
||||
### What are supported models on CPU?
|
||||
|
||||
For the full and up-to-date list of models validated on CPU platforms, please see the official documentation: [Supported Models on CPU](https://docs.vllm.ai/en/latest/models/hardware_supported_models/cpu)
|
||||
|
||||
### How to find benchmark configuration examples for supported CPU models?
|
||||
|
||||
For any model listed under [Supported Models on CPU](https://docs.vllm.ai/en/latest/models/hardware_supported_models/cpu), optimized runtime configurations are provided in the vLLM Benchmark Suite’s CPU test cases, defined in [cpu test cases](https://github.com/vllm-project/vllm/blob/main/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json)
|
||||
For details on how these optimized configurations are determined, see: [performance-benchmark-details](https://github.com/vllm-project/vllm/tree/main/.buildkite/performance-benchmarks#performance-benchmark-details).
|
||||
To benchmark the supported models using these optimized settings, follow the steps in [running vLLM Benchmark Suite manually](https://docs.vllm.ai/en/latest/contributing/benchmarks/#manually-trigger-the-benchmark) and run the Benchmark Suite on a CPU environment.
|
||||
|
||||
Below is an example command to benchmark all CPU-supported models using optimized configurations.
|
||||
|
||||
```bash
|
||||
ON_CPU=1 bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
|
||||
```
|
||||
|
||||
The benchmark results will be saved in `./benchmark/results/`.
|
||||
In the directory, the generated `.commands` files contain all example commands for the benchmark.
|
||||
|
||||
We recommend configuring tensor-parallel-size to match the number of NUMA nodes on your system. Note that the current release does not support tensor-parallel-size=6.
|
||||
To determine the number of NUMA nodes available, use the following command:
|
||||
|
||||
```bash
|
||||
lscpu | grep "NUMA node(s):" | awk '{print $3}'
|
||||
```
|
||||
|
||||
For performance reference, users may also consult the [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm&deviceName=cpu)
|
||||
, which publishes default-model CPU results produced using the same Benchmark Suite.
|
||||
|
||||
### How to decide `VLLM_CPU_OMP_THREADS_BIND`?
|
||||
|
||||
- Default `auto` thread-binding is recommended for most cases. Ideally, each OpenMP thread will be bound to a dedicated physical core respectively, threads of each rank will be bound to the same NUMA node respectively, and 1 CPU per rank will be reserved for other vLLM components when `world_size > 1`. If you have any performance problems or unexpected binding behaviours, please try to bind threads as following.
|
||||
|
||||
@ -26,42 +26,49 @@ uv pip install vllm --torch-backend=auto
|
||||
|
||||
??? console "pip"
|
||||
```bash
|
||||
# Install vLLM with CUDA 12.8.
|
||||
pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128
|
||||
# Install vLLM with CUDA 12.9.
|
||||
pip install vllm --extra-index-url https://download.pytorch.org/whl/cu129
|
||||
```
|
||||
|
||||
We recommend leveraging `uv` to [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu126`), set `--torch-backend=cu126` (or `UV_TORCH_BACKEND=cu126`). If this doesn't work, try running `uv self update` to update `uv` first.
|
||||
We recommend leveraging `uv` to [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu128`), set `--torch-backend=cu128` (or `UV_TORCH_BACKEND=cu128`). If this doesn't work, try running `uv self update` to update `uv` first.
|
||||
|
||||
!!! note
|
||||
NVIDIA Blackwell GPUs (B200, GB200) require a minimum of CUDA 12.8, so make sure you are installing PyTorch wheels with at least that version. PyTorch itself offers a [dedicated interface](https://pytorch.org/get-started/locally/) to determine the appropriate pip command to run for a given target configuration.
|
||||
|
||||
As of now, vLLM's binaries are compiled with CUDA 12.8 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.6, 11.8, and public PyTorch release versions:
|
||||
As of now, vLLM's binaries are compiled with CUDA 12.9 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.8, 13.0, and public PyTorch release versions:
|
||||
|
||||
```bash
|
||||
# Install vLLM with a specific CUDA version (e.g., 11.8 or 12.6).
|
||||
# Install vLLM with a specific CUDA version (e.g., 13.0).
|
||||
export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
|
||||
export CUDA_VERSION=118 # or 126
|
||||
uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION}
|
||||
export CUDA_VERSION=130 # or other
|
||||
uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION}
|
||||
```
|
||||
|
||||
#### Install the latest code
|
||||
|
||||
LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on an x86 platform with CUDA 12 for every commit since `v0.5.3`.
|
||||
LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for every commit since `v0.5.3` on <https://wheels.vllm.ai/nightly>. There are multiple indices that could be used:
|
||||
|
||||
* `https://wheels.vllm.ai/nightly`: the default variant (CUDA with version specified in `VLLM_MAIN_CUDA_VERSION`) built with the last commit on the `main` branch. Currently it is CUDA 12.9.
|
||||
* `https://wheels.vllm.ai/nightly/<variant>`: all other variants. Now this includes `cu130`, and `cpu`. The default variant (`cu129`) also has a subdirectory to keep consistency.
|
||||
|
||||
To install from nightly index, run:
|
||||
|
||||
```bash
|
||||
uv pip install -U vllm \
|
||||
--torch-backend=auto \
|
||||
--extra-index-url https://wheels.vllm.ai/nightly
|
||||
--extra-index-url https://wheels.vllm.ai/nightly # add variant subdirectory here if needed
|
||||
```
|
||||
|
||||
??? console "pip"
|
||||
```bash
|
||||
pip install -U vllm \
|
||||
--pre \
|
||||
--extra-index-url https://wheels.vllm.ai/nightly
|
||||
```
|
||||
!!! warning "`pip` caveat"
|
||||
|
||||
`--pre` is required for `pip` to consider pre-released versions.
|
||||
Using `pip` to install from nightly indices is _not supported_, because `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. In contrast, `uv` gives the extra index [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes).
|
||||
|
||||
If you insist on using `pip`, you have to specify the full URL of the wheel file (which can be obtained from the web page).
|
||||
|
||||
```bash
|
||||
pip install -U https://wheels.vllm.ai/nightly/vllm-0.11.2.dev399%2Bg3c7461c18-cp38-abi3-manylinux_2_31_x86_64.whl # current nightly build (the filename will change!)
|
||||
pip install -U https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-0.11.2.dev399%2Bg3c7461c18-cp38-abi3-manylinux_2_31_x86_64.whl # from specific commit
|
||||
```
|
||||
|
||||
##### Install specific revisions
|
||||
|
||||
@ -71,33 +78,13 @@ If you want to access the wheels for previous commits (e.g. to bisect the behavi
|
||||
export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch
|
||||
uv pip install vllm \
|
||||
--torch-backend=auto \
|
||||
--extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}
|
||||
--extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} # add variant subdirectory here if needed
|
||||
```
|
||||
|
||||
The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
|
||||
|
||||
??? note "pip"
|
||||
If you want to access the wheels for previous commits (e.g. to bisect the behavior change,
|
||||
performance regression), due to the limitation of `pip`, you have to specify the full URL of the
|
||||
wheel file by embedding the commit hash in the URL:
|
||||
|
||||
```bash
|
||||
export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
|
||||
pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
|
||||
```
|
||||
|
||||
Note that the wheels are built with Python 3.8 ABI (see [PEP
|
||||
425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible
|
||||
with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a
|
||||
placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in
|
||||
the wheel metadata (the wheels listed in the extra index url have correct versions). Although we
|
||||
don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the
|
||||
wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
|
||||
|
||||
# --8<-- [end:pre-built-wheels]
|
||||
# --8<-- [start:build-wheel-from-source]
|
||||
|
||||
#### Set up using Python-only build (without compilation)
|
||||
#### Set up using Python-only build (without compilation) {#python-only-build}
|
||||
|
||||
If you only need to change Python code, you can build and install vLLM without compilation. Using `uv pip`'s [`--editable` flag](https://docs.astral.sh/uv/pip/packages/#editable-packages), changes you make to the code will be reflected when you run vLLM:
|
||||
|
||||
@ -121,18 +108,24 @@ This command will do the following:
|
||||
In case you see an error about wheel not found when running the above command, it might be because the commit you based on in the main branch was just merged and the wheel is being built. In this case, you can wait for around an hour to try again, or manually assign the previous commit in the installation using the `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable.
|
||||
|
||||
```bash
|
||||
export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch
|
||||
export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
|
||||
export VLLM_PRECOMPILED_WHEEL_COMMIT=$(git rev-parse HEAD~1) # or earlier commit on main
|
||||
export VLLM_USE_PRECOMPILED=1
|
||||
uv pip install --editable .
|
||||
```
|
||||
|
||||
There are more environment variables to control the behavior of Python-only build:
|
||||
|
||||
* `VLLM_PRECOMPILED_WHEEL_LOCATION`: specify the exact wheel URL or local file path of a pre-compiled wheel to use. All other logic to find the wheel will be skipped.
|
||||
* `VLLM_PRECOMPILED_WHEEL_COMMIT`: override the commit hash to download the pre-compiled wheel. It can be `nightly` to use the last **already built** commit on the main branch.
|
||||
* `VLLM_PRECOMPILED_WHEEL_VARIANT`: specify the variant subdirectory to use on the nightly index, e.g., `cu129`, `cpu`. If not specified, the CUDA variant with `VLLM_MAIN_CUDA_VERSION` will be tried, then fallback to the default variant on the remote index.
|
||||
|
||||
You can find more information about vLLM's wheels in [Install the latest code](#install-the-latest-code).
|
||||
|
||||
!!! note
|
||||
There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
|
||||
It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [Install the latest code](#install-the-latest-code) for instructions on how to install a specified wheel.
|
||||
|
||||
#### Full build (with compilation)
|
||||
#### Full build (with compilation) {#full-build}
|
||||
|
||||
If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes:
|
||||
|
||||
|
||||
@ -52,7 +52,7 @@ vLLM is a Python library that supports the following GPU variants. Select your G
|
||||
|
||||
--8<-- "docs/getting_started/installation/gpu.xpu.inc.md:set-up-using-python"
|
||||
|
||||
### Pre-built wheels
|
||||
### Pre-built wheels {#pre-built-wheels}
|
||||
|
||||
=== "NVIDIA CUDA"
|
||||
|
||||
|
||||
@ -2,7 +2,8 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import itertools
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses import dataclass
|
||||
from functools import cached_property
|
||||
from pathlib import Path
|
||||
from typing import Literal
|
||||
|
||||
@ -16,13 +17,18 @@ EXAMPLE_DIR = ROOT_DIR / "examples"
|
||||
EXAMPLE_DOC_DIR = ROOT_DIR / "docs/examples"
|
||||
|
||||
|
||||
def fix_case(text: str) -> str:
|
||||
def title(text: str) -> str:
|
||||
# Default title case
|
||||
text = text.replace("_", " ").replace("/", " - ").title()
|
||||
# Custom substitutions
|
||||
subs = {
|
||||
"io": "IO",
|
||||
"api": "API",
|
||||
"cli": "CLI",
|
||||
"cpu": "CPU",
|
||||
"llm": "LLM",
|
||||
"mae": "MAE",
|
||||
"ner": "NER",
|
||||
"tpu": "TPU",
|
||||
"gguf": "GGUF",
|
||||
"lora": "LoRA",
|
||||
@ -48,71 +54,65 @@ class Example:
|
||||
Attributes:
|
||||
path (Path): The path to the main directory or file.
|
||||
category (str): The category of the document.
|
||||
main_file (Path): The main file in the directory.
|
||||
other_files (list[Path]): list of other files in the directory.
|
||||
title (str): The title of the document.
|
||||
|
||||
Properties::
|
||||
main_file() -> Path | None: Determines the main file in the given path.
|
||||
other_files() -> list[Path]: Determines other files in the directory excluding
|
||||
the main file.
|
||||
title() -> str: Determines the title of the document.
|
||||
|
||||
Methods:
|
||||
__post_init__(): Initializes the main_file, other_files, and title attributes.
|
||||
determine_main_file() -> Path: Determines the main file in the given path.
|
||||
determine_other_files() -> list[Path]: Determines other files in the directory excluding the main file.
|
||||
determine_title() -> str: Determines the title of the document.
|
||||
generate() -> str: Generates the documentation content.
|
||||
""" # noqa: E501
|
||||
"""
|
||||
|
||||
path: Path
|
||||
category: str = None
|
||||
main_file: Path = field(init=False)
|
||||
other_files: list[Path] = field(init=False)
|
||||
title: str = field(init=False)
|
||||
category: str
|
||||
|
||||
def __post_init__(self):
|
||||
self.main_file = self.determine_main_file()
|
||||
self.other_files = self.determine_other_files()
|
||||
self.title = self.determine_title()
|
||||
@cached_property
|
||||
def main_file(self) -> Path | None:
|
||||
"""Determines the main file in the given path.
|
||||
|
||||
@property
|
||||
def is_code(self) -> bool:
|
||||
return self.main_file.suffix != ".md"
|
||||
If path is a file, it returns the path itself. If path is a directory, it
|
||||
searches for Markdown files (*.md) in the directory and returns the first one
|
||||
found. If no Markdown files are found, it returns None."""
|
||||
# Single file example
|
||||
if self.path.is_file():
|
||||
return self.path
|
||||
# Multi file example with a README
|
||||
if md_paths := list(self.path.glob("*.md")):
|
||||
return md_paths[0]
|
||||
# Multi file example without a README
|
||||
return None
|
||||
|
||||
def determine_main_file(self) -> Path:
|
||||
"""
|
||||
Determines the main file in the given path.
|
||||
If the path is a file, it returns the path itself. Otherwise, it searches
|
||||
for Markdown files (*.md) in the directory and returns the first one found.
|
||||
Returns:
|
||||
Path: The main file path, either the original path if it's a file or the first
|
||||
Markdown file found in the directory.
|
||||
Raises:
|
||||
IndexError: If no Markdown files are found in the directory.
|
||||
""" # noqa: E501
|
||||
return self.path if self.path.is_file() else list(self.path.glob("*.md")).pop()
|
||||
@cached_property
|
||||
def other_files(self) -> list[Path]:
|
||||
"""Determine other files in the directory excluding the main file.
|
||||
|
||||
def determine_other_files(self) -> list[Path]:
|
||||
"""
|
||||
Determine other files in the directory excluding the main file.
|
||||
|
||||
This method checks if the given path is a file. If it is, it returns an empty list.
|
||||
Otherwise, it recursively searches through the directory and returns a list of all
|
||||
files that are not the main file.
|
||||
|
||||
Returns:
|
||||
list[Path]: A list of Path objects representing the other files in the directory.
|
||||
""" # noqa: E501
|
||||
If path is a file, it returns an empty list. Otherwise, it returns every file
|
||||
in the directory except the main file in a list."""
|
||||
# Single file example
|
||||
if self.path.is_file():
|
||||
return []
|
||||
# Multi file example
|
||||
is_other_file = lambda file: file.is_file() and file != self.main_file
|
||||
return [file for file in self.path.rglob("*") if is_other_file(file)]
|
||||
return sorted(file for file in self.path.rglob("*") if is_other_file(file))
|
||||
|
||||
def determine_title(self) -> str:
|
||||
if not self.is_code:
|
||||
# Specify encoding for building on Windows
|
||||
with open(self.main_file, encoding="utf-8") as f:
|
||||
first_line = f.readline().strip()
|
||||
match = re.match(r"^#\s+(?P<title>.+)$", first_line)
|
||||
if match:
|
||||
return match.group("title")
|
||||
return fix_case(self.path.stem.replace("_", " ").title())
|
||||
@cached_property
|
||||
def is_code(self) -> bool:
|
||||
return self.main_file is not None and self.main_file.suffix != ".md"
|
||||
|
||||
@cached_property
|
||||
def title(self) -> str:
|
||||
# Generate title from filename if no main md file found
|
||||
if self.main_file is None or self.is_code:
|
||||
return title(self.path.stem)
|
||||
# Specify encoding for building on Windows
|
||||
with open(self.main_file, encoding="utf-8") as f:
|
||||
first_line = f.readline().strip()
|
||||
match = re.match(r"^#\s+(?P<title>.+)$", first_line)
|
||||
if match:
|
||||
return match.group("title")
|
||||
raise ValueError(f"Title not found in {self.main_file}")
|
||||
|
||||
def fix_relative_links(self, content: str) -> str:
|
||||
"""
|
||||
@ -156,24 +156,35 @@ class Example:
|
||||
# included files containing code fences too
|
||||
code_fence = "``````"
|
||||
|
||||
if self.is_code:
|
||||
content += (
|
||||
f"{code_fence}{self.main_file.suffix[1:]}\n"
|
||||
f'--8<-- "{self.main_file}"\n'
|
||||
f"{code_fence}\n"
|
||||
)
|
||||
if self.main_file is not None:
|
||||
# Single file example or multi file example with a README
|
||||
if self.is_code:
|
||||
content += (
|
||||
f"{code_fence}{self.main_file.suffix[1:]}\n"
|
||||
f'--8<-- "{self.main_file}"\n'
|
||||
f"{code_fence}\n"
|
||||
)
|
||||
else:
|
||||
with open(self.main_file, encoding="utf-8") as f:
|
||||
# Skip the title from md snippets as it's been included above
|
||||
main_content = f.readlines()[1:]
|
||||
content += self.fix_relative_links("".join(main_content))
|
||||
content += "\n"
|
||||
else:
|
||||
with open(self.main_file) as f:
|
||||
# Skip the title from md snippets as it's been included above
|
||||
main_content = f.readlines()[1:]
|
||||
content += self.fix_relative_links("".join(main_content))
|
||||
content += "\n"
|
||||
# Multi file example without a README
|
||||
for file in self.other_files:
|
||||
file_title = title(str(file.relative_to(self.path).with_suffix("")))
|
||||
content += f"## {file_title}\n\n"
|
||||
content += (
|
||||
f'{code_fence}{file.suffix[1:]}\n--8<-- "{file}"\n{code_fence}\n\n'
|
||||
)
|
||||
return content
|
||||
|
||||
if not self.other_files:
|
||||
return content
|
||||
|
||||
content += "## Example materials\n\n"
|
||||
for file in sorted(self.other_files):
|
||||
for file in self.other_files:
|
||||
content += f'??? abstract "{file.relative_to(self.path)}"\n'
|
||||
if file.suffix != ".md":
|
||||
content += f" {code_fence}{file.suffix[1:]}\n"
|
||||
@ -200,11 +211,13 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
|
||||
glob_patterns = ["*.py", "*.md", "*.sh"]
|
||||
# Find categorised examples
|
||||
for category in categories:
|
||||
logger.info("Processing category: %s", category.stem)
|
||||
globs = [category.glob(pattern) for pattern in glob_patterns]
|
||||
for path in itertools.chain(*globs):
|
||||
examples.append(Example(path, category.stem))
|
||||
# Find examples in subdirectories
|
||||
for path in category.glob("*/*.md"):
|
||||
globs = [category.glob(f"*/{pattern}") for pattern in glob_patterns]
|
||||
for path in itertools.chain(*globs):
|
||||
examples.append(Example(path.parent, category.stem))
|
||||
|
||||
# Generate the example documentation
|
||||
@ -217,3 +230,4 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
|
||||
with open(doc_path, "w+", encoding="utf-8") as f:
|
||||
f.write(example.generate())
|
||||
logger.debug("Example generated: %s", doc_path.relative_to(ROOT_DIR))
|
||||
logger.info("Total examples generated: %d", len(examples))
|
||||
|
||||
@ -274,7 +274,7 @@ outputs = llm.embed(
|
||||
print(outputs[0].outputs)
|
||||
```
|
||||
|
||||
A code example can be found here: [examples/offline_inference/pooling/embed_matryoshka_fy.py](../../examples/offline_inference/pooling/embed_matryoshka_fy.py)
|
||||
A code example can be found here: [examples/pooling/embed/embed_matryoshka_fy.py](../../examples/pooling/embed/embed_matryoshka_fy.py)
|
||||
|
||||
### Online Inference
|
||||
|
||||
@ -304,7 +304,7 @@ Expected output:
|
||||
{"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}}
|
||||
```
|
||||
|
||||
An OpenAI client example can be found here: [examples/online_serving/pooling/openai_embedding_matryoshka_fy.py](../../examples/online_serving/pooling/openai_embedding_matryoshka_fy.py)
|
||||
An OpenAI client example can be found here: [examples/pooling/embed/openai_embedding_matryoshka_fy.py](../../examples/pooling/embed/openai_embedding_matryoshka_fy.py)
|
||||
|
||||
## Deprecated Features
|
||||
|
||||
|
||||
@ -417,7 +417,8 @@ th {
|
||||
| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ |
|
||||
| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ |
|
||||
| `MiniMaxM2ForCausalLM` | MiniMax-M2 |`MiniMaxAI/MiniMax-M2`, etc. | | ✅︎ |
|
||||
| `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ |
|
||||
| `MistralForCausalLM` | Ministral-3, Mistral, Mistral-Instruct | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ |
|
||||
| `MistralLarge3ForCausalLM` | Mistral-Large-3-675B-Base-2512, Mistral-Large-3-675B-Instruct-2512 | `mistralai/Mistral-Large-3-675B-Base-2512`, `mistralai/Mistral-Large-3-675B-Instruct-2512`, etc. | ✅︎ | ✅︎ |
|
||||
| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ |
|
||||
| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ |
|
||||
| `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ |
|
||||
@ -567,7 +568,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
|
||||
```
|
||||
|
||||
!!! note
|
||||
Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/offline_inference/pooling/qwen3_reranker.py](../../examples/offline_inference/pooling/qwen3_reranker.py).
|
||||
Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/pooling/score/qwen3_reranker.py](../../examples/pooling/score/qwen3_reranker.py).
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
|
||||
@ -605,7 +606,7 @@ These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode)
|
||||
| `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | |
|
||||
|
||||
!!! note
|
||||
Named Entity Recognition (NER) usage, please refer to [examples/offline_inference/pooling/ner.py](../../examples/offline_inference/pooling/ner.py), [examples/online_serving/pooling/ner_client.py](../../examples/online_serving/pooling/ner_client.py).
|
||||
Named Entity Recognition (NER) usage, please refer to [examples/pooling/token_classify/ner.py](../../examples/pooling/token_classify/ner.py), [examples/pooling/token_classify/ner_client.py](../../examples/pooling/token_classify/ner_client.py).
|
||||
|
||||
## List of Multimodal Language Models
|
||||
|
||||
@ -711,7 +712,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ |
|
||||
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ |
|
||||
| `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ |
|
||||
| `PixtralForConditionalGeneration` | Ministral 3 (Mistral format), Mistral 3 (Mistral format), Mistral Large 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Mistral-Large-3-675B-Instruct-2512` `mistralai/Pixtral-12B-2409` etc. | | ✅︎ |
|
||||
| `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ |
|
||||
| `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ |
|
||||
|
||||
@ -234,7 +234,7 @@ The following extra parameters are supported:
|
||||
Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings);
|
||||
you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
|
||||
|
||||
Code example: [examples/online_serving/pooling/openai_embedding_client.py](../../examples/online_serving/pooling/openai_embedding_client.py)
|
||||
Code example: [examples/pooling/embed/openai_embedding_client.py](../../examples/pooling/embed/openai_embedding_client.py)
|
||||
|
||||
If the model has a [chat template](../serving/openai_compatible_server.md#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](#chat-api))
|
||||
which will be treated as a single prompt to the model. Here is a convenience function for calling the API while retaining OpenAI's type annotations:
|
||||
@ -335,7 +335,7 @@ and passing a list of `messages` in the request. Refer to the examples below for
|
||||
`MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code
|
||||
example below for details.
|
||||
|
||||
Full example: [examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py](../../examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py)
|
||||
Full example: [examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py](../../examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py)
|
||||
|
||||
#### Extra parameters
|
||||
|
||||
@ -516,7 +516,7 @@ Our Pooling API encodes input prompts using a [pooling model](../models/pooling_
|
||||
|
||||
The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats.
|
||||
|
||||
Code example: [examples/online_serving/pooling/openai_pooling_client.py](../../examples/online_serving/pooling/openai_pooling_client.py)
|
||||
Code example: [examples/pooling/pooling/openai_pooling_client.py](../../examples/pooling/pooling/openai_pooling_client.py)
|
||||
|
||||
### Classification API
|
||||
|
||||
@ -524,7 +524,7 @@ Our Classification API directly supports Hugging Face sequence-classification mo
|
||||
|
||||
We automatically wrap any other transformer via `as_seq_cls_model()`, which pools on the last token, attaches a `RowParallelLinear` head, and applies a softmax to produce per-class probabilities.
|
||||
|
||||
Code example: [examples/online_serving/pooling/openai_classification_client.py](../../examples/online_serving/pooling/openai_classification_client.py)
|
||||
Code example: [examples/pooling/classify/openai_classification_client.py](../../examples/pooling/classify/openai_classification_client.py)
|
||||
|
||||
#### Example Requests
|
||||
|
||||
@ -640,7 +640,7 @@ Usually, the score for a sentence pair refers to the similarity between two sent
|
||||
|
||||
You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
|
||||
|
||||
Code example: [examples/online_serving/pooling/openai_cross_encoder_score.py](../../examples/online_serving/pooling/openai_cross_encoder_score.py)
|
||||
Code example: [examples/pooling/score/openai_cross_encoder_score.py](../../examples/pooling/score/openai_cross_encoder_score.py)
|
||||
|
||||
#### Single inference
|
||||
|
||||
@ -821,7 +821,7 @@ You can pass multi-modal inputs to scoring models by passing `content` including
|
||||
print("Scoring output:", response_json["data"][0]["score"])
|
||||
print("Scoring output:", response_json["data"][1]["score"])
|
||||
```
|
||||
Full example: [examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py](../../examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py)
|
||||
Full example: [examples/pooling/score/openai_cross_encoder_score_for_multimodal.py](../../examples/pooling/score/openai_cross_encoder_score_for_multimodal.py)
|
||||
|
||||
#### Extra parameters
|
||||
|
||||
@ -851,7 +851,7 @@ endpoints are compatible with both [Jina AI's re-rank API interface](https://jin
|
||||
[Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with
|
||||
popular open-source tools.
|
||||
|
||||
Code example: [examples/online_serving/pooling/jinaai_rerank_client.py](../../examples/online_serving/pooling/jinaai_rerank_client.py)
|
||||
Code example: [examples/pooling/score/jinaai_rerank_client.py](../../examples/pooling/score/jinaai_rerank_client.py)
|
||||
|
||||
#### Example Request
|
||||
|
||||
|
||||
98
examples/offline_inference/llm_engine_reset_kv.py
Normal file
98
examples/offline_inference/llm_engine_reset_kv.py
Normal file
@ -0,0 +1,98 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This file demonstrates preempt requests when using the `LLMEngine`
|
||||
for processing prompts with various sampling parameters.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
||||
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
def create_test_prompts() -> list[tuple[str, SamplingParams]]:
|
||||
"""Create a list of test prompts with their sampling parameters."""
|
||||
return [
|
||||
(
|
||||
"A robot may not injure a human being " * 50,
|
||||
SamplingParams(
|
||||
temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=16
|
||||
),
|
||||
),
|
||||
(
|
||||
"A robot may not injure a human being " * 50,
|
||||
SamplingParams(
|
||||
temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=16
|
||||
),
|
||||
),
|
||||
(
|
||||
"To be or not to be,",
|
||||
SamplingParams(
|
||||
temperature=0.8, top_k=5, presence_penalty=0.2, max_tokens=128
|
||||
),
|
||||
),
|
||||
(
|
||||
"What is the meaning of life?",
|
||||
SamplingParams(
|
||||
n=2, temperature=0.8, top_p=0.95, frequency_penalty=0.1, max_tokens=128
|
||||
),
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
def process_requests(engine: LLMEngine, test_prompts: list[tuple[str, SamplingParams]]):
|
||||
"""Continuously process a list of prompts and handle the outputs."""
|
||||
request_id = 0
|
||||
|
||||
print("-" * 50)
|
||||
step_id = 0
|
||||
while test_prompts or engine.has_unfinished_requests():
|
||||
print("-" * 50)
|
||||
import os
|
||||
|
||||
print(f"Step {step_id} (pid={os.getpid()})")
|
||||
|
||||
if test_prompts:
|
||||
prompt, sampling_params = test_prompts.pop(0)
|
||||
engine.add_request(str(request_id), prompt, sampling_params)
|
||||
request_id += 1
|
||||
|
||||
if step_id == 10:
|
||||
print(f"Resetting prefix cache at {step_id}")
|
||||
engine.reset_prefix_cache(reset_running_requests=True)
|
||||
|
||||
request_outputs: list[RequestOutput] = engine.step()
|
||||
|
||||
for request_output in request_outputs:
|
||||
if request_output.finished:
|
||||
print("-" * 50)
|
||||
print(request_output)
|
||||
print("-" * 50)
|
||||
step_id += 1
|
||||
|
||||
|
||||
def initialize_engine(args: argparse.Namespace) -> LLMEngine:
|
||||
"""Initialize the LLMEngine from the command line arguments."""
|
||||
engine_args = EngineArgs.from_cli_args(args)
|
||||
return LLMEngine.from_engine_args(engine_args)
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Demo on using the LLMEngine class directly"
|
||||
)
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
|
||||
"""Main function that sets up and runs the prompt processing."""
|
||||
engine = initialize_engine(args)
|
||||
test_prompts = create_test_prompts()
|
||||
process_requests(engine, test_prompts)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
main(args)
|
||||
@ -1,57 +0,0 @@
|
||||
# Pooling models
|
||||
|
||||
## Convert llm model to seq cls
|
||||
|
||||
```bash
|
||||
# for BAAI/bge-reranker-v2-gemma
|
||||
# Caution: "Yes" and "yes" are two different tokens
|
||||
python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name BAAI/bge-reranker-v2-gemma --classifier_from_tokens '["Yes"]' --method no_post_processing --path ./bge-reranker-v2-gemma-seq-cls
|
||||
# for mxbai-rerank-v2
|
||||
python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name mixedbread-ai/mxbai-rerank-base-v2 --classifier_from_tokens '["0", "1"]' --method from_2_way_softmax --path ./mxbai-rerank-base-v2-seq-cls
|
||||
# for Qwen3-Reranker
|
||||
python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name Qwen/Qwen3-Reranker-0.6B --classifier_from_tokens '["no", "yes"]' --method from_2_way_softmax --path ./Qwen3-Reranker-0.6B-seq-cls
|
||||
```
|
||||
|
||||
## Embed jina_embeddings_v3 usage
|
||||
|
||||
Only text matching task is supported for now. See <https://github.com/vllm-project/vllm/pull/16120>
|
||||
|
||||
```bash
|
||||
python examples/offline_inference/pooling/embed_jina_embeddings_v3.py
|
||||
```
|
||||
|
||||
## Embed matryoshka dimensions usage
|
||||
|
||||
```bash
|
||||
python examples/offline_inference/pooling/embed_matryoshka_fy.py
|
||||
```
|
||||
|
||||
## Multi vector retrieval usage
|
||||
|
||||
```bash
|
||||
python examples/offline_inference/pooling/multi_vector_retrieval.py
|
||||
```
|
||||
|
||||
## Named Entity Recognition (NER) usage
|
||||
|
||||
```bash
|
||||
python examples/offline_inference/pooling/ner.py
|
||||
```
|
||||
|
||||
## Prithvi Geospatial MAE usage
|
||||
|
||||
```bash
|
||||
python examples/offline_inference/pooling/prithvi_geospatial_mae.py
|
||||
```
|
||||
|
||||
## IO Processor Plugins for Prithvi Geospatial MAE
|
||||
|
||||
```bash
|
||||
python examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py
|
||||
```
|
||||
|
||||
## Qwen3 reranker usage
|
||||
|
||||
```bash
|
||||
python examples/offline_inference/pooling/qwen3_reranker.py
|
||||
```
|
||||
@ -1801,7 +1801,10 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
|
||||
hf_overrides={
|
||||
"architectures": ["Tarsier2ForConditionalGeneration"],
|
||||
"model_type": "tarsier2",
|
||||
},
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
)
|
||||
|
||||
|
||||
@ -1222,7 +1222,10 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
trust_remote_code=True,
|
||||
max_model_len=32768,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
|
||||
hf_overrides={
|
||||
"architectures": ["Tarsier2ForConditionalGeneration"],
|
||||
"model_type": "tarsier2",
|
||||
},
|
||||
)
|
||||
|
||||
prompt = (
|
||||
|
||||
@ -1,97 +0,0 @@
|
||||
# Pooling models
|
||||
|
||||
## Cohere rerank usage
|
||||
|
||||
```bash
|
||||
# vllm serve BAAI/bge-reranker-base
|
||||
python examples/online_serving/pooling/cohere_rerank_client.py
|
||||
```
|
||||
|
||||
## Embedding requests base64 encoding_format usage
|
||||
|
||||
```bash
|
||||
# vllm serve intfloat/e5-small
|
||||
python examples/online_serving/pooling/embedding_requests_base64_client.py
|
||||
```
|
||||
|
||||
## Embedding requests bytes encoding_format usage
|
||||
|
||||
```bash
|
||||
# vllm serve intfloat/e5-small
|
||||
python examples/online_serving/pooling/embedding_requests_bytes_client.py
|
||||
```
|
||||
|
||||
## Jinaai rerank usage
|
||||
|
||||
```bash
|
||||
# vllm serve BAAI/bge-reranker-base
|
||||
python examples/online_serving/pooling/jinaai_rerank_client.py
|
||||
```
|
||||
|
||||
## Multi vector retrieval usage
|
||||
|
||||
```bash
|
||||
# vllm serve BAAI/bge-m3
|
||||
python examples/online_serving/pooling/multi_vector_retrieval_client.py
|
||||
```
|
||||
|
||||
## Named Entity Recognition (NER) usage
|
||||
|
||||
```bash
|
||||
# vllm serve boltuix/NeuroBERT-NER
|
||||
python examples/online_serving/pooling/ner_client.py
|
||||
```
|
||||
|
||||
## OpenAI chat embedding for multimodal usage
|
||||
|
||||
```bash
|
||||
python examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py
|
||||
```
|
||||
|
||||
## OpenAI classification usage
|
||||
|
||||
```bash
|
||||
# vllm serve jason9693/Qwen2.5-1.5B-apeach
|
||||
python examples/online_serving/pooling/openai_classification_client.py
|
||||
```
|
||||
|
||||
## OpenAI cross_encoder score usage
|
||||
|
||||
```bash
|
||||
# vllm serve BAAI/bge-reranker-v2-m3
|
||||
python examples/online_serving/pooling/openai_cross_encoder_score.py
|
||||
```
|
||||
|
||||
## OpenAI cross_encoder score for multimodal usage
|
||||
|
||||
```bash
|
||||
# vllm serve jinaai/jina-reranker-m0
|
||||
python examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py
|
||||
```
|
||||
|
||||
## OpenAI embedding usage
|
||||
|
||||
```bash
|
||||
# vllm serve intfloat/e5-small
|
||||
python examples/online_serving/pooling/openai_embedding_client.py
|
||||
```
|
||||
|
||||
## OpenAI embedding matryoshka dimensions usage
|
||||
|
||||
```bash
|
||||
# vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
|
||||
python examples/online_serving/pooling/openai_embedding_matryoshka_fy.py
|
||||
```
|
||||
|
||||
## OpenAI pooling usage
|
||||
|
||||
```bash
|
||||
# vllm serve internlm/internlm2-1_8b-reward --trust-remote-code
|
||||
python examples/online_serving/pooling/openai_pooling_client.py
|
||||
```
|
||||
|
||||
## Online Prithvi Geospatial MAE usage
|
||||
|
||||
```bash
|
||||
python examples/online_serving/pooling/prithvi_geospatial_mae.py
|
||||
```
|
||||
148
setup.py
148
setup.py
@ -311,7 +311,7 @@ class precompiled_build_ext(build_ext):
|
||||
"""Disables extension building when using precompiled binaries."""
|
||||
|
||||
def run(self) -> None:
|
||||
assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
|
||||
return
|
||||
|
||||
def build_extensions(self) -> None:
|
||||
print("Skipping build_ext: using precompiled extensions.")
|
||||
@ -322,14 +322,121 @@ class precompiled_wheel_utils:
|
||||
"""Extracts libraries and other files from an existing wheel."""
|
||||
|
||||
@staticmethod
|
||||
def extract_precompiled_and_patch_package(wheel_url_or_path: str) -> dict:
|
||||
def fetch_metadata_for_variant(
|
||||
commit: str, variant: str | None
|
||||
) -> tuple[list[dict], str]:
|
||||
"""
|
||||
Fetches metadata for a specific variant of the precompiled wheel.
|
||||
"""
|
||||
variant_dir = f"{variant}/" if variant is not None else ""
|
||||
repo_url = f"https://wheels.vllm.ai/{commit}/{variant_dir}vllm/"
|
||||
meta_url = repo_url + "metadata.json"
|
||||
print(f"Trying to fetch nightly build metadata from {meta_url}")
|
||||
from urllib.request import urlopen
|
||||
|
||||
with urlopen(meta_url) as resp:
|
||||
# urlopen raises HTTPError on unexpected status code
|
||||
wheels = json.loads(resp.read().decode("utf-8"))
|
||||
return wheels, repo_url
|
||||
|
||||
@staticmethod
|
||||
def determine_wheel_url() -> tuple[str, str | None]:
|
||||
"""
|
||||
Try to determine the precompiled wheel URL or path to use.
|
||||
The order of preference is:
|
||||
1. user-specified wheel location (can be either local or remote, via
|
||||
VLLM_PRECOMPILED_WHEEL_LOCATION)
|
||||
2. user-specified variant from nightly repo (current main commit via
|
||||
VLLM_PRECOMPILED_WHEEL_VARIANT)
|
||||
3. the variant corresponding to VLLM_MAIN_CUDA_VERSION from nightly repo
|
||||
4. the default variant from nightly repo (current main commit)
|
||||
"""
|
||||
wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
|
||||
if wheel_location is not None:
|
||||
print(f"Using user-specified precompiled wheel location: {wheel_location}")
|
||||
return wheel_location, None
|
||||
else:
|
||||
import platform
|
||||
|
||||
arch = platform.machine()
|
||||
# try to fetch the wheel metadata from the nightly wheel repo
|
||||
main_variant = "cu" + envs.VLLM_MAIN_CUDA_VERSION.replace(".", "")
|
||||
variant = os.getenv("VLLM_PRECOMPILED_WHEEL_VARIANT", main_variant)
|
||||
commit = os.getenv(
|
||||
"VLLM_PRECOMPILED_WHEEL_COMMIT",
|
||||
precompiled_wheel_utils.get_base_commit_in_main_branch(),
|
||||
)
|
||||
print(f"Using precompiled wheel commit {commit} with variant {variant}")
|
||||
try_default = False
|
||||
wheels, repo_url, download_filename = None, None, None
|
||||
try:
|
||||
wheels, repo_url = precompiled_wheel_utils.fetch_metadata_for_variant(
|
||||
commit, variant
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"Failed to fetch precompiled wheel metadata for variant %s: %s",
|
||||
variant,
|
||||
e,
|
||||
)
|
||||
try_default = True # try outside handler to keep the stacktrace simple
|
||||
if try_default:
|
||||
print("Trying the default variant from remote")
|
||||
wheels, repo_url = precompiled_wheel_utils.fetch_metadata_for_variant(
|
||||
commit, None
|
||||
)
|
||||
# if this also fails, then we have nothing more to try / cache
|
||||
assert wheels is not None and repo_url is not None, (
|
||||
"Failed to fetch precompiled wheel metadata"
|
||||
)
|
||||
# The metadata.json has the following format:
|
||||
# see .buildkite/scripts/generate-nightly-index.py for details
|
||||
"""[{
|
||||
"package_name": "vllm",
|
||||
"version": "0.11.2.dev278+gdbc3d9991",
|
||||
"build_tag": null,
|
||||
"python_tag": "cp38",
|
||||
"abi_tag": "abi3",
|
||||
"platform_tag": "manylinux1_x86_64",
|
||||
"variant": null,
|
||||
"filename": "vllm-0.11.2.dev278+gdbc3d9991-cp38-abi3-manylinux1_x86_64.whl",
|
||||
"path": "../vllm-0.11.2.dev278%2Bgdbc3d9991-cp38-abi3-manylinux1_x86_64.whl"
|
||||
},
|
||||
...]"""
|
||||
from urllib.parse import urljoin
|
||||
|
||||
for wheel in wheels:
|
||||
# TODO: maybe check more compatibility later? (python_tag, abi_tag, etc)
|
||||
if wheel.get("package_name") == "vllm" and arch in wheel.get(
|
||||
"platform_tag", ""
|
||||
):
|
||||
print(f"Found precompiled wheel metadata: {wheel}")
|
||||
if "path" not in wheel:
|
||||
raise ValueError(f"Wheel metadata missing path: {wheel}")
|
||||
wheel_url = urljoin(repo_url, wheel["path"])
|
||||
download_filename = wheel.get("filename")
|
||||
print(f"Using precompiled wheel URL: {wheel_url}")
|
||||
break
|
||||
else:
|
||||
raise ValueError(
|
||||
f"No precompiled vllm wheel found for architecture {arch} "
|
||||
f"from repo {repo_url}. All available wheels: {wheels}"
|
||||
)
|
||||
|
||||
return wheel_url, download_filename
|
||||
|
||||
@staticmethod
|
||||
def extract_precompiled_and_patch_package(
|
||||
wheel_url_or_path: str, download_filename: str | None
|
||||
) -> dict:
|
||||
import tempfile
|
||||
import zipfile
|
||||
|
||||
temp_dir = None
|
||||
try:
|
||||
if not os.path.isfile(wheel_url_or_path):
|
||||
wheel_filename = wheel_url_or_path.split("/")[-1]
|
||||
# use provided filename first, then derive from URL
|
||||
wheel_filename = download_filename or wheel_url_or_path.split("/")[-1]
|
||||
temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
|
||||
wheel_path = os.path.join(temp_dir, wheel_filename)
|
||||
print(f"Downloading wheel from {wheel_url_or_path} to {wheel_path}")
|
||||
@ -648,38 +755,13 @@ package_data = {
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
# If using precompiled, extract and patch package_data (in advance of setup)
|
||||
if envs.VLLM_USE_PRECOMPILED:
|
||||
assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
|
||||
wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
|
||||
if wheel_location is not None:
|
||||
wheel_url = wheel_location
|
||||
else:
|
||||
import platform
|
||||
|
||||
arch = platform.machine()
|
||||
if arch == "x86_64":
|
||||
wheel_tag = "manylinux1_x86_64"
|
||||
elif arch == "aarch64":
|
||||
wheel_tag = "manylinux2014_aarch64"
|
||||
else:
|
||||
raise ValueError(f"Unsupported architecture: {arch}")
|
||||
base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
|
||||
wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl"
|
||||
nightly_wheel_url = (
|
||||
f"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl"
|
||||
)
|
||||
from urllib.request import urlopen
|
||||
|
||||
try:
|
||||
with urlopen(wheel_url) as resp:
|
||||
if resp.status != 200:
|
||||
wheel_url = nightly_wheel_url
|
||||
except Exception as e:
|
||||
print(f"[warn] Falling back to nightly wheel: {e}")
|
||||
wheel_url = nightly_wheel_url
|
||||
|
||||
patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(wheel_url)
|
||||
wheel_url, download_filename = precompiled_wheel_utils.determine_wheel_url()
|
||||
patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(
|
||||
wheel_url, download_filename
|
||||
)
|
||||
for pkg, files in patch.items():
|
||||
package_data.setdefault(pkg, []).extend(files)
|
||||
|
||||
|
||||
257
tests/benchmarks/test_param_sweep.py
Normal file
257
tests/benchmarks/test_param_sweep.py
Normal file
@ -0,0 +1,257 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import json
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.benchmarks.sweep.param_sweep import ParameterSweep, ParameterSweepItem
|
||||
|
||||
|
||||
class TestParameterSweepItem:
|
||||
"""Test ParameterSweepItem functionality."""
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"input_dict,expected",
|
||||
[
|
||||
(
|
||||
{"compilation_config.use_inductor_graph_partition": False},
|
||||
"--compilation-config.use_inductor_graph_partition=false",
|
||||
),
|
||||
(
|
||||
{"compilation_config.use_inductor_graph_partition": True},
|
||||
"--compilation-config.use_inductor_graph_partition=true",
|
||||
),
|
||||
(
|
||||
{"compilation_config.use_inductor": False},
|
||||
"--compilation-config.use_inductor=false",
|
||||
),
|
||||
(
|
||||
{"compilation_config.use_inductor": True},
|
||||
"--compilation-config.use_inductor=true",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_nested_boolean_params(self, input_dict, expected):
|
||||
"""Test that nested boolean params use =true/false syntax."""
|
||||
item = ParameterSweepItem.from_record(input_dict)
|
||||
cmd = item.apply_to_cmd(["vllm", "serve", "model"])
|
||||
assert expected in cmd
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"input_dict,expected",
|
||||
[
|
||||
({"enable_prefix_caching": False}, "--no-enable-prefix-caching"),
|
||||
({"enable_prefix_caching": True}, "--enable-prefix-caching"),
|
||||
({"disable_log_stats": False}, "--no-disable-log-stats"),
|
||||
({"disable_log_stats": True}, "--disable-log-stats"),
|
||||
],
|
||||
)
|
||||
def test_non_nested_boolean_params(self, input_dict, expected):
|
||||
"""Test that non-nested boolean params use --no- prefix."""
|
||||
item = ParameterSweepItem.from_record(input_dict)
|
||||
cmd = item.apply_to_cmd(["vllm", "serve", "model"])
|
||||
assert expected in cmd
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"compilation_config",
|
||||
[
|
||||
{"cudagraph_mode": "full", "mode": 2, "use_inductor_graph_partition": True},
|
||||
{
|
||||
"cudagraph_mode": "piecewise",
|
||||
"mode": 3,
|
||||
"use_inductor_graph_partition": False,
|
||||
},
|
||||
],
|
||||
)
|
||||
def test_nested_dict_value(self, compilation_config):
|
||||
"""Test that nested dict values are serialized as JSON."""
|
||||
item = ParameterSweepItem.from_record(
|
||||
{"compilation_config": compilation_config}
|
||||
)
|
||||
cmd = item.apply_to_cmd(["vllm", "serve", "model"])
|
||||
assert "--compilation-config" in cmd
|
||||
# The dict should be JSON serialized
|
||||
idx = cmd.index("--compilation-config")
|
||||
assert json.loads(cmd[idx + 1]) == compilation_config
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"input_dict,expected_key,expected_value",
|
||||
[
|
||||
({"model": "test-model"}, "--model", "test-model"),
|
||||
({"max_tokens": 100}, "--max-tokens", "100"),
|
||||
({"temperature": 0.7}, "--temperature", "0.7"),
|
||||
],
|
||||
)
|
||||
def test_string_and_numeric_values(self, input_dict, expected_key, expected_value):
|
||||
"""Test that string and numeric values are handled correctly."""
|
||||
item = ParameterSweepItem.from_record(input_dict)
|
||||
cmd = item.apply_to_cmd(["vllm", "serve"])
|
||||
assert expected_key in cmd
|
||||
assert expected_value in cmd
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"input_dict,expected_key,key_idx_offset",
|
||||
[
|
||||
({"max_tokens": 200}, "--max-tokens", 1),
|
||||
({"enable_prefix_caching": False}, "--no-enable-prefix-caching", 0),
|
||||
],
|
||||
)
|
||||
def test_replace_existing_parameter(self, input_dict, expected_key, key_idx_offset):
|
||||
"""Test that existing parameters in cmd are replaced."""
|
||||
item = ParameterSweepItem.from_record(input_dict)
|
||||
|
||||
if key_idx_offset == 1:
|
||||
# Key-value pair
|
||||
cmd = item.apply_to_cmd(["vllm", "serve", "--max-tokens", "100", "model"])
|
||||
assert expected_key in cmd
|
||||
idx = cmd.index(expected_key)
|
||||
assert cmd[idx + 1] == "200"
|
||||
assert "100" not in cmd
|
||||
else:
|
||||
# Boolean flag
|
||||
cmd = item.apply_to_cmd(
|
||||
["vllm", "serve", "--enable-prefix-caching", "model"]
|
||||
)
|
||||
assert expected_key in cmd
|
||||
assert "--enable-prefix-caching" not in cmd
|
||||
|
||||
|
||||
class TestParameterSweep:
|
||||
"""Test ParameterSweep functionality."""
|
||||
|
||||
def test_from_records_list(self):
|
||||
"""Test creating ParameterSweep from a list of records."""
|
||||
records = [
|
||||
{"max_tokens": 100, "temperature": 0.7},
|
||||
{"max_tokens": 200, "temperature": 0.9},
|
||||
]
|
||||
sweep = ParameterSweep.from_records(records)
|
||||
assert len(sweep) == 2
|
||||
assert sweep[0]["max_tokens"] == 100
|
||||
assert sweep[1]["max_tokens"] == 200
|
||||
|
||||
def test_read_from_dict(self):
|
||||
"""Test creating ParameterSweep from a dict format."""
|
||||
data = {
|
||||
"experiment1": {"max_tokens": 100, "temperature": 0.7},
|
||||
"experiment2": {"max_tokens": 200, "temperature": 0.9},
|
||||
}
|
||||
sweep = ParameterSweep.read_from_dict(data)
|
||||
assert len(sweep) == 2
|
||||
|
||||
# Check that items have the _benchmark_name field
|
||||
names = {item["_benchmark_name"] for item in sweep}
|
||||
assert names == {"experiment1", "experiment2"}
|
||||
|
||||
# Check that parameters are preserved
|
||||
for item in sweep:
|
||||
if item["_benchmark_name"] == "experiment1":
|
||||
assert item["max_tokens"] == 100
|
||||
assert item["temperature"] == 0.7
|
||||
elif item["_benchmark_name"] == "experiment2":
|
||||
assert item["max_tokens"] == 200
|
||||
assert item["temperature"] == 0.9
|
||||
|
||||
def test_read_json_list_format(self):
|
||||
"""Test reading JSON file with list format."""
|
||||
records = [
|
||||
{"max_tokens": 100, "temperature": 0.7},
|
||||
{"max_tokens": 200, "temperature": 0.9},
|
||||
]
|
||||
|
||||
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
|
||||
json.dump(records, f)
|
||||
temp_path = Path(f.name)
|
||||
|
||||
try:
|
||||
sweep = ParameterSweep.read_json(temp_path)
|
||||
assert len(sweep) == 2
|
||||
assert sweep[0]["max_tokens"] == 100
|
||||
assert sweep[1]["max_tokens"] == 200
|
||||
finally:
|
||||
temp_path.unlink()
|
||||
|
||||
def test_read_json_dict_format(self):
|
||||
"""Test reading JSON file with dict format."""
|
||||
data = {
|
||||
"experiment1": {"max_tokens": 100, "temperature": 0.7},
|
||||
"experiment2": {"max_tokens": 200, "temperature": 0.9},
|
||||
}
|
||||
|
||||
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
|
||||
json.dump(data, f)
|
||||
temp_path = Path(f.name)
|
||||
|
||||
try:
|
||||
sweep = ParameterSweep.read_json(temp_path)
|
||||
assert len(sweep) == 2
|
||||
|
||||
# Check that items have the _benchmark_name field
|
||||
names = {item["_benchmark_name"] for item in sweep}
|
||||
assert names == {"experiment1", "experiment2"}
|
||||
finally:
|
||||
temp_path.unlink()
|
||||
|
||||
def test_unique_benchmark_names_validation(self):
|
||||
"""Test that duplicate _benchmark_name values raise an error."""
|
||||
# Test with duplicate names in list format
|
||||
records = [
|
||||
{"_benchmark_name": "exp1", "max_tokens": 100},
|
||||
{"_benchmark_name": "exp1", "max_tokens": 200},
|
||||
]
|
||||
|
||||
with pytest.raises(ValueError, match="Duplicate _benchmark_name values"):
|
||||
ParameterSweep.from_records(records)
|
||||
|
||||
def test_unique_benchmark_names_multiple_duplicates(self):
|
||||
"""Test validation with multiple duplicate names."""
|
||||
records = [
|
||||
{"_benchmark_name": "exp1", "max_tokens": 100},
|
||||
{"_benchmark_name": "exp1", "max_tokens": 200},
|
||||
{"_benchmark_name": "exp2", "max_tokens": 300},
|
||||
{"_benchmark_name": "exp2", "max_tokens": 400},
|
||||
]
|
||||
|
||||
with pytest.raises(ValueError, match="Duplicate _benchmark_name values"):
|
||||
ParameterSweep.from_records(records)
|
||||
|
||||
def test_no_benchmark_names_allowed(self):
|
||||
"""Test that records without _benchmark_name are allowed."""
|
||||
records = [
|
||||
{"max_tokens": 100, "temperature": 0.7},
|
||||
{"max_tokens": 200, "temperature": 0.9},
|
||||
]
|
||||
sweep = ParameterSweep.from_records(records)
|
||||
assert len(sweep) == 2
|
||||
|
||||
def test_mixed_benchmark_names_allowed(self):
|
||||
"""Test that mixing records with and without _benchmark_name is allowed."""
|
||||
records = [
|
||||
{"_benchmark_name": "exp1", "max_tokens": 100},
|
||||
{"max_tokens": 200, "temperature": 0.9},
|
||||
]
|
||||
sweep = ParameterSweep.from_records(records)
|
||||
assert len(sweep) == 2
|
||||
|
||||
|
||||
class TestParameterSweepItemKeyNormalization:
|
||||
"""Test key normalization in ParameterSweepItem."""
|
||||
|
||||
def test_underscore_to_hyphen_conversion(self):
|
||||
"""Test that underscores are converted to hyphens in CLI."""
|
||||
item = ParameterSweepItem.from_record({"max_tokens": 100})
|
||||
cmd = item.apply_to_cmd(["vllm", "serve"])
|
||||
assert "--max-tokens" in cmd
|
||||
|
||||
def test_nested_key_preserves_suffix(self):
|
||||
"""Test that nested keys preserve the suffix format."""
|
||||
# The suffix after the dot should preserve underscores
|
||||
item = ParameterSweepItem.from_record(
|
||||
{"compilation_config.some_nested_param": "value"}
|
||||
)
|
||||
cmd = item.apply_to_cmd(["vllm", "serve"])
|
||||
# The prefix (compilation_config) gets converted to hyphens,
|
||||
# but the suffix (some_nested_param) is preserved
|
||||
assert any("compilation-config.some_nested_param" in arg for arg in cmd)
|
||||
171
tests/benchmarks/test_plot_filters.py
Normal file
171
tests/benchmarks/test_plot_filters.py
Normal file
@ -0,0 +1,171 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from vllm.benchmarks.sweep.plot import (
|
||||
PlotEqualTo,
|
||||
PlotFilterBase,
|
||||
PlotFilters,
|
||||
PlotGreaterThan,
|
||||
PlotGreaterThanOrEqualTo,
|
||||
PlotLessThan,
|
||||
PlotLessThanOrEqualTo,
|
||||
PlotNotEqualTo,
|
||||
)
|
||||
|
||||
|
||||
class TestPlotFilters:
|
||||
"""Test PlotFilter functionality including 'inf' edge case."""
|
||||
|
||||
def setup_method(self):
|
||||
"""Create sample DataFrames for testing."""
|
||||
# DataFrame with numeric values
|
||||
self.df_numeric = pd.DataFrame(
|
||||
{
|
||||
"request_rate": [1.0, 5.0, 10.0, 50.0, 100.0],
|
||||
"value": [10, 20, 30, 40, 50],
|
||||
}
|
||||
)
|
||||
|
||||
# DataFrame with float('inf') - note: string "inf" values are coerced
|
||||
# to float when loading data, so we only test with float('inf')
|
||||
self.df_inf_float = pd.DataFrame(
|
||||
{
|
||||
"request_rate": [1.0, 5.0, 10.0, float("inf"), float("inf")],
|
||||
"value": [10, 20, 30, 40, 50],
|
||||
}
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"target,expected_count",
|
||||
[
|
||||
("5.0", 1),
|
||||
("10.0", 1),
|
||||
("1.0", 1),
|
||||
],
|
||||
)
|
||||
def test_equal_to_numeric(self, target, expected_count):
|
||||
"""Test PlotEqualTo with numeric values."""
|
||||
filter_obj = PlotEqualTo("request_rate", target)
|
||||
result = filter_obj.apply(self.df_numeric)
|
||||
assert len(result) == expected_count
|
||||
|
||||
def test_equal_to_inf_float(self):
|
||||
"""Test PlotEqualTo with float('inf')."""
|
||||
filter_obj = PlotEqualTo("request_rate", "inf")
|
||||
result = filter_obj.apply(self.df_inf_float)
|
||||
# Should match both float('inf') entries because float('inf') == float('inf')
|
||||
assert len(result) == 2
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"target,expected_count",
|
||||
[
|
||||
("5.0", 4), # All except 5.0
|
||||
("1.0", 4), # All except 1.0
|
||||
],
|
||||
)
|
||||
def test_not_equal_to_numeric(self, target, expected_count):
|
||||
"""Test PlotNotEqualTo with numeric values."""
|
||||
filter_obj = PlotNotEqualTo("request_rate", target)
|
||||
result = filter_obj.apply(self.df_numeric)
|
||||
assert len(result) == expected_count
|
||||
|
||||
def test_not_equal_to_inf_float(self):
|
||||
"""Test PlotNotEqualTo with float('inf')."""
|
||||
filter_obj = PlotNotEqualTo("request_rate", "inf")
|
||||
result = filter_obj.apply(self.df_inf_float)
|
||||
# Should exclude float('inf') entries
|
||||
assert len(result) == 3
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"target,expected_count",
|
||||
[
|
||||
("10.0", 2), # 1.0, 5.0
|
||||
("50.0", 3), # 1.0, 5.0, 10.0
|
||||
("5.0", 1), # 1.0
|
||||
],
|
||||
)
|
||||
def test_less_than(self, target, expected_count):
|
||||
"""Test PlotLessThan with numeric values."""
|
||||
filter_obj = PlotLessThan("request_rate", target)
|
||||
result = filter_obj.apply(self.df_numeric)
|
||||
assert len(result) == expected_count
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"target,expected_count",
|
||||
[
|
||||
("10.0", 3), # 1.0, 5.0, 10.0
|
||||
("5.0", 2), # 1.0, 5.0
|
||||
],
|
||||
)
|
||||
def test_less_than_or_equal_to(self, target, expected_count):
|
||||
"""Test PlotLessThanOrEqualTo with numeric values."""
|
||||
filter_obj = PlotLessThanOrEqualTo("request_rate", target)
|
||||
result = filter_obj.apply(self.df_numeric)
|
||||
assert len(result) == expected_count
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"target,expected_count",
|
||||
[
|
||||
("10.0", 2), # 50.0, 100.0
|
||||
("5.0", 3), # 10.0, 50.0, 100.0
|
||||
],
|
||||
)
|
||||
def test_greater_than(self, target, expected_count):
|
||||
"""Test PlotGreaterThan with numeric values."""
|
||||
filter_obj = PlotGreaterThan("request_rate", target)
|
||||
result = filter_obj.apply(self.df_numeric)
|
||||
assert len(result) == expected_count
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"target,expected_count",
|
||||
[
|
||||
("10.0", 3), # 10.0, 50.0, 100.0
|
||||
("5.0", 4), # 5.0, 10.0, 50.0, 100.0
|
||||
],
|
||||
)
|
||||
def test_greater_than_or_equal_to(self, target, expected_count):
|
||||
"""Test PlotGreaterThanOrEqualTo with numeric values."""
|
||||
filter_obj = PlotGreaterThanOrEqualTo("request_rate", target)
|
||||
result = filter_obj.apply(self.df_numeric)
|
||||
assert len(result) == expected_count
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"filter_str,expected_var,expected_target,expected_type",
|
||||
[
|
||||
("request_rate==5.0", "request_rate", "5.0", PlotEqualTo),
|
||||
("request_rate!=10.0", "request_rate", "10.0", PlotNotEqualTo),
|
||||
("request_rate<50.0", "request_rate", "50.0", PlotLessThan),
|
||||
("request_rate<=50.0", "request_rate", "50.0", PlotLessThanOrEqualTo),
|
||||
("request_rate>10.0", "request_rate", "10.0", PlotGreaterThan),
|
||||
("request_rate>=10.0", "request_rate", "10.0", PlotGreaterThanOrEqualTo),
|
||||
("request_rate==inf", "request_rate", "inf", PlotEqualTo),
|
||||
("request_rate!='inf'", "request_rate", "inf", PlotNotEqualTo),
|
||||
],
|
||||
)
|
||||
def test_parse_str(self, filter_str, expected_var, expected_target, expected_type):
|
||||
"""Test parsing filter strings."""
|
||||
filter_obj = PlotFilterBase.parse_str(filter_str)
|
||||
assert isinstance(filter_obj, expected_type)
|
||||
assert filter_obj.var == expected_var
|
||||
assert filter_obj.target == expected_target
|
||||
|
||||
def test_parse_str_inf_edge_case(self):
|
||||
"""Test parsing 'inf' string in filter."""
|
||||
filter_obj = PlotFilterBase.parse_str("request_rate==inf")
|
||||
assert isinstance(filter_obj, PlotEqualTo)
|
||||
assert filter_obj.var == "request_rate"
|
||||
assert filter_obj.target == "inf"
|
||||
|
||||
def test_parse_multiple_filters(self):
|
||||
"""Test parsing multiple filters."""
|
||||
filters = PlotFilters.parse_str("request_rate>5.0,value<=40")
|
||||
assert len(filters) == 2
|
||||
assert isinstance(filters[0], PlotGreaterThan)
|
||||
assert isinstance(filters[1], PlotLessThanOrEqualTo)
|
||||
|
||||
def test_parse_empty_filter(self):
|
||||
"""Test parsing empty filter string."""
|
||||
filters = PlotFilters.parse_str("")
|
||||
assert len(filters) == 0
|
||||
@ -8,7 +8,7 @@ import torch
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config.compilation import CompilationMode, DynamicShapesType
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
|
||||
|
||||
|
||||
@ -318,13 +318,18 @@ def test_attention_quant_pattern(
|
||||
torch.set_default_dtype(dtype)
|
||||
torch.manual_seed(42)
|
||||
|
||||
model_config = ModelConfig(
|
||||
model=model_name,
|
||||
max_model_len=2048,
|
||||
dtype=dtype,
|
||||
)
|
||||
vllm_config = VllmConfig(
|
||||
model_config=ModelConfig(
|
||||
model=model_name,
|
||||
max_model_len=2048,
|
||||
dtype=dtype,
|
||||
model_config=model_config,
|
||||
scheduler_config=SchedulerConfig(
|
||||
max_num_seqs=1024,
|
||||
max_model_len=model_config.max_model_len,
|
||||
is_encoder_decoder=model_config.is_encoder_decoder,
|
||||
),
|
||||
scheduler_config=SchedulerConfig(max_num_seqs=1024),
|
||||
compilation_config=CompilationConfig(
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
custom_ops=custom_ops_list,
|
||||
|
||||
@ -59,6 +59,7 @@ from vllm.distributed import (
|
||||
)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.logprobs import Logprob
|
||||
from vllm.multimodal.base import MediaWithBytes
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import BeamSearchParams
|
||||
@ -1174,6 +1175,7 @@ def caplog_mp_spawn(tmp_path, monkeypatch):
|
||||
"level": level,
|
||||
"filename": log_path.as_posix(),
|
||||
}
|
||||
config["loggers"]["vllm"]["level"] = level
|
||||
|
||||
config_path.write_text(json.dumps(config))
|
||||
|
||||
@ -1388,7 +1390,11 @@ class LocalAssetServer:
|
||||
return f"{self.base_url}/{name}"
|
||||
|
||||
def get_image_asset(self, name: str) -> Image.Image:
|
||||
return fetch_image(self.url_for(name))
|
||||
image = fetch_image(self.url_for(name))
|
||||
# Unwrap MediaWithBytes if present
|
||||
if isinstance(image, MediaWithBytes):
|
||||
image = image.media
|
||||
return image
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
|
||||
@ -22,7 +22,14 @@ def get_model_args(
|
||||
"num_speculative_tokens": 1,
|
||||
"max_model_len": model_max_len,
|
||||
}
|
||||
|
||||
eplb_config = {
|
||||
"num_redundant_experts": tp_size,
|
||||
"window_size": 128,
|
||||
"step_interval": 1024,
|
||||
"log_balancedness": False,
|
||||
}
|
||||
if use_async:
|
||||
eplb_config["use_async"] = True
|
||||
model_args = {
|
||||
"pretrained": model_name,
|
||||
"dtype": "auto",
|
||||
@ -31,15 +38,10 @@ def get_model_args(
|
||||
"gpu_memory_utilization": 0.7,
|
||||
"speculative_config": speculative_config,
|
||||
"enable_expert_parallel": True,
|
||||
"num_redundant_experts": tp_size,
|
||||
"eplb_window_size": 128,
|
||||
"eplb_step_interval": 1024,
|
||||
"eplb_log_balancedness": False,
|
||||
"eplb_config": eplb_config,
|
||||
"enable_eplb": True,
|
||||
"max_model_len": model_max_len,
|
||||
}
|
||||
if use_async:
|
||||
model_args["eplb_config"] = {"use_async": True}
|
||||
return model_args
|
||||
|
||||
|
||||
|
||||
@ -6,7 +6,7 @@ import pytest
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template
|
||||
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
|
||||
from ...models.registry import HF_EXAMPLE_MODELS
|
||||
from ...utils import VLLM_PATH
|
||||
|
||||
@ -14,7 +14,7 @@ from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
|
||||
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
from vllm.v1.engine.async_llm import AsyncLLM
|
||||
|
||||
MODEL_NAME = "openai-community/gpt2"
|
||||
|
||||
@ -0,0 +1,87 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from openai import OpenAI
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "Qwen/Qwen3-8B"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args = ["--reasoning-parser", "qwen3", "--max_model_len", "5000"]
|
||||
env_dict = dict(
|
||||
VLLM_ENABLE_RESPONSES_API_STORE="1",
|
||||
VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT="1",
|
||||
# uncomment for tool calling
|
||||
# PYTHON_EXECUTION_BACKEND="dangerously_use_uv",
|
||||
)
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client(server):
|
||||
async with server.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_basic(client: OpenAI, model_name: str):
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input="What is 13 * 24?",
|
||||
)
|
||||
assert response is not None
|
||||
print("response: ", response)
|
||||
assert response.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_reasoning_and_function_items(client: OpenAI, model_name: str):
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input=[
|
||||
{"type": "message", "content": "Hello.", "role": "user"},
|
||||
{
|
||||
"type": "reasoning",
|
||||
"id": "lol",
|
||||
"content": [
|
||||
{
|
||||
"type": "reasoning_text",
|
||||
"text": "We need to respond: greeting.",
|
||||
}
|
||||
],
|
||||
"summary": [],
|
||||
},
|
||||
{
|
||||
"arguments": '{"location": "Paris", "unit": "celsius"}',
|
||||
"call_id": "call_5f7b38f3b81e4b8380fd0ba74f3ca3ab",
|
||||
"name": "get_weather",
|
||||
"type": "function_call",
|
||||
"id": "fc_4fe5d6fc5b6c4d6fa5f24cc80aa27f78",
|
||||
"status": "completed",
|
||||
},
|
||||
{
|
||||
"call_id": "call_5f7b38f3b81e4b8380fd0ba74f3ca3ab",
|
||||
"id": "fc_4fe5d6fc5b6c4d6fa5f24cc80aa27f78",
|
||||
"output": "The weather in Paris is 20 Celsius",
|
||||
"status": "completed",
|
||||
"type": "function_call_output",
|
||||
},
|
||||
],
|
||||
temperature=0.0,
|
||||
)
|
||||
assert response is not None
|
||||
assert response.status == "completed"
|
||||
# make sure we get a reasoning and text output
|
||||
assert response.output[0].type == "reasoning"
|
||||
assert response.output[1].type == "message"
|
||||
assert type(response.output[1].content[0].text) is str
|
||||
@ -3,7 +3,7 @@
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
|
||||
@ -7,7 +7,7 @@
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@ from vllm.config.multimodal import MultiModalConfig
|
||||
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
|
||||
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
|
||||
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
from vllm.v1.engine.async_llm import AsyncLLM
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
@ -7,7 +7,7 @@ import tempfile
|
||||
import pytest
|
||||
|
||||
from vllm.model_executor.model_loader.weight_utils import download_weights_from_hf
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
|
||||
@ -5,7 +5,7 @@ import pytest
|
||||
import pytest_asyncio
|
||||
import requests
|
||||
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
|
||||
@ -8,6 +8,7 @@ import pytest
|
||||
import pytest_asyncio
|
||||
from transformers import AutoProcessor
|
||||
|
||||
from vllm.multimodal.base import MediaWithBytes
|
||||
from vllm.multimodal.utils import encode_image_base64, fetch_image
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
@ -111,7 +112,11 @@ def get_hf_prompt_tokens(model_name, content, image_url):
|
||||
"content": f"{placeholder}{content}",
|
||||
}
|
||||
]
|
||||
images = [fetch_image(image_url)]
|
||||
image = fetch_image(image_url)
|
||||
# Unwrap MediaWithBytes if present
|
||||
if isinstance(image, MediaWithBytes):
|
||||
image = image.media
|
||||
images = [image]
|
||||
|
||||
prompt = processor.tokenizer.apply_chat_template(
|
||||
messages, tokenize=False, add_generation_prompt=True
|
||||
|
||||
@ -271,7 +271,7 @@ async def test_streaming_product_tool_call():
|
||||
|
||||
@pytest.fixture
|
||||
def qwen_tokenizer() -> TokenizerLike:
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
|
||||
return get_tokenizer("Qwen/Qwen3-32B")
|
||||
|
||||
|
||||
@ -18,7 +18,7 @@ from tests.utils import RemoteOpenAIServer
|
||||
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
|
||||
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
from vllm.utils.serial_utils import (
|
||||
EMBED_DTYPE_TO_TORCH_DTYPE,
|
||||
ENDIANNESS,
|
||||
|
||||
@ -9,6 +9,7 @@ from transformers import AutoProcessor
|
||||
|
||||
from tests.utils import VLLM_PATH, RemoteOpenAIServer
|
||||
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
|
||||
from vllm.multimodal.base import MediaWithBytes
|
||||
from vllm.multimodal.utils import encode_image_base64, fetch_image
|
||||
|
||||
MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
|
||||
@ -62,7 +63,11 @@ def get_hf_prompt_tokens(model_name, content, image_url):
|
||||
|
||||
placeholder = "<|image_1|> "
|
||||
prompt = f"{placeholder}{content}"
|
||||
images = [fetch_image(image_url)]
|
||||
image = fetch_image(image_url)
|
||||
# Unwrap MediaWithBytes if present
|
||||
if isinstance(image, MediaWithBytes):
|
||||
image = image.media
|
||||
images = [image]
|
||||
inputs = processor(prompt, images, return_tensors="pt")
|
||||
return inputs.input_ids.shape[1]
|
||||
|
||||
|
||||
@ -12,7 +12,7 @@ import torch
|
||||
from tests.models.utils import check_embeddings_close
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
from vllm.utils.serial_utils import (
|
||||
EMBED_DTYPE_TO_TORCH_DTYPE,
|
||||
ENDIANNESS,
|
||||
|
||||
@ -45,7 +45,10 @@ def basic_server_with_lora(smollm2_lora_files):
|
||||
"64",
|
||||
]
|
||||
|
||||
envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"}
|
||||
envs = {
|
||||
"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True",
|
||||
"SAGEMAKER_ENABLE_STATEFUL_SESSIONS": "True",
|
||||
}
|
||||
with RemoteOpenAIServer(MODEL_NAME_SMOLLM, args, env_dict=envs) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@ -28,8 +28,7 @@ from vllm.multimodal.utils import (
|
||||
encode_image_base64,
|
||||
encode_video_base64,
|
||||
)
|
||||
from vllm.tokenizers import MistralTokenizer
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.tokenizers import MistralTokenizer, get_tokenizer
|
||||
|
||||
from ..models.registry import HF_EXAMPLE_MODELS
|
||||
from ..utils import VLLM_PATH
|
||||
|
||||
@ -5,6 +5,8 @@ import pytest
|
||||
from openai.types.responses.response_function_tool_call_output_item import (
|
||||
ResponseFunctionToolCallOutputItem,
|
||||
)
|
||||
from openai.types.responses.response_output_message import ResponseOutputMessage
|
||||
from openai.types.responses.response_output_text import ResponseOutputText
|
||||
from openai.types.responses.response_reasoning_item import (
|
||||
Content,
|
||||
ResponseReasoningItem,
|
||||
@ -101,3 +103,22 @@ class TestResponsesUtils:
|
||||
)
|
||||
with pytest.raises(ValueError):
|
||||
construct_chat_message_with_tool_call(item)
|
||||
|
||||
output_item = ResponseOutputMessage(
|
||||
id="msg_bf585bbbe3d500e0",
|
||||
content=[
|
||||
ResponseOutputText(
|
||||
annotations=[],
|
||||
text="dongyi",
|
||||
type="output_text",
|
||||
logprobs=None,
|
||||
)
|
||||
],
|
||||
role="assistant",
|
||||
status="completed",
|
||||
type="message",
|
||||
)
|
||||
|
||||
formatted_item = construct_chat_message_with_tool_call(output_item)
|
||||
assert formatted_item["role"] == "assistant"
|
||||
assert formatted_item["content"] == "dongyi"
|
||||
|
||||
@ -33,14 +33,16 @@ def test_worker_apply_lora(qwen3_lora_files):
|
||||
lora_requests, lora_mapping
|
||||
)
|
||||
|
||||
model_config = ModelConfig(
|
||||
MODEL_PATH,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
max_model_len=127,
|
||||
enforce_eager=True,
|
||||
)
|
||||
|
||||
vllm_config = VllmConfig(
|
||||
model_config=ModelConfig(
|
||||
MODEL_PATH,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
max_model_len=127,
|
||||
enforce_eager=True,
|
||||
),
|
||||
model_config=model_config,
|
||||
load_config=LoadConfig(
|
||||
download_dir=None,
|
||||
load_format="dummy",
|
||||
@ -50,7 +52,14 @@ def test_worker_apply_lora(qwen3_lora_files):
|
||||
tensor_parallel_size=1,
|
||||
data_parallel_size=1,
|
||||
),
|
||||
scheduler_config=SchedulerConfig("generate", 32, 32, 32),
|
||||
scheduler_config=SchedulerConfig(
|
||||
max_model_len=model_config.max_model_len,
|
||||
is_encoder_decoder=model_config.is_encoder_decoder,
|
||||
runner_type="generate",
|
||||
max_num_batched_tokens=32,
|
||||
max_num_seqs=32,
|
||||
max_num_partial_prefills=32,
|
||||
),
|
||||
device_config=DeviceConfig("cuda"),
|
||||
cache_config=CacheConfig(
|
||||
block_size=16,
|
||||
|
||||
@ -315,3 +315,38 @@ def test_mistral_function_call_nested_json():
|
||||
assert json.loads(parsed.tool_calls[0].function.arguments) == args_dict
|
||||
# No additional content outside the tool call should be returned.
|
||||
assert parsed.content is None
|
||||
|
||||
# multiple calls
|
||||
multiple_args_dict = [
|
||||
{
|
||||
"city": "Dallas",
|
||||
"state": "TX",
|
||||
"unit": "fahrenheit",
|
||||
"sub_dict": {"foo": "bar", "inner": {"x": 1, "y": 2}},
|
||||
},
|
||||
{},
|
||||
{"a": 0},
|
||||
{"a": 1, "b": "c"},
|
||||
]
|
||||
names = ["get_current_weather", "get_current_weather_2", "random", "random_2"]
|
||||
|
||||
model_output = "".join(
|
||||
[
|
||||
f"{parser.bot_token}{name}{json.dumps(args)}"
|
||||
for name, args in zip(names, multiple_args_dict)
|
||||
]
|
||||
)
|
||||
|
||||
parsed = parser.extract_tool_calls(model_output, None)
|
||||
|
||||
# Assertions: the tool call is detected and the full nested JSON is parsed
|
||||
# without truncation.
|
||||
assert parsed.tools_called
|
||||
assert len(parsed.tool_calls) == len(multiple_args_dict)
|
||||
|
||||
for i, tool_call in enumerate(parsed.tool_calls):
|
||||
assert MistralToolCall.is_valid_id(tool_call.id)
|
||||
assert tool_call.function.name == names[i]
|
||||
assert json.loads(tool_call.function.arguments) == multiple_args_dict[i]
|
||||
# No additional content outside the tool call should be returned.
|
||||
assert parsed.content is None
|
||||
|
||||
@ -22,10 +22,10 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
|
||||
from vllm.multimodal.cache import MultiModalProcessorOnlyCache
|
||||
from vllm.multimodal.inputs import MultiModalInputs
|
||||
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
|
||||
from vllm.tokenizers import MistralTokenizer
|
||||
from vllm.transformers_utils.tokenizer import (
|
||||
from vllm.tokenizers import (
|
||||
MistralTokenizer,
|
||||
TokenizerLike,
|
||||
cached_tokenizer_from_config,
|
||||
encode_tokens,
|
||||
)
|
||||
|
||||
from ....multimodal.utils import random_audio, random_image, random_video
|
||||
@ -154,7 +154,7 @@ def get_text_token_prompts(
|
||||
mm_data: MultiModalDataDict,
|
||||
):
|
||||
dummy_inputs = processor.dummy_inputs
|
||||
tokenizer = processor.info.get_tokenizer()
|
||||
tokenizer: TokenizerLike = processor.info.get_tokenizer()
|
||||
model_config = processor.info.ctx.model_config
|
||||
|
||||
model_type = model_config.hf_config.model_type
|
||||
@ -191,10 +191,9 @@ def get_text_token_prompts(
|
||||
assert isinstance(inputs.prompt, str)
|
||||
|
||||
text_prompt = inputs.prompt
|
||||
token_prompt = encode_tokens(
|
||||
tokenizer,
|
||||
token_prompt = tokenizer.encode(
|
||||
text_prompt,
|
||||
add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type),
|
||||
add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type, True),
|
||||
)
|
||||
|
||||
return text_prompt, token_prompt
|
||||
|
||||
@ -5,7 +5,6 @@
|
||||
import pytest
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.transformers_utils.tokenizer import encode_tokens
|
||||
|
||||
from ....conftest import ImageTestAssets
|
||||
from ...utils import build_model_context
|
||||
@ -48,7 +47,7 @@ def test_processor_override(
|
||||
]
|
||||
}
|
||||
if tokenized_prompt:
|
||||
prompt = encode_tokens(tokenizer, prompt)
|
||||
prompt = tokenizer.encode(prompt)
|
||||
|
||||
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
|
||||
mm_data = processed_inputs["mm_kwargs"].get_data()
|
||||
|
||||
@ -31,7 +31,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
|
||||
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
|
||||
from vllm.multimodal.utils import group_mm_kwargs_by_modality
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
||||
from vllm.tokenizers import cached_tokenizer_from_config
|
||||
from vllm.utils.collection_utils import is_list_of
|
||||
from vllm.utils.torch_utils import set_default_torch_dtype
|
||||
|
||||
|
||||
@ -358,6 +358,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
trust_remote_code=True,
|
||||
),
|
||||
"MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"),
|
||||
"MistralLarge3ForCausalLM": _HfExamplesInfo(
|
||||
"mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4", is_available_online=False
|
||||
),
|
||||
"MixtralForCausalLM": _HfExamplesInfo(
|
||||
"mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||
{"tiny": "TitanML/tiny-mixtral"},
|
||||
@ -770,7 +773,13 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
),
|
||||
"PixtralForConditionalGeneration": _HfExamplesInfo(
|
||||
"mistralai/Pixtral-12B-2409",
|
||||
extras={
|
||||
"mistral-large-3": "mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4",
|
||||
"ministral-3": "mistralai/Ministral-3-3B-Instruct-2512",
|
||||
},
|
||||
tokenizer_mode="mistral",
|
||||
# TODO: revert once Mistral-Large-3 and Ministral-3 are publicly available.
|
||||
is_available_online=False,
|
||||
),
|
||||
"QwenVLForConditionalGeneration": _HfExamplesInfo(
|
||||
"Qwen/Qwen-VL",
|
||||
@ -822,7 +831,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
"TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b"),
|
||||
"Tarsier2ForConditionalGeneration": _HfExamplesInfo(
|
||||
"omni-research/Tarsier2-Recap-7b",
|
||||
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
|
||||
hf_overrides={
|
||||
"architectures": ["Tarsier2ForConditionalGeneration"],
|
||||
"model_type": "tarsier2",
|
||||
},
|
||||
),
|
||||
"VoxtralForConditionalGeneration": _HfExamplesInfo(
|
||||
"mistralai/Voxtral-Mini-3B-2507",
|
||||
@ -870,6 +882,11 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
|
||||
use_original_num_layers=True,
|
||||
max_model_len=10240,
|
||||
),
|
||||
"EagleMistralLarge3ForCausalLM": _HfExamplesInfo(
|
||||
"mistralai/Mistral-Large-3-675B-Instruct-2512",
|
||||
speculative_model="mistralai/Mistral-Large-3-675B-Instruct-2512-Eagle",
|
||||
is_available_online=False,
|
||||
),
|
||||
"LlamaForCausalLMEagle3": _HfExamplesInfo(
|
||||
"Qwen/Qwen3-8B",
|
||||
trust_remote_code=True,
|
||||
|
||||
@ -203,7 +203,7 @@ class TestGGUFModelLoader:
|
||||
@patch("vllm.config.model.get_hf_image_processor_config", return_value=None)
|
||||
@patch("vllm.config.model.get_config")
|
||||
@patch("vllm.config.model.is_gguf", return_value=False)
|
||||
@patch("vllm.transformers_utils.utils.check_gguf_file", return_value=False)
|
||||
@patch("vllm.transformers_utils.gguf_utils.check_gguf_file", return_value=False)
|
||||
@patch("os.path.isfile", return_value=False)
|
||||
def test_prepare_weights_invalid_format(
|
||||
self,
|
||||
|
||||
@ -13,7 +13,7 @@ from transformers import PretrainedConfig
|
||||
from vllm.config.model import ModelConfig, ModelDType, RunnerOption
|
||||
from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
|
||||
from vllm.multimodal.processing import InputProcessingContext
|
||||
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
||||
from vllm.tokenizers import cached_tokenizer_from_config
|
||||
|
||||
from .. import ci_envs
|
||||
from .registry import HF_EXAMPLE_MODELS
|
||||
|
||||
@ -6,12 +6,14 @@ from dataclasses import MISSING, Field, asdict, dataclass, field
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
from pydantic import ValidationError
|
||||
|
||||
from vllm.compilation.backends import VllmBackend
|
||||
from vllm.config import (
|
||||
CompilationConfig,
|
||||
ModelConfig,
|
||||
PoolerConfig,
|
||||
SchedulerConfig,
|
||||
VllmConfig,
|
||||
update_config,
|
||||
)
|
||||
@ -716,7 +718,7 @@ def test_is_chunked_prefill_supported(
|
||||
):
|
||||
model_config = ModelConfig(model_id, trust_remote_code=True)
|
||||
assert model_config.attn_type == expected_attn_type
|
||||
with caplog_vllm.at_level(level=logging.DEBUG):
|
||||
with caplog_vllm.at_level(level=logging.DEBUG, logger="vllm"):
|
||||
assert model_config.is_chunked_prefill_supported == expected_result
|
||||
assert reason in caplog_vllm.text
|
||||
|
||||
@ -835,7 +837,7 @@ def test_is_prefix_caching_supported(
|
||||
):
|
||||
model_config = ModelConfig(model_id, trust_remote_code=True)
|
||||
assert model_config.attn_type == expected_attn_type
|
||||
with caplog_vllm.at_level(level=logging.DEBUG):
|
||||
with caplog_vllm.at_level(level=logging.DEBUG, logger="vllm"):
|
||||
assert model_config.is_prefix_caching_supported == expected_result
|
||||
assert reason in caplog_vllm.text
|
||||
|
||||
@ -1095,3 +1097,14 @@ def test_vllm_config_explicit_overrides():
|
||||
# Other fields should still use defaults
|
||||
assert config.compilation_config.mode == CompilationMode.VLLM_COMPILE
|
||||
assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE
|
||||
|
||||
|
||||
def test_scheduler_config_init():
|
||||
with pytest.raises(ValidationError):
|
||||
# Positional InitVars missing
|
||||
# (InitVars cannot have defaults otherwise they will become attributes)
|
||||
SchedulerConfig()
|
||||
|
||||
with pytest.raises(AttributeError):
|
||||
# InitVar does not become an attribute
|
||||
print(SchedulerConfig.default_factory().max_model_len)
|
||||
|
||||
@ -7,7 +7,7 @@ from vllm.config import ModelConfig
|
||||
from vllm.inputs import zip_enc_dec_prompts
|
||||
from vllm.inputs.parse import parse_raw_prompts
|
||||
from vllm.inputs.preprocess import InputPreprocessor
|
||||
from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs
|
||||
from vllm.tokenizers import init_tokenizer_from_config
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
@ -108,7 +108,7 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
|
||||
)
|
||||
def test_preprocessor_always_mm_code_path(model_id, prompt):
|
||||
model_config = ModelConfig(model=model_id)
|
||||
tokenizer = init_tokenizer_from_configs(model_config)
|
||||
tokenizer = init_tokenizer_from_config(model_config)
|
||||
input_preprocessor = InputPreprocessor(model_config, tokenizer)
|
||||
|
||||
# HF processor adds sep token
|
||||
|
||||
@ -57,7 +57,7 @@ def test_default_vllm_root_logger_configuration(monkeypatch):
|
||||
_configure_vllm_root_logger()
|
||||
|
||||
logger = logging.getLogger("vllm")
|
||||
assert logger.level == logging.DEBUG
|
||||
assert logger.level == logging.INFO
|
||||
assert not logger.propagate
|
||||
|
||||
handler = logger.handlers[0]
|
||||
@ -524,7 +524,7 @@ def mp_function(**kwargs):
|
||||
|
||||
|
||||
def test_caplog_mp_fork(caplog_vllm, caplog_mp_fork):
|
||||
with caplog_vllm.at_level(logging.DEBUG), caplog_mp_fork():
|
||||
with caplog_vllm.at_level(logging.DEBUG, logger="vllm"), caplog_mp_fork():
|
||||
import multiprocessing
|
||||
|
||||
ctx = multiprocessing.get_context("fork")
|
||||
|
||||
@ -5,8 +5,7 @@ from typing import _get_protocol_attrs # type: ignore
|
||||
import pytest
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.tokenizers import TokenizerLike, get_tokenizer
|
||||
|
||||
|
||||
def _get_missing_attrs(obj: object, target: type):
|
||||
|
||||
@ -91,6 +91,118 @@ from vllm.tokenizers.mistral import (
|
||||
],
|
||||
),
|
||||
),
|
||||
(
|
||||
{
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What is the current local date and time?",
|
||||
}
|
||||
],
|
||||
"tools": [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"description": "Fetch the current local date and time.",
|
||||
"unsupported_field": False,
|
||||
"name": "get_current_time",
|
||||
"parameters": {},
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"description": "Fetch the current local date and time.",
|
||||
"unsupported_field2": False,
|
||||
"name": "get_current_time",
|
||||
"parameters": {},
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
(
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What is the current local date and time?",
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"description": "Fetch the current local date and time.",
|
||||
"name": "get_current_time",
|
||||
"parameters": {},
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"description": "Fetch the current local date and time.",
|
||||
"name": "get_current_time",
|
||||
"parameters": {},
|
||||
},
|
||||
},
|
||||
],
|
||||
),
|
||||
),
|
||||
(
|
||||
{
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What is the current local date and time?",
|
||||
}
|
||||
],
|
||||
"tools": [
|
||||
{
|
||||
"type": "function",
|
||||
"unsupported_field": False,
|
||||
"function": {
|
||||
"description": "Fetch the current local date and time.",
|
||||
"name": "get_current_time",
|
||||
"parameters": {},
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"unsupported_field2": False,
|
||||
"function": {
|
||||
"description": "Fetch the current local date and time 2.",
|
||||
"name": "get_current_time2",
|
||||
"parameters": {"a": "1"},
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
(
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What is the current local date and time?",
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"description": "Fetch the current local date and time.",
|
||||
"name": "get_current_time",
|
||||
"parameters": {},
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"description": "Fetch the current local date and time 2.",
|
||||
"name": "get_current_time2",
|
||||
"parameters": {"a": "1"},
|
||||
},
|
||||
},
|
||||
],
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_prepare_apply_chat_template_tools_and_messages(
|
||||
@ -1108,13 +1220,6 @@ class TestMistralTokenizer:
|
||||
)
|
||||
== expected_tokens[mistral_tokenizer.is_tekken]
|
||||
)
|
||||
assert (
|
||||
mistral_tokenizer.decode(
|
||||
ids[mistral_tokenizer.is_tekken],
|
||||
skip_special_tokens=skip_special_tokens,
|
||||
)
|
||||
== expected_tokens[mistral_tokenizer.is_tekken]
|
||||
)
|
||||
|
||||
def test_decode_empty(
|
||||
self,
|
||||
@ -1140,6 +1245,45 @@ class TestMistralTokenizer:
|
||||
== "<s>"
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"skip_special_tokens,expected_tokens",
|
||||
(
|
||||
(
|
||||
False,
|
||||
(
|
||||
["<s>[INST]▁Hello▁world▁![/INST]▁Hello</s>"],
|
||||
["<s>[INST]Hello world ![/INST]Hello</s>"],
|
||||
),
|
||||
),
|
||||
(True, (["Hello world ! Hello"], ["Hello world !Hello"])),
|
||||
),
|
||||
)
|
||||
def test_batch_decode(
|
||||
self,
|
||||
mistral_tokenizer: MistralTokenizer,
|
||||
skip_special_tokens: bool,
|
||||
expected_tokens: tuple[str, str],
|
||||
):
|
||||
ids = (
|
||||
[[1, 3, 23325, 2294, 1686, 4, 23325, 2]],
|
||||
[[1, 3, 22177, 4304, 2662, 4, 22177, 2]],
|
||||
)
|
||||
assert (
|
||||
mistral_tokenizer.batch_decode(
|
||||
ids[mistral_tokenizer.is_tekken],
|
||||
skip_special_tokens=skip_special_tokens,
|
||||
)
|
||||
== expected_tokens[mistral_tokenizer.is_tekken]
|
||||
)
|
||||
|
||||
def test_batch_decode_empty(
|
||||
self,
|
||||
mistral_tokenizer: MistralTokenizer,
|
||||
):
|
||||
assert mistral_tokenizer.batch_decode(
|
||||
[[]],
|
||||
) == [""]
|
||||
|
||||
def test_convert_tokens_to_string(self, mistral_tokenizer: MistralTokenizer):
|
||||
tokens = (
|
||||
[
|
||||
|
||||
@ -2,8 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from pathlib import Path
|
||||
|
||||
from vllm.tokenizers import TokenizerLike, TokenizerRegistry
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.tokenizers import TokenizerLike, TokenizerRegistry, get_tokenizer
|
||||
|
||||
|
||||
class TestTokenizer(TokenizerLike):
|
||||
|
||||
@ -6,7 +6,7 @@ import pytest
|
||||
from vllm.entrypoints.openai.tool_parsers.deepseekv31_tool_parser import (
|
||||
DeepSeekV31ToolParser,
|
||||
)
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
|
||||
MODEL = "deepseek-ai/DeepSeek-V3.1"
|
||||
|
||||
|
||||
@ -14,9 +14,8 @@ from vllm.entrypoints.openai.protocol import (
|
||||
ToolCall,
|
||||
)
|
||||
from vllm.entrypoints.openai.tool_parsers.ernie45_tool_parser import Ernie45ToolParser
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.tokenizers import TokenizerLike, get_tokenizer
|
||||
from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
# Use a common model that is likely to be available
|
||||
MODEL = "baidu/ERNIE-4.5-21B-A3B-Thinking"
|
||||
|
||||
@ -10,7 +10,7 @@ from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
|
||||
from vllm.entrypoints.openai.tool_parsers.glm4_moe_tool_parser import (
|
||||
Glm4MoeModelToolParser,
|
||||
)
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
|
||||
@ -10,9 +10,8 @@ from partial_json_parser.core.options import Allow
|
||||
|
||||
from vllm.entrypoints.openai.protocol import DeltaMessage, FunctionCall, ToolCall
|
||||
from vllm.entrypoints.openai.tool_parsers.jamba_tool_parser import JambaToolParser
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.tokenizers import TokenizerLike, get_tokenizer
|
||||
from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
|
||||
@ -8,7 +8,7 @@ import pytest
|
||||
|
||||
from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
|
||||
from vllm.entrypoints.openai.tool_parsers.kimi_k2_tool_parser import KimiK2ToolParser
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user