diff --git a/.buildkite/performance-benchmarks/README.md b/.buildkite/performance-benchmarks/README.md
index 6d494f64f14fa..015f48c2520d6 100644
--- a/.buildkite/performance-benchmarks/README.md
+++ b/.buildkite/performance-benchmarks/README.md
@@ -108,6 +108,65 @@ The number of this test is less stable compared to the delay and latency benchma
WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
+#### Default Parameters Field
+
+We can specify default parameters in a JSON field with key `defaults`. Parameters defined in the field are applied globally to all serving tests, and can be overridden in test case fields. Here is an example:
+
+
+ An example of the default parameters field
+
+```json
+{
+ "defaults": {
+ "qps_list": [
+ "inf"
+ ],
+ "server_environment_variables": {
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
+ },
+ "server_parameters": {
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "block_size": 128,
+ "disable_log_stats": "",
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "num_prompts": 200,
+ "ignore-eos": ""
+ }
+ },
+ "tests": [
+ {
+ "test_name": "serving_llama3B_tp2_random_128_128",
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.2-3B-Instruct",
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.2-3B-Instruct"
+ }
+ },
+ {
+ "test_name": "serving_qwen3_tp4_random_128_128",
+ "server_parameters": {
+ "model": "Qwen/Qwen3-14B",
+ "tensor_parallel_size": 4
+ },
+ "client_parameters": {
+ "model": "Qwen/Qwen3-14B"
+ }
+ }
+ ]
+}
+```
+
+
+
### Visualizing the results
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
diff --git a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
index 99a5a5e334f8e..34ceefe0996f2 100644
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -110,7 +110,8 @@ json2envs() {
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
- timeout 1200 bash -c '
+ local timeout_val="1200"
+ timeout "$timeout_val" bash -c '
until curl -X POST localhost:8000/v1/completions; do
sleep 1
done' && return 0 || return 1
@@ -316,12 +317,44 @@ run_throughput_tests() {
run_serving_tests() {
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
+ #
+ # Supported JSON formats:
+ # 1) Plain format: top-level array
+ # [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+ #
+ # 2) Default parameters field + plain format tests
+ # {
+ # "defaults": { ... },
+ # "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+ # }
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
- jq -c '.[]' "$serving_test_file" | while read -r params; do
+ jq -c '
+ if type == "array" then
+ # Plain format: test cases array
+ .[]
+ elif (type == "object" and has("tests")) then
+ # merge the default parameters into each test cases
+ . as $root
+ | ($root.defaults // {}) as $d
+ | ($root.tests // [])[]
+ # default qps / max_concurrency from defaults if missing
+ | .qps_list = (.qps_list // $d.qps_list)
+ | .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list)
+ # merge envs / params: test overrides defaults
+ | .server_environment_variables =
+ (($d.server_environment_variables // {}) + (.server_environment_variables // {}))
+ | .server_parameters =
+ (($d.server_parameters // {}) + (.server_parameters // {}))
+ | .client_parameters =
+ (($d.client_parameters // {}) + (.client_parameters // {}))
+ else
+ error("Unsupported serving test file format: must be array or object with .tests")
+ end
+ ' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^serving_ ]]; then
@@ -335,20 +368,25 @@ run_serving_tests() {
continue
fi
- # get client and server arguments
+ # get client and server arguments (after merging the default parameters)
server_params=$(echo "$params" | jq -r '.server_parameters')
server_envs=$(echo "$params" | jq -r '.server_environment_variables')
client_params=$(echo "$params" | jq -r '.client_parameters')
+
server_args=$(json2args "$server_params")
server_envs=$(json2envs "$server_envs")
client_args=$(json2args "$client_params")
+
+ # qps_list
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
+
+ # max_concurrency_list (fallback to num_prompts if missing)
max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
- num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
- max_concurrency_list="[$num_prompts]"
+ num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
+ max_concurrency_list="[$num_prompts]"
fi
max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
echo "Running over max concurrency list $max_concurrency_list"
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json
deleted file mode 100644
index f758097e098e4..0000000000000
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json
+++ /dev/null
@@ -1,610 +0,0 @@
-[
- {
- "test_name": "serving_llama8B_bf16_tp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- }
-]
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json
deleted file mode 100644
index 0b1a42e790255..0000000000000
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json
+++ /dev/null
@@ -1,1023 +0,0 @@
-[
- {
- "test_name": "serving_llama8B_bf16_pp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_pp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_pp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_pp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_pp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_pp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- }
-]
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
index f792956f39472..8f7200862d20c 100644
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
@@ -1,276 +1,246 @@
-[
- {
- "test_name": "serving_llama8B_tp1_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 32
- }
+{
+ "defaults": {
+ "qps_list": [
+ "inf"
+ ],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
},
- {
- "test_name": "serving_llama8B_tp2_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 32
- }
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
},
- {
- "test_name": "serving_llama8B_tp1_random_128_128",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 32
- }
- },
- {
- "test_name": "serving_llama8B_tp2_random_128_128",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 32
- }
- },
- {
- "test_name": "serving_llama8B_tp1_random_128_2048",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 2048,
- "ignore-eos": "",
- "num_prompts": 32
- }
- },
- {
- "test_name": "serving_llama8B_tp2_random_128_2048",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 2048,
- "ignore-eos": "",
- "num_prompts": 32
- }
- },
- {
- "test_name": "serving_llama8B_tp1_random_2048_128",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 2048,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 32
- }
- },
- {
- "test_name": "serving_llama8B_tp2_random_2048_128",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 2048,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 32
- }
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "ignore-eos": "",
+ "num_prompts": 200
}
-]
+ },
+ "tests": [
+ {
+ "test_name": "serving_llama8B_tp1_sharegpt",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_sharegpt",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp1_random_128_128",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_random_128_128",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp4_random_128_128",
+ "server_parameters": {
+ "tensor_parallel_size": 4
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp1_random_128_2048",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 2048
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_random_128_2048",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 2048
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp4_random_128_2048",
+ "server_parameters": {
+ "tensor_parallel_size": 4
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 2048
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp1_random_2048_128",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 2048,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_random_2048_128",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 2048,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp4_random_2048_128",
+ "server_parameters": {
+ "tensor_parallel_size": 4
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 2048,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama3B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.2-3B-Instruct",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.2-3B-Instruct",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_granite2B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "ibm-granite/granite-3.2-2b-instruct",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "ibm-granite/granite-3.2-2b-instruct",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_qwen1.7B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "Qwen/Qwen3-1.7B",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "Qwen/Qwen3-1.7B",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_qwen4B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "Qwen/Qwen3-4B",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "Qwen/Qwen3-4B",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_qwen8B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "Qwen/Qwen3-8B",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "Qwen/Qwen3-8B",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_glm9B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "zai-org/glm-4-9b-hf",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "zai-org/glm-4-9b-hf",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_gemma7B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "google/gemma-7b",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "google/gemma-7b",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ }
+ ]
+}
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 38c400ba1faf5..fbfc923998f89 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -8,7 +8,7 @@ steps:
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
@@ -30,19 +30,6 @@ steps:
DOCKER_BUILDKIT: "1"
# x86 + CUDA builds
- - label: "Build wheel - CUDA 12.8"
- depends_on: ~
- id: build-wheel-cuda-12-8
- agents:
- queue: cpu_queue_postmerge
- commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- - "mkdir artifacts"
- - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-wheels.sh"
- env:
- DOCKER_BUILDKIT: "1"
-
- label: "Build wheel - CUDA 12.9"
depends_on: ~
id: build-wheel-cuda-12-9
@@ -109,7 +96,6 @@ steps:
- label: "Annotate release workflow"
depends_on:
- create-multi-arch-manifest
- - build-wheel-cuda-12-8
id: annotate-release-workflow
agents:
queue: cpu_queue_postmerge
diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py
new file mode 100644
index 0000000000000..8d09ba178db7b
--- /dev/null
+++ b/.buildkite/scripts/generate-nightly-index.py
@@ -0,0 +1,369 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# do not complain about line length (for docstring)
+# ruff: noqa: E501
+
+import argparse
+import json
+import re
+import sys
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Any
+from urllib.parse import quote
+
+if not sys.version_info >= (3, 12):
+ raise RuntimeError("This script requires Python 3.12 or higher.")
+
+INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
+<html>
+<body>
+{items}
+</body>
+</html>
+"""
+
+
+@dataclass
+class WheelFileInfo:
+ package_name: str
+ version: str
+ build_tag: str | None
+ python_tag: str
+ abi_tag: str
+ platform_tag: str
+ variant: str | None
+ filename: str
+
+
+def parse_from_filename(file: str) -> WheelFileInfo:
+ """
+ Parse wheel file name to extract metadata.
+
+ The format of wheel names:
+ {package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl
+    All versions could contain a variant suffix like '+cu129', '.cpu' or '.rocm' (or none at all).
+ Example:
+ vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl
+ vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl
+ vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl
+ vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl
+ """
+    wheel_file_re = re.compile(
+        r"^(?P<package_name>.+)-(?P<version>[^-]+?)(-(?P<build_tag>[^-]+))?-(?P<python_tag>[^-]+)-(?P<abi_tag>[^-]+)-(?P<platform_tag>[^-]+)\.whl$"
+    )
+ match = wheel_file_re.match(file)
+ if not match:
+ raise ValueError(f"Invalid wheel file name: {file}")
+
+ package_name = match.group("package_name")
+ version = match.group("version")
+ build_tag = match.group("build_tag")
+ python_tag = match.group("python_tag")
+ abi_tag = match.group("abi_tag")
+ platform_tag = match.group("platform_tag")
+
+ # extract variant from version
+ variant = None
+ if "dev" in version:
+ ver_after_dev = version.split("dev")[-1]
+ if "." in ver_after_dev:
+ variant = ver_after_dev.split(".")[-1]
+ version = version.removesuffix("." + variant)
+ else:
+ if "+" in version:
+ version, variant = version.split("+")
+
+ return WheelFileInfo(
+ package_name=package_name,
+ version=version,
+ build_tag=build_tag,
+ python_tag=python_tag,
+ abi_tag=abi_tag,
+ platform_tag=platform_tag,
+ variant=variant,
+ filename=file,
+ )
+
+
+def generate_project_list(subdir_names: list[str]) -> str:
+ """
+ Generate project list HTML content linking to each project & variant sub-directory.
+ """
+ href_tags = []
+ for name in sorted(subdir_names):
+ name = name.strip("/").strip(".")
+        href_tags.append(f'    <a href="{name}/">{name}/</a><br/>')
+ return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
+
+
+def generate_package_index_and_metadata(
+ wheel_files: list[WheelFileInfo], wheel_base_dir: Path, index_base_dir: Path
+) -> tuple[str, str]:
+ """
+ Generate package index HTML content for a specific package, linking to actual wheel files.
+ """
+ href_tags = []
+ metadata = []
+ for file in sorted(wheel_files, key=lambda x: x.filename):
+ relative_path = (
+ wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename
+ )
+ # handle with '+' in URL, and avoid double-encoding '/' and already-encoded '%2B'
+ # NOTE: this is AWS S3 specific behavior!
+ file_path_quoted = quote(relative_path.as_posix(), safe=":%/")
+        href_tags.append(f'    <a href="{file_path_quoted}">{file.filename}</a><br/>')
+ file_meta = asdict(file)
+ file_meta["path"] = file_path_quoted
+ metadata.append(file_meta)
+ index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
+ metadata_str = json.dumps(metadata, indent=2)
+ return index_str, metadata_str
+
+
+def generate_index_and_metadata(
+ whl_files: list[str],
+ wheel_base_dir: Path,
+ index_base_dir: Path,
+ default_variant: str | None = None,
+ alias_to_default: str | None = None,
+):
+ """
+ Generate index for all wheel files.
+
+ Args:
+ whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`).
+ wheel_base_dir (Path): Base directory for wheel files.
+ index_base_dir (Path): Base directory to store index files.
+ default_variant (str | None): The default variant name, if any.
+ alias_to_default (str | None): Alias variant name for the default variant, if any.
+
+ First, parse all wheel files to extract metadata.
+ We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
+ The index for the default variant (if any) is generated in the root index directory.
+
+ If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
+ is purely a copy of the corresponding variant index, with only the links adjusted.
+ Otherwise, all wheels without variant suffixes are treated as the default variant.
+
+ If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
+ as the default variant index, but the links are adjusted accordingly.
+
+ Index directory structure:
+ index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
+ index.html # project list, linking to "vllm/" and other packages, and all variant sub-directories
+ vllm/
+ index.html # package index, pointing to actual files in wheel_base_dir (relative path)
+ metadata.json # machine-readable metadata for all wheels in this package
+ cpu/ # cpu variant sub-directory
+ index.html
+ vllm/
+ index.html
+ metadata.json
+ cu129/ # cu129 is actually the alias to default variant
+ index.html
+ vllm/
+ index.html
+ metadata.json
+ cu130/ # cu130 variant sub-directory
+ index.html
+ vllm/
+ index.html
+ metadata.json
+ ...
+
+ metadata.json stores a dump of all wheel files' metadata in a machine-readable format:
+ [
+ {
+ "package_name": "vllm",
+ "version": "0.10.2rc2",
+ "build_tag": null,
+ "python_tag": "cp38",
+ "abi_tag": "abi3",
+ "platform_tag": "manylinux2014_aarch64",
+ "variant": "cu129",
+ "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl",
+ "path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl" # to be concatenated with the directory URL and URL-encoded
+ },
+ ...
+ ]
+ """
+
+ parsed_files = [parse_from_filename(f) for f in whl_files]
+
+ if not parsed_files:
+ print("No wheel files found, skipping index generation.")
+ return
+
+ # Group by variant
+ variant_to_files: dict[str, list[WheelFileInfo]] = {}
+ for file in parsed_files:
+ variant = file.variant or "default"
+ if variant not in variant_to_files:
+ variant_to_files[variant] = []
+ variant_to_files[variant].append(file)
+
+ print(f"Found variants: {list(variant_to_files.keys())}")
+
+ # sanity check for default variant
+ if default_variant:
+ if "default" in variant_to_files:
+ raise ValueError(
+ "All wheel files must have variant suffixes when `default_variant` is specified."
+ )
+ if default_variant not in variant_to_files:
+ raise ValueError(
+ f"Default variant '{default_variant}' not found among wheel files."
+ )
+
+ if alias_to_default:
+ if "default" not in variant_to_files:
+ # e.g. only some wheels are uploaded to S3 currently
+ print(
+ "[WARN] Alias to default variant specified, but no default variant found."
+ )
+ elif alias_to_default in variant_to_files:
+ raise ValueError(
+ f"Alias variant name '{alias_to_default}' already exists among wheel files."
+ )
+ else:
+ variant_to_files[alias_to_default] = variant_to_files["default"].copy()
+ print(f"Alias variant '{alias_to_default}' created for default variant.")
+
+ # Generate index for each variant
+ subdir_names = set()
+ for variant, files in variant_to_files.items():
+ if variant == "default":
+ variant_dir = index_base_dir
+ else:
+ variant_dir = index_base_dir / variant
+ subdir_names.add(variant)
+
+ variant_dir.mkdir(parents=True, exist_ok=True)
+
+ # gather all package names in this variant
+ packages = set(f.package_name for f in files)
+ if variant == "default":
+ # these packages should also appear in the "project list"
+ # generate after all variants are processed
+ subdir_names = subdir_names.union(packages)
+ else:
+ # generate project list for this variant directly
+ project_list_str = generate_project_list(sorted(packages))
+ with open(variant_dir / "index.html", "w") as f:
+ f.write(project_list_str)
+
+ for package in packages:
+ # filter files belonging to this package only
+ package_files = [f for f in files if f.package_name == package]
+ package_dir = variant_dir / package
+ package_dir.mkdir(parents=True, exist_ok=True)
+ index_str, metadata_str = generate_package_index_and_metadata(
+ package_files, wheel_base_dir, package_dir
+ )
+ with open(package_dir / "index.html", "w") as f:
+ f.write(index_str)
+ with open(package_dir / "metadata.json", "w") as f:
+ f.write(metadata_str)
+
+ # Generate top-level project list index
+ project_list_str = generate_project_list(sorted(subdir_names))
+ with open(index_base_dir / "index.html", "w") as f:
+ f.write(project_list_str)
+
+
+if __name__ == "__main__":
+ """
+ Arguments:
+ --version : version string for the current build (e.g., commit hash)
+ --current-objects : path to JSON file containing current S3 objects listing in this version directory
+ --output-dir : directory to store generated index files
+ --alias-to-default : (optional) alias variant name for the default variant
+ """
+
+ parser = argparse.ArgumentParser(
+ description="Process nightly build wheel files to generate indices."
+ )
+ parser.add_argument(
+ "--version",
+ type=str,
+ required=True,
+ help="Version string for the current build (e.g., commit hash)",
+ )
+ parser.add_argument(
+ "--current-objects",
+ type=str,
+ required=True,
+ help="Path to JSON file containing current S3 objects listing in this version directory",
+ )
+ parser.add_argument(
+ "--output-dir",
+ type=str,
+ required=True,
+ help="Directory to store generated index files",
+ )
+ parser.add_argument(
+ "--alias-to-default",
+ type=str,
+ default=None,
+ help="Alias variant name for the default variant",
+ )
+
+ args = parser.parse_args()
+
+ version = args.version
+ if "/" in version or "\\" in version:
+ raise ValueError("Version string must not contain slashes.")
+ current_objects_path = Path(args.current_objects)
+ output_dir = Path(args.output_dir)
+ if not output_dir.exists():
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ # Read current objects JSON
+ with open(current_objects_path) as f:
+ current_objects: dict[str, list[dict[str, Any]]] = json.load(f)
+
+    # current_objects looks like the output of the S3 list_objects_v2 API:
+ """
+ "Contents": [
+ {
+ "Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl",
+ "LastModified": "2025-11-28T14:00:32+00:00",
+ "ETag": "\"37a38339c7cdb61ca737021b968075df-52\"",
+ "ChecksumAlgorithm": [
+ "CRC64NVME"
+ ],
+ "ChecksumType": "FULL_OBJECT",
+ "Size": 435649349,
+ "StorageClass": "STANDARD"
+ },
+ ...
+ ]
+ """
+
+ # Extract wheel file keys
+ wheel_files = []
+ for item in current_objects.get("Contents", []):
+ key: str = item["Key"]
+ if key.endswith(".whl"):
+ wheel_files.append(key.split("/")[-1]) # only the filename is used
+
+ print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
+
+ # Generate index and metadata, assuming wheels and indices are stored as:
+    # s3://vllm-wheels/{version}/          <- wheel files
+    # s3://vllm-wheels/<index location>/   <- generated index files
+ wheel_base_dir = Path(output_dir).parent / version
+ index_base_dir = Path(output_dir)
+
+ generate_index_and_metadata(
+ whl_files=wheel_files,
+ wheel_base_dir=wheel_base_dir,
+ index_base_dir=index_base_dir,
+ default_variant=None,
+ alias_to_default=args.alias_to_default,
+ )
+ print(f"Successfully generated index and metadata in {output_dir}")
diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh
index 945c5e48c0090..2eaa91c04086c 100644
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@@ -2,6 +2,28 @@
set -ex
+# ======== part 0: setup ========
+
+BUCKET="vllm-wheels"
+INDICES_OUTPUT_DIR="indices"
+DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py
+PYTHON=${PYTHON_PROG:=python3} # try to read from env var, otherwise use python3
+SUBPATH=$BUILDKITE_COMMIT
+S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
+
+# detect if python3.12+ is available
+has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)")
+if [[ "$has_new_python" -eq 0 ]]; then
+ # use new python from docker
+ docker pull python:3-slim
+ PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3"
+fi
+
+echo "Using python interpreter: $PYTHON"
+echo "Python version: $($PYTHON --version)"
+
+# ========= part 1: collect, rename & upload the wheel ==========
+
# Assume wheels are in artifacts/dist/*.whl
wheel_files=(artifacts/dist/*.whl)
@@ -10,74 +32,69 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
exit 1
fi
-
-# Get the single wheel file
wheel="${wheel_files[0]}"
-# Detect architecture and rename 'linux' to appropriate manylinux version
-arch=$(uname -m)
-if [[ $arch == "x86_64" ]]; then
- manylinux_version="manylinux1"
-elif [[ $arch == "aarch64" ]]; then
- manylinux_version="manylinux2014"
-else
- echo "Warning: Unknown architecture $arch, using manylinux1 as default"
- manylinux_version="manylinux1"
-fi
+# current build image uses ubuntu 20.04, which corresponds to manylinux_2_31
+# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
+manylinux_version="manylinux_2_31"
# Rename 'linux' to the appropriate manylinux version in the wheel filename
+if [[ "$wheel" != *"linux"* ]]; then
+ echo "Error: Wheel filename does not contain 'linux': $wheel"
+ exit 1
+fi
new_wheel="${wheel/linux/$manylinux_version}"
mv -- "$wheel" "$new_wheel"
wheel="$new_wheel"
+echo "Renamed wheel to: $wheel"
# Extract the version from the wheel
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
-echo "Version: $version"
+echo "Version in wheel: $version"
+pure_version="${version%%+*}"
+echo "Pure version (without variant): $pure_version"
-normal_wheel="$wheel" # Save the original wheel filename
+# copy wheel to its own bucket
+aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
-# If the version contains "dev", rename it to v1.0.0.dev for consistency
-if [[ $version == *dev* ]]; then
- suffix="${version##*.}"
- if [[ $suffix == cu* ]]; then
- new_version="1.0.0.dev+${suffix}"
- else
- new_version="1.0.0.dev"
- fi
- new_wheel="${wheel/$version/$new_version}"
- # use cp to keep both files in the artifacts directory
- cp -- "$wheel" "$new_wheel"
- wheel="$new_wheel"
- version="$new_version"
-fi
+# ========= part 2: generate and upload indices ==========
+# generate indices for all existing wheels in the commit directory
+# this script might be run multiple times if there are multiple variants being built
+# so we need to guarantee there is little chance for "TOCTOU" issues
+# i.e., one process is generating indices while another is uploading a new wheel
+# so we need to ensure no time-consuming operations happen below
-# Upload the wheel to S3
-python3 .buildkite/generate_index.py --wheel "$normal_wheel"
+# list all wheels in the commit directory
+echo "Existing wheels on S3:"
+aws s3 ls "$S3_COMMIT_PREFIX"
+obj_json="objects.json"
+aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
+mkdir -p "$INDICES_OUTPUT_DIR"
-# generate index for this commit
-aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-
-if [[ $normal_wheel == *"cu129"* ]]; then
- # only upload index.html for cu129 wheels (default wheels) as it
- # is available on both x86 and arm64
- aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
- aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
+# call script to generate indices for all existing wheels
+# these indices use relative paths that work as long as they sit next to the wheel directory in s3
+# i.e., the wheels are always in s3://vllm-wheels/<commit>/
+# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
+if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
+ alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
else
- echo "Skipping index files for non-cu129 wheels"
+ alias_arg=""
fi
-# generate index for nightly
-aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
-aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
+$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg
-if [[ $normal_wheel == *"cu129"* ]]; then
- # only upload index.html for cu129 wheels (default wheels) as it
- # is available on both x86 and arm64
- aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
-else
- echo "Skipping index files for non-cu129 wheels"
+# copy indices to /<commit>/ unconditionally
+echo "Uploading indices to $S3_COMMIT_PREFIX"
+aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
+
+# copy to /nightly/ only if it is on the main branch and not a PR
+if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
+ echo "Uploading indices to overwrite /nightly/"
+ aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
fi
-aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
-aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
+# copy to /<version>/ only if it does not have "dev" in the version
+if [[ "$version" != *"dev"* ]]; then
+ echo "Uploading indices to overwrite /$pure_version/"
+ aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
+fi
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index d5d4043a1d5bc..ee4fdebae5675 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -715,6 +715,7 @@ steps:
# we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0
+ - uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: LM Eval Small Models # 15min
@@ -934,6 +935,18 @@ steps:
commands:
- pytest -v -s models/language/pooling_mteb_test
+- label: Multi-Modal Processor Test (CPU)
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ no_gpu: true
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
+
- label: Multi-Modal Processor Test # 44min
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
@@ -1472,14 +1485,14 @@ steps:
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- - pytest -v -s tests/compile/distributed/test_async_tp.py
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
#- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
- - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- - pytest -v -s tests/distributed/test_sequence_parallel.py
+ - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+ - HIP_VISIBLE_DEVICES=0,1 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test #####
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 0e715a719d27d..52c848c784e53 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -215,7 +215,6 @@ steps:
timeout_in_minutes: 10
gpu: h100
num_gpus: 8
- optional: true
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- examples/offline_inference/torchrun_dp_example.py
@@ -391,20 +390,24 @@ steps:
- examples/
commands:
- pip install tensorizer # for tensorizer test
+ # for basic
+ - python3 offline_inference/basic/chat.py
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- - python3 offline_inference/basic/chat.py
- - python3 offline_inference/prefix_caching.py
- - python3 offline_inference/llm_engine_example.py
- - python3 offline_inference/audio_language.py --seed 0
- - python3 offline_inference/vision_language.py --seed 0
- - python3 offline_inference/vision_language_pooling.py --seed 0
- - python3 offline_inference/vision_language_multi_image.py --seed 0
- - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
+ # for multi-modal models
+ - python3 offline_inference/audio_language.py --seed 0
+ - python3 offline_inference/vision_language.py --seed 0
+ - python3 offline_inference/vision_language_multi_image.py --seed 0
+ - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+ # for pooling models
+ - python3 pooling/pooling/vision_language_pooling.py --seed 0
+ # for features demo
+ - python3 offline_inference/prefix_caching.py
+ - python3 offline_inference/llm_engine_example.py
+ - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@@ -1370,4 +1373,4 @@ steps:
num_gpus: 2
working_dir: "/vllm-workspace"
commands:
- - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
\ No newline at end of file
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index ecb10d1a450f3..d6447649cd89a 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -146,10 +146,10 @@ mkdocs.yaml @hmellor
/requirements/kv_connectors.txt @NickLucche
# Pooling models
-/examples/*/pooling/ @noooop
+/examples/pooling @noooop
/tests/models/*/pooling* @noooop
/tests/entrypoints/pooling @noooop
-/vllm/entrypoints/pooling @aarnphm @chaunceyjiang @noooop
+/vllm/entrypoints/pooling @noooop
/vllm/config/pooler.py @noooop
/vllm/pooling_params.py @noooop
/vllm/model_executor/layers/pooler.py @noooop
diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml
index 861290ea43c87..56fbe5ca704a1 100644
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@@ -16,7 +16,7 @@ jobs:
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Set up Python
- uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+ uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with:
python-version: '3.12'
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index d5e70f30ef638..a03b979ad761d 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -17,7 +17,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+ - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with:
python-version: "3.12"
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py
index dedb564fffac8..cac401456b62a 100644
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@@ -108,7 +108,10 @@ def benchmark_batched_propose(args):
device_config=DeviceConfig(device=current_platform.device_type),
parallel_config=ParallelConfig(),
load_config=LoadConfig(),
- scheduler_config=SchedulerConfig(),
+ scheduler_config=SchedulerConfig(
+ max_model_len=model_config.max_model_len,
+ is_encoder_decoder=model_config.is_encoder_decoder,
+ ),
)
# monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index 28fc383a318dd..e6391134ff932 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -40,7 +40,7 @@ from vllm.engine.arg_utils import EngineArgs
from vllm.utils.argparse_utils import FlexibleArgumentParser
try:
- from vllm.transformers_utils.tokenizer import get_tokenizer
+ from vllm.tokenizers import get_tokenizer
except ImportError:
from backend_request_func import get_tokenizer
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index 55001cf3722a0..df122b4c5e8db 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -46,7 +46,7 @@ from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
try:
- from vllm.transformers_utils.tokenizer import get_tokenizer
+ from vllm.tokenizers import get_tokenizer
except ImportError:
from backend_request_func import get_tokenizer
diff --git a/csrc/moe/dynamic_4bit_int_moe_cpu.cpp b/csrc/moe/dynamic_4bit_int_moe_cpu.cpp
index df47bb8dd1d7d..58dc402016881 100644
--- a/csrc/moe/dynamic_4bit_int_moe_cpu.cpp
+++ b/csrc/moe/dynamic_4bit_int_moe_cpu.cpp
@@ -93,16 +93,16 @@ torch::Tensor dynamic_4bit_int_moe_cpu(
}
auto Y_all = at::empty({offsets[E], H}, x_c.options());
- at::parallel_for(0, E, 1, [&](int64_t e_begin, int64_t e_end) {
+ at::parallel_for(0, offsets[E], 0, [&](int64_t idx_begin, int64_t idx_end) {
c10::InferenceMode guard;
- for (int64_t e = e_begin; e < e_end; ++e) {
- const int64_t te = counts[e];
- if (te == 0) {
+ for (int64_t e = 0; e < E; ++e) {
+ int64_t start = std::max(offsets[e], idx_begin);
+ int64_t end = std::min(offsets[e + 1], idx_end);
+ int64_t te = end - start;
+ if (te <= 0) {
continue;
}
- const int64_t start = offsets[e];
-
auto x_e = X_all.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);
auto w13_e = w13_packed.select(/*dim=*/0, e);
diff --git a/docker/Dockerfile b/docker/Dockerfile
index eb7c105071c00..006481b23cb9f 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -364,7 +364,12 @@ RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && \
cuda-cudart-${CUDA_VERSION_DASH} \
cuda-nvrtc-${CUDA_VERSION_DASH} \
cuda-cuobjdump-${CUDA_VERSION_DASH} \
- libcublas-${CUDA_VERSION_DASH} && \
+ # https://github.com/vllm-project/vllm/issues/29590
+ libcurand-dev-${CUDA_VERSION_DASH} \
+ libcublas-${CUDA_VERSION_DASH} \
+ # Fixes nccl_allocator requiring nccl.h at runtime
+ # https://github.com/vllm-project/vllm/blob/1336a1ea244fa8bfd7e72751cabbdb5b68a0c11a/vllm/distributed/device_communicators/pynccl_allocator.py#L22
+ libnccl-dev && \
rm -rf /var/lib/apt/lists/*
ARG PIP_INDEX_URL UV_INDEX_URL
diff --git a/docs/.nav.yml b/docs/.nav.yml
index d30c0f12eba4c..aa98ad52be215 100644
--- a/docs/.nav.yml
+++ b/docs/.nav.yml
@@ -5,11 +5,7 @@ nav:
- Getting Started:
- getting_started/quickstart.md
- getting_started/installation
- - Examples:
- - examples/README.md
- - Offline Inference: examples/offline_inference
- - Online Serving: examples/online_serving
- - Others: examples/others
+ - Examples: examples
- General:
- usage/v1_guide.md
- usage/*
diff --git a/docs/design/io_processor_plugins.md b/docs/design/io_processor_plugins.md
index b4a30cda35a01..5a86940fa9f13 100644
--- a/docs/design/io_processor_plugins.md
+++ b/docs/design/io_processor_plugins.md
@@ -79,7 +79,7 @@ The `post_process*` methods take `PoolingRequestOutput` objects as input and gen
The `validate_or_generate_params` method is used for validating with the plugin any `SamplingParameters`/`PoolingParameters` received with the user request, or to generate new ones if none are specified. The function always returns the validated/generated parameters.
The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/pooling` serving endpoint is available here [vllm/entrypoints/openai/serving_pooling.py](../../vllm/entrypoints/pooling/pooling/serving.py).
-An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). Please, also refer to our online ([examples/online_serving/pooling/prithvi_geospatial_mae.py](../../examples/online_serving/pooling/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py)) inference examples.
+An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). Please, also refer to our online ([examples/pooling/plugin/prithvi_geospatial_mae_client.py](../../examples/pooling/plugin/prithvi_geospatial_mae_client.py)) and offline ([examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py](../../examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py)) inference examples.
## Using an IO Processor plugin
diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md
index d1beab7855b18..4b68cb4811789 100644
--- a/docs/getting_started/installation/cpu.md
+++ b/docs/getting_started/installation/cpu.md
@@ -46,10 +46,23 @@ vLLM is a Python library that supports the following CPU variants. Select your C
### Pre-built wheels
-Currently, there are no pre-built CPU wheels.
+Please refer to the instructions for [pre-built wheels on GPU](./gpu.md#pre-built-wheels).
+
+When specifying the index URL, please make sure to use the `cpu` variant subdirectory.
+For example, the nightly build index is: `https://wheels.vllm.ai/nightly/cpu/`.
### Build wheel from source
+#### Set up using Python-only build (without compilation) {#python-only-build}
+
+Please refer to the instructions for [Python-only build on GPU](./gpu.md#python-only-build), and replace the build commands with:
+
+```bash
+VLLM_USE_PRECOMPILED=1 VLLM_PRECOMPILED_WHEEL_VARIANT=cpu VLLM_TARGET_DEVICE=cpu uv pip install --editable .
+```
+
+#### Full build (with compilation) {#full-build}
+
=== "Intel/AMD x86"
--8<-- "docs/getting_started/installation/cpu.x86.inc.md:build-wheel-from-source"
@@ -125,6 +138,35 @@ vllm serve facebook/opt-125m --dtype=bfloat16
Note, it is recommended to manually reserve 1 CPU for vLLM front-end process when `world_size == 1`.
+### What are supported models on CPU?
+
+For the full and up-to-date list of models validated on CPU platforms, please see the official documentation: [Supported Models on CPU](https://docs.vllm.ai/en/latest/models/hardware_supported_models/cpu)
+
+### How to find benchmark configuration examples for supported CPU models?
+
+For any model listed under [Supported Models on CPU](https://docs.vllm.ai/en/latest/models/hardware_supported_models/cpu), optimized runtime configurations are provided in the vLLM Benchmark Suite’s CPU test cases, defined in [cpu test cases](https://github.com/vllm-project/vllm/blob/main/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json).
+For details on how these optimized configurations are determined, see: [performance-benchmark-details](https://github.com/vllm-project/vllm/tree/main/.buildkite/performance-benchmarks#performance-benchmark-details).
+To benchmark the supported models using these optimized settings, follow the steps in [running vLLM Benchmark Suite manually](https://docs.vllm.ai/en/latest/contributing/benchmarks/#manually-trigger-the-benchmark) and run the Benchmark Suite on a CPU environment.
+
+Below is an example command to benchmark all CPU-supported models using optimized configurations.
+
+```bash
+ON_CPU=1 bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+```
+
+The benchmark results will be saved in `./benchmark/results/`.
+In the directory, the generated `.commands` files contain all example commands for the benchmark.
+
+We recommend configuring tensor-parallel-size to match the number of NUMA nodes on your system. Note that the current release does not support tensor-parallel-size=6.
+To determine the number of NUMA nodes available, use the following command:
+
+```bash
+lscpu | grep "NUMA node(s):" | awk '{print $3}'
+```
+
+For performance reference, users may also consult the [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm&deviceName=cpu),
+which publishes default-model CPU results produced using the same Benchmark Suite.
+
### How to decide `VLLM_CPU_OMP_THREADS_BIND`?
- Default `auto` thread-binding is recommended for most cases. Ideally, each OpenMP thread will be bound to a dedicated physical core respectively, threads of each rank will be bound to the same NUMA node respectively, and 1 CPU per rank will be reserved for other vLLM components when `world_size > 1`. If you have any performance problems or unexpected binding behaviours, please try to bind threads as following.
diff --git a/docs/getting_started/installation/gpu.cuda.inc.md b/docs/getting_started/installation/gpu.cuda.inc.md
index 601d3659af886..03ce28c78efc9 100644
--- a/docs/getting_started/installation/gpu.cuda.inc.md
+++ b/docs/getting_started/installation/gpu.cuda.inc.md
@@ -26,42 +26,49 @@ uv pip install vllm --torch-backend=auto
??? console "pip"
```bash
- # Install vLLM with CUDA 12.8.
- pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128
+ # Install vLLM with CUDA 12.9.
+ pip install vllm --extra-index-url https://download.pytorch.org/whl/cu129
```
-We recommend leveraging `uv` to [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu126`), set `--torch-backend=cu126` (or `UV_TORCH_BACKEND=cu126`). If this doesn't work, try running `uv self update` to update `uv` first.
+We recommend leveraging `uv` to [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu128`), set `--torch-backend=cu128` (or `UV_TORCH_BACKEND=cu128`). If this doesn't work, try running `uv self update` to update `uv` first.
!!! note
NVIDIA Blackwell GPUs (B200, GB200) require a minimum of CUDA 12.8, so make sure you are installing PyTorch wheels with at least that version. PyTorch itself offers a [dedicated interface](https://pytorch.org/get-started/locally/) to determine the appropriate pip command to run for a given target configuration.
-As of now, vLLM's binaries are compiled with CUDA 12.8 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.6, 11.8, and public PyTorch release versions:
+As of now, vLLM's binaries are compiled with CUDA 12.9 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.8, 13.0, and public PyTorch release versions:
```bash
-# Install vLLM with a specific CUDA version (e.g., 11.8 or 12.6).
+# Install vLLM with a specific CUDA version (e.g., 13.0).
export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
-export CUDA_VERSION=118 # or 126
-uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION}
+export CUDA_VERSION=130 # or other
+uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION}
```
#### Install the latest code
-LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on an x86 platform with CUDA 12 for every commit since `v0.5.3`.
+LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for every commit since `v0.5.3`. There are multiple indices that can be used:
+
+* `https://wheels.vllm.ai/nightly`: the default variant (CUDA with version specified in `VLLM_MAIN_CUDA_VERSION`) built with the last commit on the `main` branch. Currently it is CUDA 12.9.
+* `https://wheels.vllm.ai/nightly/`: all other variants. Currently this includes `cu130` and `cpu`. The default variant (`cu129`) also has a subdirectory for consistency.
+
+To install from nightly index, run:
```bash
uv pip install -U vllm \
--torch-backend=auto \
- --extra-index-url https://wheels.vllm.ai/nightly
+ --extra-index-url https://wheels.vllm.ai/nightly # add variant subdirectory here if needed
```
-??? console "pip"
- ```bash
- pip install -U vllm \
- --pre \
- --extra-index-url https://wheels.vllm.ai/nightly
- ```
+!!! warning "`pip` caveat"
- `--pre` is required for `pip` to consider pre-released versions.
+ Using `pip` to install from nightly indices is _not supported_, because `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. In contrast, `uv` gives the extra index [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes).
+
+ If you insist on using `pip`, you have to specify the full URL of the wheel file (which can be obtained from the web page).
+
+ ```bash
+ pip install -U https://wheels.vllm.ai/nightly/vllm-0.11.2.dev399%2Bg3c7461c18-cp38-abi3-manylinux_2_31_x86_64.whl # current nightly build (the filename will change!)
+ pip install -U https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-0.11.2.dev399%2Bg3c7461c18-cp38-abi3-manylinux_2_31_x86_64.whl # from specific commit
+ ```
##### Install specific revisions
@@ -71,33 +78,13 @@ If you want to access the wheels for previous commits (e.g. to bisect the behavi
export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch
uv pip install vllm \
--torch-backend=auto \
- --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}
+ --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} # add variant subdirectory here if needed
```
-The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
-
-??? note "pip"
- If you want to access the wheels for previous commits (e.g. to bisect the behavior change,
- performance regression), due to the limitation of `pip`, you have to specify the full URL of the
- wheel file by embedding the commit hash in the URL:
-
- ```bash
- export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
- pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
- ```
-
- Note that the wheels are built with Python 3.8 ABI (see [PEP
- 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible
- with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a
- placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in
- the wheel metadata (the wheels listed in the extra index url have correct versions). Although we
- don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the
- wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
-
# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]
-#### Set up using Python-only build (without compilation)
+#### Set up using Python-only build (without compilation) {#python-only-build}
If you only need to change Python code, you can build and install vLLM without compilation. Using `uv pip`'s [`--editable` flag](https://docs.astral.sh/uv/pip/packages/#editable-packages), changes you make to the code will be reflected when you run vLLM:
@@ -121,18 +108,24 @@ This command will do the following:
In case you see an error about wheel not found when running the above command, it might be because the commit you based on in the main branch was just merged and the wheel is being built. In this case, you can wait for around an hour to try again, or manually assign the previous commit in the installation using the `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable.
```bash
-export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch
-export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
+export VLLM_PRECOMPILED_WHEEL_COMMIT=$(git rev-parse HEAD~1) # or earlier commit on main
+export VLLM_USE_PRECOMPILED=1
uv pip install --editable .
```
+There are more environment variables to control the behavior of Python-only build:
+
+* `VLLM_PRECOMPILED_WHEEL_LOCATION`: specify the exact wheel URL or local file path of a pre-compiled wheel to use. All other logic to find the wheel will be skipped.
+* `VLLM_PRECOMPILED_WHEEL_COMMIT`: override the commit hash to download the pre-compiled wheel. It can be `nightly` to use the last **already built** commit on the main branch.
+* `VLLM_PRECOMPILED_WHEEL_VARIANT`: specify the variant subdirectory to use on the nightly index, e.g., `cu129`, `cpu`. If not specified, the CUDA variant with `VLLM_MAIN_CUDA_VERSION` will be tried, then fallback to the default variant on the remote index.
+
You can find more information about vLLM's wheels in [Install the latest code](#install-the-latest-code).
!!! note
There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [Install the latest code](#install-the-latest-code) for instructions on how to install a specified wheel.
-#### Full build (with compilation)
+#### Full build (with compilation) {#full-build}
If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes:
diff --git a/docs/getting_started/installation/gpu.md b/docs/getting_started/installation/gpu.md
index bc7508b29475f..fb750f4499858 100644
--- a/docs/getting_started/installation/gpu.md
+++ b/docs/getting_started/installation/gpu.md
@@ -52,7 +52,7 @@ vLLM is a Python library that supports the following GPU variants. Select your G
--8<-- "docs/getting_started/installation/gpu.xpu.inc.md:set-up-using-python"
-### Pre-built wheels
+### Pre-built wheels {#pre-built-wheels}
=== "NVIDIA CUDA"
diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py
index 6e4fb039e3a07..e886a91e65732 100644
--- a/docs/mkdocs/hooks/generate_examples.py
+++ b/docs/mkdocs/hooks/generate_examples.py
@@ -2,7 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
import logging
-from dataclasses import dataclass, field
+from dataclasses import dataclass
+from functools import cached_property
from pathlib import Path
from typing import Literal
@@ -16,13 +17,18 @@ EXAMPLE_DIR = ROOT_DIR / "examples"
EXAMPLE_DOC_DIR = ROOT_DIR / "docs/examples"
-def fix_case(text: str) -> str:
+def title(text: str) -> str:
+ # Default title case
+ text = text.replace("_", " ").replace("/", " - ").title()
+ # Custom substitutions
subs = {
+ "io": "IO",
"api": "API",
"cli": "CLI",
"cpu": "CPU",
"llm": "LLM",
"mae": "MAE",
+ "ner": "NER",
"tpu": "TPU",
"gguf": "GGUF",
"lora": "LoRA",
@@ -48,71 +54,65 @@ class Example:
Attributes:
path (Path): The path to the main directory or file.
category (str): The category of the document.
- main_file (Path): The main file in the directory.
- other_files (list[Path]): list of other files in the directory.
- title (str): The title of the document.
+
+ Properties:
+ main_file() -> Path | None: Determines the main file in the given path.
+ other_files() -> list[Path]: Determines other files in the directory excluding
+ the main file.
+ title() -> str: Determines the title of the document.
Methods:
- __post_init__(): Initializes the main_file, other_files, and title attributes.
- determine_main_file() -> Path: Determines the main file in the given path.
- determine_other_files() -> list[Path]: Determines other files in the directory excluding the main file.
- determine_title() -> str: Determines the title of the document.
generate() -> str: Generates the documentation content.
- """ # noqa: E501
+ """
path: Path
- category: str = None
- main_file: Path = field(init=False)
- other_files: list[Path] = field(init=False)
- title: str = field(init=False)
+ category: str
- def __post_init__(self):
- self.main_file = self.determine_main_file()
- self.other_files = self.determine_other_files()
- self.title = self.determine_title()
+ @cached_property
+ def main_file(self) -> Path | None:
+ """Determines the main file in the given path.
- @property
- def is_code(self) -> bool:
- return self.main_file.suffix != ".md"
+ If path is a file, it returns the path itself. If path is a directory, it
+ searches for Markdown files (*.md) in the directory and returns the first one
+ found. If no Markdown files are found, it returns None."""
+ # Single file example
+ if self.path.is_file():
+ return self.path
+ # Multi file example with a README
+ if md_paths := list(self.path.glob("*.md")):
+ return md_paths[0]
+ # Multi file example without a README
+ return None
- def determine_main_file(self) -> Path:
- """
- Determines the main file in the given path.
- If the path is a file, it returns the path itself. Otherwise, it searches
- for Markdown files (*.md) in the directory and returns the first one found.
- Returns:
- Path: The main file path, either the original path if it's a file or the first
- Markdown file found in the directory.
- Raises:
- IndexError: If no Markdown files are found in the directory.
- """ # noqa: E501
- return self.path if self.path.is_file() else list(self.path.glob("*.md")).pop()
+ @cached_property
+ def other_files(self) -> list[Path]:
+ """Determine other files in the directory excluding the main file.
- def determine_other_files(self) -> list[Path]:
- """
- Determine other files in the directory excluding the main file.
-
- This method checks if the given path is a file. If it is, it returns an empty list.
- Otherwise, it recursively searches through the directory and returns a list of all
- files that are not the main file.
-
- Returns:
- list[Path]: A list of Path objects representing the other files in the directory.
- """ # noqa: E501
+ If path is a file, it returns an empty list. Otherwise, it returns every file
+ in the directory except the main file in a list."""
+ # Single file example
if self.path.is_file():
return []
+ # Multi file example
is_other_file = lambda file: file.is_file() and file != self.main_file
- return [file for file in self.path.rglob("*") if is_other_file(file)]
+ return sorted(file for file in self.path.rglob("*") if is_other_file(file))
- def determine_title(self) -> str:
- if not self.is_code:
- # Specify encoding for building on Windows
- with open(self.main_file, encoding="utf-8") as f:
- first_line = f.readline().strip()
- match = re.match(r"^#\s+(?P.+)$", first_line)
- if match:
- return match.group("title")
- return fix_case(self.path.stem.replace("_", " ").title())
+ @cached_property
+ def is_code(self) -> bool:
+ return self.main_file is not None and self.main_file.suffix != ".md"
+
+ @cached_property
+ def title(self) -> str:
+ # Generate title from filename if no main md file found
+ if self.main_file is None or self.is_code:
+ return title(self.path.stem)
+ # Specify encoding for building on Windows
+ with open(self.main_file, encoding="utf-8") as f:
+ first_line = f.readline().strip()
+ match = re.match(r"^#\s+(?P.+)$", first_line)
+ if match:
+ return match.group("title")
+ raise ValueError(f"Title not found in {self.main_file}")
def fix_relative_links(self, content: str) -> str:
"""
@@ -156,24 +156,35 @@ class Example:
# included files containing code fences too
code_fence = "``````"
- if self.is_code:
- content += (
- f"{code_fence}{self.main_file.suffix[1:]}\n"
- f'--8<-- "{self.main_file}"\n'
- f"{code_fence}\n"
- )
+ if self.main_file is not None:
+ # Single file example or multi file example with a README
+ if self.is_code:
+ content += (
+ f"{code_fence}{self.main_file.suffix[1:]}\n"
+ f'--8<-- "{self.main_file}"\n'
+ f"{code_fence}\n"
+ )
+ else:
+ with open(self.main_file, encoding="utf-8") as f:
+ # Skip the title from md snippets as it's been included above
+ main_content = f.readlines()[1:]
+ content += self.fix_relative_links("".join(main_content))
+ content += "\n"
else:
- with open(self.main_file) as f:
- # Skip the title from md snippets as it's been included above
- main_content = f.readlines()[1:]
- content += self.fix_relative_links("".join(main_content))
- content += "\n"
+ # Multi file example without a README
+ for file in self.other_files:
+ file_title = title(str(file.relative_to(self.path).with_suffix("")))
+ content += f"## {file_title}\n\n"
+ content += (
+ f'{code_fence}{file.suffix[1:]}\n--8<-- "{file}"\n{code_fence}\n\n'
+ )
+ return content
if not self.other_files:
return content
content += "## Example materials\n\n"
- for file in sorted(self.other_files):
+ for file in self.other_files:
content += f'??? abstract "{file.relative_to(self.path)}"\n'
if file.suffix != ".md":
content += f" {code_fence}{file.suffix[1:]}\n"
@@ -200,11 +211,13 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
glob_patterns = ["*.py", "*.md", "*.sh"]
# Find categorised examples
for category in categories:
+ logger.info("Processing category: %s", category.stem)
globs = [category.glob(pattern) for pattern in glob_patterns]
for path in itertools.chain(*globs):
examples.append(Example(path, category.stem))
# Find examples in subdirectories
- for path in category.glob("*/*.md"):
+ globs = [category.glob(f"*/{pattern}") for pattern in glob_patterns]
+ for path in itertools.chain(*globs):
examples.append(Example(path.parent, category.stem))
# Generate the example documentation
@@ -217,3 +230,4 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
with open(doc_path, "w+", encoding="utf-8") as f:
f.write(example.generate())
logger.debug("Example generated: %s", doc_path.relative_to(ROOT_DIR))
+ logger.info("Total examples generated: %d", len(examples))
diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md
index aca865f4bf77d..e2d427e8a4590 100644
--- a/docs/models/pooling_models.md
+++ b/docs/models/pooling_models.md
@@ -274,7 +274,7 @@ outputs = llm.embed(
print(outputs[0].outputs)
```
-A code example can be found here: [examples/offline_inference/pooling/embed_matryoshka_fy.py](../../examples/offline_inference/pooling/embed_matryoshka_fy.py)
+A code example can be found here: [examples/pooling/embed/embed_matryoshka_fy.py](../../examples/pooling/embed/embed_matryoshka_fy.py)
### Online Inference
@@ -304,7 +304,7 @@ Expected output:
{"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}}
```
-An OpenAI client example can be found here: [examples/online_serving/pooling/openai_embedding_matryoshka_fy.py](../../examples/online_serving/pooling/openai_embedding_matryoshka_fy.py)
+An OpenAI client example can be found here: [examples/pooling/embed/openai_embedding_matryoshka_fy.py](../../examples/pooling/embed/openai_embedding_matryoshka_fy.py)
## Deprecated Features
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index da7c5edf66bfb..040107c11efcf 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -417,7 +417,8 @@ th {
| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ |
| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ |
| `MiniMaxM2ForCausalLM` | MiniMax-M2 |`MiniMaxAI/MiniMax-M2`, etc. | | ✅︎ |
-| `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ |
+| `MistralForCausalLM` | Ministral-3, Mistral, Mistral-Instruct | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ |
+| `MistralLarge3ForCausalLM` | Mistral-Large-3-675B-Base-2512, Mistral-Large-3-675B-Instruct-2512 | `mistralai/Mistral-Large-3-675B-Base-2512`, `mistralai/Mistral-Large-3-675B-Instruct-2512`, etc. | ✅︎ | ✅︎ |
| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ |
| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ |
| `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ |
@@ -567,7 +568,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
```
!!! note
- Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/offline_inference/pooling/qwen3_reranker.py](../../examples/offline_inference/pooling/qwen3_reranker.py).
+ Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/pooling/score/qwen3_reranker.py](../../examples/pooling/score/qwen3_reranker.py).
```bash
vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
@@ -605,7 +606,7 @@ These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode)
| `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | |
!!! note
- Named Entity Recognition (NER) usage, please refer to [examples/offline_inference/pooling/ner.py](../../examples/offline_inference/pooling/ner.py), [examples/online_serving/pooling/ner_client.py](../../examples/online_serving/pooling/ner_client.py).
+ Named Entity Recognition (NER) usage, please refer to [examples/pooling/token_classify/ner.py](../../examples/pooling/token_classify/ner.py), [examples/pooling/token_classify/ner_client.py](../../examples/pooling/token_classify/ner_client.py).
## List of Multimodal Language Models
@@ -711,7 +712,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + IE+ | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ |
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I+ / T + A+ / I+ + A+ | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ |
| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I+ / T + A+ / I+ + A+ | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ |
-| `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I+ | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ |
+| `PixtralForConditionalGeneration` | Ministral 3 (Mistral format), Mistral 3 (Mistral format), Mistral Large 3 (Mistral format), Pixtral (Mistral format) | T + I+ | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Mistral-Large-3-675B-Instruct-2512`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ |
| `QwenVLForConditionalGeneration`^ | Qwen-VL | T + IE+ | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ |
| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A+ | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ |
| `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + IE+ + VE+ | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ |
diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md
index 672663dc50b1e..01453483a8d60 100644
--- a/docs/serving/openai_compatible_server.md
+++ b/docs/serving/openai_compatible_server.md
@@ -234,7 +234,7 @@ The following extra parameters are supported:
Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings);
you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
-Code example: [examples/online_serving/pooling/openai_embedding_client.py](../../examples/online_serving/pooling/openai_embedding_client.py)
+Code example: [examples/pooling/embed/openai_embedding_client.py](../../examples/pooling/embed/openai_embedding_client.py)
If the model has a [chat template](../serving/openai_compatible_server.md#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](#chat-api))
which will be treated as a single prompt to the model. Here is a convenience function for calling the API while retaining OpenAI's type annotations:
@@ -335,7 +335,7 @@ and passing a list of `messages` in the request. Refer to the examples below for
`MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code
example below for details.
-Full example: [examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py](../../examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py)
+Full example: [examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py](../../examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py)
#### Extra parameters
@@ -516,7 +516,7 @@ Our Pooling API encodes input prompts using a [pooling model](../models/pooling_
The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats.
-Code example: [examples/online_serving/pooling/openai_pooling_client.py](../../examples/online_serving/pooling/openai_pooling_client.py)
+Code example: [examples/pooling/pooling/openai_pooling_client.py](../../examples/pooling/pooling/openai_pooling_client.py)
### Classification API
@@ -524,7 +524,7 @@ Our Classification API directly supports Hugging Face sequence-classification mo
We automatically wrap any other transformer via `as_seq_cls_model()`, which pools on the last token, attaches a `RowParallelLinear` head, and applies a softmax to produce per-class probabilities.
-Code example: [examples/online_serving/pooling/openai_classification_client.py](../../examples/online_serving/pooling/openai_classification_client.py)
+Code example: [examples/pooling/classify/openai_classification_client.py](../../examples/pooling/classify/openai_classification_client.py)
#### Example Requests
@@ -640,7 +640,7 @@ Usually, the score for a sentence pair refers to the similarity between two sent
You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
-Code example: [examples/online_serving/pooling/openai_cross_encoder_score.py](../../examples/online_serving/pooling/openai_cross_encoder_score.py)
+Code example: [examples/pooling/score/openai_cross_encoder_score.py](../../examples/pooling/score/openai_cross_encoder_score.py)
#### Single inference
@@ -821,7 +821,7 @@ You can pass multi-modal inputs to scoring models by passing `content` including
print("Scoring output:", response_json["data"][0]["score"])
print("Scoring output:", response_json["data"][1]["score"])
```
-Full example: [examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py](../../examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py)
+Full example: [examples/pooling/score/openai_cross_encoder_score_for_multimodal.py](../../examples/pooling/score/openai_cross_encoder_score_for_multimodal.py)
#### Extra parameters
@@ -851,7 +851,7 @@ endpoints are compatible with both [Jina AI's re-rank API interface](https://jin
[Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with
popular open-source tools.
-Code example: [examples/online_serving/pooling/jinaai_rerank_client.py](../../examples/online_serving/pooling/jinaai_rerank_client.py)
+Code example: [examples/pooling/score/jinaai_rerank_client.py](../../examples/pooling/score/jinaai_rerank_client.py)
#### Example Request
diff --git a/examples/offline_inference/llm_engine_reset_kv.py b/examples/offline_inference/llm_engine_reset_kv.py
new file mode 100644
index 0000000000000..3fbe7fa7545e6
--- /dev/null
+++ b/examples/offline_inference/llm_engine_reset_kv.py
@@ -0,0 +1,98 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+This file demonstrates resetting the prefix (KV) cache while requests are
+in flight, using the `LLMEngine` directly with various sampling parameters.
+"""
+
+import argparse
+
+from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+
+
+def create_test_prompts() -> list[tuple[str, SamplingParams]]:
+ """Create a list of test prompts with their sampling parameters."""
+ return [
+ (
+ "A robot may not injure a human being " * 50,
+ SamplingParams(
+ temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=16
+ ),
+ ),
+ (
+ "A robot may not injure a human being " * 50,
+ SamplingParams(
+ temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=16
+ ),
+ ),
+ (
+ "To be or not to be,",
+ SamplingParams(
+ temperature=0.8, top_k=5, presence_penalty=0.2, max_tokens=128
+ ),
+ ),
+ (
+ "What is the meaning of life?",
+ SamplingParams(
+ n=2, temperature=0.8, top_p=0.95, frequency_penalty=0.1, max_tokens=128
+ ),
+ ),
+ ]
+
+
+def process_requests(engine: LLMEngine, test_prompts: list[tuple[str, SamplingParams]]):
+ """Continuously process a list of prompts and handle the outputs."""
+ request_id = 0
+
+ print("-" * 50)
+ step_id = 0
+ while test_prompts or engine.has_unfinished_requests():
+ print("-" * 50)
+ import os
+
+ print(f"Step {step_id} (pid={os.getpid()})")
+
+ if test_prompts:
+ prompt, sampling_params = test_prompts.pop(0)
+ engine.add_request(str(request_id), prompt, sampling_params)
+ request_id += 1
+
+ if step_id == 10:
+ print(f"Resetting prefix cache at {step_id}")
+ engine.reset_prefix_cache(reset_running_requests=True)
+
+ request_outputs: list[RequestOutput] = engine.step()
+
+ for request_output in request_outputs:
+ if request_output.finished:
+ print("-" * 50)
+ print(request_output)
+ print("-" * 50)
+ step_id += 1
+
+
+def initialize_engine(args: argparse.Namespace) -> LLMEngine:
+ """Initialize the LLMEngine from the command line arguments."""
+ engine_args = EngineArgs.from_cli_args(args)
+ return LLMEngine.from_engine_args(engine_args)
+
+
+def parse_args():
+ parser = FlexibleArgumentParser(
+ description="Demo on using the LLMEngine class directly"
+ )
+ parser = EngineArgs.add_cli_args(parser)
+ return parser.parse_args()
+
+
+def main(args: argparse.Namespace):
+ """Main function that sets up and runs the prompt processing."""
+ engine = initialize_engine(args)
+ test_prompts = create_test_prompts()
+ process_requests(engine, test_prompts)
+
+
+if __name__ == "__main__":
+ args = parse_args()
+ main(args)
diff --git a/examples/offline_inference/pooling/README.md b/examples/offline_inference/pooling/README.md
deleted file mode 100644
index ad78be38716b6..0000000000000
--- a/examples/offline_inference/pooling/README.md
+++ /dev/null
@@ -1,57 +0,0 @@
-# Pooling models
-
-## Convert llm model to seq cls
-
-```bash
-# for BAAI/bge-reranker-v2-gemma
-# Caution: "Yes" and "yes" are two different tokens
-python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name BAAI/bge-reranker-v2-gemma --classifier_from_tokens '["Yes"]' --method no_post_processing --path ./bge-reranker-v2-gemma-seq-cls
-# for mxbai-rerank-v2
-python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name mixedbread-ai/mxbai-rerank-base-v2 --classifier_from_tokens '["0", "1"]' --method from_2_way_softmax --path ./mxbai-rerank-base-v2-seq-cls
-# for Qwen3-Reranker
-python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name Qwen/Qwen3-Reranker-0.6B --classifier_from_tokens '["no", "yes"]' --method from_2_way_softmax --path ./Qwen3-Reranker-0.6B-seq-cls
-```
-
-## Embed jina_embeddings_v3 usage
-
-Only text matching task is supported for now. See
-
-```bash
-python examples/offline_inference/pooling/embed_jina_embeddings_v3.py
-```
-
-## Embed matryoshka dimensions usage
-
-```bash
-python examples/offline_inference/pooling/embed_matryoshka_fy.py
-```
-
-## Multi vector retrieval usage
-
-```bash
-python examples/offline_inference/pooling/multi_vector_retrieval.py
-```
-
-## Named Entity Recognition (NER) usage
-
-```bash
-python examples/offline_inference/pooling/ner.py
-```
-
-## Prithvi Geospatial MAE usage
-
-```bash
-python examples/offline_inference/pooling/prithvi_geospatial_mae.py
-```
-
-## IO Processor Plugins for Prithvi Geospatial MAE
-
-```bash
-python examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py
-```
-
-## Qwen3 reranker usage
-
-```bash
-python examples/offline_inference/pooling/qwen3_reranker.py
-```
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 8f72bf6f0b0d1..0888a9d60a3fa 100755
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -1801,7 +1801,10 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
- hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
+ hf_overrides={
+ "architectures": ["Tarsier2ForConditionalGeneration"],
+ "model_type": "tarsier2",
+ },
limit_mm_per_prompt={modality: 1},
)
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index 7ba4e64b567de..2193b1ca9cf48 100755
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -1222,7 +1222,10 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
trust_remote_code=True,
max_model_len=32768,
limit_mm_per_prompt={"image": len(image_urls)},
- hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
+ hf_overrides={
+ "architectures": ["Tarsier2ForConditionalGeneration"],
+ "model_type": "tarsier2",
+ },
)
prompt = (
diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md
deleted file mode 100644
index b76ad21f04818..0000000000000
--- a/examples/online_serving/pooling/README.md
+++ /dev/null
@@ -1,97 +0,0 @@
-# Pooling models
-
-## Cohere rerank usage
-
-```bash
-# vllm serve BAAI/bge-reranker-base
-python examples/online_serving/pooling/cohere_rerank_client.py
-```
-
-## Embedding requests base64 encoding_format usage
-
-```bash
-# vllm serve intfloat/e5-small
-python examples/online_serving/pooling/embedding_requests_base64_client.py
-```
-
-## Embedding requests bytes encoding_format usage
-
-```bash
-# vllm serve intfloat/e5-small
-python examples/online_serving/pooling/embedding_requests_bytes_client.py
-```
-
-## Jinaai rerank usage
-
-```bash
-# vllm serve BAAI/bge-reranker-base
-python examples/online_serving/pooling/jinaai_rerank_client.py
-```
-
-## Multi vector retrieval usage
-
-```bash
-# vllm serve BAAI/bge-m3
-python examples/online_serving/pooling/multi_vector_retrieval_client.py
-```
-
-## Named Entity Recognition (NER) usage
-
-```bash
-# vllm serve boltuix/NeuroBERT-NER
-python examples/online_serving/pooling/ner_client.py
-```
-
-## OpenAI chat embedding for multimodal usage
-
-```bash
-python examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py
-```
-
-## OpenAI classification usage
-
-```bash
-# vllm serve jason9693/Qwen2.5-1.5B-apeach
-python examples/online_serving/pooling/openai_classification_client.py
-```
-
-## OpenAI cross_encoder score usage
-
-```bash
-# vllm serve BAAI/bge-reranker-v2-m3
-python examples/online_serving/pooling/openai_cross_encoder_score.py
-```
-
-## OpenAI cross_encoder score for multimodal usage
-
-```bash
-# vllm serve jinaai/jina-reranker-m0
-python examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py
-```
-
-## OpenAI embedding usage
-
-```bash
-# vllm serve intfloat/e5-small
-python examples/online_serving/pooling/openai_embedding_client.py
-```
-
-## OpenAI embedding matryoshka dimensions usage
-
-```bash
-# vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
-python examples/online_serving/pooling/openai_embedding_matryoshka_fy.py
-```
-
-## OpenAI pooling usage
-
-```bash
-# vllm serve internlm/internlm2-1_8b-reward --trust-remote-code
-python examples/online_serving/pooling/openai_pooling_client.py
-```
-
-## Online Prithvi Geospatial MAE usage
-
-```bash
-python examples/online_serving/pooling/prithvi_geospatial_mae.py
-```
diff --git a/examples/online_serving/pooling/openai_classification_client.py b/examples/pooling/classify/openai_classification_client.py
similarity index 100%
rename from examples/online_serving/pooling/openai_classification_client.py
rename to examples/pooling/classify/openai_classification_client.py
diff --git a/examples/offline_inference/pooling/embed_jina_embeddings_v3.py b/examples/pooling/embed/embed_jina_embeddings_v3.py
similarity index 100%
rename from examples/offline_inference/pooling/embed_jina_embeddings_v3.py
rename to examples/pooling/embed/embed_jina_embeddings_v3.py
diff --git a/examples/offline_inference/pooling/embed_matryoshka_fy.py b/examples/pooling/embed/embed_matryoshka_fy.py
similarity index 100%
rename from examples/offline_inference/pooling/embed_matryoshka_fy.py
rename to examples/pooling/embed/embed_matryoshka_fy.py
diff --git a/examples/online_serving/pooling/embedding_requests_base64_client.py b/examples/pooling/embed/embedding_requests_base64_client.py
similarity index 100%
rename from examples/online_serving/pooling/embedding_requests_base64_client.py
rename to examples/pooling/embed/embedding_requests_base64_client.py
diff --git a/examples/online_serving/pooling/embedding_requests_bytes_client.py b/examples/pooling/embed/embedding_requests_bytes_client.py
similarity index 100%
rename from examples/online_serving/pooling/embedding_requests_bytes_client.py
rename to examples/pooling/embed/embedding_requests_bytes_client.py
diff --git a/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py b/examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py
similarity index 100%
rename from examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py
rename to examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py
diff --git a/examples/online_serving/pooling/openai_embedding_client.py b/examples/pooling/embed/openai_embedding_client.py
similarity index 100%
rename from examples/online_serving/pooling/openai_embedding_client.py
rename to examples/pooling/embed/openai_embedding_client.py
diff --git a/examples/online_serving/openai_embedding_long_text/README.md b/examples/pooling/embed/openai_embedding_long_text/README.md
similarity index 100%
rename from examples/online_serving/openai_embedding_long_text/README.md
rename to examples/pooling/embed/openai_embedding_long_text/README.md
diff --git a/examples/online_serving/openai_embedding_long_text/client.py b/examples/pooling/embed/openai_embedding_long_text/client.py
similarity index 100%
rename from examples/online_serving/openai_embedding_long_text/client.py
rename to examples/pooling/embed/openai_embedding_long_text/client.py
diff --git a/examples/online_serving/openai_embedding_long_text/service.sh b/examples/pooling/embed/openai_embedding_long_text/service.sh
similarity index 100%
rename from examples/online_serving/openai_embedding_long_text/service.sh
rename to examples/pooling/embed/openai_embedding_long_text/service.sh
diff --git a/examples/online_serving/pooling/openai_embedding_matryoshka_fy.py b/examples/pooling/embed/openai_embedding_matryoshka_fy.py
similarity index 100%
rename from examples/online_serving/pooling/openai_embedding_matryoshka_fy.py
rename to examples/pooling/embed/openai_embedding_matryoshka_fy.py
diff --git a/examples/online_serving/pooling/prithvi_geospatial_mae.py b/examples/pooling/plugin/prithvi_geospatial_mae_client.py
similarity index 100%
rename from examples/online_serving/pooling/prithvi_geospatial_mae.py
rename to examples/pooling/plugin/prithvi_geospatial_mae_client.py
diff --git a/examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py b/examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py
similarity index 100%
rename from examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py
rename to examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py
diff --git a/examples/offline_inference/pooling/prithvi_geospatial_mae.py b/examples/pooling/plugin/prithvi_geospatial_mae_offline.py
similarity index 100%
rename from examples/offline_inference/pooling/prithvi_geospatial_mae.py
rename to examples/pooling/plugin/prithvi_geospatial_mae_offline.py
diff --git a/examples/online_serving/pooling/openai_pooling_client.py b/examples/pooling/pooling/openai_pooling_client.py
similarity index 100%
rename from examples/online_serving/pooling/openai_pooling_client.py
rename to examples/pooling/pooling/openai_pooling_client.py
diff --git a/examples/offline_inference/vision_language_pooling.py b/examples/pooling/pooling/vision_language_pooling.py
similarity index 100%
rename from examples/offline_inference/vision_language_pooling.py
rename to examples/pooling/pooling/vision_language_pooling.py
diff --git a/examples/online_serving/pooling/cohere_rerank_client.py b/examples/pooling/score/cohere_rerank_client.py
similarity index 100%
rename from examples/online_serving/pooling/cohere_rerank_client.py
rename to examples/pooling/score/cohere_rerank_client.py
diff --git a/examples/offline_inference/pooling/convert_model_to_seq_cls.py b/examples/pooling/score/convert_model_to_seq_cls.py
similarity index 100%
rename from examples/offline_inference/pooling/convert_model_to_seq_cls.py
rename to examples/pooling/score/convert_model_to_seq_cls.py
diff --git a/examples/online_serving/pooling/jinaai_rerank_client.py b/examples/pooling/score/jinaai_rerank_client.py
similarity index 100%
rename from examples/online_serving/pooling/jinaai_rerank_client.py
rename to examples/pooling/score/jinaai_rerank_client.py
diff --git a/examples/online_serving/pooling/openai_cross_encoder_score.py b/examples/pooling/score/openai_cross_encoder_score.py
similarity index 100%
rename from examples/online_serving/pooling/openai_cross_encoder_score.py
rename to examples/pooling/score/openai_cross_encoder_score.py
diff --git a/examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py b/examples/pooling/score/openai_cross_encoder_score_for_multimodal.py
similarity index 100%
rename from examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py
rename to examples/pooling/score/openai_cross_encoder_score_for_multimodal.py
diff --git a/examples/offline_inference/pooling/qwen3_reranker.py b/examples/pooling/score/qwen3_reranker.py
similarity index 100%
rename from examples/offline_inference/pooling/qwen3_reranker.py
rename to examples/pooling/score/qwen3_reranker.py
diff --git a/examples/offline_inference/pooling/ner.py b/examples/pooling/token_classify/ner.py
similarity index 100%
rename from examples/offline_inference/pooling/ner.py
rename to examples/pooling/token_classify/ner.py
diff --git a/examples/online_serving/pooling/ner_client.py b/examples/pooling/token_classify/ner_client.py
similarity index 100%
rename from examples/online_serving/pooling/ner_client.py
rename to examples/pooling/token_classify/ner_client.py
diff --git a/examples/offline_inference/pooling/multi_vector_retrieval.py b/examples/pooling/token_embed/multi_vector_retrieval.py
similarity index 100%
rename from examples/offline_inference/pooling/multi_vector_retrieval.py
rename to examples/pooling/token_embed/multi_vector_retrieval.py
diff --git a/examples/online_serving/pooling/multi_vector_retrieval_client.py b/examples/pooling/token_embed/multi_vector_retrieval_client.py
similarity index 100%
rename from examples/online_serving/pooling/multi_vector_retrieval_client.py
rename to examples/pooling/token_embed/multi_vector_retrieval_client.py
diff --git a/setup.py b/setup.py
index 0022e7fe0bf36..5b7d12bb373e3 100644
--- a/setup.py
+++ b/setup.py
@@ -311,7 +311,7 @@ class precompiled_build_ext(build_ext):
"""Disables extension building when using precompiled binaries."""
def run(self) -> None:
- assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
+ return
def build_extensions(self) -> None:
print("Skipping build_ext: using precompiled extensions.")
@@ -322,14 +322,121 @@ class precompiled_wheel_utils:
"""Extracts libraries and other files from an existing wheel."""
@staticmethod
- def extract_precompiled_and_patch_package(wheel_url_or_path: str) -> dict:
+ def fetch_metadata_for_variant(
+ commit: str, variant: str | None
+ ) -> tuple[list[dict], str]:
+ """
+ Fetches metadata for a specific variant of the precompiled wheel.
+ """
+ variant_dir = f"{variant}/" if variant is not None else ""
+ repo_url = f"https://wheels.vllm.ai/{commit}/{variant_dir}vllm/"
+ meta_url = repo_url + "metadata.json"
+ print(f"Trying to fetch nightly build metadata from {meta_url}")
+ from urllib.request import urlopen
+
+ with urlopen(meta_url) as resp:
+ # urlopen raises HTTPError on unexpected status code
+ wheels = json.loads(resp.read().decode("utf-8"))
+ return wheels, repo_url
+
+ @staticmethod
+ def determine_wheel_url() -> tuple[str, str | None]:
+ """
+ Try to determine the precompiled wheel URL or path to use.
+ The order of preference is:
+ 1. user-specified wheel location (can be either local or remote, via
+ VLLM_PRECOMPILED_WHEEL_LOCATION)
+ 2. user-specified variant from nightly repo (current main commit via
+ VLLM_PRECOMPILED_WHEEL_VARIANT)
+ 3. the variant corresponding to VLLM_MAIN_CUDA_VERSION from nightly repo
+ 4. the default variant from nightly repo (current main commit)
+ """
+ wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
+ if wheel_location is not None:
+ print(f"Using user-specified precompiled wheel location: {wheel_location}")
+ return wheel_location, None
+ else:
+ import platform
+
+ arch = platform.machine()
+ # try to fetch the wheel metadata from the nightly wheel repo
+ main_variant = "cu" + envs.VLLM_MAIN_CUDA_VERSION.replace(".", "")
+ variant = os.getenv("VLLM_PRECOMPILED_WHEEL_VARIANT", main_variant)
+ commit = os.getenv(
+ "VLLM_PRECOMPILED_WHEEL_COMMIT",
+ precompiled_wheel_utils.get_base_commit_in_main_branch(),
+ )
+ print(f"Using precompiled wheel commit {commit} with variant {variant}")
+ try_default = False
+ wheels, repo_url, download_filename = None, None, None
+ try:
+ wheels, repo_url = precompiled_wheel_utils.fetch_metadata_for_variant(
+ commit, variant
+ )
+ except Exception as e:
+ logger.warning(
+ "Failed to fetch precompiled wheel metadata for variant %s: %s",
+ variant,
+ e,
+ )
+ try_default = True # try outside handler to keep the stacktrace simple
+ if try_default:
+ print("Trying the default variant from remote")
+ wheels, repo_url = precompiled_wheel_utils.fetch_metadata_for_variant(
+ commit, None
+ )
+ # if this also fails, then we have nothing more to try / cache
+ assert wheels is not None and repo_url is not None, (
+ "Failed to fetch precompiled wheel metadata"
+ )
+ # The metadata.json has the following format:
+ # see .buildkite/scripts/generate-nightly-index.py for details
+ """[{
+ "package_name": "vllm",
+ "version": "0.11.2.dev278+gdbc3d9991",
+ "build_tag": null,
+ "python_tag": "cp38",
+ "abi_tag": "abi3",
+ "platform_tag": "manylinux1_x86_64",
+ "variant": null,
+ "filename": "vllm-0.11.2.dev278+gdbc3d9991-cp38-abi3-manylinux1_x86_64.whl",
+ "path": "../vllm-0.11.2.dev278%2Bgdbc3d9991-cp38-abi3-manylinux1_x86_64.whl"
+ },
+ ...]"""
+ from urllib.parse import urljoin
+
+ for wheel in wheels:
+ # TODO: maybe check more compatibility later? (python_tag, abi_tag, etc)
+ if wheel.get("package_name") == "vllm" and arch in wheel.get(
+ "platform_tag", ""
+ ):
+ print(f"Found precompiled wheel metadata: {wheel}")
+ if "path" not in wheel:
+ raise ValueError(f"Wheel metadata missing path: {wheel}")
+ wheel_url = urljoin(repo_url, wheel["path"])
+ download_filename = wheel.get("filename")
+ print(f"Using precompiled wheel URL: {wheel_url}")
+ break
+ else:
+ raise ValueError(
+ f"No precompiled vllm wheel found for architecture {arch} "
+ f"from repo {repo_url}. All available wheels: {wheels}"
+ )
+
+ return wheel_url, download_filename
+
+ @staticmethod
+ def extract_precompiled_and_patch_package(
+ wheel_url_or_path: str, download_filename: str | None
+ ) -> dict:
import tempfile
import zipfile
temp_dir = None
try:
if not os.path.isfile(wheel_url_or_path):
- wheel_filename = wheel_url_or_path.split("/")[-1]
+ # use provided filename first, then derive from URL
+ wheel_filename = download_filename or wheel_url_or_path.split("/")[-1]
temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
wheel_path = os.path.join(temp_dir, wheel_filename)
print(f"Downloading wheel from {wheel_url_or_path} to {wheel_path}")
@@ -648,38 +755,13 @@ package_data = {
]
}
+
# If using precompiled, extract and patch package_data (in advance of setup)
if envs.VLLM_USE_PRECOMPILED:
- assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
- wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
- if wheel_location is not None:
- wheel_url = wheel_location
- else:
- import platform
-
- arch = platform.machine()
- if arch == "x86_64":
- wheel_tag = "manylinux1_x86_64"
- elif arch == "aarch64":
- wheel_tag = "manylinux2014_aarch64"
- else:
- raise ValueError(f"Unsupported architecture: {arch}")
- base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
- wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl"
- nightly_wheel_url = (
- f"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl"
- )
- from urllib.request import urlopen
-
- try:
- with urlopen(wheel_url) as resp:
- if resp.status != 200:
- wheel_url = nightly_wheel_url
- except Exception as e:
- print(f"[warn] Falling back to nightly wheel: {e}")
- wheel_url = nightly_wheel_url
-
- patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(wheel_url)
+ wheel_url, download_filename = precompiled_wheel_utils.determine_wheel_url()
+ patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(
+ wheel_url, download_filename
+ )
for pkg, files in patch.items():
package_data.setdefault(pkg, []).extend(files)
diff --git a/tests/benchmarks/test_param_sweep.py b/tests/benchmarks/test_param_sweep.py
new file mode 100644
index 0000000000000..0d47cfd9d6230
--- /dev/null
+++ b/tests/benchmarks/test_param_sweep.py
@@ -0,0 +1,257 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
+import tempfile
+from pathlib import Path
+
+import pytest
+
+from vllm.benchmarks.sweep.param_sweep import ParameterSweep, ParameterSweepItem
+
+
+class TestParameterSweepItem:
+ """Test ParameterSweepItem functionality."""
+
+ @pytest.mark.parametrize(
+ "input_dict,expected",
+ [
+ (
+ {"compilation_config.use_inductor_graph_partition": False},
+ "--compilation-config.use_inductor_graph_partition=false",
+ ),
+ (
+ {"compilation_config.use_inductor_graph_partition": True},
+ "--compilation-config.use_inductor_graph_partition=true",
+ ),
+ (
+ {"compilation_config.use_inductor": False},
+ "--compilation-config.use_inductor=false",
+ ),
+ (
+ {"compilation_config.use_inductor": True},
+ "--compilation-config.use_inductor=true",
+ ),
+ ],
+ )
+ def test_nested_boolean_params(self, input_dict, expected):
+ """Test that nested boolean params use =true/false syntax."""
+ item = ParameterSweepItem.from_record(input_dict)
+ cmd = item.apply_to_cmd(["vllm", "serve", "model"])
+ assert expected in cmd
+
+ @pytest.mark.parametrize(
+ "input_dict,expected",
+ [
+ ({"enable_prefix_caching": False}, "--no-enable-prefix-caching"),
+ ({"enable_prefix_caching": True}, "--enable-prefix-caching"),
+ ({"disable_log_stats": False}, "--no-disable-log-stats"),
+ ({"disable_log_stats": True}, "--disable-log-stats"),
+ ],
+ )
+ def test_non_nested_boolean_params(self, input_dict, expected):
+ """Test that non-nested boolean params use --no- prefix."""
+ item = ParameterSweepItem.from_record(input_dict)
+ cmd = item.apply_to_cmd(["vllm", "serve", "model"])
+ assert expected in cmd
+
+ @pytest.mark.parametrize(
+ "compilation_config",
+ [
+ {"cudagraph_mode": "full", "mode": 2, "use_inductor_graph_partition": True},
+ {
+ "cudagraph_mode": "piecewise",
+ "mode": 3,
+ "use_inductor_graph_partition": False,
+ },
+ ],
+ )
+ def test_nested_dict_value(self, compilation_config):
+ """Test that nested dict values are serialized as JSON."""
+ item = ParameterSweepItem.from_record(
+ {"compilation_config": compilation_config}
+ )
+ cmd = item.apply_to_cmd(["vllm", "serve", "model"])
+ assert "--compilation-config" in cmd
+ # The dict should be JSON serialized
+ idx = cmd.index("--compilation-config")
+ assert json.loads(cmd[idx + 1]) == compilation_config
+
+ @pytest.mark.parametrize(
+ "input_dict,expected_key,expected_value",
+ [
+ ({"model": "test-model"}, "--model", "test-model"),
+ ({"max_tokens": 100}, "--max-tokens", "100"),
+ ({"temperature": 0.7}, "--temperature", "0.7"),
+ ],
+ )
+ def test_string_and_numeric_values(self, input_dict, expected_key, expected_value):
+ """Test that string and numeric values are handled correctly."""
+ item = ParameterSweepItem.from_record(input_dict)
+ cmd = item.apply_to_cmd(["vllm", "serve"])
+ assert expected_key in cmd
+ assert expected_value in cmd
+
+ @pytest.mark.parametrize(
+ "input_dict,expected_key,key_idx_offset",
+ [
+ ({"max_tokens": 200}, "--max-tokens", 1),
+ ({"enable_prefix_caching": False}, "--no-enable-prefix-caching", 0),
+ ],
+ )
+ def test_replace_existing_parameter(self, input_dict, expected_key, key_idx_offset):
+ """Test that existing parameters in cmd are replaced."""
+ item = ParameterSweepItem.from_record(input_dict)
+
+ if key_idx_offset == 1:
+ # Key-value pair
+ cmd = item.apply_to_cmd(["vllm", "serve", "--max-tokens", "100", "model"])
+ assert expected_key in cmd
+ idx = cmd.index(expected_key)
+ assert cmd[idx + 1] == "200"
+ assert "100" not in cmd
+ else:
+ # Boolean flag
+ cmd = item.apply_to_cmd(
+ ["vllm", "serve", "--enable-prefix-caching", "model"]
+ )
+ assert expected_key in cmd
+ assert "--enable-prefix-caching" not in cmd
+
+
+class TestParameterSweep:
+ """Test ParameterSweep functionality."""
+
+ def test_from_records_list(self):
+ """Test creating ParameterSweep from a list of records."""
+ records = [
+ {"max_tokens": 100, "temperature": 0.7},
+ {"max_tokens": 200, "temperature": 0.9},
+ ]
+ sweep = ParameterSweep.from_records(records)
+ assert len(sweep) == 2
+ assert sweep[0]["max_tokens"] == 100
+ assert sweep[1]["max_tokens"] == 200
+
+ def test_read_from_dict(self):
+ """Test creating ParameterSweep from a dict format."""
+ data = {
+ "experiment1": {"max_tokens": 100, "temperature": 0.7},
+ "experiment2": {"max_tokens": 200, "temperature": 0.9},
+ }
+ sweep = ParameterSweep.read_from_dict(data)
+ assert len(sweep) == 2
+
+ # Check that items have the _benchmark_name field
+ names = {item["_benchmark_name"] for item in sweep}
+ assert names == {"experiment1", "experiment2"}
+
+ # Check that parameters are preserved
+ for item in sweep:
+ if item["_benchmark_name"] == "experiment1":
+ assert item["max_tokens"] == 100
+ assert item["temperature"] == 0.7
+ elif item["_benchmark_name"] == "experiment2":
+ assert item["max_tokens"] == 200
+ assert item["temperature"] == 0.9
+
+ def test_read_json_list_format(self):
+ """Test reading JSON file with list format."""
+ records = [
+ {"max_tokens": 100, "temperature": 0.7},
+ {"max_tokens": 200, "temperature": 0.9},
+ ]
+
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+ json.dump(records, f)
+ temp_path = Path(f.name)
+
+ try:
+ sweep = ParameterSweep.read_json(temp_path)
+ assert len(sweep) == 2
+ assert sweep[0]["max_tokens"] == 100
+ assert sweep[1]["max_tokens"] == 200
+ finally:
+ temp_path.unlink()
+
+ def test_read_json_dict_format(self):
+ """Test reading JSON file with dict format."""
+ data = {
+ "experiment1": {"max_tokens": 100, "temperature": 0.7},
+ "experiment2": {"max_tokens": 200, "temperature": 0.9},
+ }
+
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+ json.dump(data, f)
+ temp_path = Path(f.name)
+
+ try:
+ sweep = ParameterSweep.read_json(temp_path)
+ assert len(sweep) == 2
+
+ # Check that items have the _benchmark_name field
+ names = {item["_benchmark_name"] for item in sweep}
+ assert names == {"experiment1", "experiment2"}
+ finally:
+ temp_path.unlink()
+
+ def test_unique_benchmark_names_validation(self):
+ """Test that duplicate _benchmark_name values raise an error."""
+ # Test with duplicate names in list format
+ records = [
+ {"_benchmark_name": "exp1", "max_tokens": 100},
+ {"_benchmark_name": "exp1", "max_tokens": 200},
+ ]
+
+ with pytest.raises(ValueError, match="Duplicate _benchmark_name values"):
+ ParameterSweep.from_records(records)
+
+ def test_unique_benchmark_names_multiple_duplicates(self):
+ """Test validation with multiple duplicate names."""
+ records = [
+ {"_benchmark_name": "exp1", "max_tokens": 100},
+ {"_benchmark_name": "exp1", "max_tokens": 200},
+ {"_benchmark_name": "exp2", "max_tokens": 300},
+ {"_benchmark_name": "exp2", "max_tokens": 400},
+ ]
+
+ with pytest.raises(ValueError, match="Duplicate _benchmark_name values"):
+ ParameterSweep.from_records(records)
+
+ def test_no_benchmark_names_allowed(self):
+ """Test that records without _benchmark_name are allowed."""
+ records = [
+ {"max_tokens": 100, "temperature": 0.7},
+ {"max_tokens": 200, "temperature": 0.9},
+ ]
+ sweep = ParameterSweep.from_records(records)
+ assert len(sweep) == 2
+
+ def test_mixed_benchmark_names_allowed(self):
+ """Test that mixing records with and without _benchmark_name is allowed."""
+ records = [
+ {"_benchmark_name": "exp1", "max_tokens": 100},
+ {"max_tokens": 200, "temperature": 0.9},
+ ]
+ sweep = ParameterSweep.from_records(records)
+ assert len(sweep) == 2
+
+
+class TestParameterSweepItemKeyNormalization:
+ """Test key normalization in ParameterSweepItem."""
+
+ def test_underscore_to_hyphen_conversion(self):
+ """Test that underscores are converted to hyphens in CLI."""
+ item = ParameterSweepItem.from_record({"max_tokens": 100})
+ cmd = item.apply_to_cmd(["vllm", "serve"])
+ assert "--max-tokens" in cmd
+
+ def test_nested_key_preserves_suffix(self):
+ """Test that nested keys preserve the suffix format."""
+ # The suffix after the dot should preserve underscores
+ item = ParameterSweepItem.from_record(
+ {"compilation_config.some_nested_param": "value"}
+ )
+ cmd = item.apply_to_cmd(["vllm", "serve"])
+ # The prefix (compilation_config) gets converted to hyphens,
+ # but the suffix (some_nested_param) is preserved
+ assert any("compilation-config.some_nested_param" in arg for arg in cmd)
diff --git a/tests/benchmarks/test_plot_filters.py b/tests/benchmarks/test_plot_filters.py
new file mode 100644
index 0000000000000..2b58a99125e6c
--- /dev/null
+++ b/tests/benchmarks/test_plot_filters.py
@@ -0,0 +1,171 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pandas as pd
+import pytest
+
+from vllm.benchmarks.sweep.plot import (
+ PlotEqualTo,
+ PlotFilterBase,
+ PlotFilters,
+ PlotGreaterThan,
+ PlotGreaterThanOrEqualTo,
+ PlotLessThan,
+ PlotLessThanOrEqualTo,
+ PlotNotEqualTo,
+)
+
+
+class TestPlotFilters:
+ """Test PlotFilter functionality including 'inf' edge case."""
+
+ def setup_method(self):
+ """Create sample DataFrames for testing."""
+ # DataFrame with numeric values
+ self.df_numeric = pd.DataFrame(
+ {
+ "request_rate": [1.0, 5.0, 10.0, 50.0, 100.0],
+ "value": [10, 20, 30, 40, 50],
+ }
+ )
+
+ # DataFrame with float('inf') - note: string "inf" values are coerced
+ # to float when loading data, so we only test with float('inf')
+ self.df_inf_float = pd.DataFrame(
+ {
+ "request_rate": [1.0, 5.0, 10.0, float("inf"), float("inf")],
+ "value": [10, 20, 30, 40, 50],
+ }
+ )
+
+ @pytest.mark.parametrize(
+ "target,expected_count",
+ [
+ ("5.0", 1),
+ ("10.0", 1),
+ ("1.0", 1),
+ ],
+ )
+ def test_equal_to_numeric(self, target, expected_count):
+ """Test PlotEqualTo with numeric values."""
+ filter_obj = PlotEqualTo("request_rate", target)
+ result = filter_obj.apply(self.df_numeric)
+ assert len(result) == expected_count
+
+ def test_equal_to_inf_float(self):
+ """Test PlotEqualTo with float('inf')."""
+ filter_obj = PlotEqualTo("request_rate", "inf")
+ result = filter_obj.apply(self.df_inf_float)
+ # Should match both float('inf') entries because float('inf') == float('inf')
+ assert len(result) == 2
+
+ @pytest.mark.parametrize(
+ "target,expected_count",
+ [
+ ("5.0", 4), # All except 5.0
+ ("1.0", 4), # All except 1.0
+ ],
+ )
+ def test_not_equal_to_numeric(self, target, expected_count):
+ """Test PlotNotEqualTo with numeric values."""
+ filter_obj = PlotNotEqualTo("request_rate", target)
+ result = filter_obj.apply(self.df_numeric)
+ assert len(result) == expected_count
+
+ def test_not_equal_to_inf_float(self):
+ """Test PlotNotEqualTo with float('inf')."""
+ filter_obj = PlotNotEqualTo("request_rate", "inf")
+ result = filter_obj.apply(self.df_inf_float)
+ # Should exclude float('inf') entries
+ assert len(result) == 3
+
+ @pytest.mark.parametrize(
+ "target,expected_count",
+ [
+ ("10.0", 2), # 1.0, 5.0
+ ("50.0", 3), # 1.0, 5.0, 10.0
+ ("5.0", 1), # 1.0
+ ],
+ )
+ def test_less_than(self, target, expected_count):
+ """Test PlotLessThan with numeric values."""
+ filter_obj = PlotLessThan("request_rate", target)
+ result = filter_obj.apply(self.df_numeric)
+ assert len(result) == expected_count
+
+ @pytest.mark.parametrize(
+ "target,expected_count",
+ [
+ ("10.0", 3), # 1.0, 5.0, 10.0
+ ("5.0", 2), # 1.0, 5.0
+ ],
+ )
+ def test_less_than_or_equal_to(self, target, expected_count):
+ """Test PlotLessThanOrEqualTo with numeric values."""
+ filter_obj = PlotLessThanOrEqualTo("request_rate", target)
+ result = filter_obj.apply(self.df_numeric)
+ assert len(result) == expected_count
+
+ @pytest.mark.parametrize(
+ "target,expected_count",
+ [
+ ("10.0", 2), # 50.0, 100.0
+ ("5.0", 3), # 10.0, 50.0, 100.0
+ ],
+ )
+ def test_greater_than(self, target, expected_count):
+ """Test PlotGreaterThan with numeric values."""
+ filter_obj = PlotGreaterThan("request_rate", target)
+ result = filter_obj.apply(self.df_numeric)
+ assert len(result) == expected_count
+
+ @pytest.mark.parametrize(
+ "target,expected_count",
+ [
+ ("10.0", 3), # 10.0, 50.0, 100.0
+ ("5.0", 4), # 5.0, 10.0, 50.0, 100.0
+ ],
+ )
+ def test_greater_than_or_equal_to(self, target, expected_count):
+ """Test PlotGreaterThanOrEqualTo with numeric values."""
+ filter_obj = PlotGreaterThanOrEqualTo("request_rate", target)
+ result = filter_obj.apply(self.df_numeric)
+ assert len(result) == expected_count
+
+ @pytest.mark.parametrize(
+ "filter_str,expected_var,expected_target,expected_type",
+ [
+ ("request_rate==5.0", "request_rate", "5.0", PlotEqualTo),
+ ("request_rate!=10.0", "request_rate", "10.0", PlotNotEqualTo),
+ ("request_rate<50.0", "request_rate", "50.0", PlotLessThan),
+ ("request_rate<=50.0", "request_rate", "50.0", PlotLessThanOrEqualTo),
+ ("request_rate>10.0", "request_rate", "10.0", PlotGreaterThan),
+ ("request_rate>=10.0", "request_rate", "10.0", PlotGreaterThanOrEqualTo),
+ ("request_rate==inf", "request_rate", "inf", PlotEqualTo),
+ ("request_rate!='inf'", "request_rate", "inf", PlotNotEqualTo),
+ ],
+ )
+ def test_parse_str(self, filter_str, expected_var, expected_target, expected_type):
+ """Test parsing filter strings."""
+ filter_obj = PlotFilterBase.parse_str(filter_str)
+ assert isinstance(filter_obj, expected_type)
+ assert filter_obj.var == expected_var
+ assert filter_obj.target == expected_target
+
+ def test_parse_str_inf_edge_case(self):
+ """Test parsing 'inf' string in filter."""
+ filter_obj = PlotFilterBase.parse_str("request_rate==inf")
+ assert isinstance(filter_obj, PlotEqualTo)
+ assert filter_obj.var == "request_rate"
+ assert filter_obj.target == "inf"
+
+ def test_parse_multiple_filters(self):
+ """Test parsing multiple filters."""
+ filters = PlotFilters.parse_str("request_rate>5.0,value<=40")
+ assert len(filters) == 2
+ assert isinstance(filters[0], PlotGreaterThan)
+ assert isinstance(filters[1], PlotLessThanOrEqualTo)
+
+ def test_parse_empty_filter(self):
+ """Test parsing empty filter string."""
+ filters = PlotFilters.parse_str("")
+ assert len(filters) == 0
diff --git a/tests/compile/test_dynamic_shapes_compilation.py b/tests/compile/test_dynamic_shapes_compilation.py
index c20aea822fe81..1966b03cd9c89 100644
--- a/tests/compile/test_dynamic_shapes_compilation.py
+++ b/tests/compile/test_dynamic_shapes_compilation.py
@@ -8,7 +8,7 @@ import torch
from vllm import LLM, SamplingParams
from vllm.config.compilation import CompilationMode, DynamicShapesType
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
from vllm.utils.torch_utils import is_torch_equal_or_newer
diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py
index dbe12dc5de705..4d213e030edb5 100644
--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -318,13 +318,18 @@ def test_attention_quant_pattern(
torch.set_default_dtype(dtype)
torch.manual_seed(42)
+ model_config = ModelConfig(
+ model=model_name,
+ max_model_len=2048,
+ dtype=dtype,
+ )
vllm_config = VllmConfig(
- model_config=ModelConfig(
- model=model_name,
- max_model_len=2048,
- dtype=dtype,
+ model_config=model_config,
+ scheduler_config=SchedulerConfig(
+ max_num_seqs=1024,
+ max_model_len=model_config.max_model_len,
+ is_encoder_decoder=model_config.is_encoder_decoder,
),
- scheduler_config=SchedulerConfig(max_num_seqs=1024),
compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
custom_ops=custom_ops_list,
diff --git a/tests/conftest.py b/tests/conftest.py
index 317b36ba6cb80..b20c9efef542a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -59,6 +59,7 @@ from vllm.distributed import (
)
from vllm.logger import init_logger
from vllm.logprobs import Logprob
+from vllm.multimodal.base import MediaWithBytes
from vllm.multimodal.utils import fetch_image
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams
@@ -1174,6 +1175,7 @@ def caplog_mp_spawn(tmp_path, monkeypatch):
"level": level,
"filename": log_path.as_posix(),
}
+ config["loggers"]["vllm"]["level"] = level
config_path.write_text(json.dumps(config))
@@ -1388,7 +1390,11 @@ class LocalAssetServer:
return f"{self.base_url}/{name}"
def get_image_asset(self, name: str) -> Image.Image:
- return fetch_image(self.url_for(name))
+ image = fetch_image(self.url_for(name))
+ # Unwrap MediaWithBytes if present
+ if isinstance(image, MediaWithBytes):
+ image = image.media
+ return image
@pytest.fixture(scope="session")
diff --git a/tests/distributed/test_eplb_spec_decode.py b/tests/distributed/test_eplb_spec_decode.py
index c055b7a3f6dd7..868cc702866e2 100644
--- a/tests/distributed/test_eplb_spec_decode.py
+++ b/tests/distributed/test_eplb_spec_decode.py
@@ -22,7 +22,14 @@ def get_model_args(
"num_speculative_tokens": 1,
"max_model_len": model_max_len,
}
-
+ eplb_config = {
+ "num_redundant_experts": tp_size,
+ "window_size": 128,
+ "step_interval": 1024,
+ "log_balancedness": False,
+ }
+ if use_async:
+ eplb_config["use_async"] = True
model_args = {
"pretrained": model_name,
"dtype": "auto",
@@ -31,15 +38,10 @@ def get_model_args(
"gpu_memory_utilization": 0.7,
"speculative_config": speculative_config,
"enable_expert_parallel": True,
- "num_redundant_experts": tp_size,
- "eplb_window_size": 128,
- "eplb_step_interval": 1024,
- "eplb_log_balancedness": False,
+ "eplb_config": eplb_config,
"enable_eplb": True,
"max_model_len": model_max_len,
}
- if use_async:
- model_args["eplb_config"] = {"use_async": True}
return model_args
diff --git a/tests/entrypoints/openai/test_chat_template.py b/tests/entrypoints/openai/test_chat_template.py
index ee79ed59c4102..77087ac21ea8b 100644
--- a/tests/entrypoints/openai/test_chat_template.py
+++ b/tests/entrypoints/openai/test_chat_template.py
@@ -6,7 +6,7 @@ import pytest
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
from ...models.registry import HF_EXAMPLE_MODELS
from ...utils import VLLM_PATH
diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py
index 4856cafef44b3..ea6b3d812d8fe 100644
--- a/tests/entrypoints/openai/test_lora_resolvers.py
+++ b/tests/entrypoints/openai/test_lora_resolvers.py
@@ -14,7 +14,7 @@ from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
from vllm.v1.engine.async_llm import AsyncLLM
MODEL_NAME = "openai-community/gpt2"
diff --git a/tests/entrypoints/openai/test_response_api_parsable_context.py b/tests/entrypoints/openai/test_response_api_parsable_context.py
new file mode 100644
index 0000000000000..1b2795770d4c7
--- /dev/null
+++ b/tests/entrypoints/openai/test_response_api_parsable_context.py
@@ -0,0 +1,87 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import pytest
+import pytest_asyncio
+from openai import OpenAI
+
+from ...utils import RemoteOpenAIServer
+
+MODEL_NAME = "Qwen/Qwen3-8B"
+
+
+@pytest.fixture(scope="module")
+def server():
+ args = ["--reasoning-parser", "qwen3", "--max_model_len", "5000"]
+ env_dict = dict(
+ VLLM_ENABLE_RESPONSES_API_STORE="1",
+ VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT="1",
+ # uncomment for tool calling
+ # PYTHON_EXECUTION_BACKEND="dangerously_use_uv",
+ )
+
+ with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as remote_server:
+ yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server):
+ async with server.get_async_client() as async_client:
+ yield async_client
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_basic(client: OpenAI, model_name: str):
+ response = await client.responses.create(
+ model=model_name,
+ input="What is 13 * 24?",
+ )
+ assert response is not None
+ print("response: ", response)
+ assert response.status == "completed"
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_reasoning_and_function_items(client: OpenAI, model_name: str):
+ response = await client.responses.create(
+ model=model_name,
+ input=[
+ {"type": "message", "content": "Hello.", "role": "user"},
+ {
+ "type": "reasoning",
+ "id": "lol",
+ "content": [
+ {
+ "type": "reasoning_text",
+ "text": "We need to respond: greeting.",
+ }
+ ],
+ "summary": [],
+ },
+ {
+ "arguments": '{"location": "Paris", "unit": "celsius"}',
+ "call_id": "call_5f7b38f3b81e4b8380fd0ba74f3ca3ab",
+ "name": "get_weather",
+ "type": "function_call",
+ "id": "fc_4fe5d6fc5b6c4d6fa5f24cc80aa27f78",
+ "status": "completed",
+ },
+ {
+ "call_id": "call_5f7b38f3b81e4b8380fd0ba74f3ca3ab",
+ "id": "fc_4fe5d6fc5b6c4d6fa5f24cc80aa27f78",
+ "output": "The weather in Paris is 20 Celsius",
+ "status": "completed",
+ "type": "function_call_output",
+ },
+ ],
+ temperature=0.0,
+ )
+ assert response is not None
+ assert response.status == "completed"
+ # make sure we get a reasoning and text output
+ assert response.output[0].type == "reasoning"
+ assert response.output[1].type == "message"
+ assert type(response.output[1].content[0].text) is str
diff --git a/tests/entrypoints/openai/test_return_token_ids.py b/tests/entrypoints/openai/test_return_token_ids.py
index feef48a36dfa1..8537082e3f8d1 100644
--- a/tests/entrypoints/openai/test_return_token_ids.py
+++ b/tests/entrypoints/openai/test_return_token_ids.py
@@ -3,7 +3,7 @@
import pytest
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
from ...utils import RemoteOpenAIServer
diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py
index cedf6ce160607..d4d9a6c5b6120 100644
--- a/tests/entrypoints/openai/test_return_tokens_as_ids.py
+++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py
@@ -7,7 +7,7 @@
import pytest
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
from ...utils import RemoteOpenAIServer
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index 6a1b15c4131e0..9ea65f9fa6e7a 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -14,7 +14,7 @@ from vllm.config.multimodal import MultiModalConfig
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
from vllm.v1.engine.async_llm import AsyncLLM
from ...utils import RemoteOpenAIServer
diff --git a/tests/entrypoints/openai/test_token_in_token_out.py b/tests/entrypoints/openai/test_token_in_token_out.py
index 25eb5882be89c..c7f8abe27e6e0 100644
--- a/tests/entrypoints/openai/test_token_in_token_out.py
+++ b/tests/entrypoints/openai/test_token_in_token_out.py
@@ -7,7 +7,7 @@ import tempfile
import pytest
from vllm.model_executor.model_loader.weight_utils import download_weights_from_hf
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
from ...utils import RemoteOpenAIServer
diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py
index 751f94319eb9f..052f9fecc18de 100644
--- a/tests/entrypoints/openai/test_tokenization.py
+++ b/tests/entrypoints/openai/test_tokenization.py
@@ -5,7 +5,7 @@ import pytest
import pytest_asyncio
import requests
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
from ...utils import RemoteOpenAIServer
diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py
index d83c6726e72da..ae8860ee877b4 100644
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -8,6 +8,7 @@ import pytest
import pytest_asyncio
from transformers import AutoProcessor
+from vllm.multimodal.base import MediaWithBytes
from vllm.multimodal.utils import encode_image_base64, fetch_image
from ...utils import RemoteOpenAIServer
@@ -111,7 +112,11 @@ def get_hf_prompt_tokens(model_name, content, image_url):
"content": f"{placeholder}{content}",
}
]
- images = [fetch_image(image_url)]
+ image = fetch_image(image_url)
+ # Unwrap MediaWithBytes if present
+ if isinstance(image, MediaWithBytes):
+ image = image.media
+ images = [image]
prompt = processor.tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
diff --git a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py
index b2303ab0e7b7c..ce6727bb04f6c 100644
--- a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py
@@ -271,7 +271,7 @@ async def test_streaming_product_tool_call():
@pytest.fixture
def qwen_tokenizer() -> TokenizerLike:
- from vllm.transformers_utils.tokenizer import get_tokenizer
+ from vllm.tokenizers import get_tokenizer
return get_tokenizer("Qwen/Qwen3-32B")
diff --git a/tests/entrypoints/pooling/embed/test_online.py b/tests/entrypoints/pooling/embed/test_online.py
index 6aac649bc3035..ddba1c790ba8c 100644
--- a/tests/entrypoints/pooling/embed/test_online.py
+++ b/tests/entrypoints/pooling/embed/test_online.py
@@ -18,7 +18,7 @@ from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
from vllm.platforms import current_platform
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
from vllm.utils.serial_utils import (
EMBED_DTYPE_TO_TORCH_DTYPE,
ENDIANNESS,
diff --git a/tests/entrypoints/pooling/embed/test_online_vision.py b/tests/entrypoints/pooling/embed/test_online_vision.py
index 83e7048b9def6..eebbcdd2e4396 100644
--- a/tests/entrypoints/pooling/embed/test_online_vision.py
+++ b/tests/entrypoints/pooling/embed/test_online_vision.py
@@ -9,6 +9,7 @@ from transformers import AutoProcessor
from tests.utils import VLLM_PATH, RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
+from vllm.multimodal.base import MediaWithBytes
from vllm.multimodal.utils import encode_image_base64, fetch_image
MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
@@ -62,7 +63,11 @@ def get_hf_prompt_tokens(model_name, content, image_url):
placeholder = "<|image_1|> "
prompt = f"{placeholder}{content}"
- images = [fetch_image(image_url)]
+ image = fetch_image(image_url)
+ # Unwrap MediaWithBytes if present
+ if isinstance(image, MediaWithBytes):
+ image = image.media
+ images = [image]
inputs = processor(prompt, images, return_tensors="pt")
return inputs.input_ids.shape[1]
diff --git a/tests/entrypoints/pooling/pooling/test_online.py b/tests/entrypoints/pooling/pooling/test_online.py
index 977c74d54a351..cc5c2f26f80fb 100644
--- a/tests/entrypoints/pooling/pooling/test_online.py
+++ b/tests/entrypoints/pooling/pooling/test_online.py
@@ -12,7 +12,7 @@ import torch
from tests.models.utils import check_embeddings_close
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
from vllm.utils.serial_utils import (
EMBED_DTYPE_TO_TORCH_DTYPE,
ENDIANNESS,
diff --git a/tests/entrypoints/sagemaker/conftest.py b/tests/entrypoints/sagemaker/conftest.py
index ad219eec18b79..1c34d738fa7a3 100644
--- a/tests/entrypoints/sagemaker/conftest.py
+++ b/tests/entrypoints/sagemaker/conftest.py
@@ -45,7 +45,10 @@ def basic_server_with_lora(smollm2_lora_files):
"64",
]
- envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"}
+ envs = {
+ "VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True",
+ "SAGEMAKER_ENABLE_STATEFUL_SESSIONS": "True",
+ }
with RemoteOpenAIServer(MODEL_NAME_SMOLLM, args, env_dict=envs) as remote_server:
yield remote_server
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index a351cda60621f..03a0c058ea690 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -28,8 +28,7 @@ from vllm.multimodal.utils import (
encode_image_base64,
encode_video_base64,
)
-from vllm.tokenizers import MistralTokenizer
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import MistralTokenizer, get_tokenizer
from ..models.registry import HF_EXAMPLE_MODELS
from ..utils import VLLM_PATH
diff --git a/tests/entrypoints/test_responses_utils.py b/tests/entrypoints/test_responses_utils.py
index 893d806b65742..3951bd4840085 100644
--- a/tests/entrypoints/test_responses_utils.py
+++ b/tests/entrypoints/test_responses_utils.py
@@ -5,6 +5,8 @@ import pytest
from openai.types.responses.response_function_tool_call_output_item import (
ResponseFunctionToolCallOutputItem,
)
+from openai.types.responses.response_output_message import ResponseOutputMessage
+from openai.types.responses.response_output_text import ResponseOutputText
from openai.types.responses.response_reasoning_item import (
Content,
ResponseReasoningItem,
@@ -101,3 +103,22 @@ class TestResponsesUtils:
)
with pytest.raises(ValueError):
construct_chat_message_with_tool_call(item)
+
+ output_item = ResponseOutputMessage(
+ id="msg_bf585bbbe3d500e0",
+ content=[
+ ResponseOutputText(
+ annotations=[],
+ text="dongyi",
+ type="output_text",
+ logprobs=None,
+ )
+ ],
+ role="assistant",
+ status="completed",
+ type="message",
+ )
+
+ formatted_item = construct_chat_message_with_tool_call(output_item)
+ assert formatted_item["role"] == "assistant"
+ assert formatted_item["content"] == "dongyi"
diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py
index b163559a9414d..54059ec561907 100644
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -33,14 +33,16 @@ def test_worker_apply_lora(qwen3_lora_files):
lora_requests, lora_mapping
)
+ model_config = ModelConfig(
+ MODEL_PATH,
+ seed=0,
+ dtype="float16",
+ max_model_len=127,
+ enforce_eager=True,
+ )
+
vllm_config = VllmConfig(
- model_config=ModelConfig(
- MODEL_PATH,
- seed=0,
- dtype="float16",
- max_model_len=127,
- enforce_eager=True,
- ),
+ model_config=model_config,
load_config=LoadConfig(
download_dir=None,
load_format="dummy",
@@ -50,7 +52,14 @@ def test_worker_apply_lora(qwen3_lora_files):
tensor_parallel_size=1,
data_parallel_size=1,
),
- scheduler_config=SchedulerConfig("generate", 32, 32, 32),
+ scheduler_config=SchedulerConfig(
+ max_model_len=model_config.max_model_len,
+ is_encoder_decoder=model_config.is_encoder_decoder,
+ runner_type="generate",
+ max_num_batched_tokens=32,
+ max_num_seqs=32,
+ max_num_partial_prefills=32,
+ ),
device_config=DeviceConfig("cuda"),
cache_config=CacheConfig(
block_size=16,
diff --git a/tests/models/language/generation/test_mistral.py b/tests/models/language/generation/test_mistral.py
index 1377776a6d84b..e2d6271e2faed 100644
--- a/tests/models/language/generation/test_mistral.py
+++ b/tests/models/language/generation/test_mistral.py
@@ -315,3 +315,38 @@ def test_mistral_function_call_nested_json():
assert json.loads(parsed.tool_calls[0].function.arguments) == args_dict
# No additional content outside the tool call should be returned.
assert parsed.content is None
+
+ # multiple calls
+ multiple_args_dict = [
+ {
+ "city": "Dallas",
+ "state": "TX",
+ "unit": "fahrenheit",
+ "sub_dict": {"foo": "bar", "inner": {"x": 1, "y": 2}},
+ },
+ {},
+ {"a": 0},
+ {"a": 1, "b": "c"},
+ ]
+ names = ["get_current_weather", "get_current_weather_2", "random", "random_2"]
+
+ model_output = "".join(
+ [
+ f"{parser.bot_token}{name}{json.dumps(args)}"
+ for name, args in zip(names, multiple_args_dict)
+ ]
+ )
+
+ parsed = parser.extract_tool_calls(model_output, None)
+
+    # Assertions: all tool calls are detected and each call's nested JSON
+    # arguments are parsed without truncation.
+ assert parsed.tools_called
+ assert len(parsed.tool_calls) == len(multiple_args_dict)
+
+ for i, tool_call in enumerate(parsed.tool_calls):
+ assert MistralToolCall.is_valid_id(tool_call.id)
+ assert tool_call.function.name == names[i]
+ assert json.loads(tool_call.function.arguments) == multiple_args_dict[i]
+ # No additional content outside the tool call should be returned.
+ assert parsed.content is None
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index c39e522100901..8ef1fba8df3e3 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -22,10 +22,10 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
from vllm.multimodal.cache import MultiModalProcessorOnlyCache
from vllm.multimodal.inputs import MultiModalInputs
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
-from vllm.tokenizers import MistralTokenizer
-from vllm.transformers_utils.tokenizer import (
+from vllm.tokenizers import (
+ MistralTokenizer,
+ TokenizerLike,
cached_tokenizer_from_config,
- encode_tokens,
)
from ....multimodal.utils import random_audio, random_image, random_video
@@ -154,7 +154,7 @@ def get_text_token_prompts(
mm_data: MultiModalDataDict,
):
dummy_inputs = processor.dummy_inputs
- tokenizer = processor.info.get_tokenizer()
+ tokenizer: TokenizerLike = processor.info.get_tokenizer()
model_config = processor.info.ctx.model_config
model_type = model_config.hf_config.model_type
@@ -191,10 +191,9 @@ def get_text_token_prompts(
assert isinstance(inputs.prompt, str)
text_prompt = inputs.prompt
- token_prompt = encode_tokens(
- tokenizer,
+ token_prompt = tokenizer.encode(
text_prompt,
- add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type),
+ add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type, True),
)
return text_prompt, token_prompt
diff --git a/tests/models/multimodal/processing/test_llama4.py b/tests/models/multimodal/processing/test_llama4.py
index 4c0791ea3cece..b73246b68b36a 100644
--- a/tests/models/multimodal/processing/test_llama4.py
+++ b/tests/models/multimodal/processing/test_llama4.py
@@ -5,7 +5,6 @@
import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.transformers_utils.tokenizer import encode_tokens
from ....conftest import ImageTestAssets
from ...utils import build_model_context
@@ -48,7 +47,7 @@ def test_processor_override(
]
}
if tokenized_prompt:
- prompt = encode_tokens(tokenizer, prompt)
+ prompt = tokenizer.encode(prompt)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
mm_data = processed_inputs["mm_kwargs"].get_data()
diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py
index 66a3fbe11b6a5..7628ab4fe2349 100644
--- a/tests/models/multimodal/processing/test_tensor_schema.py
+++ b/tests/models/multimodal/processing/test_tensor_schema.py
@@ -31,7 +31,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
from vllm.multimodal.utils import group_mm_kwargs_by_modality
from vllm.platforms import current_platform
-from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
+from vllm.tokenizers import cached_tokenizer_from_config
from vllm.utils.collection_utils import is_list_of
from vllm.utils.torch_utils import set_default_torch_dtype
diff --git a/tests/models/registry.py b/tests/models/registry.py
index d90f3a4d4f781..6b1d24b1c99b5 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -358,6 +358,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True,
),
"MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"),
+ "MistralLarge3ForCausalLM": _HfExamplesInfo(
+ "mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4", is_available_online=False
+ ),
"MixtralForCausalLM": _HfExamplesInfo(
"mistralai/Mixtral-8x7B-Instruct-v0.1",
{"tiny": "TitanML/tiny-mixtral"},
@@ -770,7 +773,13 @@ _MULTIMODAL_EXAMPLE_MODELS = {
),
"PixtralForConditionalGeneration": _HfExamplesInfo(
"mistralai/Pixtral-12B-2409",
+ extras={
+ "mistral-large-3": "mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4",
+ "ministral-3": "mistralai/Ministral-3-3B-Instruct-2512",
+ },
tokenizer_mode="mistral",
+ # TODO: revert once Mistral-Large-3 and Ministral-3 are publicly available.
+ is_available_online=False,
),
"QwenVLForConditionalGeneration": _HfExamplesInfo(
"Qwen/Qwen-VL",
@@ -822,7 +831,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b"),
"Tarsier2ForConditionalGeneration": _HfExamplesInfo(
"omni-research/Tarsier2-Recap-7b",
- hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
+ hf_overrides={
+ "architectures": ["Tarsier2ForConditionalGeneration"],
+ "model_type": "tarsier2",
+ },
),
"VoxtralForConditionalGeneration": _HfExamplesInfo(
"mistralai/Voxtral-Mini-3B-2507",
@@ -870,6 +882,11 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
use_original_num_layers=True,
max_model_len=10240,
),
+ "EagleMistralLarge3ForCausalLM": _HfExamplesInfo(
+ "mistralai/Mistral-Large-3-675B-Instruct-2512",
+ speculative_model="mistralai/Mistral-Large-3-675B-Instruct-2512-Eagle",
+ is_available_online=False,
+ ),
"LlamaForCausalLMEagle3": _HfExamplesInfo(
"Qwen/Qwen3-8B",
trust_remote_code=True,
diff --git a/tests/models/test_gguf_download.py b/tests/models/test_gguf_download.py
index 155768ac9bff7..b1674cdf77178 100644
--- a/tests/models/test_gguf_download.py
+++ b/tests/models/test_gguf_download.py
@@ -203,7 +203,7 @@ class TestGGUFModelLoader:
@patch("vllm.config.model.get_hf_image_processor_config", return_value=None)
@patch("vllm.config.model.get_config")
@patch("vllm.config.model.is_gguf", return_value=False)
- @patch("vllm.transformers_utils.utils.check_gguf_file", return_value=False)
+ @patch("vllm.transformers_utils.gguf_utils.check_gguf_file", return_value=False)
@patch("os.path.isfile", return_value=False)
def test_prepare_weights_invalid_format(
self,
diff --git a/tests/models/utils.py b/tests/models/utils.py
index 9843887a13204..d84b4b820533e 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -13,7 +13,7 @@ from transformers import PretrainedConfig
from vllm.config.model import ModelConfig, ModelDType, RunnerOption
from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
from vllm.multimodal.processing import InputProcessingContext
-from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
+from vllm.tokenizers import cached_tokenizer_from_config
from .. import ci_envs
from .registry import HF_EXAMPLE_MODELS
diff --git a/tests/test_config.py b/tests/test_config.py
index 112b02edd0389..b7ed68fea92ab 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -6,12 +6,14 @@ from dataclasses import MISSING, Field, asdict, dataclass, field
from unittest.mock import patch
import pytest
+from pydantic import ValidationError
from vllm.compilation.backends import VllmBackend
from vllm.config import (
CompilationConfig,
ModelConfig,
PoolerConfig,
+ SchedulerConfig,
VllmConfig,
update_config,
)
@@ -716,7 +718,7 @@ def test_is_chunked_prefill_supported(
):
model_config = ModelConfig(model_id, trust_remote_code=True)
assert model_config.attn_type == expected_attn_type
- with caplog_vllm.at_level(level=logging.DEBUG):
+ with caplog_vllm.at_level(level=logging.DEBUG, logger="vllm"):
assert model_config.is_chunked_prefill_supported == expected_result
assert reason in caplog_vllm.text
@@ -835,7 +837,7 @@ def test_is_prefix_caching_supported(
):
model_config = ModelConfig(model_id, trust_remote_code=True)
assert model_config.attn_type == expected_attn_type
- with caplog_vllm.at_level(level=logging.DEBUG):
+ with caplog_vllm.at_level(level=logging.DEBUG, logger="vllm"):
assert model_config.is_prefix_caching_supported == expected_result
assert reason in caplog_vllm.text
@@ -1095,3 +1097,14 @@ def test_vllm_config_explicit_overrides():
# Other fields should still use defaults
assert config.compilation_config.mode == CompilationMode.VLLM_COMPILE
assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE
+
+
+def test_scheduler_config_init():
+ with pytest.raises(ValidationError):
+ # Positional InitVars missing
+ # (InitVars cannot have defaults otherwise they will become attributes)
+ SchedulerConfig()
+
+ with pytest.raises(AttributeError):
+ # InitVar does not become an attribute
+ print(SchedulerConfig.default_factory().max_model_len)
diff --git a/tests/test_inputs.py b/tests/test_inputs.py
index b1fb4e06a6906..c4339827de8b6 100644
--- a/tests/test_inputs.py
+++ b/tests/test_inputs.py
@@ -7,7 +7,7 @@ from vllm.config import ModelConfig
from vllm.inputs import zip_enc_dec_prompts
from vllm.inputs.parse import parse_raw_prompts
from vllm.inputs.preprocess import InputPreprocessor
-from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs
+from vllm.tokenizers import init_tokenizer_from_config
pytestmark = pytest.mark.cpu_test
@@ -108,7 +108,7 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
)
def test_preprocessor_always_mm_code_path(model_id, prompt):
model_config = ModelConfig(model=model_id)
- tokenizer = init_tokenizer_from_configs(model_config)
+ tokenizer = init_tokenizer_from_config(model_config)
input_preprocessor = InputPreprocessor(model_config, tokenizer)
# HF processor adds sep token
diff --git a/tests/test_logger.py b/tests/test_logger.py
index 8900e9c2a1e69..b4f44f52d4df9 100644
--- a/tests/test_logger.py
+++ b/tests/test_logger.py
@@ -57,7 +57,7 @@ def test_default_vllm_root_logger_configuration(monkeypatch):
_configure_vllm_root_logger()
logger = logging.getLogger("vllm")
- assert logger.level == logging.DEBUG
+ assert logger.level == logging.INFO
assert not logger.propagate
handler = logger.handlers[0]
@@ -524,7 +524,7 @@ def mp_function(**kwargs):
def test_caplog_mp_fork(caplog_vllm, caplog_mp_fork):
- with caplog_vllm.at_level(logging.DEBUG), caplog_mp_fork():
+ with caplog_vllm.at_level(logging.DEBUG, logger="vllm"), caplog_mp_fork():
import multiprocessing
ctx = multiprocessing.get_context("fork")
diff --git a/tests/tokenizers_/test_basic.py b/tests/tokenizers_/test_basic.py
index 1fca633cc5cd7..b152227a5a50f 100644
--- a/tests/tokenizers_/test_basic.py
+++ b/tests/tokenizers_/test_basic.py
@@ -5,8 +5,7 @@ from typing import _get_protocol_attrs # type: ignore
import pytest
from transformers import PreTrainedTokenizerBase
-from vllm.tokenizers import TokenizerLike
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import TokenizerLike, get_tokenizer
def _get_missing_attrs(obj: object, target: type):
diff --git a/tests/tokenizers_/test_mistral.py b/tests/tokenizers_/test_mistral.py
index 92efac86dff29..faff611502652 100644
--- a/tests/tokenizers_/test_mistral.py
+++ b/tests/tokenizers_/test_mistral.py
@@ -91,6 +91,118 @@ from vllm.tokenizers.mistral import (
],
),
),
+ (
+ {
+ "messages": [
+ {
+ "role": "user",
+ "content": "What is the current local date and time?",
+ }
+ ],
+ "tools": [
+ {
+ "type": "function",
+ "function": {
+ "description": "Fetch the current local date and time.",
+ "unsupported_field": False,
+ "name": "get_current_time",
+ "parameters": {},
+ },
+ },
+ {
+ "type": "function",
+ "function": {
+ "description": "Fetch the current local date and time.",
+ "unsupported_field2": False,
+ "name": "get_current_time",
+ "parameters": {},
+ },
+ },
+ ],
+ },
+ (
+ [
+ {
+ "role": "user",
+ "content": "What is the current local date and time?",
+ }
+ ],
+ [
+ {
+ "type": "function",
+ "function": {
+ "description": "Fetch the current local date and time.",
+ "name": "get_current_time",
+ "parameters": {},
+ },
+ },
+ {
+ "type": "function",
+ "function": {
+ "description": "Fetch the current local date and time.",
+ "name": "get_current_time",
+ "parameters": {},
+ },
+ },
+ ],
+ ),
+ ),
+ (
+ {
+ "messages": [
+ {
+ "role": "user",
+ "content": "What is the current local date and time?",
+ }
+ ],
+ "tools": [
+ {
+ "type": "function",
+ "unsupported_field": False,
+ "function": {
+ "description": "Fetch the current local date and time.",
+ "name": "get_current_time",
+ "parameters": {},
+ },
+ },
+ {
+ "type": "function",
+ "unsupported_field2": False,
+ "function": {
+ "description": "Fetch the current local date and time 2.",
+ "name": "get_current_time2",
+ "parameters": {"a": "1"},
+ },
+ },
+ ],
+ },
+ (
+ [
+ {
+ "role": "user",
+ "content": "What is the current local date and time?",
+ }
+ ],
+ [
+ {
+ "type": "function",
+ "function": {
+ "description": "Fetch the current local date and time.",
+ "name": "get_current_time",
+ "parameters": {},
+ },
+ },
+ {
+ "type": "function",
+ "function": {
+ "description": "Fetch the current local date and time 2.",
+ "name": "get_current_time2",
+ "parameters": {"a": "1"},
+ },
+ },
+ ],
+ ),
+ ),
],
)
def test_prepare_apply_chat_template_tools_and_messages(
@@ -1108,13 +1220,6 @@ class TestMistralTokenizer:
)
== expected_tokens[mistral_tokenizer.is_tekken]
)
- assert (
- mistral_tokenizer.decode(
- ids[mistral_tokenizer.is_tekken],
- skip_special_tokens=skip_special_tokens,
- )
- == expected_tokens[mistral_tokenizer.is_tekken]
- )
def test_decode_empty(
self,
@@ -1140,6 +1245,45 @@ class TestMistralTokenizer:
== ""
)
+ @pytest.mark.parametrize(
+ "skip_special_tokens,expected_tokens",
+ (
+ (
+ False,
+ (
+ ["[INST]▁Hello▁world▁![/INST]▁Hello"],
+ ["[INST]Hello world ![/INST]Hello"],
+ ),
+ ),
+ (True, (["Hello world ! Hello"], ["Hello world !Hello"])),
+ ),
+ )
+ def test_batch_decode(
+ self,
+ mistral_tokenizer: MistralTokenizer,
+ skip_special_tokens: bool,
+ expected_tokens: tuple[str, str],
+ ):
+ ids = (
+ [[1, 3, 23325, 2294, 1686, 4, 23325, 2]],
+ [[1, 3, 22177, 4304, 2662, 4, 22177, 2]],
+ )
+ assert (
+ mistral_tokenizer.batch_decode(
+ ids[mistral_tokenizer.is_tekken],
+ skip_special_tokens=skip_special_tokens,
+ )
+ == expected_tokens[mistral_tokenizer.is_tekken]
+ )
+
+ def test_batch_decode_empty(
+ self,
+ mistral_tokenizer: MistralTokenizer,
+ ):
+ assert mistral_tokenizer.batch_decode(
+ [[]],
+ ) == [""]
+
def test_convert_tokens_to_string(self, mistral_tokenizer: MistralTokenizer):
tokens = (
[
diff --git a/tests/tokenizers_/test_registry.py b/tests/tokenizers_/test_registry.py
index 57b6a14a54b3f..7e795350d64c8 100644
--- a/tests/tokenizers_/test_registry.py
+++ b/tests/tokenizers_/test_registry.py
@@ -2,8 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path
-from vllm.tokenizers import TokenizerLike, TokenizerRegistry
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import TokenizerLike, TokenizerRegistry, get_tokenizer
class TestTokenizer(TokenizerLike):
diff --git a/tests/tool_use/test_deepseekv31_tool_parser.py b/tests/tool_use/test_deepseekv31_tool_parser.py
index db5168071fbce..8beb7739b6081 100644
--- a/tests/tool_use/test_deepseekv31_tool_parser.py
+++ b/tests/tool_use/test_deepseekv31_tool_parser.py
@@ -6,7 +6,7 @@ import pytest
from vllm.entrypoints.openai.tool_parsers.deepseekv31_tool_parser import (
DeepSeekV31ToolParser,
)
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
MODEL = "deepseek-ai/DeepSeek-V3.1"
diff --git a/tests/tool_use/test_ernie45_moe_tool_parser.py b/tests/tool_use/test_ernie45_moe_tool_parser.py
index 8fbbbba325385..92f86de23267b 100644
--- a/tests/tool_use/test_ernie45_moe_tool_parser.py
+++ b/tests/tool_use/test_ernie45_moe_tool_parser.py
@@ -14,9 +14,8 @@ from vllm.entrypoints.openai.protocol import (
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.ernie45_tool_parser import Ernie45ToolParser
-from vllm.tokenizers import TokenizerLike
+from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
-from vllm.transformers_utils.tokenizer import get_tokenizer
# Use a common model that is likely to be available
MODEL = "baidu/ERNIE-4.5-21B-A3B-Thinking"
diff --git a/tests/tool_use/test_glm4_moe_tool_parser.py b/tests/tool_use/test_glm4_moe_tool_parser.py
index f545f52c02dcb..753b3f1c23adf 100644
--- a/tests/tool_use/test_glm4_moe_tool_parser.py
+++ b/tests/tool_use/test_glm4_moe_tool_parser.py
@@ -10,7 +10,7 @@ from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
from vllm.entrypoints.openai.tool_parsers.glm4_moe_tool_parser import (
Glm4MoeModelToolParser,
)
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
pytestmark = pytest.mark.cpu_test
diff --git a/tests/tool_use/test_jamba_tool_parser.py b/tests/tool_use/test_jamba_tool_parser.py
index c7ca024f3a767..9036bd32dd704 100644
--- a/tests/tool_use/test_jamba_tool_parser.py
+++ b/tests/tool_use/test_jamba_tool_parser.py
@@ -10,9 +10,8 @@ from partial_json_parser.core.options import Allow
from vllm.entrypoints.openai.protocol import DeltaMessage, FunctionCall, ToolCall
from vllm.entrypoints.openai.tool_parsers.jamba_tool_parser import JambaToolParser
-from vllm.tokenizers import TokenizerLike
+from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
-from vllm.transformers_utils.tokenizer import get_tokenizer
pytestmark = pytest.mark.cpu_test
diff --git a/tests/tool_use/test_kimi_k2_tool_parser.py b/tests/tool_use/test_kimi_k2_tool_parser.py
index 3a48b5206141d..1558a9c3e01f2 100644
--- a/tests/tool_use/test_kimi_k2_tool_parser.py
+++ b/tests/tool_use/test_kimi_k2_tool_parser.py
@@ -8,7 +8,7 @@ import pytest
from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
from vllm.entrypoints.openai.tool_parsers.kimi_k2_tool_parser import KimiK2ToolParser
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
pytestmark = pytest.mark.cpu_test
diff --git a/tests/tool_use/test_minimax_tool_parser.py b/tests/tool_use/test_minimax_tool_parser.py
index 4332984083dab..dda63f984a832 100644
--- a/tests/tool_use/test_minimax_tool_parser.py
+++ b/tests/tool_use/test_minimax_tool_parser.py
@@ -13,7 +13,7 @@ from vllm.entrypoints.openai.protocol import (
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.minimax_tool_parser import MinimaxToolParser
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
pytestmark = pytest.mark.cpu_test
diff --git a/tests/tool_use/test_openai_tool_parser.py b/tests/tool_use/test_openai_tool_parser.py
index c874a9601ae70..6537f281c0e1b 100644
--- a/tests/tool_use/test_openai_tool_parser.py
+++ b/tests/tool_use/test_openai_tool_parser.py
@@ -16,7 +16,7 @@ from openai_harmony import (
from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
from vllm.entrypoints.openai.tool_parsers.openai_tool_parser import OpenAIToolParser
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
MODEL = "gpt2"
diff --git a/tests/tool_use/test_qwen3coder_tool_parser.py b/tests/tool_use/test_qwen3coder_tool_parser.py
index 864bb0d0c06c2..5a56768805fdf 100644
--- a/tests/tool_use/test_qwen3coder_tool_parser.py
+++ b/tests/tool_use/test_qwen3coder_tool_parser.py
@@ -17,9 +17,8 @@ from vllm.entrypoints.openai.tool_parsers.qwen3coder_tool_parser import (
Qwen3CoderToolParser,
)
from vllm.entrypoints.openai.tool_parsers.qwen3xml_tool_parser import Qwen3XMLToolParser
-from vllm.tokenizers import TokenizerLike
+from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
-from vllm.transformers_utils.tokenizer import get_tokenizer
pytestmark = pytest.mark.cpu_test
diff --git a/tests/tool_use/test_seed_oss_tool_parser.py b/tests/tool_use/test_seed_oss_tool_parser.py
index d94df61128c9c..8795c35a1347f 100644
--- a/tests/tool_use/test_seed_oss_tool_parser.py
+++ b/tests/tool_use/test_seed_oss_tool_parser.py
@@ -15,9 +15,8 @@ from vllm.entrypoints.openai.protocol import (
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.seed_oss_tool_parser import SeedOssToolParser
-from vllm.tokenizers import TokenizerLike
+from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
-from vllm.transformers_utils.tokenizer import get_tokenizer
pytestmark = pytest.mark.cpu_test
diff --git a/tests/tool_use/test_xlam_tool_parser.py b/tests/tool_use/test_xlam_tool_parser.py
index fdcdd4038131a..3098fda036a81 100644
--- a/tests/tool_use/test_xlam_tool_parser.py
+++ b/tests/tool_use/test_xlam_tool_parser.py
@@ -13,9 +13,8 @@ from vllm.entrypoints.openai.protocol import (
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.xlam_tool_parser import xLAMToolParser
-from vllm.tokenizers import TokenizerLike
+from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
-from vllm.transformers_utils.tokenizer import get_tokenizer
pytestmark = pytest.mark.cpu_test
diff --git a/tests/transformers_utils/test_config.py b/tests/transformers_utils/test_config.py
index 7b56c9f0189d4..85680c41ed74d 100644
--- a/tests/transformers_utils/test_config.py
+++ b/tests/transformers_utils/test_config.py
@@ -6,8 +6,8 @@ only get the `eos_token_id` from the tokenizer as defined by
`vllm.LLMEngine._get_eos_token_id`.
"""
+from vllm.tokenizers import get_tokenizer
from vllm.transformers_utils.config import try_get_generation_config
-from vllm.transformers_utils.tokenizer import get_tokenizer
def test_get_llama3_eos_token():
diff --git a/tests/transformers_utils/test_utils.py b/tests/transformers_utils/test_utils.py
index a8d0b9be9ec29..0a6a65b4133c9 100644
--- a/tests/transformers_utils/test_utils.py
+++ b/tests/transformers_utils/test_utils.py
@@ -5,13 +5,15 @@ from unittest.mock import patch
import pytest
+from vllm.transformers_utils.gguf_utils import (
+ is_gguf,
+ is_remote_gguf,
+ split_remote_gguf,
+)
from vllm.transformers_utils.utils import (
is_cloud_storage,
is_gcs,
- is_gguf,
- is_remote_gguf,
is_s3,
- split_remote_gguf,
)
@@ -132,7 +134,7 @@ class TestSplitRemoteGGUF:
class TestIsGGUF:
"""Test is_gguf utility function."""
- @patch("vllm.transformers_utils.utils.check_gguf_file", return_value=True)
+ @patch("vllm.transformers_utils.gguf_utils.check_gguf_file", return_value=True)
def test_is_gguf_with_local_file(self, mock_check_gguf):
"""Test is_gguf with local GGUF file."""
assert is_gguf("/path/to/model.gguf")
@@ -149,7 +151,7 @@ class TestIsGGUF:
assert not is_gguf("repo/model:quant")
assert not is_gguf("repo/model:INVALID")
- @patch("vllm.transformers_utils.utils.check_gguf_file", return_value=False)
+ @patch("vllm.transformers_utils.gguf_utils.check_gguf_file", return_value=False)
def test_is_gguf_false(self, mock_check_gguf):
"""Test is_gguf returns False for non-GGUF models."""
assert not is_gguf("unsloth/Qwen3-0.6B")
diff --git a/tests/utils.py b/tests/utils.py
index 9565b0ff06e36..539f67c47ac1d 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -44,7 +44,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.cli.serve import ServeSubcommand
from vllm.model_executor.model_loader import get_model_loader
from vllm.platforms import current_platform
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.mem_constants import GB_bytes
from vllm.utils.network_utils import get_open_port
diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py
index df3d53332c7cd..6cab129c116c5 100644
--- a/tests/v1/attention/utils.py
+++ b/tests/v1/attention/utils.py
@@ -185,6 +185,8 @@ def create_vllm_config(
max_num_seqs=max_num_seqs,
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=enable_chunked_prefill,
+ max_model_len=model_config.max_model_len,
+ is_encoder_decoder=model_config.is_encoder_decoder,
)
device_config = DeviceConfig()
diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
index 58a7a2692bfc8..fd5cf6d3e74aa 100644
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -1128,7 +1128,11 @@ def test_estimate_max_model_len(model_id, max_model_len, want_estimated_max_len)
dtype="float16",
max_model_len=max_model_len,
)
- scheduler_config = SchedulerConfig(max_num_batched_tokens=32768)
+ scheduler_config = SchedulerConfig(
+ max_num_batched_tokens=32768,
+ max_model_len=model_config.max_model_len,
+ is_encoder_decoder=model_config.is_encoder_decoder,
+ )
vllm_config = VllmConfig(
model_config=model_config,
@@ -1163,7 +1167,10 @@ def test_get_max_concurrency_for_kv_cache_config():
max_model_len=max_model_len,
)
scheduler_config = SchedulerConfig(
- max_num_batched_tokens=1024, enable_chunked_prefill=True
+ max_num_batched_tokens=1024,
+ enable_chunked_prefill=True,
+ max_model_len=model_config.max_model_len,
+ is_encoder_decoder=model_config.is_encoder_decoder,
)
vllm_config = VllmConfig(
diff --git a/tests/v1/core/test_priority_scheduler_random.py b/tests/v1/core/test_priority_scheduler_random.py
index b4805be802723..429b179b61dce 100644
--- a/tests/v1/core/test_priority_scheduler_random.py
+++ b/tests/v1/core/test_priority_scheduler_random.py
@@ -219,7 +219,17 @@ def test_priority_scheduling_blast(
vllm_config=scheduler.vllm_config,
)
scheduler.add_request(req)
-
+ num_initial_requests = 2
+ for _ in range(num_initial_requests):
+ req = _create_random_request(
+ max_tokens_range=(1, max_output_tokens),
+ num_tokens_range=(1, max_input_tokens),
+ arrival_time_range=(0, 0),
+ priority_range=(4, 4),
+ num_mm_item_range=(0, 2),
+ vllm_config=scheduler.vllm_config,
+ )
+ scheduler.add_request(req)
for _ in range(20000):
if len(scheduler.waiting) == 0:
num_new_requests = random.randint(0, 2)
diff --git a/tests/v1/core/test_reset_prefix_cache_e2e.py b/tests/v1/core/test_reset_prefix_cache_e2e.py
new file mode 100644
index 0000000000000..e543c30a156ec
--- /dev/null
+++ b/tests/v1/core/test_reset_prefix_cache_e2e.py
@@ -0,0 +1,66 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from vllm import EngineArgs, LLMEngine, SamplingParams
+
+PROMPTS = [
+ "A robot may not injure a human being ",
+ "To be or not to be,",
+ "What is the meaning of life?",
+ "What does the fox say? " * 20, # Test long prompt
+]
+
+
+def test_reset_prefix_cache_e2e():
+ engine_args = EngineArgs(
+ model="Qwen/Qwen3-0.6B",
+ gpu_memory_utilization=0.2,
+ async_scheduling=True,
+ max_num_batched_tokens=32,
+ max_model_len=2048,
+ compilation_config={"mode": 0},
+ )
+ engine = LLMEngine.from_engine_args(engine_args)
+ sampling_params = SamplingParams(
+ temperature=0.0,
+ max_tokens=16,
+ )
+
+ # No preempt case:
+ for i, prompt in enumerate(PROMPTS):
+ engine.add_request("ground_truth_" + str(i), prompt, sampling_params)
+
+ ground_truth_results = {}
+ while engine.has_unfinished_requests():
+ request_outputs = engine.step()
+ for request_output in request_outputs:
+ if request_output.finished:
+ ground_truth_results[request_output.request_id] = request_output
+
+ # Preempt case:
+ for i, prompt in enumerate(PROMPTS):
+ engine.add_request("preempted_" + str(i), prompt, sampling_params)
+
+ step_id = 0
+ preempted_results = {}
+ while engine.has_unfinished_requests():
+ if step_id == 10:
+ engine.reset_prefix_cache(reset_running_requests=True)
+
+ request_outputs = engine.step()
+
+ for request_output in request_outputs:
+ if request_output.finished:
+ preempted_results[request_output.request_id] = request_output
+ step_id += 1
+
+ for i in range(len(PROMPTS)):
+ assert (
+ ground_truth_results["ground_truth_" + str(i)].outputs[0].text
+ == preempted_results["preempted_" + str(i)].outputs[0].text
+ ), (
+ f"ground_truth_results['ground_truth_{i}'].outputs[0].text="
+ f"{ground_truth_results['ground_truth_' + str(i)].outputs[0].text} "
+ f"preempted_results['preempted_{i}'].outputs[0].text="
+ f"{preempted_results['preempted_' + str(i)].outputs[0].text}"
+ )
diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
index fe4153e609971..c6c4a5085bff7 100644
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -728,6 +728,37 @@ def test_preempt_during_execution():
assert requests[1].output_token_ids[0] == 42
+def test_scheduler_reset_prefix_cache():
+ scheduler = create_scheduler(enable_prefix_caching=True)
+ requests = create_requests(num_requests=10)
+ for request in requests:
+ scheduler.add_request(request)
+
+ # Initial scheduling; requests should be in the running state now
+ _ = scheduler.schedule()
+
+ # Verify requests moved from waiting to running
+ assert len(scheduler.waiting) == 0
+ assert len(scheduler.running) == len(requests)
+ for i, request in enumerate(requests):
+ assert scheduler.running[i] == request
+
+ # Reset prefix cache should fail since there are still running requests
+ # and they are still occupying KV cache
+ assert not scheduler.reset_prefix_cache()
+
+ # Reset prefix cache with reset_running_requests=True. All running requests
+ # should be pushed back to the waiting queue and their KV cache should be freed
+ assert scheduler.reset_prefix_cache(reset_running_requests=True)
+
+ # Verify requests moved from running to waiting
+ assert len(scheduler.waiting) == len(requests)
+ assert len(scheduler.running) == 0
+
+ for i, request in enumerate(requests):
+ assert scheduler.waiting[i] == request
+
+
# Note - these test cases mirror some of those in test_rejection_sampler.py
@pytest.mark.parametrize(
"spec_tokens,output_tokens,expected",
@@ -1477,6 +1508,12 @@ def create_scheduler_with_priority(
Returns:
{class}`Scheduler` instance with priority scheduling
"""
+ model_config = ModelConfig(
+ model=model,
+ trust_remote_code=True,
+ dtype="float16",
+ seed=42,
+ )
if max_model_len is None:
max_model_len = max_num_batched_tokens
scheduler_config = SchedulerConfig(
@@ -1486,14 +1523,9 @@ def create_scheduler_with_priority(
long_prefill_token_threshold=long_prefill_token_threshold,
disable_chunked_mm_input=disable_chunked_mm_input,
enable_chunked_prefill=True,
+ is_encoder_decoder=model_config.is_encoder_decoder,
policy="priority", # Enable priority scheduling
)
- model_config = ModelConfig(
- model=model,
- trust_remote_code=True,
- dtype="float16",
- seed=42,
- )
# Cache config, optionally force APC
cache_config = CacheConfig(
block_size=block_size,
diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py
index 7537c7a60476b..f5ba613d38db1 100644
--- a/tests/v1/core/utils.py
+++ b/tests/v1/core/utils.py
@@ -69,6 +69,13 @@ def create_scheduler(
Returns:
{class}`Scheduler` instance
"""
+ model_config = ModelConfig(
+ model=model,
+ trust_remote_code=True,
+ dtype="float16",
+ seed=42,
+ skip_tokenizer_init=skip_tokenizer_init,
+ )
if max_model_len is None:
max_model_len = max_num_batched_tokens
scheduler_config = SchedulerConfig(
@@ -79,13 +86,7 @@ def create_scheduler(
disable_chunked_mm_input=disable_chunked_mm_input,
enable_chunked_prefill=enable_chunked_prefill,
async_scheduling=async_scheduling,
- )
- model_config = ModelConfig(
- model=model,
- trust_remote_code=True,
- dtype="float16",
- seed=42,
- skip_tokenizer_init=skip_tokenizer_init,
+ is_encoder_decoder=model_config.is_encoder_decoder,
)
# Cache config, optionally force APC
cache_config = CacheConfig(
diff --git a/tests/v1/cudagraph/test_cudagraph_dispatch.py b/tests/v1/cudagraph/test_cudagraph_dispatch.py
index 314e7094ef97f..b86534d3d4381 100644
--- a/tests/v1/cudagraph/test_cudagraph_dispatch.py
+++ b/tests/v1/cudagraph/test_cudagraph_dispatch.py
@@ -40,7 +40,9 @@ def _create_vllm_config(
) -> MagicMock:
mock_config = MagicMock(spec=VllmConfig)
mock_config.compilation_config = compilation_config
- mock_config.scheduler_config = SchedulerConfig(max_num_seqs=max_num_seqs)
+ mock_config.scheduler_config = SchedulerConfig.default_factory(
+ max_num_seqs=max_num_seqs,
+ )
mock_config.parallel_config = ParallelConfig()
mock_config.speculative_config = None # No speculative decoding
if not lora_config:
diff --git a/tests/v1/cudagraph/test_cudagraph_mode.py b/tests/v1/cudagraph/test_cudagraph_mode.py
index 12621d493e549..b1895e83b8b37 100644
--- a/tests/v1/cudagraph/test_cudagraph_mode.py
+++ b/tests/v1/cudagraph/test_cudagraph_mode.py
@@ -100,32 +100,20 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
# test cudagraph_mode with different compilation mode.
# (backend_name, cudagraph_mode, compilation_mode, supported)
-if current_platform.is_rocm():
- combo_cases_2 = [
- ("RocmAttn", "FULL", CompilationMode.NONE, True),
- ("RocmAttn", "FULL", CompilationMode.VLLM_COMPILE, True),
- ("RocmAttn", "PIECEWISE", CompilationMode.NONE, False),
- ("RocmAttn", "PIECEWISE", CompilationMode.VLLM_COMPILE, True),
- ("RocmAttn", "FULL_AND_PIECEWISE", CompilationMode.NONE, False),
- ("RocmAttn", "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True),
- ("RocmAttn", "FULL_DECODE_ONLY", CompilationMode.NONE, True),
- ("RocmAttn", "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True),
- ("RocmAttn", "NONE", CompilationMode.NONE, True),
- ("RocmAttn", "NONE", CompilationMode.VLLM_COMPILE, True),
- ]
-else:
- combo_cases_2 = [
- ("FA2", "FULL", CompilationMode.NONE, True),
- ("FA2", "FULL", CompilationMode.VLLM_COMPILE, True),
- ("FA2", "PIECEWISE", CompilationMode.NONE, True),
- ("FA2", "PIECEWISE", CompilationMode.VLLM_COMPILE, True),
- ("FA2", "FULL_AND_PIECEWISE", CompilationMode.NONE, True),
- ("FA2", "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True),
- ("FA2", "FULL_DECODE_ONLY", CompilationMode.NONE, True),
- ("FA2", "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True),
- ("FA2", "NONE", CompilationMode.NONE, True),
- ("FA2", "NONE", CompilationMode.VLLM_COMPILE, True),
- ]
+attn_backend = "RocmAttn" if current_platform.is_rocm() else "FA2"
+
+combo_cases_2 = [
+ (attn_backend, "FULL", CompilationMode.NONE, True),
+ (attn_backend, "FULL", CompilationMode.VLLM_COMPILE, True),
+ (attn_backend, "PIECEWISE", CompilationMode.NONE, True),
+ (attn_backend, "PIECEWISE", CompilationMode.VLLM_COMPILE, True),
+ (attn_backend, "FULL_AND_PIECEWISE", CompilationMode.NONE, True),
+ (attn_backend, "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True),
+ (attn_backend, "FULL_DECODE_ONLY", CompilationMode.NONE, True),
+ (attn_backend, "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True),
+ (attn_backend, "NONE", CompilationMode.NONE, True),
+ (attn_backend, "NONE", CompilationMode.VLLM_COMPILE, True),
+]
@pytest.mark.parametrize(
diff --git a/tests/v1/e2e/test_correctness_sliding_window.py b/tests/v1/e2e/test_correctness_sliding_window.py
index 71b0e86c75c18..b6a78eaa09209 100644
--- a/tests/v1/e2e/test_correctness_sliding_window.py
+++ b/tests/v1/e2e/test_correctness_sliding_window.py
@@ -5,6 +5,7 @@ from dataclasses import dataclass
import pytest
from vllm import LLM, SamplingParams
+from vllm.platforms import current_platform
from ...utils import check_answers, prep_prompts
@@ -40,10 +41,17 @@ def test_sliding_window_retrieval(
If we tell it upfront which we are going to be looking for, then
it answers correctly (mostly).
"""
+ # NOTE: For ROCm, we have to enforce eager mode to use custom kernel
+ # implementation of GELU with tanh approximation, as PyTorch's native
+ # implementation is currently unstable with torch.compile and produces garbage.
+ enforce_eager = current_platform.is_rocm()
+
test_config = model_config[model]
llm = LLM(
- model=model, disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager
+ model=model,
+ disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager,
+ enforce_eager=enforce_eager,
)
sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
diff --git a/tests/v1/e2e/test_kv_sharing_fast_prefill.py b/tests/v1/e2e/test_kv_sharing_fast_prefill.py
index 2778b0c5e5670..f895fb72e94a1 100644
--- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py
+++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py
@@ -7,6 +7,7 @@ import pytest
from vllm import LLM, SamplingParams
from vllm.config import CompilationConfig, CompilationMode
+from vllm.platforms import current_platform
from ...utils import check_answers, fork_new_process_for_each_test, prep_prompts
@@ -43,15 +44,26 @@ def test_prompts():
return prompts
-@fork_new_process_for_each_test
+use_fork_for_test = (
+ fork_new_process_for_each_test if not current_platform.is_rocm() else lambda x: x
+)
+
+
+@use_fork_for_test
@pytest.mark.parametrize("kv_sharing_fast_prefill", [False, True])
@pytest.mark.parametrize("enforce_eager", [True, False])
def test_kv_sharing_fast_prefill(
monkeypatch: pytest.MonkeyPatch,
kv_sharing_fast_prefill: bool,
enforce_eager: bool,
- test_prompts: list[str],
):
+ if not enforce_eager and current_platform.is_rocm():
+ # Relevant context: https://github.com/vllm-project/vllm/pull/29244
+ pytest.skip(
+ "ROCm: torch.compile produces incorrect output for gemma-3n's GELU "
+ "with tanh approximation. Use enforce_eager=True instead."
+ )
+
sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
compilation_config = CompilationConfig(
# This allows vLLM compilation backend to handle allocating and
@@ -65,7 +77,11 @@ def test_kv_sharing_fast_prefill(
with monkeypatch.context() as m:
# Make scheduling deterministic for reproducibility
- m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+ if current_platform.is_rocm():
+ # Use spawn to prevent cuda re-initialization error
+ m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
+ else:
+ m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
prompts, answer, indices = prep_prompts(batch_size)
diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index f711715dec0e6..5246ea6517f6c 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -414,7 +414,10 @@ def test_eagle_correctness(
)
if attn_backend == "FLASH_ATTN" and current_platform.is_rocm():
- m.setenv("VLLM_ROCM_USE_AITER", "1")
+ if "deepseek" in model_setup[1].lower():
+ pytest.skip("FLASH_ATTN for deepseek not supported on ROCm platform")
+ else:
+ m.setenv("VLLM_ROCM_USE_AITER", "1")
method, model_name, spec_model_name, tp_size = model_setup
max_model_len = 2048
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index 3ba8ab26f5522..48be8c15aba9e 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -484,12 +484,6 @@ def test_encoder_instance_zero_kv_cache(
vision encoder, so they don't need KV cache for text generation.
"""
# Form vllm config
- scheduler_config = SchedulerConfig(
- max_num_seqs=10,
- max_num_batched_tokens=512,
- max_model_len=512,
- disable_hybrid_kv_cache_manager=True,
- )
model_config = ModelConfig(
model="llava-hf/llava-1.5-7b-hf", # Multimodal model
enforce_eager=True,
@@ -497,6 +491,13 @@ def test_encoder_instance_zero_kv_cache(
dtype="float16",
seed=42,
)
+ scheduler_config = SchedulerConfig(
+ max_num_seqs=10,
+ max_num_batched_tokens=512,
+ max_model_len=512,
+ disable_hybrid_kv_cache_manager=True,
+ is_encoder_decoder=model_config.is_encoder_decoder,
+ )
cache_config = CacheConfig(
block_size=16,
gpu_memory_utilization=gpu_memory_utilization,
diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py
index 736ccbefbc4da..ddab006d0d31a 100644
--- a/tests/v1/entrypoints/openai/test_completion.py
+++ b/tests/v1/entrypoints/openai/test_completion.py
@@ -9,7 +9,7 @@ import regex as re
from openai import BadRequestError
from tests.utils import RemoteOpenAIServer
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
# any model with a chat template should work here
MODEL_NAME = "facebook/opt-125m"
diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py
index f35f91bb3adf8..98f1f44923b1c 100644
--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
@@ -92,18 +92,19 @@ def create_vllm_config(
enable_permute_local_kv: bool = False,
) -> VllmConfig:
"""Initialize VllmConfig For Testing."""
- scheduler_config = SchedulerConfig(
- max_num_seqs=max_num_seqs,
- max_num_batched_tokens=max_num_batched_tokens,
- max_model_len=max_model_len,
- enable_chunked_prefill=enable_chunked_prefill,
- )
model_config = ModelConfig(
model=model,
trust_remote_code=True,
dtype="float16",
seed=42,
)
+ scheduler_config = SchedulerConfig(
+ max_num_seqs=max_num_seqs,
+ max_num_batched_tokens=max_num_batched_tokens,
+ max_model_len=max_model_len,
+ enable_chunked_prefill=enable_chunked_prefill,
+ is_encoder_decoder=model_config.is_encoder_decoder,
+ )
# Cache config, optionally force APC
cache_config = CacheConfig(
block_size=block_size,
diff --git a/tests/v1/sample/test_sampling_params_e2e.py b/tests/v1/sample/test_sampling_params_e2e.py
index 1684252174d3d..a75a37befe0e1 100644
--- a/tests/v1/sample/test_sampling_params_e2e.py
+++ b/tests/v1/sample/test_sampling_params_e2e.py
@@ -106,6 +106,25 @@ def test_detokenize_false(llm):
def test_bad_words(llm):
"""Check that we respect bad words."""
+ tokenizer = llm.get_tokenizer()
+
+ def contains_bad_word(text: str, tokens: list[int], bad_word: str) -> bool:
+ """Check if word appears in BOTH text and token sequence."""
+ if bad_word not in text:
+ return False
+
+ for add_prefix_space in [False, True]:
+ prefix = " " if add_prefix_space else ""
+ bad_words_token = tokenizer.encode(
+ prefix + bad_word.lstrip(), add_special_tokens=False
+ )
+ if not bad_words_token:
+ continue
+ for i in range(len(tokens) - len(bad_words_token) + 1):
+ if tokens[i : i + len(bad_words_token)] == bad_words_token:
+ return True
+ return False
+
output = llm.generate(PROMPT, SamplingParams(temperature=0))
split_text = output[0].outputs[0].text.split()
@@ -113,14 +132,16 @@ def test_bad_words(llm):
params = SamplingParams(temperature=0, bad_words=[bad_words_1])
output = llm.generate(PROMPT, params)
new_text = output[0].outputs[0].text
- assert bad_words_1 not in new_text
+ new_tokens = output[0].outputs[0].token_ids
+ assert not contains_bad_word(new_text, new_tokens, bad_words_1)
bad_words_2 = new_text.split()[-1]
params = SamplingParams(temperature=0, bad_words=[bad_words_1, bad_words_2])
output = llm.generate(PROMPT, params)
new_text = output[0].outputs[0].text
- assert bad_words_1 not in new_text
- assert bad_words_2 not in new_text
+ new_tokens = output[0].outputs[0].token_ids
+ assert not contains_bad_word(new_text, new_tokens, bad_words_1)
+ assert not contains_bad_word(new_text, new_tokens, bad_words_2)
def test_logits_processor(llm):
diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py
index 9436ab471c21b..616e57de339e2 100644
--- a/tests/v1/spec_decode/test_eagle.py
+++ b/tests/v1/spec_decode/test_eagle.py
@@ -66,7 +66,10 @@ def _create_proposer(
device_config=DeviceConfig(device=current_platform.device_type),
parallel_config=ParallelConfig(),
load_config=LoadConfig(),
- scheduler_config=SchedulerConfig(),
+ scheduler_config=SchedulerConfig(
+ max_model_len=model_config.max_model_len,
+ is_encoder_decoder=model_config.is_encoder_decoder,
+ ),
)
return EagleProposer(vllm_config=vllm_config, device=current_platform.device_type)
diff --git a/tests/v1/spec_decode/test_mtp.py b/tests/v1/spec_decode/test_mtp.py
index c5c0491abaf7c..3b8813ceb818a 100644
--- a/tests/v1/spec_decode/test_mtp.py
+++ b/tests/v1/spec_decode/test_mtp.py
@@ -51,7 +51,10 @@ def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer:
device_config=DeviceConfig(device=current_platform.device_type),
parallel_config=ParallelConfig(),
load_config=LoadConfig(),
- scheduler_config=SchedulerConfig(),
+ scheduler_config=SchedulerConfig(
+ max_model_len=model_config.max_model_len,
+ is_encoder_decoder=model_config.is_encoder_decoder,
+ ),
)
return EagleProposer(vllm_config=vllm_config, device=current_platform.device_type)
diff --git a/tests/v1/spec_decode/test_ngram.py b/tests/v1/spec_decode/test_ngram.py
index 692c39282c372..6bc412abe8695 100644
--- a/tests/v1/spec_decode/test_ngram.py
+++ b/tests/v1/spec_decode/test_ngram.py
@@ -2,7 +2,11 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import numpy as np
-from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
+from vllm.config import (
+ ModelConfig,
+ SpeculativeConfig,
+ VllmConfig,
+)
from vllm.v1.spec_decode.ngram_proposer import (
NgramProposer,
_find_longest_matched_ngram_and_propose_tokens,
@@ -167,6 +171,34 @@ def test_ngram_proposer():
assert np.array_equal(result[0], np.array([3, 1]))
assert np.array_equal(result[1], np.array([]))
+ # Test non-contiguous indices: requests 0 and 2 need proposals,
+ # request 1 is in prefill
+ proposer = get_ngram_proposer(min_n=2, max_n=2, k=2)
+ max_model_len = 20
+ token_ids_cpu = np.zeros((3, max_model_len), dtype=np.int32)
+ token_ids_cpu[0, :5] = [1, 2, 3, 1, 2]
+ token_ids_cpu[1, :3] = [4, 5, 6]
+ token_ids_cpu[2, :5] = [7, 8, 9, 7, 8]
+ num_tokens_no_spec = np.array([5, 3, 5], dtype=np.int32)
+ sampled_token_ids = [[2], [], [8]] # Empty list for request 1 simulates prefill
+ result = proposer.propose(
+ sampled_token_ids=sampled_token_ids,
+ req_ids=["0", "1", "2"],
+ num_tokens_no_spec=num_tokens_no_spec,
+ token_ids_cpu=token_ids_cpu,
+ spec_decode_unsupported_reqs=(),
+ )
+ assert len(result) == 3
+ assert np.array_equal(result[0], [3, 1])
+ assert len(result[1]) == 0
+ assert np.array_equal(result[2], [9, 7])
+ # Verify internal arrays written to correct indices
+ assert proposer.valid_ngram_num_drafts[0] == 2
+ assert proposer.valid_ngram_num_drafts[1] == 0
+ assert proposer.valid_ngram_num_drafts[2] == 2
+ assert np.array_equal(proposer.valid_ngram_draft[0, :2], [3, 1])
+ assert np.array_equal(proposer.valid_ngram_draft[2, :2], [9, 7])
+
# test if 0 threads available: can happen if TP size > CPU count
ngram_proposer = get_ngram_proposer(min_n=2, max_n=2, k=2)
ngram_proposer.num_numba_thread_available = 0
diff --git a/tests/v1/tpu/test_perf.py b/tests/v1/tpu/test_perf.py
index e230491cddb01..e62b969fe3b95 100644
--- a/tests/v1/tpu/test_perf.py
+++ b/tests/v1/tpu/test_perf.py
@@ -14,7 +14,7 @@ import pytest
from vllm.platforms import current_platform
from vllm.sampling_params import SamplingParams
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
if TYPE_CHECKING:
from tests.conftest import VllmRunner
diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py
index 7b3a07b4e12a5..cfc06666e7984 100644
--- a/tests/v1/tpu/worker/test_tpu_model_runner.py
+++ b/tests/v1/tpu/worker/test_tpu_model_runner.py
@@ -26,16 +26,17 @@ from vllm.v1.worker.tpu_model_runner import (
def get_vllm_config():
- scheduler_config = SchedulerConfig(
- max_num_seqs=10,
- max_num_batched_tokens=512,
- max_model_len=512,
- )
model_config = ModelConfig(
model="facebook/opt-125m",
dtype="bfloat16", # TPUs typically use bfloat16
seed=42,
)
+ scheduler_config = SchedulerConfig(
+ max_num_seqs=10,
+ max_num_batched_tokens=512,
+ max_model_len=512,
+ is_encoder_decoder=model_config.is_encoder_decoder,
+ )
cache_config = CacheConfig(
block_size=16,
gpu_memory_utilization=0.9,
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index 89669ee8b71a0..0439bef1226e3 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -79,16 +79,17 @@ def initialize_kv_cache(runner: GPUModelRunner):
def get_vllm_config():
- scheduler_config = SchedulerConfig(
- max_num_seqs=10,
- max_num_batched_tokens=512,
- max_model_len=512,
- )
model_config = ModelConfig(
model="facebook/opt-125m",
dtype="float16",
seed=42,
)
+ scheduler_config = SchedulerConfig(
+ max_num_seqs=10,
+ max_num_batched_tokens=512,
+ max_model_len=512,
+ is_encoder_decoder=model_config.is_encoder_decoder,
+ )
cache_config = CacheConfig(
block_size=BLOCK_SIZE,
gpu_memory_utilization=0.9,
@@ -784,14 +785,15 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
initialize_model_parallel(tensor_model_parallel_size=1)
torch.set_default_dtype(torch.float16)
+ model_config = ModelConfig(
+ model="ibm-granite/granite-4.0-tiny-preview",
+ dtype="float16",
+ )
scheduler_config = SchedulerConfig(
max_num_seqs=10,
max_num_batched_tokens=512,
max_model_len=512,
- )
- model_config = ModelConfig(
- model="ibm-granite/granite-4.0-tiny-preview",
- dtype="float16",
+ is_encoder_decoder=model_config.is_encoder_decoder,
)
cache_config = CacheConfig(
block_size=BLOCK_SIZE,
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index 519303c0bfa0a..2933f5d01b274 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -47,7 +47,7 @@ from vllm.benchmarks.lib.endpoint_request_func import (
)
from vllm.benchmarks.lib.ready_checker import wait_for_endpoint
from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
from vllm.utils.gc_utils import freeze_gc_heap
from vllm.utils.network_utils import join_host_port
diff --git a/vllm/benchmarks/sweep/param_sweep.py b/vllm/benchmarks/sweep/param_sweep.py
index 986561ed8502a..a438a328880fd 100644
--- a/vllm/benchmarks/sweep/param_sweep.py
+++ b/vllm/benchmarks/sweep/param_sweep.py
@@ -9,8 +9,26 @@ class ParameterSweep(list["ParameterSweepItem"]):
@classmethod
def read_json(cls, filepath: os.PathLike):
with open(filepath, "rb") as f:
- records = json.load(f)
+ data = json.load(f)
+ # Support both list and dict formats
+ if isinstance(data, dict):
+ return cls.read_from_dict(data)
+
+ return cls.from_records(data)
+
+ @classmethod
+ def read_from_dict(cls, data: dict[str, dict[str, object]]):
+ """
+ Read parameter sweep from a dict format where keys are names.
+
+ Example:
+ {
+ "experiment1": {"max_tokens": 100, "temperature": 0.7},
+ "experiment2": {"max_tokens": 200, "temperature": 0.9}
+ }
+ """
+ records = [{"_benchmark_name": name, **params} for name, params in data.items()]
return cls.from_records(records)
@classmethod
@@ -21,6 +39,15 @@ class ParameterSweep(list["ParameterSweepItem"]):
f"but found type: {type(records)}"
)
+ # Validate that all _benchmark_name values are unique if provided
+ names = [r["_benchmark_name"] for r in records if "_benchmark_name" in r]
+ if names and len(names) != len(set(names)):
+ duplicates = [name for name in names if names.count(name) > 1]
+ raise ValueError(
+ f"Duplicate _benchmark_name values found: {set(duplicates)}. "
+ f"All _benchmark_name values must be unique."
+ )
+
return cls(ParameterSweepItem.from_record(record) for record in records)
@@ -38,6 +65,18 @@ class ParameterSweepItem(dict[str, object]):
def __or__(self, other: dict[str, Any]):
return type(self)(super().__or__(other))
+ @property
+ def name(self) -> str:
+ """
+ Get the name for this parameter sweep item.
+
+ Returns the '_benchmark_name' field if present, otherwise returns a text
+ representation of all parameters.
+ """
+ if "_benchmark_name" in self:
+ return self["_benchmark_name"]
+ return self.as_text(sep="-")
+
# In JSON, we prefer "_"
def _iter_param_key_candidates(self, param_key: str):
# Inner config arguments are not converted by the CLI
@@ -63,29 +102,57 @@ class ParameterSweepItem(dict[str, object]):
def has_param(self, param_key: str) -> bool:
return any(k in self for k in self._iter_param_key_candidates(param_key))
+ def _normalize_cmd_kv_pair(self, k: str, v: object) -> list[str]:
+ """
+ Normalize a key-value pair into command-line arguments.
+
+ Returns a list containing either:
+ - A single element for boolean flags (e.g., ['--flag'] or ['--flag=true'])
+ - Two elements for key-value pairs (e.g., ['--key', 'value'])
+ """
+ if isinstance(v, bool):
+ # For nested params (containing "."), use =true/false syntax
+ if "." in k:
+ return [f"{self._normalize_cmd_key(k)}={'true' if v else 'false'}"]
+ else:
+ return [self._normalize_cmd_key(k if v else "no-" + k)]
+ else:
+ return [self._normalize_cmd_key(k), str(v)]
+
def apply_to_cmd(self, cmd: list[str]) -> list[str]:
cmd = list(cmd)
for k, v in self.items():
+            # Skip the '_benchmark_name' field: it is metadata, not a CLI parameter
+ if k == "_benchmark_name":
+ continue
+
+ # Serialize dict values as JSON
+ if isinstance(v, dict):
+ v = json.dumps(v)
+
for k_candidate in self._iter_cmd_key_candidates(k):
try:
k_idx = cmd.index(k_candidate)
- if isinstance(v, bool):
- cmd[k_idx] = self._normalize_cmd_key(k if v else "no-" + k)
+ # Replace existing parameter
+ normalized = self._normalize_cmd_kv_pair(k, v)
+ if len(normalized) == 1:
+ # Boolean flag
+ cmd[k_idx] = normalized[0]
else:
- cmd[k_idx + 1] = str(v)
+ # Key-value pair
+ cmd[k_idx] = normalized[0]
+ cmd[k_idx + 1] = normalized[1]
break
except ValueError:
continue
else:
- if isinstance(v, bool):
- cmd.append(self._normalize_cmd_key(k if v else "no-" + k))
- else:
- cmd.extend([self._normalize_cmd_key(k), str(v)])
+ # Add new parameter
+ cmd.extend(self._normalize_cmd_kv_pair(k, v))
return cmd
def as_text(self, sep: str = ", ") -> str:
- return sep.join(f"{k}={v}" for k, v in self.items())
+ return sep.join(f"{k}={v}" for k, v in self.items() if k != "_benchmark_name")
diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py
index 9947d6170d891..163d517931342 100644
--- a/vllm/benchmarks/sweep/plot.py
+++ b/vllm/benchmarks/sweep/plot.py
@@ -65,6 +65,18 @@ class PlotEqualTo(PlotFilterBase):
return df[df[self.var] == target]
+@dataclass
+class PlotNotEqualTo(PlotFilterBase):
+ @override
+ def apply(self, df: "pd.DataFrame") -> "pd.DataFrame":
+ try:
+ target = float(self.target)
+ except ValueError:
+ target = self.target
+
+ return df[df[self.var] != target]
+
+
@dataclass
class PlotLessThan(PlotFilterBase):
@override
@@ -96,6 +108,7 @@ class PlotGreaterThanOrEqualTo(PlotFilterBase):
# NOTE: The ordering is important! Match longer op_keys first
PLOT_FILTERS: dict[str, type[PlotFilterBase]] = {
"==": PlotEqualTo,
+ "!=": PlotNotEqualTo,
"<=": PlotLessThanOrEqualTo,
">=": PlotGreaterThanOrEqualTo,
"<": PlotLessThan,
@@ -167,6 +180,27 @@ def _json_load_bytes(path: Path) -> list[dict[str, object]]:
return json.load(f)
+def _convert_inf_nan_strings(data: list[dict[str, object]]) -> list[dict[str, object]]:
+ """
+ Convert string values "inf", "-inf", and "nan" to their float equivalents.
+
+ This handles the case where JSON serialization represents inf/nan as strings.
+ """
+ converted_data = []
+ for record in data:
+ converted_record = {}
+ for key, value in record.items():
+ if isinstance(value, str):
+ if value in ["inf", "-inf", "nan"]:
+ converted_record[key] = float(value)
+ else:
+ converted_record[key] = value
+ else:
+ converted_record[key] = value
+ converted_data.append(converted_record)
+ return converted_data
+
+
def _get_metric(run_data: dict[str, object], metric_key: str):
try:
return run_data[metric_key]
@@ -178,12 +212,15 @@ def _get_group(run_data: dict[str, object], group_keys: list[str]):
return tuple((k, str(_get_metric(run_data, k))) for k in group_keys)
-def _get_fig_path(fig_dir: Path, group: tuple[tuple[str, str], ...]):
+def _get_fig_path(fig_dir: Path, group: tuple[tuple[str, str], ...], fig_name: str):
parts = list[str]()
+
+ # Start with figure name (always provided, defaults to "FIGURE")
+ parts.append(fig_name)
+
+    # Append the group's key=value parts when a group is present
if group:
- parts.extend(("FIGURE-", *(f"{k}={v}" for k, v in group)))
- else:
- parts.append("figure")
+ parts.extend(f"{k}={v}" for k, v in group)
return fig_dir / sanitize_filename("-".join(parts) + ".png")
@@ -217,6 +254,10 @@ def _plot_fig(
scale_x: str | None,
scale_y: str | None,
dry_run: bool,
+ fig_name: str,
+ error_bars: bool,
+ fig_height: float,
+ fig_dpi: int,
):
fig_group, fig_data = fig_group_data
@@ -230,7 +271,7 @@ def _plot_fig(
for _, row_data in row_groups
)
- fig_path = _get_fig_path(fig_dir, fig_group)
+ fig_path = _get_fig_path(fig_dir, fig_group, fig_name)
print("[BEGIN FIGURE]")
print(f"Group: {dict(fig_group)}")
@@ -241,6 +282,8 @@ def _plot_fig(
print("[END FIGURE]")
return
+ # Convert string "inf", "-inf", and "nan" to their float equivalents
+ fig_data = _convert_inf_nan_strings(fig_data)
df = pd.DataFrame.from_records(fig_data)
if var_x not in df.columns:
@@ -275,6 +318,10 @@ def _plot_fig(
df = filter_by.apply(df)
df = bin_by.apply(df)
+ # Sort by curve_by columns alphabetically for consistent legend ordering
+ if curve_by:
+ df = df.sort_values(by=curve_by)
+
df["row_group"] = (
pd.concat(
[k + "=" + df[k].astype(str) for k in row_by],
@@ -293,7 +340,7 @@ def _plot_fig(
else "(All)"
)
- g = sns.FacetGrid(df, row="row_group", col="col_group")
+ g = sns.FacetGrid(df, row="row_group", col="col_group", height=fig_height)
if row_by and col_by:
g.set_titles("{row_name}\n{col_name}")
@@ -320,6 +367,7 @@ def _plot_fig(
style=style,
size=size,
markers=True,
+ errorbar="sd" if error_bars else None,
)
g.add_legend(title=hue)
@@ -339,11 +387,12 @@ def _plot_fig(
y=var_y,
hue="curve_group",
markers=True,
+ errorbar="sd" if error_bars else None,
)
g.add_legend()
- g.savefig(fig_path)
+ g.savefig(fig_path, dpi=fig_dpi)
plt.close(g.figure)
print("[END FIGURE]")
@@ -364,6 +413,10 @@ def plot(
scale_x: str | None,
scale_y: str | None,
dry_run: bool,
+ fig_name: str = "FIGURE",
+ error_bars: bool = True,
+ fig_height: float = 6.4,
+ fig_dpi: int = 300,
):
all_data = [
run_data
@@ -398,6 +451,10 @@ def plot(
scale_x=scale_x,
scale_y=scale_y,
dry_run=dry_run,
+ fig_name=fig_name,
+ error_bars=error_bars,
+ fig_height=fig_height,
+ fig_dpi=fig_dpi,
),
fig_groups,
)
@@ -419,6 +476,10 @@ class SweepPlotArgs:
scale_x: str | None
scale_y: str | None
dry_run: bool
+ fig_name: str = "FIGURE"
+ error_bars: bool = True
+ fig_height: float = 6.4
+ fig_dpi: int = 300
parser_name: ClassVar[str] = "plot"
parser_help: ClassVar[str] = "Plot performance curves from parameter sweep results."
@@ -448,6 +509,10 @@ class SweepPlotArgs:
scale_x=args.scale_x,
scale_y=args.scale_y,
dry_run=args.dry_run,
+ fig_name=args.fig_name,
+ error_bars=not args.no_error_bars,
+ fig_height=args.fig_height,
+ fig_dpi=args.fig_dpi,
)
@classmethod
@@ -541,6 +606,32 @@ class SweepPlotArgs:
"Currently only accepts string values such as 'log' and 'sqrt'. "
"See also: https://seaborn.pydata.org/generated/seaborn.objects.Plot.scale.html",
)
+ parser.add_argument(
+ "--fig-name",
+ type=str,
+ default="FIGURE",
+ help="Name prefix for the output figure file. "
+ "Group data is always appended when present. "
+ "Default: 'FIGURE'. Example: --fig-name my_performance_plot",
+ )
+ parser.add_argument(
+ "--no-error-bars",
+ action="store_true",
+ help="If set, disables error bars on the plot. "
+ "By default, error bars are shown.",
+ )
+ parser.add_argument(
+ "--fig-height",
+ type=float,
+ default=6.4,
+ help="Height of each subplot in inches. Default: 6.4",
+ )
+ parser.add_argument(
+ "--fig-dpi",
+ type=int,
+ default=300,
+ help="Resolution of the output figure in dots per inch. Default: 300",
+ )
parser.add_argument(
"--dry-run",
action="store_true",
@@ -566,6 +657,10 @@ def run_main(args: SweepPlotArgs):
scale_x=args.scale_x,
scale_y=args.scale_y,
dry_run=args.dry_run,
+ fig_name=args.fig_name,
+ error_bars=args.error_bars,
+ fig_height=args.fig_height,
+ fig_dpi=args.fig_dpi,
)
diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py
index 1298e4acbd87d..6626707cf2a52 100644
--- a/vllm/benchmarks/sweep/serve.py
+++ b/vllm/benchmarks/sweep/serve.py
@@ -138,9 +138,9 @@ def _get_comb_base_path(
):
parts = list[str]()
if serve_comb:
- parts.extend(("SERVE-", serve_comb.as_text(sep="-")))
+ parts.extend(("SERVE-", serve_comb.name))
if bench_comb:
- parts.extend(("BENCH-", bench_comb.as_text(sep="-")))
+ parts.extend(("BENCH-", bench_comb.name))
return output_dir / sanitize_filename("-".join(parts))
@@ -345,8 +345,9 @@ class SweepServeArgs:
"--serve-params",
type=str,
default=None,
- help="Path to JSON file containing a list of parameter combinations "
- "for the `vllm serve` command. "
+ help="Path to JSON file containing parameter combinations "
+ "for the `vllm serve` command. Can be either a list of dicts or a dict "
+ "where keys are benchmark names. "
"If both `serve_params` and `bench_params` are given, "
"this script will iterate over their Cartesian product.",
)
@@ -354,8 +355,9 @@ class SweepServeArgs:
"--bench-params",
type=str,
default=None,
- help="Path to JSON file containing a list of parameter combinations "
- "for the `vllm bench serve` command. "
+ help="Path to JSON file containing parameter combinations "
+ "for the `vllm bench serve` command. Can be either a list of dicts or "
+ "a dict where keys are benchmark names. "
"If both `serve_params` and `bench_params` are given, "
"this script will iterate over their Cartesian product.",
)
diff --git a/vllm/config/model.py b/vllm/config/model.py
index ef592ac001535..5de97697698a1 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -37,15 +37,13 @@ from vllm.transformers_utils.config import (
uses_xdrope_dim,
)
from vllm.transformers_utils.gguf_utils import (
- maybe_patch_hf_config_from_gguf,
-)
-from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri
-from vllm.transformers_utils.utils import (
is_gguf,
is_remote_gguf,
- maybe_model_redirect,
+ maybe_patch_hf_config_from_gguf,
split_remote_gguf,
)
+from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri
+from vllm.transformers_utils.utils import maybe_model_redirect
from vllm.utils.import_utils import LazyLoader
from vllm.utils.torch_utils import common_broadcastable_dtype
diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py
index ff1ac0e18f324..8da3ae538d671 100644
--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -28,6 +28,19 @@ SchedulerPolicy = Literal["fcfs", "priority"]
class SchedulerConfig:
"""Scheduler configuration."""
+ max_model_len: InitVar[int]
+ """Maximum length of a sequence (including prompt and generated text).
+
+ Note: This is stored in the ModelConfig, and is used only here to
+ provide fallbacks and validate other attributes."""
+
+ is_encoder_decoder: InitVar[bool]
+ """True if the model is an encoder-decoder model.
+
+ Note: This is stored in the ModelConfig, and is used only here to
+ disable chunked prefill and prefix caching for encoder-decoder models.
+ """
+
DEFAULT_MAX_NUM_BATCHED_TOKENS: ClassVar[int] = 2048
DEFAULT_MAX_NUM_SEQS: ClassVar[int] = 128
@@ -73,19 +86,6 @@ class SchedulerConfig:
is_multimodal_model: bool = False
"""True if the model is multimodal."""
- max_model_len: InitVar[int] = 8192
- """Maximum length of a sequence (including prompt and generated text).
-
- Note: This is stored in the ModelConfig, and is used only here to
- provide fallbacks and validate other attributes."""
-
- is_encoder_decoder: InitVar[bool] = False
- """True if the model is an encoder-decoder model.
-
- Note: This is stored in the ModelConfig, and is used only here to
- disable chunked prefill and prefix caching for encoder-decoder models.
- """
-
# TODO (ywang96): Make this configurable.
max_num_encoder_input_tokens: int = Field(init=False)
"""Multimodal encoder compute budget, only used in V1.
@@ -141,6 +141,17 @@ class SchedulerConfig:
while a larger value (e.g., 10) reduces host overhead and may increase throughput
by batching multiple tokens before sending."""
+ @staticmethod
+ def default_factory(**kwargs):
+ """
+ Factory method to create `SchedulerConfig` with default values for `InitVar`s.
+ """
+ if "max_model_len" not in kwargs:
+ kwargs["max_model_len"] = 8192
+ if "is_encoder_decoder" not in kwargs:
+ kwargs["is_encoder_decoder"] = False
+ return SchedulerConfig(**kwargs)
+
def get_scheduler_cls(self) -> type["SchedulerInterface"]:
if self.scheduler_cls is None:
if self.async_scheduling:
@@ -175,9 +186,19 @@ class SchedulerConfig:
excluding anything before input ids/embeddings and after
the final hidden states.
"""
- # no factors to consider.
- # this config will not affect the computation graph.
factors: list[Any] = []
+
+        # max_num_batched_tokens needs to be included in the hash
+        # for two reasons:
+ # 1. LoRA creates static buffers based on max_num_batched_tokens.
+ # The tensor sizes and strides get captured in the torch.compile
+ # graph explicitly.
+ # 2. Inductor decides whether using 32-bit or 64-bit indexing integer
+ # based on the data sizes. `max_num_batched_tokens` has an
+ # impact on that. For more details, please check
+ # https://github.com/vllm-project/vllm/issues/29585
+ factors.append(self.max_num_batched_tokens)
+
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py
index 80d53a543f149..c6d6f705f535c 100644
--- a/vllm/config/speculative.py
+++ b/vllm/config/speculative.py
@@ -167,6 +167,7 @@ class SpeculativeConfig:
@staticmethod
def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
+ initial_architecture = hf_config.architectures[0]
if hf_config.model_type in ("deepseek_v3", "deepseek_v32"):
hf_config.model_type = "deepseek_mtp"
if hf_config.model_type == "deepseek_mtp":
@@ -226,6 +227,9 @@ class SpeculativeConfig:
{"n_predict": n_predict, "architectures": ["LongCatFlashMTPModel"]}
)
+ if initial_architecture == "MistralLarge3ForCausalLM":
+ hf_config.update({"architectures": ["EagleMistralLarge3ForCausalLM"]})
+
return hf_config
def __post_init__(self):
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 4542866aa166c..5b3a9c437662b 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -170,7 +170,9 @@ class VllmConfig:
"""Cache configuration."""
parallel_config: ParallelConfig = Field(default_factory=ParallelConfig)
"""Parallel configuration."""
- scheduler_config: SchedulerConfig = Field(default_factory=SchedulerConfig)
+ scheduler_config: SchedulerConfig = Field(
+ default_factory=SchedulerConfig.default_factory,
+ )
"""Scheduler configuration."""
device_config: DeviceConfig = Field(default_factory=DeviceConfig)
"""Device configuration."""
@@ -265,10 +267,6 @@ class VllmConfig:
vllm_factors.append("None")
if self.lora_config:
vllm_factors.append(self.lora_config.compute_hash())
- # LoRA creates static buffers based on max_num_batched_tokens.
- # The tensor sizes and strides get captured in the torch.compile
- # graph explicitly.
- vllm_factors.append(str(self.scheduler_config.max_num_batched_tokens))
else:
vllm_factors.append("None")
if self.speculative_config:
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 5a2836668174f..096217da4fe44 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -86,8 +86,9 @@ from vllm.transformers_utils.config import (
is_interleaved,
maybe_override_with_speculators,
)
+from vllm.transformers_utils.gguf_utils import is_gguf
from vllm.transformers_utils.repo_utils import get_model_path
-from vllm.transformers_utils.utils import is_cloud_storage, is_gguf
+from vllm.transformers_utils.utils import is_cloud_storage
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.network_utils import get_ip
@@ -420,10 +421,6 @@ class EngineArgs:
)
_api_process_count: int = ParallelConfig._api_process_count
_api_process_rank: int = ParallelConfig._api_process_rank
- num_redundant_experts: int = EPLBConfig.num_redundant_experts
- eplb_window_size: int = EPLBConfig.window_size
- eplb_step_interval: int = EPLBConfig.step_interval
- eplb_log_balancedness: bool = EPLBConfig.log_balancedness
max_parallel_loading_workers: int | None = (
ParallelConfig.max_parallel_loading_workers
)
@@ -1581,16 +1578,6 @@ class EngineArgs:
)
self.disable_nccl_for_dp_synchronization = True
- # Forward the deprecated CLI args to the EPLB config.
- if self.num_redundant_experts is not None:
- self.eplb_config.num_redundant_experts = self.num_redundant_experts
- if self.eplb_window_size is not None:
- self.eplb_config.window_size = self.eplb_window_size
- if self.eplb_step_interval is not None:
- self.eplb_config.step_interval = self.eplb_step_interval
- if self.eplb_log_balancedness is not None:
- self.eplb_config.log_balancedness = self.eplb_log_balancedness
-
parallel_config = ParallelConfig(
pipeline_parallel_size=self.pipeline_parallel_size,
tensor_parallel_size=self.tensor_parallel_size,
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index f2b19c845018c..1b6330c9f9b65 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -116,7 +116,7 @@ class EngineClient(ABC):
...
@abstractmethod
- async def reset_prefix_cache(self) -> None:
+ async def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool:
"""Reset the prefix cache"""
...
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 1643906894c66..2dd5b9c8f8aa0 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -1530,6 +1530,7 @@ def _parse_chat_message_content(
role = message["role"]
content = message.get("content")
reasoning = message.get("reasoning") or message.get("reasoning_content")
+
if content is None:
content = []
elif isinstance(content, str):
diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index 7a41c668d7645..1260f65dba59a 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -5,6 +5,7 @@ import contextlib
import json
import logging
from abc import ABC, abstractmethod
+from collections.abc import Callable
from contextlib import AsyncExitStack
from typing import TYPE_CHECKING, Union
@@ -17,9 +18,19 @@ from vllm.entrypoints.harmony_utils import (
get_streamable_parser_for_assistant,
render_for_completion,
)
+from vllm.entrypoints.openai.parser.responses_parser import (
+ get_responses_parser_for_simple_context,
+)
+from vllm.entrypoints.openai.protocol import (
+ ResponseInputOutputItem,
+ ResponsesRequest,
+)
+from vllm.entrypoints.responses_utils import construct_tool_dicts
from vllm.entrypoints.tool import Tool
from vllm.entrypoints.tool_server import ToolServer
from vllm.outputs import RequestOutput
+from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
+from vllm.transformers_utils.tokenizer import AnyTokenizer
if TYPE_CHECKING:
from mcp.client import ClientSession
@@ -180,6 +191,71 @@ class SimpleContext(ConversationContext):
raise NotImplementedError("Should not be called.")
+class ParsableContext(ConversationContext):
+ def __init__(
+ self,
+ *,
+ response_messages: list[ResponseInputOutputItem],
+ tokenizer: AnyTokenizer,
+ reasoning_parser_cls: Callable[[AnyTokenizer], ReasoningParser] | None,
+ request: ResponsesRequest,
+ ):
+ self.num_prompt_tokens = 0
+ self.num_output_tokens = 0
+ self.num_cached_tokens = 0
+ # TODO: num_reasoning_tokens is not implemented yet.
+ self.num_reasoning_tokens = 0
+ # not implemented yet for ParsableContext
+ self.all_turn_metrics: list[TurnMetrics] = []
+
+ if reasoning_parser_cls is None:
+ raise ValueError("reasoning_parser_cls must be provided.")
+
+ self.parser = get_responses_parser_for_simple_context(
+ tokenizer=tokenizer,
+ reasoning_parser_cls=reasoning_parser_cls,
+ response_messages=response_messages,
+ request=request,
+ )
+
+ self._tool_sessions: dict[str, ClientSession | Tool] = {}
+ self.called_tools: set[str] = set()
+
+ self.tool_dicts = construct_tool_dicts(request.tools, request.tool_choice)
+
+ def append_output(self, output: RequestOutput) -> None:
+ self.num_prompt_tokens = len(output.prompt_token_ids or [])
+ self.num_cached_tokens = output.num_cached_tokens or 0
+ self.num_output_tokens += len(output.outputs[0].token_ids or [])
+ self.parser.process(output.outputs[0])
+
+ def append_tool_output(self, output: list[ResponseInputOutputItem]) -> None:
+ raise NotImplementedError("Should not be called.")
+
+ def need_builtin_tool_call(self) -> bool:
+ """Return true if the last message is a MCP tool call"""
+ return False
+
+ async def call_tool(self) -> list[ResponseInputOutputItem]:
+ raise NotImplementedError("Should not be called.")
+
+ def render_for_completion(self):
+ raise NotImplementedError("Should not be called.")
+
+ async def init_tool_sessions(
+ self,
+ tool_server: ToolServer | None,
+ exit_stack: AsyncExitStack,
+ request_id: str,
+ mcp_tools: dict[str, Mcp],
+ ):
+ pass
+
+ async def cleanup_session(self, *args, **kwargs) -> None:
+ """Can be used as coro to used in __aexit__"""
+ raise NotImplementedError("Should not be called.")
+
+
class HarmonyContext(ConversationContext):
def __init__(
self,
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index f005605c08d7e..c121fa71f0196 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -1492,8 +1492,8 @@ class LLM:
def stop_profile(self) -> None:
self.llm_engine.stop_profile()
- def reset_prefix_cache(self) -> None:
- self.llm_engine.reset_prefix_cache()
+ def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool:
+ return self.llm_engine.reset_prefix_cache(reset_running_requests)
def sleep(self, level: int = 1):
"""
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 92161f67f1cf0..cdc316b65ba78 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -877,13 +877,15 @@ if envs.VLLM_SERVER_DEV_MODE:
return JSONResponse(content=server_info)
@router.post("/reset_prefix_cache")
- async def reset_prefix_cache(raw_request: Request):
+ async def reset_prefix_cache(
+ raw_request: Request, reset_running_requests: bool = Query(default=False)
+ ):
"""
Reset the prefix cache. Note that we currently do not check if the
prefix cache is successfully reset in the API server.
"""
logger.info("Resetting prefix cache...")
- await engine_client(raw_request).reset_prefix_cache()
+ await engine_client(raw_request).reset_prefix_cache(reset_running_requests)
return Response(status_code=200)
@router.post("/reset_mm_cache")
diff --git a/vllm/entrypoints/openai/parser/__init__.py b/vllm/entrypoints/openai/parser/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/vllm/entrypoints/openai/parser/responses_parser.py b/vllm/entrypoints/openai/parser/responses_parser.py
new file mode 100644
index 0000000000000..1bc8e81bd9dfc
--- /dev/null
+++ b/vllm/entrypoints/openai/parser/responses_parser.py
@@ -0,0 +1,101 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import logging
+from collections.abc import Callable
+
+from openai.types.responses.response_output_message import ResponseOutputMessage
+from openai.types.responses.response_output_text import ResponseOutputText
+from openai.types.responses.response_reasoning_item import (
+ Content,
+ ResponseReasoningItem,
+)
+
+from vllm.entrypoints.openai.protocol import ResponseInputOutputItem, ResponsesRequest
+from vllm.outputs import CompletionOutput
+from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.utils import random_uuid
+
+logger = logging.getLogger(__name__)
+
+
+class ResponsesParser:
+ """Incremental parser over completion tokens with reasoning support."""
+
+ def __init__(
+ self,
+ *,
+ tokenizer: AnyTokenizer,
+ reasoning_parser_cls: Callable[[AnyTokenizer], ReasoningParser],
+ response_messages: list[ResponseInputOutputItem],
+ request: ResponsesRequest,
+ ):
+ self.response_messages: list[ResponseInputOutputItem] = (
+ # TODO: initial messages may not be properly typed
+ response_messages
+ )
+ self.num_init_messages = len(response_messages)
+ self.tokenizer = tokenizer
+ self.request = request
+
+ self.reasoning_parser_instance = reasoning_parser_cls(tokenizer)
+
+ def process(self, output: CompletionOutput) -> "ResponsesParser":
+ reasoning_content, content = self.reasoning_parser_instance.extract_reasoning(
+ output.text, request=self.request
+ )
+ if reasoning_content:
+ self.response_messages.append(
+ ResponseReasoningItem(
+ type="reasoning",
+ id=f"rs_{random_uuid()}",
+ summary=[],
+ content=[
+ Content(
+ type="reasoning_text",
+ text=reasoning_content,
+ )
+ ],
+ )
+ )
+
+ if content:
+ self.response_messages.append(
+ ResponseOutputMessage(
+ type="message",
+ id=f"msg_{random_uuid()}",
+ status="completed",
+ role="assistant",
+ content=[
+ ResponseOutputText(
+ annotations=[], # TODO
+ type="output_text",
+ text=content,
+ logprobs=None, # TODO
+ )
+ ],
+ )
+ )
+
+ return self
+
+
+def get_responses_parser_for_simple_context(
+ *,
+ tokenizer: AnyTokenizer,
+ reasoning_parser_cls: Callable[[AnyTokenizer], ReasoningParser],
+ response_messages: list[ResponseInputOutputItem],
+ request: ResponsesRequest,
+) -> ResponsesParser:
+ """Factory function to create a ResponsesParser with
+ optional reasoning parser.
+
+ Returns:
+ ResponsesParser instance configured with the provided parser
+ """
+ return ResponsesParser(
+ tokenizer=tokenizer,
+ reasoning_parser_cls=reasoning_parser_cls,
+ response_messages=response_messages,
+ request=request,
+ )
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index 81495a0777546..5ad86194ce1b2 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -60,6 +60,7 @@ from vllm.entrypoints.chat_utils import (
from vllm.entrypoints.context import (
ConversationContext,
HarmonyContext,
+ ParsableContext,
SimpleContext,
StreamingHarmonyContext,
)
@@ -96,8 +97,9 @@ from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.responses_utils import (
construct_input_messages,
- convert_tool_responses_to_completions_format,
+ construct_tool_dicts,
extract_tool_types,
+ make_response_output_items_from_parsable_context,
)
from vllm.entrypoints.tool_server import ToolServer
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
@@ -228,7 +230,6 @@ class OpenAIServingResponses(OpenAIServing):
self.tool_parser = self._get_tool_parser(
tool_parser_name=tool_parser, enable_auto_tools=enable_auto_tools
)
- self.exclude_tools_when_tool_choice_none = False
# HACK(woosuk): This is a hack. We should use a better store.
# FIXME: If enable_store=True, this may cause a memory leak since we
# never remove responses from the store.
@@ -413,7 +414,17 @@ class OpenAIServingResponses(OpenAIServing):
else:
context = HarmonyContext(messages, available_tools)
else:
- context = SimpleContext()
+ if envs.VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT:
+            # This is a feature in development for parsing
+ # tokens during generation instead of at the end
+ context = ParsableContext(
+ response_messages=messages,
+ tokenizer=tokenizer,
+ reasoning_parser_cls=self.reasoning_parser,
+ request=request,
+ )
+ else:
+ context = SimpleContext()
if self.reasoning_parser is not None:
reasoning_parser = self.reasoning_parser(tokenizer)
@@ -534,15 +545,7 @@ class OpenAIServingResponses(OpenAIServing):
prev_response: ResponsesResponse | None,
tokenizer: TokenizerLike,
):
- if request.tools is None or (
- request.tool_choice == "none" and self.exclude_tools_when_tool_choice_none
- ):
- tool_dicts = None
- else:
- tool_dicts = [
- convert_tool_responses_to_completions_format(tool.model_dump())
- for tool in request.tools
- ]
+ tool_dicts = construct_tool_dicts(request.tools, request.tool_choice)
# Construct the input messages.
messages = construct_input_messages(
request_instructions=request.instructions,
@@ -642,6 +645,22 @@ class OpenAIServingResponses(OpenAIServing):
status = "cancelled"
else:
status = "incomplete"
+ elif isinstance(context, ParsableContext):
+ response_messages = context.parser.response_messages[
+ context.parser.num_init_messages :
+ ]
+ output = make_response_output_items_from_parsable_context(response_messages)
+
+ # TODO: context for non-gptoss models doesn't use messages
+ # so we can't get them out yet
+ if request.enable_response_messages:
+ raise NotImplementedError(
+ "enable_response_messages is currently only supported for gpt-oss"
+ )
+
+ # TODO: Calculate usage.
+ # assert final_res.prompt_token_ids is not None
+ num_tool_output_tokens = 0
else:
assert isinstance(context, SimpleContext)
final_res = context.last_output
@@ -661,7 +680,7 @@ class OpenAIServingResponses(OpenAIServing):
assert final_res.prompt_token_ids is not None
num_tool_output_tokens = 0
- assert isinstance(context, (SimpleContext, HarmonyContext))
+ assert isinstance(context, (SimpleContext, HarmonyContext, ParsableContext))
num_prompt_tokens = context.num_prompt_tokens
num_generated_tokens = context.num_output_tokens
num_cached_tokens = context.num_cached_tokens
diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py
index b34446d3230b1..cea9924ebbaca 100644
--- a/vllm/entrypoints/openai/speech_to_text.py
+++ b/vllm/entrypoints/openai/speech_to_text.py
@@ -37,7 +37,7 @@ from vllm.inputs.data import PromptType
from vllm.logger import init_logger
from vllm.model_executor.models import SupportsTranscription
from vllm.outputs import RequestOutput
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
from vllm.utils.import_utils import PlaceholderModule
try:
diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
index 7e2d67a1fb659..b89db60545abd 100644
--- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
@@ -80,7 +80,7 @@ class MistralToolParser(ToolParser):
self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)
if _is_fn_name_regex_support(self.model_tokenizer):
self.fn_name_regex = re.compile(
- r"([a-zA-Z0-9_-]+)(\{[\s\S]*?\})(?=\s*$|,|\s)", re.DOTALL
+ r"([a-zA-Z0-9_-]+)(\{[\s\S]*?\}+)", re.DOTALL
)
else:
self.fn_name_regex = None
diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py
index 10b90bbbb0f32..f31b309b8ca48 100644
--- a/vllm/entrypoints/renderer.py
+++ b/vllm/entrypoints/renderer.py
@@ -33,7 +33,7 @@ class RenderConfig:
`0` yields an empty list (and skips embeds).
`-1` maps to `model_config.max_model_len`."""
- add_special_tokens: bool | None = True
+ add_special_tokens: bool = True
"""Whether to add model-specific special tokens during tokenization."""
cache_salt: str | None = None
@@ -315,7 +315,7 @@ class CompletionRenderer(BaseRenderer):
text: str,
max_length: int | None,
truncate_prompt_tokens: int | None,
- add_special_tokens: bool | None,
+ add_special_tokens: bool,
cache_salt: str | None,
) -> EngineTokensPrompt:
"""Tokenize text input asynchronously."""
diff --git a/vllm/entrypoints/responses_utils.py b/vllm/entrypoints/responses_utils.py
index 07abb80ebc9e3..5f21e2c44450c 100644
--- a/vllm/entrypoints/responses_utils.py
+++ b/vllm/entrypoints/responses_utils.py
@@ -1,6 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any
+
from openai.types.chat import (
ChatCompletionAssistantMessageParam,
ChatCompletionMessageToolCallParam,
@@ -10,6 +12,7 @@ from openai.types.chat.chat_completion_message_tool_call_param import (
Function as FunctionCallTool,
)
from openai.types.responses import ResponseFunctionToolCall, ResponseOutputItem
+from openai.types.responses.response import ToolChoice
from openai.types.responses.response_function_tool_call_output_item import (
ResponseFunctionToolCallOutputItem,
)
@@ -24,6 +27,20 @@ from vllm.entrypoints.openai.protocol import (
)
+def make_response_output_items_from_parsable_context(
+ response_messages: list[ResponseInputOutputItem],
+) -> list[ResponseOutputItem]:
+ """Given a list of sentences, construct ResponseOutput Items."""
+ output_messages: list[ResponseOutputItem] = []
+ for message in response_messages:
+ if not isinstance(message, ResponseFunctionToolCallOutputItem):
+ output_messages.append(message)
+ else:
+ raise NotImplementedError("tool calls not supported for response context")
+
+ return output_messages
+
+
def construct_input_messages(
*,
request_instructions: str | None = None,
@@ -97,13 +114,18 @@ def construct_chat_message_with_tool_call(
"role": "assistant",
"reasoning": reasoning_content,
}
+ elif isinstance(item, ResponseOutputMessage):
+ return {
+ "role": "assistant",
+ "content": item.content[0].text,
+ }
elif isinstance(item, ResponseFunctionToolCallOutputItem):
return ChatCompletionToolMessageParam(
role="tool",
content=item.output,
tool_call_id=item.call_id,
)
- elif item.get("type") == "function_call_output":
+ elif isinstance(item, dict) and item.get("type") == "function_call_output":
# Append the function call output as a tool message.
return ChatCompletionToolMessageParam(
role="tool",
@@ -141,3 +163,16 @@ def convert_tool_responses_to_completions_format(tool: dict) -> dict:
"type": "function",
"function": tool,
}
+
+
+def construct_tool_dicts(
+ tools: list[Tool], tool_choice: ToolChoice
+) -> list[dict[str, Any]] | None:
+ if tools is None or (tool_choice == "none"):
+ tool_dicts = None
+ else:
+ tool_dicts = [
+ convert_tool_responses_to_completions_format(tool.model_dump())
+ for tool in tools
+ ]
+ return tool_dicts
diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py
index 602f59ac09f55..8819c85af9a26 100644
--- a/vllm/entrypoints/score_utils.py
+++ b/vllm/entrypoints/score_utils.py
@@ -19,7 +19,7 @@ from vllm.inputs import TokensPrompt
from vllm.model_executor.models.interfaces import supports_score_template
from vllm.multimodal.inputs import MultiModalDataDict
from vllm.outputs import PoolingRequestOutput
-from vllm.transformers_utils.tokenizer import TokenizerLike
+from vllm.tokenizers import TokenizerLike
ScoreContentPartParam: TypeAlias = (
ChatCompletionContentPartImageParam | ChatCompletionContentPartImageEmbedsParam
diff --git a/vllm/envs.py b/vllm/envs.py
index c22f30ce8956e..4796560d2ca70 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -75,7 +75,7 @@ if TYPE_CHECKING:
VLLM_MEDIA_CONNECTOR: str = "http"
VLLM_MM_INPUT_CACHE_GIB: int = 4
VLLM_TARGET_DEVICE: str = "cuda"
- VLLM_MAIN_CUDA_VERSION: str = "12.8"
+ VLLM_MAIN_CUDA_VERSION: str = "12.9"
MAX_JOBS: str | None = None
NVCC_THREADS: str | None = None
VLLM_USE_PRECOMPILED: bool = False
@@ -215,6 +215,7 @@ if TYPE_CHECKING:
VLLM_ALLREDUCE_USE_SYMM_MEM: bool = True
VLLM_TUNED_CONFIG_FOLDER: str | None = None
VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS: set[str] = set()
+ VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT: bool = False
VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: bool = False
VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY: bool = False
VLLM_CUSTOM_SCOPES_FOR_PROFILING: bool = False
@@ -446,10 +447,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
# Target device of vLLM, supporting [cuda (by default),
# rocm, cpu]
"VLLM_TARGET_DEVICE": lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(),
- # Main CUDA version of vLLM, supporting [12.6, 12.8, 12.9],
- # 12.8 is the default. This follows PyTorch but can be overridden.
+ # Main CUDA version of vLLM. This follows PyTorch but can be overridden.
"VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower()
- or "12.8",
+ or "12.9",
# Maximum number of compilation jobs to run in parallel.
# By default this is the number of CPUs
"MAX_JOBS": lambda: os.getenv("MAX_JOBS", None),
@@ -1452,6 +1452,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_ALLREDUCE_USE_SYMM_MEM": lambda: bool(
int(os.getenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "1"))
),
+    # Experimental: use this to enable MCP tool calling for non-harmony models
+ "VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT": lambda: bool(
+ int(os.getenv("VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT", "0"))
+ ),
# Allows vllm to find tuned config under customized folder
"VLLM_TUNED_CONFIG_FOLDER": lambda: os.getenv("VLLM_TUNED_CONFIG_FOLDER", None),
# Valid values are container,code_interpreter,web_search_preview
diff --git a/vllm/logger.py b/vllm/logger.py
index ad3123c0f0149..3b7bb1f22ec96 100644
--- a/vllm/logger.py
+++ b/vllm/logger.py
@@ -62,7 +62,7 @@ DEFAULT_LOGGING_CONFIG = {
"loggers": {
"vllm": {
"handlers": ["vllm"],
- "level": "DEBUG",
+ "level": envs.VLLM_LOGGING_LEVEL,
"propagate": False,
},
},
@@ -175,6 +175,9 @@ def _configure_vllm_root_logger() -> None:
vllm_handler["stream"] = envs.VLLM_LOGGING_STREAM
vllm_handler["formatter"] = "vllm_color" if _use_color() else "vllm"
+ vllm_loggers = logging_config["loggers"]["vllm"]
+ vllm_loggers["level"] = envs.VLLM_LOGGING_LEVEL
+
if envs.VLLM_LOGGING_CONFIG_PATH:
if not path.exists(envs.VLLM_LOGGING_CONFIG_PATH):
raise RuntimeError(
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index 3471ee327cf8c..7038d0868c7eb 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -159,6 +159,13 @@ class GeluAndMulSparse(CustomOp):
self.approximate = approximate
if approximate not in ("none", "tanh"):
raise ValueError(f"Unknown approximate mode: {approximate}")
+ if current_platform.is_rocm() and approximate == "tanh":
+ # TODO:[ROCm] PyTorch native GELU with tanh is unstable with torch.compile
+ logger.warning_once(
+ "[ROCm] Pytorch's native GELU with tanh approximation is currently "
+ "unstable and produces garbage. Fallback to 'none' approximation."
+ )
+ self.approximate = "none"
# Sparsity.
if activation_sparsity == 0.0:
@@ -209,6 +216,12 @@ class GeluAndMul(CustomOp):
self.op = torch.ops._C.gelu_and_mul
elif approximate == "tanh":
self.op = torch.ops._C.gelu_tanh_and_mul
+ if current_platform.is_rocm() and approximate == "tanh":
+ logger.warning_once(
+ "[ROCm] PyTorch's native GELU with tanh approximation is unstable "
+ "with torch.compile. For native implementation, fallback to 'none' "
+ "approximation. The custom kernel implementation is unaffected."
+ )
elif current_platform.is_xpu():
from vllm._ipex_ops import ipex_ops
@@ -219,8 +232,12 @@ class GeluAndMul(CustomOp):
def forward_native(self, x: torch.Tensor) -> torch.Tensor:
"""PyTorch-native implementation equivalent to forward()."""
+ # TODO: [ROCm] PyTorch's native GELU with tanh is unstable with torch.compile
+ approximate = self.approximate
+ if current_platform.is_rocm() and approximate == "tanh":
+ approximate = "none"
d = x.shape[-1] // 2
- return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:]
+ return F.gelu(x[..., :d], approximate=approximate) * x[..., d:]
def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
d = x.shape[-1] // 2
@@ -522,7 +539,16 @@ _ACTIVATION_REGISTRY = LazyDict(
"gelu": lambda: nn.GELU(),
"gelu_fast": lambda: FastGELU(),
"gelu_new": lambda: NewGELU(),
- "gelu_pytorch_tanh": lambda: nn.GELU(approximate="tanh"),
+ "gelu_pytorch_tanh": lambda: (
+ # TODO:[ROCm] PyTorch native GELU with tanh is unstable with torch.compile
+ logger.warning_once(
+ "[ROCm] PyTorch's native GELU with tanh approximation is unstable. "
+ "Falling back to GELU(approximate='none')."
+ ),
+ nn.GELU(approximate="none"),
+ )[1]
+ if current_platform.is_rocm()
+ else nn.GELU(approximate="tanh"),
"relu": lambda: nn.ReLU(),
"relu2": lambda: ReLUSquaredActivation(),
"silu": lambda: nn.SiLU(),
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000..a9f24c20a25a2
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py
index 90e520e244416..0b63acf2dc5a5 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer.py
@@ -252,7 +252,6 @@ class MambaMixer(MambaBase, CustomOp):
conv_state = self_kv_cache[0].transpose(-1, -2)
ssm_state = self_kv_cache[1]
has_initial_states_p = attn_metadata.has_initial_states_p
- num_padded_decodes = attn_metadata.num_padded_decodes
# 1. Gated MLP's linear projection
projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1)
@@ -281,7 +280,7 @@ class MambaMixer(MambaBase, CustomOp):
state_indices_tensor,
num_prefill_tokens,
num_prefills,
- num_padded_decodes,
+ num_decode_tokens,
)
hidden_states_BC_p = prefill_decode_split.hidden_states_BC_p
hidden_states_BC_d = prefill_decode_split.hidden_states_BC_d
@@ -470,24 +469,24 @@ def split_batch_to_prefill_and_decode(
state_indices_tensor: torch.Tensor,
num_prefill_tokens: int,
num_prefills: int,
- num_padded_decodes: int,
+ num_decode_tokens: int,
) -> PrefillDecodeSplit:
- num_actual_tokens = num_prefill_tokens + num_padded_decodes
+ num_actual_tokens = num_prefill_tokens + num_decode_tokens
# In v1, decode tokens come first, then prefill tokens.
hidden_states_BC_d, hidden_states_BC_p = torch.split(
hidden_states_BC[..., :num_actual_tokens],
- [num_padded_decodes, num_prefill_tokens],
+ [num_decode_tokens, num_prefill_tokens],
dim=-1,
)
gate_d, gate_p = torch.split(
- gate[..., :num_actual_tokens], [num_padded_decodes, num_prefill_tokens], dim=-1
+ gate[..., :num_actual_tokens], [num_decode_tokens, num_prefill_tokens], dim=-1
)
- # num_padded_decodes accounts for CUDA graph padding when applicable
+ # num_decode_tokens accounts for CUDA graph padding when applicable
state_indices_tensor_d, state_indices_tensor_p = torch.split(
- state_indices_tensor[: num_padded_decodes + num_prefills],
- [num_padded_decodes, num_prefills],
+ state_indices_tensor[: num_decode_tokens + num_prefills],
+ [num_decode_tokens, num_prefills],
dim=0,
)
diff --git a/vllm/model_executor/layers/mla.py b/vllm/model_executor/layers/mla.py
index 6ebfa47a9dc3f..dad960160f2ad 100644
--- a/vllm/model_executor/layers/mla.py
+++ b/vllm/model_executor/layers/mla.py
@@ -111,6 +111,7 @@ class MultiHeadLatentAttentionWrapper(CustomOp):
self,
positions: torch.Tensor,
hidden_states: torch.Tensor,
+ llama_4_scaling: torch.Tensor | None = None,
) -> torch.Tensor:
q_c = None
kv_lora = None
@@ -159,6 +160,9 @@ class MultiHeadLatentAttentionWrapper(CustomOp):
hidden_states, q_c, positions, self.indexer_rope_emb
)
+ if llama_4_scaling is not None:
+ q *= llama_4_scaling
+
attn_out = self.mla_attn(
q,
kv_c_normed,
diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py
index 0f10bff6ac4f5..aa6ece30026d3 100644
--- a/vllm/model_executor/layers/rotary_embedding/__init__.py
+++ b/vllm/model_executor/layers/rotary_embedding/__init__.py
@@ -238,7 +238,7 @@ def get_rope(
dtype,
**extra_kwargs,
)
- elif scaling_type == "deepseek_yarn":
+ elif scaling_type in ["deepseek_yarn", "deepseek_llama_scaling"]:
scaling_factor = rope_parameters["factor"]
original_max_position = rope_parameters["original_max_position_embeddings"]
# assert max_position == original_max_position * scaling_factor
diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py
index 05f257feea3ee..007d847ac3b7b 100644
--- a/vllm/model_executor/models/adapters.py
+++ b/vllm/model_executor/models/adapters.py
@@ -444,7 +444,7 @@ def load_weights_using_from_2_way_softmax(
)
loaded_weights = pooling_model_cls.load_weights(model, weights, load_lm_head=True)
- from vllm.transformers_utils.tokenizer import get_tokenizer
+ from vllm.tokenizers import get_tokenizer
tokenizer = get_tokenizer(
model_config.tokenizer,
@@ -498,7 +498,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
# Skip ModelForSequenceClassification in MRO to avoid infinite recursion
loaded_weights = type(model).__mro__[1].load_weights(model, weights)
- from vllm.transformers_utils.tokenizer import get_tokenizer
+ from vllm.tokenizers import get_tokenizer
tokenizer = get_tokenizer(
model_config.tokenizer,
diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py
index 6e23037b919ab..ca77b8322e2e8 100644
--- a/vllm/model_executor/models/deepseek_mtp.py
+++ b/vllm/model_executor/models/deepseek_mtp.py
@@ -346,11 +346,16 @@ class DeepSeekMTP(nn.Module, SupportsPP, DeepseekV2MixtureOfExperts):
# Use expert_params_mapping to locate the destination
# param and delegate to its expert-aware weight_loader
# with expert_id.
+ is_expert_weight = False
for mapping in expert_params_mapping:
param_name, weight_name, expert_id, shard_id = mapping
if weight_name not in chunk_name:
continue
+                    # This is an expert weight; do not attempt to load it
+                    # as a regular (non-expert) weight later
+ is_expert_weight = True
+
# Do not modify `name` since the loop may continue here
# Instead, create a new variable
name_mapped = chunk_name.replace(weight_name, param_name)
@@ -377,6 +382,12 @@ class DeepSeekMTP(nn.Module, SupportsPP, DeepseekV2MixtureOfExperts):
loaded_params.add(name_mapped)
break
else:
+ if is_expert_weight:
+ # We've checked that this is an expert weight
+ # However it's not mapped locally to this rank
+ # So we simply skip it
+ continue
+
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
diff --git a/vllm/model_executor/models/deepseek_ocr.py b/vllm/model_executor/models/deepseek_ocr.py
index 8179f916ff417..019fb3e29ab91 100644
--- a/vllm/model_executor/models/deepseek_ocr.py
+++ b/vllm/model_executor/models/deepseek_ocr.py
@@ -45,6 +45,7 @@ from vllm.multimodal.processing import (
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sampling_params import SamplingParams
from vllm.sequence import IntermediateTensors
+from vllm.tokenizers import cached_tokenizer_from_config
from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config
from vllm.transformers_utils.processors.deepseek_ocr import (
BASE_SIZE,
@@ -53,7 +54,6 @@ from vllm.transformers_utils.processors.deepseek_ocr import (
DeepseekOCRProcessor,
count_tiles,
)
-from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from vllm.v1.sample.logits_processor import (
AdapterLogitsProcessor,
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 73cac2556c55a..a8eb4a69b6f2b 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -395,6 +395,16 @@ def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
return 0.1 * mscale * math.log(scale) + 1.0
+def _get_llama_4_scaling(
+ original_max_position_embeddings: int, scaling_beta: float, positions: torch.Tensor
+) -> torch.Tensor:
+ scaling = 1 + scaling_beta * torch.log(
+ 1 + torch.floor(positions / original_max_position_embeddings)
+ )
+ # Broadcast over num_heads and head_dim
+ return scaling[..., None, None]
+
+
class DeepseekV2Attention(nn.Module):
def __init__(
self,
@@ -481,7 +491,11 @@ class DeepseekV2Attention(nn.Module):
prefix=f"{prefix}.o_proj",
)
if config.rope_parameters["rope_type"] != "default":
- config.rope_parameters["rope_type"] = "deepseek_yarn"
+ config.rope_parameters["rope_type"] = (
+ "deepseek_yarn"
+ if config.rope_parameters.get("apply_yarn_scaling", True)
+ else "deepseek_llama_scaling"
+ )
self.rotary_emb = get_rope(
qk_rope_head_dim,
@@ -491,7 +505,10 @@ class DeepseekV2Attention(nn.Module):
is_neox_style=False,
)
- if config.rope_parameters["rope_type"] != "default":
+ if (
+ config.rope_parameters["rope_type"] != "default"
+ and config.rope_parameters["rope_type"] == "deepseek_yarn"
+ ):
mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False)
scaling_factor = config.rope_parameters["factor"]
mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
@@ -511,6 +528,7 @@ class DeepseekV2Attention(nn.Module):
self,
positions: torch.Tensor,
hidden_states: torch.Tensor,
+ llama_4_scaling: torch.Tensor | None,
) -> torch.Tensor:
if self.q_lora_rank is not None:
q = self.q_a_proj(hidden_states)[0]
@@ -536,6 +554,11 @@ class DeepseekV2Attention(nn.Module):
k = torch.empty_like(q)
k[..., : self.qk_nope_head_dim] = k_nope
k[..., self.qk_nope_head_dim :] = k_pe
+
+ # Apply llama 4 scaling if provided
+ if llama_4_scaling is not None:
+ q *= llama_4_scaling
+
# padding value to qk_head_dim for alignment
v = torch.nn.functional.pad(
v, [0, self.qk_head_dim - self.v_head_dim], value=0
@@ -987,7 +1010,12 @@ class DeepseekV2MLAAttention(nn.Module):
)
if config.rope_parameters["rope_type"] != "default":
- config.rope_parameters["rope_type"] = "deepseek_yarn"
+ config.rope_parameters["rope_type"] = (
+ "deepseek_yarn"
+ if config.rope_parameters.get("apply_yarn_scaling", True)
+ else "deepseek_llama_scaling"
+ )
+
self.rotary_emb = get_rope(
qk_rope_head_dim,
rotary_dim=qk_rope_head_dim,
@@ -995,7 +1023,11 @@ class DeepseekV2MLAAttention(nn.Module):
rope_parameters=config.rope_parameters,
is_neox_style=False,
)
- if config.rope_parameters["rope_type"] != "default":
+
+ if (
+ config.rope_parameters["rope_type"] != "default"
+ and config.rope_parameters["rope_type"] == "deepseek_yarn"
+ ):
mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False)
scaling_factor = config.rope_parameters["factor"]
mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
@@ -1064,8 +1096,9 @@ class DeepseekV2MLAAttention(nn.Module):
self,
positions: torch.Tensor,
hidden_states: torch.Tensor,
+ llama_4_scaling: torch.Tensor | None,
) -> torch.Tensor:
- return self.mla_attn(positions, hidden_states)
+ return self.mla_attn(positions, hidden_states, llama_4_scaling)
class DeepseekV2DecoderLayer(nn.Module):
@@ -1102,6 +1135,8 @@ class DeepseekV2DecoderLayer(nn.Module):
dim == 0 for dim in (qk_nope_head_dim, qk_rope_head_dim)
)
+ self.use_mha = use_mha
+
if use_mha:
attn_cls = DeepseekAttention
elif model_config.use_mla:
@@ -1155,6 +1190,7 @@ class DeepseekV2DecoderLayer(nn.Module):
positions: torch.Tensor,
hidden_states: torch.Tensor,
residual: torch.Tensor | None,
+ llama_4_scaling: torch.Tensor | None = None,
) -> torch.Tensor:
# Self Attention
if residual is None:
@@ -1162,10 +1198,14 @@ class DeepseekV2DecoderLayer(nn.Module):
hidden_states = self.input_layernorm(hidden_states)
else:
hidden_states, residual = self.input_layernorm(hidden_states, residual)
- hidden_states = self.self_attn(
- positions=positions,
- hidden_states=hidden_states,
- )
+
+ attn_kwargs = {
+ "positions": positions,
+ "hidden_states": hidden_states,
+ }
+ if not self.use_mha:
+ attn_kwargs["llama_4_scaling"] = llama_4_scaling
+ hidden_states = self.self_attn(**attn_kwargs)
if (
not isinstance(self.self_attn, DeepseekAttention)
@@ -1266,8 +1306,24 @@ class DeepseekV2Model(nn.Module):
hidden_states = intermediate_tensors["hidden_states"]
residual = intermediate_tensors["residual"]
+ # Compute llama 4 scaling once per forward pass if enabled
+ llama_4_scaling_config = getattr(self.config, "llama_4_scaling", None)
+ llama_4_scaling: torch.Tensor | None
+ if llama_4_scaling_config is not None:
+ llama_4_scaling = _get_llama_4_scaling(
+ original_max_position_embeddings=llama_4_scaling_config[
+ "original_max_position_embeddings"
+ ],
+ scaling_beta=llama_4_scaling_config["beta"],
+ positions=positions,
+ )
+ else:
+ llama_4_scaling = None
+
for layer in islice(self.layers, self.start_layer, self.end_layer):
- hidden_states, residual = layer(positions, hidden_states, residual)
+ hidden_states, residual = layer(
+ positions, hidden_states, residual, llama_4_scaling
+ )
if not get_pp_group().is_last_rank:
return IntermediateTensors(
@@ -1325,6 +1381,7 @@ class DeepseekV2ForCausalLM(
packed_modules_mapping = {
"gate_up_proj": ["gate_proj", "up_proj"],
}
+ model_cls = DeepseekV2Model
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
@@ -1355,7 +1412,7 @@ class DeepseekV2ForCausalLM(
"kv_a_proj_with_mqa",
]
- self.model = DeepseekV2Model(
+ self.model = self.model_cls(
vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
)
if get_pp_group().is_last_rank:
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index 1b6e4110039c4..56c1a87a25401 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -41,13 +41,13 @@ from vllm.multimodal.processing import (
)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
+from vllm.tokenizers import cached_tokenizer_from_config
from vllm.transformers_utils.configs.deepseek_vl2 import (
DeepseekVLV2Config,
MlpProjectorConfig,
VisionEncoderConfig,
)
from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor
-from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from vllm.utils.torch_utils import set_default_torch_dtype
diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py
index 1797adab8d146..accf7e6ef2f47 100644
--- a/vllm/model_executor/models/granite_speech.py
+++ b/vllm/model_executor/models/granite_speech.py
@@ -59,8 +59,8 @@ from vllm.multimodal.processing import (
)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.processor import cached_get_processor
-from vllm.transformers_utils.tokenizer import cached_get_tokenizer
+from vllm.tokenizers import cached_tokenizer_from_config
+from vllm.transformers_utils.processor import cached_processor_from_config
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .blip2 import Blip2QFormerModel
@@ -862,7 +862,7 @@ class GraniteSpeechForConditionalGeneration(
else:
raise ValueError(f"Unsupported task type {task_type}")
- tokenizer = cached_get_tokenizer(model_config.model)
+ tokenizer = cached_tokenizer_from_config(model_config)
chat = [dict(role="user", content=user_prompt)]
prompt = tokenizer.apply_chat_template(
chat,
@@ -886,7 +886,7 @@ class GraniteSpeechForConditionalGeneration(
model_config: ModelConfig,
) -> int | None:
"""Get the number of audio tokens for an audio duration in sec."""
- processor = cached_get_processor(model_config.model)
+ processor = cached_processor_from_config(model_config)
hop_length = processor.audio_processor.melspec_kwargs["hop_length"]
proj_win_size = processor.audio_processor.projector_window_size
ds_rate = processor.audio_processor.projector_downsample_rate
diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py
index 181c4ed2dca5a..550e8b014d5e7 100644
--- a/vllm/model_executor/models/gritlm.py
+++ b/vllm/model_executor/models/gritlm.py
@@ -19,7 +19,7 @@ from vllm.model_executor.layers.pooler import (
)
from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.tasks import PoolingTask
-from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
+from vllm.tokenizers import cached_tokenizer_from_config
from vllm.v1.outputs import PoolerOutput
from vllm.v1.pool.metadata import PoolingMetadata
diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py
index 9c5f9389e54bb..7c3933c6feb7e 100644
--- a/vllm/model_executor/models/idefics3.py
+++ b/vllm/model_executor/models/idefics3.py
@@ -338,6 +338,7 @@ class Idefics3MultiModalProcessor(BaseMultiModalProcessor[Idefics3ProcessingInfo
prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
+ mm_kwargs = {"input_data_format": "channels_last", **mm_kwargs}
processed_outputs = super()._call_hf_processor(
prompt,
mm_data,
diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py
index 2c99fce8d918c..e8d521ec2e8aa 100644
--- a/vllm/model_executor/models/interfaces_base.py
+++ b/vllm/model_executor/models/interfaces_base.py
@@ -76,6 +76,7 @@ def _check_vllm_model_embed_input_ids(model: type[object] | object) -> bool:
"this method to `embed_input_ids`."
)
model.embed_input_ids = model_get_input_embeddings
+ return True
logger.warning(
"The model (%s) is missing the `embed_input_ids` method.",
model,
diff --git a/vllm/model_executor/models/jina_vl.py b/vllm/model_executor/models/jina_vl.py
index 05a40837954d8..8bba7b62882f1 100644
--- a/vllm/model_executor/models/jina_vl.py
+++ b/vllm/model_executor/models/jina_vl.py
@@ -29,7 +29,7 @@ logger = init_logger(__name__)
class JinaVLScorer(nn.Module):
def __init__(self, model_config: "ModelConfig"):
super().__init__()
- config = model_config.hf_config
+ config = model_config.hf_config.get_text_config()
head_dtype = model_config.head_dtype
self.dense = ColumnParallelLinear(
config.hidden_size, config.hidden_size, params_dtype=head_dtype, bias=True
diff --git a/vllm/model_executor/models/mistral_large_3.py b/vllm/model_executor/models/mistral_large_3.py
new file mode 100644
index 0000000000000..ff7e9b60c1d3c
--- /dev/null
+++ b/vllm/model_executor/models/mistral_large_3.py
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Iterable
+
+import regex as re
+import torch
+
+from vllm.model_executor.models.deepseek_v2 import DeepseekV3ForCausalLM
+
+
+class MistralLarge3ForCausalLM(DeepseekV3ForCausalLM):
+ # fmt: off
+ remapping = {
+ r"layers\.(\d+)\.attention_norm\.weight": r"model.layers.\1.input_layernorm.weight", # noqa: E501
+ r"layers\.(\d+)\.attention\.wq_a\.(\w+)": r"model.layers.\1.self_attn.q_a_proj.\2", # noqa: E501
+ r"layers\.(\d+)\.attention\.q_a_norm\.weight": r"model.layers.\1.self_attn.q_a_layernorm.weight", # noqa: E501
+ r"layers\.(\d+)\.attention\.wq_b\.(\w+)": r"model.layers.\1.self_attn.q_b_proj.\2", # noqa: E501
+ r"layers\.(\d+)\.attention\.wkv_a_with_mqa\.(\w+)": r"model.layers.\1.self_attn.kv_a_proj_with_mqa.\2", # noqa: E501
+ r"layers\.(\d+)\.attention\.kv_a_norm\.weight": r"model.layers.\1.self_attn.kv_a_layernorm.weight", # noqa: E501
+ r"layers\.(\d+)\.attention\.wkv_b\.(\w+)": r"model.layers.\1.self_attn.kv_b_proj.\2", # noqa: E501
+ r"layers\.(\d+)\.attention\.wo\.(\w+)": r"model.layers.\1.self_attn.o_proj.\2", # noqa: E501
+ r"layers\.(\d+)\.ffn_norm\.weight": r"model.layers.\1.post_attention_layernorm.weight", # noqa: E501
+ r"layers\.(\d+)\.feed_forward\.w1\.(\w+)": r"model.layers.\1.mlp.gate_proj.\2", # noqa: E501
+ r"layers\.(\d+)\.feed_forward\.w2\.(\w+)": r"model.layers.\1.mlp.down_proj.\2", # noqa: E501
+ r"layers\.(\d+)\.feed_forward\.w3\.(\w+)": r"model.layers.\1.mlp.up_proj.\2", # noqa: E501
+ r"layers\.(\d+)\.gate\.weight": r"model.layers.\1.mlp.gate.weight", # noqa: E501
+ r"layers\.(\d+)\.shared_experts\.w1\.(\w+)": r"model.layers.\1.mlp.shared_experts.gate_proj.\2", # noqa: E501
+ r"layers\.(\d+)\.shared_experts\.w2\.(\w+)": r"model.layers.\1.mlp.shared_experts.down_proj.\2", # noqa: E501
+ r"layers\.(\d+)\.shared_experts\.w3\.(\w+)": r"model.layers.\1.mlp.shared_experts.up_proj.\2", # noqa: E501
+ r"layers\.(\d+)\.experts\.(\d+)\.w1\.(\w+)": r"model.layers.\1.mlp.experts.\2.gate_proj.\3", # noqa: E501
+ r"layers\.(\d+)\.experts\.(\d+)\.w2\.(\w+)": r"model.layers.\1.mlp.experts.\2.down_proj.\3", # noqa: E501
+ r"layers\.(\d+)\.experts\.(\d+)\.w3\.(\w+)": r"model.layers.\1.mlp.experts.\2.up_proj.\3", # noqa: E501
+ r"norm\.weight": "model.norm.weight", # noqa: E501
+ r"tok_embeddings\.weight": "model.embed_tokens.weight", # noqa: E501
+ r"output\.weight": "lm_head.weight", # noqa: E501
+ }
+ # fmt: on
+
+ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+ return super().load_weights(map(self._remap_mistral_to_ds, weights))
+
+ def _remap_mistral_to_ds(
+ self, weight: tuple[str, torch.Tensor]
+ ) -> tuple[str, torch.Tensor]:
+ """Remap Mistral parameters to DeepseekV2 parameters."""
+ name, loaded_weight = weight
+
+ for k, v in self.remapping.items():
+ match = re.fullmatch(k, name)
+ if match:
+ name = re.sub(k, v, name)
+ break
+ else:
+ raise ValueError(f"Cannot remap {name}")
+
+ # Remapping scale names. We could do this in the regex above but it
+ # would triple the number of lines for most layers.
+ if name.endswith(".qscale_act"):
+ name = re.sub(r"\.qscale_act$", ".input_scale", name)
+ elif name.endswith(".qscale_weight"):
+ name = re.sub(r"\.qscale_weight$", ".weight_scale", name)
+
+ return name, loaded_weight
diff --git a/vllm/model_executor/models/mistral_large_3_eagle.py b/vllm/model_executor/models/mistral_large_3_eagle.py
new file mode 100644
index 0000000000000..e3ca9e4ca82d0
--- /dev/null
+++ b/vllm/model_executor/models/mistral_large_3_eagle.py
@@ -0,0 +1,165 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Iterable
+from functools import partial
+
+import torch
+import torch.nn as nn
+
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import VllmConfig
+from vllm.distributed.parallel_state import get_pp_group
+from vllm.logger import init_logger
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import RowParallelLinear
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from vllm.model_executor.models.deepseek_v2 import (
+ DeepseekV2DecoderLayer,
+ DeepseekV2Model,
+)
+from vllm.model_executor.models.interfaces import MultiModalEmbeddings
+from vllm.model_executor.models.mistral_large_3 import MistralLarge3ForCausalLM
+from vllm.multimodal.inputs import NestedTensors
+
+from .utils import (
+ _merge_multimodal_embeddings,
+ make_empty_intermediate_tensors_factory,
+ maybe_prefix,
+)
+
+logger = init_logger(__name__)
+
+
+@support_torch_compile
+class EagleMistralLarge3Model(DeepseekV2Model):
+ def __init__(
+ self, *, vllm_config: VllmConfig, prefix: str = "", start_layer_id: int = 0
+ ):
+ nn.Module.__init__(self)
+
+ config = vllm_config.model_config.hf_config
+ quant_config = vllm_config.quant_config
+ self.config = config
+ self.vllm_config = vllm_config
+
+ self.vocab_size = config.vocab_size
+
+ assert get_pp_group().world_size == 1
+ self.embed_tokens = VocabParallelEmbedding(
+ config.vocab_size,
+ config.hidden_size,
+ quant_config=quant_config,
+ prefix=f"{prefix}.embed_tokens",
+ )
+
+ self.layers = nn.ModuleList(
+ [
+ DeepseekV2DecoderLayer(
+ vllm_config=vllm_config,
+ prefix=maybe_prefix(prefix, f"layers.{i + start_layer_id}"),
+ )
+ for i in range(self.config.num_hidden_layers)
+ ]
+ )
+ self.start_layer = 0
+ self.end_layer = self.config.num_hidden_layers
+
+ self.fc = RowParallelLinear(
+ self.config.hidden_size * 2,
+ self.config.hidden_size,
+ bias=False,
+ input_is_parallel=False,
+ quant_config=quant_config,
+ return_bias=False,
+ )
+ self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+ ["hidden_states", "residual"], config.hidden_size
+ )
+
+ def forward(
+ self,
+ input_ids: torch.Tensor,
+ positions: torch.Tensor,
+ hidden_states: torch.Tensor,
+ inputs_embeds: torch.Tensor | None = None,
+ ) -> torch.Tensor:
+ if inputs_embeds is None:
+ inputs_embeds = self.embed_input_ids(input_ids)
+ inputs_embeds = self.fc(torch.cat((inputs_embeds, hidden_states), dim=-1))
+ output = super().forward(
+ input_ids, positions, intermediate_tensors=None, inputs_embeds=inputs_embeds
+ )
+ assert isinstance(output, torch.Tensor)
+ return output
+
+
+class EagleMistralLarge3ForCausalLM(MistralLarge3ForCausalLM):
+ remapping = MistralLarge3ForCausalLM.remapping | {
+ r"eagle_linear\.weight": r"model.fc.weight",
+ r"eagle_linear\.qscale_act": r"model.fc.input_scale",
+ r"eagle_linear\.qscale_weight": r"model.fc.weight_scale",
+ }
+
+ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+ target_layer_num = vllm_config.model_config.get_num_layers(
+ vllm_config.parallel_config
+ )
+ vllm_config.model_config = vllm_config.speculative_config.draft_model_config
+ # draft model quantization config may differ from target model
+ self.quant_config = VllmConfig.get_quantization_config(
+ vllm_config.speculative_config.draft_model_config, vllm_config.load_config
+ )
+ vllm_config.quant_config = self.quant_config
+ self.model_cls = partial(
+ EagleMistralLarge3Model, start_layer_id=target_layer_num
+ )
+ super().__init__(vllm_config=vllm_config, prefix=prefix)
+
+ def get_input_embeddings(
+ self,
+ input_ids: torch.Tensor,
+ multimodal_embeddings: MultiModalEmbeddings | None = None,
+ *,
+ is_multimodal: torch.Tensor | None = None,
+ handle_oov_mm_token: bool = False,
+ ) -> torch.Tensor:
+ inputs_embeds = super().embed_input_ids(input_ids)
+
+ if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
+ return inputs_embeds
+
+ assert is_multimodal is not None
+
+ return _merge_multimodal_embeddings(
+ inputs_embeds=inputs_embeds,
+ multimodal_embeddings=multimodal_embeddings,
+ is_multimodal=is_multimodal,
+ )
+
+ def forward(
+ self,
+ input_ids: torch.Tensor,
+ positions: torch.Tensor,
+ hidden_states: torch.Tensor,
+ inputs_embeds: torch.Tensor | None = None,
+ ) -> tuple[torch.Tensor, torch.Tensor]:
+ hidden_states = self.model(input_ids, positions, hidden_states, inputs_embeds)
+ return hidden_states, hidden_states
+
+ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+ # Pretend we've loaded the embedding and lm_head weights
+ # (later copied from target model)
+ return super().load_weights(weights) | {
+ "model.embed_tokens.weight",
+ "lm_head.weight",
+ }
+
+ def embed_input_ids(
+ self,
+ input_ids: torch.Tensor,
+ multimodal_embeddings: NestedTensors | None = None,
+ is_multimodal: torch.Tensor | None = None,
+ ) -> torch.Tensor:
+ return self.model.embed_input_ids(input_ids)
diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py
index 743bc23d9876f..be36f761c63aa 100644
--- a/vllm/model_executor/models/modernbert.py
+++ b/vllm/model_executor/models/modernbert.py
@@ -20,7 +20,7 @@ from vllm.model_executor.layers.pooler import (
PoolingParamsUpdate,
PoolingType,
)
-from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
+from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.sequence import IntermediateTensors
@@ -62,19 +62,6 @@ class ModernBertEmbeddings(nn.Module):
return embeddings
-class ModernBertRotaryEmbedding(RotaryEmbedding):
- def __init__(self, config: ModernBertConfig, head_size: int, dim: int, base: float):
- super().__init__(
- head_size=head_size,
- rotary_dim=dim,
- max_position_embeddings=config.max_position_embeddings,
- base=base,
- is_neox_style=True,
- dtype=torch.float16,
- )
- self.config = config
-
-
class ModernBertAttention(nn.Module):
def __init__(self, config: ModernBertConfig, layer_id: int | None = None):
super().__init__()
@@ -95,19 +82,33 @@ class ModernBertAttention(nn.Module):
bias=config.attention_bias,
)
- sliding_window = None
- if layer_id % config.global_attn_every_n_layers != 0:
- sliding_window = config.local_attention // 2
- rope_theta = (
- config.local_rope_theta
- if config.local_rope_theta is not None
- else config.global_rope_theta
- )
+ if layer_types := getattr(config, "layer_types", None):
+ # Transformers v5
+ layer_type = layer_types[layer_id]
+ rope_parameters = config.rope_parameters[layer_type]
+ sliding_window: int | None = None
+ if layer_type == "sliding_attention":
+ sliding_window = config.local_attention // 2
else:
- rope_theta = config.global_rope_theta
+ # Transformers v4
+ sliding_window = None
+ if layer_id % config.global_attn_every_n_layers != 0:
+ sliding_window = config.local_attention // 2
+ rope_theta = (
+ config.local_rope_theta
+ if config.local_rope_theta is not None
+ else config.global_rope_theta
+ )
+ else:
+ rope_theta = config.global_rope_theta
+ rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
- self.rotary_emb = ModernBertRotaryEmbedding(
- config=config, head_size=self.head_dim, dim=self.head_dim, base=rope_theta
+ self.rotary_emb = get_rope(
+ head_size=self.head_dim,
+ rotary_dim=self.head_dim,
+ max_position=config.max_position_embeddings,
+ rope_parameters=rope_parameters,
+ dtype=torch.float16,
)
self.attn = EncoderOnlyAttention(
self.num_heads,
diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index 11beeddabe307..891a9ce080233 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -73,12 +73,8 @@ from vllm.multimodal.processing import (
)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
-from vllm.tokenizers import TokenizerLike
+from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
from vllm.transformers_utils.configs.radio import RadioConfig
-from vllm.transformers_utils.tokenizer import (
- cached_tokenizer_from_config,
- encode_tokens,
-)
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .utils import _merge_multimodal_embeddings
@@ -457,14 +453,12 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
# Pre-tokenize special tokens for video processing
# to avoid repeated tokenization
- self._img_start_token_ids = encode_tokens(
- tokenizer, IMG_START, add_special_tokens=False
+ self._img_start_token_ids = tokenizer.encode(
+ IMG_START, add_special_tokens=False
)
- self._img_end_token_ids = encode_tokens(
- tokenizer, IMG_END, add_special_tokens=False
- )
- self._img_context_token_ids = encode_tokens(
- tokenizer, IMG_CONTEXT, add_special_tokens=False
+ self._img_end_token_ids = tokenizer.encode(IMG_END, add_special_tokens=False)
+ self._img_context_token_ids = tokenizer.encode(
+ IMG_CONTEXT, add_special_tokens=False
)
@property
@@ -1182,14 +1176,12 @@ class NemotronH_Nano_VL_V2(
# Pre-tokenize special tokens for video processing
# to avoid repeated tokenization
tokenizer = cached_tokenizer_from_config(vllm_config.model_config)
- self._img_start_token_ids = encode_tokens(
- tokenizer, IMG_START, add_special_tokens=False
+ self._img_start_token_ids = tokenizer.encode(
+ IMG_START, add_special_tokens=False
)
- self._img_end_token_ids = encode_tokens(
- tokenizer, IMG_END, add_special_tokens=False
- )
- self._img_context_token_ids = encode_tokens(
- tokenizer, IMG_CONTEXT, add_special_tokens=False
+ self._img_end_token_ids = tokenizer.encode(IMG_END, add_special_tokens=False)
+ self._img_context_token_ids = tokenizer.encode(
+ IMG_CONTEXT, add_special_tokens=False
)
def pixel_shuffle(self, x, scale_factor=0.5):
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 54bde75cc0131..cad241842cd30 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -59,8 +59,7 @@ from vllm.multimodal.processing import (
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors
-from vllm.tokenizers import MistralTokenizer
-from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
+from vllm.tokenizers import MistralTokenizer, cached_tokenizer_from_config
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 34c31d8deee23..f5501bae78418 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -503,7 +503,7 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
- config = vllm_config.model_config.hf_config
+ config = vllm_config.model_config.hf_config.get_text_config()
quant_config = vllm_config.quant_config
self.config = config
diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py
index 7506ee8656fda..1ce0fb4e4d93d 100644
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -88,7 +88,6 @@ from vllm.multimodal.processing import (
)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.tokenizer import encode_tokens
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import (
@@ -591,7 +590,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
tokenization_kwargs=tokenization_kwargs,
)
tokenizer = self.info.get_tokenizer()
- prompt_ids = encode_tokens(tokenizer, prompt)
+ prompt_ids = tokenizer.encode(prompt)
else:
prompt_ids = self._apply_hf_processor_tokens_only(prompt)
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 8fbd896223944..b748768498412 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -1576,15 +1576,6 @@ class Tarsier2ForConditionalGeneration(Qwen2VLForConditionalGeneration):
}
)
- def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
- # Tarsier2 uses llava as model_type, which will create a Qwen2VLConfig
- # as text_config, we need to reconstruct Qwen2VLConfig from LlavaConfig.
- config = vllm_config.model_config.hf_config
- qwen2vl_config = config.text_config
- qwen2vl_config.architectures = config.architectures
- vllm_config.model_config.hf_config = qwen2vl_config
- super().__init__(vllm_config=vllm_config, prefix=prefix)
-
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
skip_prefixes = []
if self.visual is None:
diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index 39dd42552ae8f..fe825198dcaa4 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -62,6 +62,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.models.qwen2_audio import Qwen2AudioProcessingInfo
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItems
@@ -1137,6 +1138,18 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
}
)
+ packed_modules_mapping = {
+ "qkv_proj": [
+ "q_proj",
+ "k_proj",
+ "v_proj",
+ ],
+ "gate_up_proj": [
+ "gate_proj",
+ "up_proj",
+ ],
+ }
+
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> str | None:
if modality.startswith("image"):
@@ -1763,3 +1776,13 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
mrope_position_delta = llm_positions.max() + 1 - seq_len
return llm_positions, mrope_position_delta
+
+ def get_mm_mapping(self) -> MultiModelKeys:
+ """
+ Get the module prefix in multimodal models
+ """
+ return MultiModelKeys.from_string_field(
+ language_model="language_model",
+ connector="visual.merger",
+ tower_model=["visual.", "audio_tower."],
+ )
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 73a61f1148b50..d3b6268e7647b 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -145,6 +145,7 @@ _TEXT_GENERATION_MODELS = {
"MiniMaxM1ForCausalLM": ("minimax_text_01", "MiniMaxText01ForCausalLM"),
"MiniMaxM2ForCausalLM": ("minimax_m2", "MiniMaxM2ForCausalLM"),
"MistralForCausalLM": ("llama", "LlamaForCausalLM"),
+ "MistralLarge3ForCausalLM": ("mistral_large_3", "MistralLarge3ForCausalLM"),
"MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"),
# transformers's mpt class has lower case
"MptForCausalLM": ("mpt", "MPTForCausalLM"),
@@ -424,6 +425,10 @@ _SPECULATIVE_DECODING_MODELS = {
"LlamaForCausalLMEagle3": ("llama_eagle3", "Eagle3LlamaForCausalLM"),
"Eagle3Qwen2_5vlForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"),
"Eagle3Qwen3vlForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"),
+ "EagleMistralLarge3ForCausalLM": (
+ "mistral_large_3_eagle",
+ "EagleMistralLarge3ForCausalLM",
+ ),
"EagleDeepSeekMTPModel": ("deepseek_eagle", "EagleDeepseekV3ForCausalLM"),
"DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"),
"ErnieMTPModel": ("ernie_mtp", "ErnieMTP"),
diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py
index 0a39ea7ef5bff..45f8fa079c714 100644
--- a/vllm/model_executor/models/voxtral.py
+++ b/vllm/model_executor/models/voxtral.py
@@ -51,8 +51,7 @@ from vllm.multimodal.processing import (
)
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.sequence import IntermediateTensors
-from vllm.tokenizers import MistralTokenizer
-from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
+from vllm.tokenizers import MistralTokenizer, cached_tokenizer_from_config
from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsTranscription
from .utils import init_vllm_registered_model, maybe_prefix
diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
index 1ed6ae4366d0c..0daf6bda61ccb 100644
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -48,7 +48,7 @@ from vllm.multimodal.processing import (
PromptUpdate,
)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
-from vllm.transformers_utils.processor import cached_get_processor
+from vllm.transformers_utils.processor import cached_processor_from_config
from vllm.utils.jsontree import json_map_leaves
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from vllm.utils.torch_utils import set_default_torch_dtype
@@ -850,7 +850,7 @@ class WhisperForConditionalGeneration(
def get_speech_to_text_config(
cls, model_config: ModelConfig, task_type: str
) -> SpeechToTextConfig:
- processor = cached_get_processor(model_config.model)
+ processor = cached_processor_from_config(model_config)
return SpeechToTextConfig(
max_audio_clip_s=processor.feature_extractor.chunk_length,
@@ -864,7 +864,7 @@ class WhisperForConditionalGeneration(
stt_config: SpeechToTextConfig,
model_config: ModelConfig,
) -> int | None:
- processor = cached_get_processor(model_config.model)
+ processor = cached_processor_from_config(model_config)
hop_length = processor.feature_extractor.hop_length
assert hop_length is not None
# NOTE(NickLucche) user can't pass encoder
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index fef118a93c6cb..4a619fd303ca9 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -2,12 +2,40 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod
+from dataclasses import dataclass
from pathlib import Path
from typing import Generic, TypeVar
+import numpy as np
+
_T = TypeVar("_T")
+@dataclass
+class MediaWithBytes(Generic[_T]):
+ """
+ Wrapper that couples a media object with its original encoded bytes.
+
+ This ensures the raw bytes and media object remain synchronized,
+ preventing cache corruption from in-place modifications.
+
+ The wrapper delegates attribute access to the underlying media object,
+ making it behave transparently like the wrapped type (e.g., PIL.Image).
+ """
+
+ media: _T
+ original_bytes: bytes
+
+ def __array__(self, *args, **kwargs) -> np.ndarray:
+ """Allow np.array(obj) to return np.array(obj.media)."""
+ return np.array(self.media, *args, **kwargs)
+
+ def __getattr__(self, name: str):
+ """Delegate attribute access to the underlying media object."""
+ # This is only called when the attribute is not found on self
+ return getattr(self.media, name)
+
+
class MediaIO(ABC, Generic[_T]):
@abstractmethod
def load_bytes(self, data: bytes) -> _T:
diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py
index d0dcbb25fcce8..cc50322fed902 100644
--- a/vllm/multimodal/hasher.py
+++ b/vllm/multimodal/hasher.py
@@ -12,6 +12,8 @@ from PIL import Image
from vllm.logger import init_logger
+from .base import MediaWithBytes
+
logger = init_logger(__name__)
@@ -31,14 +33,26 @@ class MultiModalHasher:
if Image.ExifTags.Base.ImageID in exif and isinstance(
exif[Image.ExifTags.Base.ImageID], uuid.UUID
):
- # If the image has exif ImageID tag, use that
return (exif[Image.ExifTags.Base.ImageID].bytes,)
+
data = {"mode": obj.mode, "data": np.asarray(obj)}
- if obj.palette is not None:
- data["palette"] = obj.palette.palette
- if obj.palette.rawmode is not None:
- data["palette_rawmode"] = obj.palette.rawmode
+ palette = obj.palette
+ if palette is not None:
+ data["palette"] = palette.palette
+ if palette.rawmode is not None:
+ data["palette_rawmode"] = palette.rawmode
+
return cls.iter_item_to_bytes("image", data)
+
+ if isinstance(obj, MediaWithBytes) and isinstance(obj.media, Image.Image):
+ exif = obj.media.getexif()
+ if Image.ExifTags.Base.ImageID in exif and isinstance(
+ exif[Image.ExifTags.Base.ImageID], uuid.UUID
+ ):
+ return (exif[Image.ExifTags.Base.ImageID].bytes,)
+
+ return cls.iter_item_to_bytes("image", obj.original_bytes)
+
if isinstance(obj, torch.Tensor):
tensor_obj: torch.Tensor = obj.cpu()
tensor_dtype = tensor_obj.dtype
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 21e8bef97a787..789421e9e0c3b 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -8,7 +8,7 @@ import pybase64
import torch
from PIL import Image
-from .base import MediaIO
+from .base import MediaIO, MediaWithBytes
def rescale_image_size(
@@ -74,8 +74,12 @@ class ImageMediaIO(MediaIO[Image.Image]):
)
self.rgba_background_color = rgba_bg
- def _convert_image_mode(self, image: Image.Image) -> Image.Image:
+ def _convert_image_mode(
+ self, image: Image.Image | MediaWithBytes[Image.Image]
+ ) -> Image.Image:
"""Convert image mode with custom background color."""
+ if isinstance(image, MediaWithBytes):
+ image = image.media
if image.mode == self.image_mode:
return image
elif image.mode == "RGBA" and self.image_mode == "RGB":
@@ -83,18 +87,18 @@ class ImageMediaIO(MediaIO[Image.Image]):
else:
return convert_image_mode(image, self.image_mode)
- def load_bytes(self, data: bytes) -> Image.Image:
+ def load_bytes(self, data: bytes) -> MediaWithBytes[Image.Image]:
image = Image.open(BytesIO(data))
- image.load()
- return self._convert_image_mode(image)
+ return MediaWithBytes(self._convert_image_mode(image), data)
- def load_base64(self, media_type: str, data: str) -> Image.Image:
+ def load_base64(self, media_type: str, data: str) -> MediaWithBytes[Image.Image]:
return self.load_bytes(pybase64.b64decode(data, validate=True))
- def load_file(self, filepath: Path) -> Image.Image:
- image = Image.open(filepath)
- image.load()
- return self._convert_image_mode(image)
+ def load_file(self, filepath: Path) -> MediaWithBytes[Image.Image]:
+ with open(filepath, "rb") as f:
+ data = f.read()
+ image = Image.open(BytesIO(data))
+ return MediaWithBytes(self._convert_image_mode(image), data)
def encode_base64(
self,
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index 810f29072a0fe..0d3b8289e4e12 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -23,6 +23,7 @@ from vllm.utils.collection_utils import is_list_of
from vllm.utils.import_utils import LazyLoader
from .audio import AudioResampler
+from .base import MediaWithBytes
from .inputs import (
AudioItem,
HfAudioItem,
@@ -84,6 +85,12 @@ class ModalityDataItems(ABC, Generic[_T, _I]):
"""Get all data items."""
return [self.get(idx) for idx in range(self.get_count())]
+ def get_item_for_hash(self, index: int) -> object:
+ return self.get(index)
+
+ def get_all_items_for_hash(self) -> list[object]:
+ return [self.get_item_for_hash(idx) for idx in range(self.get_count())]
+
@abstractmethod
def get_processor_data(self) -> Mapping[str, object]:
"""Get the data to pass to the HF processor."""
@@ -98,10 +105,18 @@ class ModalityDataItems(ABC, Generic[_T, _I]):
class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]):
"""Base class for data items that are arranged in a list."""
+ def _unwrap(self, item: _T | MediaWithBytes[_T]) -> _T:
+ """Extract media from wrapper if present."""
+ return item.media if isinstance(item, MediaWithBytes) else item
+
def get_count(self) -> int:
return len(self.data)
def get(self, index: int) -> _T:
+ return self._unwrap(self.data[index])
+
+ def get_item_for_hash(self, index: int) -> _T | MediaWithBytes[_T]:
+ # Return raw item for hashing (preserves original_bytes if present)
return self.data[index]
def get_processor_data(self) -> Mapping[str, object]:
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 2f651bd71706f..0390773783961 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -25,7 +25,6 @@ from typing_extensions import TypeVar, assert_never
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.processor import cached_processor_from_config
-from vllm.transformers_utils.tokenizer import decode_tokens, encode_tokens
from vllm.utils.collection_utils import flatten_2d_lists, full_groupby
from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
from vllm.utils.jsontree import JSONTree, json_map_leaves
@@ -80,9 +79,9 @@ def _cached_encode(
tokenizer: TokenizerLike,
text: str,
*,
- add_special_tokens: bool | None = None,
+ add_special_tokens: bool = True,
) -> list[int]:
- return encode_tokens(tokenizer, text, add_special_tokens=add_special_tokens)
+ return tokenizer.encode(text, add_special_tokens=add_special_tokens)
@lru_cache(maxsize=2048)
@@ -90,11 +89,9 @@ def _cached_decode(
tokenizer: TokenizerLike,
token_ids: tuple[int, ...],
*,
- skip_special_tokens: bool | None = None,
+ skip_special_tokens: bool = False,
) -> str:
- return decode_tokens(
- tokenizer, list(token_ids), skip_special_tokens=skip_special_tokens
- )
+ return tokenizer.decode(list(token_ids), skip_special_tokens=skip_special_tokens)
def _seq2text(
@@ -110,7 +107,7 @@ def _seq2text(
raise ValueError("You cannot decode tokens when `skip_tokenizer_init=True`")
if not use_cache:
- return decode_tokens(tokenizer, seq)
+ return tokenizer.decode(seq)
return _cached_decode(tokenizer, tuple(seq))
@@ -126,7 +123,7 @@ def _seq2tokens(
raise ValueError("You cannot encode text when `skip_tokenizer_init=True`")
if not use_cache:
- return encode_tokens(tokenizer, seq, add_special_tokens=False)
+ return tokenizer.encode(seq, add_special_tokens=False)
return _cached_encode(tokenizer, seq, add_special_tokens=False)
@@ -1687,7 +1684,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
# For None entries, compute a hash; otherwise, use provided ID.
computed: list[str] = []
- for i, item in enumerate(items):
+ for i, item in enumerate(items.get_all_items_for_hash()):
item_uuid = mm_uuids_per_modality[i]
# NOTE: Even if a item_uuid is provided, we still compute a
@@ -2198,8 +2195,8 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]):
tokenizer = self.info.get_tokenizer()
decoder_prompt_raw = self.create_decoder_prompt(prompt, mm_data)
if isinstance(decoder_prompt_raw, str):
- decoder_prompt_ids = encode_tokens(
- tokenizer, decoder_prompt_raw, add_special_tokens=False
+ decoder_prompt_ids = tokenizer.encode(
+ decoder_prompt_raw, add_special_tokens=False
)
else:
decoder_prompt_ids = decoder_prompt_raw
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 2fdae46e547b0..00a84f9dec4f7 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -6,8 +6,7 @@ from typing import TYPE_CHECKING, Generic, Protocol, TypeVar, cast
from vllm.config.multimodal import BaseDummyOptions
from vllm.logger import init_logger
-from vllm.tokenizers import TokenizerLike
-from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
+from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
from .cache import BaseMultiModalProcessorCache
from .processing import (
diff --git a/vllm/tokenizers/__init__.py b/vllm/tokenizers/__init__.py
index 14f0148cf7ba8..42487f5f51651 100644
--- a/vllm/tokenizers/__init__.py
+++ b/vllm/tokenizers/__init__.py
@@ -4,12 +4,21 @@
from .hf import HfTokenizer
from .mistral import MistralTokenizer
from .protocol import TokenizerLike
-from .registry import TokenizerRegistry, get_tokenizer
+from .registry import (
+ TokenizerRegistry,
+ cached_get_tokenizer,
+ cached_tokenizer_from_config,
+ get_tokenizer,
+ init_tokenizer_from_config,
+)
__all__ = [
"TokenizerLike",
"HfTokenizer",
"MistralTokenizer",
"TokenizerRegistry",
+ "cached_get_tokenizer",
"get_tokenizer",
+ "cached_tokenizer_from_config",
+ "init_tokenizer_from_config",
]
diff --git a/vllm/tokenizers/mistral.py b/vllm/tokenizers/mistral.py
index 7e6745004b01f..37d67607c2cfe 100644
--- a/vllm/tokenizers/mistral.py
+++ b/vllm/tokenizers/mistral.py
@@ -14,13 +14,19 @@ if TYPE_CHECKING:
)
from mistral_common.tokens.tokenizers.tekken import Tekkenizer
from transformers import BatchEncoding
- from transformers.tokenization_mistral_common import (
- MistralCommonTokenizer as TransformersMistralTokenizer,
- )
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
+ try:
+ # Transformers v5
+ from transformers.tokenization_mistral_common import MistralCommonBackend
+ except ImportError:
+ # Transformers v4
+ from transformers.tokenization_mistral_common import (
+ MistralCommonTokenizer as MistralCommonBackend,
+ )
+
logger = init_logger(__name__)
@@ -97,6 +103,8 @@ def _prepare_apply_chat_template_tools_and_messages(
continue_final_message: bool = False,
add_generation_prompt: bool = False,
) -> tuple[list["ChatCompletionMessageParam"], list[dict[str, Any]] | None]:
+ from mistral_common.protocol.instruct.tool_calls import Function, Tool
+
if add_generation_prompt and continue_final_message:
raise ValueError(
"Cannot set both `add_generation_prompt` and "
@@ -139,6 +147,33 @@ def _prepare_apply_chat_template_tools_and_messages(
if function.get("description") is None:
function["description"] = ""
+ # We filter not supported arguments to avoid throwing an error.
+ # TODO(juliendenize): remove this once OpenAI API is better supported by
+ # `mistral-common`.
+ tools_fields = set(Tool.model_fields.keys())
+ function_fields = set(Function.model_fields.keys())
+ for tool in tools:
+ tool_keys = list(tool.keys())
+ for tool_key in tool_keys:
+ if tool_key not in tools_fields:
+ tool.pop(tool_key)
+ logger.warning_once(
+ f"'{tool_key}' is not supported by mistral-common for tools. "
+ "It has been poped from the tool definition."
+ )
+ if tool["type"] == "function":
+ function_keys = list(tool["function"].keys())
+ for function_key in function_keys:
+ if function_key not in function_fields:
+ tool["function"].pop(function_key)
+ logger.warning_once(
+ f"'{function_key}' is not supported by mistral-common "
+ "for function tools. It has been poped from the "
+ "function definition."
+ )
+ else:
+ raise ValueError("mistral-common only supports function tools.")
+
return messages, tools
@@ -179,11 +214,17 @@ class MistralTokenizer(TokenizerLike):
**kwargs,
) -> "MistralTokenizer":
from mistral_common.protocol.instruct.validator import ValidationMode
- from transformers.tokenization_mistral_common import (
- MistralCommonTokenizer as TransformersMistralTokenizer,
- )
- tokenizer = TransformersMistralTokenizer.from_pretrained(
+ try:
+ # Transformers v5
+ from transformers.tokenization_mistral_common import MistralCommonBackend
+ except ImportError:
+ # Transformers v4
+ from transformers.tokenization_mistral_common import (
+ MistralCommonTokenizer as MistralCommonBackend,
+ )
+
+ tokenizer = MistralCommonBackend.from_pretrained(
path_or_repo_id,
*args,
mode=ValidationMode.test,
@@ -194,7 +235,7 @@ class MistralTokenizer(TokenizerLike):
return cls(tokenizer)
- def __init__(self, tokenizer: "TransformersMistralTokenizer") -> None:
+ def __init__(self, tokenizer: "MistralCommonBackend") -> None:
super().__init__()
from mistral_common.protocol.instruct.validator import ValidationMode
@@ -410,6 +451,13 @@ class MistralTokenizer(TokenizerLike):
ids, skip_special_tokens=skip_special_tokens
)
+ def batch_decode(
+ self, ids: list[list[int]] | list[int], skip_special_tokens: bool = False
+    ) -> list[str]:
+ return self.transformers_tokenizer.batch_decode(
+ ids, skip_special_tokens=skip_special_tokens
+ )
+
def convert_tokens_to_string(self, tokens: list[str]) -> str:
from mistral_common.tokens.tokenizers.base import (
SpecialTokenPolicy,
diff --git a/vllm/tokenizers/registry.py b/vllm/tokenizers/registry.py
index d5e7899321615..87048f2ec7845 100644
--- a/vllm/tokenizers/registry.py
+++ b/vllm/tokenizers/registry.py
@@ -2,25 +2,30 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib.util
from collections.abc import Callable
+from functools import lru_cache
from pathlib import Path
-from typing import TypeVar, overload
+from typing import TYPE_CHECKING, TypeVar, overload
import huggingface_hub
+from typing_extensions import assert_never
import vllm.envs as envs
from vllm.logger import init_logger
-from vllm.transformers_utils.gguf_utils import get_gguf_file_path_from_hf
-from vllm.transformers_utils.repo_utils import list_filtered_repo_files
-from vllm.transformers_utils.utils import (
+from vllm.transformers_utils.gguf_utils import (
check_gguf_file,
+ get_gguf_file_path_from_hf,
is_gguf,
is_remote_gguf,
split_remote_gguf,
)
+from vllm.transformers_utils.repo_utils import list_filtered_repo_files
from vllm.utils.import_utils import resolve_obj_by_qualname
from .protocol import TokenizerLike
+if TYPE_CHECKING:
+ from vllm.config import ModelConfig
+
logger = init_logger(__name__)
_T = TypeVar("_T", bound=type[TokenizerLike])
@@ -195,3 +200,34 @@ def get_tokenizer(
)
return tokenizer
+
+
+cached_get_tokenizer = lru_cache(get_tokenizer)
+
+
+def cached_tokenizer_from_config(model_config: "ModelConfig", **kwargs):
+ return cached_get_tokenizer(
+ model_config.tokenizer,
+ tokenizer_mode=model_config.tokenizer_mode,
+ revision=model_config.tokenizer_revision,
+ trust_remote_code=model_config.trust_remote_code,
+ **kwargs,
+ )
+
+
+def init_tokenizer_from_config(model_config: "ModelConfig"):
+ runner_type = model_config.runner_type
+ if runner_type == "generate" or runner_type == "draft":
+ truncation_side = "left"
+ elif runner_type == "pooling":
+ truncation_side = "right"
+ else:
+ assert_never(runner_type)
+
+ return get_tokenizer(
+ model_config.tokenizer,
+ tokenizer_mode=model_config.tokenizer_mode,
+ trust_remote_code=model_config.trust_remote_code,
+ revision=model_config.tokenizer_revision,
+ truncation_side=truncation_side,
+ )
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 1bb5791e19016..2911dcff2ab49 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -26,8 +26,15 @@ from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
from vllm import envs
from vllm.logger import init_logger
+from vllm.transformers_utils.utils import parse_safetensors_file_metadata
from .config_parser_base import ConfigParserBase
+from .gguf_utils import (
+ check_gguf_file,
+ is_gguf,
+ is_remote_gguf,
+ split_remote_gguf,
+)
from .repo_utils import (
_get_hf_token,
file_or_path_exists,
@@ -36,13 +43,6 @@ from .repo_utils import (
try_get_local_file,
with_retry,
)
-from .utils import (
- check_gguf_file,
- is_gguf,
- is_remote_gguf,
- parse_safetensors_file_metadata,
- split_remote_gguf,
-)
if envs.VLLM_USE_MODELSCOPE:
from modelscope import AutoConfig
@@ -89,6 +89,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
step3_text="Step3TextConfig",
qwen3_next="Qwen3NextConfig",
lfm2_moe="Lfm2MoeConfig",
+ tarsier2="Tarsier2Config",
)
_CONFIG_ATTRS_MAPPING: dict[str, str] = {
@@ -127,6 +128,9 @@ class HFConfigParser(ConfigParserBase):
if config_dict.get("speculators_config") is not None
else model_type
)
+ # Allow hf_overrides to override model_type before checking _CONFIG_REGISTRY
+ if (hf_overrides := kwargs.pop("hf_overrides", None)) is not None:
+ model_type = hf_overrides.get("model_type", model_type)
if model_type in _CONFIG_REGISTRY:
config_class = _CONFIG_REGISTRY[model_type]
@@ -310,7 +314,7 @@ def patch_rope_parameters(config: PretrainedConfig) -> None:
config.rope_parameters["rope_theta"] = rope_theta
# No RoPE parameters to patch
- if not hasattr(config, "rope_parameters"):
+ if getattr(config, "rope_parameters", None) is None:
return
# Add original_max_position_embeddings if present
@@ -351,7 +355,10 @@ def patch_rope_parameters_dict(rope_parameters: dict[str, Any]) -> None:
rope_parameters["rope_type"] = "longrope"
logger.warning("Replacing legacy rope_type 'su' with 'longrope'")
elif rope_parameters["rope_type"] == "mrope":
- assert "mrope_section" in rope_parameters
+ if "mrope_section" not in rope_parameters:
+ raise ValueError(
+ "Legacy rope_type 'mrope' requires 'mrope_section' in rope_parameters"
+ )
rope_parameters["rope_type"] = "default"
logger.warning("Replacing legacy rope_type 'mrope' with 'default'")
@@ -584,6 +591,7 @@ def get_config(
trust_remote_code=trust_remote_code,
revision=revision,
code_revision=code_revision,
+ hf_overrides=hf_overrides_kw,
**kwargs,
)
# Special architecture mapping check for GGUF models
@@ -915,11 +923,13 @@ def get_hf_text_config(config: PretrainedConfig):
"""
text_config = config.get_text_config()
- if text_config is not config:
- # The code operates under the assumption that text_config should have
- # `num_attention_heads` (among others). Assert here to fail early
- # if transformers config doesn't align with this assumption.
- assert hasattr(text_config, "num_attention_heads")
+ if text_config is not config and not hasattr(text_config, "num_attention_heads"):
+ raise ValueError(
+ "The text_config extracted from the model config does not have "
+ "`num_attention_heads` attribute. This indicates a mismatch "
+ "between the model config and vLLM's expectations. Please "
+ "ensure that the model config is compatible with vLLM."
+ )
return text_config
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index 109f2b6986514..0e8d167886935 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -48,6 +48,7 @@ from vllm.transformers_utils.configs.step3_vl import (
Step3VisionEncoderConfig,
Step3VLConfig,
)
+from vllm.transformers_utils.configs.tarsier2 import Tarsier2Config
from vllm.transformers_utils.configs.ultravox import UltravoxConfig
__all__ = [
@@ -81,4 +82,5 @@ __all__ = [
"Step3VisionEncoderConfig",
"Step3TextConfig",
"Qwen3NextConfig",
+ "Tarsier2Config",
]
diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py
index f5dc9ddfbc575..ce428e567c844 100644
--- a/vllm/transformers_utils/configs/eagle.py
+++ b/vllm/transformers_utils/configs/eagle.py
@@ -82,3 +82,9 @@ class EAGLEConfig(PretrainedConfig):
pretrained_model_name_or_path, **kwargs
)
return cls.from_dict(config_dict, **kwargs)
+
+ def to_json_string(self, use_diff: bool = True) -> str:
+ # we override use_diff to False as initializing
+ # EAGLEConfig with default arguments is not supported
+ del use_diff
+ return super().to_json_string(use_diff=False)
diff --git a/vllm/transformers_utils/configs/mistral.py b/vllm/transformers_utils/configs/mistral.py
index 966737aad0867..d59169d95f0c9 100644
--- a/vllm/transformers_utils/configs/mistral.py
+++ b/vllm/transformers_utils/configs/mistral.py
@@ -18,9 +18,31 @@ def adapt_config_dict(
if bool(config_dict.get("quantization")):
config_dict = _remap_mistral_quantization_args(config_dict)
+ is_moe = bool(config_dict.get("moe"))
+ is_mistral_large_3 = (
+ is_moe and (config_dict["moe"].get("num_shared_experts") or 0) > 0
+ )
if config_dict.get("model_type") == "mamba":
config_dict["architectures"] = ["Mamba2ForCausalLM"]
- elif bool(config_dict.get("moe")):
+ elif is_moe and is_mistral_large_3:
+ config_dict = _remap_moe_args(config_dict)
+ config_dict["model_type"] = "deepseek_v3"
+ config_dict["architectures"] = ["MistralLarge3ForCausalLM"]
+
+ assert "llama_4_scaling" in config_dict, (
+ "MistralLarge3 expect llama4 scaling config."
+ )
+ llama_4_scaling_config_keys = ["original_max_position_embeddings", "beta"]
+ assert all(
+ [
+ key in config_dict["llama_4_scaling"]
+ for key in llama_4_scaling_config_keys
+ ]
+ ), (
+ "llama_4_scaling config should define the keys: "
+ f"{','.join(llama_4_scaling_config_keys)}"
+ )
+ elif is_moe:
config_dict["architectures"] = ["MixtralForCausalLM"]
else:
config_dict["architectures"] = ["MistralForCausalLM"]
@@ -140,17 +162,20 @@ def _remap_general_mistral_args(config: dict) -> dict:
def _remap_mistral_quantization_args(config: dict) -> dict:
- quantization = config.get("quantization", {})
- if quantization.get("qformat_weight") == "fp8_e4m3":
- # This maps to the FP8 static per-tensor quantization scheme
- quantization_config = {"quant_method": "fp8", "activation_scheme": "static"}
- elif quantization.get("quant_method") == "compressed-tensors":
- # Pass through the quantization config to compressed-tensors
- quantization_config = quantization
- else:
- raise ValueError(f"Found unknown quantization='{quantization}' in config")
-
- config["quantization_config"] = quantization_config
+ if config.get("quantization"):
+ quantization = config.pop("quantization", {})
+ if quantization.get("qformat_weight") == "fp8_e4m3":
+ qscheme_act = quantization.get("qscheme_act")
+ assert qscheme_act in ("NO_SCALES", "TENSOR", None), (
+ "Only NO_SCALES and TENSOR (default) are supported for qscheme_act"
+ )
+ is_dynamic = qscheme_act == "NO_SCALES"
+ config["quantization_config"] = {
+ "quant_method": "fp8",
+ "activation_scheme": "dynamic" if is_dynamic else "static",
+ }
+ else:
+ raise ValueError(f"Found unknown quantization='{quantization}' in config")
return config
@@ -183,3 +208,28 @@ def _remap_mistral_audio_args(config: dict) -> dict:
if quant_config:
config["quantization_config"] = quant_config
return config
+
+
+def _remap_moe_args(config: dict) -> dict:
+ moe_config_map = {
+ "route_every_n": "moe_layer_freq",
+ "first_k_dense_replace": "first_k_dense_replace",
+ "num_experts_per_tok": "num_experts_per_tok",
+ "num_experts": "n_routed_experts",
+ "expert_hidden_dim": "moe_intermediate_size",
+ "routed_scale": "routed_scaling_factor",
+ "num_shared_experts": "n_shared_experts",
+ "num_expert_groups": "n_group",
+ "num_expert_groups_per_tok": "topk_group",
+ }
+ moe_config = config.get("moe", {})
+ for old_name, new_name in moe_config_map.items():
+ if old_name in moe_config:
+ value = moe_config.pop(old_name)
+ config[new_name] = value
+
+ config["topk_method"] = None
+ config["norm_topk_prob"] = True
+ config["scoring_func"] = "softmax"
+
+ return config
diff --git a/vllm/transformers_utils/configs/tarsier2.py b/vllm/transformers_utils/configs/tarsier2.py
new file mode 100644
index 0000000000000..12ebb4b7f602d
--- /dev/null
+++ b/vllm/transformers_utils/configs/tarsier2.py
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from transformers import Qwen2VLConfig
+
+
+class Tarsier2Config(Qwen2VLConfig):
+ """
+ Tarsier2's config.json is written such that AutoConfig.from_pretrained will create
+ a deeply nested config consisting of:
+
+ - LlavaConfig
+ - Qwen2VLConfig
+ - Qwen2VLTextConfig
+ - Qwen2VLVisionConfig
+ - Qwen2VLConfig
+ - Qwen2VLTextConfig
+ - Qwen2VLVisionConfig
+
+ When it should really just be a single Qwen2VLConfig.
+
+ This class is a hack to stop AutoConfig from creating the nested config structure.
+ """
+
+ model_type = "tarsier2"
diff --git a/vllm/transformers_utils/gguf_utils.py b/vllm/transformers_utils/gguf_utils.py
index cb1fc2d092e01..f3fd43c6ace51 100644
--- a/vllm/transformers_utils/gguf_utils.py
+++ b/vllm/transformers_utils/gguf_utils.py
@@ -2,10 +2,14 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""GGUF utility functions."""
+from functools import cache
+from os import PathLike
from pathlib import Path
import gguf
+import regex as re
from gguf.constants import Keys, VisionProjectorType
+from gguf.quants import GGMLQuantizationType
from transformers import Gemma3Config, PretrainedConfig, SiglipVisionConfig
from vllm.logger import init_logger
@@ -15,6 +19,73 @@ from .repo_utils import list_filtered_repo_files
logger = init_logger(__name__)
+@cache
+def check_gguf_file(model: str | PathLike) -> bool:
+ """Check if the file is a GGUF model."""
+ model = Path(model)
+ if not model.is_file():
+ return False
+ elif model.suffix == ".gguf":
+ return True
+
+ try:
+ with model.open("rb") as f:
+ header = f.read(4)
+
+ return header == b"GGUF"
+ except Exception as e:
+ logger.debug("Error reading file %s: %s", model, e)
+ return False
+
+
+@cache
+def is_remote_gguf(model: str | Path) -> bool:
+ """Check if the model is a remote GGUF model."""
+ pattern = r"^[a-zA-Z0-9][a-zA-Z0-9._-]*/[a-zA-Z0-9][a-zA-Z0-9._-]*:[A-Za-z0-9_+-]+$"
+ model = str(model)
+ if re.fullmatch(pattern, model):
+ _, quant_type = model.rsplit(":", 1)
+ return is_valid_gguf_quant_type(quant_type)
+ return False
+
+
+def is_valid_gguf_quant_type(gguf_quant_type: str) -> bool:
+ """Check if the quant type is a valid GGUF quant type."""
+ return getattr(GGMLQuantizationType, gguf_quant_type, None) is not None
+
+
+def split_remote_gguf(model: str | Path) -> tuple[str, str]:
+ """Split the model into repo_id and quant type."""
+ model = str(model)
+ if is_remote_gguf(model):
+ parts = model.rsplit(":", 1)
+ return (parts[0], parts[1])
+ raise ValueError(
+ f"Wrong GGUF model or invalid GGUF quant type: {model}.\n"
+ "- It should be in repo_id:quant_type format.\n"
+ f"- Valid GGMLQuantizationType values: {GGMLQuantizationType._member_names_}",
+ )
+
+
+def is_gguf(model: str | Path) -> bool:
+ """Check if the model is a GGUF model.
+
+ Args:
+ model: Model name, path, or Path object to check.
+
+ Returns:
+ True if the model is a GGUF model, False otherwise.
+ """
+ model = str(model)
+
+ # Check if it's a local GGUF file
+ if check_gguf_file(model):
+ return True
+
+ # Check if it's a remote GGUF model (repo_id:quant_type format)
+ return is_remote_gguf(model)
+
+
def detect_gguf_multimodal(model: str) -> Path | None:
"""Check if GGUF model has multimodal projector file.
diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py
index 63cdf63370342..e9864b0c1531d 100644
--- a/vllm/transformers_utils/processor.py
+++ b/vllm/transformers_utils/processor.py
@@ -18,7 +18,8 @@ from transformers.processing_utils import ProcessorMixin
from transformers.video_processing_utils import BaseVideoProcessor
from typing_extensions import TypeVar
-from vllm.transformers_utils.utils import convert_model_repo_to_path, is_gguf
+from vllm.transformers_utils.gguf_utils import is_gguf
+from vllm.transformers_utils.utils import convert_model_repo_to_path
from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
if TYPE_CHECKING:
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index 0911848c02e14..32999903b3480 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -2,17 +2,12 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import warnings
-from functools import lru_cache
-from typing import TYPE_CHECKING, Any
+from typing import Any
-from typing_extensions import assert_never
+from typing_extensions import deprecated
from vllm.logger import init_logger
-from vllm.tokenizers import TokenizerLike, get_tokenizer
-
-if TYPE_CHECKING:
- from vllm.config import ModelConfig
-
+from vllm.tokenizers import TokenizerLike
logger = init_logger(__name__)
@@ -28,22 +23,59 @@ def __getattr__(name: str):
)
return TokenizerLike
- if name == "get_cached_tokenizer":
- from vllm.tokenizers.hf import get_cached_tokenizer
+ if name == "get_tokenizer":
+ from vllm.tokenizers import get_tokenizer
warnings.warn(
- "`vllm.transformers_utils.tokenizer.get_cached_tokenizer` "
- "has been moved to `vllm.tokenizers.hf.get_cached_tokenizer`. "
+ "`vllm.transformers_utils.tokenizer.get_tokenizer` "
+ "has been moved to `vllm.tokenizers.get_tokenizer`. "
"The old name will be removed in v0.13.",
DeprecationWarning,
stacklevel=2,
)
- return get_cached_tokenizer
+ return get_tokenizer
+ if name == "cached_get_tokenizer":
+ from vllm.tokenizers import cached_get_tokenizer
+
+ warnings.warn(
+ "`vllm.transformers_utils.tokenizer.cached_get_tokenizer` "
+ "has been moved to `vllm.tokenizers.cached_get_tokenizer`. "
+ "The old name will be removed in v0.13.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+
+ return cached_get_tokenizer
+ if name == "cached_tokenizer_from_config":
+ from vllm.tokenizers import cached_tokenizer_from_config
+
+ warnings.warn(
+ "`vllm.transformers_utils.tokenizer.cached_tokenizer_from_config` "
+ "has been moved to `vllm.tokenizers.cached_tokenizer_from_config`. "
+ "The old name will be removed in v0.13.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+
+ return cached_tokenizer_from_config
+ if name == "init_tokenizer_from_configs":
+ from vllm.tokenizers import init_tokenizer_from_config
+
+ warnings.warn(
+ "`vllm.transformers_utils.tokenizer.init_tokenizer_from_configs` "
+ "has been moved to `vllm.tokenizers.init_tokenizer_from_config`. "
+ "The old name will be removed in v0.13.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+
+ return init_tokenizer_from_config
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+@deprecated("Will be removed in v0.13. Please use `tokenizer.decode()` instead.")
def decode_tokens(
tokenizer: TokenizerLike,
token_ids: list[int],
@@ -65,6 +97,7 @@ def decode_tokens(
return tokenizer.decode(token_ids, **kw_args)
+@deprecated("Will be removed in v0.13. Please use `tokenizer.encode()` instead.")
def encode_tokens(
tokenizer: TokenizerLike,
text: str,
@@ -92,37 +125,3 @@ def encode_tokens(
kw_args["add_special_tokens"] = add_special_tokens
return tokenizer.encode(text, **kw_args)
-
-
-cached_get_tokenizer = lru_cache(get_tokenizer)
-
-
-def cached_tokenizer_from_config(
- model_config: "ModelConfig",
- **kwargs: Any,
-):
- return cached_get_tokenizer(
- model_config.tokenizer,
- tokenizer_mode=model_config.tokenizer_mode,
- revision=model_config.tokenizer_revision,
- trust_remote_code=model_config.trust_remote_code,
- **kwargs,
- )
-
-
-def init_tokenizer_from_configs(model_config: "ModelConfig"):
- runner_type = model_config.runner_type
- if runner_type == "generate" or runner_type == "draft":
- truncation_side = "left"
- elif runner_type == "pooling":
- truncation_side = "right"
- else:
- assert_never(runner_type)
-
- return get_tokenizer(
- model_config.tokenizer,
- tokenizer_mode=model_config.tokenizer_mode,
- trust_remote_code=model_config.trust_remote_code,
- revision=model_config.tokenizer_revision,
- truncation_side=truncation_side,
- )
diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py
index 45a873c9f7001..96f292f4c949e 100644
--- a/vllm/transformers_utils/utils.py
+++ b/vllm/transformers_utils/utils.py
@@ -9,8 +9,6 @@ from os import PathLike
from pathlib import Path
from typing import Any
-from gguf import GGMLQuantizationType
-
import vllm.envs as envs
from vllm.logger import init_logger
@@ -29,76 +27,6 @@ def is_cloud_storage(model_or_path: str) -> bool:
return is_s3(model_or_path) or is_gcs(model_or_path)
-@cache
-def check_gguf_file(model: str | PathLike) -> bool:
- """Check if the file is a GGUF model."""
- model = Path(model)
- if not model.is_file():
- return False
- elif model.suffix == ".gguf":
- return True
-
- try:
- with model.open("rb") as f:
- header = f.read(4)
-
- return header == b"GGUF"
- except Exception as e:
- logger.debug("Error reading file %s: %s", model, e)
- return False
-
-
-@cache
-def is_remote_gguf(model: str | Path) -> bool:
- """Check if the model is a remote GGUF model."""
- model = str(model)
- return (
- (not is_cloud_storage(model))
- and (not model.startswith(("http://", "https://")))
- and ("/" in model and ":" in model)
- and is_valid_gguf_quant_type(model.rsplit(":", 1)[1])
- )
-
-
-def is_valid_gguf_quant_type(gguf_quant_type: str) -> bool:
- """Check if the quant type is a valid GGUF quant type."""
- return getattr(GGMLQuantizationType, gguf_quant_type, None) is not None
-
-
-def split_remote_gguf(model: str | Path) -> tuple[str, str]:
- """Split the model into repo_id and quant type."""
- model = str(model)
- if is_remote_gguf(model):
- parts = model.rsplit(":", 1)
- return (parts[0], parts[1])
- raise ValueError(
- "Wrong GGUF model or invalid GGUF quant type: %s.\n"
- "- It should be in repo_id:quant_type format.\n"
- "- Valid GGMLQuantizationType values: %s",
- model,
- GGMLQuantizationType._member_names_,
- )
-
-
-def is_gguf(model: str | Path) -> bool:
- """Check if the model is a GGUF model.
-
- Args:
- model: Model name, path, or Path object to check.
-
- Returns:
- True if the model is a GGUF model, False otherwise.
- """
- model = str(model)
-
- # Check if it's a local GGUF file
- if check_gguf_file(model):
- return True
-
- # Check if it's a remote GGUF model (repo_id:quant_type format)
- return is_remote_gguf(model)
-
-
def modelscope_list_repo_files(
repo_id: str,
revision: str | None = None,
diff --git a/vllm/v1/attention/backends/gdn_attn.py b/vllm/v1/attention/backends/gdn_attn.py
index 69b5a6fb48564..e921f8c3de073 100644
--- a/vllm/v1/attention/backends/gdn_attn.py
+++ b/vllm/v1/attention/backends/gdn_attn.py
@@ -254,17 +254,11 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata]
)
else:
has_initial_state = None
- num_actual_tokens = (
- num_prefill_tokens + num_decode_tokens + num_spec_decode_tokens
- )
- # prepare tensors for cudagraph
- #
- # With speculative decoding, the xgrammar backend may rollback tokens
- # and causing some sequences has less draft tokens than self.num_spec.
- #
- # In above cases, the max possible batch size for n tokens, can be
- # min(n, cudagraph_max_bs).
+ # Prepare tensors for cudagraph
+ # Note: m.num_actual_tokens is already padded by the model runner for CUDAGraph
+ batch_size = m.num_actual_tokens
+
if (
self.use_full_cuda_graph
and num_prefills == 0
@@ -272,9 +266,6 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata]
and num_spec_decodes <= self.decode_cudagraph_max_bs
and num_spec_decode_tokens <= self.decode_cudagraph_max_bs
):
- num_actual_tokens = self.vllm_config.pad_for_cudagraph(m.num_actual_tokens)
- batch_size = min(self.decode_cudagraph_max_bs, num_actual_tokens)
-
self.spec_state_indices_tensor[:num_spec_decodes].copy_(
spec_state_indices_tensor, non_blocking=True
)
@@ -319,9 +310,6 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata]
and num_spec_decodes == 0
and num_decodes <= self.decode_cudagraph_max_bs
):
- num_actual_tokens = self.vllm_config.pad_for_cudagraph(m.num_actual_tokens)
- batch_size = num_actual_tokens
-
self.non_spec_state_indices_tensor[:num_decodes].copy_(
non_spec_state_indices_tensor, non_blocking=True
)
@@ -344,7 +332,7 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata]
num_decode_tokens=num_decode_tokens,
num_spec_decodes=num_spec_decodes,
num_spec_decode_tokens=num_spec_decode_tokens,
- num_actual_tokens=num_actual_tokens,
+ num_actual_tokens=m.num_actual_tokens,
has_initial_state=has_initial_state,
spec_query_start_loc=spec_query_start_loc,
non_spec_query_start_loc=non_spec_query_start_loc,
diff --git a/vllm/v1/attention/backends/mamba1_attn.py b/vllm/v1/attention/backends/mamba1_attn.py
index 8e949e53330c1..fcda6134016ba 100644
--- a/vllm/v1/attention/backends/mamba1_attn.py
+++ b/vllm/v1/attention/backends/mamba1_attn.py
@@ -31,7 +31,6 @@ class Mamba1AttentionMetadata:
num_prefill_tokens: int
num_decodes: int
num_decode_tokens: int
- num_padded_decodes: int
block_idx_last_scheduled_token: torch.Tensor # shape: [batch,]
block_idx_first_scheduled_token_p: torch.Tensor # shape: [batch,]
@@ -68,7 +67,6 @@ class Mamba1AttentionMetadataBuilder(
has_initial_states_p = None
query_start_loc_p = None
- padded_decodes = num_decodes
num_computed_tokens, num_computed_tokens_p = None, None
block_idx_first_scheduled_token = None
block_idx_first_scheduled_token_p = None
@@ -125,11 +123,10 @@ class Mamba1AttentionMetadataBuilder(
and num_decodes <= self.decode_cudagraph_max_bs
and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
):
- padded_decodes = self.vllm_config.pad_for_cudagraph(num_decodes)
self.state_indices_tensor[:num_decodes].copy_(
state_indices_tensor, non_blocking=True
)
- state_indices_tensor = self.state_indices_tensor[:padded_decodes]
+ state_indices_tensor = self.state_indices_tensor[:num_decode_tokens]
state_indices_tensor[num_decodes:] = PAD_SLOT_ID
if self.vllm_config.cache_config.enable_prefix_caching:
@@ -137,17 +134,15 @@ class Mamba1AttentionMetadataBuilder(
block_idx_last_scheduled_token, non_blocking=True
)
block_idx_last_scheduled_token = self.block_idx_last_scheduled_token[
- :padded_decodes
+ :num_decode_tokens
]
- block_idx_last_scheduled_token[num_decodes:] = 0
self.block_idx_last_computed_token[:num_decodes].copy_(
block_idx_last_computed_token, non_blocking=True
)
block_idx_last_computed_token = self.block_idx_last_computed_token[
- :padded_decodes
+ :num_decode_tokens
]
- block_idx_last_computed_token[num_decodes:] = 0
return Mamba1AttentionMetadata(
query_start_loc_p=query_start_loc_p,
@@ -157,7 +152,6 @@ class Mamba1AttentionMetadataBuilder(
num_prefill_tokens=num_prefill_tokens,
num_decodes=num_decodes,
num_decode_tokens=num_decode_tokens,
- num_padded_decodes=padded_decodes,
block_idx_last_scheduled_token=block_idx_last_scheduled_token,
block_idx_first_scheduled_token_p=block_idx_first_scheduled_token_p,
block_idx_last_computed_token=block_idx_last_computed_token,
diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py
index 888734e5d2b6b..bf1d8f09ab0ac 100644
--- a/vllm/v1/attention/backends/mamba2_attn.py
+++ b/vllm/v1/attention/backends/mamba2_attn.py
@@ -10,7 +10,6 @@ from vllm.config import VllmConfig
from vllm.utils.math_utils import cdiv
from vllm.v1.attention.backends.mamba_attn import BaseMambaAttentionMetadataBuilder
from vllm.v1.attention.backends.utils import (
- PAD_SLOT_ID,
CommonAttentionMetadata,
compute_causal_conv1d_metadata,
split_decodes_and_prefills,
@@ -304,30 +303,25 @@ class Mamba2AttentionMetadataBuilder(
num_decodes <= self.decode_cudagraph_max_bs
and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
):
- # Pad state tensor for CUDA graph
- num_input_tokens = self.vllm_config.pad_for_cudagraph(num_decodes)
self.state_indices_tensor[:num_decodes].copy_(
state_indices_tensor, non_blocking=True
)
- state_indices_tensor = self.state_indices_tensor[:num_input_tokens]
- state_indices_tensor[num_decodes:] = PAD_SLOT_ID
+ state_indices_tensor = self.state_indices_tensor[:num_decode_tokens]
if self.vllm_config.cache_config.enable_prefix_caching:
self.block_idx_last_scheduled_token[:num_decodes].copy_(
block_idx_last_scheduled_token, non_blocking=True
)
block_idx_last_scheduled_token = self.block_idx_last_scheduled_token[
- :num_input_tokens
+ :num_decode_tokens
]
- block_idx_last_scheduled_token[num_decodes:] = 0
self.block_idx_last_computed_token[:num_decodes].copy_(
block_idx_last_computed_token, non_blocking=True
)
block_idx_last_computed_token = self.block_idx_last_computed_token[
- :num_input_tokens
+ :num_decode_tokens
]
- block_idx_last_computed_token[num_decodes:] = 0
attn_metadata = Mamba2AttentionMetadata(
num_prefills=num_prefills,
diff --git a/vllm/v1/attention/backends/short_conv_attn.py b/vllm/v1/attention/backends/short_conv_attn.py
index de0cb73db0917..c8fe0faf71088 100644
--- a/vllm/v1/attention/backends/short_conv_attn.py
+++ b/vllm/v1/attention/backends/short_conv_attn.py
@@ -83,11 +83,10 @@ class ShortConvAttentionMetadataBuilder(
and num_decodes <= self.decode_cudagraph_max_bs
and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
):
- num_input_tokens = self.vllm_config.pad_for_cudagraph(num_decodes)
self.state_indices_tensor[:num_decodes].copy_(
state_indices_tensor, non_blocking=True
)
- state_indices_tensor = self.state_indices_tensor[:num_input_tokens]
+ state_indices_tensor = self.state_indices_tensor[:num_decode_tokens]
state_indices_tensor[num_decodes:] = PAD_SLOT_ID
attn_metadata = ShortConvAttentionMetadata(
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 3823384881cd3..33e8c81514c5f 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -230,6 +230,9 @@ class KVCacheManager:
delay_cache_blocks: Whether to skip caching the blocks. This is
used by P/D when allocating blocks used in a KV transfer
which will complete in a future step.
+ num_encoder_tokens: The number of encoder tokens to allocate for
+                cross-attention in encoder-decoder models (e.g., Whisper).
+ For decoder-only models, this should be 0.
Blocks layout:
```
diff --git a/vllm/v1/core/sched/async_scheduler.py b/vllm/v1/core/sched/async_scheduler.py
index 7916fafdae1fb..df61eebb395e5 100644
--- a/vllm/v1/core/sched/async_scheduler.py
+++ b/vllm/v1/core/sched/async_scheduler.py
@@ -45,6 +45,12 @@ class AsyncScheduler(Scheduler):
request: Request,
new_token_ids: list[int],
) -> tuple[list[int], bool]:
+ if request.discard_latest_async_tokens:
+ # If the request is force preempted in reset_prefix_cache, we
+ # should discard the latest async token.
+ request.discard_latest_async_tokens = False
+ return [], False
+
status_before_update = request.status
new_token_ids, stopped = super()._update_request_with_output(
request, new_token_ids
diff --git a/vllm/v1/core/sched/interface.py b/vllm/v1/core/sched/interface.py
index 88d99d9402821..c2f503ef2354e 100644
--- a/vllm/v1/core/sched/interface.py
+++ b/vllm/v1/core/sched/interface.py
@@ -152,10 +152,16 @@ class SchedulerInterface(ABC):
return self.has_unfinished_requests() or self.has_finished_requests()
@abstractmethod
- def reset_prefix_cache(self) -> bool:
+ def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool:
"""Reset the prefix cache for KV cache.
This is particularly required when the model weights are live-updated.
+
+ Args:
+ reset_running_requests: If True, all the running requests will be
+ preempted and moved to the waiting queue. Otherwise, this method
+ will only reset the KV prefix cache when there is no running request
+ taking KV cache.
"""
raise NotImplementedError
diff --git a/vllm/v1/core/sched/request_queue.py b/vllm/v1/core/sched/request_queue.py
index 7bc1010db23a2..a00ca1912b0f3 100644
--- a/vllm/v1/core/sched/request_queue.py
+++ b/vllm/v1/core/sched/request_queue.py
@@ -137,31 +137,30 @@ class PriorityRequestQueue(RequestQueue):
"""
A priority queue that supports heap operations.
- Requests with a smaller value of `priority` are processed first.
+ Respects the ordering defined in the Request class, where
+ requests with a smaller value of `priority` are processed first.
If multiple requests have the same priority, the one with the earlier
`arrival_time` is processed first.
"""
def __init__(self) -> None:
- self._heap: list[tuple[int, float, Request]] = []
+ self._heap: list[Request] = []
def add_request(self, request: Request) -> None:
"""Add a request to the queue according to priority policy."""
- heapq.heappush(self._heap, (request.priority, request.arrival_time, request))
+ heapq.heappush(self._heap, request)
def pop_request(self) -> Request:
"""Pop a request from the queue according to priority policy."""
if not self._heap:
raise IndexError("pop from empty heap")
- _, _, request = heapq.heappop(self._heap)
- return request
+ return heapq.heappop(self._heap)
def peek_request(self) -> Request:
"""Peek at the next request in the queue without removing it."""
if not self._heap:
raise IndexError("peek from empty heap")
- _, _, request = self._heap[0]
- return request
+ return self._heap[0]
def prepend_request(self, request: Request) -> None:
"""Add a request to the queue according to priority policy.
@@ -180,15 +179,13 @@ class PriorityRequestQueue(RequestQueue):
def remove_request(self, request: Request) -> None:
"""Remove a specific request from the queue."""
- self._heap = [(p, t, r) for p, t, r in self._heap if r != request]
+ self._heap.remove(request)
heapq.heapify(self._heap)
def remove_requests(self, requests: Iterable[Request]) -> None:
"""Remove multiple specific requests from the queue."""
- requests_to_remove = set(requests)
- self._heap = [
- (p, t, r) for p, t, r in self._heap if r not in requests_to_remove
- ]
+ requests_to_remove = requests if isinstance(requests, set) else set(requests)
+ self._heap = [r for r in self._heap if r not in requests_to_remove]
heapq.heapify(self._heap)
def __bool__(self) -> bool:
@@ -203,8 +200,7 @@ class PriorityRequestQueue(RequestQueue):
"""Iterate over the queue according to priority policy."""
heap_copy = self._heap[:]
while heap_copy:
- _, _, request = heapq.heappop(heap_copy)
- yield request
+ yield heapq.heappop(heap_copy)
def __reversed__(self) -> Iterator[Request]:
"""Iterate over the queue in reverse priority order."""
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index c1ead200ba8d6..52b98ef654592 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -347,17 +347,7 @@ class Scheduler(SchedulerInterface):
else:
preempted_req = self.running.pop()
- self.kv_cache_manager.free(preempted_req)
- self.encoder_cache_manager.free(preempted_req)
- preempted_req.status = RequestStatus.PREEMPTED
- preempted_req.num_computed_tokens = 0
- preempted_req.num_preemptions += 1
- if self.log_stats:
- preempted_req.record_event(
- EngineCoreEventType.PREEMPTED, scheduled_timestamp
- )
-
- self.waiting.prepend_request(preempted_req)
+ self._preempt_request(preempted_req, scheduled_timestamp)
preempted_reqs.append(preempted_req)
if preempted_req == request:
# No more request to preempt. Cannot schedule this request.
@@ -756,6 +746,30 @@ class Scheduler(SchedulerInterface):
self._update_after_schedule(scheduler_output)
return scheduler_output
+ def _preempt_request(
+ self,
+ request: Request,
+ timestamp: float,
+ ) -> None:
+ """Preempt a request and put it back to the waiting queue.
+
+ NOTE: The request should be popped from the running queue outside of this
+ method.
+ """
+ assert request.status == RequestStatus.RUNNING, (
+ "Only running requests can be preempted"
+ )
+ self.kv_cache_manager.free(request)
+ self.encoder_cache_manager.free(request)
+ request.status = RequestStatus.PREEMPTED
+ request.num_computed_tokens = 0
+ request.num_preemptions += 1
+ if self.log_stats:
+ request.record_event(EngineCoreEventType.PREEMPTED, timestamp)
+
+ # Put the request back to the waiting queue.
+ self.waiting.prepend_request(request)
+
def _update_after_schedule(
self,
scheduler_output: SchedulerOutput,
@@ -1362,8 +1376,45 @@ class Scheduler(SchedulerInterface):
def has_finished_requests(self) -> bool:
return len(self.finished_req_ids) > 0
- def reset_prefix_cache(self) -> bool:
- return self.kv_cache_manager.reset_prefix_cache()
+ def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool:
+ """Reset the KV prefix cache.
+
+ If reset_running_requests is True, all the running requests will be
+ preempted and moved to the waiting queue.
+ Otherwise, this method will only reset the KV prefix cache when there
+        are no running requests taking KV cache.
+ """
+ if reset_running_requests:
+ # For logging.
+ timestamp = time.monotonic()
+            # Invalidate the KV caches of all currently running requests by
+            # pushing them to the waiting queue. This way, we can drop the ref
+            # count of all the kv blocks to 0 and make sure the reset succeeds.
+ # Preempt in reverse order so the requests will be added back to the
+ # running queue in FIFO order.
+ while self.running:
+ request = self.running.pop()
+ self._preempt_request(request, timestamp)
+ # NOTE(zhuohan): For async scheduling, we need to discard the latest
+ # output token on the fly to avoid a redundant repetitive output token.
+ request.num_output_placeholders = 0
+ request.discard_latest_async_tokens = True
+
+ # Clear scheduled request ids cache. Since we are forcing preemption
+ # + resumption in the same step, we must act as if these requests were
+ # not scheduled in the prior step. They will be flushed from the
+ # persistent batch in the model runner.
+ self.prev_step_scheduled_req_ids.clear()
+
+ reset_successful = self.kv_cache_manager.reset_prefix_cache()
+ if reset_running_requests and not reset_successful:
+ raise RuntimeError(
+ "Failed to reset KV cache even when all the running requests are "
+ "preempted and moved to the waiting queue. This is likely due to "
+ "the presence of running requests waiting for remote KV transfer, "
+ "which is not supported yet."
+ )
+ return reset_successful
def make_stats(
self,
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index d0708a8a046d1..ec5d6e95ce3aa 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -26,10 +26,9 @@ from vllm.plugins.io_processors import get_io_processor
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.tasks import SupportedTask
-from vllm.tokenizers import TokenizerLike
+from vllm.tokenizers import TokenizerLike, init_tokenizer_from_config
from vllm.tracing import init_tracer
from vllm.transformers_utils.config import maybe_register_config_serialize_by_value
-from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs
from vllm.usage.usage_lib import UsageContext
from vllm.utils.async_utils import cancel_task_threadsafe
from vllm.utils.collection_utils import as_list
@@ -112,7 +111,7 @@ class AsyncLLM(EngineClient):
if self.model_config.skip_tokenizer_init:
tokenizer = None
else:
- tokenizer = init_tokenizer_from_configs(self.model_config)
+ tokenizer = init_tokenizer_from_config(self.model_config)
self.input_processor = InputProcessor(self.vllm_config, tokenizer)
self.io_processor = get_io_processor(
@@ -750,8 +749,8 @@ class AsyncLLM(EngineClient):
self.input_processor.clear_mm_cache()
await self.engine_core.reset_mm_cache_async()
- async def reset_prefix_cache(self) -> None:
- await self.engine_core.reset_prefix_cache_async()
+ async def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool:
+ return await self.engine_core.reset_prefix_cache_async(reset_running_requests)
async def sleep(self, level: int = 1) -> None:
await self.reset_prefix_cache()
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index e3a5f51a8fc56..61b8422dd6633 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -483,8 +483,8 @@ class EngineCore:
self.model_executor.reset_mm_cache()
- def reset_prefix_cache(self):
- self.scheduler.reset_prefix_cache()
+ def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool:
+ return self.scheduler.reset_prefix_cache(reset_running_requests)
def sleep(self, level: int = 1):
self.model_executor.sleep(level)
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index 9b440505bd9dc..afa0593921d06 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -138,7 +138,7 @@ class EngineCoreClient(ABC):
def reset_mm_cache(self) -> None:
raise NotImplementedError
- def reset_prefix_cache(self) -> None:
+ def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool:
raise NotImplementedError
def sleep(self, level: int = 1) -> None:
@@ -208,7 +208,9 @@ class EngineCoreClient(ABC):
async def reset_mm_cache_async(self) -> None:
raise NotImplementedError
- async def reset_prefix_cache_async(self) -> None:
+ async def reset_prefix_cache_async(
+ self, reset_running_requests: bool = False
+ ) -> bool:
raise NotImplementedError
async def sleep_async(self, level: int = 1) -> None:
@@ -287,8 +289,8 @@ class InprocClient(EngineCoreClient):
def reset_mm_cache(self) -> None:
self.engine_core.reset_mm_cache()
- def reset_prefix_cache(self) -> None:
- self.engine_core.reset_prefix_cache()
+ def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool:
+ return self.engine_core.reset_prefix_cache(reset_running_requests)
def sleep(self, level: int = 1) -> None:
self.engine_core.sleep(level)
@@ -751,8 +753,8 @@ class SyncMPClient(MPClient):
def reset_mm_cache(self) -> None:
self.call_utility("reset_mm_cache")
- def reset_prefix_cache(self) -> None:
- self.call_utility("reset_prefix_cache")
+ def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool:
+ return self.call_utility("reset_prefix_cache", reset_running_requests)
def add_lora(self, lora_request: LoRARequest) -> bool:
return self.call_utility("add_lora", lora_request)
@@ -955,8 +957,12 @@ class AsyncMPClient(MPClient):
async def reset_mm_cache_async(self) -> None:
await self.call_utility_async("reset_mm_cache")
- async def reset_prefix_cache_async(self) -> None:
- await self.call_utility_async("reset_prefix_cache")
+ async def reset_prefix_cache_async(
+ self, reset_running_requests: bool = False
+ ) -> bool:
+ return await self.call_utility_async(
+ "reset_prefix_cache", reset_running_requests
+ )
async def sleep_async(self, level: int = 1) -> None:
await self.call_utility_async("sleep", level)
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index a3bde7ba8d64d..d21cdf04ead26 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -23,9 +23,8 @@ from vllm.plugins.io_processors import get_io_processor
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.tasks import SupportedTask
-from vllm.tokenizers import TokenizerLike
+from vllm.tokenizers import TokenizerLike, init_tokenizer_from_config
from vllm.tracing import init_tracer
-from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs
from vllm.usage.usage_lib import UsageContext
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core_client import EngineCoreClient
@@ -87,7 +86,7 @@ class LLMEngine:
if self.model_config.skip_tokenizer_init:
tokenizer = None
else:
- tokenizer = init_tokenizer_from_configs(self.model_config)
+ tokenizer = init_tokenizer_from_config(self.model_config)
self.input_processor = InputProcessor(self.vllm_config, tokenizer)
self.io_processor = get_io_processor(
@@ -329,8 +328,8 @@ class LLMEngine:
self.input_processor.clear_mm_cache()
self.engine_core.reset_mm_cache()
- def reset_prefix_cache(self):
- self.engine_core.reset_prefix_cache()
+ def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool:
+ return self.engine_core.reset_prefix_cache(reset_running_requests)
def sleep(self, level: int = 1):
self.engine_core.sleep(level)
diff --git a/vllm/v1/metrics/ray_wrappers.py b/vllm/v1/metrics/ray_wrappers.py
index a319ffb1d2573..4b46669d5d3bf 100644
--- a/vllm/v1/metrics/ray_wrappers.py
+++ b/vllm/v1/metrics/ray_wrappers.py
@@ -7,37 +7,55 @@ from vllm.v1.metrics.loggers import PrometheusStatLogger
from vllm.v1.spec_decode.metrics import SpecDecodingProm
try:
+ from ray import serve as ray_serve
from ray.util import metrics as ray_metrics
from ray.util.metrics import Metric
except ImportError:
ray_metrics = None
+ ray_serve = None
import regex as re
+def _get_replica_id() -> str | None:
+ """Get the current Ray Serve replica ID, or None if not in a Serve context."""
+ if ray_serve is None:
+ return None
+ try:
+ return ray_serve.get_replica_context().replica_id.unique_id
+ except ray_serve.exceptions.RayServeException:
+ return None
+
+
class RayPrometheusMetric:
def __init__(self):
if ray_metrics is None:
raise ImportError("RayPrometheusMetric requires Ray to be installed.")
-
self.metric: Metric = None
+ @staticmethod
+ def _get_tag_keys(labelnames: list[str] | None) -> tuple[str, ...]:
+ labels = list(labelnames) if labelnames else []
+ labels.append("ReplicaId")
+ return tuple(labels)
+
def labels(self, *labels, **labelskwargs):
+ if labels:
+ # -1 because ReplicaId was added automatically
+ expected = len(self.metric._tag_keys) - 1
+ if len(labels) != expected:
+ raise ValueError(
+ "Number of labels must match the number of tag keys. "
+ f"Expected {expected}, got {len(labels)}"
+ )
+ labelskwargs.update(zip(self.metric._tag_keys, labels))
+
+ labelskwargs["ReplicaId"] = _get_replica_id() or ""
+
if labelskwargs:
for k, v in labelskwargs.items():
if not isinstance(v, str):
labelskwargs[k] = str(v)
-
self.metric.set_default_tags(labelskwargs)
-
- if labels:
- if len(labels) != len(self.metric._tag_keys):
- raise ValueError(
- "Number of labels must match the number of tag keys. "
- f"Expected {len(self.metric._tag_keys)}, got {len(labels)}"
- )
-
- self.metric.set_default_tags(dict(zip(self.metric._tag_keys, labels)))
-
return self
@staticmethod
@@ -71,10 +89,14 @@ class RayGaugeWrapper(RayPrometheusMetric):
# "mostrecent", "all", "sum" do not apply. This logic can be manually
# implemented at the observability layer (Prometheus/Grafana).
del multiprocess_mode
- labelnames_tuple = tuple(labelnames) if labelnames else None
+
+ tag_keys = self._get_tag_keys(labelnames)
name = self._get_sanitized_opentelemetry_name(name)
+
self.metric = ray_metrics.Gauge(
- name=name, description=documentation, tag_keys=labelnames_tuple
+ name=name,
+ description=documentation,
+ tag_keys=tag_keys,
)
def set(self, value: int | float):
@@ -95,10 +117,12 @@ class RayCounterWrapper(RayPrometheusMetric):
documentation: str | None = "",
labelnames: list[str] | None = None,
):
- labelnames_tuple = tuple(labelnames) if labelnames else None
+ tag_keys = self._get_tag_keys(labelnames)
name = self._get_sanitized_opentelemetry_name(name)
self.metric = ray_metrics.Counter(
- name=name, description=documentation, tag_keys=labelnames_tuple
+ name=name,
+ description=documentation,
+ tag_keys=tag_keys,
)
def inc(self, value: int | float = 1.0):
@@ -118,13 +142,14 @@ class RayHistogramWrapper(RayPrometheusMetric):
labelnames: list[str] | None = None,
buckets: list[float] | None = None,
):
- labelnames_tuple = tuple(labelnames) if labelnames else None
+ tag_keys = self._get_tag_keys(labelnames)
name = self._get_sanitized_opentelemetry_name(name)
+
boundaries = buckets if buckets else []
self.metric = ray_metrics.Histogram(
name=name,
description=documentation,
- tag_keys=labelnames_tuple,
+ tag_keys=tag_keys,
boundaries=boundaries,
)
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index 366cdadf5a583..33762fe34e64f 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -93,7 +93,12 @@ class Request:
if self.prompt_token_ids is not None
else [0] * self.num_prompt_tokens
)
- self.num_output_placeholders = 0 # Used in async scheduling.
+
+ # Used in async scheduling.
+ self.num_output_placeholders = 0
+ # Used in forced preemption (reset_prefix_cache) with async scheduling.
+ self.discard_latest_async_tokens = False
+
self.spec_token_ids: list[int] = []
self.num_computed_tokens = 0
self.cache_salt: str | None = cache_salt
@@ -222,6 +227,19 @@ class Request:
events, self.events = self.events, []
return events
+ def __lt__(self, other: "Request") -> bool:
+ """
+ Compare two requests based on priority, arrival time, and request ID.
+ Used in priority scheduling.
+ """
+ if self.priority != other.priority:
+ return self.priority < other.priority
+ if self.arrival_time != other.arrival_time:
+ return self.arrival_time < other.arrival_time
+ if self.request_id != other.request_id:
+ return self.request_id < other.request_id
+ return id(self) < id(other)
+
class RequestStatus(enum.IntEnum):
"""Status of a request."""
diff --git a/vllm/v1/sample/logits_processor/builtin.py b/vllm/v1/sample/logits_processor/builtin.py
index 4ee7dc2880c8c..82743f72b0310 100644
--- a/vllm/v1/sample/logits_processor/builtin.py
+++ b/vllm/v1/sample/logits_processor/builtin.py
@@ -110,7 +110,7 @@ class MinPLogitsProcessor(LogitsProcessor):
# Identify valid tokens using threshold comparison
invalid_token_mask = probability_values < adjusted_min_p
# Apply mask using boolean indexing
- logits[invalid_token_mask] = -float("inf")
+ logits.masked_fill_(invalid_token_mask, -float("inf"))
return logits
@@ -178,6 +178,10 @@ class MinTokensLogitsProcessor(LogitsProcessor):
self._device_tensor([], torch.int32),
)
+ self.neg_inf_tensor = torch.tensor(
+ -float("inf"), dtype=torch.float32, device=self.device
+ )
+
def is_argmax_invariant(self) -> bool:
"""By censoring stop tokens, min-tokens can change the outcome
of the argmax operation in greedy sampling."""
@@ -229,7 +233,7 @@ class MinTokensLogitsProcessor(LogitsProcessor):
def apply(self, logits: torch.Tensor) -> torch.Tensor:
if self.min_toks:
# Inhibit EOS token for requests which have not reached min length
- logits[self.logits_slice] = -float("inf")
+ logits.index_put_(self.logits_slice, self.neg_inf_tensor)
return logits
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index d7111d52dd8a1..1c7845a14b742 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -1016,6 +1016,10 @@ class EagleProposer:
"Qwen3VLForConditionalGeneration",
]:
self.model.config.image_token_index = target_model.config.image_token_id
+ elif self.get_model_name(target_model) == "PixtralForConditionalGeneration":
+ self.model.config.image_token_index = (
+ target_model.config.vision_config.image_token_id
+ )
else:
self.model.config.image_token_index = (
target_model.config.image_token_index
diff --git a/vllm/v1/spec_decode/ngram_proposer.py b/vllm/v1/spec_decode/ngram_proposer.py
index 10b3f0aa040e5..1273ca12c3600 100644
--- a/vllm/v1/spec_decode/ngram_proposer.py
+++ b/vllm/v1/spec_decode/ngram_proposer.py
@@ -196,9 +196,9 @@ def batch_propose_numba(
k=k,
)
- valid_ngram_num_drafts[i] = drafter_output.shape[0]
+ valid_ngram_num_drafts[idx] = drafter_output.shape[0]
if len(drafter_output):
- valid_ngram_draft[i, : drafter_output.shape[0]] = drafter_output
+ valid_ngram_draft[idx, : drafter_output.shape[0]] = drafter_output
@jit(nopython=True)
diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
index 029129cf1a475..d087d28b1dae3 100644
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -7,7 +7,7 @@ from typing import TYPE_CHECKING
from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParserManager
-from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs
+from vllm.tokenizers import init_tokenizer_from_config
from vllm.utils.import_utils import LazyLoader
from vllm.v1.structured_output.backend_guidance import GuidanceBackend
from vllm.v1.structured_output.backend_types import (
@@ -61,7 +61,7 @@ class StructuredOutputManager:
# of CPUs.
max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
self.executor = ThreadPoolExecutor(max_workers=max_workers)
- self.tokenizer = init_tokenizer_from_configs(
+ self.tokenizer = init_tokenizer_from_config(
model_config=self.vllm_config.model_config
)
reasoning_parser = (
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index e7991baeaa1b8..516c76a5e4b15 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -482,6 +482,8 @@ class InputBatch:
self.generators.pop(req_index, None)
self.num_logprobs.pop(req_id, None)
self.in_progress_prompt_logprobs_cpu.pop(req_id, None)
+ if self.prev_req_id_to_index is not None:
+ self.prev_req_id_to_index.pop(req_id, None)
self.has_allowed_token_ids.discard(req_id)
if self.allowed_token_ids_mask_cpu_tensor is not None:
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 2218e4f023f92..8c22ada029b1a 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -774,7 +774,14 @@ class GPUModelRunner(
# they will be scheduled again sometime in the future.
scheduled_req_ids = scheduler_output.num_scheduled_tokens.keys()
cached_req_ids = self.input_batch.req_id_to_index.keys()
- unscheduled_req_ids = cached_req_ids - scheduled_req_ids
+ resumed_req_ids = scheduler_output.scheduled_cached_reqs.resumed_req_ids
+        # NOTE(zhuohan): cached_req_ids and resumed_req_ids are usually disjoint,
+        # so `cached_req_ids - (scheduled_req_ids - resumed_req_ids)` usually
+        # equals `cached_req_ids - scheduled_req_ids`; the exception is the
+        # forced-preemption case in reset_prefix_cache. In that case we include
+        # the resumed_req_ids in the unscheduled set so that they get cleared
+        # from the persistent batch before being re-scheduled as resumed requests.
+ unscheduled_req_ids = cached_req_ids - (scheduled_req_ids - resumed_req_ids)
# NOTE(woosuk): The persistent batch optimization assumes that
# consecutive batches contain mostly the same requests. If batches
# have low request overlap (e.g., alternating between two distinct
@@ -2432,16 +2439,13 @@ class GPUModelRunner(
]:
num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
is_first_rank = get_pp_group().is_first_rank
+ is_encoder_decoder = self.model_config.is_encoder_decoder
# _prepare_inputs may reorder the batch, so we must gather multi
# modal outputs after that to ensure the correct order
ec_connector_output = None
- if (
- self.supports_mm_inputs
- and is_first_rank
- and not self.model_config.is_encoder_decoder
- ):
+ if self.supports_mm_inputs and is_first_rank and not is_encoder_decoder:
# Run the multimodal encoder if any.
with self.maybe_get_ec_connector_output(
scheduler_output,
@@ -2519,10 +2523,7 @@ class GPUModelRunner(
num_input_tokens, intermediate_tensors, True
)
- if (
- self.model_config.is_encoder_decoder
- and scheduler_output.scheduled_encoder_inputs
- ):
+ if is_encoder_decoder and scheduler_output.scheduled_encoder_inputs:
# Run the encoder, just like we do with other multimodal inputs.
# For an encoder-decoder model, our processing here is a bit
# simpler, because the outputs are just passed to the decoder.
@@ -3999,7 +4000,7 @@ class GPUModelRunner(
num_reqs=num_reqs_padded,
max_query_len=max_query_len,
ubatch_slices=ubatch_slices,
- for_cudagraph_capture=True,
+ for_cudagraph_capture=is_graph_capturing,
)
with self.maybe_dummy_run_with_lora(