diff --git a/.buildkite/ci_config.yaml b/.buildkite/ci_config.yaml
new file mode 100644
index 0000000000000..199c33159fde3
--- /dev/null
+++ b/.buildkite/ci_config.yaml
@@ -0,0 +1,24 @@
+name: vllm_ci
+job_dirs: # NOTE(review): presumably the directories searched for step definitions (image_build/ holds image_build.yaml) - confirm with the pipeline generator
+ - ".buildkite/test_areas"
+ - ".buildkite/image_build"
+run_all_patterns: # NOTE(review): looks like the changed paths that force every job to run; the matching rule is not visible here
+ - "docker/Dockerfile"
+ - "CMakeLists.txt"
+ - "requirements/common.txt"
+ - "requirements/cuda.txt"
+ - "requirements/build.txt"
+ - "requirements/test.txt"
+ - "setup.py"
+ - "csrc/"
+ - "cmake/"
+run_all_exclude_patterns: # each entry extends an entry of run_all_patterns (e.g. "csrc/cpu/" extends "csrc/"); presumably exceptions to it
+ - "docker/Dockerfile."
+ - "csrc/cpu/"
+ - "csrc/rocm/"
+ - "cmake/hipify.py"
+ - "cmake/cpu_extension.cmake"
+registries: public.ecr.aws/q9t5s3a7
+repositories: # NOTE(review): presumably picked per build type (main vs. pre-merge) - confirm
+ main: "vllm-ci-postmerge-repo"
+ premerge: "vllm-ci-test-repo"
diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py
deleted file mode 100644
index bbed80ebe8476..0000000000000
--- a/.buildkite/generate_index.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import argparse
-import os
-
-template = """
-
-
- Links for vLLM
- {x86_wheel}
- {arm_wheel}
-
-
-"""
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--wheel", help="The wheel path.", required=True)
-args = parser.parse_args()
-
-filename = os.path.basename(args.wheel)
-
-with open("index.html", "w") as f:
- print(f"Generated index.html for {args.wheel}")
- # sync the abi tag with .buildkite/scripts/upload-wheels.sh
- if "x86_64" in filename:
- x86_wheel = filename
- arm_wheel = filename.replace("x86_64", "aarch64").replace(
- "manylinux1", "manylinux2014"
- )
- elif "aarch64" in filename:
- x86_wheel = filename.replace("aarch64", "x86_64").replace(
- "manylinux2014", "manylinux1"
- )
- arm_wheel = filename
- else:
- raise ValueError(f"Unsupported wheel: {filename}")
- # cloudfront requires escaping the '+' character
- f.write(
- template.format(
- x86_wheel=x86_wheel,
- x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
- arm_wheel=arm_wheel,
- arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
- )
- )
diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh
new file mode 100755
index 0000000000000..9a2384e524b63
--- /dev/null
+++ b/.buildkite/image_build/image_build.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 8 ]]; then
+ echo "Usage: $0 "
+ exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+BRANCH=$4
+VLLM_USE_PRECOMPILED=$5
+VLLM_MERGE_BASE_COMMIT=$6
+CACHE_FROM=$7
+CACHE_TO=$8
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
+
+# docker buildx
+docker buildx create --name vllm-builder --driver docker-container --use
+docker buildx inspect --bootstrap
+docker buildx ls
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
+ echo "Image not found, proceeding with build..."
+else
+ echo "Image found"
+ exit 0
+fi
+
+if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
+ merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
+else
+ merge_base_commit_build_args=""
+fi
+
+# build
+docker buildx build --file docker/Dockerfile \
+ --build-arg max_jobs=16 \
+ --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+ --build-arg USE_SCCACHE=1 \
+ --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
+ --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
+ --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
+ ${merge_base_commit_build_args} \
+ --cache-from type=registry,ref=${CACHE_FROM},mode=max \
+ --cache-to type=registry,ref=${CACHE_TO},mode=max \
+ --tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
+ $( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
+ --push \
+ --target test \
+ --progress plain .
diff --git a/.buildkite/image_build/image_build.yaml b/.buildkite/image_build/image_build.yaml
new file mode 100644
index 0000000000000..d01c71dd9becf
--- /dev/null
+++ b/.buildkite/image_build/image_build.yaml
@@ -0,0 +1,57 @@
+group: Abuild
+steps: # one image-build step per target (default, CPU, HPU, CPU arm64); every script receives registry, repo and commit
+  - label: ":docker: Build image"
+    key: image-build
+    depends_on: []
+    commands: # NOTE(review): arguments are unquoted - an empty variable would shift the eight positional parameters image_build.sh reads
+      - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
+    retry:
+      automatic:
+        - exit_status: -1 # Agent was lost
+          limit: 2
+        - exit_status: -10 # Agent was lost
+          limit: 2
+
+  - label: ":docker: Build CPU image"
+    key: image-build-cpu
+    depends_on: []
+    commands:
+      - .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1 # Agent was lost
+          limit: 2
+        - exit_status: -10 # Agent was lost
+          limit: 2
+
+  - label: ":docker: Build HPU image"
+    soft_fail: true # a failure of this step does not fail the build
+    depends_on: []
+    key: image-build-hpu
+    commands:
+      - .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1 # Agent was lost
+          limit: 2
+        - exit_status: -10 # Agent was lost
+          limit: 2
+
+  - label: ":docker: Build CPU arm64 image"
+    key: cpu-arm64-image-build
+    depends_on: []
+    optional: true
+    commands:
+      - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1 # Agent was lost
+          limit: 2
+        - exit_status: -10 # Agent was lost
+          limit: 2
diff --git a/.buildkite/image_build/image_build_cpu.sh b/.buildkite/image_build/image_build_cpu.sh
new file mode 100755
index 0000000000000..a69732f430985
--- /dev/null
+++ b/.buildkite/image_build/image_build_cpu.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+set -e
+# Build the "vllm-test" stage of docker/Dockerfile.cpu and push it as $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu.
+if [[ $# -lt 3 ]]; then
+    echo "Usage: $0 <registry> <repo> <commit>"
+    exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# authenticate with AWS ECR (public registry given as <registry>)
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+
+# skip build if image already exists (empty output means no manifest; a failure inside the if-condition does not trip set -e)
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
+    echo "Image not found, proceeding with build..."
+else
+    echo "Image found"
+    exit 0
+fi
+
+# build with the AVX512-BF16, AVX512-VNNI and AMX-BF16 build args enabled
+docker build --file docker/Dockerfile.cpu \
+  --build-arg max_jobs=16 \
+  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+  --build-arg VLLM_CPU_AVX512BF16=true \
+  --build-arg VLLM_CPU_AVX512VNNI=true \
+  --build-arg VLLM_CPU_AMXBF16=true \
+  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
+  --target vllm-test \
+  --progress plain .
+
+# push
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
diff --git a/.buildkite/image_build/image_build_cpu_arm64.sh b/.buildkite/image_build/image_build_cpu_arm64.sh
new file mode 100755
index 0000000000000..615298b6555bd
--- /dev/null
+++ b/.buildkite/image_build/image_build_cpu_arm64.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+set -e
+# Build the "vllm-test" stage of docker/Dockerfile.cpu for the "Build CPU arm64 image" step and push it as $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu.
+if [[ $# -lt 3 ]]; then
+    echo "Usage: $0 <registry> <repo> <commit>"
+    exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# authenticate with AWS ECR (public registry given as <registry>)
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+
+# skip build if image already exists. NOTE(review): "-cpu" is the same tag suffix image_build_cpu.sh uses, and image_build.yaml passes both scripts the same three arguments, so the two builds may collide - confirm $REPO differs per step or give arm64 its own suffix
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
+    echo "Image not found, proceeding with build..."
+else
+    echo "Image found"
+    exit 0
+fi
+
+# build (no x86-specific VLLM_CPU_* build args are passed here)
+docker build --file docker/Dockerfile.cpu \
+  --build-arg max_jobs=16 \
+  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
+  --target vllm-test \
+  --progress plain .
+
+# push
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
diff --git a/.buildkite/image_build/image_build_hpu.sh b/.buildkite/image_build/image_build_hpu.sh
new file mode 100755
index 0000000000000..192447ef4577e
--- /dev/null
+++ b/.buildkite/image_build/image_build_hpu.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+set -e
+# Build the HPU CI image from the vllm-project/vllm-gaudi repository and push it as $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu.
+if [[ $# -lt 3 ]]; then
+    echo "Usage: $0 <registry> <repo> <commit>"
+    exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# authenticate with AWS ECR (public registry given as <registry>)
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+
+# skip build if image already exists (empty output means no manifest; a failure inside the if-condition does not trip set -e)
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
+    echo "Image not found, proceeding with build..."
+else
+    echo "Image found"
+    exit 0
+fi
+
+# build: the git URL is the build context, so the --file path is resolved inside the vllm-gaudi repository, not this checkout
+docker build \
+  --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
+  --build-arg max_jobs=16 \
+  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
+  --progress plain \
+  https://github.com/vllm-project/vllm-gaudi.git
+
+# push
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
index 46f1a9fbf6ff9..6c0b5540cbb6a 100644
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
@@ -8,3 +8,4 @@ tasks:
value: 0.80
limit: 250 # will run on 250 * 14 subjects = 3500 samples
num_fewshot: 5
+rtol: 0.05 # per-config override of DEFAULT_RTOL (0.08) read by test_lm_eval_correctness.py
diff --git a/.buildkite/lm-eval-harness/configs/models-large-rocm.txt b/.buildkite/lm-eval-harness/configs/models-large-rocm.txt
new file mode 100644
index 0000000000000..4fb0b84bc4d81
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/models-large-rocm.txt
@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
index 3627b760eddcf..f94d681197d2d 100644
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -9,11 +9,40 @@ pytest -s -v test_lm_eval_correctness.py \
--tp-size=1
"""
+import os
+from contextlib import contextmanager
+
import lm_eval
import numpy as np
import yaml
-RTOL = 0.08
+DEFAULT_RTOL = 0.08
+
+
+@contextmanager
+def scoped_env_vars(new_env: dict[str, str]):
+ if not new_env:
+ # Fast path: nothing to do
+ yield
+ return
+
+ old_values = {}
+ new_keys = []
+
+ try:
+ for key, value in new_env.items():
+ if key in os.environ:
+ old_values[key] = os.environ[key]
+ else:
+ new_keys.append(key)
+ os.environ[key] = str(value)
+ yield
+ finally:
+ # Restore / clean up
+ for key, value in old_values.items():
+ os.environ[key] = value
+ for key in new_keys:
+ os.environ.pop(key, None)
def launch_lm_eval(eval_config, tp_size):
@@ -32,23 +61,26 @@ def launch_lm_eval(eval_config, tp_size):
f"trust_remote_code={trust_remote_code},"
f"max_model_len={max_model_len},"
)
- results = lm_eval.simple_evaluate(
- model=backend,
- model_args=model_args,
- tasks=[task["name"] for task in eval_config["tasks"]],
- num_fewshot=eval_config["num_fewshot"],
- limit=eval_config["limit"],
- # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
- # text models. however, this is regressing measured strict-match for
- # existing text models in CI, so only apply it for mm, or explicitly set
- apply_chat_template=eval_config.get(
- "apply_chat_template", backend == "vllm-vlm"
- ),
- fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
- # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
- gen_kwargs=eval_config.get("gen_kwargs"),
- batch_size=batch_size,
- )
+
+ env_vars = eval_config.get("env_vars", None)
+ with scoped_env_vars(env_vars):
+ results = lm_eval.simple_evaluate(
+ model=backend,
+ model_args=model_args,
+ tasks=[task["name"] for task in eval_config["tasks"]],
+ num_fewshot=eval_config["num_fewshot"],
+ limit=eval_config["limit"],
+ # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
+ # text models. however, this is regressing measured strict-match for
+ # existing text models in CI, so only apply it for mm, or explicitly set
+ apply_chat_template=eval_config.get(
+ "apply_chat_template", backend == "vllm-vlm"
+ ),
+ fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
+ # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
+ gen_kwargs=eval_config.get("gen_kwargs"),
+ batch_size=batch_size,
+ )
return results
@@ -57,6 +89,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
results = launch_lm_eval(eval_config, tp_size)
+ rtol = eval_config.get("rtol", DEFAULT_RTOL)
+
success = True
for task in eval_config["tasks"]:
for metric in task["metrics"]:
@@ -64,8 +98,9 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
measured_value = results["results"][task["name"]][metric["name"]]
print(
f"{task['name']} | {metric['name']}: "
- f"ground_truth={ground_truth} | measured={measured_value}"
+ f"ground_truth={ground_truth:.3f} | "
+ f"measured={measured_value:.3f} | rtol={rtol}"
)
- success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
+ success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
assert success
diff --git a/.buildkite/performance-benchmarks/README.md b/.buildkite/performance-benchmarks/README.md
index 6d494f64f14fa..015f48c2520d6 100644
--- a/.buildkite/performance-benchmarks/README.md
+++ b/.buildkite/performance-benchmarks/README.md
@@ -108,6 +108,65 @@ The number of this test is less stable compared to the delay and latency benchma
WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
+#### Default Parameters Field
+
+We can specify default parameters in a JSON field with key `defaults`. Parameters defined in the field are applied globally to all serving tests, and can be overridden in test case fields. Here is an example:
+
+<details>
+<summary>An Example of default parameters field</summary>
+
+```json
+{
+  "defaults": {
+    "qps_list": [
+      "inf"
+    ],
+    "server_environment_variables": {
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
+    },
+    "server_parameters": {
+      "tensor_parallel_size": 1,
+      "dtype": "bfloat16",
+      "block_size": 128,
+      "disable_log_stats": "",
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "backend": "vllm",
+      "dataset_name": "random",
+      "random-input-len": 128,
+      "random-output-len": 128,
+      "num_prompts": 200,
+      "ignore-eos": ""
+    }
+  },
+  "tests": [
+    {
+      "test_name": "serving_llama3B_tp2_random_128_128",
+      "server_parameters": {
+        "model": "meta-llama/Llama-3.2-3B-Instruct",
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "model": "meta-llama/Llama-3.2-3B-Instruct"
+      }
+    },
+    {
+      "test_name": "serving_qwen3_tp4_random_128_128",
+      "server_parameters": {
+        "model": "Qwen/Qwen3-14B",
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "model": "Qwen/Qwen3-14B"
+      }
+    }
+  ]
+}
+```
+
+</details>
+
### Visualizing the results
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
diff --git a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
index 99a5a5e334f8e..34ceefe0996f2 100644
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -110,7 +110,8 @@ json2envs() {
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
- timeout 1200 bash -c '
+ local timeout_val="1200"
+ timeout "$timeout_val" bash -c '
until curl -X POST localhost:8000/v1/completions; do
sleep 1
done' && return 0 || return 1
@@ -316,12 +317,44 @@ run_throughput_tests() {
run_serving_tests() {
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
+ #
+ # Supported JSON formats:
+ # 1) Plain format: top-level array
+ # [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+ #
+ # 2) Default parameters field + plain format tests
+ # {
+ # "defaults": { ... },
+ # "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+ # }
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
- jq -c '.[]' "$serving_test_file" | while read -r params; do
+ jq -c '
+ if type == "array" then
+ # Plain format: test cases array
+ .[]
+ elif (type == "object" and has("tests")) then
+ # merge the default parameters into each test case
+ . as $root
+ | ($root.defaults // {}) as $d
+ | ($root.tests // [])[]
+ # default qps / max_concurrency from defaults if missing
+ | .qps_list = (.qps_list // $d.qps_list)
+ | .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list)
+ # merge envs / params: test overrides defaults
+ | .server_environment_variables =
+ (($d.server_environment_variables // {}) + (.server_environment_variables // {}))
+ | .server_parameters =
+ (($d.server_parameters // {}) + (.server_parameters // {}))
+ | .client_parameters =
+ (($d.client_parameters // {}) + (.client_parameters // {}))
+ else
+ error("Unsupported serving test file format: must be array or object with .tests")
+ end
+ ' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^serving_ ]]; then
@@ -335,20 +368,25 @@ run_serving_tests() {
continue
fi
- # get client and server arguments
+ # get client and server arguments (after the default parameters have been merged in)
server_params=$(echo "$params" | jq -r '.server_parameters')
server_envs=$(echo "$params" | jq -r '.server_environment_variables')
client_params=$(echo "$params" | jq -r '.client_parameters')
+
server_args=$(json2args "$server_params")
server_envs=$(json2envs "$server_envs")
client_args=$(json2args "$client_params")
+
+ # qps_list
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
+
+ # max_concurrency_list (fallback to num_prompts if missing)
max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
- num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
- max_concurrency_list="[$num_prompts]"
+ num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
+ max_concurrency_list="[$num_prompts]"
fi
max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
echo "Running over max concurrency list $max_concurrency_list"
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json
deleted file mode 100644
index f758097e098e4..0000000000000
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json
+++ /dev/null
@@ -1,610 +0,0 @@
-[
- {
- "test_name": "serving_llama8B_bf16_tp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- }
-]
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json
deleted file mode 100644
index 0b1a42e790255..0000000000000
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json
+++ /dev/null
@@ -1,1023 +0,0 @@
-[
- {
- "test_name": "serving_llama8B_bf16_pp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_bf16_pp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_pp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int8_pp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_pp1_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp4_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 200
- }
- },
- {
- "test_name": "serving_llama8B_int4_pp1_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "pipeline_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp4_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 4,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- },
- {
- "test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
- "qps_list": ["inf"],
- "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "quantization": "awq",
- "tensor_parallel_size": 2,
- "pipeline_parallel_size": 3,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 1000
- }
- }
-]
diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
index f792956f39472..8f7200862d20c 100644
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
@@ -1,276 +1,246 @@
-[
- {
- "test_name": "serving_llama8B_tp1_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 32
- }
+{
+ "defaults": {
+ "qps_list": [
+ "inf"
+ ],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
},
- {
- "test_name": "serving_llama8B_tp2_sharegpt",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "sharegpt",
- "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "num_prompts": 32
- }
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
},
- {
- "test_name": "serving_llama8B_tp1_random_128_128",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 32
- }
- },
- {
- "test_name": "serving_llama8B_tp2_random_128_128",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 32
- }
- },
- {
- "test_name": "serving_llama8B_tp1_random_128_2048",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 2048,
- "ignore-eos": "",
- "num_prompts": 32
- }
- },
- {
- "test_name": "serving_llama8B_tp2_random_128_2048",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 128,
- "random-output-len": 2048,
- "ignore-eos": "",
- "num_prompts": 32
- }
- },
- {
- "test_name": "serving_llama8B_tp1_random_2048_128",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 1,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 2048,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 32
- }
- },
- {
- "test_name": "serving_llama8B_tp2_random_2048_128",
- "qps_list": [1, 4, 16, "inf"],
- "max_concurrency_list": [32],
- "server_environment_variables": {
- "VLLM_RPC_TIMEOUT": 100000,
- "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
- "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
- "VLLM_CPU_SGL_KERNEL": 1,
- "VLLM_CPU_KVCACHE_SPACE": 40
- },
- "server_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "tensor_parallel_size": 2,
- "dtype": "bfloat16",
- "distributed_executor_backend": "mp",
- "block_size": 128,
- "trust_remote_code": "",
- "enable_chunked_prefill": "",
- "disable_log_stats": "",
- "enforce_eager": "",
- "max_num_batched_tokens": 2048,
- "max_num_seqs": 256,
- "load_format": "dummy"
- },
- "client_parameters": {
- "model": "meta-llama/Llama-3.1-8B-Instruct",
- "backend": "vllm",
- "dataset_name": "random",
- "random-input-len": 2048,
- "random-output-len": 128,
- "ignore-eos": "",
- "num_prompts": 32
- }
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "ignore-eos": "",
+ "num_prompts": 200
}
-]
+ },
+ "tests": [
+ {
+ "test_name": "serving_llama8B_tp1_sharegpt",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_sharegpt",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp1_random_128_128",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_random_128_128",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp4_random_128_128",
+ "server_parameters": {
+ "tensor_parallel_size": 4
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp1_random_128_2048",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 2048
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_random_128_2048",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 2048
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp4_random_128_2048",
+ "server_parameters": {
+ "tensor_parallel_size": 4
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 2048
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp1_random_2048_128",
+ "server_parameters": {
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 2048,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp2_random_2048_128",
+ "server_parameters": {
+ "tensor_parallel_size": 2
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 2048,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama8B_tp4_random_2048_128",
+ "server_parameters": {
+ "tensor_parallel_size": 4
+ },
+ "client_parameters": {
+ "dataset_name": "random",
+ "random-input-len": 2048,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_llama3B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.2-3B-Instruct",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.2-3B-Instruct",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_granite2B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "ibm-granite/granite-3.2-2b-instruct",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "ibm-granite/granite-3.2-2b-instruct",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_qwen1.7B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "Qwen/Qwen3-1.7B",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "Qwen/Qwen3-1.7B",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_qwen4B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "Qwen/Qwen3-4B",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "Qwen/Qwen3-4B",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_qwen8B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "Qwen/Qwen3-8B",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "Qwen/Qwen3-8B",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_glm9B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "zai-org/glm-4-9b-hf",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "zai-org/glm-4-9b-hf",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ },
+ {
+ "test_name": "serving_gemma7B_tp1_random_128_128",
+ "server_parameters": {
+ "model": "google/gemma-7b",
+ "tensor_parallel_size": 1
+ },
+ "client_parameters": {
+ "model": "google/gemma-7b",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128
+ }
+ }
+ ]
+}
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 38c400ba1faf5..a9d51557bd9bb 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -8,13 +8,28 @@ steps:
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
+ - label: "Build arm64 wheel - CUDA 13.0"
+ depends_on: ~
+ id: build-wheel-arm64-cuda-13-0
+ agents:
+ queue: arm64_cpu_queue_postmerge
+ commands:
+ # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
+ # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "mkdir artifacts"
+ - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+ - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+ env:
+ DOCKER_BUILDKIT: "1"
+
# aarch64 build
- label: "Build arm64 CPU wheel"
depends_on: ~
@@ -25,24 +40,11 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-wheels.sh"
+ - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
# x86 + CUDA builds
- - label: "Build wheel - CUDA 12.8"
- depends_on: ~
- id: build-wheel-cuda-12-8
- agents:
- queue: cpu_queue_postmerge
- commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- - "mkdir artifacts"
- - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-wheels.sh"
- env:
- DOCKER_BUILDKIT: "1"
-
- label: "Build wheel - CUDA 12.9"
depends_on: ~
id: build-wheel-cuda-12-9
@@ -52,7 +54,7 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-wheels.sh"
+ - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31"
env:
DOCKER_BUILDKIT: "1"
@@ -65,7 +67,21 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-wheels.sh"
+ - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+ env:
+ DOCKER_BUILDKIT: "1"
+
+ # x86 CPU wheel build
+ - label: "Build x86 CPU wheel"
+ depends_on: ~
+ id: build-wheel-x86-cpu
+ agents:
+ queue: cpu_queue_postmerge
+ commands:
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
+ - "mkdir artifacts"
+ - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+ - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
@@ -109,7 +125,6 @@ steps:
- label: "Annotate release workflow"
depends_on:
- create-multi-arch-manifest
- - build-wheel-cuda-12-8
id: annotate-release-workflow
agents:
queue: cpu_queue_postmerge
diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py
new file mode 100644
index 0000000000000..d0965fbd56405
--- /dev/null
+++ b/.buildkite/scripts/generate-nightly-index.py
@@ -0,0 +1,400 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# do not complain about line length (for docstring)
+# ruff: noqa: E501
+
+import argparse
+import json
+import sys
+from dataclasses import asdict, dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+from urllib.parse import quote
+
+import regex as re
+
+if not sys.version_info >= (3, 12):
+ raise RuntimeError("This script requires Python 3.12 or higher.")
+
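+INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
+<html>
+  <!-- {comment} -->
+  <meta name="pypi:repository-version" content="1.0">
+  <body>
+{items}
+  </body>
+</html>
+"""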
+
+
+@dataclass
+class WheelFileInfo:
+ package_name: str
+ version: str
+ build_tag: str | None
+ python_tag: str
+ abi_tag: str
+ platform_tag: str
+ variant: str | None
+ filename: str
+
+
+def parse_from_filename(file: str) -> WheelFileInfo:
+ """
+ Parse wheel file name to extract metadata.
+
+ The format of wheel names:
+ {package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl
+ All versions could contain a variant like '+cu129' or '.cpu' or `.rocm` (or not).
+ Example:
+ vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl
+ vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl
+ vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl
+ vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl
+ """
+ wheel_file_re = re.compile(
+ r"^(?P<package_name>.+)-(?P<version>[^-]+?)(-(?P<build_tag>[^-]+))?-(?P<python_tag>[^-]+)-(?P<abi_tag>[^-]+)-(?P<platform_tag>[^-]+)\.whl$"
+ )
+ match = wheel_file_re.match(file)
+ if not match:
+ raise ValueError(f"Invalid wheel file name: {file}")
+
+ package_name = match.group("package_name")
+ version = match.group("version")
+ build_tag = match.group("build_tag")
+ python_tag = match.group("python_tag")
+ abi_tag = match.group("abi_tag")
+ platform_tag = match.group("platform_tag")
+
+ # extract variant from version
+ variant = None
+ if "dev" in version:
+ ver_after_dev = version.split("dev")[-1]
+ if "." in ver_after_dev:
+ variant = ver_after_dev.split(".")[-1]
+ version = version.removesuffix("." + variant)
+ else:
+ if "+" in version:
+ version, variant = version.split("+")
+
+ return WheelFileInfo(
+ package_name=package_name,
+ version=version,
+ build_tag=build_tag,
+ python_tag=python_tag,
+ abi_tag=abi_tag,
+ platform_tag=platform_tag,
+ variant=variant,
+ filename=file,
+ )
+
+
+def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
+ """
+ Generate project list HTML content linking to each project & variant sub-directory.
+ """
+ href_tags = []
+ for name in sorted(subdir_names):
+ name = name.strip("/").strip(".")
+ href_tags.append(f' <a href="{name}/">{name}/</a><br/>')
+ return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
+
+
+def generate_package_index_and_metadata(
+ wheel_files: list[WheelFileInfo],
+ wheel_base_dir: Path,
+ index_base_dir: Path,
+ comment: str = "",
+) -> tuple[str, str]:
+ """
+ Generate package index HTML content for a specific package, linking to actual wheel files.
+ """
+ href_tags = []
+ metadata = []
+ for file in sorted(wheel_files, key=lambda x: x.filename):
+ relative_path = (
+ wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename
+ )
+ # handle '+' in the URL, and avoid double-encoding '/' and already-encoded '%2B'
+ # NOTE: this is AWS S3 specific behavior!
+ file_path_quoted = quote(relative_path.as_posix(), safe=":%/")
+ href_tags.append(f' <a href="{file_path_quoted}">{file.filename}</a><br/>')
+ file_meta = asdict(file)
+ file_meta["path"] = file_path_quoted
+ metadata.append(file_meta)
+ index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
+ metadata_str = json.dumps(metadata, indent=2)
+ return index_str, metadata_str
+
+
+def generate_index_and_metadata(
+ whl_files: list[str],
+ wheel_base_dir: Path,
+ index_base_dir: Path,
+ default_variant: str | None = None,
+ alias_to_default: str | None = None,
+ comment: str = "",
+):
+ """
+ Generate index for all wheel files.
+
+ Args:
+ whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`).
+ wheel_base_dir (Path): Base directory for wheel files.
+ index_base_dir (Path): Base directory to store index files.
+ default_variant (str | None): The default variant name, if any.
+ alias_to_default (str | None): Alias variant name for the default variant, if any.
+ comment (str): Optional comment to include in the generated HTML files (empty string for none).
+
+ First, parse all wheel files to extract metadata.
+ We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
+ The index for the default variant (if any) is generated in the root index directory.
+
+ If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
+ is purely a copy of the corresponding variant index, with only the links adjusted.
+ Otherwise, all wheels without variant suffixes are treated as the default variant.
+
+ If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
+ as the default variant index, but the links are adjusted accordingly.
+
+ Index directory structure:
+ index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
+ index.html # project list, linking to "vllm/" and other packages, and all variant sub-directories
+ vllm/
+ index.html # package index, pointing to actual files in wheel_base_dir (relative path)
+ metadata.json # machine-readable metadata for all wheels in this package
+ cpu/ # cpu variant sub-directory
+ index.html
+ vllm/
+ index.html
+ metadata.json
+ cu129/ # cu129 is actually the alias to default variant
+ index.html
+ vllm/
+ index.html
+ metadata.json
+ cu130/ # cu130 variant sub-directory
+ index.html
+ vllm/
+ index.html
+ metadata.json
+ ...
+
+ metadata.json stores a dump of all wheel files' metadata in a machine-readable format:
+ [
+ {
+ "package_name": "vllm",
+ "version": "0.10.2rc2",
+ "build_tag": null,
+ "python_tag": "cp38",
+ "abi_tag": "abi3",
+ "platform_tag": "manylinux2014_aarch64",
+ "variant": "cu129",
+ "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl",
+ "path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl" # to be concatenated with the directory URL and URL-encoded
+ },
+ ...
+ ]
+ """
+
+ parsed_files = [parse_from_filename(f) for f in whl_files]
+
+ if not parsed_files:
+ print("No wheel files found, skipping index generation.")
+ return
+
+ # Group by variant
+ variant_to_files: dict[str, list[WheelFileInfo]] = {}
+ for file in parsed_files:
+ variant = file.variant or "default"
+ if variant not in variant_to_files:
+ variant_to_files[variant] = []
+ variant_to_files[variant].append(file)
+
+ print(f"Found variants: {list(variant_to_files.keys())}")
+
+ # sanity check for default variant
+ if default_variant:
+ if "default" in variant_to_files:
+ raise ValueError(
+ "All wheel files must have variant suffixes when `default_variant` is specified."
+ )
+ if default_variant not in variant_to_files:
+ raise ValueError(
+ f"Default variant '{default_variant}' not found among wheel files."
+ )
+
+ if alias_to_default:
+ if "default" not in variant_to_files:
+ # e.g. only some wheels are uploaded to S3 currently
+ print(
+ "[WARN] Alias to default variant specified, but no default variant found."
+ )
+ elif alias_to_default in variant_to_files:
+ raise ValueError(
+ f"Alias variant name '{alias_to_default}' already exists among wheel files."
+ )
+ else:
+ variant_to_files[alias_to_default] = variant_to_files["default"].copy()
+ print(f"Alias variant '{alias_to_default}' created for default variant.")
+
+ # Generate comment in HTML header
+ comment_str = f" ({comment})" if comment else ""
+ comment_tmpl = f"Generated on {datetime.now().isoformat()}{comment_str}"
+
+ # Generate index for each variant
+ subdir_names = set()
+ for variant, files in variant_to_files.items():
+ if variant == "default":
+ variant_dir = index_base_dir
+ else:
+ variant_dir = index_base_dir / variant
+ subdir_names.add(variant)
+
+ variant_dir.mkdir(parents=True, exist_ok=True)
+
+ # gather all package names in this variant
+ packages = set(f.package_name for f in files)
+ if variant == "default":
+ # these packages should also appear in the "project list"
+ # generate after all variants are processed
+ subdir_names = subdir_names.union(packages)
+ else:
+ # generate project list for this variant directly
+ project_list_str = generate_project_list(sorted(packages), comment_tmpl)
+ with open(variant_dir / "index.html", "w") as f:
+ f.write(project_list_str)
+
+ for package in packages:
+ # filter files belonging to this package only
+ package_files = [f for f in files if f.package_name == package]
+ package_dir = variant_dir / package
+ package_dir.mkdir(parents=True, exist_ok=True)
+ index_str, metadata_str = generate_package_index_and_metadata(
+ package_files, wheel_base_dir, package_dir, comment
+ )
+ with open(package_dir / "index.html", "w") as f:
+ f.write(index_str)
+ with open(package_dir / "metadata.json", "w") as f:
+ f.write(metadata_str)
+
+ # Generate top-level project list index
+ project_list_str = generate_project_list(sorted(subdir_names), comment_tmpl)
+ with open(index_base_dir / "index.html", "w") as f:
+ f.write(project_list_str)
+
+
+if __name__ == "__main__":
+ """
+ Arguments:
+ --version : version string for the current build (e.g., commit hash)
+ --current-objects : path to JSON file containing current S3 objects listing in this version directory
+ --output-dir : directory to store generated index files
+ --alias-to-default : (optional) alias variant name for the default variant
+ --comment : (optional) comment string to include in generated HTML files
+ """
+
+ parser = argparse.ArgumentParser(
+ description="Process nightly build wheel files to generate indices."
+ )
+ parser.add_argument(
+ "--version",
+ type=str,
+ required=True,
+ help="Version string for the current build (e.g., commit hash)",
+ )
+ parser.add_argument(
+ "--current-objects",
+ type=str,
+ required=True,
+ help="Path to JSON file containing current S3 objects listing in this version directory",
+ )
+ parser.add_argument(
+ "--output-dir",
+ type=str,
+ required=True,
+ help="Directory to store generated index files",
+ )
+ parser.add_argument(
+ "--alias-to-default",
+ type=str,
+ default=None,
+ help="Alias variant name for the default variant",
+ )
+ parser.add_argument(
+ "--comment",
+ type=str,
+ default="",
+ help="Optional comment string to include in generated HTML files",
+ )
+
+ args = parser.parse_args()
+
+ version = args.version
+ if "/" in version or "\\" in version:
+ raise ValueError("Version string must not contain slashes.")
+ current_objects_path = Path(args.current_objects)
+ output_dir = Path(args.output_dir)
+ if not output_dir.exists():
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ # Read current objects JSON
+ with open(current_objects_path) as f:
+ current_objects: dict[str, list[dict[str, Any]]] = json.load(f)
+
+ # current_objects looks like from list_objects_v2 S3 API:
+ """
+ "Contents": [
+ {
+ "Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl",
+ "LastModified": "2025-11-28T14:00:32+00:00",
+ "ETag": "\"37a38339c7cdb61ca737021b968075df-52\"",
+ "ChecksumAlgorithm": [
+ "CRC64NVME"
+ ],
+ "ChecksumType": "FULL_OBJECT",
+ "Size": 435649349,
+ "StorageClass": "STANDARD"
+ },
+ ...
+ ]
+ """
+
+ # Extract wheel file keys
+ wheel_files = []
+ for item in current_objects.get("Contents", []):
+ key: str = item["Key"]
+ if key.endswith(".whl"):
+ wheel_files.append(key.split("/")[-1]) # only the filename is used
+
+ print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
+
+ # keep only "official" files for a non-nightly version (specified by cli args)
+ PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
+ if PY_VERSION_RE.match(version):
+ # upload-wheels.sh ensures no "dev" is in args.version
+ wheel_files = list(
+ filter(lambda x: version in x and "dev" not in x, wheel_files)
+ )
+ print(f"Non-nightly version detected, wheel files used: {wheel_files}")
+ else:
+ print("Nightly version detected, keeping all wheel files.")
+
+ # Generate index and metadata, assuming wheels and indices are stored as:
+ # s3://vllm-wheels/{version}/<wheel files>
+ # s3://vllm-wheels/<anything>/<index files>
+ wheel_base_dir = Path(output_dir).parent / version
+ index_base_dir = Path(output_dir)
+
+ generate_index_and_metadata(
+ whl_files=wheel_files,
+ wheel_base_dir=wheel_base_dir,
+ index_base_dir=index_base_dir,
+ default_variant=None,
+ alias_to_default=args.alias_to_default,
+ comment=args.comment.strip(),
+ )
+ print(f"Successfully generated index and metadata in {output_dir}")
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
index b5f6b2494792f..b6274d698d01a 100755
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
@@ -36,11 +36,17 @@ function cpu_tests() {
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+ # Run model tests
+ docker exec cpu-test bash -c "
+ set -e
+ pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model"
+
# Run kernel tests
docker exec cpu-test bash -c "
set -e
pytest -x -v -s tests/kernels/test_onednn.py
- pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"
+ pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
+ pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic"
# basic online serving
docker exec cpu-test bash -c '
diff --git a/.buildkite/scripts/hardware_ci/run-npu-test.sh b/.buildkite/scripts/hardware_ci/run-npu-test.sh
index 29c8f5ed5a91a..0db1abe37ba11 100644
--- a/.buildkite/scripts/hardware_ci/run-npu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-npu-test.sh
@@ -74,6 +74,7 @@ FROM ${BASE_IMAGE_NAME}
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
+ENV SOC_VERSION="ascend910b1"
RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
index 4d163399cfc6c..dfc9db512d1e9 100644
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -38,6 +38,7 @@ docker run \
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+ python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
cd tests
pytest -v -s v1/core
@@ -46,6 +47,6 @@ docker run \
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
- pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
+ pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
pytest -v -s v1/test_serial_utils.py
'
diff --git a/.buildkite/scripts/run-prime-rl-test.sh b/.buildkite/scripts/run-prime-rl-test.sh
index 5b25c358fc4aa..3fb7c82c8d333 100755
--- a/.buildkite/scripts/run-prime-rl-test.sh
+++ b/.buildkite/scripts/run-prime-rl-test.sh
@@ -12,6 +12,11 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
+if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
+ echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
+ exit 0
+fi
+
echo "Setting up Prime-RL integration test environment..."
# Clean up any existing Prime-RL directory
diff --git a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
new file mode 100644
index 0000000000000..937a43d1a3221
--- /dev/null
+++ b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+set -euxo pipefail
+
+# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
+THRESHOLD=${1:-0.25}
+NUM_Q=${2:-1319}
+PORT=${3:-8040}
+OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
+mkdir -p "${OUT_DIR}"
+
+wait_for_server() {
+ local port=$1
+ timeout 600 bash -c '
+ until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
+ sleep 1
+ done'
+}
+
+MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
+
+# Set BACKENDS based on platform
+if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
+ # ROCm platform
+ BACKENDS=("allgather_reducescatter")
+ # Disable MOE padding for ROCm since it is causing eplb to fail
+ export VLLM_ROCM_MOE_PADDING=0
+else
+ # Non-ROCm platform (CUDA/other)
+ BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+fi
+
+cleanup() {
+ if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
+ kill "${SERVER_PID}" 2>/dev/null || true
+ for _ in {1..20}; do
+ kill -0 "${SERVER_PID}" 2>/dev/null || break
+ sleep 0.5
+ done
+ kill -9 "${SERVER_PID}" 2>/dev/null || true
+ fi
+}
+trap cleanup EXIT
+
+for BACK in "${BACKENDS[@]}"; do
+ VLLM_DEEP_GEMM_WARMUP=skip \
+ VLLM_ALL2ALL_BACKEND=$BACK \
+ vllm serve "$MODEL" \
+ --enforce-eager \
+ --tensor-parallel-size 4 \
+ --enable-expert-parallel \
+ --enable-eplb \
+ --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
+ --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
+ --trust-remote-code \
+ --max-model-len 2048 \
+ --gpu-memory-utilization 0.9 \
+ --port $PORT &
+ SERVER_PID=$!
+ wait_for_server $PORT
+
+ TAG=$(printf '%s' "$MODEL" | tr '/: ' '___') # path-unsafe chars -> '_'; the old set '/: \\n' also rewrote every letter 'n'
+ OUT="${OUT_DIR}/${TAG}_${BACK}.json"
+ python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
+ python3 - <<PY
+import json; acc=json.load(open('${OUT}'))['accuracy']
+print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
+assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
+PY
+
+ cleanup
+ SERVER_PID=
+ sleep 1
+ PORT=$((PORT+1))
+done
diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh
index 945c5e48c0090..3a218a4bb2e6d 100644
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@@ -2,6 +2,28 @@
set -ex
+# ======== part 0: setup ========
+
+BUCKET="vllm-wheels"
+INDICES_OUTPUT_DIR="indices"
+DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py
+PYTHON=${PYTHON_PROG:=python3} # try to read from env var, otherwise use python3
+SUBPATH=$BUILDKITE_COMMIT
+S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
+
+# detect if python3.12+ is available
+has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)")
+if [[ "$has_new_python" -eq 0 ]]; then
+ # use new python from docker
+ docker pull python:3-slim
+ PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3"
+fi
+
+echo "Using python interpreter: $PYTHON"
+echo "Python version: $($PYTHON --version)"
+
+# ========= part 1: collect, rename & upload the wheel ==========
+
# Assume wheels are in artifacts/dist/*.whl
wheel_files=(artifacts/dist/*.whl)
@@ -10,74 +32,76 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
exit 1
fi
-
-# Get the single wheel file
wheel="${wheel_files[0]}"
-# Detect architecture and rename 'linux' to appropriate manylinux version
-arch=$(uname -m)
-if [[ $arch == "x86_64" ]]; then
- manylinux_version="manylinux1"
-elif [[ $arch == "aarch64" ]]; then
- manylinux_version="manylinux2014"
-else
- echo "Warning: Unknown architecture $arch, using manylinux1 as default"
- manylinux_version="manylinux1"
-fi
+# default build image uses ubuntu 20.04, which corresponds to manylinux_2_31
+# we also accept params as manylinux tag
+# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
+manylinux_version="${1:-manylinux_2_31}"
# Rename 'linux' to the appropriate manylinux version in the wheel filename
+if [[ "$wheel" != *"linux"* ]]; then
+ echo "Error: Wheel filename does not contain 'linux': $wheel"
+ exit 1
+fi
new_wheel="${wheel/linux/$manylinux_version}"
mv -- "$wheel" "$new_wheel"
wheel="$new_wheel"
+echo "Renamed wheel to: $wheel"
# Extract the version from the wheel
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
-echo "Version: $version"
+echo "Version in wheel: $version"
+pure_version="${version%%+*}"
+echo "Pure version (without variant): $pure_version"
-normal_wheel="$wheel" # Save the original wheel filename
+# copy wheel to its own bucket
+aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
-# If the version contains "dev", rename it to v1.0.0.dev for consistency
-if [[ $version == *dev* ]]; then
- suffix="${version##*.}"
- if [[ $suffix == cu* ]]; then
- new_version="1.0.0.dev+${suffix}"
- else
- new_version="1.0.0.dev"
- fi
- new_wheel="${wheel/$version/$new_version}"
- # use cp to keep both files in the artifacts directory
- cp -- "$wheel" "$new_wheel"
- wheel="$new_wheel"
- version="$new_version"
-fi
+# ========= part 2: generate and upload indices ==========
+# generate indices for all existing wheels in the commit directory
+# this script might be run multiple times if there are multiple variants being built
+# so we need to guarantee there is little chance for "TOCTOU" issues
+# i.e., one process is generating indices while another is uploading a new wheel
+# so we need to ensure no time-consuming operations happen below
-# Upload the wheel to S3
-python3 .buildkite/generate_index.py --wheel "$normal_wheel"
+# list all wheels in the commit directory
+echo "Existing wheels on S3:"
+aws s3 ls "$S3_COMMIT_PREFIX"
+obj_json="objects.json"
+aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
+mkdir -p "$INDICES_OUTPUT_DIR"
-# generate index for this commit
-aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-
-if [[ $normal_wheel == *"cu129"* ]]; then
- # only upload index.html for cu129 wheels (default wheels) as it
- # is available on both x86 and arm64
- aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
- aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
+# call script to generate indices for all existing wheels
+# these indices use relative paths, which work as long as they are placed next to the wheel directory in s3
+# i.e., the wheels are always in s3://vllm-wheels/<commit>/
+# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
+if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
+ alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
else
- echo "Skipping index files for non-cu129 wheels"
+ alias_arg=""
fi
-# generate index for nightly
-aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
-aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
+# HACK: we do not need regex module here, but it is required by pre-commit hook
+# To avoid any external dependency, we simply replace it back to the stdlib re module
+sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
+$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
-if [[ $normal_wheel == *"cu129"* ]]; then
- # only upload index.html for cu129 wheels (default wheels) as it
- # is available on both x86 and arm64
- aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
-else
- echo "Skipping index files for non-cu129 wheels"
+# copy indices to /<commit>/ unconditionally
+echo "Uploading indices to $S3_COMMIT_PREFIX"
+aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
+
+# copy to /nightly/ only if it is on the main branch and not a PR
+if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
+ echo "Uploading indices to overwrite /nightly/"
+ aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
fi
-aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
-aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
+# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
+if [[ "$version" != *"dev"* ]]; then
+    echo "Re-generating indices for /$pure_version/"
+    rm -rf "$INDICES_OUTPUT_DIR" # drop the whole directory: a quoted "dir/*" is never glob-expanded, so stale commit indices would survive
+    mkdir -p "$INDICES_OUTPUT_DIR"
+    $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
+    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
+fi
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 687b6b08507c7..3c9b8cbedcf06 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -39,9 +39,9 @@ steps:
# if this test fails, it means the nightly torch version is not compatible with some
# of the dependencies. Please check the error message and add the package to whitelist
# in /vllm/tools/pre_commit/generate_nightly_torch_test.py
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
soft_fail: true
source_file_dependencies:
- requirements/nightly_torch_test.txt
@@ -50,9 +50,9 @@ steps:
- label: Async Engine, Inputs, Utils, Worker Test # 10min
timeout_in_minutes: 15
- mirror_hardwares: [amdexperimental, amdproduction]
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
source_file_dependencies:
- vllm/
- tests/multimodal
@@ -61,11 +61,11 @@ steps:
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_
-- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
- timeout_in_minutes: 20
- mirror_hardwares: [amdexperimental, amdproduction]
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
+ timeout_in_minutes: 30
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
source_file_dependencies:
- vllm/
- tests/test_inputs.py
@@ -73,6 +73,7 @@ steps:
- tests/multimodal
- tests/standalone_tests/lazy_imports.py
- tests/tokenizers_
+ - tests/tool_parsers
- tests/transformers_utils
- tests/config
no_gpu: true
@@ -82,6 +83,7 @@ steps:
- pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s tokenizers_
+ - pytest -v -s tool_parsers
- pytest -v -s transformers_utils
- pytest -v -s config
@@ -115,9 +117,9 @@ steps:
- pytest -v -s basic_correctness/test_cpu_offload.py
- label: Entrypoints Unit Tests # 5min
- mirror_hardwares: [amdexperimental, amdproduction]
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
timeout_in_minutes: 10
working_dir: "/vllm-workspace/tests"
fast_check: true
@@ -214,6 +216,7 @@ steps:
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
@@ -252,9 +255,9 @@ steps:
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
- label: EPLB Algorithm Test # 5min
- mirror_hardwares: [amdexperimental, amdproduction]
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
timeout_in_minutes: 15
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
@@ -325,10 +328,10 @@ steps:
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
-- label: V1 Test e2e + engine # 30min
- timeout_in_minutes: 45
+- label: V1 Test e2e + engine # 65min
+ timeout_in_minutes: 90
mirror_hardwares: [amdexperimental]
- agent_pool: mi325_1
+ agent_pool: mi325_4
# grade: Blocking
source_file_dependencies:
- vllm/
@@ -341,9 +344,9 @@ steps:
- label: V1 Test entrypoints # 35min
timeout_in_minutes: 50
- mirror_hardwares: [amdexperimental, amdproduction]
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
source_file_dependencies:
- vllm/
- tests/v1
@@ -391,6 +394,21 @@ steps:
commands:
- pytest -v -s v1/attention
+- label: Batch Invariance Tests (H100) # 10min
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ timeout_in_minutes: 25
+ gpu: h100
+ source_file_dependencies:
+ - vllm/v1/attention
+ - vllm/model_executor/layers
+ - tests/v1/determinism/
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pip install pytest-timeout pytest-forked
+ - pytest -v -s v1/determinism/test_batch_invariance.py
+ - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
+
- label: V1 Test attention (B200) # 10min
timeout_in_minutes: 30
gpu: b200
@@ -401,9 +419,9 @@ steps:
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
- label: V1 Test others (CPU) # 5 mins
- mirror_hardwares: [amdexperimental, amdproduction]
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
- # grade: Blocking
+ grade: Blocking
source_file_dependencies:
- vllm/
- tests/v1
@@ -419,29 +437,34 @@ steps:
- label: Examples Test # 30min
timeout_in_minutes: 45
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
- vllm/entrypoints
+ - vllm/multimodal
- examples/
commands:
- pip install tensorizer # for tensorizer test
+ # for basic
+ - python3 offline_inference/basic/chat.py
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- - python3 offline_inference/basic/chat.py
- - python3 offline_inference/prefix_caching.py
- - python3 offline_inference/llm_engine_example.py
- - python3 offline_inference/audio_language.py --seed 0
- - python3 offline_inference/vision_language.py --seed 0
- - python3 offline_inference/vision_language_pooling.py --seed 0
- - python3 offline_inference/vision_language_multi_image.py --seed 0
- - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
+ # for multi-modal models
+ - python3 offline_inference/audio_language.py --seed 0
+ - python3 offline_inference/vision_language.py --seed 0
+ - python3 offline_inference/vision_language_multi_image.py --seed 0
+ - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+ # for pooling models
+ - python3 pooling/pooling/vision_language_pooling.py --seed 0
+ # for features demo
+ - python3 offline_inference/prefix_caching.py
+ - python3 offline_inference/llm_engine_example.py
+ - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@@ -495,7 +518,7 @@ steps:
- label: PyTorch Compilation Unit Tests # 15min
timeout_in_minutes: 30
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
torch_nightly: true
@@ -512,7 +535,7 @@ steps:
- label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
torch_nightly: true
@@ -568,7 +591,7 @@ steps:
- label: Kernels Attention Test %N # 23min
timeout_in_minutes: 35
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_8
# grade: Blocking
source_file_dependencies:
@@ -595,7 +618,7 @@ steps:
- label: Kernels MoE Test %N # 40min
timeout_in_minutes: 60
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_8
# grade: Blocking
source_file_dependencies:
@@ -622,6 +645,26 @@ steps:
commands:
- pytest -v -s kernels/mamba
+- label: Kernels DeepGEMM Test (H100) # Nvidia-centric
+# Not replicating for CUTLASS & CuTe
+ timeout_in_minutes: 45
+ gpu: h100
+ num_gpus: 1
+ source_file_dependencies:
+ - tools/install_deepgemm.sh
+ - vllm/utils/deep_gemm.py
+ - vllm/model_executor/layers/fused_moe
+ - vllm/model_executor/layers/quantization
+ - tests/kernels/quantization/test_block_fp8.py
+ - tests/kernels/moe/test_deepgemm.py
+ - tests/kernels/moe/test_batched_deepgemm.py
+ - tests/kernels/attention/test_deepgemm_attention.py
+ commands:
+ - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
+ - pytest -v -s kernels/moe/test_deepgemm.py
+ - pytest -v -s kernels/moe/test_batched_deepgemm.py
+ - pytest -v -s kernels/attention/test_deepgemm_attention.py
+
- label: Model Executor Test # 23min
timeout_in_minutes: 35
torch_nightly: true
@@ -680,16 +723,18 @@ steps:
# we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0
+ - uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
-- label: LM Eval Small Models # 15min
- timeout_in_minutes: 20
- mirror_hardwares: [amdexperimental, amdproduction]
+- label: LM Eval Small Models # 53min
+ timeout_in_minutes: 75
+ mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
+ autorun_on_main: true
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
@@ -702,7 +747,7 @@ steps:
- csrc/
- vllm/entrypoints/openai/
- vllm/model_executor/models/whisper.py
- commands: # LMEval
+ commands: # LMEval+Transcription WER check
# Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
- pytest -s entrypoints/openai/correctness/
@@ -716,19 +761,7 @@ steps:
- vllm/
- tests/tool_use
commands:
- - pytest -v -s -m 'not cpu_test' tool_use
-
-- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
- mirror_hardwares: [amdexperimental, amdproduction]
- agent_pool: mi325_1
- # grade: Blocking
- timeout_in_minutes: 10
- source_file_dependencies:
- - vllm/
- - tests/tool_use
- no_gpu: true
- commands:
- - pytest -v -s -m 'cpu_test' tool_use
+ - pytest -v -s tool_use
##### models test #####
@@ -899,6 +932,18 @@ steps:
commands:
- pytest -v -s models/language/pooling_mteb_test
+- label: Multi-Modal Processor Test (CPU)
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ no_gpu: true
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
+
- label: Multi-Modal Processor Test # 44min
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
@@ -926,8 +971,8 @@ steps:
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
-- label: Multi-Modal Accuracy Eval (Small Models) # 10min
- timeout_in_minutes: 70
+- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min
+ timeout_in_minutes: 180
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
@@ -939,7 +984,8 @@ steps:
commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
-- label: Multi-Modal Models Test (Extended) 1
+- label: Multi-Modal Models Test (Extended) 1 # 60min
+ timeout_in_minutes: 120
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
@@ -963,7 +1009,8 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
-- label: Multi-Modal Models Test (Extended) 3
+- label: Multi-Modal Models Test (Extended) 3 # 75min
+ timeout_in_minutes: 150
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
@@ -1055,6 +1102,7 @@ steps:
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
+ - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
- label: Blackwell Fusion and Compile Tests # 30 min
timeout_in_minutes: 40
@@ -1064,11 +1112,18 @@ steps:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
+ - vllm/v1/worker/
+ - vllm/v1/cudagraph_dispatcher.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
+ - tests/compile/test_fusion_attn.py
+ - tests/compile/test_silu_mul_quant_fusion.py
+ - tests/compile/distributed/test_fusion_all_reduce.py
+ - tests/compile/distributed/test_fusions_e2e.py
+ - tests/compile/fullgraph/test_full_graph.py
commands:
- nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py
@@ -1079,7 +1134,7 @@ steps:
# Wrap with quotes to escape yaml
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- - pytest -v -s tests/compile/distributed/test_full_graph.py::test_fp8_kv_scale_compile
+ - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
- label: Blackwell Fusion E2E Tests # 30 min
timeout_in_minutes: 40
@@ -1097,17 +1152,15 @@ steps:
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/distributed/test_fusions_e2e.py
- - tests/compile/fullgraph/test_full_graph.py
commands:
- nvidia-smi
# Run all e2e fusion tests
- - pytest -v -s tests/compile/test_fusions_e2e.py
+ - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
-- label: ROCm GPT-OSS Eval
+- label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
- agent_pool: mi325_1
- mirror_hardwares: [amdexperimental, amdproduction]
+ gpu: b200
optional: true # run on nightlies
source_file_dependencies:
- tests/evals/gpt_oss
@@ -1116,7 +1169,7 @@ steps:
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+ - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: Blackwell Quantized MoE Test
timeout_in_minutes: 60
@@ -1216,6 +1269,7 @@ steps:
- tests/v1/worker/test_worker_memory_snapshot.py
commands:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
@@ -1251,7 +1305,7 @@ steps:
- label: Plugin Tests (2 GPUs) # 40min
timeout_in_minutes: 60
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_2
# grade: Blocking
working_dir: "/vllm-workspace/tests"
@@ -1320,14 +1374,14 @@ steps:
- pytest -v -s -x lora/test_llm_with_multi_loras.py
- pytest -v -s -x lora/test_olmoe_tp.py
- # Disabled for now because MXFP4 backend on non-cuda platform
+ # Disabled for now because MXFP4 backend on non-cuda platform
# doesn't support LoRA yet
#- pytest -v -s -x lora/test_gptoss_tp.py
- label: Weight Loading Multiple GPU Test # 33min
timeout_in_minutes: 45
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_2
# grade: Blocking
working_dir: "/vllm-workspace/tests"
@@ -1386,7 +1440,83 @@ steps:
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py
+
- label: LM Eval Large Models # optional
+ gpu: a100
+ optional: true
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
+ num_gpus: 4
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+
+##### H100 test #####
+- label: LM Eval Large Models (H100) # optional
+ gpu: h100
+ optional: true
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
+ num_gpus: 4
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
+
+##### H200 test #####
+- label: Distributed Tests (H200) # optional
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_2
+ # grade: Blocking
+ gpu: h200
+ optional: true
+ working_dir: "/vllm-workspace/"
+ num_gpus: 2
+ commands:
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
+ - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+ - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+ #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+ - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
+ - pytest -v -s tests/distributed/test_context_parallel.py
+ - HIP_VISIBLE_DEVICES=0,1 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+ - pytest -v -s tests/v1/distributed/test_dbo.py
+
+##### B200 test #####
+- label: Distributed Tests (B200) # optional
+ gpu: b200
+ optional: true
+ working_dir: "/vllm-workspace/"
+ num_gpus: 2
+ commands:
+ - pytest -v -s tests/distributed/test_context_parallel.py
+ - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+ - pytest -v -s tests/v1/distributed/test_dbo.py
+
+##### E2E Eval Tests #####
+- label: LM Eval Small Models (1 Card) # 15min
+ timeout_in_minutes: 20
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+
+- label: LM Eval Large Models (4 Card)
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
@@ -1401,52 +1531,29 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
-##### H100 test #####
-- label: LM Eval Large Models (H100) # optional
- mirror_hardwares: [amdexperimental, amdproduction]
- agent_pool: mi325_4
- # grade: Blocking
- gpu: h100
- optional: true
- num_gpus: 4
+- label: ROCm LM Eval Large Models (8 Card)
+ mirror_hardwares: [amdproduction]
+ agent_pool: mi325_8
+ num_gpus: 8
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8
+
+- label: ROCm GPT-OSS Eval
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ agent_pool: mi325_1
+ mirror_hardwares: [amdexperimental, amdproduction]
+ optional: true # run on nightlies
source_file_dependencies:
- - csrc/
- - vllm/model_executor/layers/quantization
+ - tests/evals/gpt_oss
+ - vllm/model_executor/models/gpt_oss.py
+ - vllm/model_executor/layers/quantization/mxfp4.py
+ - vllm/v1/attention/backends/flashinfer.py
commands:
- - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
- - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
-
-##### H200 test #####
-- label: Distributed Tests (H200) # optional
- mirror_hardwares: [amdexperimental]
- agent_pool: mi325_2
- # grade: Blocking
- gpu: h200
- optional: true
- working_dir: "/vllm-workspace/"
- num_gpus: 2
- commands:
- - pytest -v -s tests/compile/distributed/test_async_tp.py
- - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
- - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- - pytest -v -s tests/compile/distributed/test_sequence_parallel.py
- - pytest -v -s tests/distributed/test_context_parallel.py
- - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- - pytest -v -s tests/v1/distributed/test_dbo.py
-
-##### B200 test #####
-- label: Distributed Tests (B200) # optional
- gpu: b200
- optional: true
- working_dir: "/vllm-workspace/"
- num_gpus: 2
- commands:
- - pytest -v -s tests/distributed/test_context_parallel.py
- - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
- - pytest -v -s tests/v1/distributed/test_dbo.py
+ - uv pip install --system 'gpt-oss[eval]==0.0.5'
+ - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min
@@ -1462,9 +1569,8 @@ steps:
- .buildkite/scripts/run-prime-rl-test.sh
commands:
- bash .buildkite/scripts/run-prime-rl-test.sh
-
- label: DeepSeek V2-Lite Accuracy
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
timeout_in_minutes: 60
@@ -1475,8 +1581,8 @@ steps:
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
-- label: Qwen3-30B-A3B-FP8-block Accuracy
- mirror_hardwares: [amdexperimental]
+- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
timeout_in_minutes: 60
@@ -1486,3 +1592,35 @@ steps:
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+ timeout_in_minutes: 60
+ gpu: b200
+ optional: true
+ num_gpus: 2
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+
+- label: DeepSeek V2-Lite Async EPLB Accuracy
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
+
+- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 9f2107fb1e5ab..9d0b3fdd3a02c 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -57,8 +57,8 @@ steps:
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_
-- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
- timeout_in_minutes: 20
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
+ timeout_in_minutes: 30
source_file_dependencies:
- vllm/
- tests/test_inputs.py
@@ -66,6 +66,7 @@ steps:
- tests/multimodal
- tests/standalone_tests/lazy_imports.py
- tests/tokenizers_
+ - tests/tool_parsers
- tests/transformers_utils
- tests/config
no_gpu: true
@@ -75,6 +76,7 @@ steps:
- pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s tokenizers_
+ - pytest -v -s tool_parsers
- pytest -v -s transformers_utils
- pytest -v -s config
@@ -350,7 +352,8 @@ steps:
timeout_in_minutes: 25
gpu: h100
source_file_dependencies:
- - vllm/
+ - vllm/v1/attention
+ - vllm/model_executor/layers
- tests/v1/determinism/
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -387,23 +390,28 @@ steps:
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
- vllm/entrypoints
+ - vllm/multimodal
- examples/
commands:
- pip install tensorizer # for tensorizer test
+ # for basic
+ - python3 offline_inference/basic/chat.py
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- - python3 offline_inference/basic/chat.py
- - python3 offline_inference/prefix_caching.py
- - python3 offline_inference/llm_engine_example.py
- - python3 offline_inference/audio_language.py --seed 0
- - python3 offline_inference/vision_language.py --seed 0
- - python3 offline_inference/vision_language_pooling.py --seed 0
- - python3 offline_inference/vision_language_multi_image.py --seed 0
- - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
+ # for multi-modal models
+ - python3 offline_inference/audio_language.py --seed 0
+ - python3 offline_inference/vision_language.py --seed 0
+ - python3 offline_inference/vision_language_multi_image.py --seed 0
+ - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+ # for pooling models
+ - python3 pooling/pooling/vision_language_pooling.py --seed 0
+ # for features demo
+ - python3 offline_inference/prefix_caching.py
+ - python3 offline_inference/llm_engine_example.py
+ - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@@ -462,7 +470,9 @@ steps:
# tests covered elsewhere.
# Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
+ # However, find does not normally propagate error codes, so we combine it with xargs
+ # (using -0 for proper path handling)
+ - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
- label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30
@@ -476,7 +486,9 @@ steps:
# as it is a heavy test that is covered in other steps.
# Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
+ # However, find does not normally propagate error codes, so we combine it with xargs
+ # (using -0 for proper path handling)
+ - "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
- label: PyTorch Fullgraph Test # 27min
timeout_in_minutes: 40
@@ -662,16 +674,7 @@ steps:
- vllm/
- tests/tool_use
commands:
- - pytest -v -s -m 'not cpu_test' tool_use
-
-- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
- timeout_in_minutes: 10
- source_file_dependencies:
- - vllm/
- - tests/tool_use
- no_gpu: true
- commands:
- - pytest -v -s -m 'cpu_test' tool_use
+ - pytest -v -s tool_use
##### models test #####
@@ -682,6 +685,7 @@ steps:
source_file_dependencies:
- vllm/
- tests/models/test_initialization.py
+ - tests/models/registry.py
commands:
# Run a subset of model initialization tests
- pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
@@ -694,6 +698,7 @@ steps:
- vllm/model_executor/models/
- vllm/transformers_utils/
- tests/models/test_initialization.py
+ - tests/models/registry.py
commands:
# Only when vLLM model source is modified - test initialization of a large
# subset of supported models (the complement of the small subset in the above
@@ -826,7 +831,7 @@ steps:
- tests/models/multimodal
no_gpu: true
commands:
- - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - "pip install git+https://github.com/TIGER-AI-Lab/Mantis.git || echo 'Mantis installation skipped (decord not available on CPU-only environment)'"
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
- label: Multi-Modal Processor Test
@@ -1218,6 +1223,8 @@ steps:
# FIXIT: find out which code initialize cuda before running the test
# before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ # A lot of these tests are on the edge of OOMing
+ - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# There is some Tensor Parallelism related processing logic in LoRA that
# requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py
@@ -1336,6 +1343,7 @@ steps:
- label: Prime-RL Integration Test # 15min
timeout_in_minutes: 30
optional: true
+ soft_fail: true
num_gpus: 2
working_dir: "/vllm-workspace"
source_file_dependencies:
@@ -1369,4 +1377,4 @@ steps:
num_gpus: 2
working_dir: "/vllm-workspace"
commands:
- - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
\ No newline at end of file
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
diff --git a/.buildkite/test_areas/attention.yaml b/.buildkite/test_areas/attention.yaml
new file mode 100644
index 0000000000000..6e444eae14c74
--- /dev/null
+++ b/.buildkite/test_areas/attention.yaml
@@ -0,0 +1,21 @@
+group: Attention
+depends_on:
+ - image-build
+steps:
+- label: V1 attention (H100)
+ timeout_in_minutes: 30
+ gpu: h100
+ source_file_dependencies:
+ - vllm/v1/attention
+ - tests/v1/attention
+ commands:
+ - pytest -v -s v1/attention
+
+- label: V1 attention (B200)
+ timeout_in_minutes: 30
+ gpu: b200
+ source_file_dependencies:
+ - vllm/v1/attention
+ - tests/v1/attention
+ commands:
+ - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FlashInfer prefill is buggy and produces incorrect results; fix this
diff --git a/.buildkite/test_areas/basic_correctness.yaml b/.buildkite/test_areas/basic_correctness.yaml
new file mode 100644
index 0000000000000..759d2b5358714
--- /dev/null
+++ b/.buildkite/test_areas/basic_correctness.yaml
@@ -0,0 +1,16 @@
+group: Basic Correctness
+depends_on:
+ - image-build
+steps:
+- label: Basic Correctness
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - vllm/
+ - tests/basic_correctness/test_basic_correctness
+ - tests/basic_correctness/test_cpu_offload
+ - tests/basic_correctness/test_cumem.py
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s basic_correctness/test_cumem.py
+ - pytest -v -s basic_correctness/test_basic_correctness.py
+ - pytest -v -s basic_correctness/test_cpu_offload.py
diff --git a/.buildkite/test_areas/benchmarks.yaml b/.buildkite/test_areas/benchmarks.yaml
new file mode 100644
index 0000000000000..574b642d407b0
--- /dev/null
+++ b/.buildkite/test_areas/benchmarks.yaml
@@ -0,0 +1,19 @@
+group: Benchmarks
+depends_on:
+ - image-build
+steps:
+- label: Benchmarks
+ timeout_in_minutes: 20
+ working_dir: "/vllm-workspace/.buildkite"
+ source_file_dependencies:
+ - benchmarks/
+ commands:
+ - bash scripts/run-benchmarks.sh
+
+- label: Benchmarks CLI Test
+ timeout_in_minutes: 20
+ source_file_dependencies:
+ - vllm/
+ - tests/benchmarks/
+ commands:
+ - pytest -v -s benchmarks/
diff --git a/.buildkite/test_areas/compile.yaml b/.buildkite/test_areas/compile.yaml
new file mode 100644
index 0000000000000..0ba00925a4838
--- /dev/null
+++ b/.buildkite/test_areas/compile.yaml
@@ -0,0 +1,57 @@
+group: Compile
+depends_on:
+ - image-build
+steps:
+- label: Fusion and Compile Tests (B200)
+ timeout_in_minutes: 40
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ source_file_dependencies:
+ - csrc/quantization/fp4/
+ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+ - vllm/v1/attention/backends/flashinfer.py
+ - vllm/v1/worker/
+ - vllm/v1/cudagraph_dispatcher.py
+ - vllm/compilation/
+ # can affect pattern matching
+ - vllm/model_executor/layers/layernorm.py
+ - vllm/model_executor/layers/activation.py
+ - vllm/model_executor/layers/quantization/input_quant_fp8.py
+ - tests/compile/test_fusion_attn.py
+ - tests/compile/test_silu_mul_quant_fusion.py
+ - tests/compile/distributed/test_fusion_all_reduce.py
+ - tests/compile/distributed/test_fusions_e2e.py
+ - tests/compile/fullgraph/test_full_graph.py
+ commands:
+ - nvidia-smi
+ - pytest -v -s tests/compile/test_fusion_attn.py
+ - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+ # this runner has 2 GPUs available even though num_gpus=2 is not set
+ - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+ # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+ # Wrap with quotes to escape yaml
+ - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+ # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+ - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
+
+- label: Fusion E2E (2 GPUs)(B200)
+ timeout_in_minutes: 40
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ optional: true
+ num_gpus: 2
+ source_file_dependencies:
+ - csrc/quantization/fp4/
+ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+ - vllm/v1/attention/backends/flashinfer.py
+ - vllm/compilation/
+ # can affect pattern matching
+ - vllm/model_executor/layers/layernorm.py
+ - vllm/model_executor/layers/activation.py
+ - vllm/model_executor/layers/quantization/input_quant_fp8.py
+ - tests/compile/distributed/test_fusions_e2e.py
+ commands:
+ - nvidia-smi
+ # Run all e2e fusion tests
+ - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
+
diff --git a/.buildkite/test_areas/cuda.yaml b/.buildkite/test_areas/cuda.yaml
new file mode 100644
index 0000000000000..50c0c338c2434
--- /dev/null
+++ b/.buildkite/test_areas/cuda.yaml
@@ -0,0 +1,22 @@
+group: CUDA
+depends_on:
+ - image-build
+steps:
+- label: Platform Tests (CUDA)
+ timeout_in_minutes: 15
+ source_file_dependencies:
+ - vllm/
+ - tests/cuda
+ commands:
+ - pytest -v -s cuda/test_cuda_context.py
+
+- label: Cudagraph
+ timeout_in_minutes: 20
+ source_file_dependencies:
+ - tests/v1/cudagraph
+ - vllm/v1/cudagraph_dispatcher.py
+ - vllm/config/compilation.py
+ - vllm/compilation
+ commands:
+ - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
+ - pytest -v -s v1/cudagraph/test_cudagraph_mode.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml
new file mode 100644
index 0000000000000..2cc90698d916a
--- /dev/null
+++ b/.buildkite/test_areas/distributed.yaml
@@ -0,0 +1,199 @@
+group: Distributed
+depends_on:
+ - image-build
+steps:
+- label: Distributed Comm Ops
+ timeout_in_minutes: 20
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/distributed
+ - tests/distributed
+ commands:
+ - pytest -v -s distributed/test_comm_ops.py
+ - pytest -v -s distributed/test_shm_broadcast.py
+ - pytest -v -s distributed/test_shm_buffer.py
+ - pytest -v -s distributed/test_shm_storage.py
+
+- label: Distributed (2 GPUs)
+ timeout_in_minutes: 90
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/compilation/
+ - vllm/distributed/
+ - vllm/engine/
+ - vllm/executor/
+ - vllm/worker/worker_base.py
+ - vllm/v1/engine/
+ - vllm/v1/worker/
+ - tests/compile/fullgraph/test_basic_correctness.py
+ - tests/compile/test_wrapper.py
+ - tests/distributed/
+ - tests/entrypoints/llm/test_collective_rpc.py
+ - tests/v1/distributed
+ - tests/v1/entrypoints/openai/test_multi_api_servers.py
+ - tests/v1/shutdown
+ - tests/v1/worker/test_worker_memory_snapshot.py
+ commands:
+ # https://github.com/NVIDIA/nccl/issues/1838
+ - export NCCL_CUMEM_HOST_ENABLE=0
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+ - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
+ - pytest -v -s entrypoints/llm/test_collective_rpc.py
+ - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
+ - pytest -v -s ./compile/test_wrapper.py
+ - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+ - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+ - pytest -v -s distributed/test_sequence_parallel.py
+ - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+ - pytest -v -s v1/worker/test_worker_memory_snapshot.py
+
+- label: Distributed Tests (4 GPUs)
+ timeout_in_minutes: 50
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/
+ - tests/distributed/test_utils
+ - tests/distributed/test_pynccl
+ - tests/distributed/test_events
+ - tests/compile/fullgraph/test_basic_correctness.py
+ - examples/offline_inference/rlhf.py
+ - examples/offline_inference/rlhf_colocate.py
+ - tests/examples/offline_inference/data_parallel.py
+ - tests/v1/distributed
+ - tests/v1/engine/test_engine_core_client.py
+ - tests/distributed/test_symm_mem_allreduce.py
+ commands:
+ # https://github.com/NVIDIA/nccl/issues/1838
+ - export NCCL_CUMEM_HOST_ENABLE=0
+ # test with torchrun tp=2 and external_dp=2
+ - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+ # test with torchrun tp=2 and pp=2
+ - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+ # test with torchrun tp=4 and dp=1
+ - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=2, pp=2 and dp=1
+ - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=1 and dp=4 with ep
+ - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=2 and dp=2 with ep
+ - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with internal dp
+ - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+ - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
+ - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
+ - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
+ - pytest -v -s distributed/test_utils.py
+ - pytest -v -s compile/fullgraph/test_basic_correctness.py
+ - pytest -v -s distributed/test_pynccl.py
+ - pytest -v -s distributed/test_events.py
+ - pytest -v -s distributed/test_symm_mem_allreduce.py
+ # TODO: create a dedicated test section for multi-GPU example tests
+ # when we have multiple distributed example tests
+ - cd ../examples/offline_inference
+ - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+ - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+
+- label: Distributed Tests (8 GPUs)(H100)
+ timeout_in_minutes: 10
+ gpu: h100
+ num_gpus: 8
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - examples/offline_inference/torchrun_dp_example.py
+ - vllm/config/parallel.py
+ - vllm/distributed/
+ - vllm/v1/engine/llm_engine.py
+ - vllm/v1/executor/uniproc_executor.py
+ - vllm/v1/worker/gpu_worker.py
+ commands:
+ # https://github.com/NVIDIA/nccl/issues/1838
+ - export NCCL_CUMEM_HOST_ENABLE=0
+ # test with torchrun tp=2 and dp=4 with ep
+ - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+
+- label: Distributed Tests (4 GPUs)(A100)
+ gpu: a100
+ optional: true
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/
+ commands:
+ # NOTE: don't test llama model here, it seems hf implementation is buggy
+ # see https://github.com/vllm-project/vllm/pull/5689 for details
+ - pytest -v -s distributed/test_custom_all_reduce.py
+ - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
+ - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+ - pytest -v -s -x lora/test_mixtral.py
+
+- label: Distributed Tests (2 GPUs)(H200)
+ gpu: h200
+ optional: true
+ working_dir: "/vllm-workspace/"
+ num_gpus: 2
+ commands:
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
+ - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+ - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
+ - pytest -v -s tests/distributed/test_context_parallel.py
+ - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+ - pytest -v -s tests/v1/distributed/test_dbo.py
+
+- label: Distributed Tests (2 GPUs)(B200)
+ gpu: b200
+ optional: true
+ working_dir: "/vllm-workspace/"
+ num_gpus: 2
+ commands:
+ - pytest -v -s tests/distributed/test_context_parallel.py
+ - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+ - pytest -v -s tests/v1/distributed/test_dbo.py
+
+- label: 2 Node Test (4 GPUs)
+ timeout_in_minutes: 30
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ num_nodes: 2
+ source_file_dependencies:
+ - vllm/distributed/
+ - vllm/engine/
+ - vllm/executor/
+ - vllm/model_executor/models/
+ - tests/distributed/
+ - tests/examples/offline_inference/data_parallel.py
+ commands:
+ - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
+
+- label: Distributed NixlConnector PD accuracy (4 GPUs)
+ timeout_in_minutes: 30
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+ - tests/v1/kv_connector/nixl_integration/
+ commands:
+ - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+ - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
+
+- label: Pipeline + Context Parallelism (4 GPUs)
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/
+ - vllm/engine/
+ - vllm/executor/
+ - vllm/model_executor/models/
+ - tests/distributed/
+ commands:
+ - pytest -v -s distributed/test_pp_cudagraph.py
+ - pytest -v -s distributed/test_pipeline_parallel.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml
new file mode 100644
index 0000000000000..93d389815edac
--- /dev/null
+++ b/.buildkite/test_areas/e2e_integration.yaml
@@ -0,0 +1,59 @@
+group: E2E Integration
+depends_on:
+ - image-build
+steps:
+- label: DeepSeek V2-Lite Accuracy
+ timeout_in_minutes: 60
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy
+ timeout_in_minutes: 60
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+ timeout_in_minutes: 60
+ gpu: b200
+ optional: true
+ num_gpus: 2
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+
+- label: Prime-RL Integration (2 GPUs)
+ timeout_in_minutes: 30
+ optional: true
+ num_gpus: 2
+ working_dir: "/vllm-workspace"
+ source_file_dependencies:
+ - vllm/
+ - .buildkite/scripts/run-prime-rl-test.sh
+ commands:
+ - bash .buildkite/scripts/run-prime-rl-test.sh
+
+- label: DeepSeek V2-Lite Async EPLB Accuracy
+ timeout_in_minutes: 60
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
+
+- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
+ timeout_in_minutes: 60
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml
new file mode 100644
index 0000000000000..a028e0e4af4c1
--- /dev/null
+++ b/.buildkite/test_areas/engine.yaml
@@ -0,0 +1,26 @@
+group: Engine
+depends_on:
+ - image-build
+steps:
+- label: Engine
+ timeout_in_minutes: 15
+ source_file_dependencies:
+ - vllm/
+ - tests/engine
+ - tests/test_sequence
+ - tests/test_config
+ - tests/test_logger
+ - tests/test_vllm_port
+ commands:
+ - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
+
+- label: V1 e2e + engine
+ timeout_in_minutes: 45
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+ # TODO: accuracy does not match on H100, whether or not
+ # VLLM_USE_FLASHINFER_SAMPLER is set.
+ - pytest -v -s v1/e2e
+ - pytest -v -s v1/engine
diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml
new file mode 100644
index 0000000000000..0a789be943f37
--- /dev/null
+++ b/.buildkite/test_areas/entrypoints.yaml
@@ -0,0 +1,68 @@
+group: Entrypoints
+depends_on:
+ - image-build
+steps:
+- label: Entrypoints Unit Tests
+ timeout_in_minutes: 10
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/entrypoints
+ - tests/entrypoints/
+ commands:
+ - pytest -v -s entrypoints/openai/tool_parsers
+ - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
+
+- label: Entrypoints Integration (LLM)
+ timeout_in_minutes: 40
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/llm
+ - tests/entrypoints/offline_mode
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+ - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
+ - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+
+- label: Entrypoints Integration (API Server)
+ timeout_in_minutes: 130
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/openai
+ - tests/entrypoints/test_chat_utils
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
+ - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
+ - pytest -v -s entrypoints/test_chat_utils.py
+
+
+- label: Entrypoints Integration (Pooling)
+ timeout_in_minutes: 50
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/pooling
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s entrypoints/pooling
+
+
+- label: Entrypoints V1
+ timeout_in_minutes: 50
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+ - pytest -v -s v1/entrypoints
+
+- label: OpenAI API Correctness
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - csrc/
+ - vllm/entrypoints/openai/
+ - vllm/model_executor/models/whisper.py
+ commands: # LMEval+Transcription WER check
+ - pytest -s entrypoints/openai/correctness/
diff --git a/.buildkite/test_areas/expert_parallelism.yaml b/.buildkite/test_areas/expert_parallelism.yaml
new file mode 100644
index 0000000000000..feb8252148c7f
--- /dev/null
+++ b/.buildkite/test_areas/expert_parallelism.yaml
@@ -0,0 +1,23 @@
+group: Expert Parallelism
+depends_on:
+ - image-build
+steps:
+- label: EPLB Algorithm
+ timeout_in_minutes: 15
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/distributed/eplb
+ - tests/distributed/test_eplb_algo.py
+ commands:
+ - pytest -v -s distributed/test_eplb_algo.py
+
+- label: EPLB Execution
+ timeout_in_minutes: 20
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/eplb
+ - tests/distributed/test_eplb_execute.py
+ commands:
+ - pytest -v -s distributed/test_eplb_execute.py
+ - pytest -v -s distributed/test_eplb_spec_decode.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml
new file mode 100644
index 0000000000000..7ca099516d641
--- /dev/null
+++ b/.buildkite/test_areas/kernels.yaml
@@ -0,0 +1,117 @@
+group: Kernels
+depends_on:
+ - image-build
+steps:
+- label: Kernels Core Operation Test
+ timeout_in_minutes: 75
+ source_file_dependencies:
+ - csrc/
+ - tests/kernels/core
+ - tests/kernels/test_top_k_per_row.py
+ commands:
+ - pytest -v -s kernels/core kernels/test_top_k_per_row.py
+
+- label: Kernels Attention Test %N
+ timeout_in_minutes: 35
+ source_file_dependencies:
+ - csrc/attention/
+ - vllm/attention
+ - vllm/v1/attention
+ - tests/kernels/attention
+ commands:
+ - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+ parallelism: 2
+
+- label: Kernels Quantization Test %N
+ timeout_in_minutes: 90
+ source_file_dependencies:
+ - csrc/quantization/
+ - vllm/model_executor/layers/quantization
+ - tests/kernels/quantization
+ commands:
+ - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+ parallelism: 2
+
+- label: Kernels MoE Test %N
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - csrc/quantization/cutlass_w8a8/moe/
+ - csrc/moe/
+ - tests/kernels/moe
+ - vllm/model_executor/layers/fused_moe/
+ - vllm/distributed/device_communicators/
+ - vllm/envs.py
+ - vllm/config
+ commands:
+ - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+ parallelism: 2
+
+- label: Kernels Mamba Test
+ timeout_in_minutes: 45
+ source_file_dependencies:
+ - csrc/mamba/
+ - tests/kernels/mamba
+ - vllm/model_executor/layers/mamba/ops
+ commands:
+ - pytest -v -s kernels/mamba
+
+- label: Kernels DeepGEMM Test (H100)
+ timeout_in_minutes: 45
+ gpu: h100
+ num_gpus: 1
+ source_file_dependencies:
+ - tools/install_deepgemm.sh
+ - vllm/utils/deep_gemm.py
+ - vllm/model_executor/layers/fused_moe
+ - vllm/model_executor/layers/quantization
+ - tests/kernels/quantization/test_block_fp8.py
+ - tests/kernels/moe/test_deepgemm.py
+ - tests/kernels/moe/test_batched_deepgemm.py
+ - tests/kernels/attention/test_deepgemm_attention.py
+ commands:
+ - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
+ - pytest -v -s kernels/moe/test_deepgemm.py
+ - pytest -v -s kernels/moe/test_batched_deepgemm.py
+ - pytest -v -s kernels/attention/test_deepgemm_attention.py
+
+- label: Kernels (B200)
+ timeout_in_minutes: 30
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ # optional: true
+ source_file_dependencies:
+ - csrc/quantization/fp4/
+ - csrc/attention/mla/
+ - csrc/quantization/cutlass_w8a8/moe/
+ - vllm/model_executor/layers/fused_moe/cutlass_moe.py
+ - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+ - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+ - vllm/v1/attention/backends/flashinfer.py
+ - vllm/v1/attention/backends/mla/cutlass_mla.py
+ - vllm/v1/attention/backends/mla/flashinfer_mla.py
+ - vllm/platforms/cuda.py
+ - vllm/attention/selector.py
+ commands:
+ - nvidia-smi
+ - python3 examples/offline_inference/basic/chat.py
+ # Attention
+ # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+ - pytest -v -s tests/kernels/attention/test_attention_selector.py
+ - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
+ - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
+ - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
+ - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
+ # Quantization
+ - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
+ - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
+ - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
+ - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
+ - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
+ - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+ - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
+ - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
+ - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
+ - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
+ - pytest -v -s tests/kernels/moe/test_flashinfer.py
+ - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml
new file mode 100644
index 0000000000000..9af43e0c375a8
--- /dev/null
+++ b/.buildkite/test_areas/lm_eval.yaml
@@ -0,0 +1,46 @@
+group: LM Eval
+depends_on:
+ - image-build
+steps:
+- label: LM Eval Small Models
+ timeout_in_minutes: 75
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ autorun_on_main: true
+ commands:
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+
+- label: LM Eval Large Models (4 GPUs)(A100)
+ gpu: a100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+
+- label: LM Eval Large Models (4 GPUs)(H100)
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
+- label: LM Eval Small Models (B200)
+ timeout_in_minutes: 120
+ gpu: b200
+ optional: true
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml
new file mode 100644
index 0000000000000..809b4138f44ba
--- /dev/null
+++ b/.buildkite/test_areas/lora.yaml
@@ -0,0 +1,31 @@
+group: LoRA
+depends_on:
+ - image-build
+steps:
+- label: LoRA %N
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - vllm/lora
+ - tests/lora
+ commands:
+ - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
+ parallelism: 4
+
+
+- label: LoRA TP (Distributed)
+ timeout_in_minutes: 30
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/lora
+ - tests/lora
+ commands:
+ # FIXME: find out which code initializes CUDA before running the test.
+ # Until that is fixed, we need to use spawn to run it.
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ # There is some Tensor Parallelism related processing logic in LoRA that
+ # requires multi-GPU testing for validation.
+ - pytest -v -s -x lora/test_chatglm3_tp.py
+ - pytest -v -s -x lora/test_llama_tp.py
+ - pytest -v -s -x lora/test_llm_with_multi_loras.py
+ - pytest -v -s -x lora/test_olmoe_tp.py
+ - pytest -v -s -x lora/test_gptoss_tp.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml
new file mode 100644
index 0000000000000..252af1e56a105
--- /dev/null
+++ b/.buildkite/test_areas/misc.yaml
@@ -0,0 +1,165 @@
+group: Miscellaneous
+depends_on:
+ - image-build
+steps:
+- label: V1 Others
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+ - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+ # split the test to avoid interference
+ - pytest -v -s -m 'not cpu_test' v1/core
+ - pytest -v -s v1/executor
+ - pytest -v -s v1/kv_offload
+ - pytest -v -s v1/sample
+ - pytest -v -s v1/logits_processors
+ - pytest -v -s v1/worker
+ - pytest -v -s v1/spec_decode
+ - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
+ - pytest -v -s -m 'not cpu_test' v1/metrics
+ - pytest -v -s v1/test_oracle.py
+ - pytest -v -s v1/test_request.py
+ - pytest -v -s v1/test_outputs.py
+ # Integration test for streaming correctness (requires special branch).
+ - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
+ - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
+
+- label: V1 Others (CPU)
+ depends_on: ~
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ no_gpu: true
+ commands:
+ # split the test to avoid interference
+ - pytest -v -s -m 'cpu_test' v1/core
+ - pytest -v -s v1/structured_output
+ - pytest -v -s v1/test_serial_utils.py
+ - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
+ - pytest -v -s -m 'cpu_test' v1/metrics
+
+- label: Regression
+ timeout_in_minutes: 20
+ source_file_dependencies:
+ - vllm/
+ - tests/test_regression
+ commands:
+ - pip install modelscope
+ - pytest -v -s test_regression.py
+ working_dir: "/vllm-workspace/tests" # optional
+
+- label: Examples
+ timeout_in_minutes: 45
+ working_dir: "/vllm-workspace/examples"
+ source_file_dependencies:
+ - vllm/entrypoints
+ - vllm/multimodal
+ - examples/
+ commands:
+ - pip install tensorizer # for tensorizer test
+ - python3 offline_inference/basic/chat.py # for basic
+ - python3 offline_inference/basic/generate.py --model facebook/opt-125m
+ - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+ - python3 offline_inference/basic/classify.py
+ - python3 offline_inference/basic/embed.py
+ - python3 offline_inference/basic/score.py
+ # for multi-modal models
+ - python3 offline_inference/audio_language.py --seed 0
+ - python3 offline_inference/vision_language.py --seed 0
+ - python3 offline_inference/vision_language_multi_image.py --seed 0
+ - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+ # for pooling models
+ - python3 pooling/pooling/vision_language_pooling.py --seed 0
+ # for features demo
+ - python3 offline_inference/prefix_caching.py
+ - python3 offline_inference/llm_engine_example.py
+ - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+ - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+ # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+ - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
+
+- label: Metrics, Tracing (2 GPUs)
+ timeout_in_minutes: 20
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/
+ - tests/v1/tracing
+ commands:
+ - "pip install \
+ 'opentelemetry-sdk>=1.26.0' \
+ 'opentelemetry-api>=1.26.0' \
+ 'opentelemetry-exporter-otlp>=1.26.0' \
+ 'opentelemetry-semantic-conventions-ai>=0.4.1'"
+ - pytest -v -s v1/tracing
+
+- label: Python-only Installation
+ depends_on: ~
+ timeout_in_minutes: 20
+ source_file_dependencies:
+ - tests/standalone_tests/python_only_compile.sh
+ - setup.py
+ commands:
+ - bash standalone_tests/python_only_compile.sh
+
+- label: Async Engine, Inputs, Utils, Worker
+ timeout_in_minutes: 50
+ source_file_dependencies:
+ - vllm/
+ - tests/multimodal
+ - tests/utils_
+ commands:
+ - pytest -v -s -m 'not cpu_test' multimodal
+ - pytest -v -s utils_
+
+- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
+ depends_on: ~
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - vllm/
+ - tests/test_inputs.py
+ - tests/test_outputs.py
+ - tests/multimodal
+ - tests/standalone_tests/lazy_imports.py
+ - tests/tokenizers_
+ - tests/tool_parsers
+ - tests/transformers_utils
+ - tests/config
+ no_gpu: true
+ commands:
+ - python3 standalone_tests/lazy_imports.py
+ - pytest -v -s test_inputs.py
+ - pytest -v -s test_outputs.py
+ - pytest -v -s -m 'cpu_test' multimodal
+ - pytest -v -s tokenizers_
+ - pytest -v -s tool_parsers
+ - pytest -v -s transformers_utils
+ - pytest -v -s config
+
+- label: GPT-OSS Eval (B200)
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ optional: true
+ source_file_dependencies:
+ - tests/evals/gpt_oss
+ - vllm/model_executor/models/gpt_oss.py
+ - vllm/model_executor/layers/quantization/mxfp4.py
+ - vllm/v1/attention/backends/flashinfer.py
+ commands:
+ - uv pip install --system 'gpt-oss[eval]==0.0.5'
+ - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
+- label: Batch Invariance (H100)
+ timeout_in_minutes: 25
+ gpu: h100
+ source_file_dependencies:
+ - vllm/v1/attention
+ - vllm/model_executor/layers
+ - tests/v1/determinism/
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pip install pytest-timeout pytest-forked
+ - pytest -v -s v1/determinism/test_batch_invariance.py
+ - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/model_executor.yaml b/.buildkite/test_areas/model_executor.yaml
new file mode 100644
index 0000000000000..996c8bb8b780a
--- /dev/null
+++ b/.buildkite/test_areas/model_executor.yaml
@@ -0,0 +1,17 @@
+group: Model Executor
+depends_on:
+ - image-build
+steps:
+- label: Model Executor
+ timeout_in_minutes: 35
+ source_file_dependencies:
+ - vllm/engine/arg_utils.py
+ - vllm/config/model.py
+ - vllm/model_executor
+ - tests/model_executor
+ - tests/entrypoints/openai/test_tensorizer_entrypoint.py
+ commands:
+ - apt-get update && apt-get install -y curl libsodium23
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s model_executor
+ - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml
new file mode 100644
index 0000000000000..39a5d51c48833
--- /dev/null
+++ b/.buildkite/test_areas/models_basic.yaml
@@ -0,0 +1,62 @@
+group: Models - Basic
+depends_on:
+ - image-build
+steps:
+- label: Basic Models Tests (Initialization)
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_initialization.py
+ commands:
+ # Run a subset of model initialization tests
+ - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
+
+- label: Basic Models Tests (Extra Initialization) %N
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/model_executor/models/
+ - tests/models/test_initialization.py
+ commands:
+ # Only when vLLM model source is modified - test initialization of a large
+ # subset of supported models (the complement of the small subset in the above
+ # test). Also runs if the model initialization test file is modified.
+ - pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Basic Models Tests (Other)
+ timeout_in_minutes: 45
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_transformers.py
+ - tests/models/test_registry.py
+ commands:
+ - pytest -v -s models/test_transformers.py models/test_registry.py
+
+- label: Basic Models Test (Other CPU) # 5min
+ timeout_in_minutes: 10
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_utils.py
+ - tests/models/test_vision.py
+ no_gpu: true
+ commands:
+ - pytest -v -s models/test_utils.py models/test_vision.py
+
+- label: Transformers Nightly Models
+ working_dir: "/vllm-workspace/"
+ optional: true
+ soft_fail: true
+ commands:
+ - pip install --upgrade git+https://github.com/huggingface/transformers
+ - pytest -v -s tests/models/test_initialization.py
+ - pytest -v -s tests/models/test_transformers.py
+ - pytest -v -s tests/models/multimodal/processing/
+ - pytest -v -s tests/models/multimodal/test_mapping.py
+ - python3 examples/offline_inference/basic/chat.py
+ - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+ # Whisper needs spawn method to avoid deadlock
+ - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
diff --git a/.buildkite/test_areas/models_distributed.yaml b/.buildkite/test_areas/models_distributed.yaml
new file mode 100644
index 0000000000000..b6bfbf2ddab47
--- /dev/null
+++ b/.buildkite/test_areas/models_distributed.yaml
@@ -0,0 +1,22 @@
+group: Models - Distributed
+depends_on:
+ - image-build
+steps:
+- label: Distributed Model Tests (2 GPUs)
+ timeout_in_minutes: 50
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/model_executor/model_loader/sharded_state_loader.py
+ - vllm/model_executor/models/
+ - tests/basic_correctness/
+ - tests/model_executor/model_loader/test_sharded_state_loader.py
+ - tests/models/
+ commands:
+ - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+ - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
+ # Avoid importing model tests that cause CUDA reinitialization error
+ - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
+ - pytest models/language -v -s -m 'distributed(num_gpus=2)'
+ - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
+ - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml
new file mode 100644
index 0000000000000..f70192c4ebc0a
--- /dev/null
+++ b/.buildkite/test_areas/models_language.yaml
@@ -0,0 +1,91 @@
+group: Models - Language
+depends_on:
+ - image-build
+steps:
+- label: Language Models Tests (Standard)
+ timeout_in_minutes: 25
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language
+ commands:
+ # Test standard language models, excluding a subset of slow tests
+ - pip freeze | grep -E 'torch'
+ - pytest -v -s models/language -m 'core_model and (not slow_test)'
+
+- label: Language Models Tests (Extra Standard) %N
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/model_executor/models/
+ - tests/models/language/pooling/test_embedding.py
+ - tests/models/language/generation/test_common.py
+ - tests/models/language/pooling/test_classification.py
+ commands:
+ # Shard slow subset of standard language models tests. Only run when model
+ # source is modified, or when specified test files are modified
+ - pip freeze | grep -E 'torch'
+ - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Language Models Tests (Hybrid) %N
+ timeout_in_minutes: 75
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/generation
+ commands:
+ # Install fast path packages for testing against transformers
+ # Note: also needed to run plamo2 model in vLLM
+ - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+ - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
+ # Shard hybrid language model tests
+ - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Language Models Test (Extended Generation) # 80min
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/generation
+ commands:
+ # Install fast path packages for testing against transformers
+ # Note: also needed to run plamo2 model in vLLM
+ - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+ - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
+ - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
+
+- label: Language Models Test (PPL)
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/generation_ppl_test
+ commands:
+ - pytest -v -s models/language/generation_ppl_test
+
+- label: Language Models Test (Extended Pooling) # 36min
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/pooling
+ commands:
+ - pytest -v -s models/language/pooling -m 'not core_model'
+
+- label: Language Models Test (MTEB)
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/pooling_mteb_test
+ commands:
+ - pytest -v -s models/language/pooling_mteb_test
diff --git a/.buildkite/test_areas/models_multimodal.yaml b/.buildkite/test_areas/models_multimodal.yaml
new file mode 100644
index 0000000000000..fc24068c20a46
--- /dev/null
+++ b/.buildkite/test_areas/models_multimodal.yaml
@@ -0,0 +1,79 @@
+group: Models - Multimodal
+depends_on:
+ - image-build
+steps:
+- label: Multi-Modal Models (Standard) # 60min
+ timeout_in_minutes: 80
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pip freeze | grep -E 'torch'
+ - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
+ - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
+
+- label: Multi-Modal Processor Test (CPU)
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ no_gpu: true
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
+
+- label: Multi-Modal Processor # 44min
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/processing/test_tensor_schema.py
+
+- label: Multi-Modal Accuracy Eval (Small Models) # 50min
+ timeout_in_minutes: 70
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - vllm/multimodal/
+ - vllm/inputs/
+ - vllm/v1/core/
+ commands:
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+
+- label: Multi-Modal Models (Extended) 1
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
+
+- label: Multi-Modal Models (Extended) 2
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
+
+- label: Multi-Modal Models (Extended) 3
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
+
+# This test is used only during the PR development phase to test individual models and should never run on main.
+- label: Custom Models
+ optional: true
+ commands:
+ - echo 'Testing custom models...'
+ # PR authors can temporarily add commands below to test individual models
+ # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
+ # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
diff --git a/.buildkite/test_areas/plugins.yaml b/.buildkite/test_areas/plugins.yaml
new file mode 100644
index 0000000000000..60c179aa098e1
--- /dev/null
+++ b/.buildkite/test_areas/plugins.yaml
@@ -0,0 +1,34 @@
+group: Plugins
+depends_on:
+ - image-build
+steps:
+- label: Plugin Tests (2 GPUs)
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/plugins/
+ - tests/plugins/
+ commands:
+ # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
+ - pip install -e ./plugins/vllm_add_dummy_platform
+ - pytest -v -s plugins_tests/test_platform_plugins.py
+ - pip uninstall vllm_add_dummy_platform -y
+ # end platform plugin tests
+ # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
+ - pip install -e ./plugins/prithvi_io_processor_plugin
+ - pytest -v -s plugins_tests/test_io_processor_plugins.py
+ - pip uninstall prithvi_io_processor_plugin -y
+ # end io_processor plugins test
+ # begin stat_logger plugins test
+ - pip install -e ./plugins/vllm_add_dummy_stat_logger
+ - pytest -v -s plugins_tests/test_stats_logger_plugins.py
+ - pip uninstall dummy_stat_logger -y
+ # end stat_logger plugins test
+ # other tests continue here:
+ - pytest -v -s plugins_tests/test_scheduler_plugins.py
+ - pip install -e ./plugins/vllm_add_dummy_model
+ - pytest -v -s distributed/test_distributed_oot.py
+ - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
+ - pytest -v -s models/test_oot_registration.py # it needs a clean process
+ - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
diff --git a/.buildkite/test_areas/pytorch.yaml b/.buildkite/test_areas/pytorch.yaml
new file mode 100644
index 0000000000000..703c82eb1a91b
--- /dev/null
+++ b/.buildkite/test_areas/pytorch.yaml
@@ -0,0 +1,50 @@
+group: PyTorch
+depends_on:
+ - image-build
+steps:
+- label: PyTorch Compilation Unit Tests
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - vllm/
+ - tests/compile
+ commands:
+ # Run unit tests defined directly under compile/,
+ # not including subdirectories, which are usually heavier
+ # tests covered elsewhere.
+ # Use `find` to launch multiple instances of pytest so that
+ # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+ - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;"
+
+- label: PyTorch Fullgraph Smoke Test
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - vllm/
+ - tests/compile
+ commands:
+ # Run smoke tests under fullgraph directory, except test_full_graph.py
+ # as it is a heavy test that is covered in other steps.
+ # Use `find` to launch multiple instances of pytest so that
+ # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+ - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
+
+- label: PyTorch Fullgraph
+ timeout_in_minutes: 40
+ source_file_dependencies:
+ - vllm/
+ - tests/compile
+ commands:
+ # fp8 kv scales not supported on sm89, tested on Blackwell instead
+ - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+ # Limit to no custom ops to reduce running time
+ # Wrap with quotes to escape yaml and avoid starting -k string with a -
+ - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+
+- label: Pytorch Nightly Dependency Override Check # 2min
+ # If this test fails, it means the nightly torch version is not compatible with some
+ # of the dependencies. Please check the error message and add the package to the whitelist
+ # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
+ soft_fail: true
+ source_file_dependencies:
+ - requirements/nightly_torch_test.txt
+ commands:
+ - bash standalone_tests/pytorch_nightly_dependency.sh
\ No newline at end of file
diff --git a/.buildkite/test_areas/quantization.yaml b/.buildkite/test_areas/quantization.yaml
new file mode 100644
index 0000000000000..6e89d6af3b8d1
--- /dev/null
+++ b/.buildkite/test_areas/quantization.yaml
@@ -0,0 +1,46 @@
+group: Quantization
+depends_on:
+ - image-build
+steps:
+- label: Quantization
+ timeout_in_minutes: 90
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ - tests/quantization
+ commands:
+ # temporary install here since we need nightly, will move to requirements/test.in
+ # after torchao 0.12 release, and pin a working version of torchao nightly here
+
+ # since torchao nightly is only compatible with torch nightly currently
+ # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
+ # we can only upgrade after this is resolved
+ # TODO(jerryzh168): resolve the above comment
+ - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+ - uv pip install --system conch-triton-kernels
+ - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
+
+- label: Quantized MoE Test (B200)
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ source_file_dependencies:
+ - tests/quantization/test_blackwell_moe.py
+ - vllm/model_executor/models/deepseek_v2.py
+ - vllm/model_executor/models/gpt_oss.py
+ - vllm/model_executor/models/llama4.py
+ - vllm/model_executor/layers/fused_moe
+ - vllm/model_executor/layers/quantization/compressed_tensors
+ - vllm/model_executor/layers/quantization/modelopt.py
+ - vllm/model_executor/layers/quantization/mxfp4.py
+ - vllm/v1/attention/backends/flashinfer.py
+ commands:
+ - pytest -s -v tests/quantization/test_blackwell_moe.py
+
+- label: Quantized Models Test
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - vllm/model_executor/layers/quantization
+ - tests/models/quantization
+ commands:
+ - pytest -v -s models/quantization
diff --git a/.buildkite/test_areas/samplers.yaml b/.buildkite/test_areas/samplers.yaml
new file mode 100644
index 0000000000000..ad377148fd073
--- /dev/null
+++ b/.buildkite/test_areas/samplers.yaml
@@ -0,0 +1,14 @@
+group: Samplers
+depends_on:
+ - image-build
+steps:
+- label: Samplers Test
+ timeout_in_minutes: 75
+ source_file_dependencies:
+ - vllm/model_executor/layers
+ - vllm/sampling_metadata.py
+ - tests/samplers
+ - tests/conftest.py
+ commands:
+ - pytest -v -s samplers
+ - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
diff --git a/.buildkite/test_areas/tool_use.yaml b/.buildkite/test_areas/tool_use.yaml
new file mode 100644
index 0000000000000..69527a1214229
--- /dev/null
+++ b/.buildkite/test_areas/tool_use.yaml
@@ -0,0 +1,13 @@
+group: Tool use
+depends_on:
+ - image-build
+steps:
+- label: OpenAI-Compatible Tool Use
+ timeout_in_minutes: 35
+ mirror_hardwares: [amdexperimental]
+ fast_check: false
+ source_file_dependencies:
+ - vllm/
+ - tests/tool_use
+ commands:
+ - pytest -v -s tool_use
diff --git a/.buildkite/test_areas/weight_loading.yaml b/.buildkite/test_areas/weight_loading.yaml
new file mode 100644
index 0000000000000..cfc5bb20fe7ad
--- /dev/null
+++ b/.buildkite/test_areas/weight_loading.yaml
@@ -0,0 +1,25 @@
+group: Weight Loading
+depends_on:
+ - image-build
+steps:
+- label: Weight Loading Multiple GPU # 33min
+ timeout_in_minutes: 45
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/weight_loading
+ commands:
+ - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+
+- label: Weight Loading Multiple GPU - Large Models # optional
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ gpu: a100
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/weight_loading
+ commands:
+ - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index ecb10d1a450f3..d6447649cd89a 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -146,10 +146,10 @@ mkdocs.yaml @hmellor
/requirements/kv_connectors.txt @NickLucche
# Pooling models
-/examples/*/pooling/ @noooop
+/examples/pooling @noooop
/tests/models/*/pooling* @noooop
/tests/entrypoints/pooling @noooop
-/vllm/entrypoints/pooling @aarnphm @chaunceyjiang @noooop
+/vllm/entrypoints/pooling @noooop
/vllm/config/pooler.py @noooop
/vllm/pooling_params.py @noooop
/vllm/model_executor/layers/pooler.py @noooop
diff --git a/.github/mergify.yml b/.github/mergify.yml
index 997a40e18e588..3ad79f93bc7ad 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -14,6 +14,52 @@ pull_request_rules:
comment:
message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
+- name: comment-pre-commit-failure
+ description: Comment on PR when pre-commit check fails
+ conditions:
+ - status-failure=pre-commit
+ - -closed
+ - -draft
+ actions:
+ comment:
+ message: |
+ Hi @{{author}}, the pre-commit checks have failed. Please run:
+
+ ```bash
+ uv pip install pre-commit
+ pre-commit install
+ pre-commit run --all-files
+ ```
+
+ Then, commit the changes and push to your branch.
+
+ For future commits, `pre-commit` will run automatically on changed files before each commit.
+
+ > [!TIP]
+ >
+ > Is mypy or markdownlint failing?
+ >
+ > mypy and markdownlint are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
+ >
+ > ```bash
+ > # For mypy (substitute "3.10" with the failing version if needed)
+ > pre-commit run --hook-stage manual mypy-3.10
+ > # For markdownlint
+ > pre-commit run --hook-stage manual markdownlint
+ > ```
+ >
+
+- name: comment-dco-failure
+ description: Comment on PR when DCO check fails
+ conditions:
+ - status-failure=dco
+ - -closed
+ - -draft
+ actions:
+ comment:
+ message: |
+ Hi @{{author}}, the DCO check has failed. Please click on DCO in the Checks section for instructions on how to resolve this.
+
- name: label-ci-build
description: Automatically apply ci/build label
conditions:
@@ -140,7 +186,7 @@ pull_request_rules:
- files~=^tests/entrypoints/test_context.py
- files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
- files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
- - files~=^vllm/entrypoints/harmony_utils.py
+ - files~=^vllm/entrypoints/openai/parser/harmony_utils.py
- files~=^vllm/entrypoints/tool_server.py
- files~=^vllm/entrypoints/tool.py
- files~=^vllm/entrypoints/context.py
@@ -358,4 +404,4 @@ pull_request_rules:
actions:
label:
add:
- - kv-connector
\ No newline at end of file
+ - kv-connector
diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml
index 861290ea43c87..df8910837715d 100644
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@@ -13,10 +13,10 @@ jobs:
steps:
- name: Checkout repository
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+ uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
- name: Set up Python
- uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+ uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with:
python-version: '3.12'
diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml
index 3a12c4b3a8300..e80a5c0cc80f9 100644
--- a/.github/workflows/macos-smoke-test.yml
+++ b/.github/workflows/macos-smoke-test.yml
@@ -12,7 +12,7 @@ jobs:
timeout-minutes: 30
steps:
- - uses: actions/checkout@v6
+ - uses: actions/checkout@v6.0.1
- uses: astral-sh/setup-uv@v7
with:
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index d5e70f30ef638..1041653c2f57e 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -16,8 +16,8 @@ jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+ - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+ - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with:
python-version: "3.12"
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index dca3089f496c9..44bf71db5e9de 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -7,13 +7,15 @@ on:
jobs:
close-issues-and-pull-requests:
+ # Prevents triggering on forks or other repos
+ if: github.repository == 'vllm-project/vllm'
permissions:
issues: write
pull-requests: write
actions: write
runs-on: ubuntu-latest
steps:
- - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
+ - uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
with:
# Increasing this value ensures that changes to this workflow
# propagate to all issues and PRs in days rather than months
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e09972fe71995..cd52df86e0346 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -384,7 +384,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
- PYTHONPATH=$PYTHONPATH
+ PYTHONPATH=$ENV{PYTHONPATH}
${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
RESULT_VARIABLE marlin_generation_result
OUTPUT_VARIABLE marlin_generation_result
@@ -822,7 +822,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
- PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
+ PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$ENV{PYTHONPATH}
${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
RESULT_VARIABLE machete_generation_result
OUTPUT_VARIABLE machete_generation_output
@@ -874,7 +874,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
set(SRCS
- "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")
+ "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu"
+ "csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu"
+ "csrc/quantization/cutlass_w4a8/w4a8_utils.cu"
+ )
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@@ -944,7 +947,6 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp"
"csrc/moe/moe_align_sum_kernels.cu"
- "csrc/moe/moe_lora_align_sum_kernels.cu"
"csrc/moe/topk_softmax_kernels.cu")
if(VLLM_GPU_LANG STREQUAL "CUDA")
@@ -1002,7 +1004,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
- PYTHONPATH=$PYTHONPATH
+ PYTHONPATH=$ENV{PYTHONPATH}
${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
RESULT_VARIABLE moe_marlin_generation_result
OUTPUT_VARIABLE moe_marlin_generation_output
diff --git a/README.md b/README.md
index abbb63158f166..26222b815370d 100644
--- a/README.md
+++ b/README.md
@@ -137,16 +137,19 @@ Compute Resources:
- Alibaba Cloud
- AMD
- Anyscale
+- Arm
- AWS
- Crusoe Cloud
- Databricks
- DeepInfra
- Google Cloud
+- IBM
- Intel
- Lambda Lab
- Nebius
- Novita AI
- NVIDIA
+- Red Hat
- Replicate
- Roblox
- RunPod
diff --git a/benchmarks/auto_tune/README.md b/benchmarks/auto_tune/README.md
index d1bdb4c43f10b..9a9600e08dafe 100644
--- a/benchmarks/auto_tune/README.md
+++ b/benchmarks/auto_tune/README.md
@@ -83,7 +83,7 @@ MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
```
-#### 2. Maximize Throughput with a Latency Requirement
+### 2. Maximize Throughput with a Latency Requirement
- **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms.
- **Configuration**:
@@ -96,7 +96,7 @@ MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=500
```
-#### 3. Maximize Throughput with Prefix Caching and Latency Requirements
+### 3. Maximize Throughput with Prefix Caching and Latency Requirements
- **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms.
- **Configuration**:
diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh
index 56b721cbb4021..a245e2022e605 100644
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@@ -18,6 +18,11 @@ MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
+HOSTNAME=$(hostname)
+if [[ -z "$HOSTNAME" ]]; then
+ echo "Error: Failed to determine hostname." >&2
+ exit 1
+fi
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"
@@ -82,6 +87,7 @@ start_server() {
"$MODEL"
"--disable-log-requests"
"--port" "8004"
+ "--host" "$HOSTNAME"
"--gpu-memory-utilization" "$gpu_memory_utilization"
"--max-num-seqs" "$max_num_seqs"
"--max-num-batched-tokens" "$max_num_batched_tokens"
@@ -96,8 +102,9 @@ start_server() {
# This correctly passes each element as a separate argument.
if [[ -n "$profile_dir" ]]; then
# Start server with profiling enabled
- VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
- vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
+ local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}"
+ VLLM_SERVER_DEV_MODE=1 \
+ vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 &
else
# Start server without profiling
VLLM_SERVER_DEV_MODE=1 \
@@ -112,7 +119,7 @@ start_server() {
# since that we should always have permission to send signal to the server process.
kill -0 $server_pid 2> /dev/null || break
- RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
+ RESPONSE=$(curl -s -X GET "http://${HOSTNAME}:8004/health" -w "%{http_code}" -o /dev/stdout)
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
if [[ "$STATUS_CODE" -eq 200 ]]; then
server_started=1
@@ -172,6 +179,7 @@ run_benchmark() {
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 1000 \
--random-prefix-len $prefix_len \
+ --host "$HOSTNAME" \
--port 8004 &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
@@ -187,7 +195,7 @@ run_benchmark() {
request_rate=$((${throughput%.*} + 1))
while ((request_rate > 0)); do
# clear prefix cache
- curl -X POST http://0.0.0.0:8004/reset_prefix_cache
+ curl -X POST http://${HOSTNAME}:8004/reset_prefix_cache
sleep 5
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
vllm bench serve \
@@ -203,6 +211,7 @@ run_benchmark() {
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--random-prefix-len $prefix_len \
+ --host "$HOSTNAME" \
--port 8004 &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
@@ -303,6 +312,7 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--random-prefix-len $prefix_len \
+ --host "$HOSTNAME" \
--port 8004 \
--profile &> "$bm_log"
else
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index d69d74ca61f54..831b76b66e096 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -620,7 +620,7 @@ def get_tokenizer(
kwargs["use_fast"] = False
if tokenizer_mode == "mistral":
try:
- from vllm.tokenizers import MistralTokenizer
+ from vllm.tokenizers.mistral import MistralTokenizer
except ImportError as e:
raise ImportError(
"MistralTokenizer requires vllm package.\n"
diff --git a/benchmarks/benchmark_hash.py b/benchmarks/benchmark_hash.py
new file mode 100644
index 0000000000000..08cdc012d6527
--- /dev/null
+++ b/benchmarks/benchmark_hash.py
@@ -0,0 +1,120 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Micro benchmark comparing built-in hash(), SHA-256, and xxHash.
+
+This focuses on a single test payload shaped like the prefix-cache hash input:
+ (32-byte bytes object, 32-int tuple)
+
+Usage:
+ python benchmarks/benchmark_hash.py --iterations 20000
+"""
+
+from __future__ import annotations
+
+import argparse
+import random
+import statistics
+import time
+from collections.abc import Callable, Iterable
+
+from vllm.utils.hashing import sha256, xxhash
+
+
+def _generate_test_data(seed: int) -> tuple[bytes, tuple[int, ...]]:
+ """Generate a deterministic test payload."""
+ random.seed(seed)
+ bytes_data = bytes(random.getrandbits(8) for _ in range(32))
+ int_tuple = tuple(random.randint(1, 1_000_000) for _ in range(32))
+ return (bytes_data, int_tuple)
+
+
+def _benchmark_func(func: Callable[[tuple], object], data: tuple, iterations: int):
+ """Return (avg_seconds, std_seconds) for hashing `data` `iterations` times."""
+ times: list[float] = []
+
+ # Warm-up to avoid first-run noise.
+ for _ in range(200):
+ func(data)
+
+ for _ in range(iterations):
+ start = time.perf_counter()
+ func(data)
+ end = time.perf_counter()
+ times.append(end - start)
+
+ avg = statistics.mean(times)
+ std = statistics.stdev(times) if len(times) > 1 else 0.0
+ return avg, std
+
+
+def _run_benchmarks(
+ benchmarks: Iterable[tuple[str, Callable[[tuple], object]]],
+ data: tuple,
+ iterations: int,
+):
+ """Yield (name, avg, std) for each benchmark, skipping unavailable ones."""
+ for name, func in benchmarks:
+ try:
+ avg, std = _benchmark_func(func, data, iterations)
+ except ModuleNotFoundError as exc:
+ print(f"Skipping {name}: {exc}")
+ continue
+ yield name, avg, std
+
+
+def builtin_hash(data: tuple) -> int:
+ """Wrapper for Python's built-in hash()."""
+ return hash(data)
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument(
+ "--iterations",
+ type=int,
+ default=10_000,
+ help="Number of measured iterations per hash function.",
+ )
+ parser.add_argument(
+ "--seed", type=int, default=42, help="Random seed for test payload."
+ )
+ args = parser.parse_args()
+
+ data = _generate_test_data(args.seed)
+ benchmarks = (
+ ("SHA256 (pickle)", sha256),
+ ("xxHash (pickle)", xxhash),
+ ("built-in hash()", builtin_hash),
+ )
+
+ print("=" * 60)
+ print("HASH FUNCTION MICRO BENCHMARK")
+ print("=" * 60)
+ print("Test data: (32-byte bytes object, 32-int tuple)")
+ print(f"Iterations: {args.iterations:,}")
+ print("=" * 60)
+
+ results = list(_run_benchmarks(benchmarks, data, args.iterations))
+ builtin_entry = next((r for r in results if r[0] == "built-in hash()"), None)
+
+ print("\nResults:")
+ for name, avg, std in results:
+ print(f" {name:16s}: {avg * 1e6:8.2f} ± {std * 1e6:6.2f} μs")
+
+ if builtin_entry:
+ _, builtin_avg, _ = builtin_entry
+ print("\n" + "=" * 60)
+ print("SUMMARY (relative to built-in hash())")
+ print("=" * 60)
+ for name, avg, _ in results:
+ if name == "built-in hash()":
+ continue
+ speed_ratio = avg / builtin_avg
+ print(f"• {name} is {speed_ratio:.1f}x slower than built-in hash()")
+ else:
+ print("\nBuilt-in hash() result missing; cannot compute speed ratios.")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py
index dedb564fffac8..b5373d383b548 100644
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@@ -32,12 +32,11 @@ def benchmark_propose(args):
model_config = ModelConfig(
model="facebook/opt-125m",
- task="generate",
max_model_len=args.num_token + args.num_spec_token,
tokenizer="facebook/opt-125m",
tokenizer_mode="auto",
dtype="auto",
- seed=None,
+ seed=0,
trust_remote_code=False,
)
proposer = NgramProposer(
@@ -108,7 +107,10 @@ def benchmark_batched_propose(args):
device_config=DeviceConfig(device=current_platform.device_type),
parallel_config=ParallelConfig(),
load_config=LoadConfig(),
- scheduler_config=SchedulerConfig(),
+ scheduler_config=SchedulerConfig(
+ max_model_len=model_config.max_model_len,
+ is_encoder_decoder=model_config.is_encoder_decoder,
+ ),
)
# monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
diff --git a/benchmarks/benchmark_prefix_block_hash.py b/benchmarks/benchmark_prefix_block_hash.py
new file mode 100644
index 0000000000000..8bcd8af0d3102
--- /dev/null
+++ b/benchmarks/benchmark_prefix_block_hash.py
@@ -0,0 +1,110 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Simple benchmark to compare prefix-cache block hashing algorithms.
+
+Example:
+ python benchmark_prefix_block_hash.py --num-blocks 20000 --block-size 32
+"""
+
+from __future__ import annotations
+
+import argparse
+import random
+import statistics
+import sys
+import time
+from collections.abc import Callable, Iterable, Sequence
+
+from vllm.utils.hashing import get_hash_fn_by_name
+from vllm.v1.core.kv_cache_utils import BlockHash, hash_block_tokens, init_none_hash
+
+SUPPORTED_ALGOS = ("sha256", "sha256_cbor", "xxhash", "xxhash_cbor")
+
+
+def _generate_blocks(
+ num_blocks: int, block_size: int, vocab_size: int, seed: int
+) -> list[list[int]]:
+ rng = random.Random(seed)
+ return [
+ [rng.randrange(vocab_size) for _ in range(block_size)]
+ for _ in range(num_blocks)
+ ]
+
+
+def _hash_all_blocks(
+ hash_fn: Callable[[object], bytes],
+ blocks: Iterable[Sequence[int]],
+) -> float:
+ parent_hash: BlockHash | None = None
+ start = time.perf_counter()
+ for block in blocks:
+ parent_hash = hash_block_tokens(hash_fn, parent_hash, block, extra_keys=None)
+ end = time.perf_counter()
+ return end - start
+
+
+def _benchmark(
+ hash_algo: str,
+ blocks: list[list[int]],
+ trials: int,
+) -> tuple[float, float, float] | None:
+ try:
+ hash_fn = get_hash_fn_by_name(hash_algo)
+ init_none_hash(hash_fn)
+ timings = [_hash_all_blocks(hash_fn, blocks) for _ in range(trials)]
+ except ModuleNotFoundError as exc:
+ print(f"Skipping {hash_algo}: {exc}", file=sys.stderr)
+ return None
+
+ avg = statistics.mean(timings)
+ best = min(timings)
+ # throughput: tokens / second
+ tokens_hashed = len(blocks) * len(blocks[0])
+ throughput = tokens_hashed / best
+ return avg, best, throughput
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument("--num-blocks", type=int, default=10000, help="Block count.")
+ parser.add_argument("--block-size", type=int, default=32, help="Tokens per block.")
+ parser.add_argument(
+ "--vocab-size", type=int, default=32000, help="Token id range [0, vocab_size)."
+ )
+ parser.add_argument("--seed", type=int, default=0, help="Random seed.")
+ parser.add_argument(
+ "--trials", type=int, default=5, help="Number of timed trials per algorithm."
+ )
+ parser.add_argument(
+ "--algorithms",
+ nargs="+",
+ default=SUPPORTED_ALGOS,
+ choices=SUPPORTED_ALGOS,
+ help="Hash algorithms to benchmark.",
+ )
+ args = parser.parse_args()
+
+ blocks = _generate_blocks(
+ args.num_blocks, args.block_size, args.vocab_size, args.seed
+ )
+ print(
+ f"Benchmarking {len(args.algorithms)} algorithms on "
+ f"{args.num_blocks} blocks (block size={args.block_size})."
+ )
+
+ for algo in args.algorithms:
+ result = _benchmark(algo, blocks, args.trials)
+ if result is None:
+ continue
+
+ avg, best, throughput = result
+ print(
+ f"{algo:14s} avg: {avg:.6f}s best: {best:.6f}s "
+ f"throughput: {throughput / 1e6:.2f}M tokens/s"
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index 28fc383a318dd..e6391134ff932 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -40,7 +40,7 @@ from vllm.engine.arg_utils import EngineArgs
from vllm.utils.argparse_utils import FlexibleArgumentParser
try:
- from vllm.transformers_utils.tokenizer import get_tokenizer
+ from vllm.tokenizers import get_tokenizer
except ImportError:
from backend_request_func import get_tokenizer
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index 55001cf3722a0..33aca831883aa 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -46,7 +46,7 @@ from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
try:
- from vllm.transformers_utils.tokenizer import get_tokenizer
+ from vllm.tokenizers import get_tokenizer
except ImportError:
from backend_request_func import get_tokenizer
@@ -574,7 +574,7 @@ async def benchmark(
)
print(
"{:<40} {:<10.2f}".format(
- "Total Token throughput (tok/s):", metrics.total_token_throughput
+ "Total token throughput (tok/s):", metrics.total_token_throughput
)
)
@@ -963,8 +963,7 @@ def create_argument_parser():
parser.add_argument(
"--profile",
action="store_true",
- help="Use Torch Profiler. The endpoint must be launched with "
- "VLLM_TORCH_PROFILER_DIR to enable profiler.",
+ help="Use vLLM Profiling. --profiler-config must be provided on the server.",
)
parser.add_argument(
"--result-dir",
diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
index d809bf1db8cbc..fb3329975cee3 100644
--- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
+++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
@@ -14,6 +14,9 @@ from tqdm import tqdm
import vllm._custom_ops as ops
from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+ per_token_group_quant_fp8,
+)
@dataclass
@@ -22,6 +25,7 @@ class bench_params_t:
hidden_size: int
add_residual: bool
dtype: torch.dtype
+ group_size: list[int]
def description(self):
return (
@@ -29,6 +33,7 @@ class bench_params_t:
f"x D {self.hidden_size} "
f"x R {self.add_residual} "
f"x DT {self.dtype}"
+ f"x GS {self.group_size}"
)
@@ -38,10 +43,11 @@ def get_bench_params() -> list[bench_params_t]:
HIDDEN_SIZES = list(range(1024, 8129, 1024))
ADD_RESIDUAL = [True, False]
DTYPES = [torch.bfloat16, torch.float]
+ GROUP_SIZES = [[1, 64], [1, 128]]
- combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES)
+ combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES, GROUP_SIZES)
bench_params = list(
- map(lambda x: bench_params_t(x[0], x[1], x[2], x[3]), combinations)
+ map(lambda x: bench_params_t(x[0], x[1], x[2], x[3], x[4]), combinations)
)
return bench_params
@@ -52,6 +58,7 @@ def unfused_int8_impl(
x: torch.Tensor,
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
+ group_size: list[int],
):
# Norm
torch_out = None
@@ -69,6 +76,7 @@ def unfused_fp8_impl(
x: torch.Tensor,
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
+ group_size: list[int],
):
# Norm
torch_out = None
@@ -81,23 +89,63 @@ def unfused_fp8_impl(
torch_out, _ = ops.scaled_fp8_quant(torch_out)
+def unfused_groupwise_fp8_impl(
+ rms_norm_layer: RMSNorm,
+ x: torch.Tensor,
+ residual: torch.Tensor | None,
+ quant_dtype: torch.dtype,
+ group_size: list[int],
+):
+ # Norm
+ torch_out = None
+ if residual is None:
+ torch_out = rms_norm_layer.forward_cuda(x, residual)
+ else:
+ torch_out, _ = rms_norm_layer.forward_cuda(x, residual)
+
+ # Quant
+ torch_out, _ = per_token_group_quant_fp8(
+ torch_out, group_size=group_size[1], use_ue8m0=False
+ )
+
+
def fused_impl(
rms_norm_layer: RMSNorm, # this stores the weights
x: torch.Tensor,
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
+ group_size: list[int],
):
out, _ = ops.rms_norm_dynamic_per_token_quant(
x, rms_norm_layer.weight, 1e-6, quant_dtype, residual=residual
)
+def fused_groupwise_impl(
+ rms_norm_layer: RMSNorm, # this stores the weights
+ x: torch.Tensor,
+ residual: torch.Tensor | None,
+ quant_dtype: torch.dtype,
+ group_size: list[int],
+):
+ out, _ = ops.rms_norm_per_block_quant(
+ x,
+ rms_norm_layer.weight,
+ 1e-6,
+ quant_dtype,
+ group_size,
+ residual=residual,
+ is_scale_transposed=True,
+ )
+
+
# Bench functions
def bench_fn(
rms_norm_layer: RMSNorm,
x: torch.Tensor,
residual: torch.Tensor,
quant_dtype: torch.dtype,
+ group_size: list[int],
label: str,
sub_label: str,
fn: Callable,
@@ -110,10 +158,11 @@ def bench_fn(
"x": x,
"residual": residual,
"quant_dtype": quant_dtype,
+ "group_size": group_size,
"fn": fn,
}
return TBenchmark.Timer(
- stmt="fn(rms_norm_layer, x, residual, quant_dtype)",
+ stmt="fn(rms_norm_layer, x, residual, quant_dtype, group_size)",
globals=globals,
label=label,
sub_label=sub_label,
@@ -147,6 +196,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
x,
residual,
torch.int8,
+ params.group_size,
label,
sub_label,
unfused_int8_impl,
@@ -161,6 +211,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
x,
residual,
torch.float8_e4m3fn,
+ params.group_size,
label,
sub_label,
unfused_fp8_impl,
@@ -175,6 +226,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
x,
residual,
torch.int8,
+ params.group_size,
label,
sub_label,
fused_impl,
@@ -189,6 +241,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
x,
residual,
torch.float8_e4m3fn,
+ params.group_size,
label,
sub_label,
fused_impl,
@@ -196,6 +249,36 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
)
)
+ # unfused groupwise fp8 impl.
+ timers.append(
+ bench_fn(
+ layer,
+ x,
+ residual,
+ torch.float8_e4m3fn,
+ params.group_size,
+ label,
+ sub_label,
+ unfused_groupwise_fp8_impl,
+ "unfused_groupwise_fp8_impl",
+ )
+ )
+
+ # fused groupwise fp8 impl.
+ timers.append(
+ bench_fn(
+ layer,
+ x,
+ residual,
+ torch.float8_e4m3fn,
+ params.group_size,
+ label,
+ sub_label,
+ fused_groupwise_impl,
+ "fused_groupwise_fp8_impl",
+ )
+ )
+
print_timers(timers)
return timers
diff --git a/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
new file mode 100644
index 0000000000000..04921dafbdbea
--- /dev/null
+++ b/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
@@ -0,0 +1,244 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from dataclasses import dataclass
+from enum import Enum
+from itertools import product
+from typing import Any
+
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+ _per_token_group_quant_fp8_colmajor,
+ silu_mul_per_token_group_quant_fp8_colmajor,
+)
+from vllm.triton_utils import triton
+from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
+
+from .utils import ArgPool, Bench, CudaGraphBenchParams
+
+GROUP_SIZE = 128
+FLOAT8_T = torch.float8_e4m3fn
+
+
+def print_timers(timers: list[TMeasurement], cuda_graph_nops: int):
+ print(
+ f"Note : The timings reported above is for {cuda_graph_nops} "
+ "consecutive invocations of the benchmarking functions. "
+ f"Please divide by {cuda_graph_nops} for single invocation "
+ "timings."
+ )
+ compare = TBenchmark.Compare(timers)
+ compare.print()
+
+
+class ImplType(Enum):
+ SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR = 1
+ REFERENCE = 2
+
+ def get_impl(self):
+ if self == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
+ return silu_mul_per_token_group_quant_fp8_colmajor
+ elif self == ImplType.REFERENCE:
+ return reference
+ raise ValueError(f"Unrecognized ImplType {self}")
+
+
+@dataclass
+class BenchmarkTensors:
+ input: torch.Tensor
+ output: torch.Tensor
+
+ # Reference act output tensor
+ ref_act_out: torch.Tensor
+ ref_quant_out: torch.Tensor
+
+ @staticmethod
+ def make(T: int, N: int) -> "BenchmarkTensors":
+ assert T % GROUP_SIZE == 0
+ assert N % (GROUP_SIZE * 2) == 0
+
+ input = torch.rand((T, N), dtype=torch.bfloat16, device="cuda")
+
+ # silu_mul_per_token_group_quant_fp8_colmajor output.
+ output = torch.rand((T, N // 2), dtype=torch.bfloat16, device="cuda").to(
+ FLOAT8_T
+ )
+
+ # reference output.
+ ref_act_out = torch.empty((T, N // 2), dtype=torch.bfloat16, device="cuda")
+ ref_quant_out = torch.empty(
+ (T, N // 2), dtype=torch.bfloat16, device="cuda"
+ ).to(FLOAT8_T)
+
+ return BenchmarkTensors(
+ input=input,
+ output=output,
+ ref_act_out=ref_act_out,
+ ref_quant_out=ref_quant_out,
+ )
+
+ @property
+ def T(self):
+ return self.input.size(0)
+
+ @property
+ def N(self):
+ return self.input.size(1)
+
+ def make_impl_kwargs(self, impl_type: ImplType) -> dict[str, Any]:
+ if impl_type == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
+ return {
+ "input": self.input,
+ "output": self.output,
+ "use_ue8m0": is_deep_gemm_e8m0_used(),
+ }
+ elif impl_type == ImplType.REFERENCE:
+ return {
+ "input": self.input,
+ "act_out": self.ref_act_out,
+ "quant_out": self.ref_quant_out,
+ "use_ue8m0": is_deep_gemm_e8m0_used(),
+ }
+ raise ValueError(f"Unrecognized impl_type {impl_type}")
+
+
+def reference_quant(x: torch.Tensor, quant_out: torch.Tensor, use_ue8m0: bool):
+ """
+ Reference triton quant kernel from,
+ vllm.model_executor.layers.quantization.utils.fp8_utils
+ """
+ assert quant_out.size() == x.size()
+ # Allocate the scale tensor column-major format.
+ shape = (x.shape[-1] // GROUP_SIZE,) + x.shape[:-1]
+ x_q = quant_out
+ x_s = torch.empty(shape, device=x.device, dtype=torch.float32).permute(-1, -2)
+
+ M = x.numel() // GROUP_SIZE
+ N = GROUP_SIZE
+ BLOCK = triton.next_power_of_2(N)
+ # heuristics for number of warps
+ num_warps = min(max(BLOCK // 256, 1), 8)
+ num_stages = 1
+
+ finfo = torch.finfo(FLOAT8_T)
+ fp8_min = finfo.min
+ fp8_max = finfo.max
+
+ _per_token_group_quant_fp8_colmajor[(M,)](
+ x,
+ x_q,
+ x_s,
+ GROUP_SIZE,
+ x.shape[1],
+ x.stride(0),
+ x_s.stride(1),
+ eps=1e-10,
+ fp8_min=fp8_min,
+ fp8_max=fp8_max,
+ use_ue8m0=use_ue8m0,
+ BLOCK=BLOCK,
+ num_warps=num_warps,
+ num_stages=num_stages,
+ )
+ return x_q, x_s
+
+
+def reference(
+ input: torch.Tensor,
+ act_out: torch.Tensor,
+ quant_out: torch.Tensor,
+ use_ue8m0: bool,
+) -> tuple[torch.Tensor, torch.Tensor]:
+ torch.ops._C.silu_and_mul(act_out, input)
+ return reference_quant(act_out, quant_out, use_ue8m0)
+
+
+def bench_impl(
+ bench_tensors: list[BenchmarkTensors], impl_type: ImplType
+) -> TMeasurement:
+ T = bench_tensors[0].T
+ N = bench_tensors[0].N
+
+ arg_pool_size = len(bench_tensors)
+ kwargs_list = [bt.make_impl_kwargs(impl_type) for bt in bench_tensors]
+
+ # warmup
+ for kwargs in kwargs_list:
+ impl_type.get_impl()(**kwargs)
+ torch.cuda.synchronize()
+
+ # Merge into a single kwargs and qualify arguments as ArgPool
+ kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
+ for _kwargs in kwargs_list:
+ for k, v in _kwargs.items():
+ kwargs[k].values.append(v)
+
+ cuda_graph_params = None
+ cuda_graph_params = CudaGraphBenchParams(arg_pool_size)
+ timer = None
+ with Bench(
+ cuda_graph_params,
+ "silu-mul-quant",
+ f"num_tokens={T}, N={N}",
+ impl_type.name,
+ impl_type.get_impl(),
+ **kwargs,
+ ) as bench:
+ timer = bench.run()
+ return timer
+
+
+def test_correctness(T: int, N: int):
+ print(f"Testing num_tokens={T}, N={N} ...")
+
+ bench_tensor = BenchmarkTensors.make(T, N)
+
+ def output_from_impl(impl: ImplType) -> tuple[torch.Tensor, torch.Tensor]:
+ return impl.get_impl()(**bench_tensor.make_impl_kwargs(impl))
+
+ # reference output
+ ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)
+
+ # test output
+ out_q, out_s = output_from_impl(
+ ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
+ )
+
+ torch.testing.assert_close(ref_out_q.to(torch.float32), out_q.to(torch.float32))
+ torch.testing.assert_close(ref_out_s, out_s)
+
+
+def run(Ts: list[int], Ns: list[int], arg_pool_size: int) -> list[TMeasurement]:
+ timers = []
+ for N, T in product(Ns, Ts):
+ test_correctness(T, N)
+
+ bench_tensors: list[BenchmarkTensors] = [
+ BenchmarkTensors.make(T, N) for _ in range(arg_pool_size)
+ ]
+
+ silu_mul_quant_timer = bench_impl(
+ bench_tensors, ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
+ )
+ timers.append(silu_mul_quant_timer)
+ reference_timer = bench_impl(bench_tensors, ImplType.REFERENCE)
+ timers.append(reference_timer)
+
+ print_timers(
+ [silu_mul_quant_timer, reference_timer], cuda_graph_nops=arg_pool_size
+ )
+
+ print_timers(timers, cuda_graph_nops=arg_pool_size)
+
+ return timers
+
+
+if __name__ == "__main__":
+ T = [128 * i for i in range(1, 16)] + [2048 * i for i in range(1, 65)]
+ N = [2048, 4096, 8192]
+
+ print(f"T = {T}, N = {N}")
+ run(T, N, arg_pool_size=8)
diff --git a/benchmarks/kernels/benchmark_mla_k_concat.py b/benchmarks/kernels/benchmark_mla_k_concat.py
new file mode 100644
index 0000000000000..fb3b6c8f12003
--- /dev/null
+++ b/benchmarks/kernels/benchmark_mla_k_concat.py
@@ -0,0 +1,150 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Benchmark script comparing torch.cat vs direct copy for k_nope/k_pe concatenation
+in MLA (Multi-head Latent Attention) prefill.
+
+This validates that the optimization from commit 8d4142bd is beneficial across
+various batch sizes, not just the originally tested batch size of 32768.
+"""
+
+import time
+from collections.abc import Callable
+
+import torch
+
+# DeepSeek-V3 MLA dimensions
+# run_benchmark() uses them as k_nope=[B, NUM_HEADS, QK_NOPE_HEAD_DIM] and
+# k_pe=[B, 1, PE_DIM].
+NUM_HEADS = 128
+QK_NOPE_HEAD_DIM = 128
+PE_DIM = 64
+
+
+def cat_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
+ """Original torch.cat approach with expand.
+
+ k_pe is expanded to k_nope's leading dimensions (its last dimension is
+ kept via -1) and the two tensors are concatenated along the last
+ dimension, with k_nope first.
+ """
+ return torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
+
+
+def direct_copy_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
+ """Optimized direct copy approach (avoids expand + cat overhead).
+
+ Allocates an output with k_nope's leading dimensions, dtype and device,
+ whose last dimension is the sum of both inputs' last dimensions, then
+ fills it with two slice assignments. Returns the filled tensor.
+ """
+ # torch.empty leaves the buffer uninitialized; the two slice assignments
+ # below together cover the whole last dimension.
+ k = torch.empty(
+ (*k_nope.shape[:-1], k_nope.shape[-1] + k_pe.shape[-1]),
+ dtype=k_nope.dtype,
+ device=k_nope.device,
+ )
+ k[..., : k_nope.shape[-1]] = k_nope
+ # In this benchmark k_pe is [B, 1, PE_DIM], so this assignment relies on
+ # broadcasting across the head dimension of k.
+ k[..., k_nope.shape[-1] :] = k_pe
+ return k
+
+
+def benchmark_method(
+ method: Callable,
+ k_nope: torch.Tensor,
+ k_pe: torch.Tensor,
+ num_warmup: int = 10,
+ num_iters: int = 100,
+) -> float:
+ """Benchmark a concatenation method and return mean latency in ms.
+
+ Args:
+ method: Callable invoked as method(k_nope, k_pe); its result is discarded.
+ k_nope: First tensor passed to method.
+ k_pe: Second tensor passed to method.
+ num_warmup: Number of untimed calls made before measuring.
+ num_iters: Number of timed calls; the elapsed time is divided by this.
+
+ Returns:
+ Elapsed host wall-clock time per call, in milliseconds.
+ """
+ # Warmup
+ for _ in range(num_warmup):
+ _ = method(k_nope, k_pe)
+ torch.cuda.synchronize()
+
+ # Benchmark
+ # time.perf_counter() measures host time; torch.cuda.synchronize() runs
+ # before `end` is read so that outstanding CUDA work is waited for.
+ start = time.perf_counter()
+ for _ in range(num_iters):
+ _ = method(k_nope, k_pe)
+ torch.cuda.synchronize()
+ end = time.perf_counter()
+
+ return (end - start) / num_iters * 1000 # Convert to ms
+
+
+@torch.inference_mode()
+def run_benchmark(dtype: torch.dtype, dtype_name: str):
+ """Run benchmark for a specific dtype.
+
+ Times cat_method and direct_copy_method with benchmark_method over a fixed
+ list of batch sizes, prints a result table, summary statistics and a short
+ conclusion, and returns the collected rows.
+
+ Args:
+ dtype: dtype the randomly generated inputs are converted to.
+ dtype_name: Label printed in the report header.
+
+ Returns:
+ A list of (batch_size, cat_time, direct_time, speedup, reduction)
+ tuples, one per batch size, with times in milliseconds.
+ """
+ # Changes torch's default device to CUDA; this call is not undone here.
+ torch.set_default_device("cuda")
+
+ # Batch sizes to test (powers of 2 from 32 to 65536)
+ batch_sizes = [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536]
+
+ print("=" * 80)
+ print("Benchmark: torch.cat vs direct copy for MLA k_nope/k_pe concatenation")
+ print("=" * 80)
+ print(
+ f"Tensor shapes: k_nope=[B, {NUM_HEADS}, {QK_NOPE_HEAD_DIM}], "
+ f"k_pe=[B, 1, {PE_DIM}]"
+ )
+ print(f"dtype: {dtype_name}")
+ print()
+ print(
+ f"{'Batch Size':>12} | {'cat (ms)':>10} | {'direct (ms)':>12} | "
+ f"{'Speedup':>8} | {'Reduction':>10}"
+ )
+ print("-" * 70)
+
+ results = []
+ for batch_size in batch_sizes:
+ # Create input tensors (generate in float32 then convert for FP8 compatibility)
+ k_nope = torch.randn(
+ batch_size, NUM_HEADS, QK_NOPE_HEAD_DIM, dtype=torch.float32, device="cuda"
+ ).to(dtype)
+ k_pe = torch.randn(
+ batch_size, 1, PE_DIM, dtype=torch.float32, device="cuda"
+ ).to(dtype)
+
+ # Benchmark both methods
+ cat_time = benchmark_method(cat_method, k_nope, k_pe)
+ direct_time = benchmark_method(direct_copy_method, k_nope, k_pe)
+
+ # speedup > 1 means direct copy was faster than torch.cat; reduction is
+ # the latency saved relative to torch.cat, as a percentage.
+ speedup = cat_time / direct_time
+ reduction = (1 - direct_time / cat_time) * 100
+
+ results.append((batch_size, cat_time, direct_time, speedup, reduction))
+
+ print(
+ f"{batch_size:>12} | {cat_time:>10.3f} | {direct_time:>12.3f} | "
+ f"{speedup:>7.2f}x | {reduction:>9.1f}%"
+ )
+
+ print("=" * 80)
+
+ # Summary statistics
+ speedups = [r[3] for r in results]
+ print("\nSpeedup summary:")
+ print(f" Min: {min(speedups):.2f}x")
+ print(f" Max: {max(speedups):.2f}x")
+ print(f" Mean: {sum(speedups) / len(speedups):.2f}x")
+
+ # Find crossover point: the first row (results follow the ascending
+ # batch_sizes order) whose speedup is at least 1.0. Stays None if no
+ # batch size reaches that.
+ crossover_batch = None
+ for batch_size, _, _, speedup, _ in results:
+ if speedup >= 1.0:
+ crossover_batch = batch_size
+ break
+
+ print("\nConclusion:")
+ if crossover_batch:
+ print(f" - Direct copy becomes beneficial at batch size >= {crossover_batch}")
+ # Filter for large batches (>= 512 which is typical for prefill)
+ large_batch_speedups = [r[3] for r in results if r[0] >= 512]
+ if large_batch_speedups:
+ avg_large = sum(large_batch_speedups) / len(large_batch_speedups)
+ print(f" - For batch sizes >= 512: avg speedup = {avg_large:.2f}x")
+ print(" - MLA prefill typically uses large batches, so optimization is effective")
+
+ return results
+
+
+@torch.inference_mode()
+def main():
+ """Run run_benchmark for bfloat16 and then for float8_e4m3fn.
+
+ A blank separator is printed before each run; the rows returned by
+ run_benchmark are not used here.
+ """
+ # Test bfloat16
+ print("\n")
+ run_benchmark(torch.bfloat16, "bfloat16")
+
+ # Test float8_e4m3fn
+ print("\n")
+ run_benchmark(torch.float8_e4m3fn, "float8_e4m3fn")
+
+
+# Script entry point: run both benchmarks when executed directly.
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/kernels/benchmark_moe_align_block_size.py b/benchmarks/kernels/benchmark_moe_align_block_size.py
index f540cff6261a8..5f9a131f79b0e 100644
--- a/benchmarks/kernels/benchmark_moe_align_block_size.py
+++ b/benchmarks/kernels/benchmark_moe_align_block_size.py
@@ -24,12 +24,15 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
num_tokens_range = [1, 16, 256, 4096]
num_experts_range = [16, 64, 224, 256, 280, 512]
topk_range = [1, 2, 8]
-configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
+ep_size_range = [1, 8]
+configs = list(
+ itertools.product(num_tokens_range, num_experts_range, topk_range, ep_size_range)
+)
@triton.testing.perf_report(
triton.testing.Benchmark(
- x_names=["num_tokens", "num_experts", "topk"],
+ x_names=["num_tokens", "num_experts", "topk", "ep_size"],
x_vals=configs,
line_arg="provider",
line_vals=["vllm"],
@@ -38,16 +41,26 @@ configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range
args={},
)
)
-def benchmark(num_tokens, num_experts, topk, provider):
+def benchmark(num_tokens, num_experts, topk, ep_size, provider):
"""Benchmark function for Triton."""
block_size = 256
+ torch.cuda.manual_seed_all(0)
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
+ e_map = None
+ if ep_size != 1:
+ local_e = num_experts // ep_size
+ e_ids = torch.randperm(num_experts, device="cuda", dtype=torch.int32)[:local_e]
+ e_map = torch.full((num_experts,), -1, device="cuda", dtype=torch.int32)
+ e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32)
+
quantiles = [0.5, 0.2, 0.8]
if provider == "vllm":
ms, min_ms, max_ms = triton.testing.do_bench(
- lambda: moe_align_block_size(topk_ids, block_size, num_experts),
+ lambda: moe_align_block_size(
+ topk_ids, block_size, num_experts, e_map, ignore_invalid_experts=True
+ ),
quantiles=quantiles,
)
diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py
index 83bd91917508f..09de5fa822f86 100644
--- a/benchmarks/kernels/benchmark_mrope.py
+++ b/benchmarks/kernels/benchmark_mrope.py
@@ -99,7 +99,6 @@ def benchmark_mrope(
# the parameters to compute the q k v size based on tp_size
mrope_helper_class = get_rope(
head_size=head_dim,
- rotary_dim=head_dim,
max_position=max_position,
is_neox_style=is_neox_style,
rope_parameters=rope_parameters,
diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py
index 074b7a440b612..7a1bc050bb33f 100644
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -32,8 +32,8 @@ def get_benchmark(head_size, rotary_dim, is_neox_style, device):
def benchmark(batch_size, seq_len, num_heads, provider):
dtype = torch.bfloat16
max_position = 8192
- base = 10000
- rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
+ rope_parameters = {"partial_rotary_factor": rotary_dim / head_size}
+ rope = get_rope(head_size, max_position, is_neox_style, rope_parameters)
rope = rope.to(dtype=dtype, device=device)
cos_sin_cache = rope.cos_sin_cache.to(dtype=torch.float, device=device)
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index fbbb03c5ed465..85b286f8d8d0a 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -251,17 +251,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
endif()
# Build ACL with CMake
- set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF")
- set(CMAKE_BUILD_TYPE "Release")
- set(ARM_COMPUTE_ARCH "armv8.2-a")
- set(ARM_COMPUTE_ENABLE_ASSERTS "OFF")
- set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF")
- set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
- set(ARM_COMPUTE_ENABLE_OPENMP "ON")
- set(ARM_COMPUTE_ENABLE_WERROR "OFF")
- set(ARM_COMPUTE_BUILD_EXAMPLES "OFF")
- set(ARM_COMPUTE_BUILD_TESTING "OFF")
-
set(_cmake_config_cmd
${CMAKE_COMMAND} -G Ninja -B build
-DARM_COMPUTE_BUILD_SHARED_LIB=OFF
diff --git a/cmake/external_projects/flashmla.cmake b/cmake/external_projects/flashmla.cmake
index 2cf3c1a755d3c..0d4f9b7aa07c8 100644
--- a/cmake/external_projects/flashmla.cmake
+++ b/cmake/external_projects/flashmla.cmake
@@ -35,16 +35,21 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
# sm90a
set(SUPPORT_ARCHS)
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3)
- list(APPEND SUPPORT_ARCHS 9.0a)
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3)
+ list(APPEND SUPPORT_ARCHS "9.0a")
endif()
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8)
- list(APPEND SUPPORT_ARCHS 10.0a)
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9)
+ # CUDA 12.9 has introduced "Family-Specific Architecture Features"
+ # this supports all compute_10x family
+ list(APPEND SUPPORT_ARCHS "10.0f")
+elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+ list(APPEND SUPPORT_ARCHS "10.0a")
endif()
cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
if(FLASH_MLA_ARCHS)
+ message(STATUS "FlashMLA CUDA architectures: ${FLASH_MLA_ARCHS}")
set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")
@@ -126,7 +131,8 @@ if(FLASH_MLA_ARCHS)
$<$:-UPy_LIMITED_API>
$<$:-UPy_LIMITED_API>)
else()
- # Create empty targets for setup.py when not targeting sm90a systems
+ message(STATUS "FlashMLA will not compile: unsupported CUDA architecture ${CUDA_ARCHS}")
+ # Create empty targets for setup.py on unsupported systems
add_custom_target(_flashmla_C)
add_custom_target(_flashmla_extension_C)
endif()
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 5047c354ff7d2..bdb2ba74d944d 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -140,16 +140,21 @@ function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR)
run_python(_VLLM_TORCH_GOMP_PATH
"
import os, glob
-try:
- import torch
- torch_pkg = os.path.dirname(torch.__file__)
- site_root = os.path.dirname(torch_pkg)
- torch_libs = os.path.join(site_root, 'torch.libs')
- print(glob.glob(os.path.join(torch_libs, 'libgomp-*.so*'))[0])
-except:
- print('')
+import torch
+torch_pkg = os.path.dirname(torch.__file__)
+site_root = os.path.dirname(torch_pkg)
+
+# Search both torch.libs and torch/lib
+roots = [os.path.join(site_root, 'torch.libs'), os.path.join(torch_pkg, 'lib')]
+candidates = []
+for root in roots:
+ if not os.path.isdir(root):
+ continue
+ candidates.extend(glob.glob(os.path.join(root, 'libgomp*.so*')))
+
+print(candidates[0] if candidates else '')
"
- "failed to probe torch.libs for libgomp")
+ "failed to probe for libgomp")
if(_VLLM_TORCH_GOMP_PATH STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}")
return()
diff --git a/csrc/cache.h b/csrc/cache.h
index f2a5ec0acf5cd..cbe44c09eb624 100644
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -1,6 +1,7 @@
#pragma once
#include
+#include
#include