diff --git a/.buildkite/ci_config.yaml b/.buildkite/ci_config.yaml
new file mode 100644
index 0000000000000..199c33159fde3
--- /dev/null
+++ b/.buildkite/ci_config.yaml
@@ -0,0 +1,24 @@
+name: vllm_ci
+job_dirs:
+ - ".buildkite/test_areas"
+ - ".buildkite/image_build"
+run_all_patterns:
+ - "docker/Dockerfile"
+ - "CMakeLists.txt"
+ - "requirements/common.txt"
+ - "requirements/cuda.txt"
+ - "requirements/build.txt"
+ - "requirements/test.txt"
+ - "setup.py"
+ - "csrc/"
+ - "cmake/"
+run_all_exclude_patterns:
+ - "docker/Dockerfile."
+ - "csrc/cpu/"
+ - "csrc/rocm/"
+ - "cmake/hipify.py"
+ - "cmake/cpu_extension.cmake"
+registries: public.ecr.aws/q9t5s3a7
+repositories:
+ main: "vllm-ci-postmerge-repo"
+ premerge: "vllm-ci-test-repo"
diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py
deleted file mode 100644
index bbed80ebe8476..0000000000000
--- a/.buildkite/generate_index.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import argparse
-import os
-
-template = """
-
-
- Links for vLLM
- {x86_wheel}
- {arm_wheel}
-
-
-"""
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--wheel", help="The wheel path.", required=True)
-args = parser.parse_args()
-
-filename = os.path.basename(args.wheel)
-
-with open("index.html", "w") as f:
- print(f"Generated index.html for {args.wheel}")
- # sync the abi tag with .buildkite/scripts/upload-wheels.sh
- if "x86_64" in filename:
- x86_wheel = filename
- arm_wheel = filename.replace("x86_64", "aarch64").replace(
- "manylinux1", "manylinux2014"
- )
- elif "aarch64" in filename:
- x86_wheel = filename.replace("aarch64", "x86_64").replace(
- "manylinux2014", "manylinux1"
- )
- arm_wheel = filename
- else:
- raise ValueError(f"Unsupported wheel: {filename}")
- # cloudfront requires escaping the '+' character
- f.write(
- template.format(
- x86_wheel=x86_wheel,
- x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
- arm_wheel=arm_wheel,
- arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
- )
- )
diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh
new file mode 100755
index 0000000000000..9a2384e524b63
--- /dev/null
+++ b/.buildkite/image_build/image_build.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 8 ]]; then
+ echo "Usage: $0 "
+ exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+BRANCH=$4
+VLLM_USE_PRECOMPILED=$5
+VLLM_MERGE_BASE_COMMIT=$6
+CACHE_FROM=$7
+CACHE_TO=$8
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
+
+# docker buildx
+docker buildx create --name vllm-builder --driver docker-container --use
+docker buildx inspect --bootstrap
+docker buildx ls
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
+ echo "Image not found, proceeding with build..."
+else
+ echo "Image found"
+ exit 0
+fi
+
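+# VLLM_MERGE_BASE_COMMIT only applies when building against precompiled wheels,
+# so forward the build arg only when VLLM_USE_PRECOMPILED=1.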
+if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
+ merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
+else
+ merge_base_commit_build_args=""
+fi
+
+# build
+docker buildx build --file docker/Dockerfile \
+ --build-arg max_jobs=16 \
+ --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+ --build-arg USE_SCCACHE=1 \
+ --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
+ --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
+ --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
+ ${merge_base_commit_build_args} \
+ --cache-from type=registry,ref=${CACHE_FROM},mode=max \
+ --cache-to type=registry,ref=${CACHE_TO},mode=max \
+ --tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
+ $( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
+ --push \
+ --target test \
+ --progress plain .
diff --git a/.buildkite/image_build/image_build.yaml b/.buildkite/image_build/image_build.yaml
new file mode 100644
index 0000000000000..d01c71dd9becf
--- /dev/null
+++ b/.buildkite/image_build/image_build.yaml
@@ -0,0 +1,57 @@
+group: Abuild
+steps:
+ - label: ":docker: Build image"
+ key: image-build
+ depends_on: []
+ commands:
+ - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
+ retry:
+ automatic:
+ - exit_status: -1 # Agent was lost
+ limit: 2
+ - exit_status: -10 # Agent was lost
+ limit: 2
+
+ - label: ":docker: Build CPU image"
+ key: image-build-cpu
+ depends_on: []
+ commands:
+ - .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+ env:
+ DOCKER_BUILDKIT: "1"
+ retry:
+ automatic:
+ - exit_status: -1 # Agent was lost
+ limit: 2
+ - exit_status: -10 # Agent was lost
+ limit: 2
+
+ - label: ":docker: Build HPU image"
+ soft_fail: true
+ depends_on: []
+ key: image-build-hpu
+ commands:
+ - .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+ env:
+ DOCKER_BUILDKIT: "1"
+ retry:
+ automatic:
+ - exit_status: -1 # Agent was lost
+ limit: 2
+ - exit_status: -10 # Agent was lost
+ limit: 2
+
+ - label: ":docker: Build CPU arm64 image"
+ key: cpu-arm64-image-build
+ depends_on: []
+ optional: true
+ commands:
+ - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+ env:
+ DOCKER_BUILDKIT: "1"
+ retry:
+ automatic:
+ - exit_status: -1 # Agent was lost
+ limit: 2
+ - exit_status: -10 # Agent was lost
+ limit: 2
diff --git a/.buildkite/image_build/image_build_cpu.sh b/.buildkite/image_build/image_build_cpu.sh
new file mode 100755
index 0000000000000..a69732f430985
--- /dev/null
+++ b/.buildkite/image_build/image_build_cpu.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 3 ]]; then
+ echo "Usage: $0 "
+ exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
+ echo "Image not found, proceeding with build..."
+else
+ echo "Image found"
+ exit 0
+fi
+
+# build
+docker build --file docker/Dockerfile.cpu \
+ --build-arg max_jobs=16 \
+ --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+ --build-arg VLLM_CPU_AVX512BF16=true \
+ --build-arg VLLM_CPU_AVX512VNNI=true \
+ --build-arg VLLM_CPU_AMXBF16=true \
+ --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
+ --target vllm-test \
+ --progress plain .
+
+# push
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
diff --git a/.buildkite/image_build/image_build_cpu_arm64.sh b/.buildkite/image_build/image_build_cpu_arm64.sh
new file mode 100755
index 0000000000000..615298b6555bd
--- /dev/null
+++ b/.buildkite/image_build/image_build_cpu_arm64.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 3 ]]; then
+ echo "Usage: $0 "
+ exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
+ echo "Image not found, proceeding with build..."
+else
+ echo "Image found"
+ exit 0
+fi
+
+# build
+docker build --file docker/Dockerfile.cpu \
+ --build-arg max_jobs=16 \
+ --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+ --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
+ --target vllm-test \
+ --progress plain .
+
+# push
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
diff --git a/.buildkite/image_build/image_build_hpu.sh b/.buildkite/image_build/image_build_hpu.sh
new file mode 100755
index 0000000000000..192447ef4577e
--- /dev/null
+++ b/.buildkite/image_build/image_build_hpu.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 3 ]]; then
+ echo "Usage: $0 "
+ exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
+ echo "Image not found, proceeding with build..."
+else
+ echo "Image found"
+ exit 0
+fi
+
+# build
+docker build \
+ --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
+ --build-arg max_jobs=16 \
+ --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+ --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
+ --progress plain \
+ https://github.com/vllm-project/vllm-gaudi.git
+
+# push
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
index 46f1a9fbf6ff9..6c0b5540cbb6a 100644
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
@@ -8,3 +8,4 @@ tasks:
value: 0.80
limit: 250 # will run on 250 * 14 subjects = 3500 samples
num_fewshot: 5
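+# Per-config tolerance; overrides DEFAULT_RTOL (0.08) in test_lm_eval_correctness.py.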
+rtol: 0.05
diff --git a/.buildkite/lm-eval-harness/configs/models-large-rocm.txt b/.buildkite/lm-eval-harness/configs/models-large-rocm.txt
new file mode 100644
index 0000000000000..4fb0b84bc4d81
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/models-large-rocm.txt
@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
index 3627b760eddcf..f94d681197d2d 100644
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -9,11 +9,40 @@ pytest -s -v test_lm_eval_correctness.py \
--tp-size=1
"""
+import os
+from contextlib import contextmanager
+
import lm_eval
import numpy as np
import yaml
-RTOL = 0.08
+DEFAULT_RTOL = 0.08
+
+
+@contextmanager
+def scoped_env_vars(new_env: dict[str, str]):
+ if not new_env:
+ # Fast path: nothing to do
+ yield
+ return
+
+ old_values = {}
+ new_keys = []
+
+ try:
+ for key, value in new_env.items():
+ if key in os.environ:
+ old_values[key] = os.environ[key]
+ else:
+ new_keys.append(key)
+ os.environ[key] = str(value)
+ yield
+ finally:
+ # Restore / clean up
+ for key, value in old_values.items():
+ os.environ[key] = value
+ for key in new_keys:
+ os.environ.pop(key, None)
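+
+# Example (illustrative): a model config may specify, e.g.,
+#   env_vars:
+#     VLLM_ATTENTION_BACKEND: "FLASHINFER"
+# and those variables are applied only around the lm_eval.simple_evaluate call below.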
def launch_lm_eval(eval_config, tp_size):
@@ -32,23 +61,26 @@ def launch_lm_eval(eval_config, tp_size):
f"trust_remote_code={trust_remote_code},"
f"max_model_len={max_model_len},"
)
- results = lm_eval.simple_evaluate(
- model=backend,
- model_args=model_args,
- tasks=[task["name"] for task in eval_config["tasks"]],
- num_fewshot=eval_config["num_fewshot"],
- limit=eval_config["limit"],
- # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
- # text models. however, this is regressing measured strict-match for
- # existing text models in CI, so only apply it for mm, or explicitly set
- apply_chat_template=eval_config.get(
- "apply_chat_template", backend == "vllm-vlm"
- ),
- fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
- # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
- gen_kwargs=eval_config.get("gen_kwargs"),
- batch_size=batch_size,
- )
+
+ env_vars = eval_config.get("env_vars", None)
+ with scoped_env_vars(env_vars):
+ results = lm_eval.simple_evaluate(
+ model=backend,
+ model_args=model_args,
+ tasks=[task["name"] for task in eval_config["tasks"]],
+ num_fewshot=eval_config["num_fewshot"],
+ limit=eval_config["limit"],
+ # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
+ # text models. however, this is regressing measured strict-match for
+ # existing text models in CI, so only apply it for mm, or explicitly set
+ apply_chat_template=eval_config.get(
+ "apply_chat_template", backend == "vllm-vlm"
+ ),
+ fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
+ # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
+ gen_kwargs=eval_config.get("gen_kwargs"),
+ batch_size=batch_size,
+ )
return results
@@ -57,6 +89,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
results = launch_lm_eval(eval_config, tp_size)
+ rtol = eval_config.get("rtol", DEFAULT_RTOL)
+
success = True
for task in eval_config["tasks"]:
for metric in task["metrics"]:
@@ -64,8 +98,9 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
measured_value = results["results"][task["name"]][metric["name"]]
print(
f"{task['name']} | {metric['name']}: "
- f"ground_truth={ground_truth} | measured={measured_value}"
+ f"ground_truth={ground_truth:.3f} | "
+ f"measured={measured_value:.3f} | rtol={rtol}"
)
- success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
+ success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
assert success
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index fbfc923998f89..a9d51557bd9bb 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -15,6 +15,21 @@ steps:
env:
DOCKER_BUILDKIT: "1"
+ - label: "Build arm64 wheel - CUDA 13.0"
+ depends_on: ~
+ id: build-wheel-arm64-cuda-13-0
+ agents:
+ queue: arm64_cpu_queue_postmerge
+ commands:
+ # NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
+ # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "mkdir artifacts"
+ - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+ - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+ env:
+ DOCKER_BUILDKIT: "1"
+
# aarch64 build
- label: "Build arm64 CPU wheel"
depends_on: ~
@@ -25,7 +40,7 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-wheels.sh"
+ - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
@@ -39,7 +54,7 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-wheels.sh"
+ - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31"
env:
DOCKER_BUILDKIT: "1"
@@ -52,7 +67,21 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-wheels.sh"
+ - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+ env:
+ DOCKER_BUILDKIT: "1"
+
+ # x86 CPU wheel build
+ - label: "Build x86 CPU wheel"
+ depends_on: ~
+ id: build-wheel-x86-cpu
+ agents:
+ queue: cpu_queue_postmerge
+ commands:
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
+ - "mkdir artifacts"
+ - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+ - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py
index 8d09ba178db7b..d0965fbd56405 100644
--- a/.buildkite/scripts/generate-nightly-index.py
+++ b/.buildkite/scripts/generate-nightly-index.py
@@ -7,18 +7,21 @@
import argparse
import json
-import re
import sys
from dataclasses import asdict, dataclass
+from datetime import datetime
from pathlib import Path
from typing import Any
from urllib.parse import quote
+import regex as re
+
if not sys.version_info >= (3, 12):
raise RuntimeError("This script requires Python 3.12 or higher.")
INDEX_HTML_TEMPLATE = """
+<!-- {comment} -->
{items}
@@ -89,7 +92,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:
)
-def generate_project_list(subdir_names: list[str]) -> str:
+def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
"""
Generate project list HTML content linking to each project & variant sub-directory.
"""
@@ -97,11 +100,14 @@ def generate_project_list(subdir_names: list[str]) -> str:
for name in sorted(subdir_names):
name = name.strip("/").strip(".")
 href_tags.append(f'    <a href="{name}/">{name}/</a><br/>')
- return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
+ return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
def generate_package_index_and_metadata(
- wheel_files: list[WheelFileInfo], wheel_base_dir: Path, index_base_dir: Path
+ wheel_files: list[WheelFileInfo],
+ wheel_base_dir: Path,
+ index_base_dir: Path,
+ comment: str = "",
) -> tuple[str, str]:
"""
Generate package index HTML content for a specific package, linking to actual wheel files.
@@ -119,7 +125,7 @@ def generate_package_index_and_metadata(
file_meta = asdict(file)
file_meta["path"] = file_path_quoted
metadata.append(file_meta)
- index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
+ index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
metadata_str = json.dumps(metadata, indent=2)
return index_str, metadata_str
@@ -130,6 +136,7 @@ def generate_index_and_metadata(
index_base_dir: Path,
default_variant: str | None = None,
alias_to_default: str | None = None,
+ comment: str = "",
):
"""
Generate index for all wheel files.
@@ -140,6 +147,7 @@ def generate_index_and_metadata(
index_base_dir (Path): Base directory to store index files.
default_variant (str | None): The default variant name, if any.
alias_to_default (str | None): Alias variant name for the default variant, if any.
+ comment (str | None): Optional comment to include in the generated HTML files.
First, parse all wheel files to extract metadata.
We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
@@ -233,6 +241,10 @@ def generate_index_and_metadata(
variant_to_files[alias_to_default] = variant_to_files["default"].copy()
print(f"Alias variant '{alias_to_default}' created for default variant.")
+ # Generate comment in HTML header
+ comment_str = f" ({comment})" if comment else ""
+ comment_tmpl = f"Generated on {datetime.now().isoformat()}{comment_str}"
+
# Generate index for each variant
subdir_names = set()
for variant, files in variant_to_files.items():
@@ -252,7 +264,7 @@ def generate_index_and_metadata(
subdir_names = subdir_names.union(packages)
else:
# generate project list for this variant directly
- project_list_str = generate_project_list(sorted(packages))
+ project_list_str = generate_project_list(sorted(packages), comment_tmpl)
with open(variant_dir / "index.html", "w") as f:
f.write(project_list_str)
@@ -262,7 +274,7 @@ def generate_index_and_metadata(
package_dir = variant_dir / package
package_dir.mkdir(parents=True, exist_ok=True)
index_str, metadata_str = generate_package_index_and_metadata(
- package_files, wheel_base_dir, package_dir
+ package_files, wheel_base_dir, package_dir, comment
)
with open(package_dir / "index.html", "w") as f:
f.write(index_str)
@@ -270,7 +282,7 @@ def generate_index_and_metadata(
f.write(metadata_str)
# Generate top-level project list index
- project_list_str = generate_project_list(sorted(subdir_names))
+ project_list_str = generate_project_list(sorted(subdir_names), comment_tmpl)
with open(index_base_dir / "index.html", "w") as f:
f.write(project_list_str)
@@ -282,6 +294,7 @@ if __name__ == "__main__":
--current-objects : path to JSON file containing current S3 objects listing in this version directory
--output-dir : directory to store generated index files
--alias-to-default : (optional) alias variant name for the default variant
+ --comment : (optional) comment string to include in generated HTML files
"""
parser = argparse.ArgumentParser(
@@ -311,6 +324,12 @@ if __name__ == "__main__":
default=None,
help="Alias variant name for the default variant",
)
+ parser.add_argument(
+ "--comment",
+ type=str,
+ default="",
+ help="Optional comment string to include in generated HTML files",
+ )
args = parser.parse_args()
@@ -353,6 +372,17 @@ if __name__ == "__main__":
print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
+ # keep only "official" files for a non-nightly version (specified by CLI args)
+ PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
+ if PY_VERSION_RE.match(version):
+ # upload-wheels.sh ensures no "dev" is in args.version
+ wheel_files = list(
+ filter(lambda x: version in x and "dev" not in x, wheel_files)
+ )
+ print(f"Non-nightly version detected, wheel files used: {wheel_files}")
+ else:
+ print("Nightly version detected, keeping all wheel files.")
+
# Generate index and metadata, assuming wheels and indices are stored as:
# s3://vllm-wheels/{version}/
# s3://vllm-wheels//
@@ -365,5 +395,6 @@ if __name__ == "__main__":
index_base_dir=index_base_dir,
default_variant=None,
alias_to_default=args.alias_to_default,
+ comment=args.comment.strip(),
)
print(f"Successfully generated index and metadata in {output_dir}")
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
index b5f6b2494792f..b6274d698d01a 100755
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
@@ -36,11 +36,17 @@ function cpu_tests() {
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+ # Run model tests
+ docker exec cpu-test bash -c "
+ set -e
+ pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model"
+
# Run kernel tests
docker exec cpu-test bash -c "
set -e
pytest -x -v -s tests/kernels/test_onednn.py
- pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"
+ pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
+ pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic"
# basic online serving
docker exec cpu-test bash -c '
diff --git a/.buildkite/scripts/hardware_ci/run-npu-test.sh b/.buildkite/scripts/hardware_ci/run-npu-test.sh
index 29c8f5ed5a91a..0db1abe37ba11 100644
--- a/.buildkite/scripts/hardware_ci/run-npu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-npu-test.sh
@@ -74,6 +74,7 @@ FROM ${BASE_IMAGE_NAME}
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
+ENV SOC_VERSION="ascend910b1"
RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
index 4d163399cfc6c..dfc9db512d1e9 100644
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -38,6 +38,7 @@ docker run \
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+ python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
cd tests
pytest -v -s v1/core
@@ -46,6 +47,6 @@ docker run \
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
- pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
+ pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
pytest -v -s v1/test_serial_utils.py
'
diff --git a/.buildkite/scripts/run-prime-rl-test.sh b/.buildkite/scripts/run-prime-rl-test.sh
index 5b25c358fc4aa..3fb7c82c8d333 100755
--- a/.buildkite/scripts/run-prime-rl-test.sh
+++ b/.buildkite/scripts/run-prime-rl-test.sh
@@ -12,6 +12,11 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
+if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
+ echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
+ exit 0
+fi
+
echo "Setting up Prime-RL integration test environment..."
# Clean up any existing Prime-RL directory
diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh
deleted file mode 100644
index d7167161b0059..0000000000000
--- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh
+++ /dev/null
@@ -1,73 +0,0 @@
-#!/usr/bin/env bash
-set -euxo pipefail
-
-# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
-THRESHOLD=${1:-0.25}
-NUM_Q=${2:-1319}
-PORT=${3:-8030}
-OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
-mkdir -p "${OUT_DIR}"
-
-wait_for_server() {
- local port=$1
- timeout 600 bash -c '
- until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
- sleep 1
- done'
-}
-
-MODEL="deepseek-ai/DeepSeek-V2-lite"
-
-# Set BACKENDS based on platform
-if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
- # ROCm platform
- BACKENDS=("allgather_reducescatter")
- # Disable MOE padding for ROCm since it is causing eplb to fail
- export VLLM_ROCM_MOE_PADDING=0
-else
- # Non-ROCm platform (CUDA/other)
- BACKENDS=("deepep_high_throughput" "deepep_low_latency")
-fi
-
-cleanup() {
- if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
- kill "${SERVER_PID}" 2>/dev/null || true
- for _ in {1..20}; do
- kill -0 "${SERVER_PID}" 2>/dev/null || break
- sleep 0.5
- done
- kill -9 "${SERVER_PID}" 2>/dev/null || true
- fi
-}
-trap cleanup EXIT
-
-for BACK in "${BACKENDS[@]}"; do
- VLLM_DEEP_GEMM_WARMUP=skip \
- VLLM_ALL2ALL_BACKEND=$BACK \
- vllm serve "$MODEL" \
- --enforce-eager \
- --tensor-parallel-size 2 \
- --data-parallel-size 2 \
- --enable-expert-parallel \
- --enable-eplb \
- --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
- --trust-remote-code \
- --max-model-len 2048 \
- --port $PORT &
- SERVER_PID=$!
- wait_for_server $PORT
-
- TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
- OUT="${OUT_DIR}/${TAG}_${BACK}_async_eplb.json"
- python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
- python3 - <<PY
-import json
-acc = json.load(open("${OUT}"))["accuracy"]
-assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
-PY
-
- cleanup
- SERVER_PID=
- sleep 1
- PORT=$((PORT+1))
-done
diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
index 693418da6093e..8106f50f18f66 100644
--- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
@@ -50,7 +50,6 @@ for BACK in "${BACKENDS[@]}"; do
--data-parallel-size 2 \
--enable-expert-parallel \
--enable-eplb \
- --eplb-config '{"window_size":200,"step_interval":600}' \
--trust-remote-code \
--max-model-len 2048 \
--port $PORT &
diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh
index 2eaa91c04086c..3a218a4bb2e6d 100644
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@@ -34,9 +34,10 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
fi
wheel="${wheel_files[0]}"
-# current build image uses ubuntu 20.04, which corresponds to manylinux_2_31
+# The default build image uses Ubuntu 20.04, which corresponds to manylinux_2_31.
+# The manylinux tag can also be passed as the first argument.
# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
-manylinux_version="manylinux_2_31"
+manylinux_version="${1:-manylinux_2_31}"
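+# callers may pass a different tag, e.g. release-pipeline.yaml passes manylinux_2_35
+# for builds on newer base images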
# Rename 'linux' to the appropriate manylinux version in the wheel filename
if [[ "$wheel" != *"linux"* ]]; then
@@ -81,7 +82,10 @@ else
alias_arg=""
fi
-$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg
+# HACK: we do not need the regex module here, but it is required by the pre-commit hook.
+# To avoid any external dependency, we simply swap it back to the stdlib re module.
+sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
+$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
# copy indices to // unconditionally
echo "Uploading indices to $S3_COMMIT_PREFIX"
@@ -93,8 +97,11 @@ if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]];
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
fi
-# copy to // only if it does not have "dev" in the version
+# re-generate and copy to // only if it does not have "dev" in the version
if [[ "$version" != *"dev"* ]]; then
- echo "Uploading indices to overwrite /$pure_version/"
+ echo "Re-generating indices for /$pure_version/"
+ rm -rf "$INDICES_OUTPUT_DIR"
+ mkdir -p "$INDICES_OUTPUT_DIR"
+ $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
fi
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index ee4fdebae5675..3c9b8cbedcf06 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -61,8 +61,8 @@ steps:
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_
-- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
- timeout_in_minutes: 20
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
grade: Blocking
@@ -73,6 +73,7 @@ steps:
- tests/multimodal
- tests/standalone_tests/lazy_imports.py
- tests/tokenizers_
+ - tests/tool_parsers
- tests/transformers_utils
- tests/config
no_gpu: true
@@ -82,6 +83,7 @@ steps:
- pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s tokenizers_
+ - pytest -v -s tool_parsers
- pytest -v -s transformers_utils
- pytest -v -s config
@@ -326,10 +328,10 @@ steps:
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
-- label: V1 Test e2e + engine # 30min
- timeout_in_minutes: 45
+- label: V1 Test e2e + engine # 65min
+ timeout_in_minutes: 90
mirror_hardwares: [amdexperimental]
- agent_pool: mi325_1
+ agent_pool: mi325_4
# grade: Blocking
source_file_dependencies:
- vllm/
@@ -398,7 +400,8 @@ steps:
timeout_in_minutes: 25
gpu: h100
source_file_dependencies:
- - vllm/
+ - vllm/v1/attention
+ - vllm/model_executor/layers
- tests/v1/determinism/
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -434,29 +437,34 @@ steps:
- label: Examples Test # 30min
timeout_in_minutes: 45
- mirror_hardwares: [amdexperimental]
+ mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
- vllm/entrypoints
+ - vllm/multimodal
- examples/
commands:
- pip install tensorizer # for tensorizer test
+ # for basic
+ - python3 offline_inference/basic/chat.py
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- - python3 offline_inference/basic/chat.py
- - python3 offline_inference/prefix_caching.py
- - python3 offline_inference/llm_engine_example.py
- - python3 offline_inference/audio_language.py --seed 0
- - python3 offline_inference/vision_language.py --seed 0
- - python3 offline_inference/vision_language_pooling.py --seed 0
- - python3 offline_inference/vision_language_multi_image.py --seed 0
- - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
+ # for multi-modal models
+ - python3 offline_inference/audio_language.py --seed 0
+ - python3 offline_inference/vision_language.py --seed 0
+ - python3 offline_inference/vision_language_multi_image.py --seed 0
+ - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+ # for pooling models
+ - python3 pooling/pooling/vision_language_pooling.py --seed 0
+ # for features demo
+ - python3 offline_inference/prefix_caching.py
+ - python3 offline_inference/llm_engine_example.py
+ - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@@ -718,14 +726,15 @@ steps:
- uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
-- label: LM Eval Small Models # 15min
- timeout_in_minutes: 20
- mirror_hardwares: [amdexperimental, amdproduction]
+- label: LM Eval Small Models # 53min
+ timeout_in_minutes: 75
+ mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
+ autorun_on_main: true
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
@@ -738,7 +747,7 @@ steps:
- csrc/
- vllm/entrypoints/openai/
- vllm/model_executor/models/whisper.py
- commands: # LMEval
+ commands: # LMEval+Transcription WER check
# Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
- pytest -s entrypoints/openai/correctness/
@@ -752,19 +761,7 @@ steps:
- vllm/
- tests/tool_use
commands:
- - pytest -v -s -m 'not cpu_test' tool_use
-
-- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
- mirror_hardwares: [amdexperimental, amdproduction]
- agent_pool: mi325_1
- # grade: Blocking
- timeout_in_minutes: 10
- source_file_dependencies:
- - vllm/
- - tests/tool_use
- no_gpu: true
- commands:
- - pytest -v -s -m 'cpu_test' tool_use
+ - pytest -v -s tool_use
##### models test #####
@@ -974,8 +971,8 @@ steps:
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
-- label: Multi-Modal Accuracy Eval (Small Models) # 10min
- timeout_in_minutes: 70
+- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min
+ timeout_in_minutes: 180
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
@@ -987,7 +984,8 @@ steps:
commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
-- label: Multi-Modal Models Test (Extended) 1
+- label: Multi-Modal Models Test (Extended) 1 # 60min
+ timeout_in_minutes: 120
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
@@ -1011,7 +1009,8 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
-- label: Multi-Modal Models Test (Extended) 3
+- label: Multi-Modal Models Test (Extended) 3 # 75min
+ timeout_in_minutes: 150
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
@@ -1120,7 +1119,6 @@ steps:
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- - vllm/model_executor/layers/fused_moe/layer.py
- tests/compile/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py
@@ -1154,17 +1152,15 @@ steps:
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/distributed/test_fusions_e2e.py
- - tests/compile/fullgraph/test_full_graph.py
commands:
- nvidia-smi
# Run all e2e fusion tests
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
-- label: ROCm GPT-OSS Eval
+- label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
- agent_pool: mi325_1
- mirror_hardwares: [amdexperimental, amdproduction]
+ gpu: b200
optional: true # run on nightlies
source_file_dependencies:
- tests/evals/gpt_oss
@@ -1173,7 +1169,7 @@ steps:
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+ - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: Blackwell Quantized MoE Test
timeout_in_minutes: 60
@@ -1378,7 +1374,7 @@ steps:
- pytest -v -s -x lora/test_llm_with_multi_loras.py
- pytest -v -s -x lora/test_olmoe_tp.py
- # Disabled for now because MXFP4 backend on non-cuda platform
+ # Disabled for now because MXFP4 backend on non-cuda platform
# doesn't support LoRA yet
#- pytest -v -s -x lora/test_gptoss_tp.py
@@ -1444,12 +1440,13 @@ steps:
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py
+
- label: LM Eval Large Models # optional
- mirror_hardwares: [amdexperimental, amdproduction]
- agent_pool: mi325_4
- # grade: Blocking
gpu: a100
optional: true
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
@@ -1461,11 +1458,11 @@ steps:
##### H100 test #####
- label: LM Eval Large Models (H100) # optional
- mirror_hardwares: [amdexperimental, amdproduction]
- agent_pool: mi325_4
- # grade: Blocking
gpu: h100
optional: true
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
@@ -1475,6 +1472,7 @@ steps:
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
##### H200 test #####
- label: Distributed Tests (H200) # optional
mirror_hardwares: [amdexperimental]
@@ -1506,6 +1504,57 @@ steps:
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
- pytest -v -s tests/v1/distributed/test_dbo.py
+##### E2E Eval Tests #####
+- label: LM Eval Small Models (1 Card) # 15min
+ timeout_in_minutes: 20
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+
+- label: LM Eval Large Models (4 Card)
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_4
+ # grade: Blocking
+ gpu: a100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+
+- label: ROCm LM Eval Large Models (8 Card)
+ mirror_hardwares: [amdproduction]
+ agent_pool: mi325_8
+ num_gpus: 8
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8
+
+- label: ROCm GPT-OSS Eval
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ agent_pool: mi325_1
+ mirror_hardwares: [amdexperimental, amdproduction]
+ optional: true # run on nightlies
+ source_file_dependencies:
+ - tests/evals/gpt_oss
+ - vllm/model_executor/models/gpt_oss.py
+ - vllm/model_executor/layers/quantization/mxfp4.py
+ - vllm/v1/attention/backends/flashinfer.py
+ commands:
+ - uv pip install --system 'gpt-oss[eval]==0.0.5'
+ - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min
mirror_hardwares: [amdexperimental]
@@ -1520,7 +1569,6 @@ steps:
- .buildkite/scripts/run-prime-rl-test.sh
commands:
- bash .buildkite/scripts/run-prime-rl-test.sh
-
- label: DeepSeek V2-Lite Accuracy
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
@@ -1552,4 +1600,27 @@ steps:
num_gpus: 2
working_dir: "/vllm-workspace"
commands:
- - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
\ No newline at end of file
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+
+- label: DeepSeek V2-Lite Async EPLB Accuracy
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
+
+- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index f79e9266559f6..2dcca5711b3d5 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -57,8 +57,8 @@ steps:
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_
-- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
- timeout_in_minutes: 20
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
+ timeout_in_minutes: 30
source_file_dependencies:
- vllm/
- tests/test_inputs.py
@@ -66,6 +66,7 @@ steps:
- tests/multimodal
- tests/standalone_tests/lazy_imports.py
- tests/tokenizers_
+ - tests/tool_parsers
- tests/transformers_utils
- tests/config
no_gpu: true
@@ -75,6 +76,7 @@ steps:
- pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s tokenizers_
+ - pytest -v -s tool_parsers
- pytest -v -s transformers_utils
- pytest -v -s config
@@ -350,7 +352,8 @@ steps:
timeout_in_minutes: 25
gpu: h100
source_file_dependencies:
- - vllm/
+ - vllm/v1/attention
+ - vllm/model_executor/layers
- tests/v1/determinism/
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -387,6 +390,7 @@ steps:
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
- vllm/entrypoints
+ - vllm/multimodal
- examples/
commands:
- pip install tensorizer # for tensorizer test
@@ -466,7 +470,9 @@ steps:
# tests covered elsewhere.
# Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
+ # However, find does not normally propagate error codes, so we combine it with xargs
+ # (using -0 for proper path handling)
+ - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
- label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30
@@ -480,7 +486,9 @@ steps:
# as it is a heavy test that is covered in other steps.
# Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
+ # However, find does not normally propagate error codes, so we combine it with xargs
+ # (using -0 for proper path handling)
+ - "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
- label: PyTorch Fullgraph Test # 27min
timeout_in_minutes: 40
@@ -666,16 +674,7 @@ steps:
- vllm/
- tests/tool_use
commands:
- - pytest -v -s -m 'not cpu_test' tool_use
-
-- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
- timeout_in_minutes: 10
- source_file_dependencies:
- - vllm/
- - tests/tool_use
- no_gpu: true
- commands:
- - pytest -v -s -m 'cpu_test' tool_use
+ - pytest -v -s tool_use
##### models test #####
@@ -686,6 +685,7 @@ steps:
source_file_dependencies:
- vllm/
- tests/models/test_initialization.py
+ - tests/models/registry.py
commands:
# Run a subset of model initialization tests
- pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
@@ -698,6 +698,7 @@ steps:
- vllm/model_executor/models/
- vllm/transformers_utils/
- tests/models/test_initialization.py
+ - tests/models/registry.py
commands:
# Only when vLLM model source is modified - test initialization of a large
# subset of supported models (the complement of the small subset in the above
@@ -830,7 +831,7 @@ steps:
- tests/models/multimodal
no_gpu: true
commands:
- - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - "pip install git+https://github.com/TIGER-AI-Lab/Mantis.git || echo 'Mantis installation skipped (decord not available on CPU-only environment)'"
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
- label: Multi-Modal Processor Test
@@ -1340,6 +1341,7 @@ steps:
- label: Prime-RL Integration Test # 15min
timeout_in_minutes: 30
optional: true
+ soft_fail: true
num_gpus: 2
working_dir: "/vllm-workspace"
source_file_dependencies:
@@ -1374,21 +1376,3 @@ steps:
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
-
-- label: DeepSeek V2-Lite Async EPLB Accuracy
- timeout_in_minutes: 60
- gpu: h100
- optional: true
- num_gpus: 4
- working_dir: "/vllm-workspace"
- commands:
- - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
-
-- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
- timeout_in_minutes: 60
- gpu: h100
- optional: true
- num_gpus: 4
- working_dir: "/vllm-workspace"
- commands:
- - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
diff --git a/.buildkite/test_areas/attention.yaml b/.buildkite/test_areas/attention.yaml
new file mode 100644
index 0000000000000..6e444eae14c74
--- /dev/null
+++ b/.buildkite/test_areas/attention.yaml
@@ -0,0 +1,21 @@
+group: Attention
+depends_on:
+ - image-build
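+# "image-build" is the key of the image build step in .buildkite/image_build/image_build.yaml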
+steps:
+- label: V1 attention (H100)
+ timeout_in_minutes: 30
+ gpu: h100
+ source_file_dependencies:
+ - vllm/v1/attention
+ - tests/v1/attention
+ commands:
+ - pytest -v -s v1/attention
+
+- label: V1 attention (B200)
+ timeout_in_minutes: 30
+ gpu: b200
+ source_file_dependencies:
+ - vllm/v1/attention
+ - tests/v1/attention
+ commands:
+ - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
diff --git a/.buildkite/test_areas/basic_correctness.yaml b/.buildkite/test_areas/basic_correctness.yaml
new file mode 100644
index 0000000000000..759d2b5358714
--- /dev/null
+++ b/.buildkite/test_areas/basic_correctness.yaml
@@ -0,0 +1,16 @@
+group: Basic Correctness
+depends_on:
+ - image-build
+steps:
+- label: Basic Correctness
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - vllm/
+ - tests/basic_correctness/test_basic_correctness
+ - tests/basic_correctness/test_cpu_offload
+ - tests/basic_correctness/test_cumem.py
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s basic_correctness/test_cumem.py
+ - pytest -v -s basic_correctness/test_basic_correctness.py
+ - pytest -v -s basic_correctness/test_cpu_offload.py
diff --git a/.buildkite/test_areas/benchmarks.yaml b/.buildkite/test_areas/benchmarks.yaml
new file mode 100644
index 0000000000000..574b642d407b0
--- /dev/null
+++ b/.buildkite/test_areas/benchmarks.yaml
@@ -0,0 +1,19 @@
+group: Benchmarks
+depends_on:
+ - image-build
+steps:
+- label: Benchmarks
+ timeout_in_minutes: 20
+ working_dir: "/vllm-workspace/.buildkite"
+ source_file_dependencies:
+ - benchmarks/
+ commands:
+ - bash scripts/run-benchmarks.sh
+
+- label: Benchmarks CLI Test
+ timeout_in_minutes: 20
+ source_file_dependencies:
+ - vllm/
+ - tests/benchmarks/
+ commands:
+ - pytest -v -s benchmarks/
diff --git a/.buildkite/test_areas/compile.yaml b/.buildkite/test_areas/compile.yaml
new file mode 100644
index 0000000000000..0ba00925a4838
--- /dev/null
+++ b/.buildkite/test_areas/compile.yaml
@@ -0,0 +1,57 @@
+group: Compile
+depends_on:
+ - image-build
+steps:
+- label: Fusion and Compile Tests (B200)
+ timeout_in_minutes: 40
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ source_file_dependencies:
+ - csrc/quantization/fp4/
+ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+ - vllm/v1/attention/backends/flashinfer.py
+ - vllm/v1/worker/
+ - vllm/v1/cudagraph_dispatcher.py
+ - vllm/compilation/
+ # can affect pattern matching
+ - vllm/model_executor/layers/layernorm.py
+ - vllm/model_executor/layers/activation.py
+ - vllm/model_executor/layers/quantization/input_quant_fp8.py
+ - tests/compile/test_fusion_attn.py
+ - tests/compile/test_silu_mul_quant_fusion.py
+ - tests/compile/distributed/test_fusion_all_reduce.py
+ - tests/compile/distributed/test_fusions_e2e.py
+ - tests/compile/fullgraph/test_full_graph.py
+ commands:
+ - nvidia-smi
+ - pytest -v -s tests/compile/test_fusion_attn.py
+ - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+ # this runner has 2 GPUs available even though num_gpus=2 is not set
+ - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+ # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+ # Wrap with quotes to escape yaml
+ - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+ # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+ - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
+
+- label: Fusion E2E (2 GPUs)(B200)
+ timeout_in_minutes: 40
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ optional: true
+ num_gpus: 2
+ source_file_dependencies:
+ - csrc/quantization/fp4/
+ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+ - vllm/v1/attention/backends/flashinfer.py
+ - vllm/compilation/
+ # can affect pattern matching
+ - vllm/model_executor/layers/layernorm.py
+ - vllm/model_executor/layers/activation.py
+ - vllm/model_executor/layers/quantization/input_quant_fp8.py
+ - tests/compile/distributed/test_fusions_e2e.py
+ commands:
+ - nvidia-smi
+ # Run all e2e fusion tests
+ - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
+
diff --git a/.buildkite/test_areas/cuda.yaml b/.buildkite/test_areas/cuda.yaml
new file mode 100644
index 0000000000000..50c0c338c2434
--- /dev/null
+++ b/.buildkite/test_areas/cuda.yaml
@@ -0,0 +1,22 @@
+group: CUDA
+depends_on:
+ - image-build
+steps:
+- label: Platform Tests (CUDA)
+ timeout_in_minutes: 15
+ source_file_dependencies:
+ - vllm/
+ - tests/cuda
+ commands:
+ - pytest -v -s cuda/test_cuda_context.py
+
+- label: Cudagraph
+ timeout_in_minutes: 20
+ source_file_dependencies:
+ - tests/v1/cudagraph
+ - vllm/v1/cudagraph_dispatcher.py
+ - vllm/config/compilation.py
+ - vllm/compilation
+ commands:
+ - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
+ - pytest -v -s v1/cudagraph/test_cudagraph_mode.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml
new file mode 100644
index 0000000000000..2cc90698d916a
--- /dev/null
+++ b/.buildkite/test_areas/distributed.yaml
@@ -0,0 +1,199 @@
+group: Distributed
+depends_on:
+ - image-build
+steps:
+- label: Distributed Comm Ops
+ timeout_in_minutes: 20
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/distributed
+ - tests/distributed
+ commands:
+ - pytest -v -s distributed/test_comm_ops.py
+ - pytest -v -s distributed/test_shm_broadcast.py
+ - pytest -v -s distributed/test_shm_buffer.py
+ - pytest -v -s distributed/test_shm_storage.py
+
+- label: Distributed (2 GPUs)
+ timeout_in_minutes: 90
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/compilation/
+ - vllm/distributed/
+ - vllm/engine/
+ - vllm/executor/
+ - vllm/worker/worker_base.py
+ - vllm/v1/engine/
+ - vllm/v1/worker/
+ - tests/compile/fullgraph/test_basic_correctness.py
+ - tests/compile/test_wrapper.py
+ - tests/distributed/
+ - tests/entrypoints/llm/test_collective_rpc.py
+ - tests/v1/distributed
+ - tests/v1/entrypoints/openai/test_multi_api_servers.py
+ - tests/v1/shutdown
+ - tests/v1/worker/test_worker_memory_snapshot.py
+ commands:
+ # https://github.com/NVIDIA/nccl/issues/1838
+ - export NCCL_CUMEM_HOST_ENABLE=0
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+ - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
+ - pytest -v -s entrypoints/llm/test_collective_rpc.py
+ - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
+ - pytest -v -s ./compile/test_wrapper.py
+ - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+ - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+ - pytest -v -s distributed/test_sequence_parallel.py
+ - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+ - pytest -v -s v1/worker/test_worker_memory_snapshot.py
+
+- label: Distributed Tests (4 GPUs)
+ timeout_in_minutes: 50
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/
+ - tests/distributed/test_utils
+ - tests/distributed/test_pynccl
+ - tests/distributed/test_events
+ - tests/compile/fullgraph/test_basic_correctness.py
+ - examples/offline_inference/rlhf.py
+ - examples/offline_inference/rlhf_colocate.py
+ - tests/examples/offline_inference/data_parallel.py
+ - tests/v1/distributed
+ - tests/v1/engine/test_engine_core_client.py
+ - tests/distributed/test_symm_mem_allreduce.py
+ commands:
+ # https://github.com/NVIDIA/nccl/issues/1838
+ - export NCCL_CUMEM_HOST_ENABLE=0
+ # test with torchrun tp=2 and external_dp=2
+ - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+ # test with torchrun tp=2 and pp=2
+ - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+ # test with torchrun tp=4 and dp=1
+ - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=2, pp=2 and dp=1
+ - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=1 and dp=4 with ep
+ - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=2 and dp=2 with ep
+ - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with internal dp
+ - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+ - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
+ - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
+ - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
+ - pytest -v -s distributed/test_utils.py
+ - pytest -v -s compile/fullgraph/test_basic_correctness.py
+ - pytest -v -s distributed/test_pynccl.py
+ - pytest -v -s distributed/test_events.py
+ - pytest -v -s distributed/test_symm_mem_allreduce.py
+ # TODO: create a dedicated test section for multi-GPU example tests
+ # when we have multiple distributed example tests
+ - cd ../examples/offline_inference
+ - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+ - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+
+- label: Distributed Tests (8 GPUs)(H100)
+ timeout_in_minutes: 10
+ gpu: h100
+ num_gpus: 8
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - examples/offline_inference/torchrun_dp_example.py
+ - vllm/config/parallel.py
+ - vllm/distributed/
+ - vllm/v1/engine/llm_engine.py
+ - vllm/v1/executor/uniproc_executor.py
+ - vllm/v1/worker/gpu_worker.py
+ commands:
+ # https://github.com/NVIDIA/nccl/issues/1838
+ - export NCCL_CUMEM_HOST_ENABLE=0
+ # test with torchrun tp=2 and dp=4 with ep
+ - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+
+- label: Distributed Tests (4 GPUs)(A100)
+ gpu: a100
+ optional: true
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/
+ commands:
+ # NOTE: don't test the llama model here; the HF implementation seems to be buggy
+ # see https://github.com/vllm-project/vllm/pull/5689 for details
+ - pytest -v -s distributed/test_custom_all_reduce.py
+ - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
+ - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+ - pytest -v -s -x lora/test_mixtral.py
+
+- label: Distributed Tests (2 GPUs)(H200)
+ gpu: h200
+ optional: true
+ working_dir: "/vllm-workspace/"
+ num_gpus: 2
+ commands:
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
+ - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+ - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
+ - pytest -v -s tests/distributed/test_context_parallel.py
+ - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+ - pytest -v -s tests/v1/distributed/test_dbo.py
+
+- label: Distributed Tests (2 GPUs)(B200)
+ gpu: b200
+ optional: true
+ working_dir: "/vllm-workspace/"
+ num_gpus: 2
+ commands:
+ - pytest -v -s tests/distributed/test_context_parallel.py
+ - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+ - pytest -v -s tests/v1/distributed/test_dbo.py
+
+- label: 2 Node Test (4 GPUs)
+ timeout_in_minutes: 30
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ num_nodes: 2
+ source_file_dependencies:
+ - vllm/distributed/
+ - vllm/engine/
+ - vllm/executor/
+ - vllm/model_executor/models/
+ - tests/distributed/
+ - tests/examples/offline_inference/data_parallel.py
+ commands:
+ - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
+
+- label: Distributed NixlConnector PD accuracy (4 GPUs)
+ timeout_in_minutes: 30
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+ - tests/v1/kv_connector/nixl_integration/
+ commands:
+ - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+ - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
+
+- label: Pipeline + Context Parallelism (4 GPUs)
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/
+ - vllm/engine/
+ - vllm/executor/
+ - vllm/model_executor/models/
+ - tests/distributed/
+ commands:
+ - pytest -v -s distributed/test_pp_cudagraph.py
+ - pytest -v -s distributed/test_pipeline_parallel.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml
new file mode 100644
index 0000000000000..93d389815edac
--- /dev/null
+++ b/.buildkite/test_areas/e2e_integration.yaml
@@ -0,0 +1,59 @@
+group: E2E Integration
+depends_on:
+ - image-build
+steps:
+- label: DeepSeek V2-Lite Accuracy
+ timeout_in_minutes: 60
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy
+ timeout_in_minutes: 60
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+ timeout_in_minutes: 60
+ gpu: b200
+ optional: true
+ num_gpus: 2
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+
+- label: Prime-RL Integration (2 GPUs)
+ timeout_in_minutes: 30
+ optional: true
+ num_gpus: 2
+ working_dir: "/vllm-workspace"
+ source_file_dependencies:
+ - vllm/
+ - .buildkite/scripts/run-prime-rl-test.sh
+ commands:
+ - bash .buildkite/scripts/run-prime-rl-test.sh
+
+- label: DeepSeek V2-Lite Async EPLB Accuracy
+ timeout_in_minutes: 60
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
+
+- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
+ timeout_in_minutes: 60
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml
new file mode 100644
index 0000000000000..a028e0e4af4c1
--- /dev/null
+++ b/.buildkite/test_areas/engine.yaml
@@ -0,0 +1,26 @@
+group: Engine
+depends_on:
+ - image-build
+steps:
+- label: Engine
+ timeout_in_minutes: 15
+ source_file_dependencies:
+ - vllm/
+ - tests/engine
+ - tests/test_sequence
+ - tests/test_config
+ - tests/test_logger
+ - tests/test_vllm_port
+ commands:
+ - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
+
+- label: V1 e2e + engine
+ timeout_in_minutes: 45
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+ # TODO: accuracy does not match on H100, regardless of whether
+ # VLLM_USE_FLASHINFER_SAMPLER is set.
+ - pytest -v -s v1/e2e
+ - pytest -v -s v1/engine
diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml
new file mode 100644
index 0000000000000..0a789be943f37
--- /dev/null
+++ b/.buildkite/test_areas/entrypoints.yaml
@@ -0,0 +1,68 @@
+group: Entrypoints
+depends_on:
+ - image-build
+steps:
+- label: Entrypoints Unit Tests
+ timeout_in_minutes: 10
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/entrypoints
+ - tests/entrypoints/
+ commands:
+ - pytest -v -s entrypoints/openai/tool_parsers
+ - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
+
+- label: Entrypoints Integration (LLM)
+ timeout_in_minutes: 40
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/llm
+ - tests/entrypoints/offline_mode
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+ - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
+ - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+
+- label: Entrypoints Integration (API Server)
+ timeout_in_minutes: 130
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/openai
+ - tests/entrypoints/test_chat_utils
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
+ - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
+ - pytest -v -s entrypoints/test_chat_utils.py
+
+
+- label: Entrypoints Integration (Pooling)
+ timeout_in_minutes: 50
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/pooling
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s entrypoints/pooling
+
+
+- label: Entrypoints V1
+ timeout_in_minutes: 50
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+ - pytest -v -s v1/entrypoints
+
+- label: OpenAI API Correctness
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - csrc/
+ - vllm/entrypoints/openai/
+ - vllm/model_executor/models/whisper.py
+ commands: # LMEval+Transcription WER check
+ - pytest -s entrypoints/openai/correctness/
diff --git a/.buildkite/test_areas/expert_parallelism.yaml b/.buildkite/test_areas/expert_parallelism.yaml
new file mode 100644
index 0000000000000..feb8252148c7f
--- /dev/null
+++ b/.buildkite/test_areas/expert_parallelism.yaml
@@ -0,0 +1,23 @@
+group: Expert Parallelism
+depends_on:
+ - image-build
+steps:
+- label: EPLB Algorithm
+ timeout_in_minutes: 15
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/distributed/eplb
+ - tests/distributed/test_eplb_algo.py
+ commands:
+ - pytest -v -s distributed/test_eplb_algo.py
+
+- label: EPLB Execution
+ timeout_in_minutes: 20
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/eplb
+ - tests/distributed/test_eplb_execute.py
+ commands:
+ - pytest -v -s distributed/test_eplb_execute.py
+ - pytest -v -s distributed/test_eplb_spec_decode.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml
new file mode 100644
index 0000000000000..7ca099516d641
--- /dev/null
+++ b/.buildkite/test_areas/kernels.yaml
@@ -0,0 +1,117 @@
+group: Kernels
+depends_on:
+ - image-build
+steps:
+- label: Kernels Core Operation Test
+ timeout_in_minutes: 75
+ source_file_dependencies:
+ - csrc/
+ - tests/kernels/core
+ - tests/kernels/test_top_k_per_row.py
+ commands:
+ - pytest -v -s kernels/core kernels/test_top_k_per_row.py
+
+- label: Kernels Attention Test %N
+ timeout_in_minutes: 35
+ source_file_dependencies:
+ - csrc/attention/
+ - vllm/attention
+ - vllm/v1/attention
+ - tests/kernels/attention
+ commands:
+ - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+ parallelism: 2
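+ # parallelism: 2 runs this step as two parallel jobs; Buildkite exposes each job's
+ # shard index/count as BUILDKITE_PARALLEL_JOB / BUILDKITE_PARALLEL_JOB_COUNT, which
+ # the pytest command above uses to split the tests ($$ defers interpolation to runtime).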
+
+- label: Kernels Quantization Test %N
+ timeout_in_minutes: 90
+ source_file_dependencies:
+ - csrc/quantization/
+ - vllm/model_executor/layers/quantization
+ - tests/kernels/quantization
+ commands:
+ - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+ parallelism: 2
+
+- label: Kernels MoE Test %N
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - csrc/quantization/cutlass_w8a8/moe/
+ - csrc/moe/
+ - tests/kernels/moe
+ - vllm/model_executor/layers/fused_moe/
+ - vllm/distributed/device_communicators/
+ - vllm/envs.py
+ - vllm/config
+ commands:
+ - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+ parallelism: 2
+
+- label: Kernels Mamba Test
+ timeout_in_minutes: 45
+ source_file_dependencies:
+ - csrc/mamba/
+ - tests/kernels/mamba
+ - vllm/model_executor/layers/mamba/ops
+ commands:
+ - pytest -v -s kernels/mamba
+
+- label: Kernels DeepGEMM Test (H100)
+ timeout_in_minutes: 45
+ gpu: h100
+ num_gpus: 1
+ source_file_dependencies:
+ - tools/install_deepgemm.sh
+ - vllm/utils/deep_gemm.py
+ - vllm/model_executor/layers/fused_moe
+ - vllm/model_executor/layers/quantization
+ - tests/kernels/quantization/test_block_fp8.py
+ - tests/kernels/moe/test_deepgemm.py
+ - tests/kernels/moe/test_batched_deepgemm.py
+ - tests/kernels/attention/test_deepgemm_attention.py
+ commands:
+ - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
+ - pytest -v -s kernels/moe/test_deepgemm.py
+ - pytest -v -s kernels/moe/test_batched_deepgemm.py
+ - pytest -v -s kernels/attention/test_deepgemm_attention.py
+
+- label: Kernels (B200)
+ timeout_in_minutes: 30
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ # optional: true
+ source_file_dependencies:
+ - csrc/quantization/fp4/
+ - csrc/attention/mla/
+ - csrc/quantization/cutlass_w8a8/moe/
+ - vllm/model_executor/layers/fused_moe/cutlass_moe.py
+ - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+ - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+ - vllm/v1/attention/backends/flashinfer.py
+ - vllm/v1/attention/backends/mla/cutlass_mla.py
+ - vllm/v1/attention/backends/mla/flashinfer_mla.py
+ - vllm/platforms/cuda.py
+ - vllm/attention/selector.py
+ commands:
+ - nvidia-smi
+ - python3 examples/offline_inference/basic/chat.py
+ # Attention
+ # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+ - pytest -v -s tests/kernels/attention/test_attention_selector.py
+ - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
+ - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
+ - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
+ - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
+ # Quantization
+ - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
+ - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
+ - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
+ - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
+ - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
+ - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+ - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
+ - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
+ - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
+ - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
+ - pytest -v -s tests/kernels/moe/test_flashinfer.py
+ - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml
new file mode 100644
index 0000000000000..9af43e0c375a8
--- /dev/null
+++ b/.buildkite/test_areas/lm_eval.yaml
@@ -0,0 +1,46 @@
+group: LM Eval
+depends_on:
+ - image-build
+steps:
+- label: LM Eval Small Models
+ timeout_in_minutes: 75
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ autorun_on_main: true
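+ # autorun_on_main presumably marks this step to run automatically on main (post-merge) builds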
+ commands:
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+
+- label: LM Eval Large Models (4 GPUs)(A100)
+ gpu: a100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+
+- label: LM Eval Large Models (4 GPUs)(H100)
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
+- label: LM Eval Small Models (B200)
+ timeout_in_minutes: 120
+ gpu: b200
+ optional: true
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml
new file mode 100644
index 0000000000000..809b4138f44ba
--- /dev/null
+++ b/.buildkite/test_areas/lora.yaml
@@ -0,0 +1,31 @@
+group: LoRA
+depends_on:
+ - image-build
+steps:
+- label: LoRA %N
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - vllm/lora
+ - tests/lora
+ commands:
+ - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
+ parallelism: 4
+
+
+- label: LoRA TP (Distributed)
+ timeout_in_minutes: 30
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/lora
+ - tests/lora
+ commands:
+ # FIXIT: find out which code initializes CUDA before running the test;
+ # until that is fixed, we need to use the spawn method for testing
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ # There is some Tensor Parallelism related processing logic in LoRA that
+ # requires multi-GPU testing for validation.
+ - pytest -v -s -x lora/test_chatglm3_tp.py
+ - pytest -v -s -x lora/test_llama_tp.py
+ - pytest -v -s -x lora/test_llm_with_multi_loras.py
+ - pytest -v -s -x lora/test_olmoe_tp.py
+ - pytest -v -s -x lora/test_gptoss_tp.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml
new file mode 100644
index 0000000000000..252af1e56a105
--- /dev/null
+++ b/.buildkite/test_areas/misc.yaml
@@ -0,0 +1,165 @@
+group: Miscellaneous
+depends_on:
+ - image-build
+steps:
+- label: V1 Others
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+ - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+ # split the test to avoid interference
+ - pytest -v -s -m 'not cpu_test' v1/core
+ - pytest -v -s v1/executor
+ - pytest -v -s v1/kv_offload
+ - pytest -v -s v1/sample
+ - pytest -v -s v1/logits_processors
+ - pytest -v -s v1/worker
+ - pytest -v -s v1/spec_decode
+ - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
+ - pytest -v -s -m 'not cpu_test' v1/metrics
+ - pytest -v -s v1/test_oracle.py
+ - pytest -v -s v1/test_request.py
+ - pytest -v -s v1/test_outputs.py
+ # Integration test for streaming correctness (requires special branch).
+ - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
+ - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
+
+- label: V1 Others (CPU)
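+ # depends_on: ~ appears to clear the group-level image-build dependency, so this
+ # CPU-only step can start without waiting for the CUDA image.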
+ depends_on: ~
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ no_gpu: true
+ commands:
+ # split the test to avoid interference
+ - pytest -v -s -m 'cpu_test' v1/core
+ - pytest -v -s v1/structured_output
+ - pytest -v -s v1/test_serial_utils.py
+ - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
+ - pytest -v -s -m 'cpu_test' v1/metrics
+
+- label: Regression
+ timeout_in_minutes: 20
+ source_file_dependencies:
+ - vllm/
+ - tests/test_regression
+ commands:
+ - pip install modelscope
+ - pytest -v -s test_regression.py
+ working_dir: "/vllm-workspace/tests" # optional
+
+- label: Examples
+ timeout_in_minutes: 45
+ working_dir: "/vllm-workspace/examples"
+ source_file_dependencies:
+ - vllm/entrypoints
+ - vllm/multimodal
+ - examples/
+ commands:
+ - pip install tensorizer # for tensorizer test
+ - python3 offline_inference/basic/chat.py # for basic
+ - python3 offline_inference/basic/generate.py --model facebook/opt-125m
+ - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+ - python3 offline_inference/basic/classify.py
+ - python3 offline_inference/basic/embed.py
+ - python3 offline_inference/basic/score.py
+ # for multi-modal models
+ - python3 offline_inference/audio_language.py --seed 0
+ - python3 offline_inference/vision_language.py --seed 0
+ - python3 offline_inference/vision_language_multi_image.py --seed 0
+ - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+ # for pooling models
+ - python3 pooling/pooling/vision_language_pooling.py --seed 0
+ # for features demo
+ - python3 offline_inference/prefix_caching.py
+ - python3 offline_inference/llm_engine_example.py
+ - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+ - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+ # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+ - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
+
+- label: Metrics, Tracing (2 GPUs)
+ timeout_in_minutes: 20
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/
+ - tests/v1/tracing
+ commands:
+ - "pip install \
+ 'opentelemetry-sdk>=1.26.0' \
+ 'opentelemetry-api>=1.26.0' \
+ 'opentelemetry-exporter-otlp>=1.26.0' \
+ 'opentelemetry-semantic-conventions-ai>=0.4.1'"
+ - pytest -v -s v1/tracing
+
+- label: Python-only Installation
+ depends_on: ~
+ timeout_in_minutes: 20
+ source_file_dependencies:
+ - tests/standalone_tests/python_only_compile.sh
+ - setup.py
+ commands:
+ - bash standalone_tests/python_only_compile.sh
+
+- label: Async Engine, Inputs, Utils, Worker
+ timeout_in_minutes: 50
+ source_file_dependencies:
+ - vllm/
+ - tests/multimodal
+ - tests/utils_
+ commands:
+ - pytest -v -s -m 'not cpu_test' multimodal
+ - pytest -v -s utils_
+
+- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
+ depends_on: ~
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - vllm/
+ - tests/test_inputs.py
+ - tests/test_outputs.py
+ - tests/multimodal
+ - tests/standalone_tests/lazy_imports.py
+ - tests/tokenizers_
+ - tests/tool_parsers
+ - tests/transformers_utils
+ - tests/config
+ no_gpu: true
+ commands:
+ - python3 standalone_tests/lazy_imports.py
+ - pytest -v -s test_inputs.py
+ - pytest -v -s test_outputs.py
+ - pytest -v -s -m 'cpu_test' multimodal
+ - pytest -v -s tokenizers_
+ - pytest -v -s tool_parsers
+ - pytest -v -s transformers_utils
+ - pytest -v -s config
+
+- label: GPT-OSS Eval (B200)
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ optional: true
+ source_file_dependencies:
+ - tests/evals/gpt_oss
+ - vllm/model_executor/models/gpt_oss.py
+ - vllm/model_executor/layers/quantization/mxfp4.py
+ - vllm/v1/attention/backends/flashinfer.py
+ commands:
+ - uv pip install --system 'gpt-oss[eval]==0.0.5'
+ - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
+- label: Batch Invariance (H100)
+ timeout_in_minutes: 25
+ gpu: h100
+ source_file_dependencies:
+ - vllm/v1/attention
+ - vllm/model_executor/layers
+ - tests/v1/determinism/
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pip install pytest-timeout pytest-forked
+ - pytest -v -s v1/determinism/test_batch_invariance.py
+ - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/model_executor.yaml b/.buildkite/test_areas/model_executor.yaml
new file mode 100644
index 0000000000000..996c8bb8b780a
--- /dev/null
+++ b/.buildkite/test_areas/model_executor.yaml
@@ -0,0 +1,17 @@
+group: Model Executor
+depends_on:
+ - image-build
+steps:
+- label: Model Executor
+ timeout_in_minutes: 35
+ source_file_dependencies:
+ - vllm/engine/arg_utils.py
+ - vllm/config/model.py
+ - vllm/model_executor
+ - tests/model_executor
+ - tests/entrypoints/openai/test_tensorizer_entrypoint.py
+ commands:
+ - apt-get update && apt-get install -y curl libsodium23
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s model_executor
+ - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml
new file mode 100644
index 0000000000000..39a5d51c48833
--- /dev/null
+++ b/.buildkite/test_areas/models_basic.yaml
@@ -0,0 +1,62 @@
+group: Models - Basic
+depends_on:
+ - image-build
+steps:
+- label: Basic Models Tests (Initialization)
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_initialization.py
+ commands:
+ # Run a subset of model initialization tests
+ - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
+
+- label: Basic Models Tests (Extra Initialization) %N
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/model_executor/models/
+ - tests/models/test_initialization.py
+ commands:
+ # Only run when vLLM model source is modified - test initialization of a large
+ # subset of supported models (the complement of the small subset in the test
+ # above). Also run if the model initialization test file is modified.
+ - pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Basic Models Tests (Other)
+ timeout_in_minutes: 45
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_transformers.py
+ - tests/models/test_registry.py
+ commands:
+ - pytest -v -s models/test_transformers.py models/test_registry.py
+
+- label: Basic Models Test (Other CPU) # 5min
+ timeout_in_minutes: 10
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_utils.py
+ - tests/models/test_vision.py
+ no_gpu: true
+ commands:
+ - pytest -v -s models/test_utils.py models/test_vision.py
+
+- label: Transformers Nightly Models
+ working_dir: "/vllm-workspace/"
+ optional: true
+ soft_fail: true
+ commands:
+ - pip install --upgrade git+https://github.com/huggingface/transformers
+ - pytest -v -s tests/models/test_initialization.py
+ - pytest -v -s tests/models/test_transformers.py
+ - pytest -v -s tests/models/multimodal/processing/
+ - pytest -v -s tests/models/multimodal/test_mapping.py
+ - python3 examples/offline_inference/basic/chat.py
+ - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+ # Whisper needs spawn method to avoid deadlock
+ - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
diff --git a/.buildkite/test_areas/models_distributed.yaml b/.buildkite/test_areas/models_distributed.yaml
new file mode 100644
index 0000000000000..b6bfbf2ddab47
--- /dev/null
+++ b/.buildkite/test_areas/models_distributed.yaml
@@ -0,0 +1,22 @@
+group: Models - Distributed
+depends_on:
+ - image-build
+steps:
+- label: Distributed Model Tests (2 GPUs)
+ timeout_in_minutes: 50
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/model_executor/model_loader/sharded_state_loader.py
+ - vllm/model_executor/models/
+ - tests/basic_correctness/
+ - tests/model_executor/model_loader/test_sharded_state_loader.py
+ - tests/models/
+ commands:
+ - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+ - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
+ # Avoid importing model tests that cause CUDA reinitialization error
+ - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
+ - pytest models/language -v -s -m 'distributed(num_gpus=2)'
+ - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
+ - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml
new file mode 100644
index 0000000000000..f70192c4ebc0a
--- /dev/null
+++ b/.buildkite/test_areas/models_language.yaml
@@ -0,0 +1,91 @@
+group: Models - Language
+depends_on:
+ - image-build
+steps:
+- label: Language Models Tests (Standard)
+ timeout_in_minutes: 25
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language
+ commands:
+ # Test standard language models, excluding a subset of slow tests
+ - pip freeze | grep -E 'torch'
+ - pytest -v -s models/language -m 'core_model and (not slow_test)'
+
+- label: Language Models Tests (Extra Standard) %N
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/model_executor/models/
+ - tests/models/language/pooling/test_embedding.py
+ - tests/models/language/generation/test_common.py
+ - tests/models/language/pooling/test_classification.py
+ commands:
+ # Shard the slow subset of standard language model tests. Only run when model
+ # source is modified, or when the specified test files are modified.
+ - pip freeze | grep -E 'torch'
+ - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Language Models Tests (Hybrid) %N
+ timeout_in_minutes: 75
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/generation
+ commands:
+ # Install fast-path packages for testing against transformers
+ # Note: also needed to run the plamo2 model in vLLM
+ - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+ - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
+ # Shard hybrid language model tests
+ - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Language Models Test (Extended Generation) # 80min
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/generation
+ commands:
+ # Install fast-path packages for testing against transformers
+ # Note: also needed to run the plamo2 model in vLLM
+ - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+ - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
+ - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
+
+- label: Language Models Test (PPL)
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/generation_ppl_test
+ commands:
+ - pytest -v -s models/language/generation_ppl_test
+
+- label: Language Models Test (Extended Pooling) # 36min
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/pooling
+ commands:
+ - pytest -v -s models/language/pooling -m 'not core_model'
+
+- label: Language Models Test (MTEB)
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/pooling_mteb_test
+ commands:
+ - pytest -v -s models/language/pooling_mteb_test
diff --git a/.buildkite/test_areas/models_multimodal.yaml b/.buildkite/test_areas/models_multimodal.yaml
new file mode 100644
index 0000000000000..fc24068c20a46
--- /dev/null
+++ b/.buildkite/test_areas/models_multimodal.yaml
@@ -0,0 +1,79 @@
+group: Models - Multimodal
+depends_on:
+ - image-build
+steps:
+- label: Multi-Modal Models (Standard) # 60min
+ timeout_in_minutes: 80
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pip freeze | grep -E 'torch'
+ - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
+ - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
+
+- label: Multi-Modal Processor Test (CPU)
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ no_gpu: true
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
+
+- label: Multi-Modal Processor # 44min
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/processing/test_tensor_schema.py
+
+- label: Multi-Modal Accuracy Eval (Small Models) # 50min
+ timeout_in_minutes: 70
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - vllm/multimodal/
+ - vllm/inputs/
+ - vllm/v1/core/
+ commands:
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+
+- label: Multi-Modal Models (Extended) 1
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
+
+- label: Multi-Modal Models (Extended) 2
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
+
+- label: Multi-Modal Models (Extended) 3
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
+
+# This test is used only during the PR development phase to test individual models and should never run on main
+- label: Custom Models
+ optional: true
+ commands:
+ - echo 'Testing custom models...'
+ # PR authors can temporarily add commands below to test individual models
+ # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
+ # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
diff --git a/.buildkite/test_areas/plugins.yaml b/.buildkite/test_areas/plugins.yaml
new file mode 100644
index 0000000000000..60c179aa098e1
--- /dev/null
+++ b/.buildkite/test_areas/plugins.yaml
@@ -0,0 +1,34 @@
+group: Plugins
+depends_on:
+ - image-build
+steps:
+- label: Plugin Tests (2 GPUs)
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/plugins/
+ - tests/plugins/
+ commands:
+ # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
+ - pip install -e ./plugins/vllm_add_dummy_platform
+ - pytest -v -s plugins_tests/test_platform_plugins.py
+ - pip uninstall vllm_add_dummy_platform -y
+ # end platform plugin tests
+ # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
+ - pip install -e ./plugins/prithvi_io_processor_plugin
+ - pytest -v -s plugins_tests/test_io_processor_plugins.py
+ - pip uninstall prithvi_io_processor_plugin -y
+ # end io_processor plugins test
+ # begin stat_logger plugins test
+ - pip install -e ./plugins/vllm_add_dummy_stat_logger
+ - pytest -v -s plugins_tests/test_stats_logger_plugins.py
+ - pip uninstall dummy_stat_logger -y
+ # end stat_logger plugins test
+ # other tests continue here:
+ - pytest -v -s plugins_tests/test_scheduler_plugins.py
+ - pip install -e ./plugins/vllm_add_dummy_model
+ - pytest -v -s distributed/test_distributed_oot.py
+ - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
+ - pytest -v -s models/test_oot_registration.py # it needs a clean process
+ - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
diff --git a/.buildkite/test_areas/pytorch.yaml b/.buildkite/test_areas/pytorch.yaml
new file mode 100644
index 0000000000000..703c82eb1a91b
--- /dev/null
+++ b/.buildkite/test_areas/pytorch.yaml
@@ -0,0 +1,50 @@
+group: PyTorch
+depends_on:
+ - image-build
+steps:
+- label: PyTorch Compilation Unit Tests
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - vllm/
+ - tests/compile
+ commands:
+ # Run unit tests defined directly under compile/,
+ # not including subdirectories, which are usually heavier
+ # tests covered elsewhere.
+ # Use `find` to launch multiple instances of pytest so that
+ # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+ - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;"
+
+- label: PyTorch Fullgraph Smoke Test
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - vllm/
+ - tests/compile
+ commands:
+ # Run smoke tests under fullgraph directory, except test_full_graph.py
+ # as it is a heavy test that is covered in other steps.
+ # Use `find` to launch multiple instances of pytest so that
+ # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+ - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
+
+- label: PyTorch Fullgraph
+ timeout_in_minutes: 40
+ source_file_dependencies:
+ - vllm/
+ - tests/compile
+ commands:
+ # fp8 kv scales not supported on sm89, tested on Blackwell instead
+ - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+ # Limit to no custom ops to reduce running time
+ # Wrap with quotes to escape yaml and avoid starting -k string with a -
+ - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+
+- label: Pytorch Nightly Dependency Override Check # 2min
+ # If this test fails, it means the nightly torch version is not compatible with some
+ # of the dependencies. Please check the error message and add the package to the whitelist
+ # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
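+ # soft_fail: a failure here is reported but does not fail the overall build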
+ soft_fail: true
+ source_file_dependencies:
+ - requirements/nightly_torch_test.txt
+ commands:
+ - bash standalone_tests/pytorch_nightly_dependency.sh
\ No newline at end of file
diff --git a/.buildkite/test_areas/quantization.yaml b/.buildkite/test_areas/quantization.yaml
new file mode 100644
index 0000000000000..6e89d6af3b8d1
--- /dev/null
+++ b/.buildkite/test_areas/quantization.yaml
@@ -0,0 +1,46 @@
+group: Quantization
+depends_on:
+ - image-build
+steps:
+- label: Quantization
+ timeout_in_minutes: 90
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ - tests/quantization
+ commands:
+ # Temporary install here since we need nightly; this will move to requirements/test.in
+ # after the torchao 0.12 release, with a working torchao nightly version pinned here.
+
+ # Since torchao nightly is currently only compatible with torch nightly
+ # (https://github.com/pytorch/ao/issues/2919), we have to skip new torchao tests for now.
+ # We can only upgrade after this is resolved.
+ # TODO(jerryzh168): resolve the above comment
+ - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+ - uv pip install --system conch-triton-kernels
+ - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
+
+- label: Quantized MoE Test (B200)
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ source_file_dependencies:
+ - tests/quantization/test_blackwell_moe.py
+ - vllm/model_executor/models/deepseek_v2.py
+ - vllm/model_executor/models/gpt_oss.py
+ - vllm/model_executor/models/llama4.py
+ - vllm/model_executor/layers/fused_moe
+ - vllm/model_executor/layers/quantization/compressed_tensors
+ - vllm/model_executor/layers/quantization/modelopt.py
+ - vllm/model_executor/layers/quantization/mxfp4.py
+ - vllm/v1/attention/backends/flashinfer.py
+ commands:
+ - pytest -s -v tests/quantization/test_blackwell_moe.py
+
+- label: Quantized Models Test
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - vllm/model_executor/layers/quantization
+ - tests/models/quantization
+ commands:
+ - pytest -v -s models/quantization
diff --git a/.buildkite/test_areas/samplers.yaml b/.buildkite/test_areas/samplers.yaml
new file mode 100644
index 0000000000000..ad377148fd073
--- /dev/null
+++ b/.buildkite/test_areas/samplers.yaml
@@ -0,0 +1,14 @@
+group: Samplers
+depends_on:
+ - image-build
+steps:
+- label: Samplers Test
+ timeout_in_minutes: 75
+ source_file_dependencies:
+ - vllm/model_executor/layers
+ - vllm/sampling_metadata.py
+ - tests/samplers
+ - tests/conftest.py
+ commands:
+ - pytest -v -s samplers
+ - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
diff --git a/.buildkite/test_areas/tool_use.yaml b/.buildkite/test_areas/tool_use.yaml
new file mode 100644
index 0000000000000..69527a1214229
--- /dev/null
+++ b/.buildkite/test_areas/tool_use.yaml
@@ -0,0 +1,13 @@
+group: Tool use
+depends_on:
+ - image-build
+steps:
+- label: OpenAI-Compatible Tool Use
+ timeout_in_minutes: 35
+ mirror_hardwares: [amdexperimental]
+ fast_check: false
+ source_file_dependencies:
+ - vllm/
+ - tests/tool_use
+ commands:
+ - pytest -v -s tool_use
diff --git a/.buildkite/test_areas/weight_loading.yaml b/.buildkite/test_areas/weight_loading.yaml
new file mode 100644
index 0000000000000..cfc5bb20fe7ad
--- /dev/null
+++ b/.buildkite/test_areas/weight_loading.yaml
@@ -0,0 +1,25 @@
+group: Weight Loading
+depends_on:
+ - image-build
+steps:
+- label: Weight Loading Multiple GPU # 33min
+ timeout_in_minutes: 45
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/weight_loading
+ commands:
+ - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+
+- label: Weight Loading Multiple GPU - Large Models # optional
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ gpu: a100
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/weight_loading
+ commands:
+ - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
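Taken together, the test-area files added above share a recurring step shape. The sketch below is illustrative only: the field names are taken from the files in this diff, but the group, step, paths, and per-field glosses are assumptions for readability, not documentation of the pipeline generator.

```yaml
group: Example Area                  # display name for this set of steps
depends_on:
  - image-build                      # steps wait for the CI image unless overridden per step
steps:
- label: Example Test (2 GPUs)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
  gpu: h100                          # request a specific GPU type (optional)
  num_gpus: 2                        # number of GPUs for the step (optional)
  optional: true                     # not run by default (optional)
  parallelism: 2                     # shard the step across parallel jobs (optional)
  source_file_dependencies:          # changed paths that trigger this step
    - vllm/example/
    - tests/example/
  commands:
    - pytest -v -s example
```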
diff --git a/.github/mergify.yml b/.github/mergify.yml
index 997a40e18e588..3ad79f93bc7ad 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -14,6 +14,52 @@ pull_request_rules:
comment:
message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
+- name: comment-pre-commit-failure
+ description: Comment on PR when pre-commit check fails
+ conditions:
+ - status-failure=pre-commit
+ - -closed
+ - -draft
+ actions:
+ comment:
+ message: |
+ Hi @{{author}}, the pre-commit checks have failed. Please run:
+
+ ```bash
+ uv pip install pre-commit
+ pre-commit install
+ pre-commit run --all-files
+ ```
+
+ Then, commit the changes and push to your branch.
+
+ For future commits, `pre-commit` will run automatically on changed files before each commit.
+
+ > [!TIP]
+ >
+ > Is mypy or markdownlint failing?
+ >
+ > mypy and markdownlint are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
+ >
+ > ```bash
+ > # For mypy (substitute "3.10" with the failing version if needed)
+ > pre-commit run --hook-stage manual mypy-3.10
+ > # For markdownlint
+ > pre-commit run --hook-stage manual markdownlint
+ > ```
+ >
+
+- name: comment-dco-failure
+ description: Comment on PR when DCO check fails
+ conditions:
+ - status-failure=dco
+ - -closed
+ - -draft
+ actions:
+ comment:
+ message: |
+ Hi @{{author}}, the DCO check has failed. Please click on DCO in the Checks section for instructions on how to resolve this.
+
- name: label-ci-build
description: Automatically apply ci/build label
conditions:
@@ -140,7 +186,7 @@ pull_request_rules:
- files~=^tests/entrypoints/test_context.py
- files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
- files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
- - files~=^vllm/entrypoints/harmony_utils.py
+ - files~=^vllm/entrypoints/openai/parser/harmony_utils.py
- files~=^vllm/entrypoints/tool_server.py
- files~=^vllm/entrypoints/tool.py
- files~=^vllm/entrypoints/context.py
@@ -358,4 +404,4 @@ pull_request_rules:
actions:
label:
add:
- - kv-connector
\ No newline at end of file
+ - kv-connector
diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml
index 56fbe5ca704a1..df8910837715d 100644
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@@ -13,7 +13,7 @@ jobs:
steps:
- name: Checkout repository
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+ uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
- name: Set up Python
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml
index 3a12c4b3a8300..e80a5c0cc80f9 100644
--- a/.github/workflows/macos-smoke-test.yml
+++ b/.github/workflows/macos-smoke-test.yml
@@ -12,7 +12,7 @@ jobs:
timeout-minutes: 30
steps:
- - uses: actions/checkout@v6
+ - uses: actions/checkout@v6.0.1
- uses: astral-sh/setup-uv@v7
with:
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index a03b979ad761d..1041653c2f57e 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -16,7 +16,7 @@ jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+ - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
- uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with:
python-version: "3.12"
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index dca3089f496c9..44bf71db5e9de 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -7,13 +7,15 @@ on:
jobs:
close-issues-and-pull-requests:
+ # Prevents triggering on forks or other repos
+ if: github.repository == 'vllm-project/vllm'
permissions:
issues: write
pull-requests: write
actions: write
runs-on: ubuntu-latest
steps:
- - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
+ - uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
with:
# Increasing this value ensures that changes to this workflow
# propagate to all issues and PRs in days rather than months
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e09972fe71995..cd52df86e0346 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -384,7 +384,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
- PYTHONPATH=$PYTHONPATH
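+ # Use $ENV{PYTHONPATH}: a bare $PYTHONPATH is passed through literally, since CMake only expands ${VAR} and $ENV{VAR}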
+ PYTHONPATH=$ENV{PYTHONPATH}
${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
RESULT_VARIABLE marlin_generation_result
OUTPUT_VARIABLE marlin_generation_result
@@ -822,7 +822,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
- PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
+ PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$ENV{PYTHONPATH}
${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
RESULT_VARIABLE machete_generation_result
OUTPUT_VARIABLE machete_generation_output
@@ -874,7 +874,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
set(SRCS
- "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")
+ "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu"
+ "csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu"
+ "csrc/quantization/cutlass_w4a8/w4a8_utils.cu"
+ )
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@@ -944,7 +947,6 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp"
"csrc/moe/moe_align_sum_kernels.cu"
- "csrc/moe/moe_lora_align_sum_kernels.cu"
"csrc/moe/topk_softmax_kernels.cu")
if(VLLM_GPU_LANG STREQUAL "CUDA")
@@ -1002,7 +1004,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
- PYTHONPATH=$PYTHONPATH
+ PYTHONPATH=$ENV{PYTHONPATH}
${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
RESULT_VARIABLE moe_marlin_generation_result
OUTPUT_VARIABLE moe_marlin_generation_output
diff --git a/README.md b/README.md
index 5c040fe4a66d2..26222b815370d 100644
--- a/README.md
+++ b/README.md
@@ -143,11 +143,13 @@ Compute Resources:
- Databricks
- DeepInfra
- Google Cloud
+- IBM
- Intel
- Lambda Lab
- Nebius
- Novita AI
- NVIDIA
+- Red Hat
- Replicate
- Roblox
- RunPod
diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh
index 56b721cbb4021..a245e2022e605 100644
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@@ -18,6 +18,11 @@ MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
+HOSTNAME=$(hostname)
+if [[ -z "$HOSTNAME" ]]; then
+ echo "Error: Failed to determine hostname." >&2
+ exit 1
+fi
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"
@@ -82,6 +87,7 @@ start_server() {
"$MODEL"
"--disable-log-requests"
"--port" "8004"
+ "--host" "$HOSTNAME"
"--gpu-memory-utilization" "$gpu_memory_utilization"
"--max-num-seqs" "$max_num_seqs"
"--max-num-batched-tokens" "$max_num_batched_tokens"
@@ -96,8 +102,9 @@ start_server() {
# This correctly passes each element as a separate argument.
if [[ -n "$profile_dir" ]]; then
# Start server with profiling enabled
- VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
- vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
+ local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}"
+ VLLM_SERVER_DEV_MODE=1 \
+ vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 &
else
# Start server without profiling
VLLM_SERVER_DEV_MODE=1 \
@@ -112,7 +119,7 @@ start_server() {
# since that we should always have permission to send signal to the server process.
kill -0 $server_pid 2> /dev/null || break
- RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
+ RESPONSE=$(curl -s -X GET "http://${HOSTNAME}:8004/health" -w "%{http_code}" -o /dev/stdout)
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
if [[ "$STATUS_CODE" -eq 200 ]]; then
server_started=1
@@ -172,6 +179,7 @@ run_benchmark() {
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 1000 \
--random-prefix-len $prefix_len \
+ --host "$HOSTNAME" \
--port 8004 &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
@@ -187,7 +195,7 @@ run_benchmark() {
request_rate=$((${throughput%.*} + 1))
while ((request_rate > 0)); do
# clear prefix cache
- curl -X POST http://0.0.0.0:8004/reset_prefix_cache
+ curl -X POST http://${HOSTNAME}:8004/reset_prefix_cache
sleep 5
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
vllm bench serve \
@@ -203,6 +211,7 @@ run_benchmark() {
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--random-prefix-len $prefix_len \
+ --host "$HOSTNAME" \
--port 8004 &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
@@ -303,6 +312,7 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--random-prefix-len $prefix_len \
+ --host "$HOSTNAME" \
--port 8004 \
--profile &> "$bm_log"
else
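Note on the profiler change above: the `--profiler-config` value is a JSON object with `profiler` and `torch_profiler_dir` keys, built in the script as a hand-escaped shell string. A minimal sketch (assuming only the two keys used in the script; the helper name is made up) of producing the same value with `json.dumps`, which handles quoting automatically:

```python
import json


def profiler_config(profile_dir: str) -> str:
    # Hypothetical helper mirroring the JSON literal passed to --profiler-config above.
    return json.dumps({"profiler": "torch", "torch_profiler_dir": profile_dir})


print(profiler_config("/tmp/vllm-profile"))
# {"profiler": "torch", "torch_profiler_dir": "/tmp/vllm-profile"}
```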
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index d69d74ca61f54..831b76b66e096 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -620,7 +620,7 @@ def get_tokenizer(
kwargs["use_fast"] = False
if tokenizer_mode == "mistral":
try:
- from vllm.tokenizers import MistralTokenizer
+ from vllm.tokenizers.mistral import MistralTokenizer
except ImportError as e:
raise ImportError(
"MistralTokenizer requires vllm package.\n"
diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py
index cac401456b62a..b5373d383b548 100644
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@@ -32,12 +32,11 @@ def benchmark_propose(args):
model_config = ModelConfig(
model="facebook/opt-125m",
- task="generate",
max_model_len=args.num_token + args.num_spec_token,
tokenizer="facebook/opt-125m",
tokenizer_mode="auto",
dtype="auto",
- seed=None,
+ seed=0,
trust_remote_code=False,
)
proposer = NgramProposer(
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index df122b4c5e8db..33aca831883aa 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -574,7 +574,7 @@ async def benchmark(
)
print(
"{:<40} {:<10.2f}".format(
- "Total Token throughput (tok/s):", metrics.total_token_throughput
+ "Total token throughput (tok/s):", metrics.total_token_throughput
)
)
@@ -963,8 +963,7 @@ def create_argument_parser():
parser.add_argument(
"--profile",
action="store_true",
- help="Use Torch Profiler. The endpoint must be launched with "
- "VLLM_TORCH_PROFILER_DIR to enable profiler.",
+ help="Use vLLM Profiling. --profiler-config must be provided on the server.",
)
parser.add_argument(
"--result-dir",
diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
index d809bf1db8cbc..fb3329975cee3 100644
--- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
+++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
@@ -14,6 +14,9 @@ from tqdm import tqdm
import vllm._custom_ops as ops
from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+ per_token_group_quant_fp8,
+)
@dataclass
@@ -22,6 +25,7 @@ class bench_params_t:
hidden_size: int
add_residual: bool
dtype: torch.dtype
+ group_size: list[int]
def description(self):
return (
@@ -29,6 +33,7 @@ class bench_params_t:
f"x D {self.hidden_size} "
f"x R {self.add_residual} "
f"x DT {self.dtype}"
+ f"x GS {self.group_size}"
)
@@ -38,10 +43,11 @@ def get_bench_params() -> list[bench_params_t]:
HIDDEN_SIZES = list(range(1024, 8129, 1024))
ADD_RESIDUAL = [True, False]
DTYPES = [torch.bfloat16, torch.float]
+ GROUP_SIZES = [[1, 64], [1, 128]]
- combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES)
+ combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES, GROUP_SIZES)
bench_params = list(
- map(lambda x: bench_params_t(x[0], x[1], x[2], x[3]), combinations)
+ map(lambda x: bench_params_t(x[0], x[1], x[2], x[3], x[4]), combinations)
)
return bench_params
@@ -52,6 +58,7 @@ def unfused_int8_impl(
x: torch.Tensor,
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
+ group_size: list[int],
):
# Norm
torch_out = None
@@ -69,6 +76,7 @@ def unfused_fp8_impl(
x: torch.Tensor,
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
+ group_size: list[int],
):
# Norm
torch_out = None
@@ -81,23 +89,63 @@ def unfused_fp8_impl(
torch_out, _ = ops.scaled_fp8_quant(torch_out)
+def unfused_groupwise_fp8_impl(
+ rms_norm_layer: RMSNorm,
+ x: torch.Tensor,
+ residual: torch.Tensor | None,
+ quant_dtype: torch.dtype,
+ group_size: list[int],
+):
+ # Norm
+ torch_out = None
+ if residual is None:
+ torch_out = rms_norm_layer.forward_cuda(x, residual)
+ else:
+ torch_out, _ = rms_norm_layer.forward_cuda(x, residual)
+
+ # Quant
+ torch_out, _ = per_token_group_quant_fp8(
+ torch_out, group_size=group_size[1], use_ue8m0=False
+ )
+
+
def fused_impl(
rms_norm_layer: RMSNorm, # this stores the weights
x: torch.Tensor,
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
+ group_size: list[int],
):
out, _ = ops.rms_norm_dynamic_per_token_quant(
x, rms_norm_layer.weight, 1e-6, quant_dtype, residual=residual
)
+def fused_groupwise_impl(
+ rms_norm_layer: RMSNorm, # this stores the weights
+ x: torch.Tensor,
+ residual: torch.Tensor | None,
+ quant_dtype: torch.dtype,
+ group_size: list[int],
+):
+ out, _ = ops.rms_norm_per_block_quant(
+ x,
+ rms_norm_layer.weight,
+ 1e-6,
+ quant_dtype,
+ group_size,
+ residual=residual,
+ is_scale_transposed=True,
+ )
+
+
# Bench functions
def bench_fn(
rms_norm_layer: RMSNorm,
x: torch.Tensor,
residual: torch.Tensor,
quant_dtype: torch.dtype,
+ group_size: list[int],
label: str,
sub_label: str,
fn: Callable,
@@ -110,10 +158,11 @@ def bench_fn(
"x": x,
"residual": residual,
"quant_dtype": quant_dtype,
+ "group_size": group_size,
"fn": fn,
}
return TBenchmark.Timer(
- stmt="fn(rms_norm_layer, x, residual, quant_dtype)",
+ stmt="fn(rms_norm_layer, x, residual, quant_dtype, group_size)",
globals=globals,
label=label,
sub_label=sub_label,
@@ -147,6 +196,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
x,
residual,
torch.int8,
+ params.group_size,
label,
sub_label,
unfused_int8_impl,
@@ -161,6 +211,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
x,
residual,
torch.float8_e4m3fn,
+ params.group_size,
label,
sub_label,
unfused_fp8_impl,
@@ -175,6 +226,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
x,
residual,
torch.int8,
+ params.group_size,
label,
sub_label,
fused_impl,
@@ -189,6 +241,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
x,
residual,
torch.float8_e4m3fn,
+ params.group_size,
label,
sub_label,
fused_impl,
@@ -196,6 +249,36 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
)
)
+ # unfused groupwise fp8 impl.
+ timers.append(
+ bench_fn(
+ layer,
+ x,
+ residual,
+ torch.float8_e4m3fn,
+ params.group_size,
+ label,
+ sub_label,
+ unfused_groupwise_fp8_impl,
+ "unfused_groupwise_fp8_impl",
+ )
+ )
+
+ # fused groupwise fp8 impl.
+ timers.append(
+ bench_fn(
+ layer,
+ x,
+ residual,
+ torch.float8_e4m3fn,
+ params.group_size,
+ label,
+ sub_label,
+ fused_groupwise_impl,
+ "fused_groupwise_fp8_impl",
+ )
+ )
+
print_timers(timers)
return timers
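For context on the new groupwise benchmarks: per-token-group FP8 quantization splits each row into fixed-size groups along the hidden dimension and assigns one scale per (token, group). A pure-PyTorch reference sketch (illustration only, not the vLLM kernel; assumes the hidden size is divisible by the group size and a PyTorch build that provides `float8_e4m3fn`):

```python
import torch


def groupwise_fp8_quant_ref(x: torch.Tensor, group_size: int):
    """Quantize each (token, group) slice to float8_e4m3fn with its own scale."""
    num_tokens, hidden = x.shape
    assert hidden % group_size == 0
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    groups = x.view(num_tokens, hidden // group_size, group_size).float()
    # One scale per token per group, so each group's max maps to the fp8 range.
    scales = groups.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12) / fp8_max
    q = (groups / scales).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)
    return q.view(num_tokens, hidden), scales.squeeze(-1)


x = torch.randn(4, 256, dtype=torch.bfloat16)
q, s = groupwise_fp8_quant_ref(x, group_size=128)
print(q.shape, s.shape)  # torch.Size([4, 256]) torch.Size([4, 2])
```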
diff --git a/benchmarks/kernels/benchmark_mla_k_concat.py b/benchmarks/kernels/benchmark_mla_k_concat.py
new file mode 100644
index 0000000000000..fb3b6c8f12003
--- /dev/null
+++ b/benchmarks/kernels/benchmark_mla_k_concat.py
@@ -0,0 +1,150 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Benchmark script comparing torch.cat vs direct copy for k_nope/k_pe concatenation
+in MLA (Multi-head Latent Attention) prefill.
+
+This validates that the optimization from commit 8d4142bd is beneficial across
+various batch sizes, not just the originally tested batch size of 32768.
+"""
+
+import time
+from collections.abc import Callable
+
+import torch
+
+# DeepSeek-V3 MLA dimensions
+NUM_HEADS = 128
+QK_NOPE_HEAD_DIM = 128
+PE_DIM = 64
+
+
+def cat_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
+ """Original torch.cat approach with expand."""
+ return torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
+
+
+def direct_copy_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
+ """Optimized direct copy approach (avoids expand + cat overhead)."""
+ k = torch.empty(
+ (*k_nope.shape[:-1], k_nope.shape[-1] + k_pe.shape[-1]),
+ dtype=k_nope.dtype,
+ device=k_nope.device,
+ )
+ k[..., : k_nope.shape[-1]] = k_nope
+ k[..., k_nope.shape[-1] :] = k_pe
+ return k
+
+
+def benchmark_method(
+ method: Callable,
+ k_nope: torch.Tensor,
+ k_pe: torch.Tensor,
+ num_warmup: int = 10,
+ num_iters: int = 100,
+) -> float:
+ """Benchmark a concatenation method and return mean latency in ms."""
+ # Warmup
+ for _ in range(num_warmup):
+ _ = method(k_nope, k_pe)
+ torch.cuda.synchronize()
+
+ # Benchmark
+ start = time.perf_counter()
+ for _ in range(num_iters):
+ _ = method(k_nope, k_pe)
+ torch.cuda.synchronize()
+ end = time.perf_counter()
+
+ return (end - start) / num_iters * 1000 # Convert to ms
+
+
+@torch.inference_mode()
+def run_benchmark(dtype: torch.dtype, dtype_name: str):
+ """Run benchmark for a specific dtype."""
+ torch.set_default_device("cuda")
+
+ # Batch sizes to test (powers of 2 from 32 to 65536)
+ batch_sizes = [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536]
+
+ print("=" * 80)
+ print("Benchmark: torch.cat vs direct copy for MLA k_nope/k_pe concatenation")
+ print("=" * 80)
+ print(
+ f"Tensor shapes: k_nope=[B, {NUM_HEADS}, {QK_NOPE_HEAD_DIM}], "
+ f"k_pe=[B, 1, {PE_DIM}]"
+ )
+ print(f"dtype: {dtype_name}")
+ print()
+ print(
+ f"{'Batch Size':>12} | {'cat (ms)':>10} | {'direct (ms)':>12} | "
+ f"{'Speedup':>8} | {'Reduction':>10}"
+ )
+ print("-" * 70)
+
+ results = []
+ for batch_size in batch_sizes:
+ # Create input tensors (generate in float32 then convert for FP8 compatibility)
+ k_nope = torch.randn(
+ batch_size, NUM_HEADS, QK_NOPE_HEAD_DIM, dtype=torch.float32, device="cuda"
+ ).to(dtype)
+ k_pe = torch.randn(
+ batch_size, 1, PE_DIM, dtype=torch.float32, device="cuda"
+ ).to(dtype)
+
+ # Benchmark both methods
+ cat_time = benchmark_method(cat_method, k_nope, k_pe)
+ direct_time = benchmark_method(direct_copy_method, k_nope, k_pe)
+
+ speedup = cat_time / direct_time
+ reduction = (1 - direct_time / cat_time) * 100
+
+ results.append((batch_size, cat_time, direct_time, speedup, reduction))
+
+ print(
+ f"{batch_size:>12} | {cat_time:>10.3f} | {direct_time:>12.3f} | "
+ f"{speedup:>7.2f}x | {reduction:>9.1f}%"
+ )
+
+ print("=" * 80)
+
+ # Summary statistics
+ speedups = [r[3] for r in results]
+ print("\nSpeedup summary:")
+ print(f" Min: {min(speedups):.2f}x")
+ print(f" Max: {max(speedups):.2f}x")
+ print(f" Mean: {sum(speedups) / len(speedups):.2f}x")
+
+ # Find crossover point
+ crossover_batch = None
+ for batch_size, _, _, speedup, _ in results:
+ if speedup >= 1.0:
+ crossover_batch = batch_size
+ break
+
+ print("\nConclusion:")
+ if crossover_batch:
+ print(f" - Direct copy becomes beneficial at batch size >= {crossover_batch}")
+ # Filter for large batches (>= 512 which is typical for prefill)
+ large_batch_speedups = [r[3] for r in results if r[0] >= 512]
+ if large_batch_speedups:
+ avg_large = sum(large_batch_speedups) / len(large_batch_speedups)
+ print(f" - For batch sizes >= 512: avg speedup = {avg_large:.2f}x")
+ print(" - MLA prefill typically uses large batches, so optimization is effective")
+
+ return results
+
+
+@torch.inference_mode()
+def main():
+ # Test bfloat16
+ print("\n")
+ run_benchmark(torch.bfloat16, "bfloat16")
+
+ # Test float8_e4m3fn
+ print("\n")
+ run_benchmark(torch.float8_e4m3fn, "float8_e4m3fn")
+
+
+if __name__ == "__main__":
+ main()
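Before trusting the timings from this new benchmark, it is cheap to confirm the two code paths produce identical tensors. A small sanity check (a sketch; it assumes it is run from `benchmarks/kernels/` so the module above is importable):

```python
import torch

from benchmark_mla_k_concat import (
    NUM_HEADS,
    PE_DIM,
    QK_NOPE_HEAD_DIM,
    cat_method,
    direct_copy_method,
)

k_nope = torch.randn(8, NUM_HEADS, QK_NOPE_HEAD_DIM)
k_pe = torch.randn(8, 1, PE_DIM)
# Both paths should yield the same [8, NUM_HEADS, QK_NOPE_HEAD_DIM + PE_DIM] tensor.
assert torch.equal(cat_method(k_nope, k_pe), direct_copy_method(k_nope, k_pe))
print("cat and direct-copy outputs match")
```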
diff --git a/benchmarks/kernels/benchmark_moe_align_block_size.py b/benchmarks/kernels/benchmark_moe_align_block_size.py
index f540cff6261a8..5f9a131f79b0e 100644
--- a/benchmarks/kernels/benchmark_moe_align_block_size.py
+++ b/benchmarks/kernels/benchmark_moe_align_block_size.py
@@ -24,12 +24,15 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
num_tokens_range = [1, 16, 256, 4096]
num_experts_range = [16, 64, 224, 256, 280, 512]
topk_range = [1, 2, 8]
-configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
+ep_size_range = [1, 8]
+configs = list(
+ itertools.product(num_tokens_range, num_experts_range, topk_range, ep_size_range)
+)
@triton.testing.perf_report(
triton.testing.Benchmark(
- x_names=["num_tokens", "num_experts", "topk"],
+ x_names=["num_tokens", "num_experts", "topk", "ep_size"],
x_vals=configs,
line_arg="provider",
line_vals=["vllm"],
@@ -38,16 +41,26 @@ configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range
args={},
)
)
-def benchmark(num_tokens, num_experts, topk, provider):
+def benchmark(num_tokens, num_experts, topk, ep_size, provider):
"""Benchmark function for Triton."""
block_size = 256
+ torch.cuda.manual_seed_all(0)
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
+ e_map = None
+ if ep_size != 1:
+ local_e = num_experts // ep_size
+ e_ids = torch.randperm(num_experts, device="cuda", dtype=torch.int32)[:local_e]
+ e_map = torch.full((num_experts,), -1, device="cuda", dtype=torch.int32)
+ e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32)
+
quantiles = [0.5, 0.2, 0.8]
if provider == "vllm":
ms, min_ms, max_ms = triton.testing.do_bench(
- lambda: moe_align_block_size(topk_ids, block_size, num_experts),
+ lambda: moe_align_block_size(
+ topk_ids, block_size, num_experts, e_map, ignore_invalid_experts=True
+ ),
quantiles=quantiles,
)
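The `e_map` added above is a global-to-local expert table for expert parallelism: experts owned by the current rank map to a local index, and everything else maps to -1 so the kernel can ignore it. A standalone sketch (made-up sizes, CPU tensors) of the remapping it performs:

```python
import torch

num_experts, ep_size = 16, 4
local_e = num_experts // ep_size

# Pretend this EP rank owns a random subset of experts, as in the benchmark above.
e_ids = torch.randperm(num_experts)[:local_e]
e_map = torch.full((num_experts,), -1, dtype=torch.int32)
e_map[e_ids] = torch.arange(local_e, dtype=torch.int32)

topk_ids = torch.randint(0, num_experts, (6, 2))  # [num_tokens, topk] router output
local_ids = e_map[topk_ids]  # -1 marks experts hosted on other ranks
print(local_ids)
```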
diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py
index 83bd91917508f..09de5fa822f86 100644
--- a/benchmarks/kernels/benchmark_mrope.py
+++ b/benchmarks/kernels/benchmark_mrope.py
@@ -99,7 +99,6 @@ def benchmark_mrope(
# the parameters to compute the q k v size based on tp_size
mrope_helper_class = get_rope(
head_size=head_dim,
- rotary_dim=head_dim,
max_position=max_position,
is_neox_style=is_neox_style,
rope_parameters=rope_parameters,
diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py
index 074b7a440b612..7a1bc050bb33f 100644
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -32,8 +32,8 @@ def get_benchmark(head_size, rotary_dim, is_neox_style, device):
def benchmark(batch_size, seq_len, num_heads, provider):
dtype = torch.bfloat16
max_position = 8192
- base = 10000
- rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
+ rope_parameters = {"partial_rotary_factor": rotary_dim / head_size}
+ rope = get_rope(head_size, max_position, is_neox_style, rope_parameters)
rope = rope.to(dtype=dtype, device=device)
cos_sin_cache = rope.cos_sin_cache.to(dtype=torch.float, device=device)
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index fbbb03c5ed465..85b286f8d8d0a 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -251,17 +251,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
endif()
# Build ACL with CMake
- set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF")
- set(CMAKE_BUILD_TYPE "Release")
- set(ARM_COMPUTE_ARCH "armv8.2-a")
- set(ARM_COMPUTE_ENABLE_ASSERTS "OFF")
- set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF")
- set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
- set(ARM_COMPUTE_ENABLE_OPENMP "ON")
- set(ARM_COMPUTE_ENABLE_WERROR "OFF")
- set(ARM_COMPUTE_BUILD_EXAMPLES "OFF")
- set(ARM_COMPUTE_BUILD_TESTING "OFF")
-
set(_cmake_config_cmd
${CMAKE_COMMAND} -G Ninja -B build
-DARM_COMPUTE_BUILD_SHARED_LIB=OFF
diff --git a/cmake/external_projects/flashmla.cmake b/cmake/external_projects/flashmla.cmake
index 2cf3c1a755d3c..0d4f9b7aa07c8 100644
--- a/cmake/external_projects/flashmla.cmake
+++ b/cmake/external_projects/flashmla.cmake
@@ -35,16 +35,21 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
# sm90a
set(SUPPORT_ARCHS)
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3)
- list(APPEND SUPPORT_ARCHS 9.0a)
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3)
+ list(APPEND SUPPORT_ARCHS "9.0a")
endif()
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8)
- list(APPEND SUPPORT_ARCHS 10.0a)
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9)
+  # CUDA 12.9 introduced "Family-Specific Architecture Features";
+  # "10.0f" covers the whole compute_10x family
+ list(APPEND SUPPORT_ARCHS "10.0f")
+elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+ list(APPEND SUPPORT_ARCHS "10.0a")
endif()
cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
if(FLASH_MLA_ARCHS)
+ message(STATUS "FlashMLA CUDA architectures: ${FLASH_MLA_ARCHS}")
set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")
@@ -126,7 +131,8 @@ if(FLASH_MLA_ARCHS)
$<$:-UPy_LIMITED_API>
$<$:-UPy_LIMITED_API>)
else()
- # Create empty targets for setup.py when not targeting sm90a systems
+ message(STATUS "FlashMLA will not compile: unsupported CUDA architecture ${CUDA_ARCHS}")
+ # Create empty targets for setup.py on unsupported systems
add_custom_target(_flashmla_C)
add_custom_target(_flashmla_extension_C)
endif()
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 5047c354ff7d2..bdb2ba74d944d 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -140,16 +140,21 @@ function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR)
run_python(_VLLM_TORCH_GOMP_PATH
"
import os, glob
-try:
- import torch
- torch_pkg = os.path.dirname(torch.__file__)
- site_root = os.path.dirname(torch_pkg)
- torch_libs = os.path.join(site_root, 'torch.libs')
- print(glob.glob(os.path.join(torch_libs, 'libgomp-*.so*'))[0])
-except:
- print('')
+import torch
+torch_pkg = os.path.dirname(torch.__file__)
+site_root = os.path.dirname(torch_pkg)
+
+# Search both torch.libs and torch/lib
+roots = [os.path.join(site_root, 'torch.libs'), os.path.join(torch_pkg, 'lib')]
+candidates = []
+for root in roots:
+ if not os.path.isdir(root):
+ continue
+ candidates.extend(glob.glob(os.path.join(root, 'libgomp*.so*')))
+
+print(candidates[0] if candidates else '')
"
- "failed to probe torch.libs for libgomp")
+ "failed to probe for libgomp")
if(_VLLM_TORCH_GOMP_PATH STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}")
return()
diff --git a/csrc/cache.h b/csrc/cache.h
index f2a5ec0acf5cd..cbe44c09eb624 100644
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -1,6 +1,7 @@
#pragma once
#include
+#include
#include