diff --git a/.buildkite/ci_config.yaml b/.buildkite/ci_config.yaml
new file mode 100644
index 0000000000000..199c33159fde3
--- /dev/null
+++ b/.buildkite/ci_config.yaml
@@ -0,0 +1,24 @@
+name: vllm_ci
+job_dirs:
+ - ".buildkite/test_areas"
+ - ".buildkite/image_build"
+run_all_patterns:
+ - "docker/Dockerfile"
+ - "CMakeLists.txt"
+ - "requirements/common.txt"
+ - "requirements/cuda.txt"
+ - "requirements/build.txt"
+ - "requirements/test.txt"
+ - "setup.py"
+ - "csrc/"
+ - "cmake/"
+run_all_exclude_patterns:
+ - "docker/Dockerfile."
+ - "csrc/cpu/"
+ - "csrc/rocm/"
+ - "cmake/hipify.py"
+ - "cmake/cpu_extension.cmake"
+registries: public.ecr.aws/q9t5s3a7
+repositories:
+ main: "vllm-ci-postmerge-repo"
+ premerge: "vllm-ci-test-repo"
diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py
deleted file mode 100644
index bbed80ebe8476..0000000000000
--- a/.buildkite/generate_index.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import argparse
-import os
-
-template = """
-
-
- Links for vLLM
- {x86_wheel}
- {arm_wheel}
-
-
-"""
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--wheel", help="The wheel path.", required=True)
-args = parser.parse_args()
-
-filename = os.path.basename(args.wheel)
-
-with open("index.html", "w") as f:
- print(f"Generated index.html for {args.wheel}")
- # sync the abi tag with .buildkite/scripts/upload-wheels.sh
- if "x86_64" in filename:
- x86_wheel = filename
- arm_wheel = filename.replace("x86_64", "aarch64").replace(
- "manylinux1", "manylinux2014"
- )
- elif "aarch64" in filename:
- x86_wheel = filename.replace("aarch64", "x86_64").replace(
- "manylinux2014", "manylinux1"
- )
- arm_wheel = filename
- else:
- raise ValueError(f"Unsupported wheel: {filename}")
- # cloudfront requires escaping the '+' character
- f.write(
- template.format(
- x86_wheel=x86_wheel,
- x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
- arm_wheel=arm_wheel,
- arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
- )
- )
diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh
new file mode 100755
index 0000000000000..9a2384e524b63
--- /dev/null
+++ b/.buildkite/image_build/image_build.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 8 ]]; then
+ echo "Usage: $0 "
+ exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+BRANCH=$4
+VLLM_USE_PRECOMPILED=$5
+VLLM_MERGE_BASE_COMMIT=$6
+CACHE_FROM=$7
+CACHE_TO=$8
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
+
+# docker buildx
+docker buildx create --name vllm-builder --driver docker-container --use
+docker buildx inspect --bootstrap
+docker buildx ls
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
+ echo "Image not found, proceeding with build..."
+else
+ echo "Image found"
+ exit 0
+fi
+
+if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
+ merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
+else
+ merge_base_commit_build_args=""
+fi
+
+# build
+docker buildx build --file docker/Dockerfile \
+ --build-arg max_jobs=16 \
+ --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+ --build-arg USE_SCCACHE=1 \
+ --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
+ --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
+ --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
+ ${merge_base_commit_build_args} \
+ --cache-from type=registry,ref=${CACHE_FROM},mode=max \
+ --cache-to type=registry,ref=${CACHE_TO},mode=max \
+ --tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
+ $( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
+ --push \
+ --target test \
+ --progress plain .
diff --git a/.buildkite/image_build/image_build.yaml b/.buildkite/image_build/image_build.yaml
new file mode 100644
index 0000000000000..d01c71dd9becf
--- /dev/null
+++ b/.buildkite/image_build/image_build.yaml
@@ -0,0 +1,57 @@
+group: Abuild
+steps:
+ - label: ":docker: Build image"
+ key: image-build
+ depends_on: []
+ commands:
+ - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
+ retry:
+ automatic:
+ - exit_status: -1 # Agent was lost
+ limit: 2
+ - exit_status: -10 # Agent was lost
+ limit: 2
+
+ - label: ":docker: Build CPU image"
+ key: image-build-cpu
+ depends_on: []
+ commands:
+ - .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+ env:
+ DOCKER_BUILDKIT: "1"
+ retry:
+ automatic:
+ - exit_status: -1 # Agent was lost
+ limit: 2
+ - exit_status: -10 # Agent was lost
+ limit: 2
+
+ - label: ":docker: Build HPU image"
+ soft_fail: true
+ depends_on: []
+ key: image-build-hpu
+ commands:
+ - .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+ env:
+ DOCKER_BUILDKIT: "1"
+ retry:
+ automatic:
+ - exit_status: -1 # Agent was lost
+ limit: 2
+ - exit_status: -10 # Agent was lost
+ limit: 2
+
+ - label: ":docker: Build CPU arm64 image"
+ key: cpu-arm64-image-build
+ depends_on: []
+ optional: true
+ commands:
+ - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+ env:
+ DOCKER_BUILDKIT: "1"
+ retry:
+ automatic:
+ - exit_status: -1 # Agent was lost
+ limit: 2
+ - exit_status: -10 # Agent was lost
+ limit: 2
diff --git a/.buildkite/image_build/image_build_cpu.sh b/.buildkite/image_build/image_build_cpu.sh
new file mode 100755
index 0000000000000..a69732f430985
--- /dev/null
+++ b/.buildkite/image_build/image_build_cpu.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 3 ]]; then
+ echo "Usage: $0 "
+ exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
+ echo "Image not found, proceeding with build..."
+else
+ echo "Image found"
+ exit 0
+fi
+
+# build
+docker build --file docker/Dockerfile.cpu \
+ --build-arg max_jobs=16 \
+ --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+ --build-arg VLLM_CPU_AVX512BF16=true \
+ --build-arg VLLM_CPU_AVX512VNNI=true \
+ --build-arg VLLM_CPU_AMXBF16=true \
+ --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
+ --target vllm-test \
+ --progress plain .
+
+# push
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
diff --git a/.buildkite/image_build/image_build_cpu_arm64.sh b/.buildkite/image_build/image_build_cpu_arm64.sh
new file mode 100755
index 0000000000000..615298b6555bd
--- /dev/null
+++ b/.buildkite/image_build/image_build_cpu_arm64.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 3 ]]; then
+ echo "Usage: $0 "
+ exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
+ echo "Image not found, proceeding with build..."
+else
+ echo "Image found"
+ exit 0
+fi
+
+# build
+docker build --file docker/Dockerfile.cpu \
+ --build-arg max_jobs=16 \
+ --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+ --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
+ --target vllm-test \
+ --progress plain .
+
+# push
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
diff --git a/.buildkite/image_build/image_build_hpu.sh b/.buildkite/image_build/image_build_hpu.sh
new file mode 100755
index 0000000000000..192447ef4577e
--- /dev/null
+++ b/.buildkite/image_build/image_build_hpu.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 3 ]]; then
+ echo "Usage: $0 "
+ exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
+ echo "Image not found, proceeding with build..."
+else
+ echo "Image found"
+ exit 0
+fi
+
+# build
+docker build \
+ --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
+ --build-arg max_jobs=16 \
+ --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+ --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
+ --progress plain \
+ https://github.com/vllm-project/vllm-gaudi.git
+
+# push
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
index 46f1a9fbf6ff9..6c0b5540cbb6a 100644
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
@@ -8,3 +8,4 @@ tasks:
value: 0.80
limit: 250 # will run on 250 * 14 subjects = 3500 samples
num_fewshot: 5
+rtol: 0.05
diff --git a/.buildkite/lm-eval-harness/configs/models-large-rocm.txt b/.buildkite/lm-eval-harness/configs/models-large-rocm.txt
new file mode 100644
index 0000000000000..4fb0b84bc4d81
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/models-large-rocm.txt
@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
index 3627b760eddcf..f94d681197d2d 100644
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -9,11 +9,40 @@ pytest -s -v test_lm_eval_correctness.py \
--tp-size=1
"""
+import os
+from contextlib import contextmanager
+
import lm_eval
import numpy as np
import yaml
-RTOL = 0.08
+DEFAULT_RTOL = 0.08
+
+
+@contextmanager
+def scoped_env_vars(new_env: dict[str, str]):
+ if not new_env:
+ # Fast path: nothing to do
+ yield
+ return
+
+ old_values = {}
+ new_keys = []
+
+ try:
+ for key, value in new_env.items():
+ if key in os.environ:
+ old_values[key] = os.environ[key]
+ else:
+ new_keys.append(key)
+ os.environ[key] = str(value)
+ yield
+ finally:
+ # Restore / clean up
+ for key, value in old_values.items():
+ os.environ[key] = value
+ for key in new_keys:
+ os.environ.pop(key, None)
def launch_lm_eval(eval_config, tp_size):
@@ -32,23 +61,26 @@ def launch_lm_eval(eval_config, tp_size):
f"trust_remote_code={trust_remote_code},"
f"max_model_len={max_model_len},"
)
- results = lm_eval.simple_evaluate(
- model=backend,
- model_args=model_args,
- tasks=[task["name"] for task in eval_config["tasks"]],
- num_fewshot=eval_config["num_fewshot"],
- limit=eval_config["limit"],
- # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
- # text models. however, this is regressing measured strict-match for
- # existing text models in CI, so only apply it for mm, or explicitly set
- apply_chat_template=eval_config.get(
- "apply_chat_template", backend == "vllm-vlm"
- ),
- fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
- # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
- gen_kwargs=eval_config.get("gen_kwargs"),
- batch_size=batch_size,
- )
+
+ env_vars = eval_config.get("env_vars", None)
+ with scoped_env_vars(env_vars):
+ results = lm_eval.simple_evaluate(
+ model=backend,
+ model_args=model_args,
+ tasks=[task["name"] for task in eval_config["tasks"]],
+ num_fewshot=eval_config["num_fewshot"],
+ limit=eval_config["limit"],
+ # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
+ # text models. however, this is regressing measured strict-match for
+ # existing text models in CI, so only apply it for mm, or explicitly set
+ apply_chat_template=eval_config.get(
+ "apply_chat_template", backend == "vllm-vlm"
+ ),
+ fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
+ # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
+ gen_kwargs=eval_config.get("gen_kwargs"),
+ batch_size=batch_size,
+ )
return results
@@ -57,6 +89,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
results = launch_lm_eval(eval_config, tp_size)
+ rtol = eval_config.get("rtol", DEFAULT_RTOL)
+
success = True
for task in eval_config["tasks"]:
for metric in task["metrics"]:
@@ -64,8 +98,9 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
measured_value = results["results"][task["name"]][metric["name"]]
print(
f"{task['name']} | {metric['name']}: "
- f"ground_truth={ground_truth} | measured={measured_value}"
+ f"ground_truth={ground_truth:.3f} | "
+ f"measured={measured_value:.3f} | rtol={rtol}"
)
- success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
+ success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
assert success
diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py
index 8d09ba178db7b..f10cb2f0b6e21 100644
--- a/.buildkite/scripts/generate-nightly-index.py
+++ b/.buildkite/scripts/generate-nightly-index.py
@@ -7,18 +7,21 @@
import argparse
import json
-import re
import sys
from dataclasses import asdict, dataclass
+from datetime import datetime
from pathlib import Path
from typing import Any
from urllib.parse import quote
+import regex as re
+
if not sys.version_info >= (3, 12):
raise RuntimeError("This script requires Python 3.12 or higher.")
INDEX_HTML_TEMPLATE = """
+
{items}
@@ -89,7 +92,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:
)
-def generate_project_list(subdir_names: list[str]) -> str:
+def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
"""
Generate project list HTML content linking to each project & variant sub-directory.
"""
@@ -97,11 +100,14 @@ def generate_project_list(subdir_names: list[str]) -> str:
for name in sorted(subdir_names):
name = name.strip("/").strip(".")
href_tags.append(f' {name}/
')
- return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
+ return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
def generate_package_index_and_metadata(
- wheel_files: list[WheelFileInfo], wheel_base_dir: Path, index_base_dir: Path
+ wheel_files: list[WheelFileInfo],
+ wheel_base_dir: Path,
+ index_base_dir: Path,
+ comment: str = "",
) -> tuple[str, str]:
"""
Generate package index HTML content for a specific package, linking to actual wheel files.
@@ -119,7 +125,7 @@ def generate_package_index_and_metadata(
file_meta = asdict(file)
file_meta["path"] = file_path_quoted
metadata.append(file_meta)
- index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
+ index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
metadata_str = json.dumps(metadata, indent=2)
return index_str, metadata_str
@@ -130,6 +136,7 @@ def generate_index_and_metadata(
index_base_dir: Path,
default_variant: str | None = None,
alias_to_default: str | None = None,
+ comment: str = "",
):
"""
Generate index for all wheel files.
@@ -140,6 +147,7 @@ def generate_index_and_metadata(
index_base_dir (Path): Base directory to store index files.
default_variant (str | None): The default variant name, if any.
alias_to_default (str | None): Alias variant name for the default variant, if any.
+ comment (str | None): Optional comment to include in the generated HTML files.
First, parse all wheel files to extract metadata.
We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
@@ -233,6 +241,10 @@ def generate_index_and_metadata(
variant_to_files[alias_to_default] = variant_to_files["default"].copy()
print(f"Alias variant '{alias_to_default}' created for default variant.")
+ # Generate comment in HTML header
+ comment_str = f" ({comment})" if comment else ""
+ comment_tmpl = f"Generated on {datetime.now().isoformat()}{comment_str}"
+
# Generate index for each variant
subdir_names = set()
for variant, files in variant_to_files.items():
@@ -252,7 +264,7 @@ def generate_index_and_metadata(
subdir_names = subdir_names.union(packages)
else:
# generate project list for this variant directly
- project_list_str = generate_project_list(sorted(packages))
+ project_list_str = generate_project_list(sorted(packages), comment_tmpl)
with open(variant_dir / "index.html", "w") as f:
f.write(project_list_str)
@@ -262,7 +274,7 @@ def generate_index_and_metadata(
package_dir = variant_dir / package
package_dir.mkdir(parents=True, exist_ok=True)
index_str, metadata_str = generate_package_index_and_metadata(
- package_files, wheel_base_dir, package_dir
+ package_files, wheel_base_dir, package_dir, comment
)
with open(package_dir / "index.html", "w") as f:
f.write(index_str)
@@ -270,7 +282,7 @@ def generate_index_and_metadata(
f.write(metadata_str)
# Generate top-level project list index
- project_list_str = generate_project_list(sorted(subdir_names))
+ project_list_str = generate_project_list(sorted(subdir_names), comment_tmpl)
with open(index_base_dir / "index.html", "w") as f:
f.write(project_list_str)
@@ -282,6 +294,7 @@ if __name__ == "__main__":
--current-objects : path to JSON file containing current S3 objects listing in this version directory
--output-dir : directory to store generated index files
--alias-to-default : (optional) alias variant name for the default variant
+ --comment : (optional) comment string to include in generated HTML files
"""
parser = argparse.ArgumentParser(
@@ -311,6 +324,12 @@ if __name__ == "__main__":
default=None,
help="Alias variant name for the default variant",
)
+ parser.add_argument(
+ "--comment",
+ type=str,
+ default="",
+ help="Optional comment string to include in generated HTML files",
+ )
args = parser.parse_args()
@@ -365,5 +384,6 @@ if __name__ == "__main__":
index_base_dir=index_base_dir,
default_variant=None,
alias_to_default=args.alias_to_default,
+ comment=args.comment.strip(),
)
print(f"Successfully generated index and metadata in {output_dir}")
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
index b5f6b2494792f..9c6e7766b2ac4 100755
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
@@ -40,7 +40,8 @@ function cpu_tests() {
docker exec cpu-test bash -c "
set -e
pytest -x -v -s tests/kernels/test_onednn.py
- pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"
+ pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
+ pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic"
# basic online serving
docker exec cpu-test bash -c '
diff --git a/.buildkite/scripts/hardware_ci/run-npu-test.sh b/.buildkite/scripts/hardware_ci/run-npu-test.sh
index 29c8f5ed5a91a..0db1abe37ba11 100644
--- a/.buildkite/scripts/hardware_ci/run-npu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-npu-test.sh
@@ -74,6 +74,7 @@ FROM ${BASE_IMAGE_NAME}
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
+ENV SOC_VERSION="ascend910b1"
RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
index 4d163399cfc6c..dfc9db512d1e9 100644
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -38,6 +38,7 @@ docker run \
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+ python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
cd tests
pytest -v -s v1/core
@@ -46,6 +47,6 @@ docker run \
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
- pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
+ pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
pytest -v -s v1/test_serial_utils.py
'
diff --git a/.buildkite/scripts/run-prime-rl-test.sh b/.buildkite/scripts/run-prime-rl-test.sh
index 5b25c358fc4aa..3fb7c82c8d333 100755
--- a/.buildkite/scripts/run-prime-rl-test.sh
+++ b/.buildkite/scripts/run-prime-rl-test.sh
@@ -12,6 +12,11 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
+if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
+ echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
+ exit 0
+fi
+
echo "Setting up Prime-RL integration test environment..."
# Clean up any existing Prime-RL directory
diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh
index 2eaa91c04086c..8e38ace0bfbc2 100644
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@@ -81,7 +81,10 @@ else
alias_arg=""
fi
-$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg
+# HACK: we do not need regex module here, but it is required by pre-commit hook
+# To avoid any external dependency, we simply replace it back to the stdlib re module
+sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
+$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
# copy indices to // unconditionally
echo "Uploading indices to $S3_COMMIT_PREFIX"
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index ee4fdebae5675..4038d32834e68 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -398,7 +398,8 @@ steps:
timeout_in_minutes: 25
gpu: h100
source_file_dependencies:
- - vllm/
+ - vllm/v1/attention
+ - vllm/model_executor/layers
- tests/v1/determinism/
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -440,23 +441,29 @@ steps:
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
- vllm/entrypoints
+ - vllm/multimodal
- examples/
commands:
- pip install tensorizer # for tensorizer test
+ # for basic
+ - python3 offline_inference/basic/chat.py
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- - python3 offline_inference/basic/chat.py
- - python3 offline_inference/prefix_caching.py
- - python3 offline_inference/llm_engine_example.py
+ - python3 offline_inference/basic/classify.py
+ - python3 offline_inference/basic/embed.py
+ - python3 offline_inference/basic/score.py
+ # for multi-modal models
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- - python3 offline_inference/basic/classify.py
- - python3 offline_inference/basic/embed.py
- - python3 offline_inference/basic/score.py
+ # for pooling models
+ - python3 pooling/pooling/vision_language_pooling.py --seed 0
+ # for features demo
+ - python3 offline_inference/prefix_caching.py
+ - python3 offline_inference/llm_engine_example.py
+ - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@@ -718,14 +725,15 @@ steps:
- uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
-- label: LM Eval Small Models # 15min
- timeout_in_minutes: 20
- mirror_hardwares: [amdexperimental, amdproduction]
+- label: LM Eval Small Models # 53min
+ timeout_in_minutes: 75
+ mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
+ autorun_on_main: true
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
@@ -738,7 +746,7 @@ steps:
- csrc/
- vllm/entrypoints/openai/
- vllm/model_executor/models/whisper.py
- commands: # LMEval
+ commands: # LMEval+Transcription WER check
# Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
- pytest -s entrypoints/openai/correctness/
@@ -974,8 +982,8 @@ steps:
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
-- label: Multi-Modal Accuracy Eval (Small Models) # 10min
- timeout_in_minutes: 70
+- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min
+ timeout_in_minutes: 180
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
@@ -987,7 +995,8 @@ steps:
commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
-- label: Multi-Modal Models Test (Extended) 1
+- label: Multi-Modal Models Test (Extended) 1 # 60min
+ timeout_in_minutes: 120
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
@@ -1011,7 +1020,8 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
-- label: Multi-Modal Models Test (Extended) 3
+- label: Multi-Modal Models Test (Extended) 3 # 75min
+ timeout_in_minutes: 150
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
@@ -1120,7 +1130,6 @@ steps:
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- - vllm/model_executor/layers/fused_moe/layer.py
- tests/compile/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py
@@ -1154,17 +1163,15 @@ steps:
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/distributed/test_fusions_e2e.py
- - tests/compile/fullgraph/test_full_graph.py
commands:
- nvidia-smi
# Run all e2e fusion tests
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
-- label: ROCm GPT-OSS Eval
+- label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
- agent_pool: mi325_1
- mirror_hardwares: [amdexperimental, amdproduction]
+ gpu: b200
optional: true # run on nightlies
source_file_dependencies:
- tests/evals/gpt_oss
@@ -1173,7 +1180,7 @@ steps:
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+ - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: Blackwell Quantized MoE Test
timeout_in_minutes: 60
@@ -1378,7 +1385,7 @@ steps:
- pytest -v -s -x lora/test_llm_with_multi_loras.py
- pytest -v -s -x lora/test_olmoe_tp.py
- # Disabled for now because MXFP4 backend on non-cuda platform
+ # Disabled for now because MXFP4 backend on non-cuda platform
# doesn't support LoRA yet
#- pytest -v -s -x lora/test_gptoss_tp.py
@@ -1444,12 +1451,13 @@ steps:
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py
+
- label: LM Eval Large Models # optional
- mirror_hardwares: [amdexperimental, amdproduction]
- agent_pool: mi325_4
- # grade: Blocking
gpu: a100
optional: true
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
@@ -1461,11 +1469,11 @@ steps:
##### H100 test #####
- label: LM Eval Large Models (H100) # optional
- mirror_hardwares: [amdexperimental, amdproduction]
- agent_pool: mi325_4
- # grade: Blocking
gpu: h100
optional: true
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
@@ -1475,6 +1483,7 @@ steps:
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
##### H200 test #####
- label: Distributed Tests (H200) # optional
mirror_hardwares: [amdexperimental]
@@ -1506,6 +1515,57 @@ steps:
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
- pytest -v -s tests/v1/distributed/test_dbo.py
+##### E2E Eval Tests #####
+- label: LM Eval Small Models (1 Card) # 15min
+ timeout_in_minutes: 20
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+
+- label: LM Eval Large Models (4 Card)
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_4
+ # grade: Blocking
+ gpu: a100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+
+- label: ROCm LM Eval Large Models (8 Card)
+ mirror_hardwares: [amdproduction]
+ agent_pool: mi325_8
+ num_gpus: 8
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8
+
+- label: ROCm GPT-OSS Eval
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ agent_pool: mi325_1
+ mirror_hardwares: [amdexperimental, amdproduction]
+ optional: true # run on nightlies
+ source_file_dependencies:
+ - tests/evals/gpt_oss
+ - vllm/model_executor/models/gpt_oss.py
+ - vllm/model_executor/layers/quantization/mxfp4.py
+ - vllm/v1/attention/backends/flashinfer.py
+ commands:
+ - uv pip install --system 'gpt-oss[eval]==0.0.5'
+ - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min
mirror_hardwares: [amdexperimental]
@@ -1520,7 +1580,6 @@ steps:
- .buildkite/scripts/run-prime-rl-test.sh
commands:
- bash .buildkite/scripts/run-prime-rl-test.sh
-
- label: DeepSeek V2-Lite Accuracy
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
@@ -1552,4 +1611,28 @@ steps:
num_gpus: 2
working_dir: "/vllm-workspace"
commands:
- - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
\ No newline at end of file
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+
+- label: DeepSeek V2-Lite Async EPLB Accuracy
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
+
+- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index f79e9266559f6..8fc3587f7813c 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -350,7 +350,8 @@ steps:
timeout_in_minutes: 25
gpu: h100
source_file_dependencies:
- - vllm/
+ - vllm/v1/attention
+ - vllm/model_executor/layers
- tests/v1/determinism/
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -387,6 +388,7 @@ steps:
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
- vllm/entrypoints
+ - vllm/multimodal
- examples/
commands:
- pip install tensorizer # for tensorizer test
@@ -466,7 +468,9 @@ steps:
# tests covered elsewhere.
# Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
+ # However, find does not normally propagate error codes, so we combine it with xargs
+ # (using -0 for proper path handling)
+ - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
- label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30
@@ -480,7 +484,9 @@ steps:
# as it is a heavy test that is covered in other steps.
# Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
+ # However, find does not normally propagate error codes, so we combine it with xargs
+ # (using -0 for proper path handling)
+ - "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
- label: PyTorch Fullgraph Test # 27min
timeout_in_minutes: 40
diff --git a/.buildkite/test_areas/attention.yaml b/.buildkite/test_areas/attention.yaml
new file mode 100644
index 0000000000000..6e444eae14c74
--- /dev/null
+++ b/.buildkite/test_areas/attention.yaml
@@ -0,0 +1,21 @@
+group: Attention
+depends_on:
+ - image-build
+steps:
+- label: V1 attention (H100)
+ timeout_in_minutes: 30
+ gpu: h100
+ source_file_dependencies:
+ - vllm/v1/attention
+ - tests/v1/attention
+ commands:
+ - pytest -v -s v1/attention
+
+- label: V1 attention (B200)
+ timeout_in_minutes: 30
+ gpu: b200
+ source_file_dependencies:
+ - vllm/v1/attention
+ - tests/v1/attention
+ commands:
+ - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
diff --git a/.buildkite/test_areas/basic_correctness.yaml b/.buildkite/test_areas/basic_correctness.yaml
new file mode 100644
index 0000000000000..759d2b5358714
--- /dev/null
+++ b/.buildkite/test_areas/basic_correctness.yaml
@@ -0,0 +1,16 @@
+group: Basic Correctness
+depends_on:
+ - image-build
+steps:
+- label: Basic Correctness
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - vllm/
+ - tests/basic_correctness/test_basic_correctness
+ - tests/basic_correctness/test_cpu_offload
+ - tests/basic_correctness/test_cumem.py
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s basic_correctness/test_cumem.py
+ - pytest -v -s basic_correctness/test_basic_correctness.py
+ - pytest -v -s basic_correctness/test_cpu_offload.py
diff --git a/.buildkite/test_areas/benchmarks.yaml b/.buildkite/test_areas/benchmarks.yaml
new file mode 100644
index 0000000000000..574b642d407b0
--- /dev/null
+++ b/.buildkite/test_areas/benchmarks.yaml
@@ -0,0 +1,19 @@
+group: Benchmarks
+depends_on:
+ - image-build
+steps:
+- label: Benchmarks
+ timeout_in_minutes: 20
+ working_dir: "/vllm-workspace/.buildkite"
+ source_file_dependencies:
+ - benchmarks/
+ commands:
+ - bash scripts/run-benchmarks.sh
+
+- label: Benchmarks CLI Test
+ timeout_in_minutes: 20
+ source_file_dependencies:
+ - vllm/
+ - tests/benchmarks/
+ commands:
+ - pytest -v -s benchmarks/
diff --git a/.buildkite/test_areas/compile.yaml b/.buildkite/test_areas/compile.yaml
new file mode 100644
index 0000000000000..0ba00925a4838
--- /dev/null
+++ b/.buildkite/test_areas/compile.yaml
@@ -0,0 +1,57 @@
+group: Compile
+depends_on:
+ - image-build
+steps:
+- label: Fusion and Compile Tests (B200)
+ timeout_in_minutes: 40
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ source_file_dependencies:
+ - csrc/quantization/fp4/
+ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+ - vllm/v1/attention/backends/flashinfer.py
+ - vllm/v1/worker/
+ - vllm/v1/cudagraph_dispatcher.py
+ - vllm/compilation/
+ # can affect pattern matching
+ - vllm/model_executor/layers/layernorm.py
+ - vllm/model_executor/layers/activation.py
+ - vllm/model_executor/layers/quantization/input_quant_fp8.py
+ - tests/compile/test_fusion_attn.py
+ - tests/compile/test_silu_mul_quant_fusion.py
+ - tests/compile/distributed/test_fusion_all_reduce.py
+ - tests/compile/distributed/test_fusions_e2e.py
+ - tests/compile/fullgraph/test_full_graph.py
+ commands:
+ - nvidia-smi
+ - pytest -v -s tests/compile/test_fusion_attn.py
+ - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+ # this runner has 2 GPUs available even though num_gpus=2 is not set
+ - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+ # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+ # Wrap with quotes to escape yaml
+ - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+ # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+ - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
+
+- label: Fusion E2E (2 GPUs)(B200)
+ timeout_in_minutes: 40
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ optional: true
+ num_gpus: 2
+ source_file_dependencies:
+ - csrc/quantization/fp4/
+ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+ - vllm/v1/attention/backends/flashinfer.py
+ - vllm/compilation/
+ # can affect pattern matching
+ - vllm/model_executor/layers/layernorm.py
+ - vllm/model_executor/layers/activation.py
+ - vllm/model_executor/layers/quantization/input_quant_fp8.py
+ - tests/compile/distributed/test_fusions_e2e.py
+ commands:
+ - nvidia-smi
+ # Run all e2e fusion tests
+ - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
+
diff --git a/.buildkite/test_areas/cuda.yaml b/.buildkite/test_areas/cuda.yaml
new file mode 100644
index 0000000000000..50c0c338c2434
--- /dev/null
+++ b/.buildkite/test_areas/cuda.yaml
@@ -0,0 +1,22 @@
+group: CUDA
+depends_on:
+ - image-build
+steps:
+- label: Platform Tests (CUDA)
+ timeout_in_minutes: 15
+ source_file_dependencies:
+ - vllm/
+ - tests/cuda
+ commands:
+ - pytest -v -s cuda/test_cuda_context.py
+
+- label: Cudagraph
+ timeout_in_minutes: 20
+ source_file_dependencies:
+ - tests/v1/cudagraph
+ - vllm/v1/cudagraph_dispatcher.py
+ - vllm/config/compilation.py
+ - vllm/compilation
+ commands:
+ - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
+ - pytest -v -s v1/cudagraph/test_cudagraph_mode.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml
new file mode 100644
index 0000000000000..2cc90698d916a
--- /dev/null
+++ b/.buildkite/test_areas/distributed.yaml
@@ -0,0 +1,199 @@
+group: Distributed
+depends_on:
+ - image-build
+steps:
+- label: Distributed Comm Ops
+ timeout_in_minutes: 20
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/distributed
+ - tests/distributed
+ commands:
+ - pytest -v -s distributed/test_comm_ops.py
+ - pytest -v -s distributed/test_shm_broadcast.py
+ - pytest -v -s distributed/test_shm_buffer.py
+ - pytest -v -s distributed/test_shm_storage.py
+
+- label: Distributed (2 GPUs)
+ timeout_in_minutes: 90
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/compilation/
+ - vllm/distributed/
+ - vllm/engine/
+ - vllm/executor/
+ - vllm/worker/worker_base.py
+ - vllm/v1/engine/
+ - vllm/v1/worker/
+ - tests/compile/fullgraph/test_basic_correctness.py
+ - tests/compile/test_wrapper.py
+ - tests/distributed/
+ - tests/entrypoints/llm/test_collective_rpc.py
+ - tests/v1/distributed
+ - tests/v1/entrypoints/openai/test_multi_api_servers.py
+ - tests/v1/shutdown
+ - tests/v1/worker/test_worker_memory_snapshot.py
+ commands:
+ # https://github.com/NVIDIA/nccl/issues/1838
+ - export NCCL_CUMEM_HOST_ENABLE=0
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+ - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
+ - pytest -v -s entrypoints/llm/test_collective_rpc.py
+ - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
+ - pytest -v -s ./compile/test_wrapper.py
+ - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+ - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+ - pytest -v -s distributed/test_sequence_parallel.py
+ - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+ - pytest -v -s v1/worker/test_worker_memory_snapshot.py
+
+- label: Distributed Tests (4 GPUs)
+ timeout_in_minutes: 50
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/
+ - tests/distributed/test_utils
+ - tests/distributed/test_pynccl
+ - tests/distributed/test_events
+ - tests/compile/fullgraph/test_basic_correctness.py
+ - examples/offline_inference/rlhf.py
+ - examples/offline_inference/rlhf_colocate.py
+ - tests/examples/offline_inference/data_parallel.py
+ - tests/v1/distributed
+ - tests/v1/engine/test_engine_core_client.py
+ - tests/distributed/test_symm_mem_allreduce.py
+ commands:
+ # https://github.com/NVIDIA/nccl/issues/1838
+ - export NCCL_CUMEM_HOST_ENABLE=0
+ # test with torchrun tp=2 and external_dp=2
+ - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+ # test with torchrun tp=2 and pp=2
+ - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+ # test with torchrun tp=4 and dp=1
+ - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=2, pp=2 and dp=1
+ - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=1 and dp=4 with ep
+ - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=2 and dp=2 with ep
+ - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with internal dp
+ - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+ - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
+ - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
+ - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
+ - pytest -v -s distributed/test_utils.py
+ - pytest -v -s compile/fullgraph/test_basic_correctness.py
+ - pytest -v -s distributed/test_pynccl.py
+ - pytest -v -s distributed/test_events.py
+ - pytest -v -s distributed/test_symm_mem_allreduce.py
+ # TODO: create a dedicated test section for multi-GPU example tests
+ # when we have multiple distributed example tests
+ - cd ../examples/offline_inference
+ - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+ - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+
+- label: Distributed Tests (8 GPUs)(H100)
+ timeout_in_minutes: 10
+ gpu: h100
+ num_gpus: 8
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - examples/offline_inference/torchrun_dp_example.py
+ - vllm/config/parallel.py
+ - vllm/distributed/
+ - vllm/v1/engine/llm_engine.py
+ - vllm/v1/executor/uniproc_executor.py
+ - vllm/v1/worker/gpu_worker.py
+ commands:
+ # https://github.com/NVIDIA/nccl/issues/1838
+ - export NCCL_CUMEM_HOST_ENABLE=0
+ # test with torchrun tp=2 and dp=4 with ep
+ - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+
+- label: Distributed Tests (4 GPUs)(A100)
+ gpu: a100
+ optional: true
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/
+ commands:
+ # NOTE: don't test llama model here, it seems hf implementation is buggy
+ # see https://github.com/vllm-project/vllm/pull/5689 for details
+ - pytest -v -s distributed/test_custom_all_reduce.py
+ - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
+ - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+ - pytest -v -s -x lora/test_mixtral.py
+
+- label: Distributed Tests (2 GPUs)(H200)
+ gpu: h200
+ optional: true
+ working_dir: "/vllm-workspace/"
+ num_gpus: 2
+ commands:
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
+ - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+ - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
+ - pytest -v -s tests/distributed/test_context_parallel.py
+ - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+ - pytest -v -s tests/v1/distributed/test_dbo.py
+
+- label: Distributed Tests (2 GPUs)(B200)
+ gpu: b200
+ optional: true
+ working_dir: "/vllm-workspace/"
+ num_gpus: 2
+ commands:
+ - pytest -v -s tests/distributed/test_context_parallel.py
+ - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+ - pytest -v -s tests/v1/distributed/test_dbo.py
+
+- label: 2 Node Test (4 GPUs)
+ timeout_in_minutes: 30
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ num_nodes: 2
+ source_file_dependencies:
+ - vllm/distributed/
+ - vllm/engine/
+ - vllm/executor/
+ - vllm/model_executor/models/
+ - tests/distributed/
+ - tests/examples/offline_inference/data_parallel.py
+ commands:
+ - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
+
+- label: Distributed NixlConnector PD accuracy (4 GPUs)
+ timeout_in_minutes: 30
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+ - tests/v1/kv_connector/nixl_integration/
+ commands:
+ - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+ - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
+
+- label: Pipeline + Context Parallelism (4 GPUs))
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/
+ - vllm/engine/
+ - vllm/executor/
+ - vllm/model_executor/models/
+ - tests/distributed/
+ commands:
+ - pytest -v -s distributed/test_pp_cudagraph.py
+ - pytest -v -s distributed/test_pipeline_parallel.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml
new file mode 100644
index 0000000000000..93d389815edac
--- /dev/null
+++ b/.buildkite/test_areas/e2e_integration.yaml
@@ -0,0 +1,59 @@
+group: E2E Integration
+depends_on:
+ - image-build
+steps:
+- label: DeepSeek V2-Lite Accuracy
+ timeout_in_minutes: 60
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy
+ timeout_in_minutes: 60
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+ timeout_in_minutes: 60
+ gpu: b200
+ optional: true
+ num_gpus: 2
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+
+- label: Prime-RL Integration (2 GPUs)
+ timeout_in_minutes: 30
+ optional: true
+ num_gpus: 2
+ working_dir: "/vllm-workspace"
+ source_file_dependencies:
+ - vllm/
+ - .buildkite/scripts/run-prime-rl-test.sh
+ commands:
+ - bash .buildkite/scripts/run-prime-rl-test.sh
+
+- label: DeepSeek V2-Lite Async EPLB Accuracy
+ timeout_in_minutes: 60
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
+
+- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
+ timeout_in_minutes: 60
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml
new file mode 100644
index 0000000000000..a028e0e4af4c1
--- /dev/null
+++ b/.buildkite/test_areas/engine.yaml
@@ -0,0 +1,26 @@
+group: Engine
+depends_on:
+ - image-build
+steps:
+- label: Engine
+ timeout_in_minutes: 15
+ source_file_dependencies:
+ - vllm/
+ - tests/engine
+ - tests/test_sequence
+ - tests/test_config
+ - tests/test_logger
+ - tests/test_vllm_port
+ commands:
+ - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
+
+- label: V1 e2e + engine
+ timeout_in_minutes: 45
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+ # TODO: accuracy does not match, whether setting
+ # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
+ - pytest -v -s v1/e2e
+ - pytest -v -s v1/engine
diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml
new file mode 100644
index 0000000000000..0a789be943f37
--- /dev/null
+++ b/.buildkite/test_areas/entrypoints.yaml
@@ -0,0 +1,68 @@
+group: Entrypoints
+depends_on:
+ - image-build
+steps:
+- label: Entrypoints Unit Tests
+ timeout_in_minutes: 10
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/entrypoints
+ - tests/entrypoints/
+ commands:
+ - pytest -v -s entrypoints/openai/tool_parsers
+ - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
+
+- label: Entrypoints Integration (LLM)
+ timeout_in_minutes: 40
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/llm
+ - tests/entrypoints/offline_mode
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+ - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
+ - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+
+- label: Entrypoints Integration (API Server)
+ timeout_in_minutes: 130
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/openai
+ - tests/entrypoints/test_chat_utils
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
+ - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
+ - pytest -v -s entrypoints/test_chat_utils.py
+
+
+- label: Entrypoints Integration (Pooling)
+ timeout_in_minutes: 50
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/pooling
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s entrypoints/pooling
+
+
+- label: Entrypoints V1
+ timeout_in_minutes: 50
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+ - pytest -v -s v1/entrypoints
+
+- label: OpenAI API Correctness
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - csrc/
+ - vllm/entrypoints/openai/
+ - vllm/model_executor/models/whisper.py
+ commands: # LMEval+Transcription WER check
+ - pytest -s entrypoints/openai/correctness/
diff --git a/.buildkite/test_areas/expert_parallelism.yaml b/.buildkite/test_areas/expert_parallelism.yaml
new file mode 100644
index 0000000000000..feb8252148c7f
--- /dev/null
+++ b/.buildkite/test_areas/expert_parallelism.yaml
@@ -0,0 +1,23 @@
+group: Expert Parallelism
+depends_on:
+ - image-build
+steps:
+- label: EPLB Algorithm
+ timeout_in_minutes: 15
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/distributed/eplb
+ - tests/distributed/test_eplb_algo.py
+ commands:
+ - pytest -v -s distributed/test_eplb_algo.py
+
+- label: EPLB Execution
+ timeout_in_minutes: 20
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/eplb
+ - tests/distributed/test_eplb_execute.py
+ commands:
+ - pytest -v -s distributed/test_eplb_execute.py
+ - pytest -v -s distributed/test_eplb_spec_decode.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml
new file mode 100644
index 0000000000000..7ca099516d641
--- /dev/null
+++ b/.buildkite/test_areas/kernels.yaml
@@ -0,0 +1,117 @@
+group: Kernels
+depends_on:
+ - image-build
+steps:
+- label: Kernels Core Operation Test
+ timeout_in_minutes: 75
+ source_file_dependencies:
+ - csrc/
+ - tests/kernels/core
+ - tests/kernels/test_top_k_per_row.py
+ commands:
+ - pytest -v -s kernels/core kernels/test_top_k_per_row.py
+
+- label: Kernels Attention Test %N
+ timeout_in_minutes: 35
+ source_file_dependencies:
+ - csrc/attention/
+ - vllm/attention
+ - vllm/v1/attention
+ - tests/kernels/attention
+ commands:
+ - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+ parallelism: 2
+
+- label: Kernels Quantization Test %N
+ timeout_in_minutes: 90
+ source_file_dependencies:
+ - csrc/quantization/
+ - vllm/model_executor/layers/quantization
+ - tests/kernels/quantization
+ commands:
+ - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+ parallelism: 2
+
+- label: Kernels MoE Test %N
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - csrc/quantization/cutlass_w8a8/moe/
+ - csrc/moe/
+ - tests/kernels/moe
+ - vllm/model_executor/layers/fused_moe/
+ - vllm/distributed/device_communicators/
+ - vllm/envs.py
+ - vllm/config
+ commands:
+ - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+ parallelism: 2
+
+- label: Kernels Mamba Test
+ timeout_in_minutes: 45
+ source_file_dependencies:
+ - csrc/mamba/
+ - tests/kernels/mamba
+ - vllm/model_executor/layers/mamba/ops
+ commands:
+ - pytest -v -s kernels/mamba
+
+- label: Kernels DeepGEMM Test (H100)
+ timeout_in_minutes: 45
+ gpu: h100
+ num_gpus: 1
+ source_file_dependencies:
+ - tools/install_deepgemm.sh
+ - vllm/utils/deep_gemm.py
+ - vllm/model_executor/layers/fused_moe
+ - vllm/model_executor/layers/quantization
+ - tests/kernels/quantization/test_block_fp8.py
+ - tests/kernels/moe/test_deepgemm.py
+ - tests/kernels/moe/test_batched_deepgemm.py
+ - tests/kernels/attention/test_deepgemm_attention.py
+ commands:
+ - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
+ - pytest -v -s kernels/moe/test_deepgemm.py
+ - pytest -v -s kernels/moe/test_batched_deepgemm.py
+ - pytest -v -s kernels/attention/test_deepgemm_attention.py
+
+- label: Kernels (B200)
+ timeout_in_minutes: 30
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ # optional: true
+ source_file_dependencies:
+ - csrc/quantization/fp4/
+ - csrc/attention/mla/
+ - csrc/quantization/cutlass_w8a8/moe/
+ - vllm/model_executor/layers/fused_moe/cutlass_moe.py
+ - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+ - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+ - vllm/v1/attention/backends/flashinfer.py
+ - vllm/v1/attention/backends/mla/cutlass_mla.py
+ - vllm/v1/attention/backends/mla/flashinfer_mla.py
+ - vllm/platforms/cuda.py
+ - vllm/attention/selector.py
+ commands:
+ - nvidia-smi
+ - python3 examples/offline_inference/basic/chat.py
+ # Attention
+ # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+ - pytest -v -s tests/kernels/attention/test_attention_selector.py
+ - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
+ - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
+ - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
+ - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
+ # Quantization
+ - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
+ - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
+ - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
+ - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
+ - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
+ - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+ - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
+ - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
+ - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
+ - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
+ - pytest -v -s tests/kernels/moe/test_flashinfer.py
+ - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml
new file mode 100644
index 0000000000000..9af43e0c375a8
--- /dev/null
+++ b/.buildkite/test_areas/lm_eval.yaml
@@ -0,0 +1,46 @@
+group: LM Eval
+depends_on:
+ - image-build
+steps:
+- label: LM Eval Small Models
+ timeout_in_minutes: 75
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ autorun_on_main: true
+ commands:
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+
+- label: LM Eval Large Models (4 GPUs)(A100)
+ gpu: a100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+
+- label: LM Eval Large Models (4 GPUs)(H100)
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
+- label: LM Eval Small Models (B200)
+ timeout_in_minutes: 120
+ gpu: b200
+ optional: true
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml
new file mode 100644
index 0000000000000..809b4138f44ba
--- /dev/null
+++ b/.buildkite/test_areas/lora.yaml
@@ -0,0 +1,31 @@
+group: LoRA
+depends_on:
+ - image-build
+steps:
+- label: LoRA %N
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - vllm/lora
+ - tests/lora
+ commands:
+ - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
+ parallelism: 4
+
+
+- label: LoRA TP (Distributed)
+ timeout_in_minutes: 30
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/lora
+ - tests/lora
+ commands:
+ # FIXIT: find out which code initialize cuda before running the test
+ # before the fix, we need to use spawn to test it
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ # There is some Tensor Parallelism related processing logic in LoRA that
+ # requires multi-GPU testing for validation.
+ - pytest -v -s -x lora/test_chatglm3_tp.py
+ - pytest -v -s -x lora/test_llama_tp.py
+ - pytest -v -s -x lora/test_llm_with_multi_loras.py
+ - pytest -v -s -x lora/test_olmoe_tp.py
+ - pytest -v -s -x lora/test_gptoss_tp.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml
new file mode 100644
index 0000000000000..072bccadb726a
--- /dev/null
+++ b/.buildkite/test_areas/misc.yaml
@@ -0,0 +1,163 @@
+group: Miscellaneous
+depends_on:
+ - image-build
+steps:
+- label: V1 Others
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+ - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+ # split the test to avoid interference
+ - pytest -v -s -m 'not cpu_test' v1/core
+ - pytest -v -s v1/executor
+ - pytest -v -s v1/kv_offload
+ - pytest -v -s v1/sample
+ - pytest -v -s v1/logits_processors
+ - pytest -v -s v1/worker
+ - pytest -v -s v1/spec_decode
+ - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
+ - pytest -v -s -m 'not cpu_test' v1/metrics
+ - pytest -v -s v1/test_oracle.py
+ - pytest -v -s v1/test_request.py
+ - pytest -v -s v1/test_outputs.py
+ # Integration test for streaming correctness (requires special branch).
+ - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
+ - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
+
+- label: V1 Others (CPU)
+ depends_on: ~
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ no_gpu: true
+ commands:
+ # split the test to avoid interference
+ - pytest -v -s -m 'cpu_test' v1/core
+ - pytest -v -s v1/structured_output
+ - pytest -v -s v1/test_serial_utils.py
+ - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
+ - pytest -v -s -m 'cpu_test' v1/metrics
+
+- label: Regression
+ timeout_in_minutes: 20
+ source_file_dependencies:
+ - vllm/
+ - tests/test_regression
+ commands:
+ - pip install modelscope
+ - pytest -v -s test_regression.py
+ working_dir: "/vllm-workspace/tests" # optional
+
+- label: Examples
+ timeout_in_minutes: 45
+ working_dir: "/vllm-workspace/examples"
+ source_file_dependencies:
+ - vllm/entrypoints
+ - vllm/multimodal
+ - examples/
+ commands:
+ - pip install tensorizer # for tensorizer test
+ - python3 offline_inference/basic/chat.py # for basic
+ - python3 offline_inference/basic/generate.py --model facebook/opt-125m
+ - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+ - python3 offline_inference/basic/classify.py
+ - python3 offline_inference/basic/embed.py
+ - python3 offline_inference/basic/score.py
+ # for multi-modal models
+ - python3 offline_inference/audio_language.py --seed 0
+ - python3 offline_inference/vision_language.py --seed 0
+ - python3 offline_inference/vision_language_multi_image.py --seed 0
+ - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+ # for pooling models
+ - python3 pooling/pooling/vision_language_pooling.py --seed 0
+ # for features demo
+ - python3 offline_inference/prefix_caching.py
+ - python3 offline_inference/llm_engine_example.py
+ - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+ - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+ # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+ - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
+
+- label: Metrics, Tracing (2 GPUs)
+ timeout_in_minutes: 20
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/
+ - tests/v1/tracing
+ commands:
+ - "pip install \
+ 'opentelemetry-sdk>=1.26.0' \
+ 'opentelemetry-api>=1.26.0' \
+ 'opentelemetry-exporter-otlp>=1.26.0' \
+ 'opentelemetry-semantic-conventions-ai>=0.4.1'"
+ - pytest -v -s v1/tracing
+
+- label: Python-only Installation
+ depends_on: ~
+ timeout_in_minutes: 20
+ source_file_dependencies:
+ - tests/standalone_tests/python_only_compile.sh
+ - setup.py
+ commands:
+ - bash standalone_tests/python_only_compile.sh
+
+- label: Async Engine, Inputs, Utils, Worker
+ timeout_in_minutes: 50
+ source_file_dependencies:
+ - vllm/
+ - tests/multimodal
+ - tests/utils_
+ commands:
+ - pytest -v -s -m 'not cpu_test' multimodal
+ - pytest -v -s utils_
+
+- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
+ depends_on: ~
+ timeout_in_minutes: 20
+ source_file_dependencies:
+ - vllm/
+ - tests/test_inputs.py
+ - tests/test_outputs.py
+ - tests/multimodal
+ - tests/standalone_tests/lazy_imports.py
+ - tests/tokenizers_
+ - tests/transformers_utils
+ - tests/config
+ no_gpu: true
+ commands:
+ - python3 standalone_tests/lazy_imports.py
+ - pytest -v -s test_inputs.py
+ - pytest -v -s test_outputs.py
+ - pytest -v -s -m 'cpu_test' multimodal
+ - pytest -v -s tokenizers_
+ - pytest -v -s transformers_utils
+ - pytest -v -s config
+
+- label: GPT-OSS Eval (B200)
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ optional: true
+ source_file_dependencies:
+ - tests/evals/gpt_oss
+ - vllm/model_executor/models/gpt_oss.py
+ - vllm/model_executor/layers/quantization/mxfp4.py
+ - vllm/v1/attention/backends/flashinfer.py
+ commands:
+ - uv pip install --system 'gpt-oss[eval]==0.0.5'
+ - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
+- label: Batch Invariance (H100)
+ timeout_in_minutes: 25
+ gpu: h100
+ source_file_dependencies:
+ - vllm/v1/attention
+ - vllm/model_executor/layers
+ - tests/v1/determinism/
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pip install pytest-timeout pytest-forked
+ - pytest -v -s v1/determinism/test_batch_invariance.py
+ - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
\ No newline at end of file
diff --git a/.buildkite/test_areas/model_executor.yaml b/.buildkite/test_areas/model_executor.yaml
new file mode 100644
index 0000000000000..996c8bb8b780a
--- /dev/null
+++ b/.buildkite/test_areas/model_executor.yaml
@@ -0,0 +1,17 @@
+group: Model Executor
+depends_on:
+ - image-build
+steps:
+- label: Model Executor
+ timeout_in_minutes: 35
+ source_file_dependencies:
+ - vllm/engine/arg_utils.py
+ - vllm/config/model.py
+ - vllm/model_executor
+ - tests/model_executor
+ - tests/entrypoints/openai/test_tensorizer_entrypoint.py
+ commands:
+ - apt-get update && apt-get install -y curl libsodium23
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s model_executor
+ - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml
new file mode 100644
index 0000000000000..39a5d51c48833
--- /dev/null
+++ b/.buildkite/test_areas/models_basic.yaml
@@ -0,0 +1,62 @@
+group: Models - Basic
+depends_on:
+ - image-build
+steps:
+- label: Basic Models Tests (Initialization)
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_initialization.py
+ commands:
+ # Run a subset of model initialization tests
+ - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
+
+- label: Basic Models Tests (Extra Initialization) %N
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/model_executor/models/
+ - tests/models/test_initialization.py
+ commands:
+ # Only when vLLM model source is modified - test initialization of a large
+ # subset of supported models (the complement of the small subset in the above
+ # test.) Also run if model initialization test file is modified
+ - pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Basic Models Tests (Other)
+ timeout_in_minutes: 45
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_transformers.py
+ - tests/models/test_registry.py
+ commands:
+ - pytest -v -s models/test_transformers.py models/test_registry.py
+
+- label: Basic Models Test (Other CPU) # 5min
+ timeout_in_minutes: 10
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_utils.py
+ - tests/models/test_vision.py
+ no_gpu: true
+ commands:
+ - pytest -v -s models/test_utils.py models/test_vision.py
+
+- label: Transformers Nightly Models
+ working_dir: "/vllm-workspace/"
+ optional: true
+ soft_fail: true
+ commands:
+ - pip install --upgrade git+https://github.com/huggingface/transformers
+ - pytest -v -s tests/models/test_initialization.py
+ - pytest -v -s tests/models/test_transformers.py
+ - pytest -v -s tests/models/multimodal/processing/
+ - pytest -v -s tests/models/multimodal/test_mapping.py
+ - python3 examples/offline_inference/basic/chat.py
+ - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+ # Whisper needs spawn method to avoid deadlock
+ - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
diff --git a/.buildkite/test_areas/models_distributed.yaml b/.buildkite/test_areas/models_distributed.yaml
new file mode 100644
index 0000000000000..b6bfbf2ddab47
--- /dev/null
+++ b/.buildkite/test_areas/models_distributed.yaml
@@ -0,0 +1,22 @@
+group: Models - Distributed
+depends_on:
+ - image-build
+steps:
+- label: Distributed Model Tests (2 GPUs)
+ timeout_in_minutes: 50
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/model_executor/model_loader/sharded_state_loader.py
+ - vllm/model_executor/models/
+ - tests/basic_correctness/
+ - tests/model_executor/model_loader/test_sharded_state_loader.py
+ - tests/models/
+ commands:
+ - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+ - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
+ # Avoid importing model tests that cause CUDA reinitialization error
+ - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
+ - pytest models/language -v -s -m 'distributed(num_gpus=2)'
+ - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
+ - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml
new file mode 100644
index 0000000000000..f70192c4ebc0a
--- /dev/null
+++ b/.buildkite/test_areas/models_language.yaml
@@ -0,0 +1,91 @@
+group: Models - Language
+depends_on:
+ - image-build
+steps:
+- label: Language Models Tests (Standard)
+ timeout_in_minutes: 25
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language
+ commands:
+ # Test standard language models, excluding a subset of slow tests
+ - pip freeze | grep -E 'torch'
+ - pytest -v -s models/language -m 'core_model and (not slow_test)'
+
+- label: Language Models Tests (Extra Standard) %N
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/model_executor/models/
+ - tests/models/language/pooling/test_embedding.py
+ - tests/models/language/generation/test_common.py
+ - tests/models/language/pooling/test_classification.py
+ commands:
+ # Shard slow subset of standard language models tests. Only run when model
+ # source is modified, or when specified test files are modified
+ - pip freeze | grep -E 'torch'
+ - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Language Models Tests (Hybrid) %N
+ timeout_in_minutes: 75
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/generation
+ commands:
+ # Install fast path packages for testing against transformers
+ # Note: also needed to run plamo2 model in vLLM
+ - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+ - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
+ # Shard hybrid language model tests
+ - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Language Models Test (Extended Generation) # 80min
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/generation
+ commands:
+ # Install fast path packages for testing against transformers
+ # Note: also needed to run plamo2 model in vLLM
+ - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+ - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
+ - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
+
+- label: Language Models Test (PPL)
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/generation_ppl_test
+ commands:
+ - pytest -v -s models/language/generation_ppl_test
+
+- label: Language Models Test (Extended Pooling) # 36min
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/pooling
+ commands:
+ - pytest -v -s models/language/pooling -m 'not core_model'
+
+- label: Language Models Test (MTEB)
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/pooling_mteb_test
+ commands:
+ - pytest -v -s models/language/pooling_mteb_test
diff --git a/.buildkite/test_areas/models_multimodal.yaml b/.buildkite/test_areas/models_multimodal.yaml
new file mode 100644
index 0000000000000..fc24068c20a46
--- /dev/null
+++ b/.buildkite/test_areas/models_multimodal.yaml
@@ -0,0 +1,79 @@
+group: Models - Multimodal
+depends_on:
+ - image-build
+steps:
+- label: Multi-Modal Models (Standard) # 60min
+ timeout_in_minutes: 80
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pip freeze | grep -E 'torch'
+ - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
+ - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
+
+- label: Multi-Modal Processor Test (CPU)
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ no_gpu: true
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
+
+- label: Multi-Modal Processor # 44min
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/processing/test_tensor_schema.py
+
+- label: Multi-Modal Accuracy Eval (Small Models) # 50min
+ timeout_in_minutes: 70
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - vllm/multimodal/
+ - vllm/inputs/
+ - vllm/v1/core/
+ commands:
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+
+- label: Multi-Modal Models (Extended) 1
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
+
+- label: Multi-Modal Models (Extended) 2
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
+
+- label: Multi-Modal Models (Extended) 3
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
+
+# This test is used only in PR development phase to test individual models and should never run on main
+- label: Custom Models
+ optional: true
+ commands:
+ - echo 'Testing custom models...'
+ # PR authors can temporarily add commands below to test individual models
+ # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
+ # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
diff --git a/.buildkite/test_areas/plugins.yaml b/.buildkite/test_areas/plugins.yaml
new file mode 100644
index 0000000000000..60c179aa098e1
--- /dev/null
+++ b/.buildkite/test_areas/plugins.yaml
@@ -0,0 +1,34 @@
+group: Plugins
+depends_on:
+ - image-build
+steps:
+- label: Plugin Tests (2 GPUs)
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/plugins/
+ - tests/plugins/
+ commands:
+ # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
+ - pip install -e ./plugins/vllm_add_dummy_platform
+ - pytest -v -s plugins_tests/test_platform_plugins.py
+ - pip uninstall vllm_add_dummy_platform -y
+ # end platform plugin tests
+ # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
+ - pip install -e ./plugins/prithvi_io_processor_plugin
+ - pytest -v -s plugins_tests/test_io_processor_plugins.py
+ - pip uninstall prithvi_io_processor_plugin -y
+ # end io_processor plugins test
+ # begin stat_logger plugins test
+ - pip install -e ./plugins/vllm_add_dummy_stat_logger
+ - pytest -v -s plugins_tests/test_stats_logger_plugins.py
+ - pip uninstall dummy_stat_logger -y
+ # end stat_logger plugins test
+ # other tests continue here:
+ - pytest -v -s plugins_tests/test_scheduler_plugins.py
+ - pip install -e ./plugins/vllm_add_dummy_model
+ - pytest -v -s distributed/test_distributed_oot.py
+ - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
+ - pytest -v -s models/test_oot_registration.py # it needs a clean process
+ - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
diff --git a/.buildkite/test_areas/pytorch.yaml b/.buildkite/test_areas/pytorch.yaml
new file mode 100644
index 0000000000000..703c82eb1a91b
--- /dev/null
+++ b/.buildkite/test_areas/pytorch.yaml
@@ -0,0 +1,50 @@
+group: PyTorch
+depends_on:
+ - image-build
+steps:
+- label: PyTorch Compilation Unit Tests
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - vllm/
+ - tests/compile
+ commands:
+ # Run unit tests defined directly under compile/,
+ # not including subdirectories, which are usually heavier
+ # tests covered elsewhere.
+ # Use `find` to launch multiple instances of pytest so that
+ # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+ - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;"
+
+- label: PyTorch Fullgraph Smoke Test
+ timeout_in_minutes: 30
+ source_file_dependencies:
+ - vllm/
+ - tests/compile
+ commands:
+ # Run smoke tests under fullgraph directory, except test_full_graph.py
+ # as it is a heavy test that is covered in other steps.
+ # Use `find` to launch multiple instances of pytest so that
+ # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+ - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
+
+- label: PyTorch Fullgraph
+ timeout_in_minutes: 40
+ source_file_dependencies:
+ - vllm/
+ - tests/compile
+ commands:
+ # fp8 kv scales not supported on sm89, tested on Blackwell instead
+ - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+ # Limit to no custom ops to reduce running time
+ # Wrap with quotes to escape yaml and avoid starting -k string with a -
+ - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+
+- label: Pytorch Nightly Dependency Override Check # 2min
+ # if this test fails, it means the nightly torch version is not compatible with some
+ # of the dependencies. Please check the error message and add the package to whitelist
+ # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
+ soft_fail: true
+ source_file_dependencies:
+ - requirements/nightly_torch_test.txt
+ commands:
+ - bash standalone_tests/pytorch_nightly_dependency.sh
\ No newline at end of file
diff --git a/.buildkite/test_areas/quantization.yaml b/.buildkite/test_areas/quantization.yaml
new file mode 100644
index 0000000000000..6e89d6af3b8d1
--- /dev/null
+++ b/.buildkite/test_areas/quantization.yaml
@@ -0,0 +1,46 @@
+group: Quantization
+depends_on:
+ - image-build
+steps:
+- label: Quantization
+ timeout_in_minutes: 90
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ - tests/quantization
+ commands:
+ # temporary install here since we need nightly, will move to requirements/test.in
+ # after torchao 0.12 release, and pin a working version of torchao nightly here
+
+ # since torchao nightly is only compatible with torch nightly currently
+ # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
+ # we can only upgrade after this is resolved
+ # TODO(jerryzh168): resolve the above comment
+ - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+ - uv pip install --system conch-triton-kernels
+ - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
+
+- label: Quantized MoE Test (B200)
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ source_file_dependencies:
+ - tests/quantization/test_blackwell_moe.py
+ - vllm/model_executor/models/deepseek_v2.py
+ - vllm/model_executor/models/gpt_oss.py
+ - vllm/model_executor/models/llama4.py
+ - vllm/model_executor/layers/fused_moe
+ - vllm/model_executor/layers/quantization/compressed_tensors
+ - vllm/model_executor/layers/quantization/modelopt.py
+ - vllm/model_executor/layers/quantization/mxfp4.py
+ - vllm/v1/attention/backends/flashinfer.py
+ commands:
+ - pytest -s -v tests/quantization/test_blackwell_moe.py
+
+- label: Quantized Models Test
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - vllm/model_executor/layers/quantization
+ - tests/models/quantization
+ commands:
+ - pytest -v -s models/quantization
diff --git a/.buildkite/test_areas/samplers.yaml b/.buildkite/test_areas/samplers.yaml
new file mode 100644
index 0000000000000..ad377148fd073
--- /dev/null
+++ b/.buildkite/test_areas/samplers.yaml
@@ -0,0 +1,14 @@
+group: Samplers
+depends_on:
+ - image-build
+steps:
+- label: Samplers Test
+ timeout_in_minutes: 75
+ source_file_dependencies:
+ - vllm/model_executor/layers
+ - vllm/sampling_metadata.py
+ - tests/samplers
+ - tests/conftest.py
+ commands:
+ - pytest -v -s samplers
+ - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
diff --git a/.buildkite/test_areas/tool_use.yaml b/.buildkite/test_areas/tool_use.yaml
new file mode 100644
index 0000000000000..7040cd1d253b3
--- /dev/null
+++ b/.buildkite/test_areas/tool_use.yaml
@@ -0,0 +1,23 @@
+group: Tool use
+depends_on:
+ - image-build
+steps:
+- label: OpenAI-Compatible Tool Use
+ timeout_in_minutes: 35
+ mirror_hardwares: [amdexperimental]
+ fast_check: false
+ source_file_dependencies:
+ - vllm/
+ - tests/tool_use
+ commands:
+ - pytest -v -s -m 'not cpu_test' tool_use
+
+- label: OpenAI-Compatible Tool Use (CPU)
+ depends_on: ~
+ timeout_in_minutes: 10
+ source_file_dependencies:
+ - vllm/
+ - tests/tool_use
+ no_gpu: true
+ commands:
+ - pytest -v -s -m 'cpu_test' tool_use
diff --git a/.buildkite/test_areas/weight_loading.yaml b/.buildkite/test_areas/weight_loading.yaml
new file mode 100644
index 0000000000000..cfc5bb20fe7ad
--- /dev/null
+++ b/.buildkite/test_areas/weight_loading.yaml
@@ -0,0 +1,25 @@
+group: Weight Loading
+depends_on:
+ - image-build
+steps:
+- label: Weight Loading Multiple GPU # 33min
+ timeout_in_minutes: 45
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/weight_loading
+ commands:
+ - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+
+- label: Weight Loading Multiple GPU - Large Models # optional
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ gpu: a100
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/weight_loading
+ commands:
+ - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
diff --git a/.github/mergify.yml b/.github/mergify.yml
index 997a40e18e588..3ad79f93bc7ad 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -14,6 +14,52 @@ pull_request_rules:
comment:
message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
+- name: comment-pre-commit-failure
+ description: Comment on PR when pre-commit check fails
+ conditions:
+ - status-failure=pre-commit
+ - -closed
+ - -draft
+ actions:
+ comment:
+ message: |
+ Hi @{{author}}, the pre-commit checks have failed. Please run:
+
+ ```bash
+ uv pip install pre-commit
+ pre-commit install
+ pre-commit run --all-files
+ ```
+
+ Then, commit the changes and push to your branch.
+
+ For future commits, `pre-commit` will run automatically on changed files before each commit.
+
+ > [!TIP]
+ >
+ > Is mypy or markdownlint failing?
+ >
+ > mypy and markdownlint are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
+ >
+ > ```bash
+ > # For mypy (substitute "3.10" with the failing version if needed)
+ > pre-commit run --hook-stage manual mypy-3.10
+ > # For markdownlint
+ > pre-commit run --hook-stage manual markdownlint
+ > ```
+ >
+
+- name: comment-dco-failure
+ description: Comment on PR when DCO check fails
+ conditions:
+ - status-failure=dco
+ - -closed
+ - -draft
+ actions:
+ comment:
+ message: |
+ Hi @{{author}}, the DCO check has failed. Please click on DCO in the Checks section for instructions on how to resolve this.
+
- name: label-ci-build
description: Automatically apply ci/build label
conditions:
@@ -140,7 +186,7 @@ pull_request_rules:
- files~=^tests/entrypoints/test_context.py
- files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
- files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
- - files~=^vllm/entrypoints/harmony_utils.py
+ - files~=^vllm/entrypoints/openai/parser/harmony_utils.py
- files~=^vllm/entrypoints/tool_server.py
- files~=^vllm/entrypoints/tool.py
- files~=^vllm/entrypoints/context.py
@@ -358,4 +404,4 @@ pull_request_rules:
actions:
label:
add:
- - kv-connector
\ No newline at end of file
+ - kv-connector
diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml
index 56fbe5ca704a1..df8910837715d 100644
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@@ -13,7 +13,7 @@ jobs:
steps:
- name: Checkout repository
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+ uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
- name: Set up Python
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml
index 3a12c4b3a8300..e80a5c0cc80f9 100644
--- a/.github/workflows/macos-smoke-test.yml
+++ b/.github/workflows/macos-smoke-test.yml
@@ -12,7 +12,7 @@ jobs:
timeout-minutes: 30
steps:
- - uses: actions/checkout@v6
+ - uses: actions/checkout@v6.0.1
- uses: astral-sh/setup-uv@v7
with:
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index a03b979ad761d..1041653c2f57e 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -16,7 +16,7 @@ jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+ - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
- uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with:
python-version: "3.12"
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index dca3089f496c9..44bf71db5e9de 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -7,13 +7,15 @@ on:
jobs:
close-issues-and-pull-requests:
+ # Prevents triggering on forks or other repos
+ if: github.repository == 'vllm-project/vllm'
permissions:
issues: write
pull-requests: write
actions: write
runs-on: ubuntu-latest
steps:
- - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
+ - uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
with:
# Increasing this value ensures that changes to this workflow
# propagate to all issues and PRs in days rather than months
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e09972fe71995..6b93e3fe91603 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -874,7 +874,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
set(SRCS
- "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")
+ "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu"
+ "csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu"
+ "csrc/quantization/cutlass_w4a8/w4a8_utils.cu"
+ )
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@@ -944,7 +947,6 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp"
"csrc/moe/moe_align_sum_kernels.cu"
- "csrc/moe/moe_lora_align_sum_kernels.cu"
"csrc/moe/topk_softmax_kernels.cu")
if(VLLM_GPU_LANG STREQUAL "CUDA")
diff --git a/README.md b/README.md
index abbb63158f166..5c040fe4a66d2 100644
--- a/README.md
+++ b/README.md
@@ -137,6 +137,7 @@ Compute Resources:
- Alibaba Cloud
- AMD
- Anyscale
+- Arm
- AWS
- Crusoe Cloud
- Databricks
diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh
index 56b721cbb4021..25baa9cbda39c 100644
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@@ -96,8 +96,9 @@ start_server() {
# This correctly passes each element as a separate argument.
if [[ -n "$profile_dir" ]]; then
# Start server with profiling enabled
- VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
- vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
+ local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}"
+ VLLM_SERVER_DEV_MODE=1 \
+ vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 &
else
# Start server without profiling
VLLM_SERVER_DEV_MODE=1 \
diff --git a/benchmarks/benchmark_hash.py b/benchmarks/benchmark_hash.py
new file mode 100644
index 0000000000000..08cdc012d6527
--- /dev/null
+++ b/benchmarks/benchmark_hash.py
@@ -0,0 +1,120 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Micro benchmark comparing built-in hash(), SHA-256, and xxHash.
+
+This focuses on a single test payload shaped like the prefix-cache hash input:
+ (32-byte bytes object, 32-int tuple)
+
+Usage:
+ python benchmarks/hash_micro_benchmark.py --iterations 20000
+"""
+
+from __future__ import annotations
+
+import argparse
+import random
+import statistics
+import time
+from collections.abc import Callable, Iterable
+
+from vllm.utils.hashing import sha256, xxhash
+
+
+def _generate_test_data(seed: int) -> tuple[bytes, tuple[int, ...]]:
+ """Generate a deterministic test payload."""
+ random.seed(seed)
+ bytes_data = bytes(random.getrandbits(8) for _ in range(32))
+ int_tuple = tuple(random.randint(1, 1_000_000) for _ in range(32))
+ return (bytes_data, int_tuple)
+
+
+def _benchmark_func(func: Callable[[tuple], object], data: tuple, iterations: int):
+ """Return (avg_seconds, std_seconds) for hashing `data` `iterations` times."""
+ times: list[float] = []
+
+ # Warm-up to avoid first-run noise.
+ for _ in range(200):
+ func(data)
+
+ for _ in range(iterations):
+ start = time.perf_counter()
+ func(data)
+ end = time.perf_counter()
+ times.append(end - start)
+
+ avg = statistics.mean(times)
+ std = statistics.stdev(times) if len(times) > 1 else 0.0
+ return avg, std
+
+
+def _run_benchmarks(
+ benchmarks: Iterable[tuple[str, Callable[[tuple], object]]],
+ data: tuple,
+ iterations: int,
+):
+ """Yield (name, avg, std) for each benchmark, skipping unavailable ones."""
+ for name, func in benchmarks:
+ try:
+ avg, std = _benchmark_func(func, data, iterations)
+ except ModuleNotFoundError as exc:
+ print(f"Skipping {name}: {exc}")
+ continue
+ yield name, avg, std
+
+
+def builtin_hash(data: tuple) -> int:
+ """Wrapper for Python's built-in hash()."""
+ return hash(data)
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument(
+ "--iterations",
+ type=int,
+ default=10_000,
+ help="Number of measured iterations per hash function.",
+ )
+ parser.add_argument(
+ "--seed", type=int, default=42, help="Random seed for test payload."
+ )
+ args = parser.parse_args()
+
+ data = _generate_test_data(args.seed)
+ benchmarks = (
+ ("SHA256 (pickle)", sha256),
+ ("xxHash (pickle)", xxhash),
+ ("built-in hash()", builtin_hash),
+ )
+
+ print("=" * 60)
+ print("HASH FUNCTION MICRO BENCHMARK")
+ print("=" * 60)
+ print("Test data: (32-byte bytes object, 32-int tuple)")
+ print(f"Iterations: {args.iterations:,}")
+ print("=" * 60)
+
+ results = list(_run_benchmarks(benchmarks, data, args.iterations))
+ builtin_entry = next((r for r in results if r[0] == "built-in hash()"), None)
+
+ print("\nResults:")
+ for name, avg, std in results:
+ print(f" {name:16s}: {avg * 1e6:8.2f} ± {std * 1e6:6.2f} μs")
+
+ if builtin_entry:
+ _, builtin_avg, _ = builtin_entry
+ print("\n" + "=" * 60)
+ print("SUMMARY (relative to built-in hash())")
+ print("=" * 60)
+ for name, avg, _ in results:
+ if name == "built-in hash()":
+ continue
+ speed_ratio = avg / builtin_avg
+ print(f"• {name} is {speed_ratio:.1f}x slower than built-in hash()")
+ else:
+ print("\nBuilt-in hash() result missing; cannot compute speed ratios.")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/benchmark_prefix_block_hash.py b/benchmarks/benchmark_prefix_block_hash.py
new file mode 100644
index 0000000000000..8bcd8af0d3102
--- /dev/null
+++ b/benchmarks/benchmark_prefix_block_hash.py
@@ -0,0 +1,110 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Simple benchmark to compare prefix-cache block hashing algorithms.
+
+Example:
+ python benchmark_prefix_block_hash.py --num-blocks 20000 --block-size 32
+"""
+
+from __future__ import annotations
+
+import argparse
+import random
+import statistics
+import sys
+import time
+from collections.abc import Callable, Iterable, Sequence
+
+from vllm.utils.hashing import get_hash_fn_by_name
+from vllm.v1.core.kv_cache_utils import BlockHash, hash_block_tokens, init_none_hash
+
+SUPPORTED_ALGOS = ("sha256", "sha256_cbor", "xxhash", "xxhash_cbor")
+
+
+def _generate_blocks(
+ num_blocks: int, block_size: int, vocab_size: int, seed: int
+) -> list[list[int]]:
+ rng = random.Random(seed)
+ return [
+ [rng.randrange(vocab_size) for _ in range(block_size)]
+ for _ in range(num_blocks)
+ ]
+
+
+def _hash_all_blocks(
+ hash_fn: Callable[[object], bytes],
+ blocks: Iterable[Sequence[int]],
+) -> float:
+ parent_hash: BlockHash | None = None
+ start = time.perf_counter()
+ for block in blocks:
+ parent_hash = hash_block_tokens(hash_fn, parent_hash, block, extra_keys=None)
+ end = time.perf_counter()
+ return end - start
+
+
+def _benchmark(
+ hash_algo: str,
+ blocks: list[list[int]],
+ trials: int,
+) -> tuple[float, float, float] | None:
+ try:
+ hash_fn = get_hash_fn_by_name(hash_algo)
+ init_none_hash(hash_fn)
+ timings = [_hash_all_blocks(hash_fn, blocks) for _ in range(trials)]
+ except ModuleNotFoundError as exc:
+ print(f"Skipping {hash_algo}: {exc}", file=sys.stderr)
+ return None
+
+ avg = statistics.mean(timings)
+ best = min(timings)
+ # throughput: tokens / second
+ tokens_hashed = len(blocks) * len(blocks[0])
+ throughput = tokens_hashed / best
+ return avg, best, throughput
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument("--num-blocks", type=int, default=10000, help="Block count.")
+ parser.add_argument("--block-size", type=int, default=32, help="Tokens per block.")
+ parser.add_argument(
+ "--vocab-size", type=int, default=32000, help="Token id range [0, vocab_size)."
+ )
+ parser.add_argument("--seed", type=int, default=0, help="Random seed.")
+ parser.add_argument(
+ "--trials", type=int, default=5, help="Number of timed trials per algorithm."
+ )
+ parser.add_argument(
+ "--algorithms",
+ nargs="+",
+ default=SUPPORTED_ALGOS,
+ choices=SUPPORTED_ALGOS,
+ help="Hash algorithms to benchmark.",
+ )
+ args = parser.parse_args()
+
+ blocks = _generate_blocks(
+ args.num_blocks, args.block_size, args.vocab_size, args.seed
+ )
+ print(
+ f"Benchmarking {len(args.algorithms)} algorithms on "
+ f"{args.num_blocks} blocks (block size={args.block_size})."
+ )
+
+ for algo in args.algorithms:
+ result = _benchmark(algo, blocks, args.trials)
+ if result is None:
+ continue
+
+ avg, best, throughput = result
+ print(
+ f"{algo:14s} avg: {avg:.6f}s best: {best:.6f}s "
+ f"throughput: {throughput / 1e6:.2f}M tokens/s"
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index df122b4c5e8db..a4e1b163dcca9 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -963,8 +963,7 @@ def create_argument_parser():
parser.add_argument(
"--profile",
action="store_true",
- help="Use Torch Profiler. The endpoint must be launched with "
- "VLLM_TORCH_PROFILER_DIR to enable profiler.",
+ help="Use vLLM Profiling. --profiler-config must be provided on the server.",
)
parser.add_argument(
"--result-dir",
diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
index d809bf1db8cbc..fb3329975cee3 100644
--- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
+++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
@@ -14,6 +14,9 @@ from tqdm import tqdm
import vllm._custom_ops as ops
from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+ per_token_group_quant_fp8,
+)
@dataclass
@@ -22,6 +25,7 @@ class bench_params_t:
hidden_size: int
add_residual: bool
dtype: torch.dtype
+ group_size: list[int]
def description(self):
return (
@@ -29,6 +33,7 @@ class bench_params_t:
f"x D {self.hidden_size} "
f"x R {self.add_residual} "
f"x DT {self.dtype}"
+ f"x GS {self.group_size}"
)
@@ -38,10 +43,11 @@ def get_bench_params() -> list[bench_params_t]:
HIDDEN_SIZES = list(range(1024, 8129, 1024))
ADD_RESIDUAL = [True, False]
DTYPES = [torch.bfloat16, torch.float]
+ GROUP_SIZES = [[1, 64], [1, 128]]
- combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES)
+ combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES, GROUP_SIZES)
bench_params = list(
- map(lambda x: bench_params_t(x[0], x[1], x[2], x[3]), combinations)
+ map(lambda x: bench_params_t(x[0], x[1], x[2], x[3], x[4]), combinations)
)
return bench_params
@@ -52,6 +58,7 @@ def unfused_int8_impl(
x: torch.Tensor,
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
+ group_size: list[int],
):
# Norm
torch_out = None
@@ -69,6 +76,7 @@ def unfused_fp8_impl(
x: torch.Tensor,
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
+ group_size: list[int],
):
# Norm
torch_out = None
@@ -81,23 +89,63 @@ def unfused_fp8_impl(
torch_out, _ = ops.scaled_fp8_quant(torch_out)
+def unfused_groupwise_fp8_impl(
+ rms_norm_layer: RMSNorm,
+ x: torch.Tensor,
+ residual: torch.Tensor | None,
+ quant_dtype: torch.dtype,
+ group_size: list[int],
+):
+ # Norm
+ torch_out = None
+ if residual is None:
+ torch_out = rms_norm_layer.forward_cuda(x, residual)
+ else:
+ torch_out, _ = rms_norm_layer.forward_cuda(x, residual)
+
+ # Quant
+ torch_out, _ = per_token_group_quant_fp8(
+ torch_out, group_size=group_size[1], use_ue8m0=False
+ )
+
+
def fused_impl(
rms_norm_layer: RMSNorm, # this stores the weights
x: torch.Tensor,
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
+ group_size: list[int],
):
out, _ = ops.rms_norm_dynamic_per_token_quant(
x, rms_norm_layer.weight, 1e-6, quant_dtype, residual=residual
)
+def fused_groupwise_impl(
+ rms_norm_layer: RMSNorm, # this stores the weights
+ x: torch.Tensor,
+ residual: torch.Tensor | None,
+ quant_dtype: torch.dtype,
+ group_size: list[int],
+):
+ out, _ = ops.rms_norm_per_block_quant(
+ x,
+ rms_norm_layer.weight,
+ 1e-6,
+ quant_dtype,
+ group_size,
+ residual=residual,
+ is_scale_transposed=True,
+ )
+
+
# Bench functions
def bench_fn(
rms_norm_layer: RMSNorm,
x: torch.Tensor,
residual: torch.Tensor,
quant_dtype: torch.dtype,
+ group_size: list[int],
label: str,
sub_label: str,
fn: Callable,
@@ -110,10 +158,11 @@ def bench_fn(
"x": x,
"residual": residual,
"quant_dtype": quant_dtype,
+ "group_size": group_size,
"fn": fn,
}
return TBenchmark.Timer(
- stmt="fn(rms_norm_layer, x, residual, quant_dtype)",
+ stmt="fn(rms_norm_layer, x, residual, quant_dtype, group_size)",
globals=globals,
label=label,
sub_label=sub_label,
@@ -147,6 +196,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
x,
residual,
torch.int8,
+ params.group_size,
label,
sub_label,
unfused_int8_impl,
@@ -161,6 +211,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
x,
residual,
torch.float8_e4m3fn,
+ params.group_size,
label,
sub_label,
unfused_fp8_impl,
@@ -175,6 +226,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
x,
residual,
torch.int8,
+ params.group_size,
label,
sub_label,
fused_impl,
@@ -189,6 +241,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
x,
residual,
torch.float8_e4m3fn,
+ params.group_size,
label,
sub_label,
fused_impl,
@@ -196,6 +249,36 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
)
)
+ # unfused groupwise fp8 impl.
+ timers.append(
+ bench_fn(
+ layer,
+ x,
+ residual,
+ torch.float8_e4m3fn,
+ params.group_size,
+ label,
+ sub_label,
+ unfused_groupwise_fp8_impl,
+ "unfused_groupwise_fp8_impl",
+ )
+ )
+
+ # fused groupwise fp8 impl.
+ timers.append(
+ bench_fn(
+ layer,
+ x,
+ residual,
+ torch.float8_e4m3fn,
+ params.group_size,
+ label,
+ sub_label,
+ fused_groupwise_impl,
+ "fused_groupwise_fp8_impl",
+ )
+ )
+
print_timers(timers)
return timers
diff --git a/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
new file mode 100644
index 0000000000000..04921dafbdbea
--- /dev/null
+++ b/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
@@ -0,0 +1,244 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from dataclasses import dataclass
+from enum import Enum
+from itertools import product
+from typing import Any
+
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+ _per_token_group_quant_fp8_colmajor,
+ silu_mul_per_token_group_quant_fp8_colmajor,
+)
+from vllm.triton_utils import triton
+from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
+
+from .utils import ArgPool, Bench, CudaGraphBenchParams
+
+GROUP_SIZE = 128
+FLOAT8_T = torch.float8_e4m3fn
+
+
+def print_timers(timers: list[TMeasurement], cuda_graph_nops: int):
+ print(
+ f"Note : The timings reported above is for {cuda_graph_nops} "
+ "consecutive invocations of the benchmarking functions. "
+ f"Please divide by {cuda_graph_nops} for single invocation "
+ "timings."
+ )
+ compare = TBenchmark.Compare(timers)
+ compare.print()
+
+
+class ImplType(Enum):
+ SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR = 1
+ REFERENCE = 2
+
+ def get_impl(self):
+ if self == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
+ return silu_mul_per_token_group_quant_fp8_colmajor
+ elif self == ImplType.REFERENCE:
+ return reference
+ raise ValueError(f"Unrecognized ImplType {self}")
+
+
+@dataclass
+class BenchmarkTensors:
+ input: torch.Tensor
+ output: torch.Tensor
+
+ # Reference act output tensor
+ ref_act_out: torch.Tensor
+ ref_quant_out: torch.Tensor
+
+ @staticmethod
+ def make(T: int, N: int) -> "BenchmarkTensors":
+ assert T % GROUP_SIZE == 0
+ assert N % (GROUP_SIZE * 2) == 0
+
+ input = torch.rand((T, N), dtype=torch.bfloat16, device="cuda")
+
+ # silu_mul_per_token_group_quant_fp8_colmajor output.
+ output = torch.rand((T, N // 2), dtype=torch.bfloat16, device="cuda").to(
+ FLOAT8_T
+ )
+
+ # reference output.
+ ref_act_out = torch.empty((T, N // 2), dtype=torch.bfloat16, device="cuda")
+ ref_quant_out = torch.empty(
+ (T, N // 2), dtype=torch.bfloat16, device="cuda"
+ ).to(FLOAT8_T)
+
+ return BenchmarkTensors(
+ input=input,
+ output=output,
+ ref_act_out=ref_act_out,
+ ref_quant_out=ref_quant_out,
+ )
+
+ @property
+ def T(self):
+ return self.input.size(0)
+
+ @property
+ def N(self):
+ return self.input.size(1)
+
+ def make_impl_kwargs(self, impl_type: ImplType) -> dict[str, Any]:
+ if impl_type == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
+ return {
+ "input": self.input,
+ "output": self.output,
+ "use_ue8m0": is_deep_gemm_e8m0_used(),
+ }
+ elif impl_type == ImplType.REFERENCE:
+ return {
+ "input": self.input,
+ "act_out": self.ref_act_out,
+ "quant_out": self.ref_quant_out,
+ "use_ue8m0": is_deep_gemm_e8m0_used(),
+ }
+ raise ValueError(f"Unrecognized impl_type {impl_type}")
+
+
+def reference_quant(x: torch.Tensor, quant_out: torch.Tensor, use_ue8m0: bool):
+ """
+ Reference triton quant kernel from,
+ vllm.model_executor.layers.quantization.utils.fp8_utils
+ """
+ assert quant_out.size() == x.size()
+ # Allocate the scale tensor column-major format.
+ shape = (x.shape[-1] // GROUP_SIZE,) + x.shape[:-1]
+ x_q = quant_out
+ x_s = torch.empty(shape, device=x.device, dtype=torch.float32).permute(-1, -2)
+
+ M = x.numel() // GROUP_SIZE
+ N = GROUP_SIZE
+ BLOCK = triton.next_power_of_2(N)
+ # heuristics for number of warps
+ num_warps = min(max(BLOCK // 256, 1), 8)
+ num_stages = 1
+
+ finfo = torch.finfo(FLOAT8_T)
+ fp8_min = finfo.min
+ fp8_max = finfo.max
+
+ _per_token_group_quant_fp8_colmajor[(M,)](
+ x,
+ x_q,
+ x_s,
+ GROUP_SIZE,
+ x.shape[1],
+ x.stride(0),
+ x_s.stride(1),
+ eps=1e-10,
+ fp8_min=fp8_min,
+ fp8_max=fp8_max,
+ use_ue8m0=use_ue8m0,
+ BLOCK=BLOCK,
+ num_warps=num_warps,
+ num_stages=num_stages,
+ )
+ return x_q, x_s
+
+
+def reference(
+ input: torch.Tensor,
+ act_out: torch.Tensor,
+ quant_out: torch.Tensor,
+ use_ue8m0: bool,
+) -> tuple[torch.Tensor, torch.Tensor]:
+ torch.ops._C.silu_and_mul(act_out, input)
+ return reference_quant(act_out, quant_out, use_ue8m0)
+
+
+def bench_impl(
+ bench_tensors: list[BenchmarkTensors], impl_type: ImplType
+) -> TMeasurement:
+ T = bench_tensors[0].T
+ N = bench_tensors[0].N
+
+ arg_pool_size = len(bench_tensors)
+ kwargs_list = [bt.make_impl_kwargs(impl_type) for bt in bench_tensors]
+
+ # warmup
+ for kwargs in kwargs_list:
+ impl_type.get_impl()(**kwargs)
+ torch.cuda.synchronize()
+
+ # Merge into a single kwargs and qualify arguments as ArgPool
+ kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
+ for _kwargs in kwargs_list:
+ for k, v in _kwargs.items():
+ kwargs[k].values.append(v)
+
+ cuda_graph_params = None
+ cuda_graph_params = CudaGraphBenchParams(arg_pool_size)
+ timer = None
+ with Bench(
+ cuda_graph_params,
+ "silu-mul-quant",
+ f"num_tokens={T}, N={N}",
+ impl_type.name,
+ impl_type.get_impl(),
+ **kwargs,
+ ) as bench:
+ timer = bench.run()
+ return timer
+
+
+def test_correctness(T: int, N: int):
+ print(f"Testing num_tokens={T}, N={N} ...")
+
+ bench_tensor = BenchmarkTensors.make(T, N)
+
+ def output_from_impl(impl: ImplType) -> tuple[torch.Tensor, torch.Tensor]:
+ return impl.get_impl()(**bench_tensor.make_impl_kwargs(impl))
+
+ # reference output
+ ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)
+
+ # test ouptut
+ out_q, out_s = output_from_impl(
+ ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
+ )
+
+ torch.testing.assert_close(ref_out_q.to(torch.float32), out_q.to(torch.float32))
+ torch.testing.assert_close(ref_out_s, out_s)
+
+
+def run(Ts: list[int], Ns: list[int], arg_pool_size: int) -> list[TMeasurement]:
+ timers = []
+ for N, T in product(Ns, Ts):
+ test_correctness(T, N)
+
+ bench_tensors: list[BenchmarkTensors] = [
+ BenchmarkTensors.make(T, N) for _ in range(arg_pool_size)
+ ]
+
+ silu_mul_quant_timer = bench_impl(
+ bench_tensors, ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
+ )
+ timers.append(silu_mul_quant_timer)
+ reference_timer = bench_impl(bench_tensors, ImplType.REFERENCE)
+ timers.append(reference_timer)
+
+ print_timers(
+ [silu_mul_quant_timer, reference_timer], cuda_graph_nops=arg_pool_size
+ )
+
+ print_timers(timers, cuda_graph_nops=arg_pool_size)
+
+ return timers
+
+
+if __name__ == "__main__":
+ T = [128 * i for i in range(1, 16)] + [2048 * i for i in range(1, 65)]
+ N = [2048, 4096, 8192]
+
+ print(f"T = {T}, N = {N}")
+ run(T, N, arg_pool_size=8)
diff --git a/benchmarks/kernels/benchmark_moe_align_block_size.py b/benchmarks/kernels/benchmark_moe_align_block_size.py
index f540cff6261a8..5f9a131f79b0e 100644
--- a/benchmarks/kernels/benchmark_moe_align_block_size.py
+++ b/benchmarks/kernels/benchmark_moe_align_block_size.py
@@ -24,12 +24,15 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
num_tokens_range = [1, 16, 256, 4096]
num_experts_range = [16, 64, 224, 256, 280, 512]
topk_range = [1, 2, 8]
-configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
+ep_size_range = [1, 8]
+configs = list(
+ itertools.product(num_tokens_range, num_experts_range, topk_range, ep_size_range)
+)
@triton.testing.perf_report(
triton.testing.Benchmark(
- x_names=["num_tokens", "num_experts", "topk"],
+ x_names=["num_tokens", "num_experts", "topk", "ep_size"],
x_vals=configs,
line_arg="provider",
line_vals=["vllm"],
@@ -38,16 +41,26 @@ configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range
args={},
)
)
-def benchmark(num_tokens, num_experts, topk, provider):
+def benchmark(num_tokens, num_experts, topk, ep_size, provider):
"""Benchmark function for Triton."""
block_size = 256
+ torch.cuda.manual_seed_all(0)
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
+ e_map = None
+ if ep_size != 1:
+ local_e = num_experts // ep_size
+ e_ids = torch.randperm(num_experts, device="cuda", dtype=torch.int32)[:local_e]
+ e_map = torch.full((num_experts,), -1, device="cuda", dtype=torch.int32)
+ e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32)
+
quantiles = [0.5, 0.2, 0.8]
if provider == "vllm":
ms, min_ms, max_ms = triton.testing.do_bench(
- lambda: moe_align_block_size(topk_ids, block_size, num_experts),
+ lambda: moe_align_block_size(
+ topk_ids, block_size, num_experts, e_map, ignore_invalid_experts=True
+ ),
quantiles=quantiles,
)
diff --git a/csrc/cpu/cpu_attn_impl.hpp b/csrc/cpu/cpu_attn_impl.hpp
index 98f55d7c014be..02164ed3666e3 100644
--- a/csrc/cpu/cpu_attn_impl.hpp
+++ b/csrc/cpu/cpu_attn_impl.hpp
@@ -1246,14 +1246,8 @@ class AttentionMainLoop {
// rescale sum and partial outputs
if (need_rescale) {
// compute rescale factor
-#ifdef DEFINE_FAST_EXP
- vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
- rescale_factor_vec = fast_exp(rescale_factor_vec);
- rescale_factor = rescale_factor_vec.get_last_elem();
-#else
rescale_factor = std::exp(rescale_factor);
vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
-#endif
// rescale sum
new_sum_val += rescale_factor * init_sum_val;
@@ -1889,15 +1883,8 @@ class AttentionMainLoop {
: curr_output_buffer;
float rescale_factor = final_max > curr_max ? curr_max - final_max
: final_max - curr_max;
-
-#ifdef DEFINE_FAST_EXP
- vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
- rescale_factor_vec = fast_exp(rescale_factor_vec);
- rescale_factor = rescale_factor_vec.get_last_elem();
-#else
rescale_factor = std::exp(rescale_factor);
vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
-#endif
local_sum[head_idx] = final_max > curr_max
? final_sum + rescale_factor * curr_sum
diff --git a/csrc/cpu/cpu_attn_macros.h b/csrc/cpu/cpu_attn_macros.h
index 6458e43419370..35716a0790ab3 100644
--- a/csrc/cpu/cpu_attn_macros.h
+++ b/csrc/cpu/cpu_attn_macros.h
@@ -60,4 +60,54 @@
#endif
+#ifdef __aarch64__
+ // Implementation copied from Arm Optimized Routines (expf AdvSIMD)
+ // https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c
+ #include
+ #define DEFINE_FAST_EXP \
+ const float32x4_t inv_ln2 = vdupq_n_f32(0x1.715476p+0f); \
+ const float ln2_hi = 0x1.62e4p-1f; \
+ const float ln2_lo = 0x1.7f7d1cp-20f; \
+ const float c0 = 0x1.0e4020p-7f; \
+ const float c2 = 0x1.555e66p-3f; \
+ const float32x4_t ln2_c02 = {ln2_hi, ln2_lo, c0, c2}; \
+ const uint32x4_t exponent_bias = vdupq_n_u32(0x3f800000); \
+ const float32x4_t c1 = vdupq_n_f32(0x1.573e2ep-5f); \
+ const float32x4_t c3 = vdupq_n_f32(0x1.fffdb6p-2f); \
+ const float32x4_t c4 = vdupq_n_f32(0x1.ffffecp-1f); \
+ const float32x4_t pos_special_bound = vdupq_n_f32(0x1.5d5e2ap+6f); \
+ const float32x4_t neg_special_bound = vnegq_f32(pos_special_bound); \
+ const float32x4_t inf = \
+ vdupq_n_f32(std::numeric_limits::infinity()); \
+ const float32x4_t zero = vdupq_n_f32(0.0f); \
+ auto neon_expf = [&](float32x4_t values) __attribute__((always_inline)) { \
+ float32x4_t n = vrndaq_f32(vmulq_f32(values, inv_ln2)); \
+ float32x4_t r = vfmsq_laneq_f32(values, n, ln2_c02, 0); \
+ r = vfmsq_laneq_f32(r, n, ln2_c02, 1); \
+ uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_s32(vcvtq_s32_f32(n)), 23); \
+ float32x4_t scale = vreinterpretq_f32_u32(vaddq_u32(e, exponent_bias)); \
+ float32x4_t r2 = vmulq_f32(r, r); \
+ float32x4_t p = vfmaq_laneq_f32(c1, r, ln2_c02, 2); \
+ float32x4_t q = vfmaq_laneq_f32(c3, r, ln2_c02, 3); \
+ q = vfmaq_f32(q, p, r2); \
+ p = vmulq_f32(c4, r); \
+ float32x4_t poly = vfmaq_f32(p, q, r2); \
+ poly = vfmaq_f32(scale, poly, scale); \
+ const uint32x4_t hi_mask = vcgeq_f32(values, pos_special_bound); \
+ const uint32x4_t lo_mask = vcleq_f32(values, neg_special_bound); \
+ poly = vbslq_f32(hi_mask, inf, poly); \
+ return vbslq_f32(lo_mask, zero, poly); \
+ }; \
+ auto fast_exp = [&](vec_op::FP32Vec16& vec) \
+ __attribute__((always_inline)) { \
+ float32x4x4_t result; \
+ result.val[0] = neon_expf(vec.reg.val[0]); \
+ result.val[1] = neon_expf(vec.reg.val[1]); \
+ result.val[2] = neon_expf(vec.reg.val[2]); \
+ result.val[3] = neon_expf(vec.reg.val[3]); \
+ return vec_op::FP32Vec16(result); \
+ };
+
+#endif // __aarch64__
+
#endif
\ No newline at end of file
diff --git a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h
index e1d131e4a7851..de0c505b7a62f 100644
--- a/csrc/dispatch_utils.h
+++ b/csrc/dispatch_utils.h
@@ -118,6 +118,24 @@
} \
}
+#define VLLM_DISPATCH_BOOL(expr, const_expr, ...) \
+ if (expr) { \
+ constexpr bool const_expr = true; \
+ __VA_ARGS__(); \
+ } else { \
+ constexpr bool const_expr = false; \
+ __VA_ARGS__(); \
+ }
+
+#define VLLM_DISPATCH_GROUP_SIZE(group_size, const_group_size, ...) \
+ if (group_size == 128) { \
+ constexpr int const_group_size = 128; \
+ __VA_ARGS__(); \
+ } else if (group_size == 64) { \
+ constexpr int const_group_size = 64; \
+ __VA_ARGS__(); \
+ }
+
#define VLLM_DISPATCH_RANK234(NUM_DIMS, ...) \
switch (NUM_DIMS) { \
case 2: { \
diff --git a/csrc/moe/grouped_topk_kernels.cu b/csrc/moe/grouped_topk_kernels.cu
index 69b4c1fb11d1a..47ee5f021eb4a 100644
--- a/csrc/moe/grouped_topk_kernels.cu
+++ b/csrc/moe/grouped_topk_kernels.cu
@@ -444,23 +444,27 @@ __device__ inline T apply_sigmoid(T val) {
return cuda_cast(sigmoid_accurate(f));
}
-template
+template
+__device__ inline T apply_scoring(T val) {
+ if constexpr (SF == SCORING_SIGMOID) {
+ return apply_sigmoid(val);
+ } else {
+ return val;
+ }
+}
+
+template
__device__ void topk_with_k2(T* output, T const* input, T const* bias,
cg::thread_block_tile<32> const& tile,
int32_t const lane_id,
- int const num_experts_per_group,
- int const scoring_func) {
+ int const num_experts_per_group) {
// Get the top2 per thread
T largest = neg_inf();
T second_largest = neg_inf();
if (num_experts_per_group > WARP_SIZE) {
for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
- T value = input[i];
- // Apply scoring function if needed
- if (scoring_func == SCORING_SIGMOID) {
- value = apply_sigmoid(value);
- }
+ T value = apply_scoring(input[i]);
value = value + bias[i];
if (value > largest) {
@@ -472,11 +476,7 @@ __device__ void topk_with_k2(T* output, T const* input, T const* bias,
}
} else {
for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
- T value = input[i];
- // Apply scoring function if needed
- if (scoring_func == SCORING_SIGMOID) {
- value = apply_sigmoid(value);
- }
+ T value = apply_scoring(input[i]);
value = value + bias[i];
largest = value;
}
@@ -501,13 +501,12 @@ __device__ void topk_with_k2(T* output, T const* input, T const* bias,
}
}
-template
+template
__global__ void topk_with_k2_kernel(T* output, T* input, T const* bias,
int64_t const num_tokens,
int64_t const num_cases,
int64_t const n_group,
- int64_t const num_experts_per_group,
- int const scoring_func) {
+ int64_t const num_experts_per_group) {
int32_t warp_id = threadIdx.x / WARP_SIZE;
int32_t lane_id = threadIdx.x % WARP_SIZE;
@@ -525,21 +524,21 @@ __global__ void topk_with_k2_kernel(T* output, T* input, T const* bias,
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
asm volatile("griddepcontrol.wait;");
#endif
- topk_with_k2(output, input, group_bias, tile, lane_id,
- num_experts_per_group, scoring_func);
+ topk_with_k2(output, input, group_bias, tile, lane_id,
+ num_experts_per_group);
}
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
asm volatile("griddepcontrol.launch_dependents;");
#endif
}
-template
+template
__global__ void group_idx_and_topk_idx_kernel(
T* scores, T const* group_scores, float* topk_values, IdxT* topk_indices,
T const* bias, int64_t const num_tokens, int64_t const n_group,
int64_t const topk_group, int64_t const topk, int64_t const num_experts,
int64_t const num_experts_per_group, bool renormalize,
- double routed_scaling_factor, int scoring_func) {
+ double routed_scaling_factor) {
int32_t warp_id = threadIdx.x / WARP_SIZE;
int32_t lane_id = threadIdx.x % WARP_SIZE;
int32_t case_id =
@@ -549,6 +548,11 @@ __global__ void group_idx_and_topk_idx_kernel(
topk_values += case_id * topk;
topk_indices += case_id * topk;
+ constexpr bool kUseStaticNGroup = (NGroup > 0);
+ // use int32 to avoid implicit conversion
+ int32_t const n_group_i32 =
+ kUseStaticNGroup ? NGroup : static_cast(n_group);
+
int32_t align_num_experts_per_group =
warp_topk::round_up_to_multiple_of(num_experts_per_group);
@@ -574,13 +578,14 @@ __global__ void group_idx_and_topk_idx_kernel(
if (case_id < num_tokens) {
// calculate group_idx
- int32_t target_num_min = WARP_SIZE - n_group + topk_group;
+ int32_t target_num_min =
+ WARP_SIZE - n_group_i32 + static_cast(topk_group);
// The check is necessary to avoid abnormal input
- if (lane_id < n_group && is_finite(group_scores[lane_id])) {
+ if (lane_id < n_group_i32 && is_finite(group_scores[lane_id])) {
value = group_scores[lane_id];
}
- int count_equal_to_top_value = WARP_SIZE - n_group;
+ int count_equal_to_top_value = WARP_SIZE - n_group_i32;
int pre_count_equal_to_top_value = 0;
// Use loop to find the largset top_group
while (count_equal_to_top_value < target_num_min) {
@@ -604,7 +609,7 @@ __global__ void group_idx_and_topk_idx_kernel(
int count_equalto_topkth_group = 0;
bool if_proceed_next_topk = topk_group_value != neg_inf();
if (case_id < num_tokens && if_proceed_next_topk) {
- for (int i_group = 0; i_group < n_group; i_group++) {
+ auto process_group = [&](int i_group) {
if ((group_scores[i_group] > topk_group_value) ||
((group_scores[i_group] == topk_group_value) &&
(count_equalto_topkth_group < num_equalto_topkth_group))) {
@@ -613,11 +618,10 @@ __global__ void group_idx_and_topk_idx_kernel(
i += WARP_SIZE) {
T candidates = neg_inf();
if (i < num_experts_per_group) {
- // Apply scoring function (if any) and add bias
+ // apply scoring function (if any) and add bias
T input = scores[offset + i];
if (is_finite(input)) {
- T score = (scoring_func == SCORING_SIGMOID) ? apply_sigmoid(input)
- : input;
+ T score = apply_scoring(input);
candidates = score + bias[offset + i];
}
}
@@ -627,6 +631,17 @@ __global__ void group_idx_and_topk_idx_kernel(
count_equalto_topkth_group++;
}
}
+ };
+
+ if constexpr (kUseStaticNGroup) {
+#pragma unroll
+ for (int i_group = 0; i_group < NGroup; ++i_group) {
+ process_group(i_group);
+ }
+ } else {
+ for (int i_group = 0; i_group < n_group_i32; ++i_group) {
+ process_group(i_group);
+ }
}
queue.done();
__syncwarp();
@@ -646,12 +661,13 @@ __global__ void group_idx_and_topk_idx_kernel(
if (i < topk) {
// Load the score value (without bias) for normalization
T input = scores[s_topk_idx[i]];
- value =
- (scoring_func == SCORING_SIGMOID) ? apply_sigmoid(input) : input;
+ value = apply_scoring(input);
s_topk_value[i] = value;
}
- topk_sum +=
- cg::reduce(tile, cuda_cast(value), cg::plus());
+ if (renormalize) {
+ topk_sum +=
+ cg::reduce(tile, cuda_cast(value), cg::plus());
+ }
}
}
@@ -660,13 +676,9 @@ __global__ void group_idx_and_topk_idx_kernel(
if (case_id < num_tokens) {
if (if_proceed_next_topk) {
for (int i = lane_id; i < topk; i += WARP_SIZE) {
- float value;
- if (renormalize) {
- value = cuda_cast(s_topk_value[i]) / topk_sum *
- routed_scaling_factor;
- } else {
- value = cuda_cast(s_topk_value[i]) * routed_scaling_factor;
- }
+ float base = cuda_cast(s_topk_value[i]);
+ float value = renormalize ? (base / topk_sum * routed_scaling_factor)
+ : (base * routed_scaling_factor);
topk_indices[i] = s_topk_idx[i];
topk_values[i] = value;
}
@@ -684,6 +696,45 @@ __global__ void group_idx_and_topk_idx_kernel(
#endif
}
+template
+inline void launch_group_idx_and_topk_kernel(
+ cudaLaunchConfig_t const& config, T* scores, T* group_scores,
+ float* topk_values, IdxT* topk_indices, T const* bias,
+ int64_t const num_tokens, int64_t const n_group, int64_t const topk_group,
+ int64_t const topk, int64_t const num_experts,
+ int64_t const num_experts_per_group, bool const renormalize,
+ double const routed_scaling_factor) {
+ auto launch = [&](auto* kernel_instance2) {
+ cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
+ topk_values, topk_indices, bias, num_tokens, n_group,
+ topk_group, topk, num_experts, num_experts_per_group,
+ renormalize, routed_scaling_factor);
+ };
+
+ switch (n_group) {
+ case 4: {
+ launch(&group_idx_and_topk_idx_kernel);
+ break;
+ }
+ case 8: {
+ launch(&group_idx_and_topk_idx_kernel);
+ break;
+ }
+ case 16: {
+ launch(&group_idx_and_topk_idx_kernel);
+ break;
+ }
+ case 32: {
+ launch(&group_idx_and_topk_idx_kernel);
+ break;
+ }
+ default: {
+ launch(&group_idx_and_topk_idx_kernel);
+ break;
+ }
+ }
+}
+
template
void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
IdxT* topk_indices, T const* bias, int64_t const num_tokens,
@@ -694,7 +745,6 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
cudaStream_t const stream = 0) {
int64_t num_cases = num_tokens * n_group;
int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
- auto* kernel_instance1 = &topk_with_k2_kernel;
cudaLaunchConfig_t config;
config.gridDim = topk_with_k2_num_blocks;
config.blockDim = BLOCK_SIZE;
@@ -705,16 +755,33 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
config.numAttrs = 1;
config.attrs = attrs;
- cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias,
- num_tokens, num_cases, n_group, num_experts / n_group,
- scoring_func);
+ auto const sf = static_cast(scoring_func);
+ int64_t const num_experts_per_group = num_experts / n_group;
+ auto launch_topk_with_k2 = [&](auto* kernel_instance1) {
+ cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias,
+ num_tokens, num_cases, n_group, num_experts_per_group);
+ };
+ switch (sf) {
+ case SCORING_NONE: {
+ auto* kernel_instance1 = &topk_with_k2_kernel;
+ launch_topk_with_k2(kernel_instance1);
+ break;
+ }
+ case SCORING_SIGMOID: {
+ auto* kernel_instance1 = &topk_with_k2_kernel;
+ launch_topk_with_k2(kernel_instance1);
+ break;
+ }
+ default:
+ // should be guarded by higher level checks.
+ TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc");
+ }
int64_t topk_with_k_group_num_blocks =
(num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
size_t dynamic_smem_in_bytes =
warp_topk::calc_smem_size_for_block_wide(NUM_WARPS_PER_BLOCK,
topk);
- auto* kernel_instance2 = &group_idx_and_topk_idx_kernel;
config.gridDim = topk_with_k_group_num_blocks;
config.blockDim = BLOCK_SIZE;
config.dynamicSmemBytes = dynamic_smem_in_bytes;
@@ -723,10 +790,24 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
config.numAttrs = 1;
config.attrs = attrs;
- cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
- topk_values, topk_indices, bias, num_tokens, n_group,
- topk_group, topk, num_experts, num_experts / n_group,
- renormalize, routed_scaling_factor, scoring_func);
+ switch (sf) {
+ case SCORING_NONE: {
+ launch_group_idx_and_topk_kernel(
+ config, scores, group_scores, topk_values, topk_indices, bias,
+ num_tokens, n_group, topk_group, topk, num_experts,
+ num_experts_per_group, renormalize, routed_scaling_factor);
+ break;
+ }
+ case SCORING_SIGMOID: {
+ launch_group_idx_and_topk_kernel(
+ config, scores, group_scores, topk_values, topk_indices, bias,
+ num_tokens, n_group, topk_group, topk, num_experts,
+ num_experts_per_group, renormalize, routed_scaling_factor);
+ break;
+ }
+ default:
+ TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc");
+ }
}
#define INSTANTIATE_NOAUX_TC(T, IdxT) \
diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index b3d0c0aa58e9e..5c9e474024082 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -14,7 +14,6 @@
namespace vllm {
namespace moe {
-
namespace batched_moe_align_block_size {
// Note num_threads needs to be 1024 for BlockScan Reduction in the kernel.
@@ -80,17 +79,32 @@ __global__ void batched_moe_align_block_size_kernel(
} // namespace batched_moe_align_block_size
template
-__global__ void moe_align_block_size_kernel(
+__device__ void _moe_align_block_size(
const scalar_t* __restrict__ topk_ids,
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
- int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
+ int32_t* __restrict__ total_tokens_post_pad,
+ int32_t* __restrict__ expert_map, int32_t num_experts,
int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size,
- size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded) {
+ size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded,
+ int32_t max_num_m_blocks, int32_t model_offset, int32_t inactive_expert_id,
+ int32_t topk_num, int32_t* token_mask, bool has_expert_map) {
extern __shared__ int32_t shared_counts[];
- // Initialize sorted_token_ids with numel
- for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
- sorted_token_ids[it] = numel;
+ // Compute input buffer offsets. Typically these will all be 0, except when
+ // using Multi LoRA.
+ int sorted_token_ids_offset = max_num_tokens_padded * model_offset;
+ int expert_ids_offset = max_num_m_blocks * model_offset;
+ int cumsum_offset = (num_experts + 1) * model_offset;
+
+ // Use separate threadblocks to fill sorted_token_ids.
+ // This is safe since the current kernel does not use sorted_token_ids.
+ if (blockIdx.x % 2) {
+ // Initialize sorted_token_ids with numel
+ for (size_t it = threadIdx.x; it < max_num_tokens_padded;
+ it += blockDim.x) {
+ sorted_token_ids[sorted_token_ids_offset + it] = numel;
+ }
+ return;
}
const int warp_id = threadIdx.x / WARP_SIZE;
@@ -112,9 +126,16 @@ __global__ void moe_align_block_size_kernel(
if (expert_id >= num_experts) {
continue;
}
+ if (has_expert_map) {
+ expert_id = expert_map[expert_id];
+ // filter invalid experts
+ if (expert_id == -1) continue;
+ }
int warp_idx = expert_id / experts_per_warp;
int expert_offset = expert_id % experts_per_warp;
- atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1);
+ int mask = token_mask == nullptr ? 1 : token_mask[i / topk_num];
+ atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset],
+ mask);
}
__syncthreads();
@@ -135,48 +156,196 @@ __global__ void moe_align_block_size_kernel(
int cumsum_val;
BlockScan(temp_storage).ExclusiveSum(expert_count, cumsum_val);
if (expert_id <= num_experts) {
- cumsum[expert_id] = cumsum_val;
+ cumsum[cumsum_offset + expert_id] = cumsum_val;
}
if (expert_id == num_experts) {
- *total_tokens_post_pad = cumsum_val;
+ total_tokens_post_pad[model_offset] = cumsum_val;
}
__syncthreads();
if (threadIdx.x < num_experts) {
- for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
- i += block_size) {
- expert_ids[i / block_size] = threadIdx.x;
+ for (int i = cumsum[cumsum_offset + threadIdx.x];
+ i < cumsum[cumsum_offset + threadIdx.x + 1]; i += block_size) {
+ expert_ids[expert_ids_offset + i / block_size] = threadIdx.x;
}
}
// Fill remaining expert_ids with 0
- const size_t fill_start_idx = cumsum[num_experts] / block_size + threadIdx.x;
- const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size);
- for (size_t i = fill_start_idx; i < expert_ids_size; i += blockDim.x) {
- expert_ids[i] = 0;
+ const size_t fill_start_idx =
+ cumsum[cumsum_offset + num_experts] / block_size + threadIdx.x;
+ for (size_t i = fill_start_idx; i < max_num_m_blocks; i += blockDim.x) {
+ expert_ids[expert_ids_offset + i] = inactive_expert_id;
+ }
+}
+
+template
+__device__ void _moe_align_block_size_small_batch_expert(
+ const scalar_t* __restrict__ topk_ids,
+ int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
+ int32_t* __restrict__ total_tokens_post_pad,
+ int32_t* __restrict__ expert_map, int32_t num_experts, int32_t block_size,
+ size_t numel, int32_t max_num_tokens_padded, int32_t max_num_m_blocks,
+ int32_t inactive_expert_id, int32_t model_offset, int32_t topk_num,
+ int32_t* token_mask, bool has_expert_map) {
+ // Compute input buffer offsets. Typically these will all be 0, except when
+ // using Multi LoRA.
+ int sorted_token_ids_offset = max_num_tokens_padded * model_offset;
+ int expert_ids_offset = max_num_m_blocks * model_offset;
+
+ // Use an additional group of threads to fill sorted_token_ids.
+ // Since the current kernel will use sorted_token_ids afterward,
+ // we fill sorted_token_ids within the same threadblock to make
+ // synchronization easier.
+ if (threadIdx.x < fill_threads) {
+ // Initialize sorted_token_ids with numel
+ for (size_t it = threadIdx.x; it < max_num_tokens_padded;
+ it += fill_threads) {
+ sorted_token_ids[sorted_token_ids_offset + it] = numel;
+ }
+ // Three __syncthreads() corresponding to the other threads
+ __syncthreads();
+ __syncthreads();
+ __syncthreads();
+ return;
+ }
+
+ const size_t tid = threadIdx.x - fill_threads;
+ const size_t stride = blockDim.x - fill_threads;
+
+ extern __shared__ int32_t shared_mem[];
+ int32_t* cumsum = shared_mem;
+ int32_t* tokens_cnts = (int32_t*)(shared_mem + num_experts + 1);
+
+ for (int i = 0; i < num_experts; ++i) {
+ tokens_cnts[(tid + 1) * num_experts + i] = 0;
+ }
+
+ for (size_t i = tid; i < numel; i += stride) {
+ int32_t expert_id = topk_ids[i];
+ if (has_expert_map) {
+ expert_id = expert_map[expert_id];
+ // filter invalid expert
+ if (expert_id == -1) continue;
+ }
+ int mask = token_mask == nullptr ? 1 : token_mask[i / topk_num];
+ tokens_cnts[(tid + 1) * num_experts + expert_id] += mask;
+ }
+
+ __syncthreads();
+
+ if (tid < num_experts) {
+ tokens_cnts[tid] = 0;
+ for (int i = 1; i <= stride; ++i) {
+ tokens_cnts[i * num_experts + tid] +=
+ tokens_cnts[(i - 1) * num_experts + tid];
+ }
+ }
+
+ __syncthreads();
+
+ if (tid == 0) {
+ cumsum[0] = 0;
+ for (int i = 1; i <= num_experts; ++i) {
+ cumsum[i] =
+ cumsum[i - 1] +
+ CEILDIV(tokens_cnts[stride * num_experts + i - 1], block_size) *
+ block_size;
+ }
+ total_tokens_post_pad[model_offset] =
+ static_cast(cumsum[num_experts]);
+ }
+
+ __syncthreads();
+
+ if (tid < num_experts) {
+ for (int i = cumsum[tid]; i < cumsum[tid + 1]; i += block_size) {
+ expert_ids[expert_ids_offset + i / block_size] = tid;
+ }
+ }
+
+ // Fill remaining expert_ids with 0
+ const size_t fill_start_idx = cumsum[num_experts] / block_size + tid;
+ for (size_t i = fill_start_idx; i < max_num_m_blocks; i += stride) {
+ expert_ids[expert_ids_offset + i] = inactive_expert_id;
+ }
+
+ for (size_t i = tid; i < numel; i += stride) {
+ int32_t expert_id = topk_ids[i];
+ if (has_expert_map) {
+ expert_id = expert_map[expert_id];
+ // filter invalid expert
+ if (expert_id == -1) continue;
+ }
+ int32_t rank_post_pad =
+ tokens_cnts[tid * num_experts + expert_id] + cumsum[expert_id];
+
+ if (token_mask == nullptr || token_mask[i / topk_num]) {
+ sorted_token_ids[sorted_token_ids_offset + rank_post_pad] = i;
+ ++tokens_cnts[tid * num_experts + expert_id];
+ }
}
}
template
-__global__ void count_and_sort_expert_tokens_kernel(
+__device__ void _count_and_sort_expert_tokens(
const scalar_t* __restrict__ topk_ids,
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
- size_t numel, int32_t num_experts) {
- const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
- const size_t stride = blockDim.x * gridDim.x;
+ int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
+ int32_t max_num_tokens_padded, int32_t* __restrict__ token_mask,
+ int32_t model_offset, int32_t topk_num, bool has_expert_map) {
+ const size_t tid = blockIdx.y * blockDim.x + threadIdx.x;
+ const size_t stride = blockDim.x * gridDim.y;
for (size_t i = tid; i < numel; i += stride) {
int32_t expert_id = topk_ids[i];
if (expert_id >= num_experts) {
continue;
}
- int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1);
- sorted_token_ids[rank_post_pad] = i;
+
+ if (has_expert_map) {
+ expert_id = expert_map[expert_id];
+ // filter invalid experts
+ if (expert_id == -1) continue;
+ }
+
+ if (token_mask == nullptr || token_mask[i / topk_num]) {
+ int32_t rank_post_pad = atomicAdd(
+ &cumsum_buffer[(model_offset * (num_experts + 1)) + expert_id], 1);
+ sorted_token_ids[max_num_tokens_padded * model_offset + rank_post_pad] =
+ i;
+ }
}
}
+template
+__global__ void moe_align_block_size_kernel(
+ const scalar_t* __restrict__ topk_ids,
+ int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
+ int32_t* __restrict__ total_tokens_post_pad,
+ int32_t* __restrict__ expert_map, int32_t num_experts,
+ int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size,
+ size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded,
+ int32_t topk_num, bool has_expert_map) {
+ _moe_align_block_size(
+ topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
+ num_experts, padded_num_experts, experts_per_warp, block_size, numel,
+ cumsum, max_num_tokens_padded, CEILDIV(max_num_tokens_padded, block_size),
+ 0, 0, topk_num, nullptr, has_expert_map);
+}
+
+template
+__global__ void count_and_sort_expert_tokens_kernel(
+ const scalar_t* __restrict__ topk_ids,
+ int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
+ int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
+ int32_t max_num_tokens_padded, int32_t topk_num, bool has_expert_map) {
+ _count_and_sort_expert_tokens(
+ topk_ids, sorted_token_ids, cumsum_buffer, expert_map, numel, num_experts,
+ max_num_tokens_padded, nullptr, 0, topk_num, has_expert_map);
+}
+
template
__global__ void moe_sum_kernel(
scalar_t* __restrict__ out, // [..., d]
@@ -193,78 +362,111 @@ __global__ void moe_sum_kernel(
}
}
-template
+template
__global__ void moe_align_block_size_small_batch_expert_kernel(
const scalar_t* __restrict__ topk_ids,
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
- int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
- int32_t block_size, size_t numel, int32_t max_num_tokens_padded) {
- // Initialize sorted_token_ids with numel
- for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
- sorted_token_ids[it] = numel;
+ int32_t* __restrict__ total_tokens_post_pad,
+ int32_t* __restrict__ expert_map, int32_t num_experts, int32_t block_size,
+ size_t numel, int32_t max_num_tokens_padded, int32_t topk_num,
+ bool has_expert_map) {
+ _moe_align_block_size_small_batch_expert(
+ topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
+ num_experts, block_size, numel, max_num_tokens_padded,
+ CEILDIV(max_num_tokens_padded, block_size), 0, 0, topk_num, nullptr,
+ has_expert_map);
+}
+
+template
+__global__ void moe_lora_align_block_size_kernel(
+ scalar_t* __restrict__ topk_ids, int32_t* __restrict__ token_lora_mapping,
+ int64_t block_size, int32_t* __restrict__ expert_map, int num_experts,
+ int max_loras, size_t numel, int max_num_tokens_padded,
+ int max_num_m_blocks, int32_t* __restrict__ sorted_token_ids,
+ int32_t* __restrict__ expert_ids, int32_t topk_num,
+ int32_t* total_tokens_post_pad, int32_t* adapter_enabled,
+ int32_t* __restrict__ cumsum, int32_t experts_per_warp,
+ int32_t padded_num_experts, int32_t* lora_ids,
+ int32_t* __restrict__ token_mask, bool has_expert_map) {
+ int lora_idx = blockIdx.x / 2;
+ int lora_id = lora_ids[lora_idx];
+ if (lora_id == -1 || adapter_enabled[lora_id] == 0) {
+ return;
}
- const size_t tid = threadIdx.x;
- const size_t stride = blockDim.x;
-
- extern __shared__ int32_t shared_mem[];
- int32_t* cumsum = shared_mem;
- int32_t* tokens_cnts = (int32_t*)(shared_mem + num_experts + 1);
-
- for (int i = 0; i < num_experts; ++i) {
- tokens_cnts[(threadIdx.x + 1) * num_experts + i] = 0;
- }
-
- for (size_t i = tid; i < numel; i += stride) {
- ++tokens_cnts[(threadIdx.x + 1) * num_experts + topk_ids[i]];
- }
-
- __syncthreads();
-
- if (threadIdx.x < num_experts) {
- tokens_cnts[threadIdx.x] = 0;
- for (int i = 1; i <= blockDim.x; ++i) {
- tokens_cnts[i * num_experts + threadIdx.x] +=
- tokens_cnts[(i - 1) * num_experts + threadIdx.x];
- }
- }
-
- __syncthreads();
-
+ // Populate the token_mask based on the token-LoRA mapping
+ int num_tokens = numel / topk_num;
if (threadIdx.x == 0) {
- cumsum[0] = 0;
- for (int i = 1; i <= num_experts; ++i) {
- cumsum[i] =
- cumsum[i - 1] +
- CEILDIV(tokens_cnts[blockDim.x * num_experts + i - 1], block_size) *
- block_size;
+ total_tokens_post_pad[lora_id] = 0;
+
+ for (int i = 0; i < num_tokens; i++) {
+ token_mask[(lora_id * num_tokens) + i] =
+ (int)token_lora_mapping[i] == lora_id;
}
- *total_tokens_post_pad = static_cast(cumsum[num_experts]);
}
__syncthreads();
- if (threadIdx.x < num_experts) {
- for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
- i += block_size) {
- expert_ids[i / block_size] = threadIdx.x;
+ _moe_align_block_size(
+ topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
+ num_experts, padded_num_experts, experts_per_warp, block_size, numel,
+ cumsum, max_num_tokens_padded, max_num_m_blocks, lora_id, -1, topk_num,
+ &token_mask[(lora_id * num_tokens)], has_expert_map);
+}
+
+template
+__global__ void lora_count_and_sort_expert_tokens_kernel(
+ const scalar_t* __restrict__ topk_ids,
+ int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
+ int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
+ int32_t max_num_tokens_padded, int32_t topk_num, int32_t* token_mask,
+ int32_t* lora_ids, bool has_expert_map) {
+ int lora_idx = blockIdx.x;
+ int lora_id = lora_ids[lora_idx];
+ if (lora_id == -1) {
+ return;
+ }
+
+ int num_tokens = numel / topk_num;
+
+ _count_and_sort_expert_tokens(
+ topk_ids, sorted_token_ids, cumsum_buffer, expert_map, numel, num_experts,
+ max_num_tokens_padded, &token_mask[(lora_id * num_tokens)], lora_id,
+ topk_num, has_expert_map);
+}
+
+template
+__global__ void moe_lora_align_block_size_small_batch_expert_kernel(
+ scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping,
+ int64_t block_size, int32_t* __restrict__ expert_map, int num_experts,
+ int max_loras, size_t numel, int max_num_tokens_padded,
+ int max_num_m_blocks, int32_t* __restrict__ sorted_token_ids,
+ int32_t* __restrict__ expert_ids, int topk_num,
+ int32_t* total_tokens_post_pad, int32_t* adapter_enabled, int32_t* lora_ids,
+ int32_t* token_mask, bool has_expert_map) {
+ int lora_idx = blockIdx.x;
+ int lora_id = lora_ids[lora_idx];
+ if (lora_id == -1 || adapter_enabled[lora_id] == 0) {
+ return;
+ }
+
+ int num_tokens = numel / topk_num;
+ if (threadIdx.x == 0) {
+ total_tokens_post_pad[lora_id] = 0;
+
+ for (int i = 0; i < num_tokens; i++) {
+ token_mask[(lora_id * num_tokens) + i] =
+ (int)token_lora_mapping[i] == lora_id;
}
}
- // Fill remaining expert_ids with 0
- const size_t fill_start_idx = cumsum[num_experts] / block_size + threadIdx.x;
- const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size);
- for (size_t i = fill_start_idx; i < expert_ids_size; i += blockDim.x) {
- expert_ids[i] = 0;
- }
+ __syncthreads();
- for (size_t i = tid; i < numel; i += stride) {
- int32_t expert_id = topk_ids[i];
- int32_t rank_post_pad =
- tokens_cnts[threadIdx.x * num_experts + expert_id] + cumsum[expert_id];
- sorted_token_ids[rank_post_pad] = i;
- ++tokens_cnts[threadIdx.x * num_experts + expert_id];
- }
+ _moe_align_block_size_small_batch_expert(
+ topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
+ num_experts, block_size, numel, max_num_tokens_padded, max_num_m_blocks,
+ -1, lora_id, topk_num, &token_mask[(lora_id * num_tokens)],
+ has_expert_map);
}
} // namespace moe
@@ -275,7 +477,8 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size, torch::Tensor sorted_token_ids,
torch::Tensor experts_ids,
- torch::Tensor num_tokens_post_pad) {
+ torch::Tensor num_tokens_post_pad,
+ std::optional maybe_expert_map) {
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
int64_t padded_num_experts =
@@ -287,14 +490,19 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
// BlockScan uses 1024 threads and assigns one thread per expert.
TORCH_CHECK(padded_num_experts < 1024,
"padded_num_experts must be less than 1024");
+ auto options_int =
+ torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
+ bool has_expert_map = maybe_expert_map.has_value();
+ torch::Tensor expert_map;
+ if (has_expert_map) {
+ expert_map = maybe_expert_map.value();
+ } else {
+ expert_map = torch::empty({0}, options_int);
+ }
VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
// calc needed amount of shared mem for `cumsum` tensors
- auto options_int =
- torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
- torch::Tensor cumsum_buffer =
- torch::empty({num_experts + 1}, options_int);
bool small_batch_expert_mode =
(topk_ids.numel() < 1024) && (num_experts <= 64);
@@ -304,43 +512,58 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
((threads + 1) * num_experts + (num_experts + 1)) *
sizeof(int32_t);
+ // threadIdx.x >= fill_threads: counting experts and aligning
+ // threadIdx.x < fill_threads: filling sorted_token_ids
+ constexpr int32_t fill_threads = 256;
auto small_batch_expert_kernel =
vllm::moe::moe_align_block_size_small_batch_expert_kernel<
- scalar_t>;
- small_batch_expert_kernel<<<1, threads, shared_mem_size, stream>>>(
+ scalar_t, fill_threads>;
+ small_batch_expert_kernel<<<1, fill_threads + threads,
+ shared_mem_size, stream>>>(
topk_ids.data_ptr(),
sorted_token_ids.data_ptr(),
experts_ids.data_ptr(),
- num_tokens_post_pad.data_ptr(), num_experts, block_size,
- topk_ids.numel(), sorted_token_ids.size(0));
+ num_tokens_post_pad.data_ptr(),
+ expert_map.data_ptr(), num_experts, block_size,
+ topk_ids.numel(), sorted_token_ids.size(0), topk_ids.size(1),
+ has_expert_map);
} else {
+ torch::Tensor cumsum_buffer =
+ torch::empty({num_experts + 1}, options_int);
auto align_kernel = vllm::moe::moe_align_block_size_kernel;
size_t num_warps = CEILDIV(padded_num_experts, experts_per_warp);
size_t shared_mem_size =
num_warps * experts_per_warp * sizeof(int32_t);
- align_kernel<<<1, threads, shared_mem_size, stream>>>(
+ // launch two threadblocks
+ // blockIdx.x == 0: counting experts and aligning
+ // blockIdx.x == 1: filling sorted_token_ids
+ align_kernel<<<2, threads, shared_mem_size, stream>>>(
topk_ids.data_ptr(),
sorted_token_ids.data_ptr(),
experts_ids.data_ptr(),
- num_tokens_post_pad.data_ptr(), num_experts,
- padded_num_experts, experts_per_warp, block_size,
- topk_ids.numel(), cumsum_buffer.data_ptr(),
- sorted_token_ids.size(0));
+ num_tokens_post_pad.data_ptr(),
+ expert_map.data_ptr(), num_experts, padded_num_experts,
+ experts_per_warp, block_size, topk_ids.numel(),
+ cumsum_buffer.data_ptr(), sorted_token_ids.size(0),
+ topk_ids.size(1), has_expert_map);
const int block_threads = std::min(256, (int)threads);
const int num_blocks =
(topk_ids.numel() + block_threads - 1) / block_threads;
const int max_blocks = 65535;
const int actual_blocks = std::min(num_blocks, max_blocks);
+ dim3 gridDims(1, actual_blocks);
auto sort_kernel =
vllm::moe::count_and_sort_expert_tokens_kernel;
- sort_kernel<<>>(
+ sort_kernel<<>>(
topk_ids.data_ptr(),
sorted_token_ids.data_ptr(),
- cumsum_buffer.data_ptr(), topk_ids.numel(), num_experts);
+ cumsum_buffer.data_ptr(), expert_map.data_ptr(),
+ topk_ids.numel(), num_experts, sorted_token_ids.size(0),
+ topk_ids.size(1), has_expert_map);
}
});
}
@@ -414,3 +637,123 @@ void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size]
break;
}
}
+
+void moe_lora_align_block_size(
+ torch::Tensor topk_ids, torch::Tensor token_lora_mapping,
+ int64_t num_experts, int64_t block_size, int64_t max_loras,
+ int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
+ torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
+ torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
+ torch::Tensor lora_ids, std::optional maybe_expert_map) {
+ const int topk_num = topk_ids.size(1);
+
+ TORCH_CHECK(block_size > 0, "block_size should be greater than 0. ");
+
+ int device_max_shared_mem;
+ auto dev = topk_ids.get_device();
+ cudaDeviceGetAttribute(&device_max_shared_mem,
+ cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
+ const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ int64_t padded_num_experts =
+ ((num_experts + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
+
+ // BlockScan uses 1024 threads and assigns one thread per expert.
+ TORCH_CHECK(padded_num_experts < 1024,
+ "padded_num_experts must be less than 1024");
+
+ auto options_int =
+ torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
+ torch::Tensor token_mask =
+ torch::empty({max_loras * topk_ids.size(0)}, options_int);
+ bool has_expert_map = maybe_expert_map.has_value();
+ torch::Tensor expert_map;
+ if (has_expert_map) {
+ expert_map = maybe_expert_map.value();
+ } else {
+ expert_map = torch::empty({0}, options_int);
+ }
+
+ VLLM_DISPATCH_INTEGRAL_TYPES(
+ topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] {
+ bool small_batch_expert_mode =
+ (topk_ids.numel() < 1024) && (num_experts <= 64);
+
+ if (small_batch_expert_mode) {
+ const int32_t num_thread = max((int32_t)num_experts, 128);
+ const int32_t shared_mem =
+ (num_thread + 1) * num_experts * sizeof(int32_t) +
+ (num_experts + 1) * sizeof(int32_t);
+ if (shared_mem > device_max_shared_mem) {
+ TORCH_CHECK(false, "Shared memory usage exceeds device limit.");
+ }
+
+ // threadIdx.x >= fill_threads: counting experts and aligning
+ // threadIdx.x < fill_threads: filling sorted_token_ids
+ constexpr int32_t fill_threads = 256;
+
+ dim3 blockDim(num_thread + fill_threads);
+ auto kernel =
+ vllm::moe::moe_lora_align_block_size_small_batch_expert_kernel<
+ scalar_t, fill_threads>;
+ AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
+ (void*)kernel, shared_mem));
+ kernel<<>>(
+ topk_ids.data_ptr(),
+ token_lora_mapping.data_ptr(), block_size,
+ expert_map.data_ptr(), num_experts, max_loras,
+ topk_ids.numel(), max_num_tokens_padded, max_num_m_blocks,
+ sorted_token_ids.data_ptr(),
+ expert_ids.data_ptr(), topk_num,
+ num_tokens_post_pad.data_ptr(),
+ adapter_enabled.data_ptr(), lora_ids.data_ptr(),
+ token_mask.data_ptr(), has_expert_map);
+ } else {
+ int num_thread = 1024;
+ dim3 blockDim(num_thread);
+ size_t num_warps = CEILDIV(padded_num_experts, WARP_SIZE);
+
+ size_t shared_mem_size = num_warps * WARP_SIZE * sizeof(int32_t);
+
+ // cumsum buffer
+ torch::Tensor cumsum =
+ torch::zeros({max_loras * (num_experts + 1)}, options_int);
+
+ auto align_kernel =
+ vllm::moe::moe_lora_align_block_size_kernel;
+
+ // launch two threadblocks for each lora
+ // blockIdx.x % 2 == 0: counting experts and aligning
+ // blockIdx.x % 2 == 1: filling sorted_token_ids
+ align_kernel<<>>(
+ topk_ids.data_ptr(),
+ token_lora_mapping.data_ptr(), block_size,
+ expert_map.data_ptr(), num_experts, max_loras,
+ topk_ids.numel(), max_num_tokens_padded, max_num_m_blocks,
+ sorted_token_ids.data_ptr(),
+ expert_ids.data_ptr(), topk_num,
+ num_tokens_post_pad.data_ptr(),
+ adapter_enabled.data_ptr(), cumsum.data_ptr(),
+ WARP_SIZE, padded_num_experts, lora_ids.data_ptr(),
+ token_mask.data_ptr(), has_expert_map);
+
+ const int block_threads = std::min(256, (int)num_thread);
+ const int num_blocks =
+ (topk_ids.numel() + block_threads - 1) / block_threads;
+
+ const int max_blocks = 65535;
+ const int actual_blocks = std::min(num_blocks, max_blocks);
+
+ dim3 gridDims(max_loras, actual_blocks);
+ auto sort_kernel =
+ vllm::moe::lora_count_and_sort_expert_tokens_kernel;
+
+ sort_kernel<<>>(
+ topk_ids.data_ptr(),
+ sorted_token_ids.data_ptr(), cumsum.data_ptr(),
+ expert_map.data_ptr(), topk_ids.numel(), num_experts,
+ max_num_tokens_padded, topk_num, token_mask.data_ptr(),
+ lora_ids.data_ptr(), has_expert_map);
+ }
+ });
+}
\ No newline at end of file
diff --git a/csrc/moe/moe_lora_align_sum_kernels.cu b/csrc/moe/moe_lora_align_sum_kernels.cu
deleted file mode 100644
index 360f1312cf579..0000000000000
--- a/csrc/moe/moe_lora_align_sum_kernels.cu
+++ /dev/null
@@ -1,174 +0,0 @@
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-
-#include "../cuda_compat.h"
-#include "../dispatch_utils.h"
-#include "core/math.hpp"
-
-namespace {
-
-__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
- int32_t col) {
- return row * total_col + col;
-}
-
-} // namespace
-
-// TODO: Refactor common parts with moe_align_sum_kernels
-template
-__global__ void moe_lora_align_sum_kernel(
- scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping,
- int64_t block_size, int num_experts, int max_loras, size_t numel,
- int max_num_tokens_padded, int max_num_m_blocks,
- int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
- int topk_num, int32_t* total_tokens_post_pad, int32_t* adapter_enabled,
- int32_t* lora_ids) {
- const size_t tokens_per_thread = div_ceil(numel, blockDim.x);
- const size_t start_idx = threadIdx.x * tokens_per_thread;
-
- int lora_idx = blockIdx.x;
- int lora_id = lora_ids[lora_idx];
- if (lora_id == -1 || adapter_enabled[lora_id] == 0) {
- return;
- }
- extern __shared__ int32_t shared_mem[];
- int32_t* cumsum = shared_mem;
- token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + num_experts + 1);
-
- // Initialize sorted_token_ids with numel
- for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
- sorted_token_ids[lora_id * max_num_tokens_padded + it] = numel;
- }
-
- // Initialize expert_ids with -1
- for (size_t it = threadIdx.x; it < max_num_m_blocks; it += blockDim.x) {
- expert_ids[lora_id * max_num_m_blocks + it] = -1;
- }
-
- // Initialize total_tokens_post_pad with 0
- if (threadIdx.x == 0) {
- total_tokens_post_pad[lora_id] = 0;
- }
-
- for (int i = 0; i < num_experts; ++i) {
- tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
- }
-
- for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
- int mask = token_lora_mapping[i / topk_num] == lora_id;
- int idx = index(num_experts, threadIdx.x + 1, topk_ids[i]);
- tokens_cnts[idx] += mask;
- }
-
- __syncthreads();
-
- // For each expert we accumulate the token counts from the different threads.
- if (threadIdx.x < num_experts) {
- tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
- for (int i = 1; i <= blockDim.x; ++i) {
- tokens_cnts[index(num_experts, i, threadIdx.x)] +=
- tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
- }
- }
-
- __syncthreads();
-
- // We accumulate the token counts of all experts in thread 0.
- if (threadIdx.x == 0) {
- cumsum[0] = 0;
- for (int i = 1; i <= num_experts; ++i) {
- cumsum[i] = cumsum[i - 1] +
- div_ceil(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
- block_size) *
- block_size;
- }
- total_tokens_post_pad[lora_id] = static_cast(cumsum[num_experts]);
- }
-
- __syncthreads();
-
- /**
- * For each expert, each thread processes the tokens of the corresponding
- * blocks and stores the corresponding expert_id for each block.
- */
- if (threadIdx.x < num_experts) {
- for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
- i += block_size) {
- expert_ids[index(max_num_m_blocks, lora_id, i / block_size)] =
- threadIdx.x;
- }
- }
-
- for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
- int32_t expert_id = topk_ids[i];
- /** The cumsum[expert_id] stores the starting index of the tokens that the
- * expert with expert_id needs to process, and
- * tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
- * processed by the expert with expert_id within the current thread's token
- * shard.
- */
- int32_t rank_post_pad =
- tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
- cumsum[expert_id];
-
- int mask = (int)token_lora_mapping[i / topk_num] == lora_id;
- atomicAdd(
- &sorted_token_ids[index(max_num_tokens_padded, lora_id, rank_post_pad)],
- (i - numel) * mask);
- tokens_cnts[index(num_experts, threadIdx.x, expert_id)] += mask;
- }
-}
-
-void moe_lora_align_block_size(
- torch::Tensor topk_ids, torch::Tensor token_lora_mapping,
- int64_t num_experts, int64_t block_size, int64_t max_loras,
- int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
- torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
- torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
- torch::Tensor lora_ids) {
- const int topk_num = topk_ids.size(1);
-
- TORCH_CHECK(block_size > 0, "block_size should be greater than 0. ");
-
- int device_max_shared_mem;
- auto dev = topk_ids.get_device();
- cudaDeviceGetAttribute(&device_max_shared_mem,
- cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
- const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
- const int32_t num_thread = max((int32_t)num_experts, 128); // WARP_SIZE,
- TORCH_CHECK(num_thread <= 1024,
- "num_thread must be less than 1024, "
- "and fallback is not implemented yet.");
- const int32_t shared_mem = (num_thread + 1) * num_experts * sizeof(int32_t) +
- (num_experts + 1) * sizeof(int32_t);
-
- if (shared_mem > device_max_shared_mem) {
- TORCH_CHECK(false,
- "Shared memory usage exceeds device limit, and global memory "
- "fallback is not implemented yet.");
- }
-
- VLLM_DISPATCH_INTEGRAL_TYPES(
- topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] {
- dim3 blockDim(num_thread);
- auto kernel = moe_lora_align_sum_kernel;
- AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
- (void*)kernel, shared_mem));
- kernel<<>>(
- topk_ids.data_ptr(),
- token_lora_mapping.data_ptr(), block_size, num_experts,
- max_loras, topk_ids.numel(), max_num_tokens_padded,
- max_num_m_blocks, sorted_token_ids.data_ptr(),
- expert_ids.data_ptr(), topk_num,
- num_tokens_post_pad.data_ptr(),
- adapter_enabled.data_ptr(), lora_ids.data_ptr());
- });
-}
\ No newline at end of file
diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h
index 11c6875f7f1d0..337dcc50b079e 100644
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@@ -11,7 +11,8 @@ void moe_sum(torch::Tensor& input, torch::Tensor& output);
void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size, torch::Tensor sorted_token_ids,
torch::Tensor experts_ids,
- torch::Tensor num_tokens_post_pad);
+ torch::Tensor num_tokens_post_pad,
+ std::optional maybe_expert_map);
void batched_moe_align_block_size(int64_t max_tokens_per_batch,
int64_t block_size,
@@ -26,7 +27,7 @@ void moe_lora_align_block_size(
int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
- torch::Tensor lora_ids);
+ torch::Tensor lora_ids, std::optional maybe_expert_map);
#ifndef USE_ROCM
torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
torch::Tensor b_qweight, torch::Tensor b_scales,
diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index e0a8280722f3c..779ad70ad1e09 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -19,7 +19,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
"moe_align_block_size(Tensor topk_ids, int num_experts,"
" int block_size, Tensor! sorted_token_ids,"
" Tensor! experts_ids,"
- " Tensor! num_tokens_post_pad) -> ()");
+ " Tensor! num_tokens_post_pad,"
+ " Tensor? maybe_expert_map) -> ()");
m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
// Aligning the number of tokens to be processed by each expert such
@@ -46,7 +47,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
" Tensor !experts_ids,"
" Tensor !num_tokens_post_pad,"
" Tensor !adapter_enabled,"
- " Tensor !lora_ids) -> () ");
+ " Tensor !lora_ids,"
+ " Tensor? maybe_expert_map) -> () ");
m.impl("moe_lora_align_block_size", torch::kCUDA, &moe_lora_align_block_size);
#ifndef USE_ROCM
diff --git a/csrc/ops.h b/csrc/ops.h
index 4bb7857b15032..37e3aaf7499d5 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -102,13 +102,16 @@ void apply_repetition_penalties_(torch::Tensor& logits,
const torch::Tensor& output_mask,
const torch::Tensor& repetition_penalties);
-void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
- const torch::Tensor& rowEnds, torch::Tensor& indices,
- int64_t numRows, int64_t stride0, int64_t stride1);
+void top_k_per_row_prefill(const torch::Tensor& logits,
+ const torch::Tensor& rowStarts,
+ const torch::Tensor& rowEnds, torch::Tensor& indices,
+ int64_t numRows, int64_t stride0, int64_t stride1,
+ int64_t topK);
void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
- const torch::Tensor& seq_lens, torch::Tensor& indices,
- int64_t numRows, int64_t stride0, int64_t stride1);
+ const torch::Tensor& seqLens, torch::Tensor& indices,
+ int64_t numRows, int64_t stride0, int64_t stride1,
+ int64_t topK);
void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
torch::Tensor& weight, torch::Tensor& scale,
@@ -128,6 +131,13 @@ void rms_norm_dynamic_per_token_quant(torch::Tensor& out,
std::optional scale_ub,
std::optional residual);
+void rms_norm_per_block_quant(torch::Tensor& out, torch::Tensor const& input,
+ torch::Tensor const& weight,
+ torch::Tensor& scales, double const epsilon,
+ std::optional scale_ub,
+ std::optional residual,
+ int64_t group_size, bool is_scale_transposed);
+
void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
std::optional key, int64_t head_size,
torch::Tensor& cos_sin_cache, bool is_neox);
@@ -252,7 +262,8 @@ void get_cutlass_moe_mm_data(
void get_cutlass_moe_mm_problem_sizes(
const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
- const int64_t k, const std::optional& blockscale_offsets);
+ const int64_t k, const std::optional& blockscale_offsets,
+ std::optional force_swap_ab = std::nullopt);
void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1,
@@ -299,6 +310,14 @@ void per_token_group_quant_int8(const torch::Tensor& input,
torch::Tensor& output_q,
torch::Tensor& output_s, int64_t group_size,
double eps, double int8_min, double int8_max);
+
+// Fused activation quantisation + DeepGEMM-compatible UE8M0-packed scales.
+void per_token_group_quant_8bit_packed(const torch::Tensor& input,
+ torch::Tensor& output_q,
+ torch::Tensor& output_s_packed,
+ int64_t group_size, double eps,
+ double min_8bit, double max_8bit);
+
#endif
void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
diff --git a/csrc/quantization/cutlass_w4a8/get_group_starts.cuh b/csrc/quantization/cutlass_w4a8/get_group_starts.cuh
new file mode 100644
index 0000000000000..fec142d0d87a1
--- /dev/null
+++ b/csrc/quantization/cutlass_w4a8/get_group_starts.cuh
@@ -0,0 +1,104 @@
+// see csrc/quantization/w8a8/cutlass/moe/get_group_starts.cuh
+#pragma once
+
+#include
+#include
+#include
+
+#include "core/scalar_type.hpp"
+#include "cutlass/bfloat16.h"
+#include "cutlass/float8.h"
+
+// ElementB is int32 (packed int4)
+// ElementGroupScale is cutlass::Array (packed fp8)
+template
+__global__ void get_group_gemm_starts(
+ int64_t* expert_offsets, ElementA** a_offsets, ElementB** b_offsets,
+ ElementC** out_offsets, ElementAccumulator** a_scales_offsets,
+ ElementAccumulator** b_scales_offsets,
+ ElementGroupScale** b_group_scales_offsets, ElementA* a_base_as_int,
+ ElementB* b_base_as_int, ElementC* out_base_as_int,
+ ElementAccumulator* a_scales_base_as_int,
+ ElementAccumulator* b_scales_base_as_int,
+ ElementGroupScale* b_group_scales_base_as_int, int64_t n, int64_t k,
+ int64_t scale_k) {
+ int expert_id = threadIdx.x;
+
+ int64_t expert_offset = expert_offsets[expert_id];
+
+ // same as w8a8
+ a_offsets[expert_id] = a_base_as_int + expert_offset * k;
+ out_offsets[expert_id] = out_base_as_int + expert_offset * n;
+ a_scales_offsets[expert_id] = a_scales_base_as_int + expert_offset;
+ b_scales_offsets[expert_id] = b_scales_base_as_int + (n * expert_id);
+
+ // w4a8 specific
+ constexpr int pack_factor = 8; // pack 8 int4 into int32
+ b_offsets[expert_id] = b_base_as_int + (expert_id * k * n / pack_factor);
+ b_group_scales_offsets[expert_id] =
+ b_group_scales_base_as_int + (expert_id * scale_k * n);
+}
+
+#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE) \
+ else if (out_tensors.dtype() == TENSOR_C_TYPE) { \
+ get_group_gemm_starts> \
+ <<<1, num_experts, 0, stream>>>( \
+ static_cast(expert_offsets.data_ptr()), \
+ static_cast(a_ptrs.data_ptr()), \
+ static_cast(b_ptrs.data_ptr()), \
+ static_cast(out_ptrs.data_ptr()), \
+ static_cast(a_scales_ptrs.data_ptr()), \
+ static_cast(b_scales_ptrs.data_ptr()), \
+ static_cast**>( \
+ b_group_scales_ptrs.data_ptr()), \
+ static_cast(a_tensors.data_ptr()), \
+ static_cast(b_tensors.data_ptr()), \
+ static_cast(out_tensors.data_ptr()), \
+ static_cast(a_scales.data_ptr()), \
+ static_cast(b_scales.data_ptr()), \
+ static_cast*>( \
+ b_group_scales.data_ptr()), \
+ n, k, scale_k); \
+ }
+
+namespace {
+
+void run_get_group_gemm_starts(
+ torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs,
+ torch::Tensor& b_ptrs, torch::Tensor& out_ptrs,
+ torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs,
+ torch::Tensor& b_group_scales_ptrs, torch::Tensor const& a_tensors,
+ torch::Tensor const& b_tensors, torch::Tensor& out_tensors,
+ torch::Tensor const& a_scales, torch::Tensor const& b_scales,
+ torch::Tensor const& b_group_scales, const int64_t b_group_size) {
+ TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn);
+ TORCH_CHECK(b_tensors.dtype() == torch::kInt32); // int4 8x packed into int32
+ TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
+ TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
+ TORCH_CHECK(b_group_scales.dtype() ==
+ torch::kFloat8_e4m3fn); // the underlying torch type is e4m3
+ TORCH_CHECK(out_tensors.dtype() ==
+ torch::kBFloat16); // only support bf16 for now
+ // expect int64_t to avoid overflow during offset calculations
+ TORCH_CHECK(expert_offsets.dtype() == torch::kInt64);
+
+ int num_experts = static_cast(expert_offsets.size(0));
+ // logical k, n
+ int64_t n = out_tensors.size(1);
+ int64_t k = a_tensors.size(1);
+ int64_t scale_k = cutlass::ceil_div(k, b_group_size);
+
+ auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
+
+ if (false) {
+ }
+ __CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t)
+ __CALL_GET_STARTS_KERNEL(torch::kFloat16, half)
+ else {
+ TORCH_CHECK(false, "Invalid output type (must be float16 or bfloat16)");
+ }
+}
+
+} // namespace
\ No newline at end of file
diff --git a/csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu b/csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu
new file mode 100644
index 0000000000000..4b425790dbac7
--- /dev/null
+++ b/csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu
@@ -0,0 +1,483 @@
+#include
+#include
+
+#include "cutlass/cutlass.h"
+
+#include "cute/tensor.hpp"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/group_array_problem_shape.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/mixed_dtype_utils.hpp"
+
+// vllm includes
+#include
+#include
+#include
+#include "cutlass_extensions/torch_utils.hpp"
+#include "cutlass_extensions/common.hpp"
+
+#include "core/registration.h"
+#include "get_group_starts.cuh"
+#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
+#include "w4a8_utils.cuh"
+
+namespace vllm::cutlass_w4a8_moe {
+
+using namespace cute;
+
+// -------------------------------------------------------------------------------------
+// Static configuration shared across all instantiations
+// -------------------------------------------------------------------------------------
+using ProblemShape =
+ cutlass::gemm::GroupProblemShape>; // per
+ // group
+using MmaType = cutlass::float_e4m3_t;
+using QuantType = cutlass::int4b_t;
+
+constexpr int TileShapeK = 128 * 8 / sizeof_bits::value;
+static int constexpr PackFactor = 8; // 8 int4 packed into int32
+
+// A matrix configuration
+using ElementA = MmaType;
+using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
+constexpr int AlignmentA =
+ 128 /
+ cutlass::sizeof_bits::value; // Alignment of A matrix in units of
+ // elements (up to 16 bytes)
+
+// B matrix configuration
+using ElementB = QuantType; // Element type for B matrix operand
+using LayoutB =
+ cutlass::layout::ColumnMajor; // Layout type for B matrix operand
+constexpr int AlignmentB =
+ 128 / cutlass::sizeof_bits<
+ ElementB>::value; // Memory access granularity/alignment of B
+ // matrix in units of elements (up to 16 bytes)
+
+// This example manually swaps and transposes, so keep transpose of input
+// layouts
+using LayoutA_Transpose =
+ typename cutlass::layout::LayoutTranspose::type;
+using LayoutB_Transpose =
+ typename cutlass::layout::LayoutTranspose::type;
+
+// Need to pass a pointer type to make the 3rd dimension of Stride be _0
+using StrideA =
+ cute::remove_pointer_t>;
+using StrideB =
+ cute::remove_pointer_t>;
+
+// Define the CuTe layout for reoredered quantized tensor B
+// LayoutAtomQuant places values that will be read by the same thread in
+// contiguous locations in global memory. It specifies the reordering within a
+// single warp's fragment
+using LayoutAtomQuant =
+ decltype(cutlass::compute_memory_reordering_atom());
+using LayoutB_Reordered = decltype(cute::tile_to_shape(
+ LayoutAtomQuant{}, Layout>, StrideB>{}));
+
+using ElementScale = cutlass::float_e4m3_t;
+using LayoutScale = cutlass::layout::RowMajor;
+
+// C/D matrix configuration
+using ElementC =
+ cutlass::bfloat16_t; // Element type for C and D matrix operands
+using LayoutC =
+ cutlass::layout::RowMajor; // Layout type for C and D matrix operands
+constexpr int AlignmentC =
+ 128 / cutlass::sizeof_bits<
+ ElementC>::value; // Memory access granularity/alignment of C
+ // matrix in units of elements (up to 16 bytes)
+
+// D matrix configuration
+using ElementD = ElementC;
+using LayoutD = LayoutC;
+constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value;
+
+// Core kernel configurations
+using ElementAccumulator = float; // Element type for internal accumulation
+using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that
+ // supports the intended feature
+using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
+using StageCountType =
+ cutlass::gemm::collective::StageCountAuto; // Stage count maximized based
+ // on the tile size
+
+// per-channel and per-token scales for epilogue
+using ElementSChannel = float;
+
+template
+struct W4A8GroupedGemmKernel {
+ using TileShape =
+ decltype(cute::append(TileShape_MN{}, cute::Int{}));
+ using ClusterShape = ClusterShape_MNK;
+
+ // per-channel, per-token scales epilogue
+ using ChTokScalesEpilogue =
+ typename vllm::c3x::ScaledEpilogueArray;
+ using EVTCompute = typename ChTokScalesEpilogue::EVTCompute;
+ using CollectiveEpilogue =
+ typename cutlass::epilogue::collective::CollectiveBuilder<
+ ArchTag, OperatorClass, TileShape, ClusterShape,
+ cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
+ ElementSChannel, ElementC,
+ typename cutlass::layout::LayoutTranspose::type*, AlignmentC,
+ ElementD, typename cutlass::layout::LayoutTranspose::type*,
+ AlignmentD, EpilogueSchedule, EVTCompute>::CollectiveOp;
+
+ // =========================================================== MIXED INPUT
+ // WITH SCALES
+ // ===========================================================================
+ // The Scale information must get paired with the operand that will be scaled.
+ // In this example, B is scaled so we make a tuple of B's information and the
+ // scale information.
+ using CollectiveMainloopShuffled =
+ typename cutlass::gemm::collective::CollectiveBuilder<
+ ArchTag, OperatorClass,
+ cute::tuple>,
+ LayoutB_Reordered*, AlignmentB, ElementA, LayoutA_Transpose*,
+ AlignmentA, ElementAccumulator, TileShape, ClusterShape,
+ cutlass::gemm::collective::StageCountAutoCarveout(
+ sizeof(typename CollectiveEpilogue::SharedStorage))>,
+ KernelSchedule>::CollectiveOp;
+
+ using GemmKernelShuffled = cutlass::gemm::kernel::GemmUniversal<
+ ProblemShape, CollectiveMainloopShuffled, CollectiveEpilogue>;
+
+ using GemmShuffled =
+ cutlass::gemm::device::GemmUniversalAdapter;
+
+ using StrideC = typename GemmKernelShuffled::InternalStrideC;
+ using StrideD = typename GemmKernelShuffled::InternalStrideD;
+
+ using StrideC_ref = cutlass::detail::TagToStrideC_t;
+ using StrideD_ref = cutlass::detail::TagToStrideC_t;
+ using StrideS = typename CollectiveMainloopShuffled::StrideScale;
+ using StrideS_ref = cutlass::detail::TagToStrideB_t;
+
+ // static asserts for passing in strides/layouts
+ // pack to 2x int64
+ static_assert(sizeof(StrideS) == 2 * sizeof(int64_t));
+ // pack to 3xint32,
+ static_assert(sizeof(LayoutB_Reordered) % sizeof(int32_t) == 0,
+ "LayoutB_Reordered size must be divisible by 4 bytes");
+
+ static void grouped_mm(
+ torch::Tensor& out_tensors, const torch::Tensor& a_tensors,
+ const torch::Tensor& b_tensors, const torch::Tensor& a_scales,
+ const torch::Tensor& b_scales, const torch::Tensor& b_group_scales,
+ const int64_t b_group_size, const torch::Tensor& expert_offsets,
+ const torch::Tensor& problem_sizes_torch, const torch::Tensor& a_strides,
+ const torch::Tensor& b_strides, const torch::Tensor& c_strides,
+ const torch::Tensor& group_scale_strides) {
+ auto device = a_tensors.device();
+ auto device_id = device.index();
+ const at::cuda::OptionalCUDAGuard device_guard(device);
+ auto stream = at::cuda::getCurrentCUDAStream(device_id);
+
+ int num_experts = static_cast(expert_offsets.size(0));
+ int n = static_cast(b_tensors.size(1));
+ int k = static_cast(b_tensors.size(2)) * PackFactor;
+
+ auto options_int =
+ torch::TensorOptions().dtype(torch::kInt64).device(device);
+ torch::Tensor a_ptrs = torch::empty(num_experts, options_int);
+ torch::Tensor b_ptrs = torch::empty(num_experts, options_int);
+ torch::Tensor out_ptrs = torch::empty(num_experts, options_int);
+ torch::Tensor a_scales_ptrs = torch::empty(num_experts, options_int);
+ torch::Tensor b_scales_ptrs = torch::empty(num_experts, options_int);
+ torch::Tensor b_group_scales_ptrs = torch::empty(num_experts, options_int);
+
+ // get the correct offsets to pass to gemm
+ run_get_group_gemm_starts(expert_offsets, a_ptrs, b_ptrs, out_ptrs,
+ a_scales_ptrs, b_scales_ptrs, b_group_scales_ptrs,
+ a_tensors, b_tensors, out_tensors, a_scales,
+ b_scales, b_group_scales, b_group_size);
+
+ // construct args
+ using Args = typename GemmShuffled::Arguments;
+ using MainloopArguments = typename GemmKernelShuffled::MainloopArguments;
+ using EpilogueArguments = typename GemmKernelShuffled::EpilogueArguments;
+ Args arguments;
+
+ ProblemShape::UnderlyingProblemShape* problem_sizes_as_shapes =
+ static_cast(
+ problem_sizes_torch.data_ptr());
+ ProblemShape prob_shape{num_experts, problem_sizes_as_shapes, nullptr};
+
+ // SwapAB so B operands come first
+ MainloopArguments mainloop_arguments{
+ static_cast(b_ptrs.data_ptr()),
+ static_cast(b_strides.data_ptr()),
+ static_cast(a_ptrs.data_ptr()),
+ static_cast(a_strides.data_ptr()),
+ static_cast**>(
+ b_group_scales_ptrs.data_ptr()),
+ static_cast(group_scale_strides.data_ptr()),
+ static_cast(b_group_size)};
+
+ EpilogueArguments epilogue_arguments{
+ // since we are doing SwapAB the channel scales comes first, then token
+ // scales
+ ChTokScalesEpilogue::prepare_args( // see ScaledEpilogueArray
+ static_cast(
+ b_scales_ptrs.data_ptr()), // per-channel
+ static_cast(
+ a_scales_ptrs.data_ptr()), // per-token
+ true, true),
+ nullptr, // C
+ static_cast(c_strides.data_ptr()), // C
+ static_cast(out_ptrs.data_ptr()), // D
+ static_cast(c_strides.data_ptr()) // D
+ };
+
+ static const cutlass::KernelHardwareInfo hw_info{
+ device_id,
+ cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
+ device_id)};
+
+ arguments = Args{cutlass::gemm::GemmUniversalMode::kGrouped, prob_shape,
+ mainloop_arguments, epilogue_arguments, hw_info};
+
+ // Allocate workspace
+ size_t workspace_size = GemmShuffled::get_workspace_size(arguments);
+ torch::Tensor workspace =
+ torch::empty(workspace_size,
+ torch::TensorOptions().dtype(torch::kU8).device(device));
+
+ // Run GEMM
+ GemmShuffled gemm;
+ CUTLASS_CHECK(gemm.can_implement(arguments));
+ CUTLASS_CHECK(gemm.initialize(arguments, workspace.data_ptr(), stream));
+ CUTLASS_CHECK(gemm.run(stream));
+ }
+};
+
+// ----------------------------------------------------------------------------
+// Kernel instantiations and dispatch logic
+// ----------------------------------------------------------------------------
+using Coop = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperative;
+using CoopEpi = cutlass::epilogue::PtrArrayTmaWarpSpecializedCooperative;
+
+// Kernel_TileShape_ClusterShape_Schedule
+using Kernel_128x16_1x1x1_Coop =
+ W4A8GroupedGemmKernel, Shape<_1, _1, _1>, Coop, CoopEpi>;
+using Kernel_128x16_2x1x1_Coop =
+ W4A8GroupedGemmKernel, Shape<_2, _1, _1>, Coop, CoopEpi>;
+
+using Kernel_256x16_1x1x1_Coop =
+ W4A8GroupedGemmKernel, Shape<_1, _1, _1>, Coop, CoopEpi>;
+using Kernel_256x16_2x1x1_Coop =
+ W4A8GroupedGemmKernel, Shape<_2, _1, _1>, Coop, CoopEpi>;
+
+using Kernel_256x32_1x1x1_Coop =
+ W4A8GroupedGemmKernel, Shape<_1, _1, _1>, Coop, CoopEpi>;
+using Kernel_256x32_2x1x1_Coop =
+ W4A8GroupedGemmKernel, Shape<_2, _1, _1>, Coop, CoopEpi>;
+
+using Kernel_256x64_1x1x1_Coop =
+ W4A8GroupedGemmKernel, Shape<_1, _1, _1>, Coop, CoopEpi>;
+using Kernel_256x64_2x1x1_Coop =
+ W4A8GroupedGemmKernel, Shape<_2, _1, _1>, Coop, CoopEpi>;
+
+using Kernel_256x128_1x1x1_Coop =
+ W4A8GroupedGemmKernel, Shape<_1, _1, _1>, Coop, CoopEpi>;
+using Kernel_256x128_2x1x1_Coop =
+ W4A8GroupedGemmKernel, Shape<_2, _1, _1>, Coop, CoopEpi>;
+
+using Kernel_128x256_2x1x1_Coop =
+ W4A8GroupedGemmKernel, Shape<_2, _1, _1>, Coop, CoopEpi>;
+
+void mm_dispatch(
+ torch::Tensor& out_tensors, const torch::Tensor& a_tensors,
+ const torch::Tensor& b_tensors, const torch::Tensor& a_scales,
+ const torch::Tensor& b_scales, const torch::Tensor& b_group_scales,
+ const int64_t b_group_size, const torch::Tensor& expert_offsets,
+ const torch::Tensor& problem_sizes, const torch::Tensor& a_strides,
+ const torch::Tensor& b_strides, const torch::Tensor& c_strides,
+ const torch::Tensor& group_scale_strides, const std::string& schedule) {
+ if (schedule == "Kernel_128x16_1x1x1_Coop") {
+ Kernel_128x16_1x1x1_Coop::grouped_mm(
+ out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+ b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+ c_strides, group_scale_strides);
+ } else if (schedule == "Kernel_128x16_2x1x1_Coop") {
+ Kernel_128x16_2x1x1_Coop::grouped_mm(
+ out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+ b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+ c_strides, group_scale_strides);
+ } else if (schedule == "Kernel_256x16_1x1x1_Coop") {
+ Kernel_256x16_1x1x1_Coop::grouped_mm(
+ out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+ b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+ c_strides, group_scale_strides);
+ } else if (schedule == "Kernel_256x16_2x1x1_Coop") {
+ Kernel_256x16_2x1x1_Coop::grouped_mm(
+ out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+ b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+ c_strides, group_scale_strides);
+ } else if (schedule == "Kernel_256x32_1x1x1_Coop") {
+ Kernel_256x32_1x1x1_Coop::grouped_mm(
+ out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+ b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+ c_strides, group_scale_strides);
+ } else if (schedule == "Kernel_256x32_2x1x1_Coop") {
+ Kernel_256x32_2x1x1_Coop::grouped_mm(
+ out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+ b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+ c_strides, group_scale_strides);
+ } else if (schedule == "Kernel_256x64_1x1x1_Coop") {
+ Kernel_256x64_1x1x1_Coop::grouped_mm(
+ out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+ b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+ c_strides, group_scale_strides);
+ } else if (schedule == "Kernel_256x64_2x1x1_Coop") {
+ Kernel_256x64_2x1x1_Coop::grouped_mm(
+ out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+ b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+ c_strides, group_scale_strides);
+ } else if (schedule == "Kernel_256x128_1x1x1_Coop") {
+ Kernel_256x128_1x1x1_Coop::grouped_mm(
+ out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+ b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+ c_strides, group_scale_strides);
+ } else if (schedule == "Kernel_256x128_2x1x1_Coop") {
+ Kernel_256x128_2x1x1_Coop::grouped_mm(
+ out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+ b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+ c_strides, group_scale_strides);
+ } else if (schedule == "Kernel_128x256_2x1x1_Coop") {
+ Kernel_128x256_2x1x1_Coop::grouped_mm(
+ out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
+ b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
+ c_strides, group_scale_strides);
+ } else {
+ TORCH_CHECK(false,
+ "cutlass_w4a8_moe_mm: unknown schedule string: ", schedule);
+ }
+}
+
+void mm(torch::Tensor& out_tensors, const torch::Tensor& a_tensors,
+ const torch::Tensor& b_tensors, const torch::Tensor& a_scales,
+ const torch::Tensor& b_scales, const torch::Tensor& b_group_scales,
+ const int64_t b_group_size, const torch::Tensor& expert_offsets,
+ const torch::Tensor& problem_sizes, const torch::Tensor& a_strides,
+ const torch::Tensor& b_strides, const torch::Tensor& c_strides,
+ const torch::Tensor& group_scale_strides,
+ std::optional maybe_schedule) {
+ // user has specified a schedule
+ if (maybe_schedule) {
+ mm_dispatch(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
+ b_group_scales, b_group_size, expert_offsets, problem_sizes,
+ a_strides, b_strides, c_strides, group_scale_strides,
+ *maybe_schedule);
+ return;
+ }
+
+ // use heuristic
+ int m_full = a_tensors.size(0);
+ int n = b_tensors.size(1);
+ int k = b_tensors.size(2) * PackFactor; // logical k
+ int num_experts = b_tensors.size(0);
+ // per-expert batch size assuming uniform distribution
+ int m_expert = m_full / num_experts;
+
+ std::string schedule;
+ if (m_expert <= 16) {
+ schedule = "Kernel_128x16_2x1x1_Coop";
+ } else if (m_expert <= 32) {
+ schedule = "Kernel_256x32_1x1x1_Coop";
+ } else if (m_expert <= 64) {
+ schedule = "Kernel_256x64_1x1x1_Coop";
+ } else if (m_expert <= 128) {
+ schedule = "Kernel_256x128_2x1x1_Coop";
+ } else { // m_expert > 128
+ schedule = "Kernel_128x256_2x1x1_Coop";
+ }
+
+ mm_dispatch(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
+ b_group_scales, b_group_size, expert_offsets, problem_sizes,
+ a_strides, b_strides, c_strides, group_scale_strides, schedule);
+}
+
+std::tuple encode_and_reorder_int4b(
+ torch::Tensor const& b_tensors) {
+ TORCH_CHECK(b_tensors.dtype() == torch::kInt32);
+ TORCH_CHECK(b_tensors.dim() == 3); // (experts, n, k)
+ TORCH_CHECK(b_tensors.is_contiguous());
+ TORCH_CHECK(b_tensors.is_cuda());
+
+ int n = static_cast(b_tensors.size(1));
+ int k = static_cast(b_tensors.size(2)) * PackFactor; // logical k
+
+ // CUTLASS reorder_tensor requires k % 256 == 0 and n % 16 == 0.
+ // These misalignments cause silent OOB unless run under Compute Sanitizer.
+ TORCH_CHECK(k % 256 == 0, "logical k must be divisible by 256");
+ TORCH_CHECK(n % 16 == 0, "n must be divisible by 16");
+
+ // we will store the layout to an int32 tensor;
+ // this is the number of elements we need per layout
+ constexpr size_t layout_width = sizeof(LayoutB_Reordered) / sizeof(int32_t);
+
+ torch::Tensor b_tensors_packed = torch::empty_like(b_tensors);
+ int num_experts = static_cast(b_tensors.size(0));
+
+ auto b_ptr = static_cast(b_tensors.const_data_ptr());
+ auto b_packed_ptr = static_cast(b_tensors_packed.data_ptr());
+
+ // multiply by ull so result does not overflow int32
+ size_t num_int4_elems = 1ull * num_experts * n * k;
+ bool ok = vllm::cutlass_w4a8_utils::unified_encode_int4b(b_ptr, b_packed_ptr,
+ num_int4_elems);
+ TORCH_CHECK(ok, "unified_encode_int4b failed");
+
+ // construct the layout once; assumes each expert has the same layout
+ using LayoutType = LayoutB_Reordered;
+ std::vector layout_B_reordered_host(num_experts);
+ auto stride_B = cutlass::make_cute_packed_stride(StrideB{}, {n, k, Int<1>{}});
+ auto shape_B = cute::make_shape(n, k, Int<1>{});
+ auto layout_B = make_layout(shape_B, stride_B);
+ LayoutType layout_B_reordered = tile_to_shape(LayoutAtomQuant{}, shape_B);
+
+ // reorder weights for each expert
+ for (int i = 0; i < num_experts; i++) {
+ // since the storage type of int4b is 1 byte but one element is 4 bits
+ // we need to adjust the offset
+ int64_t offset =
+ 1ull * i * n * k * cutlass::sizeof_bits::value / 8;
+ cutlass::reorder_tensor(b_packed_ptr + offset, layout_B,
+ layout_B_reordered);
+ }
+
+ // save the packed layout to torch tensor so we can re-use it
+ auto cpu_opts =
+ torch::TensorOptions().dtype(torch::kInt32).device(torch::kCPU);
+ torch::Tensor layout_cpu =
+ torch::empty({num_experts, layout_width}, cpu_opts);
+
+ int32_t* layout_data = layout_cpu.data_ptr();
+ for (int i = 0; i < num_experts; ++i) {
+ std::memcpy(layout_data + i * layout_width, // dst (int32*)
+ &layout_B_reordered, // src (LayoutType*)
+ sizeof(LayoutType)); // number of bytes
+ }
+
+ torch::Tensor packed_layout =
+ layout_cpu.to(b_tensors.device(), /*non_blocking=*/false);
+
+ return {b_tensors_packed, packed_layout};
+}
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+ m.impl("cutlass_w4a8_moe_mm", &mm);
+ m.impl("cutlass_encode_and_reorder_int4b_grouped", &encode_and_reorder_int4b);
+}
+
+} // namespace vllm::cutlass_w4a8_moe
+/////////////////////////////////////////////////////////////////////////////////////////////////
\ No newline at end of file
diff --git a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
index 2d1568b08651c..f77af06cd6c08 100644
--- a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
+++ b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
@@ -7,6 +7,7 @@
#include
#include
#include "cutlass_extensions/torch_utils.hpp"
+#include "w4a8_utils.cuh"
#include "core/registration.h"
@@ -395,71 +396,6 @@ torch::Tensor pack_scale_fp8(torch::Tensor const& scales) {
return packed_scales;
}
-/*
- GPU-accelerated implementation of cutlass::unified_encode_int4b.
- Constructs a lookup table in constant memory to map 8 bits
- (two 4-bit values) at a time. Assumes memory is contiguous
- and pointers are 16-byte aligned.
-*/
-__constant__ uint8_t kNibbleLUT[256];
-
-__global__ void unified_encode_int4b_device(const uint8_t* in, uint8_t* out,
- size_t nbytes) {
- constexpr size_t V = sizeof(uint4); // 16 bytes
- const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
- const size_t nthreads = size_t(gridDim.x) * blockDim.x;
- const size_t nvec = nbytes / V;
-
- // 1-D grid-stride loop over 16-byte chunks
- for (size_t vec = tid; vec < nvec; vec += nthreads) {
- uint4 v = reinterpret_cast(in)[vec];
- uint8_t* b = reinterpret_cast(&v);
-#pragma unroll
- for (int i = 0; i < int(V); ++i) b[i] = kNibbleLUT[b[i]];
- reinterpret_cast