Merge remote-tracking branch 'origin/main' into conftest/generate_beam_search/simplify-return-value

This commit is contained in:
Chukwuma Nwaugha 2025-12-11 19:39:04 +00:00
commit b13842b68f
347 changed files with 12123 additions and 3325 deletions

24
.buildkite/ci_config.yaml Normal file
View File

@ -0,0 +1,24 @@
name: vllm_ci
job_dirs:
- ".buildkite/test_areas"
- ".buildkite/image_build"
run_all_patterns:
- "docker/Dockerfile"
- "CMakeLists.txt"
- "requirements/common.txt"
- "requirements/cuda.txt"
- "requirements/build.txt"
- "requirements/test.txt"
- "setup.py"
- "csrc/"
- "cmake/"
run_all_exclude_patterns:
- "docker/Dockerfile."
- "csrc/cpu/"
- "csrc/rocm/"
- "cmake/hipify.py"
- "cmake/cpu_extension.cmake"
registries: public.ecr.aws/q9t5s3a7
repositories:
main: "vllm-ci-postmerge-repo"
premerge: "vllm-ci-test-repo"

View File

@ -0,0 +1,56 @@
#!/bin/bash
set -e
if [[ $# -lt 8 ]]; then
echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
exit 1
fi
REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
BRANCH=$4
VLLM_USE_PRECOMPILED=$5
VLLM_MERGE_BASE_COMMIT=$6
CACHE_FROM=$7
CACHE_TO=$8
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
# docker buildx
docker buildx create --name vllm-builder --driver docker-container --use
docker buildx inspect --bootstrap
docker buildx ls
# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
exit 0
fi
if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
else
merge_base_commit_build_args=""
fi
# build
docker buildx build --file docker/Dockerfile \
--build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--build-arg USE_SCCACHE=1 \
--build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
--build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
--build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
${merge_base_commit_build_args} \
--cache-from type=registry,ref=${CACHE_FROM},mode=max \
--cache-to type=registry,ref=${CACHE_TO},mode=max \
--tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
$( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
--push \
--target test \
--progress plain .

View File

@ -0,0 +1,57 @@
group: Abuild
steps:
- label: ":docker: Build image"
key: image-build
depends_on: []
commands:
- .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 2
- exit_status: -10 # Agent was lost
limit: 2
- label: ":docker: Build CPU image"
key: image-build-cpu
depends_on: []
commands:
- .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 2
- exit_status: -10 # Agent was lost
limit: 2
- label: ":docker: Build HPU image"
soft_fail: true
depends_on: []
key: image-build-hpu
commands:
- .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 2
- exit_status: -10 # Agent was lost
limit: 2
- label: ":docker: Build CPU arm64 image"
key: cpu-arm64-image-build
depends_on: []
optional: true
commands:
- .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 2
- exit_status: -10 # Agent was lost
limit: 2

View File

@ -0,0 +1,36 @@
#!/bin/bash
set -e
if [[ $# -lt 3 ]]; then
echo "Usage: $0 <registry> <repo> <commit>"
exit 1
fi
REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
exit 0
fi
# build
docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--build-arg VLLM_CPU_AVX512BF16=true \
--build-arg VLLM_CPU_AVX512VNNI=true \
--build-arg VLLM_CPU_AMXBF16=true \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
--target vllm-test \
--progress plain .
# push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu

View File

@ -0,0 +1,33 @@
#!/bin/bash
set -e
if [[ $# -lt 3 ]]; then
echo "Usage: $0 <registry> <repo> <commit>"
exit 1
fi
REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
exit 0
fi
# build
docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
--target vllm-test \
--progress plain .
# push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu

View File

@ -0,0 +1,34 @@
#!/bin/bash
set -e
if [[ $# -lt 3 ]]; then
echo "Usage: $0 <registry> <repo> <commit>"
exit 1
fi
REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
exit 0
fi
# build
docker build \
--file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
--progress plain \
https://github.com/vllm-project/vllm-gaudi.git
# push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu

View File

@ -15,6 +15,21 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- label: "Build arm64 wheel - CUDA 13.0"
depends_on: ~
id: build-wheel-arm64-cuda-13-0
agents:
queue: arm64_cpu_queue_postmerge
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
# aarch64 build
- label: "Build arm64 CPU wheel"
depends_on: ~
@ -25,7 +40,7 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
@ -39,7 +54,7 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31"
env:
DOCKER_BUILDKIT: "1"
@ -52,7 +67,7 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"

View File

@ -372,6 +372,17 @@ if __name__ == "__main__":
print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
# keep only "official" files for a non-nightly version (specifed by cli args)
PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
if PY_VERSION_RE.match(version):
# upload-wheels.sh ensures no "dev" is in args.version
wheel_files = list(
filter(lambda x: version in x and "dev" not in x, wheel_files)
)
print(f"Non-nightly version detected, wheel files used: {wheel_files}")
else:
print("Nightly version detected, keeping all wheel files.")
# Generate index and metadata, assuming wheels and indices are stored as:
# s3://vllm-wheels/{version}/<wheel files>
# s3://vllm-wheels/<anything>/<index files>

View File

@ -36,11 +36,17 @@ function cpu_tests() {
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run model tests
docker exec cpu-test bash -c "
set -e
pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model"
# Run kernel tests
docker exec cpu-test bash -c "
set -e
pytest -x -v -s tests/kernels/test_onednn.py
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic"
# basic online serving
docker exec cpu-test bash -c '

View File

@ -47,6 +47,6 @@ docker run \
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
pytest -v -s v1/test_serial_utils.py
'

View File

@ -12,6 +12,11 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
exit 0
fi
echo "Setting up Prime-RL integration test environment..."
# Clean up any existing Prime-RL directory

View File

@ -1,73 +0,0 @@
#!/usr/bin/env bash
set -euxo pipefail
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8030}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"
wait_for_server() {
local port=$1
timeout 600 bash -c '
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
sleep 1
done'
}
MODEL="deepseek-ai/DeepSeek-V2-lite"
# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
# ROCm platform
BACKENDS=("allgather_reducescatter")
# Disable MOE padding for ROCm since it is causing eplb to fail
export VLLM_ROCM_MOE_PADDING=0
else
# Non-ROCm platform (CUDA/other)
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi
cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
kill "${SERVER_PID}" 2>/dev/null || true
for _ in {1..20}; do
kill -0 "${SERVER_PID}" 2>/dev/null || break
sleep 0.5
done
kill -9 "${SERVER_PID}" 2>/dev/null || true
fi
}
trap cleanup EXIT
for BACK in "${BACKENDS[@]}"; do
VLLM_DEEP_GEMM_WARMUP=skip \
VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \
--enforce-eager \
--tensor-parallel-size 2 \
--data-parallel-size 2 \
--enable-expert-parallel \
--enable-eplb \
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
--trust-remote-code \
--max-model-len 2048 \
--port $PORT &
SERVER_PID=$!
wait_for_server $PORT
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}_async_eplb.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY
cleanup
SERVER_PID=
sleep 1
PORT=$((PORT+1))
done

View File

@ -50,7 +50,6 @@ for BACK in "${BACKENDS[@]}"; do
--data-parallel-size 2 \
--enable-expert-parallel \
--enable-eplb \
--eplb-config '{"window_size":200,"step_interval":600}' \
--trust-remote-code \
--max-model-len 2048 \
--port $PORT &

View File

@ -1,74 +0,0 @@
#!/usr/bin/env bash
set -euxo pipefail
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8040}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"
wait_for_server() {
local port=$1
timeout 600 bash -c '
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
sleep 1
done'
}
MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
# ROCm platform
BACKENDS=("allgather_reducescatter")
# Disable MOE padding for ROCm since it is causing eplb to fail
export VLLM_ROCM_MOE_PADDING=0
else
# Non-ROCm platform (CUDA/other)
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi
cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
kill "${SERVER_PID}" 2>/dev/null || true
for _ in {1..20}; do
kill -0 "${SERVER_PID}" 2>/dev/null || break
sleep 0.5
done
kill -9 "${SERVER_PID}" 2>/dev/null || true
fi
}
trap cleanup EXIT
for BACK in "${BACKENDS[@]}"; do
VLLM_DEEP_GEMM_WARMUP=skip \
VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \
--enforce-eager \
--tensor-parallel-size 4 \
--enable-expert-parallel \
--enable-eplb \
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
--trust-remote-code \
--max-model-len 2048 \
--gpu-memory-utilization 0.9 \
--port $PORT &
SERVER_PID=$!
wait_for_server $PORT
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY
cleanup
SERVER_PID=
sleep 1
PORT=$((PORT+1))
done

View File

@ -34,9 +34,10 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
fi
wheel="${wheel_files[0]}"
# current build image uses ubuntu 20.04, which corresponds to manylinux_2_31
# default build image uses ubuntu 20.04, which corresponds to manylinux_2_31
# we also accept params as manylinux tag
# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
manylinux_version="manylinux_2_31"
manylinux_version="${1:-manylinux_2_31}"
# Rename 'linux' to the appropriate manylinux version in the wheel filename
if [[ "$wheel" != *"linux"* ]]; then
@ -96,8 +97,11 @@ if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]];
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
fi
# copy to /<pure_version>/ only if it does not have "dev" in the version
# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
if [[ "$version" != *"dev"* ]]; then
echo "Uploading indices to overwrite /$pure_version/"
echo "Re-generating indices for /$pure_version/"
rm -rf "$INDICES_OUTPUT_DIR/*"
mkdir -p "$INDICES_OUTPUT_DIR"
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
fi

View File

@ -398,7 +398,8 @@ steps:
timeout_in_minutes: 25
gpu: h100
source_file_dependencies:
- vllm/
- vllm/v1/attention
- vllm/model_executor/layers
- tests/v1/determinism/
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
@ -440,23 +441,29 @@ steps:
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
- vllm/entrypoints
- vllm/multimodal
- examples/
commands:
- pip install tensorizer # for tensorizer test
# for basic
- python3 offline_inference/basic/chat.py
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 offline_inference/basic/chat.py
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
# for multi-modal models
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
# for pooling models
- python3 pooling/pooling/vision_language_pooling.py --seed 0
# for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@ -718,6 +725,18 @@ steps:
- uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: LM Eval Small Models # 53min
timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
autorun_on_main: true
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
- label: OpenAI API correctness # 10min
timeout_in_minutes: 15
mirror_hardwares: [amdexperimental, amdproduction]
@ -727,7 +746,7 @@ steps:
- csrc/
- vllm/entrypoints/openai/
- vllm/model_executor/models/whisper.py
commands: # LMEval
commands: # LMEval+Transcription WER check
# Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
- pytest -s entrypoints/openai/correctness/
@ -963,6 +982,19 @@ steps:
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min
timeout_in_minutes: 180
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- vllm/multimodal/
- vllm/inputs/
- vllm/v1/core/
commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
- label: Multi-Modal Models Test (Extended) 1 # 60min
timeout_in_minutes: 120
mirror_hardwares: [amdexperimental]
@ -1098,7 +1130,6 @@ steps:
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- vllm/model_executor/layers/fused_moe/layer.py
- tests/compile/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py
@ -1132,12 +1163,25 @@ steps:
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/distributed/test_fusions_e2e.py
- tests/compile/fullgraph/test_full_graph.py
commands:
- nvidia-smi
# Run all e2e fusion tests
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
- label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
gpu: b200
optional: true # run on nightlies
source_file_dependencies:
- tests/evals/gpt_oss
- vllm/model_executor/models/gpt_oss.py
- vllm/model_executor/layers/quantization/mxfp4.py
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: Blackwell Quantized MoE Test
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
@ -1155,6 +1199,16 @@ steps:
commands:
- pytest -s -v tests/quantization/test_blackwell_moe.py
- label: Blackwell LM Eval Small Models
timeout_in_minutes: 120
gpu: b200
optional: true # run on nightlies
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
##### 1 GPU test #####
##### multi gpus test #####
@ -1397,6 +1451,39 @@ steps:
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py
- label: LM Eval Large Models # optional
gpu: a100
optional: true
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
# grade: Blocking
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
##### H100 test #####
- label: LM Eval Large Models (H100) # optional
gpu: h100
optional: true
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
# grade: Blocking
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
##### H200 test #####
- label: Distributed Tests (H200) # optional
mirror_hardwares: [amdexperimental]
@ -1440,29 +1527,6 @@ steps:
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
- label: Blackwell LM Eval Small Models
timeout_in_minutes: 120
gpu: b200
optional: true # run on nightlies
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
- label: Multi-Modal Accuracy Eval (Small Models) # 10min
timeout_in_minutes: 70
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- vllm/multimodal/
- vllm/inputs/
- vllm/v1/core/
commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
- label: LM Eval Large Models (4 Card)
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
@ -1478,21 +1542,6 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
- label: LM Eval Large Models (H100) # optional
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
- label: ROCm LM Eval Large Models (8 Card)
mirror_hardwares: [amdproduction]
agent_pool: mi325_8
@ -1517,6 +1566,20 @@ steps:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min
mirror_hardwares: [amdexperimental]
agent_pool: mi325_2
# grade: Blocking
timeout_in_minutes: 30
optional: true
num_gpus: 2
working_dir: "/vllm-workspace"
source_file_dependencies:
- vllm/
- .buildkite/scripts/run-prime-rl-test.sh
commands:
- bash .buildkite/scripts/run-prime-rl-test.sh
- label: DeepSeek V2-Lite Accuracy
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
@ -1550,17 +1613,26 @@ steps:
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min
- label: DeepSeek V2-Lite Async EPLB Accuracy
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
agent_pool: mi325_2
agent_pool: mi325_4
# grade: Blocking
timeout_in_minutes: 30
gpu: h100
optional: true
num_gpus: 2
num_gpus: 4
working_dir: "/vllm-workspace"
source_file_dependencies:
- vllm/
- .buildkite/scripts/run-prime-rl-test.sh
commands:
- bash .buildkite/scripts/run-prime-rl-test.sh
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
# grade: Blocking
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040

View File

@ -468,7 +468,9 @@ steps:
# tests covered elsewhere.
# Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
# However, find does not normally propagate error codes, so we combine it with xargs
# (using -0 for proper path handling)
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
- label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30
@ -482,7 +484,9 @@ steps:
# as it is a heavy test that is covered in other steps.
# Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
# However, find does not normally propagate error codes, so we combine it with xargs
# (using -0 for proper path handling)
- "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
- label: PyTorch Fullgraph Test # 27min
timeout_in_minutes: 40
@ -1375,22 +1379,4 @@ steps:
num_gpus: 2
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
- label: DeepSeek V2-Lite Async EPLB Accuracy
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1

View File

@ -0,0 +1,21 @@
group: Attention
depends_on:
- image-build
steps:
- label: V1 attention (H100)
timeout_in_minutes: 30
gpu: h100
source_file_dependencies:
- vllm/v1/attention
- tests/v1/attention
commands:
- pytest -v -s v1/attention
- label: V1 attention (B200)
timeout_in_minutes: 30
gpu: b200
source_file_dependencies:
- vllm/v1/attention
- tests/v1/attention
commands:
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this

View File

@ -0,0 +1,16 @@
group: Basic Correctness
depends_on:
- image-build
steps:
- label: Basic Correctness
timeout_in_minutes: 30
source_file_dependencies:
- vllm/
- tests/basic_correctness/test_basic_correctness
- tests/basic_correctness/test_cpu_offload
- tests/basic_correctness/test_cumem.py
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s basic_correctness/test_cumem.py
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py

View File

@ -0,0 +1,19 @@
group: Benchmarks
depends_on:
- image-build
steps:
- label: Benchmarks
timeout_in_minutes: 20
working_dir: "/vllm-workspace/.buildkite"
source_file_dependencies:
- benchmarks/
commands:
- bash scripts/run-benchmarks.sh
- label: Benchmarks CLI Test
timeout_in_minutes: 20
source_file_dependencies:
- vllm/
- tests/benchmarks/
commands:
- pytest -v -s benchmarks/

View File

@ -0,0 +1,57 @@
group: Compile
depends_on:
- image-build
steps:
- label: Fusion and Compile Tests (B200)
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/v1/worker/
- vllm/v1/cudagraph_dispatcher.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py
- tests/compile/distributed/test_fusions_e2e.py
- tests/compile/fullgraph/test_full_graph.py
commands:
- nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# Wrap with quotes to escape yaml
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
- label: Fusion E2E (2 GPUs)(B200)
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
optional: true
num_gpus: 2
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/distributed/test_fusions_e2e.py
commands:
- nvidia-smi
# Run all e2e fusion tests
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py

View File

@ -0,0 +1,22 @@
group: CUDA
depends_on:
- image-build
steps:
- label: Platform Tests (CUDA)
timeout_in_minutes: 15
source_file_dependencies:
- vllm/
- tests/cuda
commands:
- pytest -v -s cuda/test_cuda_context.py
- label: Cudagraph
timeout_in_minutes: 20
source_file_dependencies:
- tests/v1/cudagraph
- vllm/v1/cudagraph_dispatcher.py
- vllm/config/compilation.py
- vllm/compilation
commands:
- pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
- pytest -v -s v1/cudagraph/test_cudagraph_mode.py

View File

@ -0,0 +1,199 @@
group: Distributed
depends_on:
- image-build
steps:
- label: Distributed Comm Ops
timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/distributed
- tests/distributed
commands:
- pytest -v -s distributed/test_comm_ops.py
- pytest -v -s distributed/test_shm_broadcast.py
- pytest -v -s distributed/test_shm_buffer.py
- pytest -v -s distributed/test_shm_storage.py
- label: Distributed (2 GPUs)
timeout_in_minutes: 90
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/compilation/
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/worker/worker_base.py
- vllm/v1/engine/
- vllm/v1/worker/
- tests/compile/fullgraph/test_basic_correctness.py
- tests/compile/test_wrapper.py
- tests/distributed/
- tests/entrypoints/llm/test_collective_rpc.py
- tests/v1/distributed
- tests/v1/entrypoints/openai/test_multi_api_servers.py
- tests/v1/shutdown
- tests/v1/worker/test_worker_memory_snapshot.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- pytest -v -s ./compile/fullgraph/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- pytest -v -s distributed/test_sequence_parallel.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
- label: Distributed Tests (4 GPUs)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/
- tests/distributed/test_utils
- tests/distributed/test_pynccl
- tests/distributed/test_events
- tests/compile/fullgraph/test_basic_correctness.py
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py
- tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
- tests/distributed/test_symm_mem_allreduce.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
# test with torchrun tp=2 and external_dp=2
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with torchrun tp=2 and pp=2
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with torchrun tp=4 and dp=1
- TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with torchrun tp=2, pp=2 and dp=1
- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with torchrun tp=1 and dp=4 with ep
- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with torchrun tp=2 and dp=2 with ep
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py
- pytest -v -s compile/fullgraph/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py
- pytest -v -s distributed/test_symm_mem_allreduce.py
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
- cd ../examples/offline_inference
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- label: Distributed Tests (8 GPUs)(H100)
timeout_in_minutes: 10
gpu: h100
num_gpus: 8
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- examples/offline_inference/torchrun_dp_example.py
- vllm/config/parallel.py
- vllm/distributed/
- vllm/v1/engine/llm_engine.py
- vllm/v1/executor/uniproc_executor.py
- vllm/v1/worker/gpu_worker.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
# test with torchrun tp=2 and dp=4 with ep
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
- label: Distributed Tests (4 GPUs)(A100)
gpu: a100
optional: true
num_gpus: 4
source_file_dependencies:
- vllm/
commands:
# NOTE: don't test llama model here, it seems hf implementation is buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py
- torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py
- label: Distributed Tests (2 GPUs)(H200)
gpu: h200
optional: true
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/v1/distributed/test_dbo.py
- label: Distributed Tests (2 GPUs)(B200)
gpu: b200
optional: true
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- pytest -v -s tests/distributed/test_context_parallel.py
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
- pytest -v -s tests/v1/distributed/test_dbo.py
- label: 2 Node Test (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_gpus: 2
num_nodes: 2
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/model_executor/models/
- tests/distributed/
- tests/examples/offline_inference/data_parallel.py
commands:
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
- label: Distributed NixlConnector PD accuracy (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
- label: Pipeline + Context Parallelism (4 GPUs))
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/model_executor/models/
- tests/distributed/
commands:
- pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py

View File

@ -0,0 +1,59 @@
group: E2E Integration
depends_on:
- image-build
steps:
- label: DeepSeek V2-Lite Accuracy
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
- label: Qwen3-30B-A3B-FP8-block Accuracy
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
timeout_in_minutes: 60
gpu: b200
optional: true
num_gpus: 2
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
- label: Prime-RL Integration (2 GPUs)
timeout_in_minutes: 30
optional: true
num_gpus: 2
working_dir: "/vllm-workspace"
source_file_dependencies:
- vllm/
- .buildkite/scripts/run-prime-rl-test.sh
commands:
- bash .buildkite/scripts/run-prime-rl-test.sh
- label: DeepSeek V2-Lite Async EPLB Accuracy
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040

View File

@ -0,0 +1,26 @@
group: Engine
depends_on:
- image-build
steps:
- label: Engine
timeout_in_minutes: 15
source_file_dependencies:
- vllm/
- tests/engine
- tests/test_sequence
- tests/test_config
- tests/test_logger
- tests/test_vllm_port
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
- label: V1 e2e + engine
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/v1
commands:
# TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e
- pytest -v -s v1/engine

View File

@ -0,0 +1,68 @@
group: Entrypoints
depends_on:
- image-build
steps:
- label: Entrypoints Unit Tests
timeout_in_minutes: 10
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/entrypoints
- tests/entrypoints/
commands:
- pytest -v -s entrypoints/openai/tool_parsers
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
- label: Entrypoints Integration (LLM)
timeout_in_minutes: 40
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/
- tests/entrypoints/llm
- tests/entrypoints/offline_mode
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Entrypoints Integration (API Server)
timeout_in_minutes: 130
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/
- tests/entrypoints/openai
- tests/entrypoints/test_chat_utils
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
- pytest -v -s entrypoints/test_chat_utils.py
- label: Entrypoints Integration (Pooling)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/
- tests/entrypoints/pooling
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/pooling
- label: Entrypoints V1
timeout_in_minutes: 50
source_file_dependencies:
- vllm/
- tests/v1
commands:
- pytest -v -s v1/entrypoints
- label: OpenAI API Correctness
timeout_in_minutes: 30
source_file_dependencies:
- csrc/
- vllm/entrypoints/openai/
- vllm/model_executor/models/whisper.py
commands: # LMEval+Transcription WER check
- pytest -s entrypoints/openai/correctness/

View File

@ -0,0 +1,23 @@
group: Expert Parallelism
depends_on:
- image-build
steps:
- label: EPLB Algorithm
timeout_in_minutes: 15
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/distributed/eplb
- tests/distributed/test_eplb_algo.py
commands:
- pytest -v -s distributed/test_eplb_algo.py
- label: EPLB Execution
timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/eplb
- tests/distributed/test_eplb_execute.py
commands:
- pytest -v -s distributed/test_eplb_execute.py
- pytest -v -s distributed/test_eplb_spec_decode.py

View File

@ -0,0 +1,117 @@
group: Kernels
depends_on:
- image-build
steps:
- label: Kernels Core Operation Test
timeout_in_minutes: 75
source_file_dependencies:
- csrc/
- tests/kernels/core
- tests/kernels/test_top_k_per_row.py
commands:
- pytest -v -s kernels/core kernels/test_top_k_per_row.py
- label: Kernels Attention Test %N
timeout_in_minutes: 35
source_file_dependencies:
- csrc/attention/
- vllm/attention
- vllm/v1/attention
- tests/kernels/attention
commands:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
- label: Kernels Quantization Test %N
timeout_in_minutes: 90
source_file_dependencies:
- csrc/quantization/
- vllm/model_executor/layers/quantization
- tests/kernels/quantization
commands:
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
- label: Kernels MoE Test %N
timeout_in_minutes: 60
source_file_dependencies:
- csrc/quantization/cutlass_w8a8/moe/
- csrc/moe/
- tests/kernels/moe
- vllm/model_executor/layers/fused_moe/
- vllm/distributed/device_communicators/
- vllm/envs.py
- vllm/config
commands:
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
- label: Kernels Mamba Test
timeout_in_minutes: 45
source_file_dependencies:
- csrc/mamba/
- tests/kernels/mamba
- vllm/model_executor/layers/mamba/ops
commands:
- pytest -v -s kernels/mamba
- label: Kernels DeepGEMM Test (H100)
timeout_in_minutes: 45
gpu: h100
num_gpus: 1
source_file_dependencies:
- tools/install_deepgemm.sh
- vllm/utils/deep_gemm.py
- vllm/model_executor/layers/fused_moe
- vllm/model_executor/layers/quantization
- tests/kernels/quantization/test_block_fp8.py
- tests/kernels/moe/test_deepgemm.py
- tests/kernels/moe/test_batched_deepgemm.py
- tests/kernels/attention/test_deepgemm_attention.py
commands:
- pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
- pytest -v -s kernels/moe/test_deepgemm.py
- pytest -v -s kernels/moe/test_batched_deepgemm.py
- pytest -v -s kernels/attention/test_deepgemm_attention.py
- label: Kernels (B200)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/"
gpu: b200
# optional: true
source_file_dependencies:
- csrc/quantization/fp4/
- csrc/attention/mla/
- csrc/quantization/cutlass_w8a8/moe/
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py
- vllm/v1/attention/backends/mla/flashinfer_mla.py
- vllm/platforms/cuda.py
- vllm/attention/selector.py
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
# Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_attention_selector.py
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
# Quantization
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py

View File

@ -0,0 +1,46 @@
group: LM Eval
depends_on:
- image-build
steps:
- label: LM Eval Small Models
timeout_in_minutes: 75
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
autorun_on_main: true
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
- label: LM Eval Large Models (4 GPUs)(A100)
gpu: a100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
- label: LM Eval Large Models (4 GPUs)(H100)
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
- label: LM Eval Small Models (B200)
timeout_in_minutes: 120
gpu: b200
optional: true
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1

View File

@ -0,0 +1,31 @@
group: LoRA
depends_on:
- image-build
steps:
- label: LoRA %N
timeout_in_minutes: 30
source_file_dependencies:
- vllm/lora
- tests/lora
commands:
- pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
parallelism: 4
- label: LoRA TP (Distributed)
timeout_in_minutes: 30
num_gpus: 4
source_file_dependencies:
- vllm/lora
- tests/lora
commands:
# FIXIT: find out which code initialize cuda before running the test
# before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
# There is some Tensor Parallelism related processing logic in LoRA that
# requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_llm_with_multi_loras.py
- pytest -v -s -x lora/test_olmoe_tp.py
- pytest -v -s -x lora/test_gptoss_tp.py

View File

@ -0,0 +1,163 @@
group: Miscellaneous
depends_on:
- image-build
steps:
- label: V1 Others
timeout_in_minutes: 60
source_file_dependencies:
- vllm/
- tests/v1
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
# split the test to avoid interference
- pytest -v -s -m 'not cpu_test' v1/core
- pytest -v -s v1/executor
- pytest -v -s v1/kv_offload
- pytest -v -s v1/sample
- pytest -v -s v1/logits_processors
- pytest -v -s v1/worker
- pytest -v -s v1/spec_decode
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'not cpu_test' v1/metrics
- pytest -v -s v1/test_oracle.py
- pytest -v -s v1/test_request.py
- pytest -v -s v1/test_outputs.py
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
- label: V1 Others (CPU)
depends_on: ~
source_file_dependencies:
- vllm/
- tests/v1
no_gpu: true
commands:
# split the test to avoid interference
- pytest -v -s -m 'cpu_test' v1/core
- pytest -v -s v1/structured_output
- pytest -v -s v1/test_serial_utils.py
- pytest -v -s -m 'cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'cpu_test' v1/metrics
- label: Regression
timeout_in_minutes: 20
source_file_dependencies:
- vllm/
- tests/test_regression
commands:
- pip install modelscope
- pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
- label: Examples
timeout_in_minutes: 45
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
- vllm/entrypoints
- vllm/multimodal
- examples/
commands:
- pip install tensorizer # for tensorizer test
- python3 offline_inference/basic/chat.py # for basic
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
# for multi-modal models
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
# for pooling models
- python3 pooling/pooling/vision_language_pooling.py --seed 0
# for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
- label: Metrics, Tracing (2 GPUs)
timeout_in_minutes: 20
num_gpus: 2
source_file_dependencies:
- vllm/
- tests/v1/tracing
commands:
- "pip install \
'opentelemetry-sdk>=1.26.0' \
'opentelemetry-api>=1.26.0' \
'opentelemetry-exporter-otlp>=1.26.0' \
'opentelemetry-semantic-conventions-ai>=0.4.1'"
- pytest -v -s v1/tracing
- label: Python-only Installation
depends_on: ~
timeout_in_minutes: 20
source_file_dependencies:
- tests/standalone_tests/python_only_compile.sh
- setup.py
commands:
- bash standalone_tests/python_only_compile.sh
- label: Async Engine, Inputs, Utils, Worker
timeout_in_minutes: 50
source_file_dependencies:
- vllm/
- tests/multimodal
- tests/utils_
commands:
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_
- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
depends_on: ~
timeout_in_minutes: 20
source_file_dependencies:
- vllm/
- tests/test_inputs.py
- tests/test_outputs.py
- tests/multimodal
- tests/standalone_tests/lazy_imports.py
- tests/tokenizers_
- tests/transformers_utils
- tests/config
no_gpu: true
commands:
- python3 standalone_tests/lazy_imports.py
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s tokenizers_
- pytest -v -s transformers_utils
- pytest -v -s config
- label: GPT-OSS Eval (B200)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
gpu: b200
optional: true
source_file_dependencies:
- tests/evals/gpt_oss
- vllm/model_executor/models/gpt_oss.py
- vllm/model_executor/layers/quantization/mxfp4.py
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: Batch Invariance (H100)
timeout_in_minutes: 25
gpu: h100
source_file_dependencies:
- vllm/v1/attention
- vllm/model_executor/layers
- tests/v1/determinism/
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pip install pytest-timeout pytest-forked
- pytest -v -s v1/determinism/test_batch_invariance.py
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py

View File

@ -0,0 +1,17 @@
group: Model Executor
depends_on:
- image-build
steps:
- label: Model Executor
timeout_in_minutes: 35
source_file_dependencies:
- vllm/engine/arg_utils.py
- vllm/config/model.py
- vllm/model_executor
- tests/model_executor
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
commands:
- apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s model_executor
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py

View File

@ -0,0 +1,62 @@
group: Models - Basic
depends_on:
- image-build
steps:
- label: Basic Models Tests (Initialization)
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/test_initialization.py
commands:
# Run a subset of model initialization tests
- pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
- label: Basic Models Tests (Extra Initialization) %N
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/model_executor/models/
- tests/models/test_initialization.py
commands:
# Only when vLLM model source is modified - test initialization of a large
# subset of supported models (the complement of the small subset in the above
# test.) Also run if model initialization test file is modified
- pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
parallelism: 2
- label: Basic Models Tests (Other)
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/models/test_transformers.py
- tests/models/test_registry.py
commands:
- pytest -v -s models/test_transformers.py models/test_registry.py
- label: Basic Models Test (Other CPU) # 5min
timeout_in_minutes: 10
source_file_dependencies:
- vllm/
- tests/models/test_utils.py
- tests/models/test_vision.py
no_gpu: true
commands:
- pytest -v -s models/test_utils.py models/test_vision.py
- label: Transformers Nightly Models
working_dir: "/vllm-workspace/"
optional: true
soft_fail: true
commands:
- pip install --upgrade git+https://github.com/huggingface/transformers
- pytest -v -s tests/models/test_initialization.py
- pytest -v -s tests/models/test_transformers.py
- pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

View File

@ -0,0 +1,22 @@
group: Models - Distributed
depends_on:
- image-build
steps:
- label: Distributed Model Tests (2 GPUs)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/model_executor/model_loader/sharded_state_loader.py
- vllm/model_executor/models/
- tests/basic_correctness/
- tests/model_executor/model_loader/test_sharded_state_loader.py
- tests/models/
commands:
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
- VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'

View File

@ -0,0 +1,91 @@
group: Models - Language
depends_on:
- image-build
steps:
- label: Language Models Tests (Standard)
timeout_in_minutes: 25
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/language
commands:
# Test standard language models, excluding a subset of slow tests
- pip freeze | grep -E 'torch'
- pytest -v -s models/language -m 'core_model and (not slow_test)'
- label: Language Models Tests (Extra Standard) %N
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/model_executor/models/
- tests/models/language/pooling/test_embedding.py
- tests/models/language/generation/test_common.py
- tests/models/language/pooling/test_classification.py
commands:
# Shard slow subset of standard language models tests. Only run when model
# source is modified, or when specified test files are modified
- pip freeze | grep -E 'torch'
- pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
parallelism: 2
- label: Language Models Tests (Hybrid) %N
timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/language/generation
commands:
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
# Shard hybrid language model tests
- pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
parallelism: 2
- label: Language Models Test (Extended Generation) # 80min
timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/
- tests/models/language/generation
commands:
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
- label: Language Models Test (PPL)
timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/
- tests/models/language/generation_ppl_test
commands:
- pytest -v -s models/language/generation_ppl_test
- label: Language Models Test (Extended Pooling) # 36min
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/
- tests/models/language/pooling
commands:
- pytest -v -s models/language/pooling -m 'not core_model'
- label: Language Models Test (MTEB)
timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/
- tests/models/language/pooling_mteb_test
commands:
- pytest -v -s models/language/pooling_mteb_test

View File

@ -0,0 +1,79 @@
group: Models - Multimodal
depends_on:
- image-build
steps:
- label: Multi-Modal Models (Standard) # 60min
timeout_in_minutes: 80
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pip freeze | grep -E 'torch'
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Processor Test (CPU)
timeout_in_minutes: 60
source_file_dependencies:
- vllm/
- tests/models/multimodal
no_gpu: true
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
- label: Multi-Modal Processor # 44min
timeout_in_minutes: 60
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing/test_tensor_schema.py
- label: Multi-Modal Accuracy Eval (Small Models) # 50min
timeout_in_minutes: 70
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- vllm/multimodal/
- vllm/inputs/
- vllm/v1/core/
commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
- label: Multi-Modal Models (Extended) 1
optional: true
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
- label: Multi-Modal Models (Extended) 2
optional: true
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
- label: Multi-Modal Models (Extended) 3
optional: true
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models
optional: true
commands:
- echo 'Testing custom models...'
# PR authors can temporarily add commands below to test individual models
# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*

View File

@ -0,0 +1,34 @@
group: Plugins
depends_on:
- image-build
steps:
- label: Plugin Tests (2 GPUs)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/plugins/
- tests/plugins/
commands:
# begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
- pip install -e ./plugins/vllm_add_dummy_platform
- pytest -v -s plugins_tests/test_platform_plugins.py
- pip uninstall vllm_add_dummy_platform -y
# end platform plugin tests
# begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
- pip install -e ./plugins/prithvi_io_processor_plugin
- pytest -v -s plugins_tests/test_io_processor_plugins.py
- pip uninstall prithvi_io_processor_plugin -y
# end io_processor plugins test
# begin stat_logger plugins test
- pip install -e ./plugins/vllm_add_dummy_stat_logger
- pytest -v -s plugins_tests/test_stats_logger_plugins.py
- pip uninstall dummy_stat_logger -y
# end stat_logger plugins test
# other tests continue here:
- pytest -v -s plugins_tests/test_scheduler_plugins.py
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins

View File

@ -0,0 +1,50 @@
group: PyTorch
depends_on:
- image-build
steps:
- label: PyTorch Compilation Unit Tests
timeout_in_minutes: 30
source_file_dependencies:
- vllm/
- tests/compile
commands:
# Run unit tests defined directly under compile/,
# not including subdirectories, which are usually heavier
# tests covered elsewhere.
# Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;"
- label: PyTorch Fullgraph Smoke Test
timeout_in_minutes: 30
source_file_dependencies:
- vllm/
- tests/compile
commands:
# Run smoke tests under fullgraph directory, except test_full_graph.py
# as it is a heavy test that is covered in other steps.
# Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
- label: PyTorch Fullgraph
timeout_in_minutes: 40
source_file_dependencies:
- vllm/
- tests/compile
commands:
# fp8 kv scales not supported on sm89, tested on Blackwell instead
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
# Limit to no custom ops to reduce running time
# Wrap with quotes to escape yaml and avoid starting -k string with a -
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
- label: Pytorch Nightly Dependency Override Check # 2min
# if this test fails, it means the nightly torch version is not compatible with some
# of the dependencies. Please check the error message and add the package to whitelist
# in /vllm/tools/pre_commit/generate_nightly_torch_test.py
soft_fail: true
source_file_dependencies:
- requirements/nightly_torch_test.txt
commands:
- bash standalone_tests/pytorch_nightly_dependency.sh

View File

@ -0,0 +1,46 @@
group: Quantization
depends_on:
- image-build
steps:
- label: Quantization
timeout_in_minutes: 90
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
- tests/quantization
commands:
# temporary install here since we need nightly, will move to requirements/test.in
# after torchao 0.12 release, and pin a working version of torchao nightly here
# since torchao nightly is only compatible with torch nightly currently
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
# we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
- uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: Quantized MoE Test (B200)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
gpu: b200
source_file_dependencies:
- tests/quantization/test_blackwell_moe.py
- vllm/model_executor/models/deepseek_v2.py
- vllm/model_executor/models/gpt_oss.py
- vllm/model_executor/models/llama4.py
- vllm/model_executor/layers/fused_moe
- vllm/model_executor/layers/quantization/compressed_tensors
- vllm/model_executor/layers/quantization/modelopt.py
- vllm/model_executor/layers/quantization/mxfp4.py
- vllm/v1/attention/backends/flashinfer.py
commands:
- pytest -s -v tests/quantization/test_blackwell_moe.py
- label: Quantized Models Test
timeout_in_minutes: 60
source_file_dependencies:
- vllm/model_executor/layers/quantization
- tests/models/quantization
commands:
- pytest -v -s models/quantization

View File

@ -0,0 +1,14 @@
group: Samplers
depends_on:
- image-build
steps:
- label: Samplers Test
timeout_in_minutes: 75
source_file_dependencies:
- vllm/model_executor/layers
- vllm/sampling_metadata.py
- tests/samplers
- tests/conftest.py
commands:
- pytest -v -s samplers
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers

View File

@ -0,0 +1,23 @@
group: Tool use
depends_on:
- image-build
steps:
- label: OpenAI-Compatible Tool Use
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
fast_check: false
source_file_dependencies:
- vllm/
- tests/tool_use
commands:
- pytest -v -s -m 'not cpu_test' tool_use
- label: OpenAI-Compatible Tool Use (CPU)
depends_on: ~
timeout_in_minutes: 10
source_file_dependencies:
- vllm/
- tests/tool_use
no_gpu: true
commands:
- pytest -v -s -m 'cpu_test' tool_use

View File

@ -0,0 +1,25 @@
group: Weight Loading
depends_on:
- image-build
steps:
- label: Weight Loading Multiple GPU # 33min
timeout_in_minutes: 45
working_dir: "/vllm-workspace/tests"
num_gpus: 2
optional: true
source_file_dependencies:
- vllm/
- tests/weight_loading
commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
- label: Weight Loading Multiple GPU - Large Models # optional
working_dir: "/vllm-workspace/tests"
num_gpus: 2
gpu: a100
optional: true
source_file_dependencies:
- vllm/
- tests/weight_loading
commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt

View File

@ -13,7 +13,7 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
- name: Set up Python
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0

View File

@ -12,7 +12,7 @@ jobs:
timeout-minutes: 30
steps:
- uses: actions/checkout@v6
- uses: actions/checkout@v6.0.1
- uses: astral-sh/setup-uv@v7
with:

View File

@ -16,7 +16,7 @@ jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
- uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with:
python-version: "3.12"

View File

@ -15,7 +15,7 @@ jobs:
actions: write
runs-on: ubuntu-latest
steps:
- uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
- uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
with:
# Increasing this value ensures that changes to this workflow
# propagate to all issues and PRs in days rather than months

View File

@ -874,7 +874,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
set(SRCS
"csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")
"csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu"
"csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu"
"csrc/quantization/cutlass_w4a8/w4a8_utils.cu"
)
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@ -944,7 +947,6 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp"
"csrc/moe/moe_align_sum_kernels.cu"
"csrc/moe/moe_lora_align_sum_kernels.cu"
"csrc/moe/topk_softmax_kernels.cu")
if(VLLM_GPU_LANG STREQUAL "CUDA")

View File

@ -96,8 +96,9 @@ start_server() {
# This correctly passes each element as a separate argument.
if [[ -n "$profile_dir" ]]; then
# Start server with profiling enabled
VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}"
VLLM_SERVER_DEV_MODE=1 \
vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 &
else
# Start server without profiling
VLLM_SERVER_DEV_MODE=1 \

View File

@ -32,12 +32,11 @@ def benchmark_propose(args):
model_config = ModelConfig(
model="facebook/opt-125m",
task="generate",
max_model_len=args.num_token + args.num_spec_token,
tokenizer="facebook/opt-125m",
tokenizer_mode="auto",
dtype="auto",
seed=None,
seed=0,
trust_remote_code=False,
)
proposer = NgramProposer(

View File

@ -574,7 +574,7 @@ async def benchmark(
)
print(
"{:<40} {:<10.2f}".format(
"Total Token throughput (tok/s):", metrics.total_token_throughput
"Total token throughput (tok/s):", metrics.total_token_throughput
)
)
@ -963,8 +963,7 @@ def create_argument_parser():
parser.add_argument(
"--profile",
action="store_true",
help="Use Torch Profiler. The endpoint must be launched with "
"VLLM_TORCH_PROFILER_DIR to enable profiler.",
help="Use vLLM Profiling. --profiler-config must be provided on the server.",
)
parser.add_argument(
"--result-dir",

View File

@ -0,0 +1,150 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark script comparing torch.cat vs direct copy for k_nope/k_pe concatenation
in MLA (Multi-head Latent Attention) prefill.
This validates that the optimization from commit 8d4142bd is beneficial across
various batch sizes, not just the originally tested batch size of 32768.
"""
import time
from collections.abc import Callable
import torch
# DeepSeek-V3 MLA dimensions
NUM_HEADS = 128
QK_NOPE_HEAD_DIM = 128
PE_DIM = 64
def cat_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
"""Original torch.cat approach with expand."""
return torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
def direct_copy_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
"""Optimized direct copy approach (avoids expand + cat overhead)."""
k = torch.empty(
(*k_nope.shape[:-1], k_nope.shape[-1] + k_pe.shape[-1]),
dtype=k_nope.dtype,
device=k_nope.device,
)
k[..., : k_nope.shape[-1]] = k_nope
k[..., k_nope.shape[-1] :] = k_pe
return k
def benchmark_method(
method: Callable,
k_nope: torch.Tensor,
k_pe: torch.Tensor,
num_warmup: int = 10,
num_iters: int = 100,
) -> float:
"""Benchmark a concatenation method and return mean latency in ms."""
# Warmup
for _ in range(num_warmup):
_ = method(k_nope, k_pe)
torch.cuda.synchronize()
# Benchmark
start = time.perf_counter()
for _ in range(num_iters):
_ = method(k_nope, k_pe)
torch.cuda.synchronize()
end = time.perf_counter()
return (end - start) / num_iters * 1000 # Convert to ms
@torch.inference_mode()
def run_benchmark(dtype: torch.dtype, dtype_name: str):
"""Run benchmark for a specific dtype."""
torch.set_default_device("cuda")
# Batch sizes to test (powers of 2 from 32 to 65536)
batch_sizes = [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536]
print("=" * 80)
print("Benchmark: torch.cat vs direct copy for MLA k_nope/k_pe concatenation")
print("=" * 80)
print(
f"Tensor shapes: k_nope=[B, {NUM_HEADS}, {QK_NOPE_HEAD_DIM}], "
f"k_pe=[B, 1, {PE_DIM}]"
)
print(f"dtype: {dtype_name}")
print()
print(
f"{'Batch Size':>12} | {'cat (ms)':>10} | {'direct (ms)':>12} | "
f"{'Speedup':>8} | {'Reduction':>10}"
)
print("-" * 70)
results = []
for batch_size in batch_sizes:
# Create input tensors (generate in float32 then convert for FP8 compatibility)
k_nope = torch.randn(
batch_size, NUM_HEADS, QK_NOPE_HEAD_DIM, dtype=torch.float32, device="cuda"
).to(dtype)
k_pe = torch.randn(
batch_size, 1, PE_DIM, dtype=torch.float32, device="cuda"
).to(dtype)
# Benchmark both methods
cat_time = benchmark_method(cat_method, k_nope, k_pe)
direct_time = benchmark_method(direct_copy_method, k_nope, k_pe)
speedup = cat_time / direct_time
reduction = (1 - direct_time / cat_time) * 100
results.append((batch_size, cat_time, direct_time, speedup, reduction))
print(
f"{batch_size:>12} | {cat_time:>10.3f} | {direct_time:>12.3f} | "
f"{speedup:>7.2f}x | {reduction:>9.1f}%"
)
print("=" * 80)
# Summary statistics
speedups = [r[3] for r in results]
print("\nSpeedup summary:")
print(f" Min: {min(speedups):.2f}x")
print(f" Max: {max(speedups):.2f}x")
print(f" Mean: {sum(speedups) / len(speedups):.2f}x")
# Find crossover point
crossover_batch = None
for batch_size, _, _, speedup, _ in results:
if speedup >= 1.0:
crossover_batch = batch_size
break
print("\nConclusion:")
if crossover_batch:
print(f" - Direct copy becomes beneficial at batch size >= {crossover_batch}")
# Filter for large batches (>= 512 which is typical for prefill)
large_batch_speedups = [r[3] for r in results if r[0] >= 512]
if large_batch_speedups:
avg_large = sum(large_batch_speedups) / len(large_batch_speedups)
print(f" - For batch sizes >= 512: avg speedup = {avg_large:.2f}x")
print(" - MLA prefill typically uses large batches, so optimization is effective")
return results
@torch.inference_mode()
def main():
# Test bfloat16
print("\n")
run_benchmark(torch.bfloat16, "bfloat16")
# Test float8_e4m3fn
print("\n")
run_benchmark(torch.float8_e4m3fn, "float8_e4m3fn")
if __name__ == "__main__":
main()

View File

@ -251,17 +251,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
endif()
# Build ACL with CMake
set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF")
set(CMAKE_BUILD_TYPE "Release")
set(ARM_COMPUTE_ARCH "armv8.2-a")
set(ARM_COMPUTE_ENABLE_ASSERTS "OFF")
set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF")
set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
set(ARM_COMPUTE_ENABLE_OPENMP "ON")
set(ARM_COMPUTE_ENABLE_WERROR "OFF")
set(ARM_COMPUTE_BUILD_EXAMPLES "OFF")
set(ARM_COMPUTE_BUILD_TESTING "OFF")
set(_cmake_config_cmd
${CMAKE_COMMAND} -G Ninja -B build
-DARM_COMPUTE_BUILD_SHARED_LIB=OFF

View File

@ -117,7 +117,6 @@ torch::Tensor get_scheduler_metadata(
input.casual = casual;
input.isa = isa;
input.enable_kv_split = enable_kv_split;
TORCH_CHECK(casual, "Only supports casual mask for now.");
VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() {
CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {

View File

@ -186,7 +186,7 @@ struct AttentionMetadata {
// - Intermediate outputs: q_tile_size * head_dim * output_buffer_elem_size + 2
// * q_tile_size * 4, partial output, max + sum (float)
// Reduction scratchpad contains:
// - flags: bool array to indicate wether the split is finished
// - flags: bool array to indicate whether the split is finished
// - outputs: split_num * q_tile_size * head_dim * output_buffer_elem_size
// - max, sum: 2 * split_num * q_tile_size * 4
class AttentionScratchPad {

View File

@ -444,23 +444,27 @@ __device__ inline T apply_sigmoid(T val) {
return cuda_cast<T, float>(sigmoid_accurate(f));
}
template <typename T>
template <ScoringFunc SF, typename T>
__device__ inline T apply_scoring(T val) {
if constexpr (SF == SCORING_SIGMOID) {
return apply_sigmoid(val);
} else {
return val;
}
}
template <typename T, ScoringFunc SF>
__device__ void topk_with_k2(T* output, T const* input, T const* bias,
cg::thread_block_tile<32> const& tile,
int32_t const lane_id,
int const num_experts_per_group,
int const scoring_func) {
int const num_experts_per_group) {
// Get the top2 per thread
T largest = neg_inf<T>();
T second_largest = neg_inf<T>();
if (num_experts_per_group > WARP_SIZE) {
for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
T value = input[i];
// Apply scoring function if needed
if (scoring_func == SCORING_SIGMOID) {
value = apply_sigmoid(value);
}
T value = apply_scoring<SF>(input[i]);
value = value + bias[i];
if (value > largest) {
@ -472,11 +476,7 @@ __device__ void topk_with_k2(T* output, T const* input, T const* bias,
}
} else {
for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
T value = input[i];
// Apply scoring function if needed
if (scoring_func == SCORING_SIGMOID) {
value = apply_sigmoid(value);
}
T value = apply_scoring<SF>(input[i]);
value = value + bias[i];
largest = value;
}
@ -501,13 +501,12 @@ __device__ void topk_with_k2(T* output, T const* input, T const* bias,
}
}
template <typename T>
template <typename T, ScoringFunc SF>
__global__ void topk_with_k2_kernel(T* output, T* input, T const* bias,
int64_t const num_tokens,
int64_t const num_cases,
int64_t const n_group,
int64_t const num_experts_per_group,
int const scoring_func) {
int64_t const num_experts_per_group) {
int32_t warp_id = threadIdx.x / WARP_SIZE;
int32_t lane_id = threadIdx.x % WARP_SIZE;
@ -525,21 +524,21 @@ __global__ void topk_with_k2_kernel(T* output, T* input, T const* bias,
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
asm volatile("griddepcontrol.wait;");
#endif
topk_with_k2(output, input, group_bias, tile, lane_id,
num_experts_per_group, scoring_func);
topk_with_k2<T, SF>(output, input, group_bias, tile, lane_id,
num_experts_per_group);
}
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
asm volatile("griddepcontrol.launch_dependents;");
#endif
}
template <typename T, typename IdxT>
template <typename T, typename IdxT, ScoringFunc SF, int NGroup = -1>
__global__ void group_idx_and_topk_idx_kernel(
T* scores, T const* group_scores, float* topk_values, IdxT* topk_indices,
T const* bias, int64_t const num_tokens, int64_t const n_group,
int64_t const topk_group, int64_t const topk, int64_t const num_experts,
int64_t const num_experts_per_group, bool renormalize,
double routed_scaling_factor, int scoring_func) {
double routed_scaling_factor) {
int32_t warp_id = threadIdx.x / WARP_SIZE;
int32_t lane_id = threadIdx.x % WARP_SIZE;
int32_t case_id =
@ -549,6 +548,11 @@ __global__ void group_idx_and_topk_idx_kernel(
topk_values += case_id * topk;
topk_indices += case_id * topk;
constexpr bool kUseStaticNGroup = (NGroup > 0);
// use int32 to avoid implicit conversion
int32_t const n_group_i32 =
kUseStaticNGroup ? NGroup : static_cast<int32_t>(n_group);
int32_t align_num_experts_per_group =
warp_topk::round_up_to_multiple_of<WARP_SIZE>(num_experts_per_group);
@ -574,13 +578,14 @@ __global__ void group_idx_and_topk_idx_kernel(
if (case_id < num_tokens) {
// calculate group_idx
int32_t target_num_min = WARP_SIZE - n_group + topk_group;
int32_t target_num_min =
WARP_SIZE - n_group_i32 + static_cast<int32_t>(topk_group);
// The check is necessary to avoid abnormal input
if (lane_id < n_group && is_finite(group_scores[lane_id])) {
if (lane_id < n_group_i32 && is_finite(group_scores[lane_id])) {
value = group_scores[lane_id];
}
int count_equal_to_top_value = WARP_SIZE - n_group;
int count_equal_to_top_value = WARP_SIZE - n_group_i32;
int pre_count_equal_to_top_value = 0;
// Use loop to find the largset top_group
while (count_equal_to_top_value < target_num_min) {
@ -604,7 +609,7 @@ __global__ void group_idx_and_topk_idx_kernel(
int count_equalto_topkth_group = 0;
bool if_proceed_next_topk = topk_group_value != neg_inf<T>();
if (case_id < num_tokens && if_proceed_next_topk) {
for (int i_group = 0; i_group < n_group; i_group++) {
auto process_group = [&](int i_group) {
if ((group_scores[i_group] > topk_group_value) ||
((group_scores[i_group] == topk_group_value) &&
(count_equalto_topkth_group < num_equalto_topkth_group))) {
@ -613,11 +618,10 @@ __global__ void group_idx_and_topk_idx_kernel(
i += WARP_SIZE) {
T candidates = neg_inf<T>();
if (i < num_experts_per_group) {
// Apply scoring function (if any) and add bias
// apply scoring function (if any) and add bias
T input = scores[offset + i];
if (is_finite(input)) {
T score = (scoring_func == SCORING_SIGMOID) ? apply_sigmoid(input)
: input;
T score = apply_scoring<SF>(input);
candidates = score + bias[offset + i];
}
}
@ -627,6 +631,17 @@ __global__ void group_idx_and_topk_idx_kernel(
count_equalto_topkth_group++;
}
}
};
if constexpr (kUseStaticNGroup) {
#pragma unroll
for (int i_group = 0; i_group < NGroup; ++i_group) {
process_group(i_group);
}
} else {
for (int i_group = 0; i_group < n_group_i32; ++i_group) {
process_group(i_group);
}
}
queue.done();
__syncwarp();
@ -646,12 +661,13 @@ __global__ void group_idx_and_topk_idx_kernel(
if (i < topk) {
// Load the score value (without bias) for normalization
T input = scores[s_topk_idx[i]];
value =
(scoring_func == SCORING_SIGMOID) ? apply_sigmoid(input) : input;
value = apply_scoring<SF>(input);
s_topk_value[i] = value;
}
topk_sum +=
cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
if (renormalize) {
topk_sum +=
cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
}
}
}
@ -660,13 +676,9 @@ __global__ void group_idx_and_topk_idx_kernel(
if (case_id < num_tokens) {
if (if_proceed_next_topk) {
for (int i = lane_id; i < topk; i += WARP_SIZE) {
float value;
if (renormalize) {
value = cuda_cast<float, T>(s_topk_value[i]) / topk_sum *
routed_scaling_factor;
} else {
value = cuda_cast<float, T>(s_topk_value[i]) * routed_scaling_factor;
}
float base = cuda_cast<float, T>(s_topk_value[i]);
float value = renormalize ? (base / topk_sum * routed_scaling_factor)
: (base * routed_scaling_factor);
topk_indices[i] = s_topk_idx[i];
topk_values[i] = value;
}
@ -684,6 +696,45 @@ __global__ void group_idx_and_topk_idx_kernel(
#endif
}
template <typename T, typename IdxT, ScoringFunc SF>
inline void launch_group_idx_and_topk_kernel(
cudaLaunchConfig_t const& config, T* scores, T* group_scores,
float* topk_values, IdxT* topk_indices, T const* bias,
int64_t const num_tokens, int64_t const n_group, int64_t const topk_group,
int64_t const topk, int64_t const num_experts,
int64_t const num_experts_per_group, bool const renormalize,
double const routed_scaling_factor) {
auto launch = [&](auto* kernel_instance2) {
cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
topk_values, topk_indices, bias, num_tokens, n_group,
topk_group, topk, num_experts, num_experts_per_group,
renormalize, routed_scaling_factor);
};
switch (n_group) {
case 4: {
launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 4>);
break;
}
case 8: {
launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 8>);
break;
}
case 16: {
launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 16>);
break;
}
case 32: {
launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 32>);
break;
}
default: {
launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF>);
break;
}
}
}
template <typename T, typename IdxT>
void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
IdxT* topk_indices, T const* bias, int64_t const num_tokens,
@ -694,7 +745,6 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
cudaStream_t const stream = 0) {
int64_t num_cases = num_tokens * n_group;
int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
auto* kernel_instance1 = &topk_with_k2_kernel<T>;
cudaLaunchConfig_t config;
config.gridDim = topk_with_k2_num_blocks;
config.blockDim = BLOCK_SIZE;
@ -705,16 +755,33 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
config.numAttrs = 1;
config.attrs = attrs;
cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias,
num_tokens, num_cases, n_group, num_experts / n_group,
scoring_func);
auto const sf = static_cast<ScoringFunc>(scoring_func);
int64_t const num_experts_per_group = num_experts / n_group;
auto launch_topk_with_k2 = [&](auto* kernel_instance1) {
cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias,
num_tokens, num_cases, n_group, num_experts_per_group);
};
switch (sf) {
case SCORING_NONE: {
auto* kernel_instance1 = &topk_with_k2_kernel<T, SCORING_NONE>;
launch_topk_with_k2(kernel_instance1);
break;
}
case SCORING_SIGMOID: {
auto* kernel_instance1 = &topk_with_k2_kernel<T, SCORING_SIGMOID>;
launch_topk_with_k2(kernel_instance1);
break;
}
default:
// should be guarded by higher level checks.
TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc");
}
int64_t topk_with_k_group_num_blocks =
(num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
size_t dynamic_smem_in_bytes =
warp_topk::calc_smem_size_for_block_wide<T, int32_t>(NUM_WARPS_PER_BLOCK,
topk);
auto* kernel_instance2 = &group_idx_and_topk_idx_kernel<T, IdxT>;
config.gridDim = topk_with_k_group_num_blocks;
config.blockDim = BLOCK_SIZE;
config.dynamicSmemBytes = dynamic_smem_in_bytes;
@ -723,10 +790,24 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
config.numAttrs = 1;
config.attrs = attrs;
cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
topk_values, topk_indices, bias, num_tokens, n_group,
topk_group, topk, num_experts, num_experts / n_group,
renormalize, routed_scaling_factor, scoring_func);
switch (sf) {
case SCORING_NONE: {
launch_group_idx_and_topk_kernel<T, IdxT, SCORING_NONE>(
config, scores, group_scores, topk_values, topk_indices, bias,
num_tokens, n_group, topk_group, topk, num_experts,
num_experts_per_group, renormalize, routed_scaling_factor);
break;
}
case SCORING_SIGMOID: {
launch_group_idx_and_topk_kernel<T, IdxT, SCORING_SIGMOID>(
config, scores, group_scores, topk_values, topk_indices, bias,
num_tokens, n_group, topk_group, topk, num_experts,
num_experts_per_group, renormalize, routed_scaling_factor);
break;
}
default:
TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc");
}
}
#define INSTANTIATE_NOAUX_TC(T, IdxT) \

View File

@ -14,7 +14,6 @@
namespace vllm {
namespace moe {
namespace batched_moe_align_block_size {
// Note num_threads needs to be 1024 for BlockScan Reduction in the kernel.
@ -80,23 +79,30 @@ __global__ void batched_moe_align_block_size_kernel(
} // namespace batched_moe_align_block_size
template <typename scalar_t>
__global__ void moe_align_block_size_kernel(
__device__ void _moe_align_block_size(
const scalar_t* __restrict__ topk_ids,
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
int32_t* __restrict__ total_tokens_post_pad,
int32_t* __restrict__ expert_map, int32_t num_experts,
int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size,
size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded,
bool has_expert_map) {
int32_t max_num_m_blocks, int32_t model_offset, int32_t inactive_expert_id,
int32_t topk_num, int32_t* token_mask, bool has_expert_map) {
extern __shared__ int32_t shared_counts[];
// Use a separate threadblock to fill sorted_token_ids.
// Compute input buffer offsets. Typically these will all be 0, except when
// using Multi LoRA.
int sorted_token_ids_offset = max_num_tokens_padded * model_offset;
int expert_ids_offset = max_num_m_blocks * model_offset;
int cumsum_offset = (num_experts + 1) * model_offset;
// Use separate threadblocks to fill sorted_token_ids.
// This is safe since the current kernel does not use sorted_token_ids.
if (blockIdx.x == 1) {
if (blockIdx.x % 2) {
// Initialize sorted_token_ids with numel
for (size_t it = threadIdx.x; it < max_num_tokens_padded;
it += blockDim.x) {
sorted_token_ids[it] = numel;
sorted_token_ids[sorted_token_ids_offset + it] = numel;
}
return;
}
@ -127,7 +133,9 @@ __global__ void moe_align_block_size_kernel(
}
int warp_idx = expert_id / experts_per_warp;
int expert_offset = expert_id % experts_per_warp;
atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1);
int mask = token_mask == nullptr ? 1 : token_mask[i / topk_num];
atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset],
mask);
}
__syncthreads();
@ -148,77 +156,44 @@ __global__ void moe_align_block_size_kernel(
int cumsum_val;
BlockScan(temp_storage).ExclusiveSum(expert_count, cumsum_val);
if (expert_id <= num_experts) {
cumsum[expert_id] = cumsum_val;
cumsum[cumsum_offset + expert_id] = cumsum_val;
}
if (expert_id == num_experts) {
*total_tokens_post_pad = cumsum_val;
total_tokens_post_pad[model_offset] = cumsum_val;
}
__syncthreads();
if (threadIdx.x < num_experts) {
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
i += block_size) {
expert_ids[i / block_size] = threadIdx.x;
for (int i = cumsum[cumsum_offset + threadIdx.x];
i < cumsum[cumsum_offset + threadIdx.x + 1]; i += block_size) {
expert_ids[expert_ids_offset + i / block_size] = threadIdx.x;
}
}
// Fill remaining expert_ids with 0
const size_t fill_start_idx = cumsum[num_experts] / block_size + threadIdx.x;
const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size);
for (size_t i = fill_start_idx; i < expert_ids_size; i += blockDim.x) {
expert_ids[i] = 0;
}
}
template <typename scalar_t>
__global__ void count_and_sort_expert_tokens_kernel(
const scalar_t* __restrict__ topk_ids,
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
bool has_expert_map) {
const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
const size_t stride = blockDim.x * gridDim.x;
for (size_t i = tid; i < numel; i += stride) {
int32_t expert_id = topk_ids[i];
if (expert_id >= num_experts) {
continue;
}
if (has_expert_map) {
expert_id = expert_map[expert_id];
// filter invalid experts
if (expert_id == -1) continue;
}
int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1);
sorted_token_ids[rank_post_pad] = i;
}
}
template <typename scalar_t, int TOPK>
__global__ void moe_sum_kernel(
scalar_t* __restrict__ out, // [..., d]
const scalar_t* __restrict__ input, // [..., topk, d]
const int d) {
const int64_t token_idx = blockIdx.x;
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
scalar_t x = 0.0;
#pragma unroll
for (int k = 0; k < TOPK; ++k) {
x += VLLM_LDG(&input[token_idx * TOPK * d + k * d + idx]);
}
out[token_idx * d + idx] = x;
const size_t fill_start_idx =
cumsum[cumsum_offset + num_experts] / block_size + threadIdx.x;
for (size_t i = fill_start_idx; i < max_num_m_blocks; i += blockDim.x) {
expert_ids[expert_ids_offset + i] = inactive_expert_id;
}
}
template <typename scalar_t, int32_t fill_threads>
__global__ void moe_align_block_size_small_batch_expert_kernel(
__device__ void _moe_align_block_size_small_batch_expert(
const scalar_t* __restrict__ topk_ids,
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
int32_t* __restrict__ total_tokens_post_pad,
int32_t* __restrict__ expert_map, int32_t num_experts, int32_t block_size,
size_t numel, int32_t max_num_tokens_padded, bool has_expert_map) {
size_t numel, int32_t max_num_tokens_padded, int32_t max_num_m_blocks,
int32_t inactive_expert_id, int32_t model_offset, int32_t topk_num,
int32_t* token_mask, bool has_expert_map) {
// Compute input buffer offsets. Typically these will all be 0, except when
// using Multi LoRA.
int sorted_token_ids_offset = max_num_tokens_padded * model_offset;
int expert_ids_offset = max_num_m_blocks * model_offset;
// Use an additional group of threads to fill sorted_token_ids.
// Since the current kernel will use sorted_token_ids afterward,
// we fill sorted_token_ids within the same threadblock to make
@ -227,7 +202,7 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
// Initialize sorted_token_ids with numel
for (size_t it = threadIdx.x; it < max_num_tokens_padded;
it += fill_threads) {
sorted_token_ids[it] = numel;
sorted_token_ids[sorted_token_ids_offset + it] = numel;
}
// Three __syncthreads() corresponding to the other threads
__syncthreads();
@ -254,7 +229,8 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
// filter invalid expert
if (expert_id == -1) continue;
}
++tokens_cnts[(tid + 1) * num_experts + expert_id];
int mask = token_mask == nullptr ? 1 : token_mask[i / topk_num];
tokens_cnts[(tid + 1) * num_experts + expert_id] += mask;
}
__syncthreads();
@ -277,22 +253,22 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
CEILDIV(tokens_cnts[stride * num_experts + i - 1], block_size) *
block_size;
}
*total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]);
total_tokens_post_pad[model_offset] =
static_cast<int32_t>(cumsum[num_experts]);
}
__syncthreads();
if (tid < num_experts) {
for (int i = cumsum[tid]; i < cumsum[tid + 1]; i += block_size) {
expert_ids[i / block_size] = tid;
expert_ids[expert_ids_offset + i / block_size] = tid;
}
}
// Fill remaining expert_ids with 0
const size_t fill_start_idx = cumsum[num_experts] / block_size + tid;
const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size);
for (size_t i = fill_start_idx; i < expert_ids_size; i += stride) {
expert_ids[i] = 0;
for (size_t i = fill_start_idx; i < max_num_m_blocks; i += stride) {
expert_ids[expert_ids_offset + i] = inactive_expert_id;
}
for (size_t i = tid; i < numel; i += stride) {
@ -304,11 +280,195 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
}
int32_t rank_post_pad =
tokens_cnts[tid * num_experts + expert_id] + cumsum[expert_id];
sorted_token_ids[rank_post_pad] = i;
++tokens_cnts[tid * num_experts + expert_id];
if (token_mask == nullptr || token_mask[i / topk_num]) {
sorted_token_ids[sorted_token_ids_offset + rank_post_pad] = i;
++tokens_cnts[tid * num_experts + expert_id];
}
}
}
template <typename scalar_t>
__device__ void _count_and_sort_expert_tokens(
const scalar_t* __restrict__ topk_ids,
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
int32_t max_num_tokens_padded, int32_t* __restrict__ token_mask,
int32_t model_offset, int32_t topk_num, bool has_expert_map) {
const size_t tid = blockIdx.y * blockDim.x + threadIdx.x;
const size_t stride = blockDim.x * gridDim.y;
for (size_t i = tid; i < numel; i += stride) {
int32_t expert_id = topk_ids[i];
if (expert_id >= num_experts) {
continue;
}
if (has_expert_map) {
expert_id = expert_map[expert_id];
// filter invalid experts
if (expert_id == -1) continue;
}
if (token_mask == nullptr || token_mask[i / topk_num]) {
int32_t rank_post_pad = atomicAdd(
&cumsum_buffer[(model_offset * (num_experts + 1)) + expert_id], 1);
sorted_token_ids[max_num_tokens_padded * model_offset + rank_post_pad] =
i;
}
}
}
template <typename scalar_t>
__global__ void moe_align_block_size_kernel(
const scalar_t* __restrict__ topk_ids,
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
int32_t* __restrict__ total_tokens_post_pad,
int32_t* __restrict__ expert_map, int32_t num_experts,
int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size,
size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded,
int32_t topk_num, bool has_expert_map) {
_moe_align_block_size(
topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
num_experts, padded_num_experts, experts_per_warp, block_size, numel,
cumsum, max_num_tokens_padded, CEILDIV(max_num_tokens_padded, block_size),
0, 0, topk_num, nullptr, has_expert_map);
}
template <typename scalar_t>
__global__ void count_and_sort_expert_tokens_kernel(
const scalar_t* __restrict__ topk_ids,
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
int32_t max_num_tokens_padded, int32_t topk_num, bool has_expert_map) {
_count_and_sort_expert_tokens(
topk_ids, sorted_token_ids, cumsum_buffer, expert_map, numel, num_experts,
max_num_tokens_padded, nullptr, 0, topk_num, has_expert_map);
}
template <typename scalar_t, int TOPK>
__global__ void moe_sum_kernel(
scalar_t* __restrict__ out, // [..., d]
const scalar_t* __restrict__ input, // [..., topk, d]
const int d) {
const int64_t token_idx = blockIdx.x;
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
scalar_t x = 0.0;
#pragma unroll
for (int k = 0; k < TOPK; ++k) {
x += VLLM_LDG(&input[token_idx * TOPK * d + k * d + idx]);
}
out[token_idx * d + idx] = x;
}
}
template <typename scalar_t, int32_t fill_threads>
__global__ void moe_align_block_size_small_batch_expert_kernel(
const scalar_t* __restrict__ topk_ids,
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
int32_t* __restrict__ total_tokens_post_pad,
int32_t* __restrict__ expert_map, int32_t num_experts, int32_t block_size,
size_t numel, int32_t max_num_tokens_padded, int32_t topk_num,
bool has_expert_map) {
_moe_align_block_size_small_batch_expert<scalar_t, fill_threads>(
topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
num_experts, block_size, numel, max_num_tokens_padded,
CEILDIV(max_num_tokens_padded, block_size), 0, 0, topk_num, nullptr,
has_expert_map);
}
template <typename scalar_t>
__global__ void moe_lora_align_block_size_kernel(
scalar_t* __restrict__ topk_ids, int32_t* __restrict__ token_lora_mapping,
int64_t block_size, int32_t* __restrict__ expert_map, int num_experts,
int max_loras, size_t numel, int max_num_tokens_padded,
int max_num_m_blocks, int32_t* __restrict__ sorted_token_ids,
int32_t* __restrict__ expert_ids, int32_t topk_num,
int32_t* total_tokens_post_pad, int32_t* adapter_enabled,
int32_t* __restrict__ cumsum, int32_t experts_per_warp,
int32_t padded_num_experts, int32_t* lora_ids,
int32_t* __restrict__ token_mask, bool has_expert_map) {
int lora_idx = blockIdx.x / 2;
int lora_id = lora_ids[lora_idx];
if (lora_id == -1 || adapter_enabled[lora_id] == 0) {
return;
}
// Populate the token_mask based on the token-LoRA mapping
int num_tokens = numel / topk_num;
if (threadIdx.x == 0) {
total_tokens_post_pad[lora_id] = 0;
for (int i = 0; i < num_tokens; i++) {
token_mask[(lora_id * num_tokens) + i] =
(int)token_lora_mapping[i] == lora_id;
}
}
__syncthreads();
_moe_align_block_size(
topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
num_experts, padded_num_experts, experts_per_warp, block_size, numel,
cumsum, max_num_tokens_padded, max_num_m_blocks, lora_id, -1, topk_num,
&token_mask[(lora_id * num_tokens)], has_expert_map);
}
template <typename scalar_t>
__global__ void lora_count_and_sort_expert_tokens_kernel(
const scalar_t* __restrict__ topk_ids,
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
int32_t max_num_tokens_padded, int32_t topk_num, int32_t* token_mask,
int32_t* lora_ids, bool has_expert_map) {
int lora_idx = blockIdx.x;
int lora_id = lora_ids[lora_idx];
if (lora_id == -1) {
return;
}
int num_tokens = numel / topk_num;
_count_and_sort_expert_tokens(
topk_ids, sorted_token_ids, cumsum_buffer, expert_map, numel, num_experts,
max_num_tokens_padded, &token_mask[(lora_id * num_tokens)], lora_id,
topk_num, has_expert_map);
}
template <typename scalar_t, int32_t fill_threads>
__global__ void moe_lora_align_block_size_small_batch_expert_kernel(
scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping,
int64_t block_size, int32_t* __restrict__ expert_map, int num_experts,
int max_loras, size_t numel, int max_num_tokens_padded,
int max_num_m_blocks, int32_t* __restrict__ sorted_token_ids,
int32_t* __restrict__ expert_ids, int topk_num,
int32_t* total_tokens_post_pad, int32_t* adapter_enabled, int32_t* lora_ids,
int32_t* token_mask, bool has_expert_map) {
int lora_idx = blockIdx.x;
int lora_id = lora_ids[lora_idx];
if (lora_id == -1 || adapter_enabled[lora_id] == 0) {
return;
}
int num_tokens = numel / topk_num;
if (threadIdx.x == 0) {
total_tokens_post_pad[lora_id] = 0;
for (int i = 0; i < num_tokens; i++) {
token_mask[(lora_id * num_tokens) + i] =
(int)token_lora_mapping[i] == lora_id;
}
}
__syncthreads();
_moe_align_block_size_small_batch_expert<scalar_t, fill_threads>(
topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
num_experts, block_size, numel, max_num_tokens_padded, max_num_m_blocks,
-1, lora_id, topk_num, &token_mask[(lora_id * num_tokens)],
has_expert_map);
}
} // namespace moe
} // namespace vllm
@ -365,7 +525,8 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
experts_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>(),
expert_map.data_ptr<int32_t>(), num_experts, block_size,
topk_ids.numel(), sorted_token_ids.size(0), has_expert_map);
topk_ids.numel(), sorted_token_ids.size(0), topk_ids.size(1),
has_expert_map);
} else {
torch::Tensor cumsum_buffer =
torch::empty({num_experts + 1}, options_int);
@ -386,21 +547,23 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
expert_map.data_ptr<int32_t>(), num_experts, padded_num_experts,
experts_per_warp, block_size, topk_ids.numel(),
cumsum_buffer.data_ptr<int32_t>(), sorted_token_ids.size(0),
has_expert_map);
topk_ids.size(1), has_expert_map);
const int block_threads = std::min(256, (int)threads);
const int num_blocks =
(topk_ids.numel() + block_threads - 1) / block_threads;
const int max_blocks = 65535;
const int actual_blocks = std::min(num_blocks, max_blocks);
dim3 gridDims(1, actual_blocks);
auto sort_kernel =
vllm::moe::count_and_sort_expert_tokens_kernel<scalar_t>;
sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
sort_kernel<<<gridDims, block_threads, 0, stream>>>(
topk_ids.data_ptr<scalar_t>(),
sorted_token_ids.data_ptr<int32_t>(),
cumsum_buffer.data_ptr<int32_t>(), expert_map.data_ptr<int32_t>(),
topk_ids.numel(), num_experts, has_expert_map);
topk_ids.numel(), num_experts, sorted_token_ids.size(0),
topk_ids.size(1), has_expert_map);
}
});
}
@ -474,3 +637,123 @@ void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size]
break;
}
}
void moe_lora_align_block_size(
torch::Tensor topk_ids, torch::Tensor token_lora_mapping,
int64_t num_experts, int64_t block_size, int64_t max_loras,
int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
torch::Tensor lora_ids, std::optional<torch::Tensor> maybe_expert_map) {
const int topk_num = topk_ids.size(1);
TORCH_CHECK(block_size > 0, "block_size should be greater than 0. ");
int device_max_shared_mem;
auto dev = topk_ids.get_device();
cudaDeviceGetAttribute(&device_max_shared_mem,
cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
int64_t padded_num_experts =
((num_experts + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
// BlockScan uses 1024 threads and assigns one thread per expert.
TORCH_CHECK(padded_num_experts < 1024,
"padded_num_experts must be less than 1024");
auto options_int =
torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
torch::Tensor token_mask =
torch::empty({max_loras * topk_ids.size(0)}, options_int);
bool has_expert_map = maybe_expert_map.has_value();
torch::Tensor expert_map;
if (has_expert_map) {
expert_map = maybe_expert_map.value();
} else {
expert_map = torch::empty({0}, options_int);
}
VLLM_DISPATCH_INTEGRAL_TYPES(
topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] {
bool small_batch_expert_mode =
(topk_ids.numel() < 1024) && (num_experts <= 64);
if (small_batch_expert_mode) {
const int32_t num_thread = max((int32_t)num_experts, 128);
const int32_t shared_mem =
(num_thread + 1) * num_experts * sizeof(int32_t) +
(num_experts + 1) * sizeof(int32_t);
if (shared_mem > device_max_shared_mem) {
TORCH_CHECK(false, "Shared memory usage exceeds device limit.");
}
// threadIdx.x >= fill_threads: counting experts and aligning
// threadIdx.x < fill_threads: filling sorted_token_ids
constexpr int32_t fill_threads = 256;
dim3 blockDim(num_thread + fill_threads);
auto kernel =
vllm::moe::moe_lora_align_block_size_small_batch_expert_kernel<
scalar_t, fill_threads>;
AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
(void*)kernel, shared_mem));
kernel<<<max_loras, blockDim, shared_mem, stream>>>(
topk_ids.data_ptr<scalar_t>(),
token_lora_mapping.data_ptr<int32_t>(), block_size,
expert_map.data_ptr<int32_t>(), num_experts, max_loras,
topk_ids.numel(), max_num_tokens_padded, max_num_m_blocks,
sorted_token_ids.data_ptr<int32_t>(),
expert_ids.data_ptr<int32_t>(), topk_num,
num_tokens_post_pad.data_ptr<int32_t>(),
adapter_enabled.data_ptr<int32_t>(), lora_ids.data_ptr<int32_t>(),
token_mask.data_ptr<int32_t>(), has_expert_map);
} else {
int num_thread = 1024;
dim3 blockDim(num_thread);
size_t num_warps = CEILDIV(padded_num_experts, WARP_SIZE);
size_t shared_mem_size = num_warps * WARP_SIZE * sizeof(int32_t);
// cumsum buffer
torch::Tensor cumsum =
torch::zeros({max_loras * (num_experts + 1)}, options_int);
auto align_kernel =
vllm::moe::moe_lora_align_block_size_kernel<scalar_t>;
// launch two threadblocks for each lora
// blockIdx.x % 2 == 0: counting experts and aligning
// blockIdx.x % 2 == 1: filling sorted_token_ids
align_kernel<<<max_loras * 2, blockDim, shared_mem_size, stream>>>(
topk_ids.data_ptr<scalar_t>(),
token_lora_mapping.data_ptr<int32_t>(), block_size,
expert_map.data_ptr<int32_t>(), num_experts, max_loras,
topk_ids.numel(), max_num_tokens_padded, max_num_m_blocks,
sorted_token_ids.data_ptr<int32_t>(),
expert_ids.data_ptr<int32_t>(), topk_num,
num_tokens_post_pad.data_ptr<int32_t>(),
adapter_enabled.data_ptr<int32_t>(), cumsum.data_ptr<int32_t>(),
WARP_SIZE, padded_num_experts, lora_ids.data_ptr<int32_t>(),
token_mask.data_ptr<int32_t>(), has_expert_map);
const int block_threads = std::min(256, (int)num_thread);
const int num_blocks =
(topk_ids.numel() + block_threads - 1) / block_threads;
const int max_blocks = 65535;
const int actual_blocks = std::min(num_blocks, max_blocks);
dim3 gridDims(max_loras, actual_blocks);
auto sort_kernel =
vllm::moe::lora_count_and_sort_expert_tokens_kernel<scalar_t>;
sort_kernel<<<gridDims, block_threads, 0, stream>>>(
topk_ids.data_ptr<scalar_t>(),
sorted_token_ids.data_ptr<int32_t>(), cumsum.data_ptr<int32_t>(),
expert_map.data_ptr<int32_t>(), topk_ids.numel(), num_experts,
max_num_tokens_padded, topk_num, token_mask.data_ptr<int32_t>(),
lora_ids.data_ptr<int32_t>(), has_expert_map);
}
});
}

View File

@ -1,174 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/ATen.h>
#include <ATen/cuda/Atomic.cuh>
#include "../cuda_compat.h"
#include "../dispatch_utils.h"
#include "core/math.hpp"
namespace {
__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
int32_t col) {
return row * total_col + col;
}
} // namespace
// TODO: Refactor common parts with moe_align_sum_kernels
template <typename scalar_t, typename token_cnts_t>
__global__ void moe_lora_align_sum_kernel(
scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping,
int64_t block_size, int num_experts, int max_loras, size_t numel,
int max_num_tokens_padded, int max_num_m_blocks,
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
int topk_num, int32_t* total_tokens_post_pad, int32_t* adapter_enabled,
int32_t* lora_ids) {
const size_t tokens_per_thread = div_ceil(numel, blockDim.x);
const size_t start_idx = threadIdx.x * tokens_per_thread;
int lora_idx = blockIdx.x;
int lora_id = lora_ids[lora_idx];
if (lora_id == -1 || adapter_enabled[lora_id] == 0) {
return;
}
extern __shared__ int32_t shared_mem[];
int32_t* cumsum = shared_mem;
token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + num_experts + 1);
// Initialize sorted_token_ids with numel
for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
sorted_token_ids[lora_id * max_num_tokens_padded + it] = numel;
}
// Initialize expert_ids with -1
for (size_t it = threadIdx.x; it < max_num_m_blocks; it += blockDim.x) {
expert_ids[lora_id * max_num_m_blocks + it] = -1;
}
// Initialize total_tokens_post_pad with 0
if (threadIdx.x == 0) {
total_tokens_post_pad[lora_id] = 0;
}
for (int i = 0; i < num_experts; ++i) {
tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
}
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
int mask = token_lora_mapping[i / topk_num] == lora_id;
int idx = index(num_experts, threadIdx.x + 1, topk_ids[i]);
tokens_cnts[idx] += mask;
}
__syncthreads();
// For each expert we accumulate the token counts from the different threads.
if (threadIdx.x < num_experts) {
tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
for (int i = 1; i <= blockDim.x; ++i) {
tokens_cnts[index(num_experts, i, threadIdx.x)] +=
tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
}
}
__syncthreads();
// We accumulate the token counts of all experts in thread 0.
if (threadIdx.x == 0) {
cumsum[0] = 0;
for (int i = 1; i <= num_experts; ++i) {
cumsum[i] = cumsum[i - 1] +
div_ceil(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
block_size) *
block_size;
}
total_tokens_post_pad[lora_id] = static_cast<int32_t>(cumsum[num_experts]);
}
__syncthreads();
/**
* For each expert, each thread processes the tokens of the corresponding
* blocks and stores the corresponding expert_id for each block.
*/
if (threadIdx.x < num_experts) {
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
i += block_size) {
expert_ids[index(max_num_m_blocks, lora_id, i / block_size)] =
threadIdx.x;
}
}
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
int32_t expert_id = topk_ids[i];
/** The cumsum[expert_id] stores the starting index of the tokens that the
* expert with expert_id needs to process, and
* tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
* processed by the expert with expert_id within the current thread's token
* shard.
*/
int32_t rank_post_pad =
tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
cumsum[expert_id];
int mask = (int)token_lora_mapping[i / topk_num] == lora_id;
atomicAdd(
&sorted_token_ids[index(max_num_tokens_padded, lora_id, rank_post_pad)],
(i - numel) * mask);
tokens_cnts[index(num_experts, threadIdx.x, expert_id)] += mask;
}
}
void moe_lora_align_block_size(
torch::Tensor topk_ids, torch::Tensor token_lora_mapping,
int64_t num_experts, int64_t block_size, int64_t max_loras,
int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
torch::Tensor lora_ids) {
const int topk_num = topk_ids.size(1);
TORCH_CHECK(block_size > 0, "block_size should be greater than 0. ");
int device_max_shared_mem;
auto dev = topk_ids.get_device();
cudaDeviceGetAttribute(&device_max_shared_mem,
cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int32_t num_thread = max((int32_t)num_experts, 128); // WARP_SIZE,
TORCH_CHECK(num_thread <= 1024,
"num_thread must be less than 1024, "
"and fallback is not implemented yet.");
const int32_t shared_mem = (num_thread + 1) * num_experts * sizeof(int32_t) +
(num_experts + 1) * sizeof(int32_t);
if (shared_mem > device_max_shared_mem) {
TORCH_CHECK(false,
"Shared memory usage exceeds device limit, and global memory "
"fallback is not implemented yet.");
}
VLLM_DISPATCH_INTEGRAL_TYPES(
topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] {
dim3 blockDim(num_thread);
auto kernel = moe_lora_align_sum_kernel<scalar_t, int32_t>;
AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
(void*)kernel, shared_mem));
kernel<<<max_loras, blockDim, shared_mem, stream>>>(
topk_ids.data_ptr<scalar_t>(),
token_lora_mapping.data_ptr<int32_t>(), block_size, num_experts,
max_loras, topk_ids.numel(), max_num_tokens_padded,
max_num_m_blocks, sorted_token_ids.data_ptr<int32_t>(),
expert_ids.data_ptr<int32_t>(), topk_num,
num_tokens_post_pad.data_ptr<int32_t>(),
adapter_enabled.data_ptr<int32_t>(), lora_ids.data_ptr<int32_t>());
});
}

View File

@ -27,7 +27,7 @@ void moe_lora_align_block_size(
int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
torch::Tensor lora_ids);
torch::Tensor lora_ids, std::optional<torch::Tensor> maybe_expert_map);
#ifndef USE_ROCM
torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
torch::Tensor b_qweight, torch::Tensor b_scales,

View File

@ -47,7 +47,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
" Tensor !experts_ids,"
" Tensor !num_tokens_post_pad,"
" Tensor !adapter_enabled,"
" Tensor !lora_ids) -> () ");
" Tensor !lora_ids,"
" Tensor? maybe_expert_map) -> () ");
m.impl("moe_lora_align_block_size", torch::kCUDA, &moe_lora_align_block_size);
#ifndef USE_ROCM

View File

@ -262,7 +262,8 @@ void get_cutlass_moe_mm_data(
void get_cutlass_moe_mm_problem_sizes(
const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets);
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
std::optional<bool> force_swap_ab = std::nullopt);
void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1,

View File

@ -0,0 +1,104 @@
// see csrc/quantization/w8a8/cutlass/moe/get_group_starts.cuh
#pragma once
#include <cuda.h>
#include <torch/all.h>
#include <c10/cuda/CUDAStream.h>
#include "core/scalar_type.hpp"
#include "cutlass/bfloat16.h"
#include "cutlass/float8.h"
// ElementB is int32 (packed int4)
// ElementGroupScale is cutlass::Array<cutlass::float_e4m3_t, 8> (packed fp8)
template <typename ElementA, typename ElementB, typename ElementC,
typename ElementAccumulator, typename ElementGroupScale>
__global__ void get_group_gemm_starts(
int64_t* expert_offsets, ElementA** a_offsets, ElementB** b_offsets,
ElementC** out_offsets, ElementAccumulator** a_scales_offsets,
ElementAccumulator** b_scales_offsets,
ElementGroupScale** b_group_scales_offsets, ElementA* a_base_as_int,
ElementB* b_base_as_int, ElementC* out_base_as_int,
ElementAccumulator* a_scales_base_as_int,
ElementAccumulator* b_scales_base_as_int,
ElementGroupScale* b_group_scales_base_as_int, int64_t n, int64_t k,
int64_t scale_k) {
int expert_id = threadIdx.x;
int64_t expert_offset = expert_offsets[expert_id];
// same as w8a8
a_offsets[expert_id] = a_base_as_int + expert_offset * k;
out_offsets[expert_id] = out_base_as_int + expert_offset * n;
a_scales_offsets[expert_id] = a_scales_base_as_int + expert_offset;
b_scales_offsets[expert_id] = b_scales_base_as_int + (n * expert_id);
// w4a8 specific
constexpr int pack_factor = 8; // pack 8 int4 into int32
b_offsets[expert_id] = b_base_as_int + (expert_id * k * n / pack_factor);
b_group_scales_offsets[expert_id] =
b_group_scales_base_as_int + (expert_id * scale_k * n);
}
#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE) \
else if (out_tensors.dtype() == TENSOR_C_TYPE) { \
get_group_gemm_starts<cutlass::float_e4m3_t, int32_t, C_TYPE, float, \
cutlass::Array<cutlass::float_e4m3_t, 8>> \
<<<1, num_experts, 0, stream>>>( \
static_cast<int64_t*>(expert_offsets.data_ptr()), \
static_cast<cutlass::float_e4m3_t**>(a_ptrs.data_ptr()), \
static_cast<int32_t**>(b_ptrs.data_ptr()), \
static_cast<C_TYPE**>(out_ptrs.data_ptr()), \
static_cast<float**>(a_scales_ptrs.data_ptr()), \
static_cast<float**>(b_scales_ptrs.data_ptr()), \
static_cast<cutlass::Array<cutlass::float_e4m3_t, 8>**>( \
b_group_scales_ptrs.data_ptr()), \
static_cast<cutlass::float_e4m3_t*>(a_tensors.data_ptr()), \
static_cast<int32_t*>(b_tensors.data_ptr()), \
static_cast<C_TYPE*>(out_tensors.data_ptr()), \
static_cast<float*>(a_scales.data_ptr()), \
static_cast<float*>(b_scales.data_ptr()), \
static_cast<cutlass::Array<cutlass::float_e4m3_t, 8>*>( \
b_group_scales.data_ptr()), \
n, k, scale_k); \
}
namespace {
void run_get_group_gemm_starts(
torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs,
torch::Tensor& b_ptrs, torch::Tensor& out_ptrs,
torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs,
torch::Tensor& b_group_scales_ptrs, torch::Tensor const& a_tensors,
torch::Tensor const& b_tensors, torch::Tensor& out_tensors,
torch::Tensor const& a_scales, torch::Tensor const& b_scales,
torch::Tensor const& b_group_scales, const int64_t b_group_size) {
TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn);
TORCH_CHECK(b_tensors.dtype() == torch::kInt32); // int4 8x packed into int32
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
TORCH_CHECK(b_group_scales.dtype() ==
torch::kFloat8_e4m3fn); // the underlying torch type is e4m3
TORCH_CHECK(out_tensors.dtype() ==
torch::kBFloat16); // only support bf16 for now
// expect int64_t to avoid overflow during offset calculations
TORCH_CHECK(expert_offsets.dtype() == torch::kInt64);
int num_experts = static_cast<int>(expert_offsets.size(0));
// logical k, n
int64_t n = out_tensors.size(1);
int64_t k = a_tensors.size(1);
int64_t scale_k = cutlass::ceil_div(k, b_group_size);
auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
if (false) {
}
__CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t)
__CALL_GET_STARTS_KERNEL(torch::kFloat16, half)
else {
TORCH_CHECK(false, "Invalid output type (must be float16 or bfloat16)");
}
}
} // namespace

View File

@ -0,0 +1,483 @@
#include <vector>
#include <tuple>
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/gemm/group_array_problem_shape.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/util/packed_stride.hpp"
#include "cutlass/util/mixed_dtype_utils.hpp"
// vllm includes
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <torch/all.h>
#include "cutlass_extensions/torch_utils.hpp"
#include "cutlass_extensions/common.hpp"
#include "core/registration.h"
#include "get_group_starts.cuh"
#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
#include "w4a8_utils.cuh"
namespace vllm::cutlass_w4a8_moe {
using namespace cute;
// -------------------------------------------------------------------------------------
// Static configuration shared across all instantiations
// -------------------------------------------------------------------------------------
using ProblemShape =
cutlass::gemm::GroupProblemShape<Shape<int, int, int>>; // <M,N,K> per
// group
using MmaType = cutlass::float_e4m3_t;
using QuantType = cutlass::int4b_t;
constexpr int TileShapeK = 128 * 8 / sizeof_bits<MmaType>::value;
static int constexpr PackFactor = 8; // 8 int4 packed into int32
// A matrix configuration
using ElementA = MmaType;
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
constexpr int AlignmentA =
128 /
cutlass::sizeof_bits<ElementA>::value; // Alignment of A matrix in units of
// elements (up to 16 bytes)
// B matrix configuration
using ElementB = QuantType; // Element type for B matrix operand
using LayoutB =
cutlass::layout::ColumnMajor; // Layout type for B matrix operand
constexpr int AlignmentB =
128 / cutlass::sizeof_bits<
ElementB>::value; // Memory access granularity/alignment of B
// matrix in units of elements (up to 16 bytes)
// This example manually swaps and transposes, so keep transpose of input
// layouts
using LayoutA_Transpose =
typename cutlass::layout::LayoutTranspose<LayoutA>::type;
using LayoutB_Transpose =
typename cutlass::layout::LayoutTranspose<LayoutB>::type;
// Need to pass a pointer type to make the 3rd dimension of Stride be _0
using StrideA =
cute::remove_pointer_t<cutlass::detail::TagToStrideA_t<LayoutA*>>;
using StrideB =
cute::remove_pointer_t<cutlass::detail::TagToStrideB_t<LayoutB*>>;
// Define the CuTe layout for reoredered quantized tensor B
// LayoutAtomQuant places values that will be read by the same thread in
// contiguous locations in global memory. It specifies the reordering within a
// single warp's fragment
using LayoutAtomQuant =
decltype(cutlass::compute_memory_reordering_atom<MmaType>());
using LayoutB_Reordered = decltype(cute::tile_to_shape(
LayoutAtomQuant{}, Layout<Shape<int, int, Int<1>>, StrideB>{}));
using ElementScale = cutlass::float_e4m3_t;
using LayoutScale = cutlass::layout::RowMajor;
// C/D matrix configuration
using ElementC =
cutlass::bfloat16_t; // Element type for C and D matrix operands
using LayoutC =
cutlass::layout::RowMajor; // Layout type for C and D matrix operands
constexpr int AlignmentC =
128 / cutlass::sizeof_bits<
ElementC>::value; // Memory access granularity/alignment of C
// matrix in units of elements (up to 16 bytes)
// D matrix configuration
using ElementD = ElementC;
using LayoutD = LayoutC;
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
// Core kernel configurations
using ElementAccumulator = float; // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that
// supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
using StageCountType =
cutlass::gemm::collective::StageCountAuto; // Stage count maximized based
// on the tile size
// per-channel and per-token scales for epilogue
using ElementSChannel = float;
template <class TileShape_MN, class ClusterShape_MNK, class KernelSchedule,
class EpilogueSchedule>
struct W4A8GroupedGemmKernel {
using TileShape =
decltype(cute::append(TileShape_MN{}, cute::Int<TileShapeK>{}));
using ClusterShape = ClusterShape_MNK;
// per-channel, per-token scales epilogue
using ChTokScalesEpilogue =
typename vllm::c3x::ScaledEpilogueArray<ElementAccumulator, ElementD,
TileShape>;
using EVTCompute = typename ChTokScalesEpilogue::EVTCompute;
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag, OperatorClass, TileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
ElementSChannel, ElementC,
typename cutlass::layout::LayoutTranspose<LayoutC>::type*, AlignmentC,
ElementD, typename cutlass::layout::LayoutTranspose<LayoutD>::type*,
AlignmentD, EpilogueSchedule, EVTCompute>::CollectiveOp;
// =========================================================== MIXED INPUT
// WITH SCALES
// ===========================================================================
// The Scale information must get paired with the operand that will be scaled.
// In this example, B is scaled so we make a tuple of B's information and the
// scale information.
using CollectiveMainloopShuffled =
typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass,
cute::tuple<ElementB, cutlass::Array<ElementScale, 8>>,
LayoutB_Reordered*, AlignmentB, ElementA, LayoutA_Transpose*,
AlignmentA, ElementAccumulator, TileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
sizeof(typename CollectiveEpilogue::SharedStorage))>,
KernelSchedule>::CollectiveOp;
using GemmKernelShuffled = cutlass::gemm::kernel::GemmUniversal<
ProblemShape, CollectiveMainloopShuffled, CollectiveEpilogue>;
using GemmShuffled =
cutlass::gemm::device::GemmUniversalAdapter<GemmKernelShuffled>;
using StrideC = typename GemmKernelShuffled::InternalStrideC;
using StrideD = typename GemmKernelShuffled::InternalStrideD;
using StrideC_ref = cutlass::detail::TagToStrideC_t<LayoutC>;
using StrideD_ref = cutlass::detail::TagToStrideC_t<LayoutD>;
using StrideS = typename CollectiveMainloopShuffled::StrideScale;
using StrideS_ref = cutlass::detail::TagToStrideB_t<LayoutScale>;
// static asserts for passing in strides/layouts
// pack to 2x int64
static_assert(sizeof(StrideS) == 2 * sizeof(int64_t));
// pack to 3xint32,
static_assert(sizeof(LayoutB_Reordered) % sizeof(int32_t) == 0,
"LayoutB_Reordered size must be divisible by 4 bytes");
static void grouped_mm(
torch::Tensor& out_tensors, const torch::Tensor& a_tensors,
const torch::Tensor& b_tensors, const torch::Tensor& a_scales,
const torch::Tensor& b_scales, const torch::Tensor& b_group_scales,
const int64_t b_group_size, const torch::Tensor& expert_offsets,
const torch::Tensor& problem_sizes_torch, const torch::Tensor& a_strides,
const torch::Tensor& b_strides, const torch::Tensor& c_strides,
const torch::Tensor& group_scale_strides) {
auto device = a_tensors.device();
auto device_id = device.index();
const at::cuda::OptionalCUDAGuard device_guard(device);
auto stream = at::cuda::getCurrentCUDAStream(device_id);
int num_experts = static_cast<int>(expert_offsets.size(0));
int n = static_cast<int>(b_tensors.size(1));
int k = static_cast<int>(b_tensors.size(2)) * PackFactor;
auto options_int =
torch::TensorOptions().dtype(torch::kInt64).device(device);
torch::Tensor a_ptrs = torch::empty(num_experts, options_int);
torch::Tensor b_ptrs = torch::empty(num_experts, options_int);
torch::Tensor out_ptrs = torch::empty(num_experts, options_int);
torch::Tensor a_scales_ptrs = torch::empty(num_experts, options_int);
torch::Tensor b_scales_ptrs = torch::empty(num_experts, options_int);
torch::Tensor b_group_scales_ptrs = torch::empty(num_experts, options_int);
// get the correct offsets to pass to gemm
run_get_group_gemm_starts(expert_offsets, a_ptrs, b_ptrs, out_ptrs,
a_scales_ptrs, b_scales_ptrs, b_group_scales_ptrs,
a_tensors, b_tensors, out_tensors, a_scales,
b_scales, b_group_scales, b_group_size);
// construct args
using Args = typename GemmShuffled::Arguments;
using MainloopArguments = typename GemmKernelShuffled::MainloopArguments;
using EpilogueArguments = typename GemmKernelShuffled::EpilogueArguments;
Args arguments;
ProblemShape::UnderlyingProblemShape* problem_sizes_as_shapes =
static_cast<ProblemShape::UnderlyingProblemShape*>(
problem_sizes_torch.data_ptr());
ProblemShape prob_shape{num_experts, problem_sizes_as_shapes, nullptr};
// SwapAB so B operands come first
MainloopArguments mainloop_arguments{
static_cast<const QuantType**>(b_ptrs.data_ptr()),
static_cast<LayoutB_Reordered*>(b_strides.data_ptr()),
static_cast<const MmaType**>(a_ptrs.data_ptr()),
static_cast<StrideA*>(a_strides.data_ptr()),
static_cast<const cutlass::Array<ElementScale, 8>**>(
b_group_scales_ptrs.data_ptr()),
static_cast<StrideS*>(group_scale_strides.data_ptr()),
static_cast<int>(b_group_size)};
EpilogueArguments epilogue_arguments{
// since we are doing SwapAB the channel scales comes first, then token
// scales
ChTokScalesEpilogue::prepare_args( // see ScaledEpilogueArray
static_cast<const ElementAccumulator**>(
b_scales_ptrs.data_ptr()), // per-channel
static_cast<const ElementAccumulator**>(
a_scales_ptrs.data_ptr()), // per-token
true, true),
nullptr, // C
static_cast<StrideC*>(c_strides.data_ptr()), // C
static_cast<ElementD**>(out_ptrs.data_ptr()), // D
static_cast<StrideC*>(c_strides.data_ptr()) // D
};
static const cutlass::KernelHardwareInfo hw_info{
device_id,
cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
device_id)};
arguments = Args{cutlass::gemm::GemmUniversalMode::kGrouped, prob_shape,
mainloop_arguments, epilogue_arguments, hw_info};
// Allocate workspace
size_t workspace_size = GemmShuffled::get_workspace_size(arguments);
torch::Tensor workspace =
torch::empty(workspace_size,
torch::TensorOptions().dtype(torch::kU8).device(device));
// Run GEMM
GemmShuffled gemm;
CUTLASS_CHECK(gemm.can_implement(arguments));
CUTLASS_CHECK(gemm.initialize(arguments, workspace.data_ptr(), stream));
CUTLASS_CHECK(gemm.run(stream));
}
};
// ----------------------------------------------------------------------------
// Kernel instantiations and dispatch logic
// ----------------------------------------------------------------------------
using Coop = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperative;
using CoopEpi = cutlass::epilogue::PtrArrayTmaWarpSpecializedCooperative;
// Kernel_TileShape_ClusterShape_Schedule
using Kernel_128x16_1x1x1_Coop =
W4A8GroupedGemmKernel<Shape<_128, _16>, Shape<_1, _1, _1>, Coop, CoopEpi>;
using Kernel_128x16_2x1x1_Coop =
W4A8GroupedGemmKernel<Shape<_128, _16>, Shape<_2, _1, _1>, Coop, CoopEpi>;
using Kernel_256x16_1x1x1_Coop =
W4A8GroupedGemmKernel<Shape<_256, _16>, Shape<_1, _1, _1>, Coop, CoopEpi>;
using Kernel_256x16_2x1x1_Coop =
W4A8GroupedGemmKernel<Shape<_256, _16>, Shape<_2, _1, _1>, Coop, CoopEpi>;
using Kernel_256x32_1x1x1_Coop =
W4A8GroupedGemmKernel<Shape<_256, _32>, Shape<_1, _1, _1>, Coop, CoopEpi>;
using Kernel_256x32_2x1x1_Coop =
W4A8GroupedGemmKernel<Shape<_256, _32>, Shape<_2, _1, _1>, Coop, CoopEpi>;
using Kernel_256x64_1x1x1_Coop =
W4A8GroupedGemmKernel<Shape<_256, _64>, Shape<_1, _1, _1>, Coop, CoopEpi>;
using Kernel_256x64_2x1x1_Coop =
W4A8GroupedGemmKernel<Shape<_256, _64>, Shape<_2, _1, _1>, Coop, CoopEpi>;
using Kernel_256x128_1x1x1_Coop =
W4A8GroupedGemmKernel<Shape<_256, _128>, Shape<_1, _1, _1>, Coop, CoopEpi>;
using Kernel_256x128_2x1x1_Coop =
W4A8GroupedGemmKernel<Shape<_256, _128>, Shape<_2, _1, _1>, Coop, CoopEpi>;
using Kernel_128x256_2x1x1_Coop =
W4A8GroupedGemmKernel<Shape<_128, _256>, Shape<_2, _1, _1>, Coop, CoopEpi>;
void mm_dispatch(
torch::Tensor& out_tensors, const torch::Tensor& a_tensors,
const torch::Tensor& b_tensors, const torch::Tensor& a_scales,
const torch::Tensor& b_scales, const torch::Tensor& b_group_scales,
const int64_t b_group_size, const torch::Tensor& expert_offsets,
const torch::Tensor& problem_sizes, const torch::Tensor& a_strides,
const torch::Tensor& b_strides, const torch::Tensor& c_strides,
const torch::Tensor& group_scale_strides, const std::string& schedule) {
if (schedule == "Kernel_128x16_1x1x1_Coop") {
Kernel_128x16_1x1x1_Coop::grouped_mm(
out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
c_strides, group_scale_strides);
} else if (schedule == "Kernel_128x16_2x1x1_Coop") {
Kernel_128x16_2x1x1_Coop::grouped_mm(
out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
c_strides, group_scale_strides);
} else if (schedule == "Kernel_256x16_1x1x1_Coop") {
Kernel_256x16_1x1x1_Coop::grouped_mm(
out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
c_strides, group_scale_strides);
} else if (schedule == "Kernel_256x16_2x1x1_Coop") {
Kernel_256x16_2x1x1_Coop::grouped_mm(
out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
c_strides, group_scale_strides);
} else if (schedule == "Kernel_256x32_1x1x1_Coop") {
Kernel_256x32_1x1x1_Coop::grouped_mm(
out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
c_strides, group_scale_strides);
} else if (schedule == "Kernel_256x32_2x1x1_Coop") {
Kernel_256x32_2x1x1_Coop::grouped_mm(
out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
c_strides, group_scale_strides);
} else if (schedule == "Kernel_256x64_1x1x1_Coop") {
Kernel_256x64_1x1x1_Coop::grouped_mm(
out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
c_strides, group_scale_strides);
} else if (schedule == "Kernel_256x64_2x1x1_Coop") {
Kernel_256x64_2x1x1_Coop::grouped_mm(
out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
c_strides, group_scale_strides);
} else if (schedule == "Kernel_256x128_1x1x1_Coop") {
Kernel_256x128_1x1x1_Coop::grouped_mm(
out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
c_strides, group_scale_strides);
} else if (schedule == "Kernel_256x128_2x1x1_Coop") {
Kernel_256x128_2x1x1_Coop::grouped_mm(
out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
c_strides, group_scale_strides);
} else if (schedule == "Kernel_128x256_2x1x1_Coop") {
Kernel_128x256_2x1x1_Coop::grouped_mm(
out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
c_strides, group_scale_strides);
} else {
TORCH_CHECK(false,
"cutlass_w4a8_moe_mm: unknown schedule string: ", schedule);
}
}
void mm(torch::Tensor& out_tensors, const torch::Tensor& a_tensors,
const torch::Tensor& b_tensors, const torch::Tensor& a_scales,
const torch::Tensor& b_scales, const torch::Tensor& b_group_scales,
const int64_t b_group_size, const torch::Tensor& expert_offsets,
const torch::Tensor& problem_sizes, const torch::Tensor& a_strides,
const torch::Tensor& b_strides, const torch::Tensor& c_strides,
const torch::Tensor& group_scale_strides,
std::optional<std::string> maybe_schedule) {
// user has specified a schedule
if (maybe_schedule) {
mm_dispatch(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
b_group_scales, b_group_size, expert_offsets, problem_sizes,
a_strides, b_strides, c_strides, group_scale_strides,
*maybe_schedule);
return;
}
// use heuristic
int m_full = a_tensors.size(0);
int n = b_tensors.size(1);
int k = b_tensors.size(2) * PackFactor; // logical k
int num_experts = b_tensors.size(0);
// per-expert batch size assuming uniform distribution
int m_expert = m_full / num_experts;
std::string schedule;
if (m_expert <= 16) {
schedule = "Kernel_128x16_2x1x1_Coop";
} else if (m_expert <= 32) {
schedule = "Kernel_256x32_1x1x1_Coop";
} else if (m_expert <= 64) {
schedule = "Kernel_256x64_1x1x1_Coop";
} else if (m_expert <= 128) {
schedule = "Kernel_256x128_2x1x1_Coop";
} else { // m_expert > 128
schedule = "Kernel_128x256_2x1x1_Coop";
}
mm_dispatch(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
b_group_scales, b_group_size, expert_offsets, problem_sizes,
a_strides, b_strides, c_strides, group_scale_strides, schedule);
}
std::tuple<torch::Tensor, torch::Tensor> encode_and_reorder_int4b(
torch::Tensor const& b_tensors) {
TORCH_CHECK(b_tensors.dtype() == torch::kInt32);
TORCH_CHECK(b_tensors.dim() == 3); // (experts, n, k)
TORCH_CHECK(b_tensors.is_contiguous());
TORCH_CHECK(b_tensors.is_cuda());
int n = static_cast<int>(b_tensors.size(1));
int k = static_cast<int>(b_tensors.size(2)) * PackFactor; // logical k
// CUTLASS reorder_tensor requires k % 256 == 0 and n % 16 == 0.
// These misalignments cause silent OOB unless run under Compute Sanitizer.
TORCH_CHECK(k % 256 == 0, "logical k must be divisible by 256");
TORCH_CHECK(n % 16 == 0, "n must be divisible by 16");
// we will store the layout to an int32 tensor;
// this is the number of elements we need per layout
constexpr size_t layout_width = sizeof(LayoutB_Reordered) / sizeof(int32_t);
torch::Tensor b_tensors_packed = torch::empty_like(b_tensors);
int num_experts = static_cast<int>(b_tensors.size(0));
auto b_ptr = static_cast<QuantType const*>(b_tensors.const_data_ptr());
auto b_packed_ptr = static_cast<QuantType*>(b_tensors_packed.data_ptr());
// multiply by ull so result does not overflow int32
size_t num_int4_elems = 1ull * num_experts * n * k;
bool ok = vllm::cutlass_w4a8_utils::unified_encode_int4b(b_ptr, b_packed_ptr,
num_int4_elems);
TORCH_CHECK(ok, "unified_encode_int4b failed");
// construct the layout once; assumes each expert has the same layout
using LayoutType = LayoutB_Reordered;
std::vector<LayoutType> layout_B_reordered_host(num_experts);
auto stride_B = cutlass::make_cute_packed_stride(StrideB{}, {n, k, Int<1>{}});
auto shape_B = cute::make_shape(n, k, Int<1>{});
auto layout_B = make_layout(shape_B, stride_B);
LayoutType layout_B_reordered = tile_to_shape(LayoutAtomQuant{}, shape_B);
// reorder weights for each expert
for (int i = 0; i < num_experts; i++) {
// since the storage type of int4b is 1 byte but one element is 4 bits
// we need to adjust the offset
int64_t offset =
1ull * i * n * k * cutlass::sizeof_bits<QuantType>::value / 8;
cutlass::reorder_tensor(b_packed_ptr + offset, layout_B,
layout_B_reordered);
}
// save the packed layout to torch tensor so we can re-use it
auto cpu_opts =
torch::TensorOptions().dtype(torch::kInt32).device(torch::kCPU);
torch::Tensor layout_cpu =
torch::empty({num_experts, layout_width}, cpu_opts);
int32_t* layout_data = layout_cpu.data_ptr<int32_t>();
for (int i = 0; i < num_experts; ++i) {
std::memcpy(layout_data + i * layout_width, // dst (int32*)
&layout_B_reordered, // src (LayoutType*)
sizeof(LayoutType)); // number of bytes
}
torch::Tensor packed_layout =
layout_cpu.to(b_tensors.device(), /*non_blocking=*/false);
return {b_tensors_packed, packed_layout};
}
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
m.impl("cutlass_w4a8_moe_mm", &mm);
m.impl("cutlass_encode_and_reorder_int4b_grouped", &encode_and_reorder_int4b);
}
} // namespace vllm::cutlass_w4a8_moe
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -7,6 +7,7 @@
#include <c10/cuda/CUDAGuard.h>
#include <torch/all.h>
#include "cutlass_extensions/torch_utils.hpp"
#include "w4a8_utils.cuh"
#include "core/registration.h"
@ -395,71 +396,6 @@ torch::Tensor pack_scale_fp8(torch::Tensor const& scales) {
return packed_scales;
}
/*
GPU-accelerated implementation of cutlass::unified_encode_int4b.
Constructs a lookup table in constant memory to map 8 bits
(two 4-bit values) at a time. Assumes memory is contiguous
and pointers are 16-byte aligned.
*/
__constant__ uint8_t kNibbleLUT[256];
__global__ void unified_encode_int4b_device(const uint8_t* in, uint8_t* out,
size_t nbytes) {
constexpr size_t V = sizeof(uint4); // 16 bytes
const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
const size_t nthreads = size_t(gridDim.x) * blockDim.x;
const size_t nvec = nbytes / V;
// 1-D grid-stride loop over 16-byte chunks
for (size_t vec = tid; vec < nvec; vec += nthreads) {
uint4 v = reinterpret_cast<const uint4*>(in)[vec];
uint8_t* b = reinterpret_cast<uint8_t*>(&v);
#pragma unroll
for (int i = 0; i < int(V); ++i) b[i] = kNibbleLUT[b[i]];
reinterpret_cast<uint4*>(out)[vec] = v;
}
}
static bool upload_lut() {
std::array<uint8_t, 256> lut{};
auto map_nib = [](uint8_t v) -> uint8_t {
// 1..7 -> (8 - v); keep 0 and 8..15
return (v == 0 || (v & 0x8)) ? v : uint8_t(8 - v);
};
for (int b = 0; b < 256; ++b) {
uint8_t lo = b & 0xF;
uint8_t hi = (b >> 4) & 0xF;
lut[b] = uint8_t((map_nib(hi) << 4) | map_nib(lo));
}
cudaError_t e = cudaMemcpyToSymbol(kNibbleLUT, lut.data(), lut.size(),
/*offset=*/0, cudaMemcpyHostToDevice);
return (e == cudaSuccess);
}
static bool unified_encode_int4b(cutlass::int4b_t const* in,
cutlass::int4b_t* out, size_t num_int4_elems) {
// Build/upload LUT
if (!upload_lut()) return false;
static_assert(sizeof(typename cutlass::int4b_t::Storage) == 1,
"int4 storage must be 1 byte");
const size_t nbytes = num_int4_elems >> 1;
auto* in_bytes = reinterpret_cast<uint8_t const*>(in);
auto* out_bytes = reinterpret_cast<uint8_t*>(out);
// kernel launch params
constexpr int block = 256;
const size_t nvec = nbytes / sizeof(uint4); // # of 16B vectors
int grid = int((nvec + block - 1) / block);
if (grid == 0) grid = 1; // ensure we still cover the tail in the kernel
unified_encode_int4b_device<<<grid, block>>>(in_bytes, out_bytes, nbytes);
cudaError_t err = cudaGetLastError();
return (err == cudaSuccess);
}
torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) {
TORCH_CHECK(B.dtype() == torch::kInt32);
TORCH_CHECK(B.dim() == 2);
@ -477,8 +413,8 @@ torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) {
LayoutB_Reordered layout_B_reordered =
cute::tile_to_shape(LayoutAtomQuant{}, shape_B);
bool ok =
vllm::cutlass_w4a8::unified_encode_int4b(B_ptr, B_packed_ptr, n * k);
bool ok = vllm::cutlass_w4a8_utils::unified_encode_int4b(B_ptr, B_packed_ptr,
n * k);
TORCH_CHECK(ok, "unified_encode_int4b failed");
cutlass::reorder_tensor(B_packed_ptr, layout_B, layout_B_reordered);

View File

@ -0,0 +1,90 @@
#include "w4a8_utils.cuh"
#include <array>
#include <cuda_runtime.h>
#include <cstdio>
namespace vllm::cutlass_w4a8_utils {
/*
GPU-accelerated implementation of cutlass::unified_encode_int4b.
Constructs a lookup table in constant memory to map 8 bits
(two 4-bit values) at a time. Assumes memory is contiguous
and pointers are 16-byte aligned.
*/
__constant__ uint8_t kNibbleLUT[256];
__global__ void unified_encode_int4b_device(const uint8_t* in, uint8_t* out,
size_t nbytes) {
constexpr size_t V = sizeof(uint4); // 16 bytes
const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
const size_t nthreads = size_t(gridDim.x) * blockDim.x;
const size_t nvec = nbytes / V;
// 1-D grid-stride loop over 16-byte chunks
for (size_t vec = tid; vec < nvec; vec += nthreads) {
uint4 v = reinterpret_cast<const uint4*>(in)[vec];
uint8_t* b = reinterpret_cast<uint8_t*>(&v);
#pragma unroll
for (int i = 0; i < int(V); ++i) b[i] = kNibbleLUT[b[i]];
reinterpret_cast<uint4*>(out)[vec] = v;
}
}
static bool upload_lut() {
std::array<uint8_t, 256> lut{};
auto map_nib = [](uint8_t v) -> uint8_t {
// 1..7 -> (8 - v); keep 0 and 8..15
return (v == 0 || (v & 0x8)) ? v : uint8_t(8 - v);
};
for (int b = 0; b < 256; ++b) {
uint8_t lo = b & 0xF;
uint8_t hi = (b >> 4) & 0xF;
lut[b] = uint8_t((map_nib(hi) << 4) | map_nib(lo));
}
cudaError_t e = cudaMemcpyToSymbol(kNibbleLUT, lut.data(), lut.size(),
/*offset=*/0, cudaMemcpyHostToDevice);
return (e == cudaSuccess);
}
bool unified_encode_int4b(cutlass::int4b_t const* in, cutlass::int4b_t* out,
size_t num_int4_elems) {
// Build/upload LUT
if (!upload_lut()) return false;
static_assert(sizeof(typename cutlass::int4b_t::Storage) == 1,
"int4 storage must be 1 byte");
const size_t nbytes = num_int4_elems >> 1;
auto* in_bytes = reinterpret_cast<uint8_t const*>(in);
auto* out_bytes = reinterpret_cast<uint8_t*>(out);
// kernel launch params
constexpr int block = 256;
const size_t nvec = nbytes / sizeof(uint4); // # of 16B vectors
int grid = int((nvec + block - 1) / block);
if (grid == 0) grid = 1; // ensure we still cover the tail in the kernel
unified_encode_int4b_device<<<grid, block>>>(in_bytes, out_bytes, nbytes);
// launch errors
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("unified_encode_int4b_device launch error: %s (%d)\n",
cudaGetErrorString(err), err);
return false;
}
// runtime errors
err = cudaDeviceSynchronize();
if (err != cudaSuccess) {
printf("unified_encode_int4b_device runtime error: %s (%d)\n",
cudaGetErrorString(err), err);
return false;
}
return true;
}
} // namespace vllm::cutlass_w4a8_utils

View File

@ -0,0 +1,11 @@
#pragma once
#include <cstddef>
#include "cutlass/numeric_types.h"
namespace vllm::cutlass_w4a8_utils {
bool unified_encode_int4b(cutlass::int4b_t const* in, cutlass::int4b_t* out,
size_t num_int4_elems);
} // namespace vllm::cutlass_w4a8_utils

View File

@ -617,7 +617,7 @@ struct MacheteCollectiveMma {
// Same as upstream, should be kept the same when possible, not formatted for
// easier comparison
// with `SwapAB ? N : M -> M` since we dont support SwapAB
// with `SwapAB ? N : M -> M` since we don't support SwapAB
// clang-format off
template<class ProblemShape>
static bool

View File

@ -136,15 +136,17 @@ inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids,
void get_cutlass_moe_mm_problem_sizes_caller(
const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets) {
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
std::optional<bool> force_swap_ab = std::nullopt) {
auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index());
auto options_int32 =
torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device());
torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32);
// Swap-AB should be disabled for FP4 path
bool may_swap_ab = (!blockscale_offsets.has_value()) &&
(topk_ids.numel() <= SWAP_AB_THRESHOLD);
bool may_swap_ab =
force_swap_ab.value_or((!blockscale_offsets.has_value()) &&
(topk_ids.numel() <= SWAP_AB_THRESHOLD));
launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2,
atomic_buffer, num_experts, n, k, stream,

View File

@ -80,7 +80,8 @@ void get_cutlass_moe_mm_data_caller(
void get_cutlass_moe_mm_problem_sizes_caller(
const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets);
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
std::optional<bool> force_swap_ab = std::nullopt);
void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1,
@ -303,14 +304,15 @@ void get_cutlass_moe_mm_data(
void get_cutlass_moe_mm_problem_sizes(
const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets) {
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
std::optional<bool> force_swap_ab = std::nullopt) {
int32_t version_num = get_sm_version_num();
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1,
problem_sizes2, num_experts, n, k,
blockscale_offsets);
blockscale_offsets, force_swap_ab);
return;
#endif
TORCH_CHECK_NOT_IMPLEMENTED(

View File

@ -1241,33 +1241,16 @@ __global__ void wvSplitK_hf_big_(const int K, const int M, const int Bx,
}
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
// Find the min val of div2 that doesn't increase N/(div1*div2)
int mindiv(int N, int div1, int div2) {
int nPrRnd = div1 * div2;
int rnds0 = N / nPrRnd;
nPrRnd -= div1 * 3;
int rnds3 = N / nPrRnd;
nPrRnd -= div1;
int rnds4 = N / nPrRnd;
nPrRnd -= div1;
int rnds5 = N / nPrRnd;
nPrRnd -= div1;
int rnds6 = N / nPrRnd;
nPrRnd -= div1;
int rnds7 = N / nPrRnd;
nPrRnd -= div1;
int rnds8 = N / nPrRnd;
nPrRnd -= div1;
int rnds9 = N / nPrRnd;
nPrRnd -= div1;
int rtn = div2;
if (rnds0 == rnds3) rtn = div2 - 3;
if (rnds0 == rnds4) rtn = div2 - 4;
if (rnds0 == rnds5) rtn = div2 - 5;
if (rnds0 == rnds6) rtn = div2 - 6;
if (rnds0 == rnds7) rtn = div2 - 7;
if (rnds0 == rnds8) rtn = div2 - 8;
if (rnds0 == rnds9) rtn = div2 - 9;
return rtn;
int rnds[13];
for (int i = 0; i < 13; i++) {
rnds[i] = (N + nPrRnd - 1) / nPrRnd;
nPrRnd -= div1;
}
for (int i = 12; i >= 0; i--)
if (rnds[0] == rnds[i]) return (div2 - i);
}
torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
@ -1300,26 +1283,37 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int max_lds_len = get_lds_size() / 2;
#define WVSPLITK(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \
_N) \
{ \
dim3 block(64, _WvPrGrp); \
if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \
wvSplitK_hf_sml_<fptype, 64, _YTILEs, _WvPrGrp, 8, _UNRLs, _N> \
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
biasf4, c, __wvPrGrp, CuCount); \
} else if (K_in * N_in <= max_lds_len * 1.2) { \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \
wvSplitK_hf_<fptype, 64, _YTILEm, _WvPrGrp, 8, _UNRLm, _N> \
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
biasf4, c, __wvPrGrp, CuCount); \
} else { \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEb, _WvPrGrp); \
wvSplitK_hf_big_<fptype, 64, _YTILEb, _WvPrGrp, 8, _UNRLb, _N> \
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
biasf4, c, __wvPrGrp, CuCount); \
} \
#define WVSPLITK(_YTILE, _UNRL, _N) \
{ \
dim3 block(64, 16); \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILE, 16); \
if ((K_in * N_in <= max_lds_len) && (M_in % _YTILE == 0)) \
wvSplitK_hf_sml_<fptype, 64, _YTILE, 16, 8, _UNRL, _N> \
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
biasf4, c, __wvPrGrp, CuCount); \
else if (K_in * N_in <= max_lds_len * 1.2) \
wvSplitK_hf_<fptype, 64, _YTILE, 16, 8, _UNRL, _N> \
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
biasf4, c, __wvPrGrp, CuCount); \
else \
wvSplitK_hf_big_<fptype, 64, _YTILE, 16, 8, _UNRL, _N> \
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
biasf4, c, __wvPrGrp, CuCount); \
}
#define WVSPLIT_TILE(_sYT, __N) \
{ \
bool fit_lds = (K_in * N_in <= max_lds_len); \
if (_sYT <= 1) \
WVSPLITK(1, 4, __N) \
else if ((__N == 1) || (!fit_lds) || (_sYT <= 4 * 2)) \
WVSPLITK(2, 2, __N) \
else if (_sYT <= 4 * 3) \
WVSPLITK(3, 2, __N) \
else if (__N == 4) \
WVSPLITK(4, 1, __N) \
else \
WVSPLITK(4, 2, __N) \
}
AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "wvSplitK", [&] {
@ -1331,18 +1325,23 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
? reinterpret_cast<const fptype*>(in_bias->data_ptr())
: nullptr;
fptype* c = reinterpret_cast<fptype*>(out_c.data_ptr());
// first shoot for biggest tile-size that keeps all simd busy,
// then cut the active waves to balance their distribution...
int sYT = (M_in + CuCount * 4 - 1) / (CuCount * 4);
switch (N_in) {
case 1:
WVSPLITK(16, 2, 2, 2, 2, 2, 2, 1)
WVSPLIT_TILE(sYT, 1)
break;
case 2:
WVSPLITK(16, 2, 2, 2, 2, 2, 2, 2)
WVSPLIT_TILE(sYT, 2)
break;
case 3:
WVSPLITK(16, 4, 7, 7, 1, 1, 1, 3)
WVSPLIT_TILE(sYT, 3)
break;
case 4:
WVSPLITK(16, 4, 7, 7, 1, 1, 1, 4)
WVSPLIT_TILE(sYT, 4)
break;
default:
throw std::runtime_error(

View File

@ -350,6 +350,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def("cutlass_encode_and_reorder_int4b(Tensor B) -> Tensor");
// conditionally compiled so impl registration is in source file
// CUTLASS w4a8 grouped GEMM
ops.def(
"cutlass_w4a8_moe_mm("
" Tensor! out_tensors,"
" Tensor a_tensors,"
" Tensor b_tensors,"
" Tensor a_scales,"
" Tensor b_scales,"
" Tensor b_group_scales,"
" int b_group_size,"
" Tensor expert_offsets,"
" Tensor problem_sizes,"
" Tensor a_strides,"
" Tensor b_strides,"
" Tensor c_strides,"
" Tensor group_scale_strides,"
" str? maybe_schedule"
") -> ()");
ops.def(
"cutlass_encode_and_reorder_int4b_grouped(Tensor b_tensors) -> (Tensor, "
"Tensor)");
// conditionally compiled so impl registration is in source file
#endif
// Dequantization for GGML.
@ -466,7 +489,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor! problem_sizes1, "
" Tensor! problem_sizes2, "
" int num_experts, int n, int k, "
" Tensor? blockscale_offsets) -> ()");
" Tensor? blockscale_offsets, "
" bool? force_swap_ab) -> ()");
ops.impl("get_cutlass_moe_mm_problem_sizes", torch::kCUDA,
&get_cutlass_moe_mm_problem_sizes);

View File

@ -15,6 +15,7 @@ API documentation for vLLM's configuration classes.
- [vllm.config.MultiModalConfig][]
- [vllm.config.PoolerConfig][]
- [vllm.config.StructuredOutputsConfig][]
- [vllm.config.ProfilerConfig][]
- [vllm.config.ObservabilityConfig][]
- [vllm.config.KVTransferConfig][]
- [vllm.config.CompilationConfig][]

View File

@ -84,7 +84,7 @@ Total input tokens: 1369
Total generated tokens: 2212
Request throughput (req/s): 1.73
Output token throughput (tok/s): 382.89
Total Token throughput (tok/s): 619.85
Total token throughput (tok/s): 619.85
---------------Time to First Token----------------
Mean TTFT (ms): 71.54
Median TTFT (ms): 73.88

View File

@ -5,16 +5,15 @@
## Profile with PyTorch Profiler
We support tracing vLLM workers using the `torch.profiler` module. You can enable tracing by setting the `VLLM_TORCH_PROFILER_DIR` environment variable to the directory where you want to save the traces: `VLLM_TORCH_PROFILER_DIR=/mnt/traces/`. Additionally, you can control the profiling content by specifying the following environment variables:
We support tracing vLLM workers using the `torch.profiler` module. You can enable the torch profiler by setting `--profiler-config`
when launching the server, and setting the entries `profiler` to `'torch'` and `torch_profiler_dir` to the directory where you want to save the traces. Additionally, you can control the profiling content by specifying the following additional arguments in the config:
- `VLLM_TORCH_PROFILER_RECORD_SHAPES=1` to enable recording Tensor Shapes, off by default
- `VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1` to record memory, off by default
- `VLLM_TORCH_PROFILER_WITH_STACK=1` to enable recording stack information, on by default
- `VLLM_TORCH_PROFILER_WITH_FLOPS=1` to enable recording FLOPs, off by default
- `VLLM_TORCH_PROFILER_USE_GZIP=0` to disable gzip-compressing profiling files, on by default
- `VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL=0` to disable dumping and printing the aggregated CUDA self time table, on by default
The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set.
- `torch_profiler_record_shapes` to enable recording Tensor Shapes, off by default
- `torch_profiler_with_memory` to record memory, off by default
- `torch_profiler_with_stack` to enable recording stack information, on by default
- `torch_profiler_with_flops` to enable recording FLOPs, off by default
- `torch_profiler_use_gzip` to control gzip-compressing profiling files, on by default
- `torch_profiler_dump_cuda_time_total` to control dumping and printing the aggregated CUDA self time table, on by default
When using `vllm bench serve`, you can enable profiling by passing the `--profile` flag.
@ -40,8 +39,7 @@ Refer to [examples/offline_inference/simple_profiling.py](../../examples/offline
#### OpenAI Server
```bash
VLLM_TORCH_PROFILER_DIR=./vllm_profile \
vllm serve meta-llama/Llama-3.1-8B-Instruct
vllm serve meta-llama/Llama-3.1-8B-Instruct --profiler-config '{"profiler": "torch", "torch_profiler_dir": "./vllm_profile"}'
```
vllm bench command:
@ -104,13 +102,12 @@ To profile the server, you will want to prepend your `vllm serve` command with `
```bash
# server
VLLM_TORCH_CUDA_PROFILE=1 \
nsys profile \
--trace-fork-before-exec=true \
--cuda-graph-trace=node \
--capture-range=cudaProfilerApi \
--capture-range-end repeat \
vllm serve meta-llama/Llama-3.1-8B-Instruct
vllm serve meta-llama/Llama-3.1-8B-Instruct --profiler-config.profiler cuda
# client
vllm bench serve \

View File

@ -21,30 +21,20 @@ The mental model is that server-level metrics help explain the values of request
### v1 Metrics
In v1, the following metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix:
In v1, an extensive set of metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix, for example:
- `vllm:num_requests_running` (Gauge) - Number of requests currently running.
- `vllm:num_requests_waiting` (Gauge) - Number of requests currently waiting.
- `vllm:kv_cache_usage_perc` (Gauge) - Fraction of used KV cache blocks (01).
- `vllm:prefix_cache_queries` (Counter) - Number of prefix cache queries.
- `vllm:prefix_cache_hits` (Counter) - Number of prefix cache hits.
- `vllm:mm_cache_queries` (Counter) - (For multimodal models) Number of multimodal cache queries.
- `vllm:mm_cache_hits` (Counter) - (For multimodal models) Number of multimodal cache hits.
- `vllm:num_preemptions_total` (Counter) - Number of preemptions.
- `vllm:prompt_tokens_total` (Counter) - Total number of prompt tokens processed.
- `vllm:generation_tokens_total` (Counter) - Total number of generated tokens.
- `vllm:iteration_tokens_total` (Histogram) - Histogram of tokens processed in each engine step.
- `vllm:cache_config_info` (Gauge) - Information about the cache configuration.
- `vllm:request_success_total` (Counter) - Number of finished requests (by finish reason).
- `vllm:request_prompt_tokens` (Histogram) - Histogram of input prompt token counts.
- `vllm:request_generation_tokens` (Histogram) - Histogram of generation token counts.
- `vllm:request_params_n` (Histogram) - Histogram of request parameter n.
- `vllm:request_params_max_tokens` - (Histogram) - Histogram of max_tokens parameter in requests.
- `vllm:time_to_first_token_seconds` (Histogram) - Time to first token (TTFT).
- `vllm:inter_token_latency_seconds` (Histogram) - Inter-token latency.
- `vllm:e2e_request_latency_seconds` (Histogram) - End-to-end request latency.
- `vllm:request_queue_time_seconds` (Histogram) - Time spent in the queue.
- `vllm:request_inference_time_seconds` (Histogram) - Request inference time.
- `vllm:request_prefill_time_seconds` (Histogram) - Request prefill time.
- `vllm:request_decode_time_seconds` (Histogram) - Request decode time.

View File

@ -152,5 +152,5 @@ The interface for the model/module may change during vLLM's development. If you
## Deprecation announcement
!!! warning "Deprecations"
- `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It will be removed in v0.13.0 or v1.0.0.
- `_Backend` in `vllm.attention` is deprecated. It will be removed in v0.13.0 or v1.0.0. Please use `vllm.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.
- `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0.
- `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.

View File

@ -68,8 +68,8 @@ th:not(:first-child) {
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | [](https://github.com/vllm-project/vllm/issues/26970) |
| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | [🟠](https://github.com/vllm-project/vllm/issues/26965) |
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ |
| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | |
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ |
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ |

View File

@ -32,14 +32,14 @@ Design doc: <https://docs.google.com/document/d/1aed8KtC6XkXtdoV87pWT0a8OJlZ-Cpn
## 2 Usage Example
The current reference pathway is **SharedStorageConnector**.
The current reference pathway is **ExampleConnector**.
Below ready-to-run scripts shows the workflow:
1 Encoder instance + 1 PD instance:
`examples/online_serving/disaggregated_encoder/shared_storage_connector/disagg_encoder_example.sh`
`examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh`
1 Encoder instance + 1 Prefill instance + 1 Decode instance:
`examples/online_serving/disaggregated_encoder/shared_storage_connector/disagg_epd_example.sh`
`examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh`
---

View File

@ -21,14 +21,14 @@ Please refer to [examples/online_serving/disaggregated_prefill.sh](../../example
Now supports 5 types of connectors:
- **SharedStorageConnector**: refer to [examples/offline_inference/disaggregated-prefill-v1/run.sh](../../examples/offline_inference/disaggregated-prefill-v1/run.sh) for the example usage of SharedStorageConnector disaggregated prefilling.
- **ExampleConnector**: refer to [examples/offline_inference/disaggregated-prefill-v1/run.sh](../../examples/offline_inference/disaggregated-prefill-v1/run.sh) for the example usage of ExampleConnector disaggregated prefilling.
- **LMCacheConnectorV1**: refer to [examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh](../../examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh) for the example usage of LMCacheConnectorV1 disaggregated prefilling which uses NIXL as the underlying KV transmission.
- **NixlConnector**: refer to [tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh) for the example usage of NixlConnector disaggregated prefilling which support fully async send/recv. For detailed usage guide, see [NixlConnector Usage Guide](nixl_connector_usage.md).
- **P2pNcclConnector**: refer to [examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh](../../examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh) for the example usage of P2pNcclConnector disaggregated prefilling.
- **MultiConnector**: take advantage of the kv_connector_extra_config: dict[str, Any] already present in KVTransferConfig to stash all the connectors we want in an ordered list of kwargs.such as:
```bash
--kv-transfer-config '{"kv_connector":"MultiConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"NixlConnector","kv_role":"kv_both"},{"kv_connector":"SharedStorageConnector","kv_role":"kv_both","kv_connector_extra_config":{"shared_storage_path":"local_storage"}}]}}'
--kv-transfer-config '{"kv_connector":"MultiConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"NixlConnector","kv_role":"kv_both"},{"kv_connector":"ExampleConnector","kv_role":"kv_both","kv_connector_extra_config":{"shared_storage_path":"local_storage"}}]}}'
```
For NixlConnector, you may also specify one or multiple NIXL_Backend. Such as:

View File

@ -22,7 +22,7 @@ python tools/install_nixl_from_source_ubuntu.py
NixlConnector uses NIXL library for underlying communication, which supports multiple transport backends. UCX (Unified Communication X) is the primary default transport library used by NIXL. Configure transport environment variables:
```bash
# Example UCX configuration, adjust according to your enviroment
# Example UCX configuration, adjust according to your environment
export UCX_TLS=all # or specify specific transports like "rc,ud,sm,^cuda_ipc" ..etc
export UCX_NET_DEVICES=all # or specify network devices like "mlx5_0:1,mlx5_1:1"
```
@ -146,6 +146,8 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \
--decoder-ports 8000 8000
```
For multi-host DP deployment, only need to provide the host/port of the head instances.
### KV Role Options
- **kv_producer**: For prefiller instances that generate KV caches

View File

@ -299,6 +299,9 @@ Additionally, to enable structured output, you'll need to create a new `Reasoner
def is_reasoning_end(self, input_ids: list[int]) -> bool:
return self.end_token_id in input_ids
def is_reasoning_end_streaming(self, input_ids: list[int], delta_ids: list[int]) -> bool:
return self.end_token_id in delta_token_ids
...
```

View File

@ -61,7 +61,7 @@ Now let´s see an example for each of the cases, starting with the `choice`, as
print(completion.choices[0].message.content)
```
The next example shows how to use the `regex`. The idea is to generate an email address, given a simple regex template:
The next example shows how to use the `regex`. The supported regex syntax depends on the structured output backend. For example, `xgrammar`, `guidance`, and `outlines` use Rust-style regex, while `lm-format-enforcer` uses Python's `re` module. The idea is to generate an email address, given a simple regex template:
??? code

View File

@ -26,3 +26,4 @@ The backends below live **outside** the main `vllm` repository and follow the
| Rebellions ATOM / REBEL NPU | `vllm-rbln` | <https://github.com/rebellions-sw/vllm-rbln> |
| IBM Spyre AIU | `vllm-spyre` | <https://github.com/vllm-project/vllm-spyre> |
| Cambricon MLU | `vllm-mlu` | <https://github.com/Cambricon/vllm-mlu> |
| Baidu Kunlun XPU | N/A, install from source | <https://github.com/baidu/vLLM-Kunlun> |

View File

@ -29,8 +29,27 @@ uv pip install --pre vllm==<version>+cpu --extra-index-url https://wheels.vllm.a
The `uv` approach works for vLLM `v0.6.6` and later. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
!!! note
Nightly wheels are currently unsupported for this architecture. (e.g. to bisect the behavior change, performance regression).
**Install the latest code**
LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides working pre-built Arm CPU wheels for every commit since `v0.11.2` on <https://wheels.vllm.ai/nightly>. For native CPU wheels, this index should be used:
* `https://wheels.vllm.ai/nightly/cpu/vllm`
To install from nightly index, copy the link address of the `*.whl` under this index to run, for example:
```bash
uv pip install -U https://wheels.vllm.ai/c756fb678184b867ed94e5613a529198f1aee423/vllm-0.13.0rc2.dev11%2Bgc756fb678.cpu-cp38-abi3-manylinux_2_31_aarch64.whl # current nightly build (the filename will change!)
```
**Install specific revisions**
If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), specify the full commit hash in the index:
https://wheels.vllm.ai/${VLLM_COMMIT}/cpu/vllm .
Then, copy the link address of the `*.whl` under this index to run:
```bash
uv pip install -U <wheel-url>
```
# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]

View File

@ -0,0 +1,149 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import ast
import logging
from pathlib import Path
from typing import Literal
logger = logging.getLogger("mkdocs")
ROOT_DIR = Path(__file__).parent.parent.parent.parent
DOCS_DIR = ROOT_DIR / "docs"
GENERATED_METRICS_DIR = DOCS_DIR / "generated" / "metrics"
# Files to scan for metric definitions - each will generate a separate table
METRIC_SOURCE_FILES = [
{"path": "vllm/v1/metrics/loggers.py", "output": "general.md"},
{
"path": "vllm/v1/spec_decode/metrics.py",
"output": "spec_decode.md",
},
{
"path": "vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py",
"output": "nixl_connector.md",
},
]
class MetricExtractor(ast.NodeVisitor):
"""AST visitor to extract metric definitions."""
def __init__(self):
self.metrics: list[dict[str, str]] = []
def visit_Call(self, node: ast.Call) -> None:
"""Visit function calls to find metric class instantiations."""
metric_type = self._get_metric_type(node)
if metric_type:
name = self._extract_kwarg(node, "name")
documentation = self._extract_kwarg(node, "documentation")
if name:
self.metrics.append(
{
"name": name,
"type": metric_type,
"documentation": documentation or "",
}
)
self.generic_visit(node)
def _get_metric_type(self, node: ast.Call) -> str | None:
"""Determine if this call creates a metric and return its type."""
metric_type_map = {
"_gauge_cls": "gauge",
"_counter_cls": "counter",
"_histogram_cls": "histogram",
}
if isinstance(node.func, ast.Attribute):
return metric_type_map.get(node.func.attr)
return None
def _extract_kwarg(self, node: ast.Call, key: str) -> str | None:
"""Extract a keyword argument value from a function call."""
for keyword in node.keywords:
if keyword.arg == key:
return self._get_string_value(keyword.value)
return None
def _get_string_value(self, node: ast.AST) -> str | None:
"""Extract string value from an AST node."""
if isinstance(node, ast.Constant):
return str(node.value) if node.value is not None else None
return None
def extract_metrics_from_file(filepath: Path) -> list[dict[str, str]]:
"""Parse a Python file and extract all metric definitions."""
try:
with open(filepath, encoding="utf-8") as f:
source = f.read()
tree = ast.parse(source, filename=str(filepath))
extractor = MetricExtractor()
extractor.visit(tree)
return extractor.metrics
except Exception as e:
raise RuntimeError(f"Failed to parse {filepath}: {e}") from e
def generate_markdown_table(metrics: list[dict[str, str]]) -> str:
"""Generate a markdown table from extracted metrics."""
if not metrics:
return "No metrics found.\n"
# Sort by type, then by name
metrics_sorted = sorted(metrics, key=lambda m: (m["type"], m["name"]))
lines = []
lines.append("| Metric Name | Type | Description |")
lines.append("|-------------|------|-------------|")
for metric in metrics_sorted:
name = metric["name"]
metric_type = metric["type"].capitalize()
doc = metric["documentation"].replace("\n", " ").strip()
lines.append(f"| `{name}` | {metric_type} | {doc} |")
return "\n".join(lines) + "\n"
def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
"""Generate metrics documentation tables from source files."""
logger.info("Generating metrics documentation")
# Create generated directory if it doesn't exist
GENERATED_METRICS_DIR.mkdir(parents=True, exist_ok=True)
total_metrics = 0
for source_config in METRIC_SOURCE_FILES:
source_path = source_config["path"]
output_file = source_config["output"]
filepath = ROOT_DIR / source_path
if not filepath.exists():
raise FileNotFoundError(f"Metrics source file not found: {filepath}")
logger.debug("Extracting metrics from: %s", source_path)
metrics = extract_metrics_from_file(filepath)
logger.debug("Found %d metrics in %s", len(metrics), source_path)
# Generate and write the markdown table for this source
table_content = generate_markdown_table(metrics)
output_path = GENERATED_METRICS_DIR / output_file
with open(output_path, "w", encoding="utf-8") as f:
f.write(table_content)
total_metrics += len(metrics)
logger.info(
"Generated metrics table: %s (%d metrics)",
output_path.relative_to(ROOT_DIR),
len(metrics),
)
logger.info(
"Total metrics generated: %d across %d files",
total_metrics,
len(METRIC_SOURCE_FILES),
)

View File

@ -316,10 +316,13 @@ We have split the `encode` task into two more specific token-wise tasks: `token_
### Remove softmax from PoolingParams
We are going to remove `softmax` and `activation` from `PoolingParams`. Instead, use `use_activation`, since we allow `classify` and `token_classify` to use any activation function.
We are going to remove `softmax` and `activation` from `PoolingParams` in v0.15. Instead, use `use_activation`, since we allow `classify` and `token_classify` to use any activation function.
### as_reward_model
!!! warning
We are going to remove `--convert reward` in v0.15, use `--convert embed` instead.
Pooling models now default support all pooling, you can use it without any settings.
- Extracting hidden states prefers using `token_embed` task.

View File

@ -568,7 +568,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
```
!!! note
Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/pooling/score/qwen3_reranker.py](../../examples/pooling/score/qwen3_reranker.py).
Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/pooling/score/offline_reranker.py](../../examples/pooling/score/offline_reranker.py).
```bash
vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'

View File

@ -24,7 +24,7 @@ There are two distinct modes supported for online deployments - self-contained w
vLLM supports "self-contained" data parallel deployments that expose a single API endpoint.
It can be configured by simply including e.g. `--data-parallel-size=4` in the vllm serve command line arguments. This will require 4 GPUs. It can be combined with tensor parallel, for example `--data-parallel-size=4 --tensor-parallel-size=2`, which would require 8 GPUs.
It can be configured by simply including e.g. `--data-parallel-size=4` in the vllm serve command line arguments. This will require 4 GPUs. It can be combined with tensor parallel, for example `--data-parallel-size=4 --tensor-parallel-size=2`, which would require 8 GPUs. When sizing DP deployments, remember that `--max-num-seqs` applies per DP rank.
Running a single data parallel deployment across multiple nodes requires a different `vllm serve` to be run on each node, specifying which DP ranks should run on that node. In this case, there will still be a single HTTP entrypoint - the API server(s) will run only on one node, but it doesn't necessarily need to be co-located with the DP ranks.
@ -80,6 +80,18 @@ When deploying large DP sizes using this method, the API server process can beco
![DP Internal LB Diagram](../assets/deployment/dp_internal_lb.png)
</figure>
## Hybrid Load Balancing
Hybrid load balancing sits between the internal and external approaches. Each node runs its own API server(s) that only queue requests to the data-parallel engines colocated on that node. An upstream load balancer (for example, an ingress controller or traffic router) spreads user requests across those per-node endpoints.
Enable this mode with `--data-parallel-hybrid-lb` while still launching every node with the global data-parallel size. The key differences from internal load balancing are:
- You must provide `--data-parallel-size-local` and `--data-parallel-start-rank` so each node knows which ranks it owns.
- Not compatible with `--headless` since every node exposes an API endpoint.
- Scale `--api-server-count` per node based on the number of local ranks
In this configuration, each node keeps scheduling decisions local, which reduces cross-node traffic and avoids single node bottlenecks at larger DP sizes.
## External Load Balancing
For larger scale deployments especially, it can make sense to handle the orchestration and load balancing of data parallel ranks externally.

View File

@ -40,10 +40,12 @@ EP_SIZE = TP_SIZE × DP_SIZE
Where:
- `TP_SIZE`: Tensor parallel size (always 1 for now)
- `TP_SIZE`: Tensor parallel size
- `DP_SIZE`: Data parallel size
- `EP_SIZE`: Expert parallel size (computed automatically)
When EP is enabled, MoE layers use expert parallelism instead of tensor parallelism, while attention layers continue to use tensor parallelism if `TP_SIZE > 1`.
### Example Command
The following command serves a `DeepSeek-V3-0324` model with 1-way tensor parallel, 8-way (attention) data parallel, and 8-way expert parallel. The attention weights are replicated across all GPUs, while the expert weights are split across GPUs. It will work on a H200 (or H20) node with 8 GPUs. For H100, you can try to serve a smaller model or refer to the multi-node deployment section.
@ -81,7 +83,7 @@ vllm serve deepseek-ai/DeepSeek-V3-0324 \
--data-parallel-size-local 8 \ # Local DP size on this node (8 GPUs per node)
--data-parallel-address 192.168.1.100 \ # Replace with actual IP of Node 1
--data-parallel-rpc-port 13345 \ # RPC communication port, can be any port as long as reachable by all nodes
--api-server-count=8 # Number of API servers for load handling (scaling this out to total ranks are recommended)
--api-server-count=8 # Number of API servers for load handling (scaling this out to # local ranks is recommended)
# Node 2 (Secondary - headless mode, no API server)
vllm serve deepseek-ai/DeepSeek-V3-0324 \
@ -119,9 +121,6 @@ While MoE models are typically trained so that each expert receives a similar nu
Enable EPLB with the `--enable-eplb` flag.
!!! note "Model Support"
Currently only DeepSeek V3 architecture is supported.
When enabled, vLLM collects load statistics with every forward pass and periodically rebalances expert distribution.
### EPLB Parameters
@ -134,6 +133,8 @@ Configure EPLB with the `--eplb-config` argument, which accepts a JSON string. T
| `step_interval`| Frequency of rebalancing (every N engine steps) | 3000 |
| `log_balancedness` | Log balancedness metrics (avg tokens per expert ÷ max tokens per expert) | `false` |
| `num_redundant_experts` | Additional global experts per EP rank beyond equal distribution | `0` |
| `use_async` | Use non-blocking EPLB for reduced latency overhead | `false` |
| `policy` | The policy type for expert parallel load balancing | `"default"` |
For example:
@ -183,6 +184,26 @@ vllm serve deepseek-ai/DeepSeek-V3-0324 \
For multi-node deployment, add these EPLB flags to each node's command. We recommend setting `--eplb-config '{"num_redundant_experts":32}'` to 32 in large scale use cases so the most popular experts are always available.
## Advanced Configuration
### Performance Optimization
- **DeepEP kernels**: The `high_throughput` and `low_latency` kernels are optimized for disaggregated serving and may show poor performance for mixed workloads
- **Dual Batch Overlap**: Use `--enable-dbo` to overlap all-to-all communication with compute. See [Dual Batch Overlap](../design/dbo.md) for more details.
- **Async scheduling (experimental)**: Try `--async-scheduling` to overlap scheduling with model execution.
### Troubleshooting
- **`non-zero status: 7 cannot register cq buf`**: When using Infiniband/RoCE, make sure host VM and pods show `ulimit -l` "unlimited".
- **`init failed for transport: IBGDA`**: The InfiniBand GDA kernel modules are missing. Run `tools/ep_kernels/configure_system_drivers.sh` on each GPU node and reboot. Also fixes error `NVSHMEM API called before NVSHMEM initialization has completed`.
- **NVSHMEM peer disconnect**: Usually a networking misconfiguration. If deploying via Kubernetes, verify that every pod runs with `hostNetwork: true`, `securityContext.privileged: true` to access Infiniband.
### Benchmarking
- Use simulator flags `VLLM_MOE_ROUTING_SIMULATION_STRATEGY=uniform_random` and `VLLM_RANDOMIZE_DP_DUMMY_INPUTS=1` so token routing is balanced across EP ranks.
- Increasing `VLLM_MOE_DP_CHUNK_SIZE` may increase throughput by increasing the maximum batch size for inter-rank token transfers. This may cause DeepEP to throw `assert self.nvshmem_qp_depth >= (num_max_dispatch_tokens_per_rank + 1) * 2`, which can be fixed by increasing environment variable `NVSHMEM_QP_DEPTH`.
## Disaggregated Serving (Prefill/Decode Split)
For production deployments requiring strict SLA guarantees for time-to-first-token and inter-token latency, disaggregated serving allows independent scaling of prefill and decode operations.
@ -273,3 +294,9 @@ except Exception as e:
print(f"❌ Error during disaggregated serving: {e}")
print("Check that both prefill and decode instances are running and accessible")
```
### Benchmarking
- To simulate the decode deployment of disaggregated serving, pass `--kv-transfer-config '{"kv_connector":"DecodeBenchConnector","kv_role":"kv_both"}'` to the `vllm serve` invocation. The connector populates KV cache with random values so decode can be profiled in isolation.
- **CUDAGraph capture**: Use `--compilation_config '{"cudagraph_mode": "FULL_DECODE_ONLY"}'` to enable CUDA graph capture for decode only and save KV cache.

View File

@ -851,7 +851,7 @@ endpoints are compatible with both [Jina AI's re-rank API interface](https://jin
[Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with
popular open-source tools.
Code example: [examples/pooling/score/jinaai_rerank_client.py](../../examples/pooling/score/jinaai_rerank_client.py)
Code example: [examples/pooling/score/openai_reranker.py](../../examples/pooling/score/openai_reranker.py)
#### Example Request

View File

@ -33,11 +33,19 @@ Then query the endpoint to get the latest metrics from the server:
The following metrics are exposed:
??? code
## General Metrics
```python
--8<-- "vllm/engine/metrics.py:metrics-definitions"
```
--8<-- "docs/generated/metrics/general.md"
## Speculative Decoding Metrics
--8<-- "docs/generated/metrics/spec_decode.md"
## NIXL KV Connector Metrics
--8<-- "docs/generated/metrics/nixl_connector.md"
## Deprecation Policy
Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,

View File

@ -422,7 +422,7 @@ def parse_args():
parser.add_argument(
"--seed",
type=int,
default=None,
default=0,
help="Set the seed when initializing `vllm.LLM`.",
)
parser.add_argument(

View File

@ -30,7 +30,7 @@ def main():
max_num_batched_tokens=64,
max_num_seqs=16,
kv_transfer_config=KVTransferConfig(
kv_connector="SharedStorageConnector",
kv_connector="ExampleConnector",
kv_role="kv_both",
kv_connector_extra_config={"shared_storage_path": "local_storage"},
),

View File

@ -26,7 +26,7 @@ def main():
enforce_eager=True,
gpu_memory_utilization=0.8,
kv_transfer_config=KVTransferConfig(
kv_connector="SharedStorageConnector",
kv_connector="ExampleConnector",
kv_role="kv_both",
kv_connector_extra_config={"shared_storage_path": "local_storage"},
),

View File

@ -77,7 +77,7 @@ def parse_args():
parser.add_argument(
"--seed",
type=int,
default=None,
default=0,
help="Set the seed when initializing `vllm.LLM`.",
)
return parser.parse_args()

View File

@ -10,7 +10,7 @@ It demonstrates vLLM's ability to recover from KV load failures in both synchron
- `decode_example.py` performs the decode stage. Accepts:
- `--simulate-failure`: simulates KV load failure using a custom connector.
- `--async-load`: enables asynchronous KV loading mode.
- `rogue_shared_storage_connector.py` defines `RogueSharedStorageConnector`, a subclass of `SharedStorageConnector`, that simulates missing or corrupted external KV blocks by failing to load blocks for the first decode request.
- `load_recovery_example_connector.py` defines `LoadRecoveryExampleConnector`, a subclass of `ExampleConnector`, that simulates missing or corrupted external KV blocks by failing to load blocks for the first decode request.
- `run.sh` orchestrates the test: runs the prefill stage, then three decode stages:
1. Normal decode (baseline).
2. Decode with simulated sync KV load failure.
@ -20,7 +20,7 @@ It demonstrates vLLM's ability to recover from KV load failures in both synchron
## How It Works
- The test dynamically loads `RogueSharedStorageConnector` via `KVTransferConfig.kv_connector_module_path`, enabling controlled simulation of load failures without modifying the original connector.
- The test dynamically loads `LoadRecoveryExampleConnector` via `KVTransferConfig.kv_connector_module_path`, enabling controlled simulation of load failures without modifying the original connector.
- The decode stages that simulate failure are expected to trigger recovery logic in vLLM, resulting in the same output as the baseline decode.
- If recovery fails, the script prints a unified diff of the output mismatch and exits with error.

View File

@ -35,13 +35,13 @@ def main():
if args.simulate_failure:
ktc = KVTransferConfig(
kv_connector="RogueSharedStorageConnector",
kv_connector="LoadRecoveryExampleConnector",
kv_role="kv_both",
kv_connector_extra_config={
"shared_storage_path": "local_storage",
"async_load": args.async_load,
},
kv_connector_module_path="rogue_shared_storage_connector",
kv_connector_module_path="load_recovery_example_connector",
)
out_file = (
"async_decode_recovered_output.txt"
@ -50,7 +50,7 @@ def main():
)
else:
ktc = KVTransferConfig(
kv_connector="SharedStorageConnector",
kv_connector="ExampleConnector",
kv_role="kv_both",
kv_connector_extra_config={
"shared_storage_path": "local_storage",

View File

@ -10,9 +10,9 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
KVConnectorMetadata,
KVConnectorRole,
)
from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import (
SharedStorageConnector,
SharedStorageConnectorMetadata,
from vllm.distributed.kv_transfer.kv_connector.v1.example_connector import (
ExampleConnector,
ExampleConnectorMetadata,
)
from vllm.forward_context import ForwardContext
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
@ -26,15 +26,15 @@ logging.basicConfig(level=logging.INFO)
@dataclass
class RogueSharedStorageConnectorMetadata(SharedStorageConnectorMetadata):
class LoadRecoveryExampleConnectorMetadata(ExampleConnectorMetadata):
req_to_block_ids: dict[str, set[int]] = field(default_factory=dict)
@classmethod
def from_base(cls, base: SharedStorageConnectorMetadata):
def from_base(cls, base: ExampleConnectorMetadata):
return cls(requests=base.requests)
class RogueSharedStorageConnector(SharedStorageConnector):
class LoadRecoveryExampleConnector(ExampleConnector):
def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
super().__init__(vllm_config=vllm_config, role=role)
self._async_load = vllm_config.kv_transfer_config.get_from_extra_config(
@ -45,7 +45,7 @@ class RogueSharedStorageConnector(SharedStorageConnector):
self._req_to_block_ids: dict[str, list[int]] = dict()
def bind_connector_metadata(self, connector_metadata: KVConnectorMetadata) -> None:
assert isinstance(connector_metadata, RogueSharedStorageConnectorMetadata)
assert isinstance(connector_metadata, LoadRecoveryExampleConnectorMetadata)
index, failed_request = next(
(
(i, x)
@ -84,7 +84,7 @@ class RogueSharedStorageConnector(SharedStorageConnector):
) -> tuple[set[str] | None, set[str] | None]:
if self._async_load:
meta = self._get_connector_metadata()
assert isinstance(meta, RogueSharedStorageConnectorMetadata)
assert isinstance(meta, LoadRecoveryExampleConnectorMetadata)
if meta.req_to_block_ids:
return None, set(meta.req_to_block_ids)
@ -126,9 +126,9 @@ class RogueSharedStorageConnector(SharedStorageConnector):
) -> KVConnectorMetadata:
if not self._async_load:
base = super().build_connector_meta(scheduler_output)
meta = RogueSharedStorageConnectorMetadata.from_base(base)
meta = LoadRecoveryExampleConnectorMetadata.from_base(base)
else:
meta = RogueSharedStorageConnectorMetadata()
meta = LoadRecoveryExampleConnectorMetadata()
if self._requests_need_load:
for req_id, request in self._requests_need_load.items():
meta.add_request(

View File

@ -26,7 +26,7 @@ def main():
enforce_eager=True,
gpu_memory_utilization=0.8,
kv_transfer_config=KVTransferConfig(
kv_connector="SharedStorageConnector",
kv_connector="ExampleConnector",
kv_role="kv_both",
kv_connector_extra_config={"shared_storage_path": "local_storage"},
),

View File

@ -158,7 +158,7 @@ def parse_args():
parser.add_argument(
"--seed",
type=int,
default=None,
default=0,
help="Set the seed when initializing `vllm.LLM`.",
)

View File

@ -158,7 +158,7 @@ def parse_args():
parser.add_argument(
"--seed",
type=int,
default=None,
default=0,
help="Set the seed when initializing `vllm.LLM`.",
)

Some files were not shown because too many files have changed in this diff Show More