mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-25 04:07:56 +08:00
Merge branch 'main' into tpopp/fix_aiter_triton_rope
This commit is contained in:
commit
a8afa2d5ce
24
.buildkite/ci_config.yaml
Normal file
24
.buildkite/ci_config.yaml
Normal file
@ -0,0 +1,24 @@
|
||||
name: vllm_ci
|
||||
job_dirs:
|
||||
- ".buildkite/test_areas"
|
||||
- ".buildkite/image_build"
|
||||
run_all_patterns:
|
||||
- "docker/Dockerfile"
|
||||
- "CMakeLists.txt"
|
||||
- "requirements/common.txt"
|
||||
- "requirements/cuda.txt"
|
||||
- "requirements/build.txt"
|
||||
- "requirements/test.txt"
|
||||
- "setup.py"
|
||||
- "csrc/"
|
||||
- "cmake/"
|
||||
run_all_exclude_patterns:
|
||||
- "docker/Dockerfile."
|
||||
- "csrc/cpu/"
|
||||
- "csrc/rocm/"
|
||||
- "cmake/hipify.py"
|
||||
- "cmake/cpu_extension.cmake"
|
||||
registries: public.ecr.aws/q9t5s3a7
|
||||
repositories:
|
||||
main: "vllm-ci-postmerge-repo"
|
||||
premerge: "vllm-ci-test-repo"
|
||||
@ -1,46 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import os
|
||||
|
||||
template = """<!DOCTYPE html>
|
||||
<html>
|
||||
<body>
|
||||
<h1>Links for vLLM</h1/>
|
||||
<a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
|
||||
<a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--wheel", help="The wheel path.", required=True)
|
||||
args = parser.parse_args()
|
||||
|
||||
filename = os.path.basename(args.wheel)
|
||||
|
||||
with open("index.html", "w") as f:
|
||||
print(f"Generated index.html for {args.wheel}")
|
||||
# sync the abi tag with .buildkite/scripts/upload-wheels.sh
|
||||
if "x86_64" in filename:
|
||||
x86_wheel = filename
|
||||
arm_wheel = filename.replace("x86_64", "aarch64").replace(
|
||||
"manylinux1", "manylinux2014"
|
||||
)
|
||||
elif "aarch64" in filename:
|
||||
x86_wheel = filename.replace("aarch64", "x86_64").replace(
|
||||
"manylinux2014", "manylinux1"
|
||||
)
|
||||
arm_wheel = filename
|
||||
else:
|
||||
raise ValueError(f"Unsupported wheel: {filename}")
|
||||
# cloudfront requires escaping the '+' character
|
||||
f.write(
|
||||
template.format(
|
||||
x86_wheel=x86_wheel,
|
||||
x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
|
||||
arm_wheel=arm_wheel,
|
||||
arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
|
||||
)
|
||||
)
|
||||
56
.buildkite/image_build/image_build.sh
Executable file
56
.buildkite/image_build/image_build.sh
Executable file
@ -0,0 +1,56 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
if [[ $# -lt 8 ]]; then
|
||||
echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
REGISTRY=$1
|
||||
REPO=$2
|
||||
BUILDKITE_COMMIT=$3
|
||||
BRANCH=$4
|
||||
VLLM_USE_PRECOMPILED=$5
|
||||
VLLM_MERGE_BASE_COMMIT=$6
|
||||
CACHE_FROM=$7
|
||||
CACHE_TO=$8
|
||||
|
||||
# authenticate with AWS ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
||||
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
|
||||
|
||||
# docker buildx
|
||||
docker buildx create --name vllm-builder --driver docker-container --use
|
||||
docker buildx inspect --bootstrap
|
||||
docker buildx ls
|
||||
|
||||
# skip build if image already exists
|
||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
|
||||
echo "Image not found, proceeding with build..."
|
||||
else
|
||||
echo "Image found"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
|
||||
merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
|
||||
else
|
||||
merge_base_commit_build_args=""
|
||||
fi
|
||||
|
||||
# build
|
||||
docker buildx build --file docker/Dockerfile \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
|
||||
--build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
|
||||
--build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
|
||||
${merge_base_commit_build_args} \
|
||||
--cache-from type=registry,ref=${CACHE_FROM},mode=max \
|
||||
--cache-to type=registry,ref=${CACHE_TO},mode=max \
|
||||
--tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
|
||||
$( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
|
||||
--push \
|
||||
--target test \
|
||||
--progress plain .
|
||||
57
.buildkite/image_build/image_build.yaml
Normal file
57
.buildkite/image_build/image_build.yaml
Normal file
@ -0,0 +1,57 @@
|
||||
group: Abuild
|
||||
steps:
|
||||
- label: ":docker: Build image"
|
||||
key: image-build
|
||||
depends_on: []
|
||||
commands:
|
||||
- .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
|
||||
retry:
|
||||
automatic:
|
||||
- exit_status: -1 # Agent was lost
|
||||
limit: 2
|
||||
- exit_status: -10 # Agent was lost
|
||||
limit: 2
|
||||
|
||||
- label: ":docker: Build CPU image"
|
||||
key: image-build-cpu
|
||||
depends_on: []
|
||||
commands:
|
||||
- .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
retry:
|
||||
automatic:
|
||||
- exit_status: -1 # Agent was lost
|
||||
limit: 2
|
||||
- exit_status: -10 # Agent was lost
|
||||
limit: 2
|
||||
|
||||
- label: ":docker: Build HPU image"
|
||||
soft_fail: true
|
||||
depends_on: []
|
||||
key: image-build-hpu
|
||||
commands:
|
||||
- .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
retry:
|
||||
automatic:
|
||||
- exit_status: -1 # Agent was lost
|
||||
limit: 2
|
||||
- exit_status: -10 # Agent was lost
|
||||
limit: 2
|
||||
|
||||
- label: ":docker: Build CPU arm64 image"
|
||||
key: cpu-arm64-image-build
|
||||
depends_on: []
|
||||
optional: true
|
||||
commands:
|
||||
- .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
retry:
|
||||
automatic:
|
||||
- exit_status: -1 # Agent was lost
|
||||
limit: 2
|
||||
- exit_status: -10 # Agent was lost
|
||||
limit: 2
|
||||
36
.buildkite/image_build/image_build_cpu.sh
Executable file
36
.buildkite/image_build/image_build_cpu.sh
Executable file
@ -0,0 +1,36 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
if [[ $# -lt 3 ]]; then
|
||||
echo "Usage: $0 <registry> <repo> <commit>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
REGISTRY=$1
|
||||
REPO=$2
|
||||
BUILDKITE_COMMIT=$3
|
||||
|
||||
# authenticate with AWS ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
||||
|
||||
# skip build if image already exists
|
||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
|
||||
echo "Image not found, proceeding with build..."
|
||||
else
|
||||
echo "Image found"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# build
|
||||
docker build --file docker/Dockerfile.cpu \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
||||
--build-arg VLLM_CPU_AVX512BF16=true \
|
||||
--build-arg VLLM_CPU_AVX512VNNI=true \
|
||||
--build-arg VLLM_CPU_AMXBF16=true \
|
||||
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
|
||||
--target vllm-test \
|
||||
--progress plain .
|
||||
|
||||
# push
|
||||
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
|
||||
33
.buildkite/image_build/image_build_cpu_arm64.sh
Executable file
33
.buildkite/image_build/image_build_cpu_arm64.sh
Executable file
@ -0,0 +1,33 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
if [[ $# -lt 3 ]]; then
|
||||
echo "Usage: $0 <registry> <repo> <commit>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
REGISTRY=$1
|
||||
REPO=$2
|
||||
BUILDKITE_COMMIT=$3
|
||||
|
||||
# authenticate with AWS ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
||||
|
||||
# skip build if image already exists
|
||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
|
||||
echo "Image not found, proceeding with build..."
|
||||
else
|
||||
echo "Image found"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# build
|
||||
docker build --file docker/Dockerfile.cpu \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
||||
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
|
||||
--target vllm-test \
|
||||
--progress plain .
|
||||
|
||||
# push
|
||||
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
|
||||
34
.buildkite/image_build/image_build_hpu.sh
Executable file
34
.buildkite/image_build/image_build_hpu.sh
Executable file
@ -0,0 +1,34 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
if [[ $# -lt 3 ]]; then
|
||||
echo "Usage: $0 <registry> <repo> <commit>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
REGISTRY=$1
|
||||
REPO=$2
|
||||
BUILDKITE_COMMIT=$3
|
||||
|
||||
# authenticate with AWS ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
||||
|
||||
# skip build if image already exists
|
||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
|
||||
echo "Image not found, proceeding with build..."
|
||||
else
|
||||
echo "Image found"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# build
|
||||
docker build \
|
||||
--file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
||||
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
|
||||
--progress plain \
|
||||
https://github.com/vllm-project/vllm-gaudi.git
|
||||
|
||||
# push
|
||||
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu
|
||||
@ -8,3 +8,4 @@ tasks:
|
||||
value: 0.80
|
||||
limit: 250 # will run on 250 * 14 subjects = 3500 samples
|
||||
num_fewshot: 5
|
||||
rtol: 0.05
|
||||
|
||||
1
.buildkite/lm-eval-harness/configs/models-large-rocm.txt
Normal file
1
.buildkite/lm-eval-harness/configs/models-large-rocm.txt
Normal file
@ -0,0 +1 @@
|
||||
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
|
||||
@ -9,11 +9,40 @@ pytest -s -v test_lm_eval_correctness.py \
|
||||
--tp-size=1
|
||||
"""
|
||||
|
||||
import os
|
||||
from contextlib import contextmanager
|
||||
|
||||
import lm_eval
|
||||
import numpy as np
|
||||
import yaml
|
||||
|
||||
RTOL = 0.08
|
||||
DEFAULT_RTOL = 0.08
|
||||
|
||||
|
||||
@contextmanager
|
||||
def scoped_env_vars(new_env: dict[str, str]):
|
||||
if not new_env:
|
||||
# Fast path: nothing to do
|
||||
yield
|
||||
return
|
||||
|
||||
old_values = {}
|
||||
new_keys = []
|
||||
|
||||
try:
|
||||
for key, value in new_env.items():
|
||||
if key in os.environ:
|
||||
old_values[key] = os.environ[key]
|
||||
else:
|
||||
new_keys.append(key)
|
||||
os.environ[key] = str(value)
|
||||
yield
|
||||
finally:
|
||||
# Restore / clean up
|
||||
for key, value in old_values.items():
|
||||
os.environ[key] = value
|
||||
for key in new_keys:
|
||||
os.environ.pop(key, None)
|
||||
|
||||
|
||||
def launch_lm_eval(eval_config, tp_size):
|
||||
@ -32,23 +61,26 @@ def launch_lm_eval(eval_config, tp_size):
|
||||
f"trust_remote_code={trust_remote_code},"
|
||||
f"max_model_len={max_model_len},"
|
||||
)
|
||||
results = lm_eval.simple_evaluate(
|
||||
model=backend,
|
||||
model_args=model_args,
|
||||
tasks=[task["name"] for task in eval_config["tasks"]],
|
||||
num_fewshot=eval_config["num_fewshot"],
|
||||
limit=eval_config["limit"],
|
||||
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
|
||||
# text models. however, this is regressing measured strict-match for
|
||||
# existing text models in CI, so only apply it for mm, or explicitly set
|
||||
apply_chat_template=eval_config.get(
|
||||
"apply_chat_template", backend == "vllm-vlm"
|
||||
),
|
||||
fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
|
||||
# Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
|
||||
gen_kwargs=eval_config.get("gen_kwargs"),
|
||||
batch_size=batch_size,
|
||||
)
|
||||
|
||||
env_vars = eval_config.get("env_vars", None)
|
||||
with scoped_env_vars(env_vars):
|
||||
results = lm_eval.simple_evaluate(
|
||||
model=backend,
|
||||
model_args=model_args,
|
||||
tasks=[task["name"] for task in eval_config["tasks"]],
|
||||
num_fewshot=eval_config["num_fewshot"],
|
||||
limit=eval_config["limit"],
|
||||
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
|
||||
# text models. however, this is regressing measured strict-match for
|
||||
# existing text models in CI, so only apply it for mm, or explicitly set
|
||||
apply_chat_template=eval_config.get(
|
||||
"apply_chat_template", backend == "vllm-vlm"
|
||||
),
|
||||
fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
|
||||
# Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
|
||||
gen_kwargs=eval_config.get("gen_kwargs"),
|
||||
batch_size=batch_size,
|
||||
)
|
||||
return results
|
||||
|
||||
|
||||
@ -57,6 +89,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
|
||||
|
||||
results = launch_lm_eval(eval_config, tp_size)
|
||||
|
||||
rtol = eval_config.get("rtol", DEFAULT_RTOL)
|
||||
|
||||
success = True
|
||||
for task in eval_config["tasks"]:
|
||||
for metric in task["metrics"]:
|
||||
@ -64,8 +98,9 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
|
||||
measured_value = results["results"][task["name"]][metric["name"]]
|
||||
print(
|
||||
f"{task['name']} | {metric['name']}: "
|
||||
f"ground_truth={ground_truth} | measured={measured_value}"
|
||||
f"ground_truth={ground_truth:.3f} | "
|
||||
f"measured={measured_value:.3f} | rtol={rtol}"
|
||||
)
|
||||
success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
|
||||
success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
|
||||
|
||||
assert success
|
||||
|
||||
@ -15,6 +15,21 @@ steps:
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build arm64 wheel - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cuda-13-0
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
# NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# aarch64 build
|
||||
- label: "Build arm64 CPU wheel"
|
||||
depends_on: ~
|
||||
@ -25,7 +40,7 @@ steps:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
@ -39,7 +54,7 @@ steps:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
@ -52,7 +67,21 @@ steps:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# x86 CPU wheel build
|
||||
- label: "Build x86 CPU wheel"
|
||||
depends_on: ~
|
||||
id: build-wheel-x86-cpu
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
|
||||
@ -7,18 +7,21 @@
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from dataclasses import asdict, dataclass
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from urllib.parse import quote
|
||||
|
||||
import regex as re
|
||||
|
||||
if not sys.version_info >= (3, 12):
|
||||
raise RuntimeError("This script requires Python 3.12 or higher.")
|
||||
|
||||
INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
|
||||
<html>
|
||||
<!-- {comment} -->
|
||||
<meta name="pypi:repository-version" content="1.0">
|
||||
<body>
|
||||
{items}
|
||||
@ -89,7 +92,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:
|
||||
)
|
||||
|
||||
|
||||
def generate_project_list(subdir_names: list[str]) -> str:
|
||||
def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
|
||||
"""
|
||||
Generate project list HTML content linking to each project & variant sub-directory.
|
||||
"""
|
||||
@ -97,11 +100,14 @@ def generate_project_list(subdir_names: list[str]) -> str:
|
||||
for name in sorted(subdir_names):
|
||||
name = name.strip("/").strip(".")
|
||||
href_tags.append(f' <a href="{name}/">{name}/</a><br/>')
|
||||
return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
|
||||
return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
|
||||
|
||||
|
||||
def generate_package_index_and_metadata(
|
||||
wheel_files: list[WheelFileInfo], wheel_base_dir: Path, index_base_dir: Path
|
||||
wheel_files: list[WheelFileInfo],
|
||||
wheel_base_dir: Path,
|
||||
index_base_dir: Path,
|
||||
comment: str = "",
|
||||
) -> tuple[str, str]:
|
||||
"""
|
||||
Generate package index HTML content for a specific package, linking to actual wheel files.
|
||||
@ -119,7 +125,7 @@ def generate_package_index_and_metadata(
|
||||
file_meta = asdict(file)
|
||||
file_meta["path"] = file_path_quoted
|
||||
metadata.append(file_meta)
|
||||
index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
|
||||
index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
|
||||
metadata_str = json.dumps(metadata, indent=2)
|
||||
return index_str, metadata_str
|
||||
|
||||
@ -130,6 +136,7 @@ def generate_index_and_metadata(
|
||||
index_base_dir: Path,
|
||||
default_variant: str | None = None,
|
||||
alias_to_default: str | None = None,
|
||||
comment: str = "",
|
||||
):
|
||||
"""
|
||||
Generate index for all wheel files.
|
||||
@ -140,6 +147,7 @@ def generate_index_and_metadata(
|
||||
index_base_dir (Path): Base directory to store index files.
|
||||
default_variant (str | None): The default variant name, if any.
|
||||
alias_to_default (str | None): Alias variant name for the default variant, if any.
|
||||
comment (str): Optional comment to include in the generated HTML files (empty string for none).
|
||||
|
||||
First, parse all wheel files to extract metadata.
|
||||
We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
|
||||
@ -233,6 +241,10 @@ def generate_index_and_metadata(
|
||||
variant_to_files[alias_to_default] = variant_to_files["default"].copy()
|
||||
print(f"Alias variant '{alias_to_default}' created for default variant.")
|
||||
|
||||
# Generate comment in HTML header
|
||||
comment_str = f" ({comment})" if comment else ""
|
||||
comment_tmpl = f"Generated on {datetime.now().isoformat()}{comment_str}"
|
||||
|
||||
# Generate index for each variant
|
||||
subdir_names = set()
|
||||
for variant, files in variant_to_files.items():
|
||||
@ -252,7 +264,7 @@ def generate_index_and_metadata(
|
||||
subdir_names = subdir_names.union(packages)
|
||||
else:
|
||||
# generate project list for this variant directly
|
||||
project_list_str = generate_project_list(sorted(packages))
|
||||
project_list_str = generate_project_list(sorted(packages), comment_tmpl)
|
||||
with open(variant_dir / "index.html", "w") as f:
|
||||
f.write(project_list_str)
|
||||
|
||||
@ -262,7 +274,7 @@ def generate_index_and_metadata(
|
||||
package_dir = variant_dir / package
|
||||
package_dir.mkdir(parents=True, exist_ok=True)
|
||||
index_str, metadata_str = generate_package_index_and_metadata(
|
||||
package_files, wheel_base_dir, package_dir
|
||||
package_files, wheel_base_dir, package_dir, comment
|
||||
)
|
||||
with open(package_dir / "index.html", "w") as f:
|
||||
f.write(index_str)
|
||||
@ -270,7 +282,7 @@ def generate_index_and_metadata(
|
||||
f.write(metadata_str)
|
||||
|
||||
# Generate top-level project list index
|
||||
project_list_str = generate_project_list(sorted(subdir_names))
|
||||
project_list_str = generate_project_list(sorted(subdir_names), comment_tmpl)
|
||||
with open(index_base_dir / "index.html", "w") as f:
|
||||
f.write(project_list_str)
|
||||
|
||||
@ -282,6 +294,7 @@ if __name__ == "__main__":
|
||||
--current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
|
||||
--output-dir <output_directory> : directory to store generated index files
|
||||
--alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
|
||||
--comment <comment_string> : (optional) comment string to include in generated HTML files
|
||||
"""
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
@ -311,6 +324,12 @@ if __name__ == "__main__":
|
||||
default=None,
|
||||
help="Alias variant name for the default variant",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--comment",
|
||||
type=str,
|
||||
default="",
|
||||
help="Optional comment string to include in generated HTML files",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@ -353,6 +372,17 @@ if __name__ == "__main__":
|
||||
|
||||
print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
|
||||
|
||||
# keep only "official" files for a non-nightly version (specified by cli args)
|
||||
PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
|
||||
if PY_VERSION_RE.match(version):
|
||||
# upload-wheels.sh ensures no "dev" is in args.version
|
||||
wheel_files = list(
|
||||
filter(lambda x: version in x and "dev" not in x, wheel_files)
|
||||
)
|
||||
print(f"Non-nightly version detected, wheel files used: {wheel_files}")
|
||||
else:
|
||||
print("Nightly version detected, keeping all wheel files.")
|
||||
|
||||
# Generate index and metadata, assuming wheels and indices are stored as:
|
||||
# s3://vllm-wheels/{version}/<wheel files>
|
||||
# s3://vllm-wheels/<anything>/<index files>
|
||||
@ -365,5 +395,6 @@ if __name__ == "__main__":
|
||||
index_base_dir=index_base_dir,
|
||||
default_variant=None,
|
||||
alias_to_default=args.alias_to_default,
|
||||
comment=args.comment.strip(),
|
||||
)
|
||||
print(f"Successfully generated index and metadata in {output_dir}")
|
||||
|
||||
@ -36,11 +36,17 @@ function cpu_tests() {
|
||||
set -e
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
||||
|
||||
# Run model tests
|
||||
docker exec cpu-test bash -c "
|
||||
set -e
|
||||
pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model"
|
||||
|
||||
# Run kernel tests
|
||||
docker exec cpu-test bash -c "
|
||||
set -e
|
||||
pytest -x -v -s tests/kernels/test_onednn.py
|
||||
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"
|
||||
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
|
||||
pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic"
|
||||
|
||||
# basic online serving
|
||||
docker exec cpu-test bash -c '
|
||||
|
||||
@ -74,6 +74,7 @@ FROM ${BASE_IMAGE_NAME}
|
||||
|
||||
# Define environments
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV SOC_VERSION="ascend910b1"
|
||||
|
||||
RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
|
||||
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
|
||||
|
||||
@ -38,6 +38,7 @@ docker run \
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
||||
python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
|
||||
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
||||
cd tests
|
||||
pytest -v -s v1/core
|
||||
@ -46,6 +47,6 @@ docker run \
|
||||
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
||||
pytest -v -s v1/structured_output
|
||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
|
||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
|
||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
|
||||
pytest -v -s v1/test_serial_utils.py
|
||||
'
|
||||
|
||||
@ -12,6 +12,11 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
|
||||
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
|
||||
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
|
||||
|
||||
if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
|
||||
echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "Setting up Prime-RL integration test environment..."
|
||||
|
||||
# Clean up any existing Prime-RL directory
|
||||
|
||||
@ -1,73 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euxo pipefail
|
||||
|
||||
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
|
||||
THRESHOLD=${1:-0.25}
|
||||
NUM_Q=${2:-1319}
|
||||
PORT=${3:-8030}
|
||||
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
|
||||
mkdir -p "${OUT_DIR}"
|
||||
|
||||
wait_for_server() {
|
||||
local port=$1
|
||||
timeout 600 bash -c '
|
||||
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
|
||||
sleep 1
|
||||
done'
|
||||
}
|
||||
|
||||
MODEL="deepseek-ai/DeepSeek-V2-lite"
|
||||
|
||||
# Set BACKENDS based on platform
|
||||
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
|
||||
# ROCm platform
|
||||
BACKENDS=("allgather_reducescatter")
|
||||
# Disable MOE padding for ROCm since it is causing eplb to fail
|
||||
export VLLM_ROCM_MOE_PADDING=0
|
||||
else
|
||||
# Non-ROCm platform (CUDA/other)
|
||||
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
|
||||
fi
|
||||
|
||||
cleanup() {
|
||||
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
|
||||
kill "${SERVER_PID}" 2>/dev/null || true
|
||||
for _ in {1..20}; do
|
||||
kill -0 "${SERVER_PID}" 2>/dev/null || break
|
||||
sleep 0.5
|
||||
done
|
||||
kill -9 "${SERVER_PID}" 2>/dev/null || true
|
||||
fi
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
for BACK in "${BACKENDS[@]}"; do
|
||||
VLLM_DEEP_GEMM_WARMUP=skip \
|
||||
VLLM_ALL2ALL_BACKEND=$BACK \
|
||||
vllm serve "$MODEL" \
|
||||
--enforce-eager \
|
||||
--tensor-parallel-size 2 \
|
||||
--data-parallel-size 2 \
|
||||
--enable-expert-parallel \
|
||||
--enable-eplb \
|
||||
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
|
||||
--trust-remote-code \
|
||||
--max-model-len 2048 \
|
||||
--port $PORT &
|
||||
SERVER_PID=$!
|
||||
wait_for_server $PORT
|
||||
|
||||
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
||||
OUT="${OUT_DIR}/${TAG}_${BACK}_async_eplb.json"
|
||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
|
||||
python3 - <<PY
|
||||
import json; acc=json.load(open('${OUT}'))['accuracy']
|
||||
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
|
||||
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
|
||||
PY
|
||||
|
||||
cleanup
|
||||
SERVER_PID=
|
||||
sleep 1
|
||||
PORT=$((PORT+1))
|
||||
done
|
||||
@ -50,7 +50,6 @@ for BACK in "${BACKENDS[@]}"; do
|
||||
--data-parallel-size 2 \
|
||||
--enable-expert-parallel \
|
||||
--enable-eplb \
|
||||
--eplb-config '{"window_size":200,"step_interval":600}' \
|
||||
--trust-remote-code \
|
||||
--max-model-len 2048 \
|
||||
--port $PORT &
|
||||
|
||||
@ -34,9 +34,10 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
|
||||
fi
|
||||
wheel="${wheel_files[0]}"
|
||||
|
||||
# current build image uses ubuntu 20.04, which corresponds to manylinux_2_31
|
||||
# default build image uses ubuntu 20.04, which corresponds to manylinux_2_31
|
||||
# we also accept params as manylinux tag
|
||||
# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
|
||||
manylinux_version="manylinux_2_31"
|
||||
manylinux_version="${1:-manylinux_2_31}"
|
||||
|
||||
# Rename 'linux' to the appropriate manylinux version in the wheel filename
|
||||
if [[ "$wheel" != *"linux"* ]]; then
|
||||
@ -81,7 +82,10 @@ else
|
||||
alias_arg=""
|
||||
fi
|
||||
|
||||
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg
|
||||
# HACK: we do not need regex module here, but it is required by pre-commit hook
|
||||
# To avoid any external dependency, we simply replace it back to the stdlib re module
|
||||
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
|
||||
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
|
||||
|
||||
# copy indices to /<commit>/ unconditionally
|
||||
echo "Uploading indices to $S3_COMMIT_PREFIX"
|
||||
@ -93,8 +97,11 @@ if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]];
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
|
||||
fi
|
||||
|
||||
# copy to /<pure_version>/ only if it does not have "dev" in the version
|
||||
# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
|
||||
if [[ "$version" != *"dev"* ]]; then
|
||||
echo "Uploading indices to overwrite /$pure_version/"
|
||||
echo "Re-generating indices for /$pure_version/"
|
||||
# Clear the previously generated indices before re-generating them.
# The glob must sit outside the quotes to expand; inside quotes rm only looks
# for a file literally named '*', so stale indices would be uploaded again.
# ':?' aborts if the variable is unset or empty instead of running 'rm -rf /*'.
rm -rf "${INDICES_OUTPUT_DIR:?}"/*
|
||||
mkdir -p "$INDICES_OUTPUT_DIR"
|
||||
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
|
||||
fi
|
||||
|
||||
@ -61,8 +61,8 @@ steps:
|
||||
- pytest -v -s -m 'not cpu_test' multimodal
|
||||
- pytest -v -s utils_
|
||||
|
||||
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
|
||||
timeout_in_minutes: 20
|
||||
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
|
||||
timeout_in_minutes: 30
|
||||
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
|
||||
agent_pool: mi325_1
|
||||
grade: Blocking
|
||||
@ -73,6 +73,7 @@ steps:
|
||||
- tests/multimodal
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
- tests/tokenizers_
|
||||
- tests/tool_parsers
|
||||
- tests/transformers_utils
|
||||
- tests/config
|
||||
no_gpu: true
|
||||
@ -82,6 +83,7 @@ steps:
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s -m 'cpu_test' multimodal
|
||||
- pytest -v -s tokenizers_
|
||||
- pytest -v -s tool_parsers
|
||||
- pytest -v -s transformers_utils
|
||||
- pytest -v -s config
|
||||
|
||||
@ -326,10 +328,10 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
|
||||
|
||||
- label: V1 Test e2e + engine # 30min
|
||||
timeout_in_minutes: 45
|
||||
- label: V1 Test e2e + engine # 65min
|
||||
timeout_in_minutes: 90
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_1
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@ -398,7 +400,8 @@ steps:
|
||||
timeout_in_minutes: 25
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- vllm/v1/attention
|
||||
- vllm/model_executor/layers
|
||||
- tests/v1/determinism/
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
@ -434,29 +437,34 @@ steps:
|
||||
|
||||
- label: Examples Test # 30min
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/examples"
|
||||
source_file_dependencies:
|
||||
- vllm/entrypoints
|
||||
- vllm/multimodal
|
||||
- examples/
|
||||
commands:
|
||||
- pip install tensorizer # for tensorizer test
|
||||
# for basic
|
||||
- python3 offline_inference/basic/chat.py
|
||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||
- python3 offline_inference/basic/chat.py
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
- python3 offline_inference/vision_language_pooling.py --seed 0
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
- python3 offline_inference/basic/classify.py
|
||||
- python3 offline_inference/basic/embed.py
|
||||
- python3 offline_inference/basic/score.py
|
||||
# for multi-modal models
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
# for pooling models
|
||||
- python3 pooling/pooling/vision_language_pooling.py --seed 0
|
||||
# for features demo
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
||||
@ -718,14 +726,15 @@ steps:
|
||||
- uv pip install --system conch-triton-kernels
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
||||
|
||||
- label: LM Eval Small Models # 15min
|
||||
timeout_in_minutes: 20
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
- label: LM Eval Small Models # 53min
|
||||
timeout_in_minutes: 75
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
autorun_on_main: true
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
||||
|
||||
@ -738,7 +747,7 @@ steps:
|
||||
- csrc/
|
||||
- vllm/entrypoints/openai/
|
||||
- vllm/model_executor/models/whisper.py
|
||||
commands: # LMEval
|
||||
commands: # LMEval+Transcription WER check
|
||||
# Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
|
||||
- pytest -s entrypoints/openai/correctness/
|
||||
|
||||
@ -752,19 +761,7 @@ steps:
|
||||
- vllm/
|
||||
- tests/tool_use
|
||||
commands:
|
||||
- pytest -v -s -m 'not cpu_test' tool_use
|
||||
|
||||
- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
timeout_in_minutes: 10
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/tool_use
|
||||
no_gpu: true
|
||||
commands:
|
||||
- pytest -v -s -m 'cpu_test' tool_use
|
||||
- pytest -v -s tool_use
|
||||
|
||||
##### models test #####
|
||||
|
||||
@ -974,8 +971,8 @@ steps:
|
||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||
|
||||
- label: Multi-Modal Accuracy Eval (Small Models) # 10min
|
||||
timeout_in_minutes: 70
|
||||
- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min
|
||||
timeout_in_minutes: 180
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
@ -987,7 +984,8 @@ steps:
|
||||
commands:
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 1
|
||||
- label: Multi-Modal Models Test (Extended) 1 # 60min
|
||||
timeout_in_minutes: 120
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
@ -1011,7 +1009,8 @@ steps:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 3
|
||||
- label: Multi-Modal Models Test (Extended) 3 # 75min
|
||||
timeout_in_minutes: 150
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
@ -1120,7 +1119,6 @@ steps:
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- vllm/model_executor/layers/fused_moe/layer.py
|
||||
- tests/compile/test_fusion_attn.py
|
||||
- tests/compile/test_silu_mul_quant_fusion.py
|
||||
- tests/compile/distributed/test_fusion_all_reduce.py
|
||||
@ -1154,17 +1152,15 @@ steps:
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
- tests/compile/fullgraph/test_full_graph.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all e2e fusion tests
|
||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
||||
|
||||
- label: ROCm GPT-OSS Eval
|
||||
- label: Blackwell GPT-OSS Eval
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
agent_pool: mi325_1
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
gpu: b200
|
||||
optional: true # run on nightlies
|
||||
source_file_dependencies:
|
||||
- tests/evals/gpt_oss
|
||||
@ -1173,7 +1169,7 @@ steps:
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
commands:
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||
|
||||
- label: Blackwell Quantized MoE Test
|
||||
timeout_in_minutes: 60
|
||||
@ -1378,7 +1374,7 @@ steps:
|
||||
- pytest -v -s -x lora/test_llm_with_multi_loras.py
|
||||
- pytest -v -s -x lora/test_olmoe_tp.py
|
||||
|
||||
# Disabled for now because MXFP4 backend on non-cuda platform
|
||||
# Disabled for now because MXFP4 backend on non-cuda platform
|
||||
# doesn't support LoRA yet
|
||||
#- pytest -v -s -x lora/test_gptoss_tp.py
|
||||
|
||||
@ -1444,12 +1440,13 @@ steps:
|
||||
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest -v -s -x lora/test_mixtral.py
|
||||
|
||||
|
||||
- label: LM Eval Large Models # optional
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
gpu: a100
|
||||
optional: true
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
@ -1461,11 +1458,11 @@ steps:
|
||||
|
||||
##### H100 test #####
|
||||
- label: LM Eval Large Models (H100) # optional
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
gpu: h100
|
||||
optional: true
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
@ -1475,6 +1472,7 @@ steps:
|
||||
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
|
||||
|
||||
|
||||
##### H200 test #####
|
||||
- label: Distributed Tests (H200) # optional
|
||||
mirror_hardwares: [amdexperimental]
|
||||
@ -1506,6 +1504,57 @@ steps:
|
||||
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
##### E2E Eval Tests #####
|
||||
- label: LM Eval Small Models (1 Card) # 15min
|
||||
timeout_in_minutes: 20
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
||||
|
||||
- label: LM Eval Large Models (4 Card)
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
gpu: a100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
|
||||
- label: ROCm LM Eval Large Models (8 Card)
|
||||
mirror_hardwares: [amdproduction]
|
||||
agent_pool: mi325_8
|
||||
num_gpus: 8
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8
|
||||
|
||||
- label: ROCm GPT-OSS Eval
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
agent_pool: mi325_1
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
optional: true # run on nightlies
|
||||
source_file_dependencies:
|
||||
- tests/evals/gpt_oss
|
||||
- vllm/model_executor/models/gpt_oss.py
|
||||
- vllm/model_executor/layers/quantization/mxfp4.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
commands:
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||
|
||||
##### RL Integration Tests #####
|
||||
- label: Prime-RL Integration Test # 15min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
@ -1520,7 +1569,6 @@ steps:
|
||||
- .buildkite/scripts/run-prime-rl-test.sh
|
||||
commands:
|
||||
- bash .buildkite/scripts/run-prime-rl-test.sh
|
||||
|
||||
- label: DeepSeek V2-Lite Accuracy
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
@ -1552,4 +1600,27 @@ steps:
|
||||
num_gpus: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
|
||||
- label: DeepSeek V2-Lite Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
|
||||
|
||||
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
|
||||
|
||||
@ -57,8 +57,8 @@ steps:
|
||||
- pytest -v -s -m 'not cpu_test' multimodal
|
||||
- pytest -v -s utils_
|
||||
|
||||
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
|
||||
timeout_in_minutes: 20
|
||||
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/test_inputs.py
|
||||
@ -66,6 +66,7 @@ steps:
|
||||
- tests/multimodal
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
- tests/tokenizers_
|
||||
- tests/tool_parsers
|
||||
- tests/transformers_utils
|
||||
- tests/config
|
||||
no_gpu: true
|
||||
@ -75,6 +76,7 @@ steps:
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s -m 'cpu_test' multimodal
|
||||
- pytest -v -s tokenizers_
|
||||
- pytest -v -s tool_parsers
|
||||
- pytest -v -s transformers_utils
|
||||
- pytest -v -s config
|
||||
|
||||
@ -350,7 +352,8 @@ steps:
|
||||
timeout_in_minutes: 25
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- vllm/v1/attention
|
||||
- vllm/model_executor/layers
|
||||
- tests/v1/determinism/
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
@ -387,6 +390,7 @@ steps:
|
||||
working_dir: "/vllm-workspace/examples"
|
||||
source_file_dependencies:
|
||||
- vllm/entrypoints
|
||||
- vllm/multimodal
|
||||
- examples/
|
||||
commands:
|
||||
- pip install tensorizer # for tensorizer test
|
||||
@ -466,7 +470,9 @@ steps:
|
||||
# tests covered elsewhere.
|
||||
# Use `find` to launch multiple instances of pytest so that
|
||||
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||
- "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
|
||||
# However, find does not normally propagate error codes, so we combine it with xargs
|
||||
# (using -0 for proper path handling)
|
||||
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
|
||||
|
||||
- label: PyTorch Fullgraph Smoke Test # 15min
|
||||
timeout_in_minutes: 30
|
||||
@ -480,7 +486,9 @@ steps:
|
||||
# as it is a heavy test that is covered in other steps.
|
||||
# Use `find` to launch multiple instances of pytest so that
|
||||
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
|
||||
# However, find does not normally propagate error codes, so we combine it with xargs
|
||||
# (using -0 for proper path handling)
|
||||
- "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
|
||||
|
||||
- label: PyTorch Fullgraph Test # 27min
|
||||
timeout_in_minutes: 40
|
||||
@ -666,16 +674,7 @@ steps:
|
||||
- vllm/
|
||||
- tests/tool_use
|
||||
commands:
|
||||
- pytest -v -s -m 'not cpu_test' tool_use
|
||||
|
||||
- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
|
||||
timeout_in_minutes: 10
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/tool_use
|
||||
no_gpu: true
|
||||
commands:
|
||||
- pytest -v -s -m 'cpu_test' tool_use
|
||||
- pytest -v -s tool_use
|
||||
|
||||
##### models test #####
|
||||
|
||||
@ -686,6 +685,7 @@ steps:
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/test_initialization.py
|
||||
- tests/models/registry.py
|
||||
commands:
|
||||
# Run a subset of model initialization tests
|
||||
- pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
|
||||
@ -698,6 +698,7 @@ steps:
|
||||
- vllm/model_executor/models/
|
||||
- vllm/transformers_utils/
|
||||
- tests/models/test_initialization.py
|
||||
- tests/models/registry.py
|
||||
commands:
|
||||
# Only when vLLM model source is modified - test initialization of a large
|
||||
# subset of supported models (the complement of the small subset in the above
|
||||
@ -830,7 +831,7 @@ steps:
|
||||
- tests/models/multimodal
|
||||
no_gpu: true
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- "pip install git+https://github.com/TIGER-AI-Lab/Mantis.git || echo 'Mantis installation skipped (decord not available on CPU-only environment)'"
|
||||
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
|
||||
|
||||
- label: Multi-Modal Processor Test
|
||||
@ -1340,6 +1341,7 @@ steps:
|
||||
- label: Prime-RL Integration Test # 15min
|
||||
timeout_in_minutes: 30
|
||||
optional: true
|
||||
soft_fail: true
|
||||
num_gpus: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
source_file_dependencies:
|
||||
@ -1374,21 +1376,3 @@ steps:
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
|
||||
- label: DeepSeek V2-Lite Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
|
||||
|
||||
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
|
||||
|
||||
21
.buildkite/test_areas/attention.yaml
Normal file
21
.buildkite/test_areas/attention.yaml
Normal file
@ -0,0 +1,21 @@
|
||||
group: Attention
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: V1 attention (H100)
|
||||
timeout_in_minutes: 30
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
- pytest -v -s v1/attention
|
||||
|
||||
- label: V1 attention (B200)
|
||||
timeout_in_minutes: 30
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
|
||||
16
.buildkite/test_areas/basic_correctness.yaml
Normal file
16
.buildkite/test_areas/basic_correctness.yaml
Normal file
@ -0,0 +1,16 @@
|
||||
group: Basic Correctness
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Basic Correctness
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/basic_correctness/test_basic_correctness
|
||||
- tests/basic_correctness/test_cpu_offload
|
||||
- tests/basic_correctness/test_cumem.py
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s basic_correctness/test_cumem.py
|
||||
- pytest -v -s basic_correctness/test_basic_correctness.py
|
||||
- pytest -v -s basic_correctness/test_cpu_offload.py
|
||||
19
.buildkite/test_areas/benchmarks.yaml
Normal file
19
.buildkite/test_areas/benchmarks.yaml
Normal file
@ -0,0 +1,19 @@
|
||||
group: Benchmarks
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Benchmarks
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/.buildkite"
|
||||
source_file_dependencies:
|
||||
- benchmarks/
|
||||
commands:
|
||||
- bash scripts/run-benchmarks.sh
|
||||
|
||||
- label: Benchmarks CLI Test
|
||||
timeout_in_minutes: 20
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/benchmarks/
|
||||
commands:
|
||||
- pytest -v -s benchmarks/
|
||||
57
.buildkite/test_areas/compile.yaml
Normal file
57
.buildkite/test_areas/compile.yaml
Normal file
@ -0,0 +1,57 @@
|
||||
group: Compile
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Fusion and Compile Tests (B200)
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/v1/worker/
|
||||
- vllm/v1/cudagraph_dispatcher.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/test_fusion_attn.py
|
||||
- tests/compile/test_silu_mul_quant_fusion.py
|
||||
- tests/compile/distributed/test_fusion_all_reduce.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
- tests/compile/fullgraph/test_full_graph.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- pytest -v -s tests/compile/test_fusion_attn.py
|
||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||
# Wrap with quotes to escape yaml
|
||||
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
||||
|
||||
- label: Fusion E2E (2 GPUs)(B200)
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all e2e fusion tests
|
||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
||||
|
||||
22
.buildkite/test_areas/cuda.yaml
Normal file
22
.buildkite/test_areas/cuda.yaml
Normal file
@ -0,0 +1,22 @@
|
||||
group: CUDA
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Platform Tests (CUDA)
|
||||
timeout_in_minutes: 15
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/cuda
|
||||
commands:
|
||||
- pytest -v -s cuda/test_cuda_context.py
|
||||
|
||||
- label: Cudagraph
|
||||
timeout_in_minutes: 20
|
||||
source_file_dependencies:
|
||||
- tests/v1/cudagraph
|
||||
- vllm/v1/cudagraph_dispatcher.py
|
||||
- vllm/config/compilation.py
|
||||
- vllm/compilation
|
||||
commands:
|
||||
- pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
|
||||
- pytest -v -s v1/cudagraph/test_cudagraph_mode.py
|
||||
199
.buildkite/test_areas/distributed.yaml
Normal file
199
.buildkite/test_areas/distributed.yaml
Normal file
@ -0,0 +1,199 @@
|
||||
group: Distributed
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Distributed Comm Ops
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/distributed
|
||||
- tests/distributed
|
||||
commands:
|
||||
- pytest -v -s distributed/test_comm_ops.py
|
||||
- pytest -v -s distributed/test_shm_broadcast.py
|
||||
- pytest -v -s distributed/test_shm_buffer.py
|
||||
- pytest -v -s distributed/test_shm_storage.py
|
||||
|
||||
- label: Distributed (2 GPUs)
|
||||
timeout_in_minutes: 90
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/compilation/
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
- vllm/executor/
|
||||
- vllm/worker/worker_base.py
|
||||
- vllm/v1/engine/
|
||||
- vllm/v1/worker/
|
||||
- tests/compile/fullgraph/test_basic_correctness.py
|
||||
- tests/compile/test_wrapper.py
|
||||
- tests/distributed/
|
||||
- tests/entrypoints/llm/test_collective_rpc.py
|
||||
- tests/v1/distributed
|
||||
- tests/v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- tests/v1/shutdown
|
||||
- tests/v1/worker/test_worker_memory_snapshot.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s ./compile/fullgraph/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- pytest -v -s distributed/test_sequence_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
||||
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
|
||||
|
||||
- label: Distributed Tests (4 GPUs)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- tests/distributed/test_utils
|
||||
- tests/distributed/test_pynccl
|
||||
- tests/distributed/test_events
|
||||
- tests/compile/fullgraph/test_basic_correctness.py
|
||||
- examples/offline_inference/rlhf.py
|
||||
- examples/offline_inference/rlhf_colocate.py
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
- tests/v1/distributed
|
||||
- tests/v1/engine/test_engine_core_client.py
|
||||
- tests/distributed/test_symm_mem_allreduce.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
# test with torchrun tp=2 and external_dp=2
|
||||
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
# test with torchrun tp=2 and pp=2
|
||||
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
# test with torchrun tp=4 and dp=1
|
||||
- TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
# test with torchrun tp=2, pp=2 and dp=1
|
||||
- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
# test with torchrun tp=1 and dp=4 with ep
|
||||
- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
# test with torchrun tp=2 and dp=2 with ep
|
||||
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
# test with internal dp
|
||||
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
|
||||
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
||||
- pytest -v -s distributed/test_utils.py
|
||||
- pytest -v -s compile/fullgraph/test_basic_correctness.py
|
||||
- pytest -v -s distributed/test_pynccl.py
|
||||
- pytest -v -s distributed/test_events.py
|
||||
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
||||
# TODO: create a dedicated test section for multi-GPU example tests
|
||||
# when we have multiple distributed example tests
|
||||
- cd ../examples/offline_inference
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||
|
||||
- label: Distributed Tests (8 GPUs)(H100)
|
||||
timeout_in_minutes: 10
|
||||
gpu: h100
|
||||
num_gpus: 8
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- examples/offline_inference/torchrun_dp_example.py
|
||||
- vllm/config/parallel.py
|
||||
- vllm/distributed/
|
||||
- vllm/v1/engine/llm_engine.py
|
||||
- vllm/v1/executor/uniproc_executor.py
|
||||
- vllm/v1/worker/gpu_worker.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
# test with torchrun tp=2 and dp=4 with ep
|
||||
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
|
||||
|
||||
- label: Distributed Tests (4 GPUs)(A100)
|
||||
gpu: a100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
commands:
|
||||
# NOTE: don't test the llama model here; the HF implementation seems to be buggy
|
||||
# see https://github.com/vllm-project/vllm/pull/5689 for details
|
||||
- pytest -v -s distributed/test_custom_all_reduce.py
|
||||
- torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
|
||||
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest -v -s -x lora/test_mixtral.py
|
||||
|
||||
- label: Distributed Tests (2 GPUs)(H200)
|
||||
gpu: h200
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
|
||||
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
- label: Distributed Tests (2 GPUs)(B200)
|
||||
gpu: b200
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
- label: 2 Node Test (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
num_nodes: 2
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
- vllm/executor/
|
||||
- vllm/model_executor/models/
|
||||
- tests/distributed/
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
commands:
|
||||
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
|
||||
|
||||
- label: Distributed NixlConnector PD accuracy (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||
- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
|
||||
|
||||
- label: Pipeline + Context Parallelism (4 GPUs)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
- vllm/executor/
|
||||
- vllm/model_executor/models/
|
||||
- tests/distributed/
|
||||
commands:
|
||||
- pytest -v -s distributed/test_pp_cudagraph.py
|
||||
- pytest -v -s distributed/test_pipeline_parallel.py
|
||||
59
.buildkite/test_areas/e2e_integration.yaml
Normal file
59
.buildkite/test_areas/e2e_integration.yaml
Normal file
@ -0,0 +1,59 @@
|
||||
group: E2E Integration
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: DeepSeek V2-Lite Accuracy
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
|
||||
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
|
||||
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
|
||||
timeout_in_minutes: 60
|
||||
gpu: b200
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
|
||||
- label: Prime-RL Integration (2 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- .buildkite/scripts/run-prime-rl-test.sh
|
||||
commands:
|
||||
- bash .buildkite/scripts/run-prime-rl-test.sh
|
||||
|
||||
- label: DeepSeek V2-Lite Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
|
||||
|
||||
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
|
||||
26
.buildkite/test_areas/engine.yaml
Normal file
26
.buildkite/test_areas/engine.yaml
Normal file
@ -0,0 +1,26 @@
|
||||
group: Engine
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Engine
|
||||
timeout_in_minutes: 15
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/engine
|
||||
- tests/test_sequence
|
||||
- tests/test_config
|
||||
- tests/test_logger
|
||||
- tests/test_vllm_port
|
||||
commands:
|
||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
|
||||
|
||||
- label: V1 e2e + engine
|
||||
timeout_in_minutes: 45
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
commands:
|
||||
# TODO: accuracy does not match on H100, regardless of whether
|
||||
# VLLM_USE_FLASHINFER_SAMPLER is set.
|
||||
- pytest -v -s v1/e2e
|
||||
- pytest -v -s v1/engine
|
||||
68
.buildkite/test_areas/entrypoints.yaml
Normal file
68
.buildkite/test_areas/entrypoints.yaml
Normal file
@ -0,0 +1,68 @@
|
||||
group: Entrypoints
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Entrypoints Unit Tests
|
||||
timeout_in_minutes: 10
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/entrypoints
|
||||
- tests/entrypoints/
|
||||
commands:
|
||||
- pytest -v -s entrypoints/openai/tool_parsers
|
||||
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
|
||||
|
||||
- label: Entrypoints Integration (LLM)
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/entrypoints/llm
|
||||
- tests/entrypoints/offline_mode
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
|
||||
- label: Entrypoints Integration (API Server)
|
||||
timeout_in_minutes: 130
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/entrypoints/openai
|
||||
- tests/entrypoints/test_chat_utils
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
|
||||
- pytest -v -s entrypoints/test_chat_utils.py
|
||||
|
||||
|
||||
- label: Entrypoints Integration (Pooling)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/entrypoints/pooling
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/pooling
|
||||
|
||||
|
||||
- label: Entrypoints V1
|
||||
timeout_in_minutes: 50
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
commands:
|
||||
- pytest -v -s v1/entrypoints
|
||||
|
||||
- label: OpenAI API Correctness
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/entrypoints/openai/
|
||||
- vllm/model_executor/models/whisper.py
|
||||
commands: # LMEval+Transcription WER check
|
||||
- pytest -s entrypoints/openai/correctness/
|
||||
23
.buildkite/test_areas/expert_parallelism.yaml
Normal file
23
.buildkite/test_areas/expert_parallelism.yaml
Normal file
@ -0,0 +1,23 @@
|
||||
group: Expert Parallelism
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: EPLB Algorithm
|
||||
timeout_in_minutes: 15
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/eplb
|
||||
- tests/distributed/test_eplb_algo.py
|
||||
commands:
|
||||
- pytest -v -s distributed/test_eplb_algo.py
|
||||
|
||||
- label: EPLB Execution
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/eplb
|
||||
- tests/distributed/test_eplb_execute.py
|
||||
commands:
|
||||
- pytest -v -s distributed/test_eplb_execute.py
|
||||
- pytest -v -s distributed/test_eplb_spec_decode.py
|
||||
117
.buildkite/test_areas/kernels.yaml
Normal file
117
.buildkite/test_areas/kernels.yaml
Normal file
@ -0,0 +1,117 @@
|
||||
group: Kernels
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Kernels Core Operation Test
|
||||
timeout_in_minutes: 75
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- tests/kernels/core
|
||||
- tests/kernels/test_top_k_per_row.py
|
||||
commands:
|
||||
- pytest -v -s kernels/core kernels/test_top_k_per_row.py
|
||||
|
||||
- label: Kernels Attention Test %N
|
||||
timeout_in_minutes: 35
|
||||
source_file_dependencies:
|
||||
- csrc/attention/
|
||||
- vllm/attention
|
||||
- vllm/v1/attention
|
||||
- tests/kernels/attention
|
||||
commands:
|
||||
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 2
|
||||
|
||||
- label: Kernels Quantization Test %N
|
||||
timeout_in_minutes: 90
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/kernels/quantization
|
||||
commands:
|
||||
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 2
|
||||
|
||||
- label: Kernels MoE Test %N
|
||||
timeout_in_minutes: 60
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/cutlass_w8a8/moe/
|
||||
- csrc/moe/
|
||||
- tests/kernels/moe
|
||||
- vllm/model_executor/layers/fused_moe/
|
||||
- vllm/distributed/device_communicators/
|
||||
- vllm/envs.py
|
||||
- vllm/config
|
||||
commands:
|
||||
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 2
|
||||
|
||||
- label: Kernels Mamba Test
|
||||
timeout_in_minutes: 45
|
||||
source_file_dependencies:
|
||||
- csrc/mamba/
|
||||
- tests/kernels/mamba
|
||||
- vllm/model_executor/layers/mamba/ops
|
||||
commands:
|
||||
- pytest -v -s kernels/mamba
|
||||
|
||||
- label: Kernels DeepGEMM Test (H100)
|
||||
timeout_in_minutes: 45
|
||||
gpu: h100
|
||||
num_gpus: 1
|
||||
source_file_dependencies:
|
||||
- tools/install_deepgemm.sh
|
||||
- vllm/utils/deep_gemm.py
|
||||
- vllm/model_executor/layers/fused_moe
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/kernels/quantization/test_block_fp8.py
|
||||
- tests/kernels/moe/test_deepgemm.py
|
||||
- tests/kernels/moe/test_batched_deepgemm.py
|
||||
- tests/kernels/attention/test_deepgemm_attention.py
|
||||
commands:
|
||||
- pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
|
||||
- pytest -v -s kernels/moe/test_deepgemm.py
|
||||
- pytest -v -s kernels/moe/test_batched_deepgemm.py
|
||||
- pytest -v -s kernels/attention/test_deepgemm_attention.py
|
||||
|
||||
- label: Kernels (B200)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
# optional: true
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- csrc/attention/mla/
|
||||
- csrc/quantization/cutlass_w8a8/moe/
|
||||
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
||||
- vllm/v1/attention/backends/mla/flashinfer_mla.py
|
||||
- vllm/platforms/cuda.py
|
||||
- vllm/attention/selector.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
# Attention
|
||||
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
||||
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
||||
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
|
||||
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
|
||||
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
|
||||
- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
|
||||
# Quantization
|
||||
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
|
||||
- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
|
||||
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
|
||||
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
||||
46
.buildkite/test_areas/lm_eval.yaml
Normal file
46
.buildkite/test_areas/lm_eval.yaml
Normal file
@ -0,0 +1,46 @@
|
||||
group: LM Eval
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: LM Eval Small Models
|
||||
timeout_in_minutes: 75
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
autorun_on_main: true
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
||||
|
||||
- label: LM Eval Large Models (4 GPUs)(A100)
|
||||
gpu: a100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
|
||||
- label: LM Eval Large Models (4 GPUs)(H100)
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
|
||||
|
||||
- label: LM Eval Small Models (B200)
|
||||
timeout_in_minutes: 120
|
||||
gpu: b200
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
|
||||
31
.buildkite/test_areas/lora.yaml
Normal file
31
.buildkite/test_areas/lora.yaml
Normal file
@ -0,0 +1,31 @@
|
||||
group: LoRA
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: LoRA %N
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/lora
|
||||
- tests/lora
|
||||
commands:
|
||||
- pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
|
||||
parallelism: 4
|
||||
|
||||
|
||||
- label: LoRA TP (Distributed)
|
||||
timeout_in_minutes: 30
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/lora
|
||||
- tests/lora
|
||||
commands:
|
||||
# FIXIT: find out which code initializes CUDA before running the test
|
||||
# before the fix, we need to use spawn to test it
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
# There is some Tensor Parallelism related processing logic in LoRA that
|
||||
# requires multi-GPU testing for validation.
|
||||
- pytest -v -s -x lora/test_chatglm3_tp.py
|
||||
- pytest -v -s -x lora/test_llama_tp.py
|
||||
- pytest -v -s -x lora/test_llm_with_multi_loras.py
|
||||
- pytest -v -s -x lora/test_olmoe_tp.py
|
||||
- pytest -v -s -x lora/test_gptoss_tp.py
|
||||
165
.buildkite/test_areas/misc.yaml
Normal file
165
.buildkite/test_areas/misc.yaml
Normal file
@ -0,0 +1,165 @@
|
||||
group: Miscellaneous
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: V1 Others
|
||||
timeout_in_minutes: 60
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||
# split the test to avoid interference
|
||||
- pytest -v -s -m 'not cpu_test' v1/core
|
||||
- pytest -v -s v1/executor
|
||||
- pytest -v -s v1/kv_offload
|
||||
- pytest -v -s v1/sample
|
||||
- pytest -v -s v1/logits_processors
|
||||
- pytest -v -s v1/worker
|
||||
- pytest -v -s v1/spec_decode
|
||||
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
|
||||
- pytest -v -s -m 'not cpu_test' v1/metrics
|
||||
- pytest -v -s v1/test_oracle.py
|
||||
- pytest -v -s v1/test_request.py
|
||||
- pytest -v -s v1/test_outputs.py
|
||||
# Integration test for streaming correctness (requires special branch).
|
||||
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||
|
||||
- label: V1 Others (CPU)
|
||||
depends_on: ~
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
no_gpu: true
|
||||
commands:
|
||||
# split the test to avoid interference
|
||||
- pytest -v -s -m 'cpu_test' v1/core
|
||||
- pytest -v -s v1/structured_output
|
||||
- pytest -v -s v1/test_serial_utils.py
|
||||
- pytest -v -s -m 'cpu_test' v1/kv_connector/unit
|
||||
- pytest -v -s -m 'cpu_test' v1/metrics
|
||||
|
||||
- label: Regression
|
||||
timeout_in_minutes: 20
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/test_regression
|
||||
commands:
|
||||
- pip install modelscope
|
||||
- pytest -v -s test_regression.py
|
||||
working_dir: "/vllm-workspace/tests" # optional
|
||||
|
||||
- label: Examples
|
||||
timeout_in_minutes: 45
|
||||
working_dir: "/vllm-workspace/examples"
|
||||
source_file_dependencies:
|
||||
- vllm/entrypoints
|
||||
- vllm/multimodal
|
||||
- examples/
|
||||
commands:
|
||||
- pip install tensorizer # for tensorizer test
|
||||
- python3 offline_inference/basic/chat.py # for basic
|
||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||
- python3 offline_inference/basic/classify.py
|
||||
- python3 offline_inference/basic/embed.py
|
||||
- python3 offline_inference/basic/score.py
|
||||
# for multi-modal models
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
# for pooling models
|
||||
- python3 pooling/pooling/vision_language_pooling.py --seed 0
|
||||
# for features demo
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+, causing this test to OOM on a 1xL4 GPU
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
||||
|
||||
- label: Metrics, Tracing (2 GPUs)
|
||||
timeout_in_minutes: 20
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1/tracing
|
||||
commands:
|
||||
- "pip install \
|
||||
'opentelemetry-sdk>=1.26.0' \
|
||||
'opentelemetry-api>=1.26.0' \
|
||||
'opentelemetry-exporter-otlp>=1.26.0' \
|
||||
'opentelemetry-semantic-conventions-ai>=0.4.1'"
|
||||
- pytest -v -s v1/tracing
|
||||
|
||||
- label: Python-only Installation
|
||||
depends_on: ~
|
||||
timeout_in_minutes: 20
|
||||
source_file_dependencies:
|
||||
- tests/standalone_tests/python_only_compile.sh
|
||||
- setup.py
|
||||
commands:
|
||||
- bash standalone_tests/python_only_compile.sh
|
||||
|
||||
- label: Async Engine, Inputs, Utils, Worker
|
||||
timeout_in_minutes: 50
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/multimodal
|
||||
- tests/utils_
|
||||
commands:
|
||||
- pytest -v -s -m 'not cpu_test' multimodal
|
||||
- pytest -v -s utils_
|
||||
|
||||
- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
|
||||
depends_on: ~
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/test_inputs.py
|
||||
- tests/test_outputs.py
|
||||
- tests/multimodal
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
- tests/tokenizers_
|
||||
- tests/tool_parsers
|
||||
- tests/transformers_utils
|
||||
- tests/config
|
||||
no_gpu: true
|
||||
commands:
|
||||
- python3 standalone_tests/lazy_imports.py
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s -m 'cpu_test' multimodal
|
||||
- pytest -v -s tokenizers_
|
||||
- pytest -v -s tool_parsers
|
||||
- pytest -v -s transformers_utils
|
||||
- pytest -v -s config
|
||||
|
||||
- label: GPT-OSS Eval (B200)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- tests/evals/gpt_oss
|
||||
- vllm/model_executor/models/gpt_oss.py
|
||||
- vllm/model_executor/layers/quantization/mxfp4.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
commands:
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||
|
||||
- label: Batch Invariance (H100)
|
||||
timeout_in_minutes: 25
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/v1/attention
|
||||
- vllm/model_executor/layers
|
||||
- tests/v1/determinism/
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pip install pytest-timeout pytest-forked
|
||||
- pytest -v -s v1/determinism/test_batch_invariance.py
|
||||
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
|
||||
17
.buildkite/test_areas/model_executor.yaml
Normal file
17
.buildkite/test_areas/model_executor.yaml
Normal file
@ -0,0 +1,17 @@
|
||||
group: Model Executor
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Model Executor
|
||||
timeout_in_minutes: 35
|
||||
source_file_dependencies:
|
||||
- vllm/engine/arg_utils.py
|
||||
- vllm/config/model.py
|
||||
- vllm/model_executor
|
||||
- tests/model_executor
|
||||
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
commands:
|
||||
- apt-get update && apt-get install -y curl libsodium23
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s model_executor
|
||||
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
62
.buildkite/test_areas/models_basic.yaml
Normal file
62
.buildkite/test_areas/models_basic.yaml
Normal file
@ -0,0 +1,62 @@
|
||||
group: Models - Basic
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Basic Models Tests (Initialization)
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/test_initialization.py
|
||||
commands:
|
||||
# Run a subset of model initialization tests
|
||||
- pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
|
||||
|
||||
- label: Basic Models Tests (Extra Initialization) %N
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/models/
|
||||
- tests/models/test_initialization.py
|
||||
commands:
|
||||
# Only when vLLM model source is modified - test initialization of a large
|
||||
# subset of supported models (the complement of the small subset in the above
|
||||
# test.) Also run if model initialization test file is modified
|
||||
- pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
|
||||
parallelism: 2
|
||||
|
||||
- label: Basic Models Tests (Other)
|
||||
timeout_in_minutes: 45
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/test_transformers.py
|
||||
- tests/models/test_registry.py
|
||||
commands:
|
||||
- pytest -v -s models/test_transformers.py models/test_registry.py
|
||||
|
||||
- label: Basic Models Test (Other CPU) # 5min
|
||||
timeout_in_minutes: 10
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/test_utils.py
|
||||
- tests/models/test_vision.py
|
||||
no_gpu: true
|
||||
commands:
|
||||
- pytest -v -s models/test_utils.py models/test_vision.py
|
||||
|
||||
- label: Transformers Nightly Models
|
||||
working_dir: "/vllm-workspace/"
|
||||
optional: true
|
||||
soft_fail: true
|
||||
commands:
|
||||
- pip install --upgrade git+https://github.com/huggingface/transformers
|
||||
- pytest -v -s tests/models/test_initialization.py
|
||||
- pytest -v -s tests/models/test_transformers.py
|
||||
- pytest -v -s tests/models/multimodal/processing/
|
||||
- pytest -v -s tests/models/multimodal/test_mapping.py
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||
# Whisper needs spawn method to avoid deadlock
|
||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||
22
.buildkite/test_areas/models_distributed.yaml
Normal file
22
.buildkite/test_areas/models_distributed.yaml
Normal file
@ -0,0 +1,22 @@
|
||||
group: Models - Distributed
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Distributed Model Tests (2 GPUs)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/model_loader/sharded_state_loader.py
|
||||
- vllm/model_executor/models/
|
||||
- tests/basic_correctness/
|
||||
- tests/model_executor/model_loader/test_sharded_state_loader.py
|
||||
- tests/models/
|
||||
commands:
|
||||
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
|
||||
# Avoid importing model tests that cause CUDA reinitialization error
|
||||
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
|
||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
|
||||
91
.buildkite/test_areas/models_language.yaml
Normal file
91
.buildkite/test_areas/models_language.yaml
Normal file
@ -0,0 +1,91 @@
|
||||
group: Models - Language
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Language Models Tests (Standard)
|
||||
timeout_in_minutes: 25
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language
|
||||
commands:
|
||||
# Test standard language models, excluding a subset of slow tests
|
||||
- pip freeze | grep -E 'torch'
|
||||
- pytest -v -s models/language -m 'core_model and (not slow_test)'
|
||||
|
||||
- label: Language Models Tests (Extra Standard) %N
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/models/
|
||||
- tests/models/language/pooling/test_embedding.py
|
||||
- tests/models/language/generation/test_common.py
|
||||
- tests/models/language/pooling/test_classification.py
|
||||
commands:
|
||||
# Shard slow subset of standard language models tests. Only run when model
|
||||
# source is modified, or when specified test files are modified
|
||||
- pip freeze | grep -E 'torch'
|
||||
- pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
|
||||
parallelism: 2
|
||||
|
||||
- label: Language Models Tests (Hybrid) %N
|
||||
timeout_in_minutes: 75
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language/generation
|
||||
commands:
|
||||
# Install fast path packages for testing against transformers
|
||||
# Note: also needed to run plamo2 model in vLLM
|
||||
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
|
||||
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
||||
# Shard hybrid language model tests
|
||||
- pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
|
||||
parallelism: 2
|
||||
|
||||
- label: Language Models Test (Extended Generation) # 80min
|
||||
timeout_in_minutes: 110
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language/generation
|
||||
commands:
|
||||
# Install fast path packages for testing against transformers
|
||||
# Note: also needed to run plamo2 model in vLLM
|
||||
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
|
||||
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
||||
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
|
||||
|
||||
- label: Language Models Test (PPL)
|
||||
timeout_in_minutes: 110
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language/generation_ppl_test
|
||||
commands:
|
||||
- pytest -v -s models/language/generation_ppl_test
|
||||
|
||||
- label: Language Models Test (Extended Pooling) # 36min
|
||||
timeout_in_minutes: 50
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language/pooling
|
||||
commands:
|
||||
- pytest -v -s models/language/pooling -m 'not core_model'
|
||||
|
||||
- label: Language Models Test (MTEB)
|
||||
timeout_in_minutes: 110
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language/pooling_mteb_test
|
||||
commands:
|
||||
- pytest -v -s models/language/pooling_mteb_test
|
||||
79
.buildkite/test_areas/models_multimodal.yaml
Normal file
79
.buildkite/test_areas/models_multimodal.yaml
Normal file
@ -0,0 +1,79 @@
|
||||
group: Models - Multimodal
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Multi-Modal Models (Standard) # 60min
|
||||
timeout_in_minutes: 80
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pip freeze | grep -E 'torch'
|
||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||
|
||||
- label: Multi-Modal Processor Test (CPU)
|
||||
timeout_in_minutes: 60
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
no_gpu: true
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
|
||||
|
||||
- label: Multi-Modal Processor # 44min
|
||||
timeout_in_minutes: 60
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/processing/test_tensor_schema.py
|
||||
|
||||
- label: Multi-Modal Accuracy Eval (Small Models) # 50min
|
||||
timeout_in_minutes: 70
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- vllm/multimodal/
|
||||
- vllm/inputs/
|
||||
- vllm/v1/core/
|
||||
commands:
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
|
||||
|
||||
- label: Multi-Modal Models (Extended) 1
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
|
||||
|
||||
- label: Multi-Modal Models (Extended) 2
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
|
||||
|
||||
- label: Multi-Modal Models (Extended) 3
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
|
||||
|
||||
# This test is used only in the PR development phase to test individual models and should never run on main
|
||||
- label: Custom Models
|
||||
optional: true
|
||||
commands:
|
||||
- echo 'Testing custom models...'
|
||||
# PR authors can temporarily add commands below to test individual models
|
||||
# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
|
||||
# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
|
||||
34
.buildkite/test_areas/plugins.yaml
Normal file
34
.buildkite/test_areas/plugins.yaml
Normal file
@ -0,0 +1,34 @@
|
||||
group: Plugins
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Plugin Tests (2 GPUs)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/plugins/
|
||||
- tests/plugins/
|
||||
commands:
|
||||
# begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
|
||||
- pip install -e ./plugins/vllm_add_dummy_platform
|
||||
- pytest -v -s plugins_tests/test_platform_plugins.py
|
||||
- pip uninstall vllm_add_dummy_platform -y
|
||||
# end platform plugin tests
|
||||
# begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
|
||||
- pip install -e ./plugins/prithvi_io_processor_plugin
|
||||
- pytest -v -s plugins_tests/test_io_processor_plugins.py
|
||||
- pip uninstall prithvi_io_processor_plugin -y
|
||||
# end io_processor plugins test
|
||||
# begin stat_logger plugins test
|
||||
- pip install -e ./plugins/vllm_add_dummy_stat_logger
|
||||
- pytest -v -s plugins_tests/test_stats_logger_plugins.py
|
||||
- pip uninstall dummy_stat_logger -y
|
||||
# end stat_logger plugins test
|
||||
# other tests continue here:
|
||||
- pytest -v -s plugins_tests/test_scheduler_plugins.py
|
||||
- pip install -e ./plugins/vllm_add_dummy_model
|
||||
- pytest -v -s distributed/test_distributed_oot.py
|
||||
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
|
||||
50
.buildkite/test_areas/pytorch.yaml
Normal file
50
.buildkite/test_areas/pytorch.yaml
Normal file
@ -0,0 +1,50 @@
|
||||
group: PyTorch
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: PyTorch Compilation Unit Tests
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
# Run unit tests defined directly under compile/,
|
||||
# not including subdirectories, which are usually heavier
|
||||
# tests covered elsewhere.
|
||||
# Use `find` to launch multiple instances of pytest so that
|
||||
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||
- "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;"
|
||||
|
||||
- label: PyTorch Fullgraph Smoke Test
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
# Run smoke tests under fullgraph directory, except test_full_graph.py
|
||||
# as it is a heavy test that is covered in other steps.
|
||||
# Use `find` to launch multiple instances of pytest so that
|
||||
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
|
||||
|
||||
- label: PyTorch Fullgraph
|
||||
timeout_in_minutes: 40
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
# fp8 kv scales not supported on sm89, tested on Blackwell instead
|
||||
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
||||
# Limit to no custom ops to reduce running time
|
||||
# Wrap with quotes to escape yaml and avoid starting -k string with a -
|
||||
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
||||
|
||||
- label: Pytorch Nightly Dependency Override Check # 2min
|
||||
# if this test fails, it means the nightly torch version is not compatible with some
|
||||
# of the dependencies. Please check the error message and add the package to the whitelist
|
||||
# in /vllm/tools/pre_commit/generate_nightly_torch_test.py
|
||||
soft_fail: true
|
||||
source_file_dependencies:
|
||||
- requirements/nightly_torch_test.txt
|
||||
commands:
|
||||
- bash standalone_tests/pytorch_nightly_dependency.sh
|
||||
46
.buildkite/test_areas/quantization.yaml
Normal file
46
.buildkite/test_areas/quantization.yaml
Normal file
@ -0,0 +1,46 @@
|
||||
group: Quantization
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Quantization
|
||||
timeout_in_minutes: 90
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/quantization
|
||||
commands:
|
||||
# temporary install here since we need nightly, will move to requirements/test.in
|
||||
# after torchao 0.12 release, and pin a working version of torchao nightly here
|
||||
|
||||
# since torchao nightly is only compatible with torch nightly currently
|
||||
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
||||
# we can only upgrade after this is resolved
|
||||
# TODO(jerryzh168): resolve the above comment
|
||||
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
|
||||
- uv pip install --system conch-triton-kernels
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
||||
|
||||
- label: Quantized MoE Test (B200)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
- tests/quantization/test_blackwell_moe.py
|
||||
- vllm/model_executor/models/deepseek_v2.py
|
||||
- vllm/model_executor/models/gpt_oss.py
|
||||
- vllm/model_executor/models/llama4.py
|
||||
- vllm/model_executor/layers/fused_moe
|
||||
- vllm/model_executor/layers/quantization/compressed_tensors
|
||||
- vllm/model_executor/layers/quantization/modelopt.py
|
||||
- vllm/model_executor/layers/quantization/mxfp4.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
commands:
|
||||
- pytest -s -v tests/quantization/test_blackwell_moe.py
|
||||
|
||||
- label: Quantized Models Test
|
||||
timeout_in_minutes: 60
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/models/quantization
|
||||
commands:
|
||||
- pytest -v -s models/quantization
|
||||
14
.buildkite/test_areas/samplers.yaml
Normal file
14
.buildkite/test_areas/samplers.yaml
Normal file
@ -0,0 +1,14 @@
|
||||
group: Samplers
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Samplers Test
|
||||
timeout_in_minutes: 75
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/layers
|
||||
- vllm/sampling_metadata.py
|
||||
- tests/samplers
|
||||
- tests/conftest.py
|
||||
commands:
|
||||
- pytest -v -s samplers
|
||||
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
||||
13
.buildkite/test_areas/tool_use.yaml
Normal file
13
.buildkite/test_areas/tool_use.yaml
Normal file
@ -0,0 +1,13 @@
|
||||
group: Tool use
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: OpenAI-Compatible Tool Use
|
||||
timeout_in_minutes: 35
|
||||
mirror_hardwares: [amdexperimental]
|
||||
fast_check: false
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/tool_use
|
||||
commands:
|
||||
- pytest -v -s tool_use
|
||||
25
.buildkite/test_areas/weight_loading.yaml
Normal file
25
.buildkite/test_areas/weight_loading.yaml
Normal file
@ -0,0 +1,25 @@
|
||||
group: Weight Loading
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Weight Loading Multiple GPU # 33min
|
||||
timeout_in_minutes: 45
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/weight_loading
|
||||
commands:
|
||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
|
||||
|
||||
- label: Weight Loading Multiple GPU - Large Models # optional
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
gpu: a100
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/weight_loading
|
||||
commands:
|
||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
|
||||
50
.github/mergify.yml
vendored
50
.github/mergify.yml
vendored
@ -14,6 +14,52 @@ pull_request_rules:
|
||||
comment:
|
||||
message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
|
||||
|
||||
- name: comment-pre-commit-failure
|
||||
description: Comment on PR when pre-commit check fails
|
||||
conditions:
|
||||
- status-failure=pre-commit
|
||||
- -closed
|
||||
- -draft
|
||||
actions:
|
||||
comment:
|
||||
message: |
|
||||
Hi @{{author}}, the pre-commit checks have failed. Please run:
|
||||
|
||||
```bash
|
||||
uv pip install pre-commit
|
||||
pre-commit install
|
||||
pre-commit run --all-files
|
||||
```
|
||||
|
||||
Then, commit the changes and push to your branch.
|
||||
|
||||
For future commits, `pre-commit` will run automatically on changed files before each commit.
|
||||
|
||||
> [!TIP]
|
||||
> <details>
|
||||
> <summary>Is <code>mypy</code> or <code>markdownlint</code> failing?</summary>
|
||||
> <br/>
|
||||
> <code>mypy</code> and <code>markdownlint</code> are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
|
||||
>
|
||||
> ```bash
|
||||
> # For mypy (substitute "3.10" with the failing version if needed)
|
||||
> pre-commit run --hook-stage manual mypy-3.10
|
||||
> # For markdownlint
|
||||
> pre-commit run --hook-stage manual markdownlint
|
||||
> ```
|
||||
> </details>
|
||||
|
||||
- name: comment-dco-failure
|
||||
description: Comment on PR when DCO check fails
|
||||
conditions:
|
||||
- status-failure=dco
|
||||
- -closed
|
||||
- -draft
|
||||
actions:
|
||||
comment:
|
||||
message: |
|
||||
Hi @{{author}}, the DCO check has failed. Please click on DCO in the Checks section for instructions on how to resolve this.
|
||||
|
||||
- name: label-ci-build
|
||||
description: Automatically apply ci/build label
|
||||
conditions:
|
||||
@ -140,7 +186,7 @@ pull_request_rules:
|
||||
- files~=^tests/entrypoints/test_context.py
|
||||
- files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
|
||||
- files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
|
||||
- files~=^vllm/entrypoints/harmony_utils.py
|
||||
- files~=^vllm/entrypoints/openai/parser/harmony_utils.py
|
||||
- files~=^vllm/entrypoints/tool_server.py
|
||||
- files~=^vllm/entrypoints/tool.py
|
||||
- files~=^vllm/entrypoints/context.py
|
||||
@ -358,4 +404,4 @@ pull_request_rules:
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
- kv-connector
|
||||
- kv-connector
|
||||
|
||||
2
.github/workflows/cleanup_pr_body.yml
vendored
2
.github/workflows/cleanup_pr_body.yml
vendored
@ -13,7 +13,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
|
||||
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
|
||||
|
||||
2
.github/workflows/macos-smoke-test.yml
vendored
2
.github/workflows/macos-smoke-test.yml
vendored
@ -12,7 +12,7 @@ jobs:
|
||||
timeout-minutes: 30
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@v6.0.1
|
||||
|
||||
- uses: astral-sh/setup-uv@v7
|
||||
with:
|
||||
|
||||
2
.github/workflows/pre-commit.yml
vendored
2
.github/workflows/pre-commit.yml
vendored
@ -16,7 +16,7 @@ jobs:
|
||||
pre-commit:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
|
||||
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
|
||||
- uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
|
||||
with:
|
||||
python-version: "3.12"
|
||||
|
||||
4
.github/workflows/stale.yml
vendored
4
.github/workflows/stale.yml
vendored
@ -7,13 +7,15 @@ on:
|
||||
|
||||
jobs:
|
||||
close-issues-and-pull-requests:
|
||||
# Prevents triggering on forks or other repos
|
||||
if: github.repository == 'vllm-project/vllm'
|
||||
permissions:
|
||||
issues: write
|
||||
pull-requests: write
|
||||
actions: write
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
|
||||
- uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
|
||||
with:
|
||||
# Increasing this value ensures that changes to this workflow
|
||||
# propagate to all issues and PRs in days rather than months
|
||||
|
||||
@ -384,7 +384,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_COMMAND} -E env
|
||||
PYTHONPATH=$PYTHONPATH
|
||||
PYTHONPATH=$ENV{PYTHONPATH}
|
||||
${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
|
||||
RESULT_VARIABLE marlin_generation_result
|
||||
OUTPUT_VARIABLE marlin_generation_result
|
||||
@ -822,7 +822,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_COMMAND} -E env
|
||||
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
|
||||
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$ENV{PYTHONPATH}
|
||||
${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
|
||||
RESULT_VARIABLE machete_generation_result
|
||||
OUTPUT_VARIABLE machete_generation_output
|
||||
@ -874,7 +874,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
|
||||
set(SRCS
|
||||
"csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")
|
||||
"csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu"
|
||||
"csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu"
|
||||
"csrc/quantization/cutlass_w4a8/w4a8_utils.cu"
|
||||
)
|
||||
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${SRCS}"
|
||||
@ -944,7 +947,6 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
|
||||
set(VLLM_MOE_EXT_SRC
|
||||
"csrc/moe/torch_bindings.cpp"
|
||||
"csrc/moe/moe_align_sum_kernels.cu"
|
||||
"csrc/moe/moe_lora_align_sum_kernels.cu"
|
||||
"csrc/moe/topk_softmax_kernels.cu")
|
||||
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
@ -1002,7 +1004,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_COMMAND} -E env
|
||||
PYTHONPATH=$PYTHONPATH
|
||||
PYTHONPATH=$ENV{PYTHONPATH}
|
||||
${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
|
||||
RESULT_VARIABLE moe_marlin_generation_result
|
||||
OUTPUT_VARIABLE moe_marlin_generation_output
|
||||
|
||||
@ -143,11 +143,13 @@ Compute Resources:
|
||||
- Databricks
|
||||
- DeepInfra
|
||||
- Google Cloud
|
||||
- IBM
|
||||
- Intel
|
||||
- Lambda Lab
|
||||
- Nebius
|
||||
- Novita AI
|
||||
- NVIDIA
|
||||
- Red Hat
|
||||
- Replicate
|
||||
- Roblox
|
||||
- RunPod
|
||||
|
||||
@ -18,6 +18,11 @@ MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
|
||||
MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
|
||||
NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
|
||||
NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
|
||||
HOSTNAME=$(hostname)
|
||||
if [[ -z "$HOSTNAME" ]]; then
|
||||
echo "Error: Failed to determine hostname." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
|
||||
RESULT="$LOG_FOLDER/result.txt"
|
||||
@ -82,6 +87,7 @@ start_server() {
|
||||
"$MODEL"
|
||||
"--disable-log-requests"
|
||||
"--port" "8004"
|
||||
"--host" "$HOSTNAME"
|
||||
"--gpu-memory-utilization" "$gpu_memory_utilization"
|
||||
"--max-num-seqs" "$max_num_seqs"
|
||||
"--max-num-batched-tokens" "$max_num_batched_tokens"
|
||||
@ -96,8 +102,9 @@ start_server() {
|
||||
# This correctly passes each element as a separate argument.
|
||||
if [[ -n "$profile_dir" ]]; then
|
||||
# Start server with profiling enabled
|
||||
VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
|
||||
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
|
||||
local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}"
|
||||
VLLM_SERVER_DEV_MODE=1 \
|
||||
vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 &
|
||||
else
|
||||
# Start server without profiling
|
||||
VLLM_SERVER_DEV_MODE=1 \
|
||||
@ -112,7 +119,7 @@ start_server() {
|
||||
# since we should always have permission to send a signal to the server process.
|
||||
kill -0 $server_pid 2> /dev/null || break
|
||||
|
||||
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
|
||||
RESPONSE=$(curl -s -X GET "http://${HOSTNAME}:8004/health" -w "%{http_code}" -o /dev/stdout)
|
||||
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
|
||||
if [[ "$STATUS_CODE" -eq 200 ]]; then
|
||||
server_started=1
|
||||
@ -172,6 +179,7 @@ run_benchmark() {
|
||||
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
||||
--num-prompts 1000 \
|
||||
--random-prefix-len $prefix_len \
|
||||
--host "$HOSTNAME" \
|
||||
--port 8004 &> "$bm_log"
|
||||
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
|
||||
@ -187,7 +195,7 @@ run_benchmark() {
|
||||
request_rate=$((${throughput%.*} + 1))
|
||||
while ((request_rate > 0)); do
|
||||
# clear prefix cache
|
||||
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
|
||||
curl -X POST http://${HOSTNAME}:8004/reset_prefix_cache
|
||||
sleep 5
|
||||
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
|
||||
vllm bench serve \
|
||||
@ -203,6 +211,7 @@ run_benchmark() {
|
||||
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
||||
--num-prompts 100 \
|
||||
--random-prefix-len $prefix_len \
|
||||
--host "$HOSTNAME" \
|
||||
--port 8004 &> "$bm_log"
|
||||
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
|
||||
@ -303,6 +312,7 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
|
||||
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
||||
--num-prompts 100 \
|
||||
--random-prefix-len $prefix_len \
|
||||
--host "$HOSTNAME" \
|
||||
--port 8004 \
|
||||
--profile &> "$bm_log"
|
||||
else
|
||||
|
||||
@ -620,7 +620,7 @@ def get_tokenizer(
|
||||
kwargs["use_fast"] = False
|
||||
if tokenizer_mode == "mistral":
|
||||
try:
|
||||
from vllm.tokenizers import MistralTokenizer
|
||||
from vllm.tokenizers.mistral import MistralTokenizer
|
||||
except ImportError as e:
|
||||
raise ImportError(
|
||||
"MistralTokenizer requires vllm package.\n"
|
||||
|
||||
@ -32,12 +32,11 @@ def benchmark_propose(args):
|
||||
|
||||
model_config = ModelConfig(
|
||||
model="facebook/opt-125m",
|
||||
task="generate",
|
||||
max_model_len=args.num_token + args.num_spec_token,
|
||||
tokenizer="facebook/opt-125m",
|
||||
tokenizer_mode="auto",
|
||||
dtype="auto",
|
||||
seed=None,
|
||||
seed=0,
|
||||
trust_remote_code=False,
|
||||
)
|
||||
proposer = NgramProposer(
|
||||
|
||||
@ -574,7 +574,7 @@ async def benchmark(
|
||||
)
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
"Total Token throughput (tok/s):", metrics.total_token_throughput
|
||||
"Total token throughput (tok/s):", metrics.total_token_throughput
|
||||
)
|
||||
)
|
||||
|
||||
@ -963,8 +963,7 @@ def create_argument_parser():
|
||||
parser.add_argument(
|
||||
"--profile",
|
||||
action="store_true",
|
||||
help="Use Torch Profiler. The endpoint must be launched with "
|
||||
"VLLM_TORCH_PROFILER_DIR to enable profiler.",
|
||||
help="Use vLLM Profiling. --profiler-config must be provided on the server.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--result-dir",
|
||||
|
||||
@ -14,6 +14,9 @@ from tqdm import tqdm
|
||||
|
||||
import vllm._custom_ops as ops
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
per_token_group_quant_fp8,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -22,6 +25,7 @@ class bench_params_t:
|
||||
hidden_size: int
|
||||
add_residual: bool
|
||||
dtype: torch.dtype
|
||||
group_size: list[int]
|
||||
|
||||
def description(self):
|
||||
return (
|
||||
@ -29,6 +33,7 @@ class bench_params_t:
|
||||
f"x D {self.hidden_size} "
|
||||
f"x R {self.add_residual} "
|
||||
f"x DT {self.dtype}"
|
||||
f"x GS {self.group_size}"
|
||||
)
|
||||
|
||||
|
||||
@ -38,10 +43,11 @@ def get_bench_params() -> list[bench_params_t]:
|
||||
HIDDEN_SIZES = list(range(1024, 8129, 1024))
|
||||
ADD_RESIDUAL = [True, False]
|
||||
DTYPES = [torch.bfloat16, torch.float]
|
||||
GROUP_SIZES = [[1, 64], [1, 128]]
|
||||
|
||||
combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES)
|
||||
combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES, GROUP_SIZES)
|
||||
bench_params = list(
|
||||
map(lambda x: bench_params_t(x[0], x[1], x[2], x[3]), combinations)
|
||||
map(lambda x: bench_params_t(x[0], x[1], x[2], x[3], x[4]), combinations)
|
||||
)
|
||||
return bench_params
|
||||
|
||||
@ -52,6 +58,7 @@ def unfused_int8_impl(
|
||||
x: torch.Tensor,
|
||||
residual: torch.Tensor | None,
|
||||
quant_dtype: torch.dtype,
|
||||
group_size: list[int],
|
||||
):
|
||||
# Norm
|
||||
torch_out = None
|
||||
@ -69,6 +76,7 @@ def unfused_fp8_impl(
|
||||
x: torch.Tensor,
|
||||
residual: torch.Tensor | None,
|
||||
quant_dtype: torch.dtype,
|
||||
group_size: list[int],
|
||||
):
|
||||
# Norm
|
||||
torch_out = None
|
||||
@ -81,23 +89,63 @@ def unfused_fp8_impl(
|
||||
torch_out, _ = ops.scaled_fp8_quant(torch_out)
|
||||
|
||||
|
||||
def unfused_groupwise_fp8_impl(
    rms_norm_layer: RMSNorm,
    x: torch.Tensor,
    residual: torch.Tensor | None,
    quant_dtype: torch.dtype,
    group_size: list[int],
):
    """Unfused baseline: RMSNorm, then a separate group-wise FP8 quantization.

    The normalization is done by ``rms_norm_layer.forward_cuda`` and the
    quantization by ``per_token_group_quant_fp8`` using ``group_size[1]`` as
    the group width. ``quant_dtype`` is accepted only so that every benchmark
    implementation shares one call signature; it is not read here. Nothing is
    returned, the function exists purely to be timed.
    """
    # Stage 1: normalization. When a residual is supplied the layer's result
    # is unpacked as a pair and only its first element is carried forward.
    normed = rms_norm_layer.forward_cuda(x, residual)
    if residual is not None:
        normed, _ = normed

    # Stage 2: per-token-group quantization of the normalized activations.
    _quantized, _scales = per_token_group_quant_fp8(
        normed, group_size=group_size[1], use_ue8m0=False
    )
|
||||
|
||||
|
||||
def fused_impl(
    rms_norm_layer: RMSNorm,  # this stores the weights
    x: torch.Tensor,
    residual: torch.Tensor | None,
    quant_dtype: torch.dtype,
    group_size: list[int],
):
    """Fused RMSNorm + dynamic per-token quantization in a single op call.

    Only the layer's ``weight`` is taken from ``rms_norm_layer``; epsilon is
    fixed at 1e-6. ``group_size`` is unused here and is present only to keep
    the signature uniform across the benchmarked implementations. The result
    is discarded because only the call itself is being timed.
    """
    norm_weight = rms_norm_layer.weight
    epsilon = 1e-6
    _out, _scales = ops.rms_norm_dynamic_per_token_quant(
        x, norm_weight, epsilon, quant_dtype, residual=residual
    )
|
||||
|
||||
|
||||
def fused_groupwise_impl(
    rms_norm_layer: RMSNorm,  # this stores the weights
    x: torch.Tensor,
    residual: torch.Tensor | None,
    quant_dtype: torch.dtype,
    group_size: list[int],
):
    """Fused RMSNorm + per-block (group-wise) quantization in a single op call.

    Passes the layer's ``weight``, a fixed epsilon of 1e-6, the requested
    ``quant_dtype`` and the full ``group_size`` list to
    ``ops.rms_norm_per_block_quant``, asking for transposed scales. The result
    is discarded because only the call itself is being timed.
    """
    norm_weight = rms_norm_layer.weight
    epsilon = 1e-6
    _out, _scales = ops.rms_norm_per_block_quant(
        x,
        norm_weight,
        epsilon,
        quant_dtype,
        group_size,
        residual=residual,
        is_scale_transposed=True,
    )
|
||||
|
||||
|
||||
# Bench functions
|
||||
def bench_fn(
|
||||
rms_norm_layer: RMSNorm,
|
||||
x: torch.Tensor,
|
||||
residual: torch.Tensor,
|
||||
quant_dtype: torch.dtype,
|
||||
group_size: list[int],
|
||||
label: str,
|
||||
sub_label: str,
|
||||
fn: Callable,
|
||||
@ -110,10 +158,11 @@ def bench_fn(
|
||||
"x": x,
|
||||
"residual": residual,
|
||||
"quant_dtype": quant_dtype,
|
||||
"group_size": group_size,
|
||||
"fn": fn,
|
||||
}
|
||||
return TBenchmark.Timer(
|
||||
stmt="fn(rms_norm_layer, x, residual, quant_dtype)",
|
||||
stmt="fn(rms_norm_layer, x, residual, quant_dtype, group_size)",
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
@ -147,6 +196,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
|
||||
x,
|
||||
residual,
|
||||
torch.int8,
|
||||
params.group_size,
|
||||
label,
|
||||
sub_label,
|
||||
unfused_int8_impl,
|
||||
@ -161,6 +211,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
|
||||
x,
|
||||
residual,
|
||||
torch.float8_e4m3fn,
|
||||
params.group_size,
|
||||
label,
|
||||
sub_label,
|
||||
unfused_fp8_impl,
|
||||
@ -175,6 +226,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
|
||||
x,
|
||||
residual,
|
||||
torch.int8,
|
||||
params.group_size,
|
||||
label,
|
||||
sub_label,
|
||||
fused_impl,
|
||||
@ -189,6 +241,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
|
||||
x,
|
||||
residual,
|
||||
torch.float8_e4m3fn,
|
||||
params.group_size,
|
||||
label,
|
||||
sub_label,
|
||||
fused_impl,
|
||||
@ -196,6 +249,36 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
|
||||
)
|
||||
)
|
||||
|
||||
# unfused groupwise fp8 impl.
|
||||
timers.append(
|
||||
bench_fn(
|
||||
layer,
|
||||
x,
|
||||
residual,
|
||||
torch.float8_e4m3fn,
|
||||
params.group_size,
|
||||
label,
|
||||
sub_label,
|
||||
unfused_groupwise_fp8_impl,
|
||||
"unfused_groupwise_fp8_impl",
|
||||
)
|
||||
)
|
||||
|
||||
# fused groupwise fp8 impl.
|
||||
timers.append(
|
||||
bench_fn(
|
||||
layer,
|
||||
x,
|
||||
residual,
|
||||
torch.float8_e4m3fn,
|
||||
params.group_size,
|
||||
label,
|
||||
sub_label,
|
||||
fused_groupwise_impl,
|
||||
"fused_groupwise_fp8_impl",
|
||||
)
|
||||
)
|
||||
|
||||
print_timers(timers)
|
||||
|
||||
return timers
|
||||
|
||||
150
benchmarks/kernels/benchmark_mla_k_concat.py
Normal file
150
benchmarks/kernels/benchmark_mla_k_concat.py
Normal file
@ -0,0 +1,150 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Benchmark script comparing torch.cat vs direct copy for k_nope/k_pe concatenation
|
||||
in MLA (Multi-head Latent Attention) prefill.
|
||||
|
||||
This validates that the optimization from commit 8d4142bd is beneficial across
|
||||
various batch sizes, not just the originally tested batch size of 32768.
|
||||
"""
|
||||
|
||||
import time
|
||||
from collections.abc import Callable
|
||||
|
||||
import torch
|
||||
|
||||
# DeepSeek-V3 MLA dimensions
|
||||
NUM_HEADS = 128
|
||||
QK_NOPE_HEAD_DIM = 128
|
||||
PE_DIM = 64
|
||||
|
||||
|
||||
def cat_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
    """Baseline: broadcast ``k_pe`` with ``expand`` and join it via ``torch.cat``.

    ``k_pe`` is expanded to the leading dimensions of ``k_nope`` (its last
    dimension is kept) and appended to ``k_nope`` along the last axis.
    """
    leading_dims = k_nope.shape[:-1]
    k_pe_broadcast = k_pe.expand((*leading_dims, -1))
    return torch.cat((k_nope, k_pe_broadcast), dim=-1)
|
||||
|
||||
|
||||
def direct_copy_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
    """Candidate: preallocate the output and copy both parts into it.

    An uninitialized tensor with ``k_nope``'s dtype and device is created
    whose last dimension is the sum of both inputs' last dimensions. The
    leading slice is filled from ``k_nope`` and the trailing slice from
    ``k_pe``, which is broadcast by the slice assignment itself.
    """
    nope_width = k_nope.shape[-1]
    pe_width = k_pe.shape[-1]
    out_shape = (*k_nope.shape[:-1], nope_width + pe_width)

    k = torch.empty(out_shape, dtype=k_nope.dtype, device=k_nope.device)
    k[..., :nope_width] = k_nope
    k[..., nope_width:] = k_pe
    return k
|
||||
|
||||
|
||||
def benchmark_method(
    method: Callable,
    k_nope: torch.Tensor,
    k_pe: torch.Tensor,
    num_warmup: int = 10,
    num_iters: int = 100,
) -> float:
    """Benchmark a concatenation method and return mean latency in ms.

    Args:
        method: Callable invoked as ``method(k_nope, k_pe)``; its result is
            discarded.
        k_nope: First tensor handed to ``method``.
        k_pe: Second tensor handed to ``method``.
        num_warmup: Number of untimed calls made before measuring.
        num_iters: Number of timed calls; must be positive.

    Returns:
        Wall-clock time per call in milliseconds, averaged over ``num_iters``.

    Raises:
        ValueError: If ``num_iters`` is not positive (the mean would be
            undefined).
    """
    if num_iters <= 0:
        raise ValueError(f"num_iters must be positive, got {num_iters}")

    # Device synchronization is only meaningful (and only available) when the
    # work was queued on a CUDA device; CPU tensors run synchronously.
    needs_sync = k_nope.is_cuda or k_pe.is_cuda

    # Warmup
    for _ in range(num_warmup):
        method(k_nope, k_pe)
    if needs_sync:
        torch.cuda.synchronize()

    # Benchmark: drain the queue before reading the clock so that all
    # launched work is included in the measurement.
    start = time.perf_counter()
    for _ in range(num_iters):
        method(k_nope, k_pe)
    if needs_sync:
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start

    return elapsed / num_iters * 1000  # Convert to ms
|
||||
|
||||
|
||||
@torch.inference_mode()
def run_benchmark(dtype: torch.dtype, dtype_name: str):
    """Time both concatenation strategies for one dtype and print a report.

    For every batch size the inputs are drawn in float32 on the CUDA device
    and then converted to ``dtype``. Each row of the printed table shows both
    latencies, the speedup of the direct copy over ``torch.cat`` and the
    relative latency reduction. Returns the list of
    ``(batch_size, cat_ms, direct_ms, speedup, reduction_pct)`` tuples.
    """
    torch.set_default_device("cuda")

    # Powers of two from 32 (2**5) up to 65536 (2**16).
    batch_sizes = [1 << exponent for exponent in range(5, 17)]

    banner = "=" * 80
    print(banner)
    print("Benchmark: torch.cat vs direct copy for MLA k_nope/k_pe concatenation")
    print(banner)
    print(
        f"Tensor shapes: k_nope=[B, {NUM_HEADS}, {QK_NOPE_HEAD_DIM}], "
        f"k_pe=[B, 1, {PE_DIM}]"
    )
    print(f"dtype: {dtype_name}")
    print()
    print(
        f"{'Batch Size':>12} | {'cat (ms)':>10} | {'direct (ms)':>12} | "
        f"{'Speedup':>8} | {'Reduction':>10}"
    )
    print("-" * 70)

    results = []
    for num_tokens in batch_sizes:
        # Sample in float32 first, then cast: this keeps FP8 dtypes usable.
        k_nope = torch.randn(
            num_tokens, NUM_HEADS, QK_NOPE_HEAD_DIM, dtype=torch.float32, device="cuda"
        ).to(dtype)
        k_pe = torch.randn(
            num_tokens, 1, PE_DIM, dtype=torch.float32, device="cuda"
        ).to(dtype)

        cat_ms = benchmark_method(cat_method, k_nope, k_pe)
        direct_ms = benchmark_method(direct_copy_method, k_nope, k_pe)

        ratio = cat_ms / direct_ms
        saved_pct = (1 - direct_ms / cat_ms) * 100
        results.append((num_tokens, cat_ms, direct_ms, ratio, saved_pct))

        print(
            f"{num_tokens:>12} | {cat_ms:>10.3f} | {direct_ms:>12.3f} | "
            f"{ratio:>7.2f}x | {saved_pct:>9.1f}%"
        )

    print(banner)

    # Aggregate speedup figures over all batch sizes.
    all_ratios = [row[3] for row in results]
    print("\nSpeedup summary:")
    print(f" Min: {min(all_ratios):.2f}x")
    print(f" Max: {max(all_ratios):.2f}x")
    print(f" Mean: {sum(all_ratios) / len(all_ratios):.2f}x")

    # First batch size at which the direct copy is at least as fast.
    first_win = next((row[0] for row in results if row[3] >= 1.0), None)

    print("\nConclusion:")
    if first_win:
        print(f" - Direct copy becomes beneficial at batch size >= {first_win}")
    # Large batches (>= 512) are the ones typical for prefill.
    big_ratios = [row[3] for row in results if row[0] >= 512]
    if big_ratios:
        big_mean = sum(big_ratios) / len(big_ratios)
        print(f" - For batch sizes >= 512: avg speedup = {big_mean:.2f}x")
    print(" - MLA prefill typically uses large batches, so optimization is effective")

    return results
|
||||
|
||||
|
||||
@torch.inference_mode()
def main():
    """Run the benchmark once for bfloat16 and once for float8_e4m3fn."""
    dtype_cases = (
        (torch.bfloat16, "bfloat16"),
        (torch.float8_e4m3fn, "float8_e4m3fn"),
    )
    for dtype, dtype_name in dtype_cases:
        # Blank lines visually separate the per-dtype reports.
        print("\n")
        run_benchmark(dtype, dtype_name)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -24,12 +24,15 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
|
||||
num_tokens_range = [1, 16, 256, 4096]
|
||||
num_experts_range = [16, 64, 224, 256, 280, 512]
|
||||
topk_range = [1, 2, 8]
|
||||
configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
|
||||
ep_size_range = [1, 8]
|
||||
configs = list(
|
||||
itertools.product(num_tokens_range, num_experts_range, topk_range, ep_size_range)
|
||||
)
|
||||
|
||||
|
||||
@triton.testing.perf_report(
|
||||
triton.testing.Benchmark(
|
||||
x_names=["num_tokens", "num_experts", "topk"],
|
||||
x_names=["num_tokens", "num_experts", "topk", "ep_size"],
|
||||
x_vals=configs,
|
||||
line_arg="provider",
|
||||
line_vals=["vllm"],
|
||||
@ -38,16 +41,26 @@ configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range
|
||||
args={},
|
||||
)
|
||||
)
|
||||
def benchmark(num_tokens, num_experts, topk, provider):
|
||||
def benchmark(num_tokens, num_experts, topk, ep_size, provider):
|
||||
"""Benchmark function for Triton."""
|
||||
block_size = 256
|
||||
torch.cuda.manual_seed_all(0)
|
||||
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
|
||||
|
||||
e_map = None
|
||||
if ep_size != 1:
|
||||
local_e = num_experts // ep_size
|
||||
e_ids = torch.randperm(num_experts, device="cuda", dtype=torch.int32)[:local_e]
|
||||
e_map = torch.full((num_experts,), -1, device="cuda", dtype=torch.int32)
|
||||
e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32)
|
||||
|
||||
quantiles = [0.5, 0.2, 0.8]
|
||||
|
||||
if provider == "vllm":
|
||||
ms, min_ms, max_ms = triton.testing.do_bench(
|
||||
lambda: moe_align_block_size(topk_ids, block_size, num_experts),
|
||||
lambda: moe_align_block_size(
|
||||
topk_ids, block_size, num_experts, e_map, ignore_invalid_experts=True
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
|
||||
|
||||
@ -99,7 +99,6 @@ def benchmark_mrope(
|
||||
# the parameters to compute the q k v size based on tp_size
|
||||
mrope_helper_class = get_rope(
|
||||
head_size=head_dim,
|
||||
rotary_dim=head_dim,
|
||||
max_position=max_position,
|
||||
is_neox_style=is_neox_style,
|
||||
rope_parameters=rope_parameters,
|
||||
|
||||
@ -32,8 +32,8 @@ def get_benchmark(head_size, rotary_dim, is_neox_style, device):
|
||||
def benchmark(batch_size, seq_len, num_heads, provider):
|
||||
dtype = torch.bfloat16
|
||||
max_position = 8192
|
||||
base = 10000
|
||||
rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
|
||||
rope_parameters = {"partial_rotary_factor": rotary_dim / head_size}
|
||||
rope = get_rope(head_size, max_position, is_neox_style, rope_parameters)
|
||||
rope = rope.to(dtype=dtype, device=device)
|
||||
cos_sin_cache = rope.cos_sin_cache.to(dtype=torch.float, device=device)
|
||||
|
||||
|
||||
@ -251,17 +251,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
|
||||
endif()
|
||||
|
||||
# Build ACL with CMake
|
||||
set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF")
|
||||
set(CMAKE_BUILD_TYPE "Release")
|
||||
set(ARM_COMPUTE_ARCH "armv8.2-a")
|
||||
set(ARM_COMPUTE_ENABLE_ASSERTS "OFF")
|
||||
set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF")
|
||||
set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
|
||||
set(ARM_COMPUTE_ENABLE_OPENMP "ON")
|
||||
set(ARM_COMPUTE_ENABLE_WERROR "OFF")
|
||||
set(ARM_COMPUTE_BUILD_EXAMPLES "OFF")
|
||||
set(ARM_COMPUTE_BUILD_TESTING "OFF")
|
||||
|
||||
set(_cmake_config_cmd
|
||||
${CMAKE_COMMAND} -G Ninja -B build
|
||||
-DARM_COMPUTE_BUILD_SHARED_LIB=OFF
|
||||
|
||||
@ -35,16 +35,21 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
|
||||
# sm90a
|
||||
|
||||
set(SUPPORT_ARCHS)
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3)
|
||||
list(APPEND SUPPORT_ARCHS 9.0a)
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3)
|
||||
list(APPEND SUPPORT_ARCHS "9.0a")
|
||||
endif()
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8)
|
||||
list(APPEND SUPPORT_ARCHS 10.0a)
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9)
|
||||
# CUDA 12.9 has introduced "Family-Specific Architecture Features"
|
||||
# this supports all compute_10x family
|
||||
list(APPEND SUPPORT_ARCHS "10.0f")
|
||||
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
||||
list(APPEND SUPPORT_ARCHS "10.0a")
|
||||
endif()
|
||||
|
||||
|
||||
cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
|
||||
if(FLASH_MLA_ARCHS)
|
||||
message(STATUS "FlashMLA CUDA architectures: ${FLASH_MLA_ARCHS}")
|
||||
set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
|
||||
list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")
|
||||
|
||||
@ -126,7 +131,8 @@ if(FLASH_MLA_ARCHS)
|
||||
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
|
||||
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
|
||||
else()
|
||||
# Create empty targets for setup.py when not targeting sm90a systems
|
||||
message(STATUS "FlashMLA will not compile: unsupported CUDA architecture ${CUDA_ARCHS}")
|
||||
# Create empty targets for setup.py on unsupported systems
|
||||
add_custom_target(_flashmla_C)
|
||||
add_custom_target(_flashmla_extension_C)
|
||||
endif()
|
||||
|
||||
@ -140,16 +140,21 @@ function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR)
|
||||
run_python(_VLLM_TORCH_GOMP_PATH
|
||||
"
|
||||
import os, glob
|
||||
try:
|
||||
import torch
|
||||
torch_pkg = os.path.dirname(torch.__file__)
|
||||
site_root = os.path.dirname(torch_pkg)
|
||||
torch_libs = os.path.join(site_root, 'torch.libs')
|
||||
print(glob.glob(os.path.join(torch_libs, 'libgomp-*.so*'))[0])
|
||||
except:
|
||||
print('')
|
||||
import torch
|
||||
torch_pkg = os.path.dirname(torch.__file__)
|
||||
site_root = os.path.dirname(torch_pkg)
|
||||
|
||||
# Search both torch.libs and torch/lib
|
||||
roots = [os.path.join(site_root, 'torch.libs'), os.path.join(torch_pkg, 'lib')]
|
||||
candidates = []
|
||||
for root in roots:
|
||||
if not os.path.isdir(root):
|
||||
continue
|
||||
candidates.extend(glob.glob(os.path.join(root, 'libgomp*.so*')))
|
||||
|
||||
print(candidates[0] if candidates else '')
|
||||
"
|
||||
"failed to probe torch.libs for libgomp")
|
||||
"failed to probe for libgomp")
|
||||
|
||||
if(_VLLM_TORCH_GOMP_PATH STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}")
|
||||
return()
|
||||
|
||||
12
csrc/cache.h
12
csrc/cache.h
@ -1,6 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <torch/all.h>
|
||||
#include <c10/util/Optional.h>
|
||||
|
||||
#include <map>
|
||||
#include <vector>
|
||||
@ -58,6 +59,15 @@ void cp_gather_cache(
|
||||
torch::Tensor const& cu_seq_lens, // [BATCH+1]
|
||||
int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);
|
||||
|
||||
// Gather and upconvert FP8 KV cache to BF16 workspace
|
||||
void cp_gather_and_upconvert_fp8_kv_cache(
|
||||
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656]
|
||||
torch::Tensor const& dst, // [TOT_TOKENS, 576]
|
||||
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
|
||||
torch::Tensor const& seq_lens, // [BATCH]
|
||||
torch::Tensor const& workspace_starts, // [BATCH]
|
||||
int64_t batch_size);
|
||||
|
||||
// Indexer K quantization and cache function
|
||||
void indexer_k_quant_and_cache(
|
||||
torch::Tensor& k, // [num_tokens, head_dim]
|
||||
@ -72,4 +82,4 @@ void cp_gather_indexer_k_quant_cache(
|
||||
torch::Tensor& dst_k, // [num_tokens, head_dim]
|
||||
torch::Tensor& dst_scale, // [num_tokens, head_dim / quant_block_size * 4]
|
||||
const torch::Tensor& block_table, // [batch_size, num_blocks]
|
||||
const torch::Tensor& cu_seq_lens); // [batch_size + 1]
|
||||
const torch::Tensor& cu_seq_lens); // [batch_size + 1]
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <c10/cuda/CUDAException.h>
|
||||
#include <c10/util/Optional.h>
|
||||
|
||||
#include "cuda_utils.h"
|
||||
#include "cuda_compat.h"
|
||||
@ -514,7 +515,8 @@ __global__ void indexer_k_quant_and_cache_kernel(
|
||||
const int quant_block_size, // quantization block size
|
||||
const int cache_block_size, // cache block size
|
||||
const int cache_stride, // stride for each token in kv_cache
|
||||
const bool use_ue8m0 // use ue8m0 scale format
|
||||
|
||||
const bool use_ue8m0 // use ue8m0 scale format
|
||||
) {
|
||||
constexpr int VEC_SIZE = 4;
|
||||
const int64_t token_idx = blockIdx.x;
|
||||
@ -1061,6 +1063,82 @@ void gather_and_maybe_dequant_cache(
|
||||
}
|
||||
|
||||
namespace vllm {
|
||||
|
||||
// Gather and upconvert FP8 KV cache tokens to BF16 workspace
|
||||
// Similar to cp_gather_cache but specifically for FP8->BF16 conversion
|
||||
__global__ void cp_gather_and_upconvert_fp8_kv_cache(
|
||||
const uint8_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656]
|
||||
__nv_bfloat16* __restrict__ dst, // [TOT_TOKENS, 576]
|
||||
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
|
||||
const int32_t* __restrict__ seq_lens, // [BATCH]
|
||||
const int32_t* __restrict__ workspace_starts, // [BATCH]
|
||||
const int32_t block_size, const int32_t head_dim,
|
||||
const int64_t block_table_stride, const int64_t cache_block_stride,
|
||||
const int64_t cache_entry_stride, const int64_t dst_entry_stride) {
|
||||
const int64_t bid = blockIdx.x; // Batch ID
|
||||
const int32_t num_splits = gridDim.y;
|
||||
const int32_t split = blockIdx.y;
|
||||
const int32_t seq_start = workspace_starts[bid];
|
||||
const int32_t seq_len = seq_lens[bid];
|
||||
const int32_t tot_slots = seq_len;
|
||||
const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits);
|
||||
|
||||
const int32_t split_start = split * split_slots;
|
||||
const int32_t split_end = min((split + 1) * split_slots, tot_slots);
|
||||
|
||||
const bool is_active_split = (split_start < tot_slots);
|
||||
|
||||
if (!is_active_split) return;
|
||||
|
||||
// Adjust the pointer for the block_table for this batch
|
||||
const int32_t batch_offset = bid * block_table_stride;
|
||||
int32_t offset = split_start;
|
||||
int32_t offset_div = offset / block_size;
|
||||
offset = offset % block_size;
|
||||
const int32_t* batch_block_table = block_table + batch_offset;
|
||||
|
||||
// Adjust dst pointer based on the cumulative sequence lengths
|
||||
dst += seq_start * dst_entry_stride;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
|
||||
// Process each token in this split
|
||||
for (int pid = split_start; pid < split_end; ++pid) {
|
||||
auto block_id = batch_block_table[offset_div];
|
||||
const uint8_t* token_ptr =
|
||||
src_cache + block_id * cache_block_stride + offset * cache_entry_stride;
|
||||
__nv_bfloat16* dst_ptr = dst + pid * dst_entry_stride;
|
||||
|
||||
// FP8 format: 512 bytes fp8 + 16 bytes scales + 128 bytes rope (64 bf16)
|
||||
const uint8_t* no_pe_ptr = token_ptr;
|
||||
const float* scales_ptr = reinterpret_cast<const float*>(token_ptr + 512);
|
||||
const __nv_bfloat16* rope_ptr =
|
||||
reinterpret_cast<const __nv_bfloat16*>(token_ptr + 512 + 16);
|
||||
|
||||
// Parallelize fp8 dequant (512 elements) and rope copy (64 elements)
|
||||
if (tid < 512) {
|
||||
// FP8 dequantization
|
||||
const int tile = tid >> 7; // each tile is 128 elements
|
||||
const float scale = scales_ptr[tile];
|
||||
const uint8_t val = no_pe_ptr[tid];
|
||||
dst_ptr[tid] =
|
||||
fp8::scaled_convert<__nv_bfloat16, uint8_t,
|
||||
vllm::Fp8KVCacheDataType::kFp8E4M3>(val, scale);
|
||||
} else if (tid < 576) {
|
||||
// Rope copy (64 bf16 elements)
|
||||
const int rope_idx = tid - 512;
|
||||
dst_ptr[512 + rope_idx] = rope_ptr[rope_idx];
|
||||
}
|
||||
|
||||
// Move to next token
|
||||
offset += 1;
|
||||
if (offset == block_size) {
|
||||
offset_div += 1;
|
||||
offset = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
// Note(hc): The cp_gather_cache allows seq_starts to no longer be divisible by
|
||||
// block_size.
|
||||
@ -1202,6 +1280,57 @@ void cp_gather_cache(
|
||||
}
|
||||
}
|
||||
|
||||
// Host entry point: validates the arguments and launches
// vllm::cp_gather_and_upconvert_fp8_kv_cache on the current CUDA stream of
// src_cache's device.
//
// Requirements enforced below: block_table, seq_lens and workspace_starts
// are int32; all tensors share src_cache's device; src_cache is uint8; dst
// is bfloat16 with dst.size(1) == 576. Violations raise via TORCH_CHECK.
// An empty batch is a no-op.
void cp_gather_and_upconvert_fp8_kv_cache(
    torch::Tensor const& src_cache,         // [NUM_BLOCKS, BLOCK_SIZE, 656]
    torch::Tensor const& dst,               // [TOT_TOKENS, 576]
    torch::Tensor const& block_table,       // [BATCH, BLOCK_INDICES]
    torch::Tensor const& seq_lens,          // [BATCH]
    torch::Tensor const& workspace_starts,  // [BATCH]
    int64_t batch_size) {
  at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  int32_t block_size = src_cache.size(1);
  int32_t head_dim = dst.size(1);

  TORCH_CHECK(block_table.dtype() == torch::kInt32,
              "block_table must be int32");
  TORCH_CHECK(seq_lens.dtype() == torch::kInt32, "seq_lens must be int32");
  TORCH_CHECK(workspace_starts.dtype() == torch::kInt32,
              "workspace_starts must be int32");

  TORCH_CHECK(src_cache.device() == dst.device(),
              "src_cache and dst must be on the same device");
  TORCH_CHECK(src_cache.device() == block_table.device(),
              "src_cache and block_table must be on the same device");
  TORCH_CHECK(src_cache.device() == seq_lens.device(),
              "src_cache and seq_lens must be on the same device");
  TORCH_CHECK(src_cache.device() == workspace_starts.device(),
              "src_cache and workspace_starts must be on the same device");

  TORCH_CHECK(src_cache.dtype() == torch::kUInt8, "src_cache must be uint8");
  TORCH_CHECK(dst.dtype() == torch::kBFloat16, "dst must be bfloat16");
  TORCH_CHECK(head_dim == 576, "head_dim must be 576 for MLA");

  // Nothing to gather. Returning here also avoids launching with a
  // zero-sized grid dimension, which is not a valid launch configuration.
  if (batch_size <= 0) {
    return;
  }

  int64_t block_table_stride = block_table.stride(0);
  int64_t cache_block_stride = src_cache.stride(0);
  int64_t cache_entry_stride = src_cache.stride(1);
  int64_t dst_entry_stride = dst.stride(0);

  // Decide on the number of splits based on the batch size: fewer splits
  // per sequence as the batch grows.
  int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
  dim3 grid(batch_size, num_splits);
  // One thread per output element of a token row (512 dequantized values
  // followed by 64 rope values).
  dim3 block(576);

  vllm::cp_gather_and_upconvert_fp8_kv_cache<<<grid, block, 0, stream>>>(
      src_cache.data_ptr<uint8_t>(),
      reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()),
      block_table.data_ptr<int32_t>(), seq_lens.data_ptr<int32_t>(),
      workspace_starts.data_ptr<int32_t>(), block_size, head_dim,
      block_table_stride, cache_block_stride, cache_entry_stride,
      dst_entry_stride);
}
|
||||
|
||||
// Macro to dispatch the kernel based on the data type.
|
||||
#define CALL_INDEXER_K_QUANT_AND_CACHE(KV_T, CACHE_T, KV_DTYPE) \
|
||||
vllm::indexer_k_quant_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE> \
|
||||
|
||||
@ -117,7 +117,6 @@ torch::Tensor get_scheduler_metadata(
|
||||
input.casual = casual;
|
||||
input.isa = isa;
|
||||
input.enable_kv_split = enable_kv_split;
|
||||
TORCH_CHECK(casual, "Only supports casual mask for now.");
|
||||
|
||||
VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() {
|
||||
CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
|
||||
|
||||
@ -186,7 +186,7 @@ struct AttentionMetadata {
|
||||
// - Intermediate outputs: q_tile_size * head_dim * output_buffer_elem_size + 2
|
||||
// * q_tile_size * 4, partial output, max + sum (float)
|
||||
// Reduction scratchpad contains:
|
||||
// - flags: bool array to indicate wether the split is finished
|
||||
// - flags: bool array to indicate whether the split is finished
|
||||
// - outputs: split_num * q_tile_size * head_dim * output_buffer_elem_size
|
||||
// - max, sum: 2 * split_num * q_tile_size * 4
|
||||
class AttentionScratchPad {
|
||||
@ -1246,14 +1246,8 @@ class AttentionMainLoop {
|
||||
// rescale sum and partial outputs
|
||||
if (need_rescale) {
|
||||
// compute rescale factor
|
||||
#ifdef DEFINE_FAST_EXP
|
||||
vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
|
||||
rescale_factor_vec = fast_exp(rescale_factor_vec);
|
||||
rescale_factor = rescale_factor_vec.get_last_elem();
|
||||
#else
|
||||
rescale_factor = std::exp(rescale_factor);
|
||||
vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
|
||||
#endif
|
||||
|
||||
// rescale sum
|
||||
new_sum_val += rescale_factor * init_sum_val;
|
||||
@ -1889,15 +1883,8 @@ class AttentionMainLoop {
|
||||
: curr_output_buffer;
|
||||
float rescale_factor = final_max > curr_max ? curr_max - final_max
|
||||
: final_max - curr_max;
|
||||
|
||||
#ifdef DEFINE_FAST_EXP
|
||||
vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
|
||||
rescale_factor_vec = fast_exp(rescale_factor_vec);
|
||||
rescale_factor = rescale_factor_vec.get_last_elem();
|
||||
#else
|
||||
rescale_factor = std::exp(rescale_factor);
|
||||
vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
|
||||
#endif
|
||||
|
||||
local_sum[head_idx] = final_max > curr_max
|
||||
? final_sum + rescale_factor * curr_sum
|
||||
|
||||
@ -60,4 +60,54 @@
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __aarch64__
|
||||
// Implementation copied from Arm Optimized Routines (expf AdvSIMD)
|
||||
// https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c
|
||||
#include <limits>
|
||||
#define DEFINE_FAST_EXP \
|
||||
const float32x4_t inv_ln2 = vdupq_n_f32(0x1.715476p+0f); \
|
||||
const float ln2_hi = 0x1.62e4p-1f; \
|
||||
const float ln2_lo = 0x1.7f7d1cp-20f; \
|
||||
const float c0 = 0x1.0e4020p-7f; \
|
||||
const float c2 = 0x1.555e66p-3f; \
|
||||
const float32x4_t ln2_c02 = {ln2_hi, ln2_lo, c0, c2}; \
|
||||
const uint32x4_t exponent_bias = vdupq_n_u32(0x3f800000); \
|
||||
const float32x4_t c1 = vdupq_n_f32(0x1.573e2ep-5f); \
|
||||
const float32x4_t c3 = vdupq_n_f32(0x1.fffdb6p-2f); \
|
||||
const float32x4_t c4 = vdupq_n_f32(0x1.ffffecp-1f); \
|
||||
const float32x4_t pos_special_bound = vdupq_n_f32(0x1.5d5e2ap+6f); \
|
||||
const float32x4_t neg_special_bound = vnegq_f32(pos_special_bound); \
|
||||
const float32x4_t inf = \
|
||||
vdupq_n_f32(std::numeric_limits<float>::infinity()); \
|
||||
const float32x4_t zero = vdupq_n_f32(0.0f); \
|
||||
auto neon_expf = [&](float32x4_t values) __attribute__((always_inline)) { \
|
||||
float32x4_t n = vrndaq_f32(vmulq_f32(values, inv_ln2)); \
|
||||
float32x4_t r = vfmsq_laneq_f32(values, n, ln2_c02, 0); \
|
||||
r = vfmsq_laneq_f32(r, n, ln2_c02, 1); \
|
||||
uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_s32(vcvtq_s32_f32(n)), 23); \
|
||||
float32x4_t scale = vreinterpretq_f32_u32(vaddq_u32(e, exponent_bias)); \
|
||||
float32x4_t r2 = vmulq_f32(r, r); \
|
||||
float32x4_t p = vfmaq_laneq_f32(c1, r, ln2_c02, 2); \
|
||||
float32x4_t q = vfmaq_laneq_f32(c3, r, ln2_c02, 3); \
|
||||
q = vfmaq_f32(q, p, r2); \
|
||||
p = vmulq_f32(c4, r); \
|
||||
float32x4_t poly = vfmaq_f32(p, q, r2); \
|
||||
poly = vfmaq_f32(scale, poly, scale); \
|
||||
const uint32x4_t hi_mask = vcgeq_f32(values, pos_special_bound); \
|
||||
const uint32x4_t lo_mask = vcleq_f32(values, neg_special_bound); \
|
||||
poly = vbslq_f32(hi_mask, inf, poly); \
|
||||
return vbslq_f32(lo_mask, zero, poly); \
|
||||
}; \
|
||||
auto fast_exp = [&](vec_op::FP32Vec16& vec) \
|
||||
__attribute__((always_inline)) { \
|
||||
float32x4x4_t result; \
|
||||
result.val[0] = neon_expf(vec.reg.val[0]); \
|
||||
result.val[1] = neon_expf(vec.reg.val[1]); \
|
||||
result.val[2] = neon_expf(vec.reg.val[2]); \
|
||||
result.val[3] = neon_expf(vec.reg.val[3]); \
|
||||
return vec_op::FP32Vec16(result); \
|
||||
};
|
||||
|
||||
#endif // __aarch64__
|
||||
|
||||
#endif
|
||||
@ -118,6 +118,24 @@
|
||||
} \
|
||||
}
|
||||
|
||||
// Turns the runtime condition `expr` into a compile-time constant.
// The callable passed as the trailing argument(s) is invoked in exactly one
// of two branches, each of which declares `constexpr bool const_expr` with
// the matching value so the callable can use it as a template argument.
#define VLLM_DISPATCH_BOOL(expr, const_expr, ...) \
  if (!(expr)) {                                  \
    constexpr bool const_expr = false;            \
    __VA_ARGS__();                                \
  } else {                                        \
    constexpr bool const_expr = true;             \
    __VA_ARGS__();                                \
  }
|
||||
|
||||
// Turns the runtime `group_size` into a compile-time constant.
// The callable passed as the trailing argument(s) is invoked with
// `constexpr int const_group_size` set to 128 or 64. Any other value is
// rejected explicitly; previously it fell through without invoking the
// callable at all, silently skipping the dispatched work.
// NOTE(review): TORCH_CHECK must be visible wherever this macro is expanded
// - confirm this header (or every user) includes the torch headers.
#define VLLM_DISPATCH_GROUP_SIZE(group_size, const_group_size, ...) \
  if (group_size == 128) {                                          \
    constexpr int const_group_size = 128;                           \
    __VA_ARGS__();                                                  \
  } else if (group_size == 64) {                                    \
    constexpr int const_group_size = 64;                            \
    __VA_ARGS__();                                                  \
  } else {                                                          \
    TORCH_CHECK(false, "Unsupported group_size: ", group_size,      \
                " (expected 64 or 128)");                           \
  }
|
||||
|
||||
#define VLLM_DISPATCH_RANK234(NUM_DIMS, ...) \
|
||||
switch (NUM_DIMS) { \
|
||||
case 2: { \
|
||||
|
||||
@ -444,23 +444,27 @@ __device__ inline T apply_sigmoid(T val) {
|
||||
return cuda_cast<T, float>(sigmoid_accurate(f));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
template <ScoringFunc SF, typename T>
|
||||
__device__ inline T apply_scoring(T val) {
|
||||
if constexpr (SF == SCORING_SIGMOID) {
|
||||
return apply_sigmoid(val);
|
||||
} else {
|
||||
return val;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, ScoringFunc SF>
|
||||
__device__ void topk_with_k2(T* output, T const* input, T const* bias,
|
||||
cg::thread_block_tile<32> const& tile,
|
||||
int32_t const lane_id,
|
||||
int const num_experts_per_group,
|
||||
int const scoring_func) {
|
||||
int const num_experts_per_group) {
|
||||
// Get the top2 per thread
|
||||
T largest = neg_inf<T>();
|
||||
T second_largest = neg_inf<T>();
|
||||
|
||||
if (num_experts_per_group > WARP_SIZE) {
|
||||
for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
|
||||
T value = input[i];
|
||||
// Apply scoring function if needed
|
||||
if (scoring_func == SCORING_SIGMOID) {
|
||||
value = apply_sigmoid(value);
|
||||
}
|
||||
T value = apply_scoring<SF>(input[i]);
|
||||
value = value + bias[i];
|
||||
|
||||
if (value > largest) {
|
||||
@ -472,17 +476,11 @@ __device__ void topk_with_k2(T* output, T const* input, T const* bias,
|
||||
}
|
||||
} else {
|
||||
for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
|
||||
T value = input[i];
|
||||
// Apply scoring function if needed
|
||||
if (scoring_func == SCORING_SIGMOID) {
|
||||
value = apply_sigmoid(value);
|
||||
}
|
||||
T value = apply_scoring<SF>(input[i]);
|
||||
value = value + bias[i];
|
||||
largest = value;
|
||||
}
|
||||
}
|
||||
|
||||
__syncwarp(); // Ensure all threads have valid data before reduction
|
||||
// Get the top2 warpwise
|
||||
T max1 = cg::reduce(tile, largest, cg::greater<T>());
|
||||
|
||||
@ -501,13 +499,12 @@ __device__ void topk_with_k2(T* output, T const* input, T const* bias,
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
template <typename T, ScoringFunc SF>
|
||||
__global__ void topk_with_k2_kernel(T* output, T* input, T const* bias,
|
||||
int64_t const num_tokens,
|
||||
int64_t const num_cases,
|
||||
int64_t const n_group,
|
||||
int64_t const num_experts_per_group,
|
||||
int const scoring_func) {
|
||||
int64_t const num_experts_per_group) {
|
||||
int32_t warp_id = threadIdx.x / WARP_SIZE;
|
||||
int32_t lane_id = threadIdx.x % WARP_SIZE;
|
||||
|
||||
@ -525,21 +522,21 @@ __global__ void topk_with_k2_kernel(T* output, T* input, T const* bias,
|
||||
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
||||
asm volatile("griddepcontrol.wait;");
|
||||
#endif
|
||||
topk_with_k2(output, input, group_bias, tile, lane_id,
|
||||
num_experts_per_group, scoring_func);
|
||||
topk_with_k2<T, SF>(output, input, group_bias, tile, lane_id,
|
||||
num_experts_per_group);
|
||||
}
|
||||
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
||||
asm volatile("griddepcontrol.launch_dependents;");
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T, typename IdxT>
|
||||
template <typename T, typename IdxT, ScoringFunc SF, int NGroup = -1>
|
||||
__global__ void group_idx_and_topk_idx_kernel(
|
||||
T* scores, T const* group_scores, float* topk_values, IdxT* topk_indices,
|
||||
T const* bias, int64_t const num_tokens, int64_t const n_group,
|
||||
int64_t const topk_group, int64_t const topk, int64_t const num_experts,
|
||||
int64_t const num_experts_per_group, bool renormalize,
|
||||
double routed_scaling_factor, int scoring_func) {
|
||||
double routed_scaling_factor) {
|
||||
int32_t warp_id = threadIdx.x / WARP_SIZE;
|
||||
int32_t lane_id = threadIdx.x % WARP_SIZE;
|
||||
int32_t case_id =
|
||||
@ -549,6 +546,11 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
topk_values += case_id * topk;
|
||||
topk_indices += case_id * topk;
|
||||
|
||||
constexpr bool kUseStaticNGroup = (NGroup > 0);
|
||||
// use int32 to avoid implicit conversion
|
||||
int32_t const n_group_i32 =
|
||||
kUseStaticNGroup ? NGroup : static_cast<int32_t>(n_group);
|
||||
|
||||
int32_t align_num_experts_per_group =
|
||||
warp_topk::round_up_to_multiple_of<WARP_SIZE>(num_experts_per_group);
|
||||
|
||||
@ -574,17 +576,17 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
|
||||
if (case_id < num_tokens) {
|
||||
// calculate group_idx
|
||||
int32_t target_num_min = WARP_SIZE - n_group + topk_group;
|
||||
int32_t target_num_min =
|
||||
WARP_SIZE - n_group_i32 + static_cast<int32_t>(topk_group);
|
||||
// The check is necessary to avoid abnormal input
|
||||
if (lane_id < n_group && is_finite(group_scores[lane_id])) {
|
||||
if (lane_id < n_group_i32 && is_finite(group_scores[lane_id])) {
|
||||
value = group_scores[lane_id];
|
||||
}
|
||||
|
||||
int count_equal_to_top_value = WARP_SIZE - n_group;
|
||||
int count_equal_to_top_value = WARP_SIZE - n_group_i32;
|
||||
int pre_count_equal_to_top_value = 0;
|
||||
// Use loop to find the largest top_group
|
||||
while (count_equal_to_top_value < target_num_min) {
|
||||
__syncwarp(); // Ensure all threads have valid data before reduction
|
||||
topk_group_value = cg::reduce(tile, value, cg::greater<T>());
|
||||
if (value == topk_group_value) {
|
||||
value = neg_inf<T>();
|
||||
@ -604,7 +606,7 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
int count_equalto_topkth_group = 0;
|
||||
bool if_proceed_next_topk = topk_group_value != neg_inf<T>();
|
||||
if (case_id < num_tokens && if_proceed_next_topk) {
|
||||
for (int i_group = 0; i_group < n_group; i_group++) {
|
||||
auto process_group = [&](int i_group) {
|
||||
if ((group_scores[i_group] > topk_group_value) ||
|
||||
((group_scores[i_group] == topk_group_value) &&
|
||||
(count_equalto_topkth_group < num_equalto_topkth_group))) {
|
||||
@ -613,11 +615,10 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
i += WARP_SIZE) {
|
||||
T candidates = neg_inf<T>();
|
||||
if (i < num_experts_per_group) {
|
||||
// Apply scoring function (if any) and add bias
|
||||
// apply scoring function (if any) and add bias
|
||||
T input = scores[offset + i];
|
||||
if (is_finite(input)) {
|
||||
T score = (scoring_func == SCORING_SIGMOID) ? apply_sigmoid(input)
|
||||
: input;
|
||||
T score = apply_scoring<SF>(input);
|
||||
candidates = score + bias[offset + i];
|
||||
}
|
||||
}
|
||||
@ -627,12 +628,21 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
count_equalto_topkth_group++;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
if constexpr (kUseStaticNGroup) {
|
||||
#pragma unroll
|
||||
for (int i_group = 0; i_group < NGroup; ++i_group) {
|
||||
process_group(i_group);
|
||||
}
|
||||
} else {
|
||||
for (int i_group = 0; i_group < n_group_i32; ++i_group) {
|
||||
process_group(i_group);
|
||||
}
|
||||
}
|
||||
queue.done();
|
||||
__syncwarp();
|
||||
// Get the topk_idx
|
||||
queue.dumpIdx(s_topk_idx);
|
||||
__syncwarp();
|
||||
}
|
||||
|
||||
// Load the valid score value
|
||||
@ -646,12 +656,13 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
if (i < topk) {
|
||||
// Load the score value (without bias) for normalization
|
||||
T input = scores[s_topk_idx[i]];
|
||||
value =
|
||||
(scoring_func == SCORING_SIGMOID) ? apply_sigmoid(input) : input;
|
||||
value = apply_scoring<SF>(input);
|
||||
s_topk_value[i] = value;
|
||||
}
|
||||
topk_sum +=
|
||||
cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
|
||||
if (renormalize) {
|
||||
topk_sum +=
|
||||
cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -660,13 +671,9 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
if (case_id < num_tokens) {
|
||||
if (if_proceed_next_topk) {
|
||||
for (int i = lane_id; i < topk; i += WARP_SIZE) {
|
||||
float value;
|
||||
if (renormalize) {
|
||||
value = cuda_cast<float, T>(s_topk_value[i]) / topk_sum *
|
||||
routed_scaling_factor;
|
||||
} else {
|
||||
value = cuda_cast<float, T>(s_topk_value[i]) * routed_scaling_factor;
|
||||
}
|
||||
float base = cuda_cast<float, T>(s_topk_value[i]);
|
||||
float value = renormalize ? (base / topk_sum * routed_scaling_factor)
|
||||
: (base * routed_scaling_factor);
|
||||
topk_indices[i] = s_topk_idx[i];
|
||||
topk_values[i] = value;
|
||||
}
|
||||
@ -684,6 +691,45 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T, typename IdxT, ScoringFunc SF>
|
||||
inline void launch_group_idx_and_topk_kernel(
|
||||
cudaLaunchConfig_t const& config, T* scores, T* group_scores,
|
||||
float* topk_values, IdxT* topk_indices, T const* bias,
|
||||
int64_t const num_tokens, int64_t const n_group, int64_t const topk_group,
|
||||
int64_t const topk, int64_t const num_experts,
|
||||
int64_t const num_experts_per_group, bool const renormalize,
|
||||
double const routed_scaling_factor) {
|
||||
auto launch = [&](auto* kernel_instance2) {
|
||||
cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
|
||||
topk_values, topk_indices, bias, num_tokens, n_group,
|
||||
topk_group, topk, num_experts, num_experts_per_group,
|
||||
renormalize, routed_scaling_factor);
|
||||
};
|
||||
|
||||
switch (n_group) {
|
||||
case 4: {
|
||||
launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 4>);
|
||||
break;
|
||||
}
|
||||
case 8: {
|
||||
launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 8>);
|
||||
break;
|
||||
}
|
||||
case 16: {
|
||||
launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 16>);
|
||||
break;
|
||||
}
|
||||
case 32: {
|
||||
launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 32>);
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF>);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename IdxT>
|
||||
void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
|
||||
IdxT* topk_indices, T const* bias, int64_t const num_tokens,
|
||||
@ -694,7 +740,6 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
|
||||
cudaStream_t const stream = 0) {
|
||||
int64_t num_cases = num_tokens * n_group;
|
||||
int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
|
||||
auto* kernel_instance1 = &topk_with_k2_kernel<T>;
|
||||
cudaLaunchConfig_t config;
|
||||
config.gridDim = topk_with_k2_num_blocks;
|
||||
config.blockDim = BLOCK_SIZE;
|
||||
@ -705,16 +750,33 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
|
||||
attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
|
||||
config.numAttrs = 1;
|
||||
config.attrs = attrs;
|
||||
cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias,
|
||||
num_tokens, num_cases, n_group, num_experts / n_group,
|
||||
scoring_func);
|
||||
auto const sf = static_cast<ScoringFunc>(scoring_func);
|
||||
int64_t const num_experts_per_group = num_experts / n_group;
|
||||
auto launch_topk_with_k2 = [&](auto* kernel_instance1) {
|
||||
cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias,
|
||||
num_tokens, num_cases, n_group, num_experts_per_group);
|
||||
};
|
||||
switch (sf) {
|
||||
case SCORING_NONE: {
|
||||
auto* kernel_instance1 = &topk_with_k2_kernel<T, SCORING_NONE>;
|
||||
launch_topk_with_k2(kernel_instance1);
|
||||
break;
|
||||
}
|
||||
case SCORING_SIGMOID: {
|
||||
auto* kernel_instance1 = &topk_with_k2_kernel<T, SCORING_SIGMOID>;
|
||||
launch_topk_with_k2(kernel_instance1);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
// should be guarded by higher level checks.
|
||||
TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc");
|
||||
}
|
||||
|
||||
int64_t topk_with_k_group_num_blocks =
|
||||
(num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
|
||||
size_t dynamic_smem_in_bytes =
|
||||
warp_topk::calc_smem_size_for_block_wide<T, int32_t>(NUM_WARPS_PER_BLOCK,
|
||||
topk);
|
||||
auto* kernel_instance2 = &group_idx_and_topk_idx_kernel<T, IdxT>;
|
||||
config.gridDim = topk_with_k_group_num_blocks;
|
||||
config.blockDim = BLOCK_SIZE;
|
||||
config.dynamicSmemBytes = dynamic_smem_in_bytes;
|
||||
@ -723,10 +785,24 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
|
||||
attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
|
||||
config.numAttrs = 1;
|
||||
config.attrs = attrs;
|
||||
cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
|
||||
topk_values, topk_indices, bias, num_tokens, n_group,
|
||||
topk_group, topk, num_experts, num_experts / n_group,
|
||||
renormalize, routed_scaling_factor, scoring_func);
|
||||
switch (sf) {
|
||||
case SCORING_NONE: {
|
||||
launch_group_idx_and_topk_kernel<T, IdxT, SCORING_NONE>(
|
||||
config, scores, group_scores, topk_values, topk_indices, bias,
|
||||
num_tokens, n_group, topk_group, topk, num_experts,
|
||||
num_experts_per_group, renormalize, routed_scaling_factor);
|
||||
break;
|
||||
}
|
||||
case SCORING_SIGMOID: {
|
||||
launch_group_idx_and_topk_kernel<T, IdxT, SCORING_SIGMOID>(
|
||||
config, scores, group_scores, topk_values, topk_indices, bias,
|
||||
num_tokens, n_group, topk_group, topk, num_experts,
|
||||
num_experts_per_group, renormalize, routed_scaling_factor);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc");
|
||||
}
|
||||
}
|
||||
|
||||
#define INSTANTIATE_NOAUX_TC(T, IdxT) \
|
||||
|
||||
@ -860,4 +860,4 @@ torch::Tensor moe_wna16_marlin_gemm(
|
||||
|
||||
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
|
||||
m.impl("moe_wna16_marlin_gemm", &moe_wna16_marlin_gemm);
|
||||
}
|
||||
}
|
||||
@ -14,7 +14,6 @@
|
||||
|
||||
namespace vllm {
|
||||
namespace moe {
|
||||
|
||||
namespace batched_moe_align_block_size {
|
||||
|
||||
// Note num_threads needs to be 1024 for BlockScan Reduction in the kernel.
|
||||
@ -80,17 +79,32 @@ __global__ void batched_moe_align_block_size_kernel(
|
||||
} // namespace batched_moe_align_block_size
|
||||
|
||||
template <typename scalar_t>
|
||||
__global__ void moe_align_block_size_kernel(
|
||||
__device__ void _moe_align_block_size(
|
||||
const scalar_t* __restrict__ topk_ids,
|
||||
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
|
||||
int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
|
||||
int32_t* __restrict__ total_tokens_post_pad,
|
||||
int32_t* __restrict__ expert_map, int32_t num_experts,
|
||||
int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size,
|
||||
size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded) {
|
||||
size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded,
|
||||
int32_t max_num_m_blocks, int32_t model_offset, int32_t inactive_expert_id,
|
||||
int32_t topk_num, int32_t* token_mask, bool has_expert_map) {
|
||||
extern __shared__ int32_t shared_counts[];
|
||||
|
||||
// Initialize sorted_token_ids with numel
|
||||
for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
|
||||
sorted_token_ids[it] = numel;
|
||||
// Compute input buffer offsets. Typically these will all be 0, except when
|
||||
// using Multi LoRA.
|
||||
int sorted_token_ids_offset = max_num_tokens_padded * model_offset;
|
||||
int expert_ids_offset = max_num_m_blocks * model_offset;
|
||||
int cumsum_offset = (num_experts + 1) * model_offset;
|
||||
|
||||
// Use separate threadblocks to fill sorted_token_ids.
|
||||
// This is safe since the current kernel does not use sorted_token_ids.
|
||||
if (blockIdx.x % 2) {
|
||||
// Initialize sorted_token_ids with numel
|
||||
for (size_t it = threadIdx.x; it < max_num_tokens_padded;
|
||||
it += blockDim.x) {
|
||||
sorted_token_ids[sorted_token_ids_offset + it] = numel;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||
@ -112,9 +126,16 @@ __global__ void moe_align_block_size_kernel(
|
||||
if (expert_id >= num_experts) {
|
||||
continue;
|
||||
}
|
||||
if (has_expert_map) {
|
||||
expert_id = expert_map[expert_id];
|
||||
// filter invalid experts
|
||||
if (expert_id == -1) continue;
|
||||
}
|
||||
int warp_idx = expert_id / experts_per_warp;
|
||||
int expert_offset = expert_id % experts_per_warp;
|
||||
atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1);
|
||||
int mask = token_mask == nullptr ? 1 : token_mask[i / topk_num];
|
||||
atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset],
|
||||
mask);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
@ -135,48 +156,196 @@ __global__ void moe_align_block_size_kernel(
|
||||
int cumsum_val;
|
||||
BlockScan(temp_storage).ExclusiveSum(expert_count, cumsum_val);
|
||||
if (expert_id <= num_experts) {
|
||||
cumsum[expert_id] = cumsum_val;
|
||||
cumsum[cumsum_offset + expert_id] = cumsum_val;
|
||||
}
|
||||
|
||||
if (expert_id == num_experts) {
|
||||
*total_tokens_post_pad = cumsum_val;
|
||||
total_tokens_post_pad[model_offset] = cumsum_val;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x < num_experts) {
|
||||
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
|
||||
i += block_size) {
|
||||
expert_ids[i / block_size] = threadIdx.x;
|
||||
for (int i = cumsum[cumsum_offset + threadIdx.x];
|
||||
i < cumsum[cumsum_offset + threadIdx.x + 1]; i += block_size) {
|
||||
expert_ids[expert_ids_offset + i / block_size] = threadIdx.x;
|
||||
}
|
||||
}
|
||||
|
||||
// Fill remaining expert_ids with 0
|
||||
const size_t fill_start_idx = cumsum[num_experts] / block_size + threadIdx.x;
|
||||
const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size);
|
||||
for (size_t i = fill_start_idx; i < expert_ids_size; i += blockDim.x) {
|
||||
expert_ids[i] = 0;
|
||||
const size_t fill_start_idx =
|
||||
cumsum[cumsum_offset + num_experts] / block_size + threadIdx.x;
|
||||
for (size_t i = fill_start_idx; i < max_num_m_blocks; i += blockDim.x) {
|
||||
expert_ids[expert_ids_offset + i] = inactive_expert_id;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t, int32_t fill_threads>
|
||||
__device__ void _moe_align_block_size_small_batch_expert(
|
||||
const scalar_t* __restrict__ topk_ids,
|
||||
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
|
||||
int32_t* __restrict__ total_tokens_post_pad,
|
||||
int32_t* __restrict__ expert_map, int32_t num_experts, int32_t block_size,
|
||||
size_t numel, int32_t max_num_tokens_padded, int32_t max_num_m_blocks,
|
||||
int32_t inactive_expert_id, int32_t model_offset, int32_t topk_num,
|
||||
int32_t* token_mask, bool has_expert_map) {
|
||||
// Compute input buffer offsets. Typically these will all be 0, except when
|
||||
// using Multi LoRA.
|
||||
int sorted_token_ids_offset = max_num_tokens_padded * model_offset;
|
||||
int expert_ids_offset = max_num_m_blocks * model_offset;
|
||||
|
||||
// Use an additional group of threads to fill sorted_token_ids.
|
||||
// Since the current kernel will use sorted_token_ids afterward,
|
||||
// we fill sorted_token_ids within the same threadblock to make
|
||||
// synchronization easier.
|
||||
if (threadIdx.x < fill_threads) {
|
||||
// Initialize sorted_token_ids with numel
|
||||
for (size_t it = threadIdx.x; it < max_num_tokens_padded;
|
||||
it += fill_threads) {
|
||||
sorted_token_ids[sorted_token_ids_offset + it] = numel;
|
||||
}
|
||||
// Three __syncthreads() corresponding to the other threads
|
||||
__syncthreads();
|
||||
__syncthreads();
|
||||
__syncthreads();
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t tid = threadIdx.x - fill_threads;
|
||||
const size_t stride = blockDim.x - fill_threads;
|
||||
|
||||
extern __shared__ int32_t shared_mem[];
|
||||
int32_t* cumsum = shared_mem;
|
||||
int32_t* tokens_cnts = (int32_t*)(shared_mem + num_experts + 1);
|
||||
|
||||
for (int i = 0; i < num_experts; ++i) {
|
||||
tokens_cnts[(tid + 1) * num_experts + i] = 0;
|
||||
}
|
||||
|
||||
for (size_t i = tid; i < numel; i += stride) {
|
||||
int32_t expert_id = topk_ids[i];
|
||||
if (has_expert_map) {
|
||||
expert_id = expert_map[expert_id];
|
||||
// filter invalid expert
|
||||
if (expert_id == -1) continue;
|
||||
}
|
||||
int mask = token_mask == nullptr ? 1 : token_mask[i / topk_num];
|
||||
tokens_cnts[(tid + 1) * num_experts + expert_id] += mask;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (tid < num_experts) {
|
||||
tokens_cnts[tid] = 0;
|
||||
for (int i = 1; i <= stride; ++i) {
|
||||
tokens_cnts[i * num_experts + tid] +=
|
||||
tokens_cnts[(i - 1) * num_experts + tid];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (tid == 0) {
|
||||
cumsum[0] = 0;
|
||||
for (int i = 1; i <= num_experts; ++i) {
|
||||
cumsum[i] =
|
||||
cumsum[i - 1] +
|
||||
CEILDIV(tokens_cnts[stride * num_experts + i - 1], block_size) *
|
||||
block_size;
|
||||
}
|
||||
total_tokens_post_pad[model_offset] =
|
||||
static_cast<int32_t>(cumsum[num_experts]);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (tid < num_experts) {
|
||||
for (int i = cumsum[tid]; i < cumsum[tid + 1]; i += block_size) {
|
||||
expert_ids[expert_ids_offset + i / block_size] = tid;
|
||||
}
|
||||
}
|
||||
|
||||
// Fill remaining expert_ids with 0
|
||||
const size_t fill_start_idx = cumsum[num_experts] / block_size + tid;
|
||||
for (size_t i = fill_start_idx; i < max_num_m_blocks; i += stride) {
|
||||
expert_ids[expert_ids_offset + i] = inactive_expert_id;
|
||||
}
|
||||
|
||||
for (size_t i = tid; i < numel; i += stride) {
|
||||
int32_t expert_id = topk_ids[i];
|
||||
if (has_expert_map) {
|
||||
expert_id = expert_map[expert_id];
|
||||
// filter invalid expert
|
||||
if (expert_id == -1) continue;
|
||||
}
|
||||
int32_t rank_post_pad =
|
||||
tokens_cnts[tid * num_experts + expert_id] + cumsum[expert_id];
|
||||
|
||||
if (token_mask == nullptr || token_mask[i / topk_num]) {
|
||||
sorted_token_ids[sorted_token_ids_offset + rank_post_pad] = i;
|
||||
++tokens_cnts[tid * num_experts + expert_id];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
__global__ void count_and_sort_expert_tokens_kernel(
|
||||
__device__ void _count_and_sort_expert_tokens(
|
||||
const scalar_t* __restrict__ topk_ids,
|
||||
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
|
||||
size_t numel, int32_t num_experts) {
|
||||
const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const size_t stride = blockDim.x * gridDim.x;
|
||||
int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
|
||||
int32_t max_num_tokens_padded, int32_t* __restrict__ token_mask,
|
||||
int32_t model_offset, int32_t topk_num, bool has_expert_map) {
|
||||
const size_t tid = blockIdx.y * blockDim.x + threadIdx.x;
|
||||
const size_t stride = blockDim.x * gridDim.y;
|
||||
|
||||
for (size_t i = tid; i < numel; i += stride) {
|
||||
int32_t expert_id = topk_ids[i];
|
||||
if (expert_id >= num_experts) {
|
||||
continue;
|
||||
}
|
||||
int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1);
|
||||
sorted_token_ids[rank_post_pad] = i;
|
||||
|
||||
if (has_expert_map) {
|
||||
expert_id = expert_map[expert_id];
|
||||
// filter invalid experts
|
||||
if (expert_id == -1) continue;
|
||||
}
|
||||
|
||||
if (token_mask == nullptr || token_mask[i / topk_num]) {
|
||||
int32_t rank_post_pad = atomicAdd(
|
||||
&cumsum_buffer[(model_offset * (num_experts + 1)) + expert_id], 1);
|
||||
sorted_token_ids[max_num_tokens_padded * model_offset + rank_post_pad] =
|
||||
i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
__global__ void moe_align_block_size_kernel(
|
||||
const scalar_t* __restrict__ topk_ids,
|
||||
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
|
||||
int32_t* __restrict__ total_tokens_post_pad,
|
||||
int32_t* __restrict__ expert_map, int32_t num_experts,
|
||||
int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size,
|
||||
size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded,
|
||||
int32_t topk_num, bool has_expert_map) {
|
||||
_moe_align_block_size(
|
||||
topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
|
||||
num_experts, padded_num_experts, experts_per_warp, block_size, numel,
|
||||
cumsum, max_num_tokens_padded, CEILDIV(max_num_tokens_padded, block_size),
|
||||
0, 0, topk_num, nullptr, has_expert_map);
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
__global__ void count_and_sort_expert_tokens_kernel(
|
||||
const scalar_t* __restrict__ topk_ids,
|
||||
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
|
||||
int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
|
||||
int32_t max_num_tokens_padded, int32_t topk_num, bool has_expert_map) {
|
||||
_count_and_sort_expert_tokens(
|
||||
topk_ids, sorted_token_ids, cumsum_buffer, expert_map, numel, num_experts,
|
||||
max_num_tokens_padded, nullptr, 0, topk_num, has_expert_map);
|
||||
}
|
||||
|
||||
template <typename scalar_t, int TOPK>
|
||||
__global__ void moe_sum_kernel(
|
||||
scalar_t* __restrict__ out, // [..., d]
|
||||
@ -193,78 +362,111 @@ __global__ void moe_sum_kernel(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
template <typename scalar_t, int32_t fill_threads>
|
||||
__global__ void moe_align_block_size_small_batch_expert_kernel(
|
||||
const scalar_t* __restrict__ topk_ids,
|
||||
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
|
||||
int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
|
||||
int32_t block_size, size_t numel, int32_t max_num_tokens_padded) {
|
||||
// Initialize sorted_token_ids with numel
|
||||
for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
|
||||
sorted_token_ids[it] = numel;
|
||||
int32_t* __restrict__ total_tokens_post_pad,
|
||||
int32_t* __restrict__ expert_map, int32_t num_experts, int32_t block_size,
|
||||
size_t numel, int32_t max_num_tokens_padded, int32_t topk_num,
|
||||
bool has_expert_map) {
|
||||
_moe_align_block_size_small_batch_expert<scalar_t, fill_threads>(
|
||||
topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
|
||||
num_experts, block_size, numel, max_num_tokens_padded,
|
||||
CEILDIV(max_num_tokens_padded, block_size), 0, 0, topk_num, nullptr,
|
||||
has_expert_map);
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
__global__ void moe_lora_align_block_size_kernel(
|
||||
scalar_t* __restrict__ topk_ids, int32_t* __restrict__ token_lora_mapping,
|
||||
int64_t block_size, int32_t* __restrict__ expert_map, int num_experts,
|
||||
int max_loras, size_t numel, int max_num_tokens_padded,
|
||||
int max_num_m_blocks, int32_t* __restrict__ sorted_token_ids,
|
||||
int32_t* __restrict__ expert_ids, int32_t topk_num,
|
||||
int32_t* total_tokens_post_pad, int32_t* adapter_enabled,
|
||||
int32_t* __restrict__ cumsum, int32_t experts_per_warp,
|
||||
int32_t padded_num_experts, int32_t* lora_ids,
|
||||
int32_t* __restrict__ token_mask, bool has_expert_map) {
|
||||
int lora_idx = blockIdx.x / 2;
|
||||
int lora_id = lora_ids[lora_idx];
|
||||
if (lora_id == -1 || adapter_enabled[lora_id] == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t tid = threadIdx.x;
|
||||
const size_t stride = blockDim.x;
|
||||
|
||||
extern __shared__ int32_t shared_mem[];
|
||||
int32_t* cumsum = shared_mem;
|
||||
int32_t* tokens_cnts = (int32_t*)(shared_mem + num_experts + 1);
|
||||
|
||||
for (int i = 0; i < num_experts; ++i) {
|
||||
tokens_cnts[(threadIdx.x + 1) * num_experts + i] = 0;
|
||||
}
|
||||
|
||||
for (size_t i = tid; i < numel; i += stride) {
|
||||
++tokens_cnts[(threadIdx.x + 1) * num_experts + topk_ids[i]];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x < num_experts) {
|
||||
tokens_cnts[threadIdx.x] = 0;
|
||||
for (int i = 1; i <= blockDim.x; ++i) {
|
||||
tokens_cnts[i * num_experts + threadIdx.x] +=
|
||||
tokens_cnts[(i - 1) * num_experts + threadIdx.x];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Populate the token_mask based on the token-LoRA mapping
|
||||
int num_tokens = numel / topk_num;
|
||||
if (threadIdx.x == 0) {
|
||||
cumsum[0] = 0;
|
||||
for (int i = 1; i <= num_experts; ++i) {
|
||||
cumsum[i] =
|
||||
cumsum[i - 1] +
|
||||
CEILDIV(tokens_cnts[blockDim.x * num_experts + i - 1], block_size) *
|
||||
block_size;
|
||||
total_tokens_post_pad[lora_id] = 0;
|
||||
|
||||
for (int i = 0; i < num_tokens; i++) {
|
||||
token_mask[(lora_id * num_tokens) + i] =
|
||||
(int)token_lora_mapping[i] == lora_id;
|
||||
}
|
||||
*total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x < num_experts) {
|
||||
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
|
||||
i += block_size) {
|
||||
expert_ids[i / block_size] = threadIdx.x;
|
||||
_moe_align_block_size(
|
||||
topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
|
||||
num_experts, padded_num_experts, experts_per_warp, block_size, numel,
|
||||
cumsum, max_num_tokens_padded, max_num_m_blocks, lora_id, -1, topk_num,
|
||||
&token_mask[(lora_id * num_tokens)], has_expert_map);
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
__global__ void lora_count_and_sort_expert_tokens_kernel(
|
||||
const scalar_t* __restrict__ topk_ids,
|
||||
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
|
||||
int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
|
||||
int32_t max_num_tokens_padded, int32_t topk_num, int32_t* token_mask,
|
||||
int32_t* lora_ids, bool has_expert_map) {
|
||||
int lora_idx = blockIdx.x;
|
||||
int lora_id = lora_ids[lora_idx];
|
||||
if (lora_id == -1) {
|
||||
return;
|
||||
}
|
||||
|
||||
int num_tokens = numel / topk_num;
|
||||
|
||||
_count_and_sort_expert_tokens(
|
||||
topk_ids, sorted_token_ids, cumsum_buffer, expert_map, numel, num_experts,
|
||||
max_num_tokens_padded, &token_mask[(lora_id * num_tokens)], lora_id,
|
||||
topk_num, has_expert_map);
|
||||
}
|
||||
|
||||
template <typename scalar_t, int32_t fill_threads>
|
||||
__global__ void moe_lora_align_block_size_small_batch_expert_kernel(
|
||||
scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping,
|
||||
int64_t block_size, int32_t* __restrict__ expert_map, int num_experts,
|
||||
int max_loras, size_t numel, int max_num_tokens_padded,
|
||||
int max_num_m_blocks, int32_t* __restrict__ sorted_token_ids,
|
||||
int32_t* __restrict__ expert_ids, int topk_num,
|
||||
int32_t* total_tokens_post_pad, int32_t* adapter_enabled, int32_t* lora_ids,
|
||||
int32_t* token_mask, bool has_expert_map) {
|
||||
int lora_idx = blockIdx.x;
|
||||
int lora_id = lora_ids[lora_idx];
|
||||
if (lora_id == -1 || adapter_enabled[lora_id] == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
int num_tokens = numel / topk_num;
|
||||
if (threadIdx.x == 0) {
|
||||
total_tokens_post_pad[lora_id] = 0;
|
||||
|
||||
for (int i = 0; i < num_tokens; i++) {
|
||||
token_mask[(lora_id * num_tokens) + i] =
|
||||
(int)token_lora_mapping[i] == lora_id;
|
||||
}
|
||||
}
|
||||
|
||||
// Fill remaining expert_ids with 0
|
||||
const size_t fill_start_idx = cumsum[num_experts] / block_size + threadIdx.x;
|
||||
const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size);
|
||||
for (size_t i = fill_start_idx; i < expert_ids_size; i += blockDim.x) {
|
||||
expert_ids[i] = 0;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
for (size_t i = tid; i < numel; i += stride) {
|
||||
int32_t expert_id = topk_ids[i];
|
||||
int32_t rank_post_pad =
|
||||
tokens_cnts[threadIdx.x * num_experts + expert_id] + cumsum[expert_id];
|
||||
sorted_token_ids[rank_post_pad] = i;
|
||||
++tokens_cnts[threadIdx.x * num_experts + expert_id];
|
||||
}
|
||||
_moe_align_block_size_small_batch_expert<scalar_t, fill_threads>(
|
||||
topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
|
||||
num_experts, block_size, numel, max_num_tokens_padded, max_num_m_blocks,
|
||||
-1, lora_id, topk_num, &token_mask[(lora_id * num_tokens)],
|
||||
has_expert_map);
|
||||
}
|
||||
|
||||
} // namespace moe
|
||||
@ -275,7 +477,8 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
|
||||
void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
|
||||
int64_t block_size, torch::Tensor sorted_token_ids,
|
||||
torch::Tensor experts_ids,
|
||||
torch::Tensor num_tokens_post_pad) {
|
||||
torch::Tensor num_tokens_post_pad,
|
||||
std::optional<torch::Tensor> maybe_expert_map) {
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
int64_t padded_num_experts =
|
||||
@ -287,14 +490,19 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
|
||||
// BlockScan uses 1024 threads and assigns one thread per expert.
|
||||
TORCH_CHECK(padded_num_experts < 1024,
|
||||
"padded_num_experts must be less than 1024");
|
||||
auto options_int =
|
||||
torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
|
||||
bool has_expert_map = maybe_expert_map.has_value();
|
||||
torch::Tensor expert_map;
|
||||
if (has_expert_map) {
|
||||
expert_map = maybe_expert_map.value();
|
||||
} else {
|
||||
expert_map = torch::empty({0}, options_int);
|
||||
}
|
||||
|
||||
VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
|
||||
topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
|
||||
// calc needed amount of shared mem for `cumsum` tensors
|
||||
auto options_int =
|
||||
torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
|
||||
torch::Tensor cumsum_buffer =
|
||||
torch::empty({num_experts + 1}, options_int);
|
||||
bool small_batch_expert_mode =
|
||||
(topk_ids.numel() < 1024) && (num_experts <= 64);
|
||||
|
||||
@ -304,43 +512,58 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
|
||||
((threads + 1) * num_experts + (num_experts + 1)) *
|
||||
sizeof(int32_t);
|
||||
|
||||
// threadIdx.x >= fill_threads: counting experts and aligning
|
||||
// threadIdx.x < fill_threads: filling sorted_token_ids
|
||||
constexpr int32_t fill_threads = 256;
|
||||
auto small_batch_expert_kernel =
|
||||
vllm::moe::moe_align_block_size_small_batch_expert_kernel<
|
||||
scalar_t>;
|
||||
small_batch_expert_kernel<<<1, threads, shared_mem_size, stream>>>(
|
||||
scalar_t, fill_threads>;
|
||||
small_batch_expert_kernel<<<1, fill_threads + threads,
|
||||
shared_mem_size, stream>>>(
|
||||
topk_ids.data_ptr<scalar_t>(),
|
||||
sorted_token_ids.data_ptr<int32_t>(),
|
||||
experts_ids.data_ptr<int32_t>(),
|
||||
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
|
||||
topk_ids.numel(), sorted_token_ids.size(0));
|
||||
num_tokens_post_pad.data_ptr<int32_t>(),
|
||||
expert_map.data_ptr<int32_t>(), num_experts, block_size,
|
||||
topk_ids.numel(), sorted_token_ids.size(0), topk_ids.size(1),
|
||||
has_expert_map);
|
||||
} else {
|
||||
torch::Tensor cumsum_buffer =
|
||||
torch::empty({num_experts + 1}, options_int);
|
||||
auto align_kernel = vllm::moe::moe_align_block_size_kernel<scalar_t>;
|
||||
|
||||
size_t num_warps = CEILDIV(padded_num_experts, experts_per_warp);
|
||||
size_t shared_mem_size =
|
||||
num_warps * experts_per_warp * sizeof(int32_t);
|
||||
|
||||
align_kernel<<<1, threads, shared_mem_size, stream>>>(
|
||||
// launch two threadblocks
|
||||
// blockIdx.x == 0: counting experts and aligning
|
||||
// blockIdx.x == 1: filling sorted_token_ids
|
||||
align_kernel<<<2, threads, shared_mem_size, stream>>>(
|
||||
topk_ids.data_ptr<scalar_t>(),
|
||||
sorted_token_ids.data_ptr<int32_t>(),
|
||||
experts_ids.data_ptr<int32_t>(),
|
||||
num_tokens_post_pad.data_ptr<int32_t>(), num_experts,
|
||||
padded_num_experts, experts_per_warp, block_size,
|
||||
topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>(),
|
||||
sorted_token_ids.size(0));
|
||||
num_tokens_post_pad.data_ptr<int32_t>(),
|
||||
expert_map.data_ptr<int32_t>(), num_experts, padded_num_experts,
|
||||
experts_per_warp, block_size, topk_ids.numel(),
|
||||
cumsum_buffer.data_ptr<int32_t>(), sorted_token_ids.size(0),
|
||||
topk_ids.size(1), has_expert_map);
|
||||
|
||||
const int block_threads = std::min(256, (int)threads);
|
||||
const int num_blocks =
|
||||
(topk_ids.numel() + block_threads - 1) / block_threads;
|
||||
const int max_blocks = 65535;
|
||||
const int actual_blocks = std::min(num_blocks, max_blocks);
|
||||
dim3 gridDims(1, actual_blocks);
|
||||
|
||||
auto sort_kernel =
|
||||
vllm::moe::count_and_sort_expert_tokens_kernel<scalar_t>;
|
||||
sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
|
||||
sort_kernel<<<gridDims, block_threads, 0, stream>>>(
|
||||
topk_ids.data_ptr<scalar_t>(),
|
||||
sorted_token_ids.data_ptr<int32_t>(),
|
||||
cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel(), num_experts);
|
||||
cumsum_buffer.data_ptr<int32_t>(), expert_map.data_ptr<int32_t>(),
|
||||
topk_ids.numel(), num_experts, sorted_token_ids.size(0),
|
||||
topk_ids.size(1), has_expert_map);
|
||||
}
|
||||
});
|
||||
}
|
||||
@ -414,3 +637,123 @@ void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size]
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Host entry point that launches the LoRA-aware MoE block-alignment kernels.
//
// For every LoRA slot (one grid.x entry per slot, `max_loras` in total) the
// kernels write into `sorted_token_ids`, `expert_ids` and
// `num_tokens_post_pad`. Two strategies are used:
//   * small-batch mode (fewer than 1024 routed entries and at most 64
//     experts): a single shared-memory kernel per LoRA slot;
//   * otherwise: an align kernel (two thread blocks per LoRA slot) followed by
//     a count-and-sort kernel that uses a global `cumsum` scratch buffer.
//
// Parameters:
//   topk_ids              routed expert ids; size(1) is used as top-k and
//                         size(0) as the number of tokens.
//   token_lora_mapping    int32 tensor handed to the kernels unchanged.
//   num_experts           number of experts; after rounding up to WARP_SIZE
//                         it must be below 1024.
//   block_size            alignment block size, must be > 0.
//   max_loras             number of LoRA slots (grid.x of every launch).
//   max_num_tokens_padded, max_num_m_blocks
//                         per-LoRA capacities forwarded to the kernels.
//   sorted_token_ids, expert_ids, num_tokens_post_pad
//                         int32 output tensors.
//   adapter_enabled, lora_ids
//                         int32 tensors forwarded to the kernels.
//   maybe_expert_map      optional int32 expert map; when absent an empty
//                         tensor is passed and `has_expert_map` is false.
//
// Raises (via TORCH_CHECK / AT_CUDA_CHECK) when block_size <= 0, when the
// padded expert count is >= 1024, when the device attribute query fails, or
// when the small-batch kernel would need more shared memory than the device
// allows.
void moe_lora_align_block_size(
    torch::Tensor topk_ids, torch::Tensor token_lora_mapping,
    int64_t num_experts, int64_t block_size, int64_t max_loras,
    int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
    torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
    torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
    torch::Tensor lora_ids, std::optional<torch::Tensor> maybe_expert_map) {
  const int topk_num = topk_ids.size(1);

  TORCH_CHECK(block_size > 0, "block_size should be greater than 0. ");

  // Check the query result: if it fails, the limit would otherwise be read
  // uninitialised in the shared-memory check below.
  int device_max_shared_mem = 0;
  auto dev = topk_ids.get_device();
  AT_CUDA_CHECK(cudaDeviceGetAttribute(
      &device_max_shared_mem, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  int64_t padded_num_experts =
      ((num_experts + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;

  // BlockScan uses 1024 threads and assigns one thread per expert.
  TORCH_CHECK(padded_num_experts < 1024,
              "padded_num_experts must be less than 1024");

  auto options_int =
      torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
  // Scratch buffer with one int32 per (LoRA slot, token) pair.
  torch::Tensor token_mask =
      torch::empty({max_loras * topk_ids.size(0)}, options_int);

  // The kernels always receive a valid pointer argument; `has_expert_map`
  // tells them whether it may be dereferenced.
  bool has_expert_map = maybe_expert_map.has_value();
  torch::Tensor expert_map;
  if (has_expert_map) {
    expert_map = maybe_expert_map.value();
  } else {
    expert_map = torch::empty({0}, options_int);
  }

  VLLM_DISPATCH_INTEGRAL_TYPES(
      topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] {
        bool small_batch_expert_mode =
            (topk_ids.numel() < 1024) && (num_experts <= 64);

        if (small_batch_expert_mode) {
          const int32_t num_thread = max((int32_t)num_experts, 128);
          const int32_t shared_mem =
              (num_thread + 1) * num_experts * sizeof(int32_t) +
              (num_experts + 1) * sizeof(int32_t);
          TORCH_CHECK(shared_mem <= device_max_shared_mem,
                      "Shared memory usage exceeds device limit.");

          // threadIdx.x >= fill_threads: counting experts and aligning
          // threadIdx.x < fill_threads: filling sorted_token_ids
          constexpr int32_t fill_threads = 256;

          dim3 blockDim(num_thread + fill_threads);
          auto kernel =
              vllm::moe::moe_lora_align_block_size_small_batch_expert_kernel<
                  scalar_t, fill_threads>;
          AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
              (void*)kernel, shared_mem));
          kernel<<<max_loras, blockDim, shared_mem, stream>>>(
              topk_ids.data_ptr<scalar_t>(),
              token_lora_mapping.data_ptr<int32_t>(), block_size,
              expert_map.data_ptr<int32_t>(), num_experts, max_loras,
              topk_ids.numel(), max_num_tokens_padded, max_num_m_blocks,
              sorted_token_ids.data_ptr<int32_t>(),
              expert_ids.data_ptr<int32_t>(), topk_num,
              num_tokens_post_pad.data_ptr<int32_t>(),
              adapter_enabled.data_ptr<int32_t>(), lora_ids.data_ptr<int32_t>(),
              token_mask.data_ptr<int32_t>(), has_expert_map);
        } else {
          int num_thread = 1024;
          dim3 blockDim(num_thread);
          size_t num_warps = CEILDIV(padded_num_experts, WARP_SIZE);

          size_t shared_mem_size = num_warps * WARP_SIZE * sizeof(int32_t);

          // cumsum buffer
          torch::Tensor cumsum =
              torch::zeros({max_loras * (num_experts + 1)}, options_int);

          auto align_kernel =
              vllm::moe::moe_lora_align_block_size_kernel<scalar_t>;

          // launch two threadblocks for each lora
          // blockIdx.x % 2 == 0: counting experts and aligning
          // blockIdx.x % 2 == 1: filling sorted_token_ids
          align_kernel<<<max_loras * 2, blockDim, shared_mem_size, stream>>>(
              topk_ids.data_ptr<scalar_t>(),
              token_lora_mapping.data_ptr<int32_t>(), block_size,
              expert_map.data_ptr<int32_t>(), num_experts, max_loras,
              topk_ids.numel(), max_num_tokens_padded, max_num_m_blocks,
              sorted_token_ids.data_ptr<int32_t>(),
              expert_ids.data_ptr<int32_t>(), topk_num,
              num_tokens_post_pad.data_ptr<int32_t>(),
              adapter_enabled.data_ptr<int32_t>(), cumsum.data_ptr<int32_t>(),
              WARP_SIZE, padded_num_experts, lora_ids.data_ptr<int32_t>(),
              token_mask.data_ptr<int32_t>(), has_expert_map);

          const int block_threads = std::min(256, (int)num_thread);
          const int num_blocks =
              (topk_ids.numel() + block_threads - 1) / block_threads;

          // grid.y is capped at 65535 blocks.
          const int max_blocks = 65535;
          const int actual_blocks = std::min(num_blocks, max_blocks);

          // An empty topk_ids yields zero blocks; a grid dimension of zero is
          // an invalid launch configuration, so skip the sort pass when there
          // is nothing to sort.
          if (actual_blocks > 0) {
            dim3 gridDims(max_loras, actual_blocks);
            auto sort_kernel =
                vllm::moe::lora_count_and_sort_expert_tokens_kernel<scalar_t>;

            sort_kernel<<<gridDims, block_threads, 0, stream>>>(
                topk_ids.data_ptr<scalar_t>(),
                sorted_token_ids.data_ptr<int32_t>(),
                cumsum.data_ptr<int32_t>(), expert_map.data_ptr<int32_t>(),
                topk_ids.numel(), num_experts, max_num_tokens_padded, topk_num,
                token_mask.data_ptr<int32_t>(), lora_ids.data_ptr<int32_t>(),
                has_expert_map);
          }
        }
      });
}
|
||||
@ -1,174 +0,0 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
#include <torch/all.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/cuda/Atomic.cuh>
|
||||
|
||||
#include "../cuda_compat.h"
|
||||
#include "../dispatch_utils.h"
|
||||
#include "core/math.hpp"
|
||||
|
||||
namespace {
|
||||
|
||||
__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
|
||||
int32_t col) {
|
||||
return row * total_col + col;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// TODO: Refactor common parts with moe_align_sum_kernels
|
||||
template <typename scalar_t, typename token_cnts_t>
|
||||
__global__ void moe_lora_align_sum_kernel(
|
||||
scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping,
|
||||
int64_t block_size, int num_experts, int max_loras, size_t numel,
|
||||
int max_num_tokens_padded, int max_num_m_blocks,
|
||||
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
|
||||
int topk_num, int32_t* total_tokens_post_pad, int32_t* adapter_enabled,
|
||||
int32_t* lora_ids) {
|
||||
const size_t tokens_per_thread = div_ceil(numel, blockDim.x);
|
||||
const size_t start_idx = threadIdx.x * tokens_per_thread;
|
||||
|
||||
int lora_idx = blockIdx.x;
|
||||
int lora_id = lora_ids[lora_idx];
|
||||
if (lora_id == -1 || adapter_enabled[lora_id] == 0) {
|
||||
return;
|
||||
}
|
||||
extern __shared__ int32_t shared_mem[];
|
||||
int32_t* cumsum = shared_mem;
|
||||
token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + num_experts + 1);
|
||||
|
||||
// Initialize sorted_token_ids with numel
|
||||
for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
|
||||
sorted_token_ids[lora_id * max_num_tokens_padded + it] = numel;
|
||||
}
|
||||
|
||||
// Initialize expert_ids with -1
|
||||
for (size_t it = threadIdx.x; it < max_num_m_blocks; it += blockDim.x) {
|
||||
expert_ids[lora_id * max_num_m_blocks + it] = -1;
|
||||
}
|
||||
|
||||
// Initialize total_tokens_post_pad with 0
|
||||
if (threadIdx.x == 0) {
|
||||
total_tokens_post_pad[lora_id] = 0;
|
||||
}
|
||||
|
||||
for (int i = 0; i < num_experts; ++i) {
|
||||
tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
|
||||
}
|
||||
|
||||
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
|
||||
int mask = token_lora_mapping[i / topk_num] == lora_id;
|
||||
int idx = index(num_experts, threadIdx.x + 1, topk_ids[i]);
|
||||
tokens_cnts[idx] += mask;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// For each expert we accumulate the token counts from the different threads.
|
||||
if (threadIdx.x < num_experts) {
|
||||
tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
|
||||
for (int i = 1; i <= blockDim.x; ++i) {
|
||||
tokens_cnts[index(num_experts, i, threadIdx.x)] +=
|
||||
tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// We accumulate the token counts of all experts in thread 0.
|
||||
if (threadIdx.x == 0) {
|
||||
cumsum[0] = 0;
|
||||
for (int i = 1; i <= num_experts; ++i) {
|
||||
cumsum[i] = cumsum[i - 1] +
|
||||
div_ceil(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
|
||||
block_size) *
|
||||
block_size;
|
||||
}
|
||||
total_tokens_post_pad[lora_id] = static_cast<int32_t>(cumsum[num_experts]);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
/**
|
||||
* For each expert, each thread processes the tokens of the corresponding
|
||||
* blocks and stores the corresponding expert_id for each block.
|
||||
*/
|
||||
if (threadIdx.x < num_experts) {
|
||||
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
|
||||
i += block_size) {
|
||||
expert_ids[index(max_num_m_blocks, lora_id, i / block_size)] =
|
||||
threadIdx.x;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
|
||||
int32_t expert_id = topk_ids[i];
|
||||
/** The cumsum[expert_id] stores the starting index of the tokens that the
|
||||
* expert with expert_id needs to process, and
|
||||
* tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
|
||||
* processed by the expert with expert_id within the current thread's token
|
||||
* shard.
|
||||
*/
|
||||
int32_t rank_post_pad =
|
||||
tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
|
||||
cumsum[expert_id];
|
||||
|
||||
int mask = (int)token_lora_mapping[i / topk_num] == lora_id;
|
||||
atomicAdd(
|
||||
&sorted_token_ids[index(max_num_tokens_padded, lora_id, rank_post_pad)],
|
||||
(i - numel) * mask);
|
||||
tokens_cnts[index(num_experts, threadIdx.x, expert_id)] += mask;
|
||||
}
|
||||
}
|
||||
|
||||
// Host launcher for moe_lora_align_sum_kernel.
//
// Launches one thread block per LoRA slot (`max_loras` blocks) with
// max(num_experts, 128) threads each and dynamic shared memory holding
// (num_thread + 1) * num_experts counters plus num_experts + 1 prefix sums,
// all int32.
//
// Raises (via TORCH_CHECK / AT_CUDA_CHECK) when block_size <= 0, when more
// than 1024 threads would be required, when the device attribute query fails,
// or when the required shared memory exceeds the device's opt-in limit.
void moe_lora_align_block_size(
    torch::Tensor topk_ids, torch::Tensor token_lora_mapping,
    int64_t num_experts, int64_t block_size, int64_t max_loras,
    int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
    torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
    torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
    torch::Tensor lora_ids) {
  const int topk_num = topk_ids.size(1);

  TORCH_CHECK(block_size > 0, "block_size should be greater than 0. ");

  // Check the query result: if it fails, the limit would otherwise be read
  // uninitialised in the shared-memory check below.
  int device_max_shared_mem = 0;
  auto dev = topk_ids.get_device();
  AT_CUDA_CHECK(cudaDeviceGetAttribute(
      &device_max_shared_mem, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  const int32_t num_thread = max((int32_t)num_experts, 128);  // WARP_SIZE,
  // Exactly 1024 threads is accepted; the message states the real bound.
  TORCH_CHECK(num_thread <= 1024,
              "num_thread must not exceed 1024, "
              "and fallback is not implemented yet.");
  const int32_t shared_mem = (num_thread + 1) * num_experts * sizeof(int32_t) +
                             (num_experts + 1) * sizeof(int32_t);

  TORCH_CHECK(shared_mem <= device_max_shared_mem,
              "Shared memory usage exceeds device limit, and global memory "
              "fallback is not implemented yet.");

  VLLM_DISPATCH_INTEGRAL_TYPES(
      topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] {
        dim3 blockDim(num_thread);
        auto kernel = moe_lora_align_sum_kernel<scalar_t, int32_t>;
        // Raise the kernel's dynamic shared-memory cap to the requested size
        // before launching.
        AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
            (void*)kernel, shared_mem));
        kernel<<<max_loras, blockDim, shared_mem, stream>>>(
            topk_ids.data_ptr<scalar_t>(),
            token_lora_mapping.data_ptr<int32_t>(), block_size, num_experts,
            max_loras, topk_ids.numel(), max_num_tokens_padded,
            max_num_m_blocks, sorted_token_ids.data_ptr<int32_t>(),
            expert_ids.data_ptr<int32_t>(), topk_num,
            num_tokens_post_pad.data_ptr<int32_t>(),
            adapter_enabled.data_ptr<int32_t>(), lora_ids.data_ptr<int32_t>());
      });
}
|
||||
@ -11,7 +11,8 @@ void moe_sum(torch::Tensor& input, torch::Tensor& output);
|
||||
void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
|
||||
int64_t block_size, torch::Tensor sorted_token_ids,
|
||||
torch::Tensor experts_ids,
|
||||
torch::Tensor num_tokens_post_pad);
|
||||
torch::Tensor num_tokens_post_pad,
|
||||
std::optional<torch::Tensor> maybe_expert_map);
|
||||
|
||||
void batched_moe_align_block_size(int64_t max_tokens_per_batch,
|
||||
int64_t block_size,
|
||||
@ -26,7 +27,7 @@ void moe_lora_align_block_size(
|
||||
int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
|
||||
torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
|
||||
torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
|
||||
torch::Tensor lora_ids);
|
||||
torch::Tensor lora_ids, std::optional<torch::Tensor> maybe_expert_map);
|
||||
#ifndef USE_ROCM
|
||||
torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
|
||||
torch::Tensor b_qweight, torch::Tensor b_scales,
|
||||
|
||||
@ -19,7 +19,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
|
||||
"moe_align_block_size(Tensor topk_ids, int num_experts,"
|
||||
" int block_size, Tensor! sorted_token_ids,"
|
||||
" Tensor! experts_ids,"
|
||||
" Tensor! num_tokens_post_pad) -> ()");
|
||||
" Tensor! num_tokens_post_pad,"
|
||||
" Tensor? maybe_expert_map) -> ()");
|
||||
m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
|
||||
|
||||
// Aligning the number of tokens to be processed by each expert such
|
||||
@ -46,7 +47,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
|
||||
" Tensor !experts_ids,"
|
||||
" Tensor !num_tokens_post_pad,"
|
||||
" Tensor !adapter_enabled,"
|
||||
" Tensor !lora_ids) -> () ");
|
||||
" Tensor !lora_ids,"
|
||||
" Tensor? maybe_expert_map) -> () ");
|
||||
m.impl("moe_lora_align_block_size", torch::kCUDA, &moe_lora_align_block_size);
|
||||
|
||||
#ifndef USE_ROCM
|
||||
|
||||
31
csrc/ops.h
31
csrc/ops.h
@ -102,13 +102,16 @@ void apply_repetition_penalties_(torch::Tensor& logits,
|
||||
const torch::Tensor& output_mask,
|
||||
const torch::Tensor& repetition_penalties);
|
||||
|
||||
void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
|
||||
const torch::Tensor& rowEnds, torch::Tensor& indices,
|
||||
int64_t numRows, int64_t stride0, int64_t stride1);
|
||||
void top_k_per_row_prefill(const torch::Tensor& logits,
|
||||
const torch::Tensor& rowStarts,
|
||||
const torch::Tensor& rowEnds, torch::Tensor& indices,
|
||||
int64_t numRows, int64_t stride0, int64_t stride1,
|
||||
int64_t topK);
|
||||
|
||||
void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
|
||||
const torch::Tensor& seq_lens, torch::Tensor& indices,
|
||||
int64_t numRows, int64_t stride0, int64_t stride1);
|
||||
const torch::Tensor& seqLens, torch::Tensor& indices,
|
||||
int64_t numRows, int64_t stride0, int64_t stride1,
|
||||
int64_t topK);
|
||||
|
||||
void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
|
||||
torch::Tensor& weight, torch::Tensor& scale,
|
||||
@ -128,6 +131,13 @@ void rms_norm_dynamic_per_token_quant(torch::Tensor& out,
|
||||
std::optional<torch::Tensor> scale_ub,
|
||||
std::optional<torch::Tensor> residual);
|
||||
|
||||
void rms_norm_per_block_quant(torch::Tensor& out, torch::Tensor const& input,
|
||||
torch::Tensor const& weight,
|
||||
torch::Tensor& scales, double const epsilon,
|
||||
std::optional<torch::Tensor> scale_ub,
|
||||
std::optional<torch::Tensor> residual,
|
||||
int64_t group_size, bool is_scale_transposed);
|
||||
|
||||
void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
|
||||
std::optional<torch::Tensor> key, int64_t head_size,
|
||||
torch::Tensor& cos_sin_cache, bool is_neox);
|
||||
@ -252,7 +262,8 @@ void get_cutlass_moe_mm_data(
|
||||
void get_cutlass_moe_mm_problem_sizes(
|
||||
const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
|
||||
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
|
||||
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets);
|
||||
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
|
||||
std::optional<bool> force_swap_ab = std::nullopt);
|
||||
|
||||
void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
|
||||
torch::Tensor& problem_sizes1,
|
||||
@ -299,6 +310,14 @@ void per_token_group_quant_int8(const torch::Tensor& input,
|
||||
torch::Tensor& output_q,
|
||||
torch::Tensor& output_s, int64_t group_size,
|
||||
double eps, double int8_min, double int8_max);
|
||||
|
||||
// Fused activation quantisation + DeepGEMM-compatible UE8M0-packed scales.
|
||||
void per_token_group_quant_8bit_packed(const torch::Tensor& input,
|
||||
torch::Tensor& output_q,
|
||||
torch::Tensor& output_s_packed,
|
||||
int64_t group_size, double eps,
|
||||
double min_8bit, double max_8bit);
|
||||
|
||||
#endif
|
||||
|
||||
void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
|
||||
|
||||
104
csrc/quantization/cutlass_w4a8/get_group_starts.cuh
Normal file
104
csrc/quantization/cutlass_w4a8/get_group_starts.cuh
Normal file
@ -0,0 +1,104 @@
|
||||
// see csrc/quantization/w8a8/cutlass/moe/get_group_starts.cuh
|
||||
#pragma once
|
||||
|
||||
#include <cuda.h>
|
||||
#include <torch/all.h>
|
||||
#include <c10/cuda/CUDAStream.h>
|
||||
|
||||
#include "core/scalar_type.hpp"
|
||||
#include "cutlass/bfloat16.h"
|
||||
#include "cutlass/float8.h"
|
||||
|
||||
// ElementB is int32 (packed int4)
|
||||
// ElementGroupScale is cutlass::Array<cutlass::float_e4m3_t, 8> (packed fp8)
|
||||
template <typename ElementA, typename ElementB, typename ElementC,
|
||||
typename ElementAccumulator, typename ElementGroupScale>
|
||||
__global__ void get_group_gemm_starts(
|
||||
int64_t* expert_offsets, ElementA** a_offsets, ElementB** b_offsets,
|
||||
ElementC** out_offsets, ElementAccumulator** a_scales_offsets,
|
||||
ElementAccumulator** b_scales_offsets,
|
||||
ElementGroupScale** b_group_scales_offsets, ElementA* a_base_as_int,
|
||||
ElementB* b_base_as_int, ElementC* out_base_as_int,
|
||||
ElementAccumulator* a_scales_base_as_int,
|
||||
ElementAccumulator* b_scales_base_as_int,
|
||||
ElementGroupScale* b_group_scales_base_as_int, int64_t n, int64_t k,
|
||||
int64_t scale_k) {
|
||||
int expert_id = threadIdx.x;
|
||||
|
||||
int64_t expert_offset = expert_offsets[expert_id];
|
||||
|
||||
// same as w8a8
|
||||
a_offsets[expert_id] = a_base_as_int + expert_offset * k;
|
||||
out_offsets[expert_id] = out_base_as_int + expert_offset * n;
|
||||
a_scales_offsets[expert_id] = a_scales_base_as_int + expert_offset;
|
||||
b_scales_offsets[expert_id] = b_scales_base_as_int + (n * expert_id);
|
||||
|
||||
// w4a8 specific
|
||||
constexpr int pack_factor = 8; // pack 8 int4 into int32
|
||||
b_offsets[expert_id] = b_base_as_int + (expert_id * k * n / pack_factor);
|
||||
b_group_scales_offsets[expert_id] =
|
||||
b_group_scales_base_as_int + (expert_id * scale_k * n);
|
||||
}
|
||||
|
||||
// One arm of the output-dtype dispatch chain in run_get_group_gemm_starts.
// It expands to `else if (...) { launch }`, so it must follow an `if` or
// another arm. The expansion reads these names from the enclosing scope:
// out_tensors, num_experts, stream, expert_offsets, the *_ptrs tensors, the
// input tensors and scales, and n, k, scale_k.
//   TENSOR_C_TYPE  torch dtype compared against out_tensors.dtype()
//   C_TYPE         element type used for the output pointers
#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE)                     \
  else if (out_tensors.dtype() == TENSOR_C_TYPE) {                          \
    using StartsFp8 = cutlass::float_e4m3_t;                                \
    using StartsGroupScale = cutlass::Array<cutlass::float_e4m3_t, 8>;      \
    get_group_gemm_starts<StartsFp8, int32_t, C_TYPE, float,                \
                          StartsGroupScale>                                 \
        <<<1, num_experts, 0, stream>>>(                                    \
            static_cast<int64_t*>(expert_offsets.data_ptr()),               \
            static_cast<StartsFp8**>(a_ptrs.data_ptr()),                    \
            static_cast<int32_t**>(b_ptrs.data_ptr()),                      \
            static_cast<C_TYPE**>(out_ptrs.data_ptr()),                     \
            static_cast<float**>(a_scales_ptrs.data_ptr()),                 \
            static_cast<float**>(b_scales_ptrs.data_ptr()),                 \
            static_cast<StartsGroupScale**>(b_group_scales_ptrs.data_ptr()), \
            static_cast<StartsFp8*>(a_tensors.data_ptr()),                  \
            static_cast<int32_t*>(b_tensors.data_ptr()),                    \
            static_cast<C_TYPE*>(out_tensors.data_ptr()),                   \
            static_cast<float*>(a_scales.data_ptr()),                       \
            static_cast<float*>(b_scales.data_ptr()),                       \
            static_cast<StartsGroupScale*>(b_group_scales.data_ptr()), n,   \
            k, scale_k);                                                    \
  }
|
||||
|
||||
namespace {
|
||||
|
||||
void run_get_group_gemm_starts(
|
||||
torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs,
|
||||
torch::Tensor& b_ptrs, torch::Tensor& out_ptrs,
|
||||
torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs,
|
||||
torch::Tensor& b_group_scales_ptrs, torch::Tensor const& a_tensors,
|
||||
torch::Tensor const& b_tensors, torch::Tensor& out_tensors,
|
||||
torch::Tensor const& a_scales, torch::Tensor const& b_scales,
|
||||
torch::Tensor const& b_group_scales, const int64_t b_group_size) {
|
||||
TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn);
|
||||
TORCH_CHECK(b_tensors.dtype() == torch::kInt32); // int4 8x packed into int32
|
||||
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
||||
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
||||
TORCH_CHECK(b_group_scales.dtype() ==
|
||||
torch::kFloat8_e4m3fn); // the underlying torch type is e4m3
|
||||
TORCH_CHECK(out_tensors.dtype() ==
|
||||
torch::kBFloat16); // only support bf16 for now
|
||||
// expect int64_t to avoid overflow during offset calculations
|
||||
TORCH_CHECK(expert_offsets.dtype() == torch::kInt64);
|
||||
|
||||
int num_experts = static_cast<int>(expert_offsets.size(0));
|
||||
// logical k, n
|
||||
int64_t n = out_tensors.size(1);
|
||||
int64_t k = a_tensors.size(1);
|
||||
int64_t scale_k = cutlass::ceil_div(k, b_group_size);
|
||||
|
||||
auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
|
||||
|
||||
if (false) {
|
||||
}
|
||||
__CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t)
|
||||
__CALL_GET_STARTS_KERNEL(torch::kFloat16, half)
|
||||
else {
|
||||
TORCH_CHECK(false, "Invalid output type (must be float16 or bfloat16)");
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
483
csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu
Normal file
483
csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu
Normal file
@ -0,0 +1,483 @@
|
||||
#include <vector>
|
||||
#include <tuple>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cutlass/gemm/dispatch_policy.hpp"
|
||||
#include "cutlass/gemm/group_array_problem_shape.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
|
||||
#include "cutlass/util/packed_stride.hpp"
|
||||
#include "cutlass/util/mixed_dtype_utils.hpp"
|
||||
|
||||
// vllm includes
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <torch/all.h>
|
||||
#include "cutlass_extensions/torch_utils.hpp"
|
||||
#include "cutlass_extensions/common.hpp"
|
||||
|
||||
#include "core/registration.h"
|
||||
#include "get_group_starts.cuh"
|
||||
#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
|
||||
#include "w4a8_utils.cuh"
|
||||
|
||||
namespace vllm::cutlass_w4a8_moe {
|
||||
|
||||
using namespace cute;
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
// Static configuration shared across all instantiations
|
||||
// -------------------------------------------------------------------------------------
|
||||
using ProblemShape =
|
||||
cutlass::gemm::GroupProblemShape<Shape<int, int, int>>; // <M,N,K> per
|
||||
// group
|
||||
using MmaType = cutlass::float_e4m3_t;
|
||||
using QuantType = cutlass::int4b_t;
|
||||
|
||||
constexpr int TileShapeK = 128 * 8 / sizeof_bits<MmaType>::value;
|
||||
static int constexpr PackFactor = 8; // 8 int4 packed into int32
|
||||
|
||||
// A matrix configuration
|
||||
using ElementA = MmaType;
|
||||
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
|
||||
constexpr int AlignmentA =
|
||||
128 /
|
||||
cutlass::sizeof_bits<ElementA>::value; // Alignment of A matrix in units of
|
||||
// elements (up to 16 bytes)
|
||||
|
||||
// B matrix configuration
|
||||
using ElementB = QuantType; // Element type for B matrix operand
|
||||
using LayoutB =
|
||||
cutlass::layout::ColumnMajor; // Layout type for B matrix operand
|
||||
constexpr int AlignmentB =
|
||||
128 / cutlass::sizeof_bits<
|
||||
ElementB>::value; // Memory access granularity/alignment of B
|
||||
// matrix in units of elements (up to 16 bytes)
|
||||
|
||||
// This example manually swaps and transposes, so keep transpose of input
|
||||
// layouts
|
||||
using LayoutA_Transpose =
|
||||
typename cutlass::layout::LayoutTranspose<LayoutA>::type;
|
||||
using LayoutB_Transpose =
|
||||
typename cutlass::layout::LayoutTranspose<LayoutB>::type;
|
||||
|
||||
// Need to pass a pointer type to make the 3rd dimension of Stride be _0
|
||||
using StrideA =
|
||||
cute::remove_pointer_t<cutlass::detail::TagToStrideA_t<LayoutA*>>;
|
||||
using StrideB =
|
||||
cute::remove_pointer_t<cutlass::detail::TagToStrideB_t<LayoutB*>>;
|
||||
|
||||
// Define the CuTe layout for the reordered quantized tensor B
|
||||
// LayoutAtomQuant places values that will be read by the same thread in
|
||||
// contiguous locations in global memory. It specifies the reordering within a
|
||||
// single warp's fragment
|
||||
using LayoutAtomQuant =
|
||||
decltype(cutlass::compute_memory_reordering_atom<MmaType>());
|
||||
using LayoutB_Reordered = decltype(cute::tile_to_shape(
|
||||
LayoutAtomQuant{}, Layout<Shape<int, int, Int<1>>, StrideB>{}));
|
||||
|
||||
using ElementScale = cutlass::float_e4m3_t;
|
||||
using LayoutScale = cutlass::layout::RowMajor;
|
||||
|
||||
// C/D matrix configuration
|
||||
using ElementC =
|
||||
cutlass::bfloat16_t; // Element type for C and D matrix operands
|
||||
using LayoutC =
|
||||
cutlass::layout::RowMajor; // Layout type for C and D matrix operands
|
||||
constexpr int AlignmentC =
|
||||
128 / cutlass::sizeof_bits<
|
||||
ElementC>::value; // Memory access granularity/alignment of C
|
||||
// matrix in units of elements (up to 16 bytes)
|
||||
|
||||
// D matrix configuration
|
||||
using ElementD = ElementC;
|
||||
using LayoutD = LayoutC;
|
||||
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
|
||||
|
||||
// Core kernel configurations
|
||||
using ElementAccumulator = float; // Element type for internal accumulation
|
||||
using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that
|
||||
// supports the intended feature
|
||||
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
|
||||
using StageCountType =
|
||||
cutlass::gemm::collective::StageCountAuto; // Stage count maximized based
|
||||
// on the tile size
|
||||
|
||||
// per-channel and per-token scales for epilogue
|
||||
using ElementSChannel = float;
|
||||
|
||||
template <class TileShape_MN, class ClusterShape_MNK, class KernelSchedule,
|
||||
class EpilogueSchedule>
|
||||
struct W4A8GroupedGemmKernel {
|
||||
using TileShape =
|
||||
decltype(cute::append(TileShape_MN{}, cute::Int<TileShapeK>{}));
|
||||
using ClusterShape = ClusterShape_MNK;
|
||||
|
||||
// per-channel, per-token scales epilogue
|
||||
using ChTokScalesEpilogue =
|
||||
typename vllm::c3x::ScaledEpilogueArray<ElementAccumulator, ElementD,
|
||||
TileShape>;
|
||||
using EVTCompute = typename ChTokScalesEpilogue::EVTCompute;
|
||||
using CollectiveEpilogue =
|
||||
typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass, TileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
|
||||
ElementSChannel, ElementC,
|
||||
typename cutlass::layout::LayoutTranspose<LayoutC>::type*, AlignmentC,
|
||||
ElementD, typename cutlass::layout::LayoutTranspose<LayoutD>::type*,
|
||||
AlignmentD, EpilogueSchedule, EVTCompute>::CollectiveOp;
|
||||
|
||||
// =========================================================== MIXED INPUT
|
||||
// WITH SCALES
|
||||
// ===========================================================================
|
||||
// The Scale information must get paired with the operand that will be scaled.
|
||||
// In this example, B is scaled so we make a tuple of B's information and the
|
||||
// scale information.
|
||||
using CollectiveMainloopShuffled =
|
||||
typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass,
|
||||
cute::tuple<ElementB, cutlass::Array<ElementScale, 8>>,
|
||||
LayoutB_Reordered*, AlignmentB, ElementA, LayoutA_Transpose*,
|
||||
AlignmentA, ElementAccumulator, TileShape, ClusterShape,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
|
||||
sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
KernelSchedule>::CollectiveOp;
|
||||
|
||||
using GemmKernelShuffled = cutlass::gemm::kernel::GemmUniversal<
|
||||
ProblemShape, CollectiveMainloopShuffled, CollectiveEpilogue>;
|
||||
|
||||
using GemmShuffled =
|
||||
cutlass::gemm::device::GemmUniversalAdapter<GemmKernelShuffled>;
|
||||
|
||||
using StrideC = typename GemmKernelShuffled::InternalStrideC;
|
||||
using StrideD = typename GemmKernelShuffled::InternalStrideD;
|
||||
|
||||
using StrideC_ref = cutlass::detail::TagToStrideC_t<LayoutC>;
|
||||
using StrideD_ref = cutlass::detail::TagToStrideC_t<LayoutD>;
|
||||
using StrideS = typename CollectiveMainloopShuffled::StrideScale;
|
||||
using StrideS_ref = cutlass::detail::TagToStrideB_t<LayoutScale>;
|
||||
|
||||
// static asserts for passing in strides/layouts
|
||||
// pack to 2x int64
|
||||
static_assert(sizeof(StrideS) == 2 * sizeof(int64_t));
|
||||
// pack to 3xint32,
|
||||
static_assert(sizeof(LayoutB_Reordered) % sizeof(int32_t) == 0,
|
||||
"LayoutB_Reordered size must be divisible by 4 bytes");
|
||||
|
||||
static void grouped_mm(
|
||||
torch::Tensor& out_tensors, const torch::Tensor& a_tensors,
|
||||
const torch::Tensor& b_tensors, const torch::Tensor& a_scales,
|
||||
const torch::Tensor& b_scales, const torch::Tensor& b_group_scales,
|
||||
const int64_t b_group_size, const torch::Tensor& expert_offsets,
|
||||
const torch::Tensor& problem_sizes_torch, const torch::Tensor& a_strides,
|
||||
const torch::Tensor& b_strides, const torch::Tensor& c_strides,
|
||||
const torch::Tensor& group_scale_strides) {
|
||||
auto device = a_tensors.device();
|
||||
auto device_id = device.index();
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device);
|
||||
auto stream = at::cuda::getCurrentCUDAStream(device_id);
|
||||
|
||||
int num_experts = static_cast<int>(expert_offsets.size(0));
|
||||
int n = static_cast<int>(b_tensors.size(1));
|
||||
int k = static_cast<int>(b_tensors.size(2)) * PackFactor;
|
||||
|
||||
auto options_int =
|
||||
torch::TensorOptions().dtype(torch::kInt64).device(device);
|
||||
torch::Tensor a_ptrs = torch::empty(num_experts, options_int);
|
||||
torch::Tensor b_ptrs = torch::empty(num_experts, options_int);
|
||||
torch::Tensor out_ptrs = torch::empty(num_experts, options_int);
|
||||
torch::Tensor a_scales_ptrs = torch::empty(num_experts, options_int);
|
||||
torch::Tensor b_scales_ptrs = torch::empty(num_experts, options_int);
|
||||
torch::Tensor b_group_scales_ptrs = torch::empty(num_experts, options_int);
|
||||
|
||||
// get the correct offsets to pass to gemm
|
||||
run_get_group_gemm_starts(expert_offsets, a_ptrs, b_ptrs, out_ptrs,
|
||||
a_scales_ptrs, b_scales_ptrs, b_group_scales_ptrs,
|
||||
a_tensors, b_tensors, out_tensors, a_scales,
|
||||
b_scales, b_group_scales, b_group_size);
|
||||
|
||||
// construct args
|
||||
using Args = typename GemmShuffled::Arguments;
|
||||
using MainloopArguments = typename GemmKernelShuffled::MainloopArguments;
|
||||
using EpilogueArguments = typename GemmKernelShuffled::EpilogueArguments;
|
||||
Args arguments;
|
||||
|
||||
ProblemShape::UnderlyingProblemShape* problem_sizes_as_shapes =
|
||||
static_cast<ProblemShape::UnderlyingProblemShape*>(
|
||||
problem_sizes_torch.data_ptr());
|
||||
ProblemShape prob_shape{num_experts, problem_sizes_as_shapes, nullptr};
|
||||
|
||||
// SwapAB so B operands come first
|
||||
MainloopArguments mainloop_arguments{
|
||||
static_cast<const QuantType**>(b_ptrs.data_ptr()),
|
||||
static_cast<LayoutB_Reordered*>(b_strides.data_ptr()),
|
||||
static_cast<const MmaType**>(a_ptrs.data_ptr()),
|
||||
static_cast<StrideA*>(a_strides.data_ptr()),
|
||||
static_cast<const cutlass::Array<ElementScale, 8>**>(
|
||||
b_group_scales_ptrs.data_ptr()),
|
||||
static_cast<StrideS*>(group_scale_strides.data_ptr()),
|
||||
static_cast<int>(b_group_size)};
|
||||
|
||||
EpilogueArguments epilogue_arguments{
|
||||
// since we are doing SwapAB the channel scales comes first, then token
|
||||
// scales
|
||||
ChTokScalesEpilogue::prepare_args( // see ScaledEpilogueArray
|
||||
static_cast<const ElementAccumulator**>(
|
||||
b_scales_ptrs.data_ptr()), // per-channel
|
||||
static_cast<const ElementAccumulator**>(
|
||||
a_scales_ptrs.data_ptr()), // per-token
|
||||
true, true),
|
||||
nullptr, // C
|
||||
static_cast<StrideC*>(c_strides.data_ptr()), // C
|
||||
static_cast<ElementD**>(out_ptrs.data_ptr()), // D
|
||||
static_cast<StrideC*>(c_strides.data_ptr()) // D
|
||||
};
|
||||
|
||||
static const cutlass::KernelHardwareInfo hw_info{
|
||||
device_id,
|
||||
cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
|
||||
device_id)};
|
||||
|
||||
arguments = Args{cutlass::gemm::GemmUniversalMode::kGrouped, prob_shape,
|
||||
mainloop_arguments, epilogue_arguments, hw_info};
|
||||
|
||||
// Allocate workspace
|
||||
size_t workspace_size = GemmShuffled::get_workspace_size(arguments);
|
||||
torch::Tensor workspace =
|
||||
torch::empty(workspace_size,
|
||||
torch::TensorOptions().dtype(torch::kU8).device(device));
|
||||
|
||||
// Run GEMM
|
||||
GemmShuffled gemm;
|
||||
CUTLASS_CHECK(gemm.can_implement(arguments));
|
||||
CUTLASS_CHECK(gemm.initialize(arguments, workspace.data_ptr(), stream));
|
||||
CUTLASS_CHECK(gemm.run(stream));
|
||||
}
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Kernel instantiations and dispatch logic
|
||||
// ----------------------------------------------------------------------------
|
||||
using Coop = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperative;
|
||||
using CoopEpi = cutlass::epilogue::PtrArrayTmaWarpSpecializedCooperative;
|
||||
|
||||
// Kernel_TileShape_ClusterShape_Schedule
|
||||
using Kernel_128x16_1x1x1_Coop =
|
||||
W4A8GroupedGemmKernel<Shape<_128, _16>, Shape<_1, _1, _1>, Coop, CoopEpi>;
|
||||
using Kernel_128x16_2x1x1_Coop =
|
||||
W4A8GroupedGemmKernel<Shape<_128, _16>, Shape<_2, _1, _1>, Coop, CoopEpi>;
|
||||
|
||||
using Kernel_256x16_1x1x1_Coop =
|
||||
W4A8GroupedGemmKernel<Shape<_256, _16>, Shape<_1, _1, _1>, Coop, CoopEpi>;
|
||||
using Kernel_256x16_2x1x1_Coop =
|
||||
W4A8GroupedGemmKernel<Shape<_256, _16>, Shape<_2, _1, _1>, Coop, CoopEpi>;
|
||||
|
||||
using Kernel_256x32_1x1x1_Coop =
|
||||
W4A8GroupedGemmKernel<Shape<_256, _32>, Shape<_1, _1, _1>, Coop, CoopEpi>;
|
||||
using Kernel_256x32_2x1x1_Coop =
|
||||
W4A8GroupedGemmKernel<Shape<_256, _32>, Shape<_2, _1, _1>, Coop, CoopEpi>;
|
||||
|
||||
using Kernel_256x64_1x1x1_Coop =
|
||||
W4A8GroupedGemmKernel<Shape<_256, _64>, Shape<_1, _1, _1>, Coop, CoopEpi>;
|
||||
using Kernel_256x64_2x1x1_Coop =
|
||||
W4A8GroupedGemmKernel<Shape<_256, _64>, Shape<_2, _1, _1>, Coop, CoopEpi>;
|
||||
|
||||
using Kernel_256x128_1x1x1_Coop =
|
||||
W4A8GroupedGemmKernel<Shape<_256, _128>, Shape<_1, _1, _1>, Coop, CoopEpi>;
|
||||
using Kernel_256x128_2x1x1_Coop =
|
||||
W4A8GroupedGemmKernel<Shape<_256, _128>, Shape<_2, _1, _1>, Coop, CoopEpi>;
|
||||
|
||||
using Kernel_128x256_2x1x1_Coop =
|
||||
W4A8GroupedGemmKernel<Shape<_128, _256>, Shape<_2, _1, _1>, Coop, CoopEpi>;
|
||||
|
||||
// Runs the grouped W4A8 GEMM instantiation whose name equals `schedule`.
//
// All tensor arguments and b_group_size are forwarded unchanged to the
// selected Kernel_*::grouped_mm. An unrecognised schedule name fails with
// TORCH_CHECK.
void mm_dispatch(
    torch::Tensor& out_tensors, const torch::Tensor& a_tensors,
    const torch::Tensor& b_tensors, const torch::Tensor& a_scales,
    const torch::Tensor& b_scales, const torch::Tensor& b_group_scales,
    const int64_t b_group_size, const torch::Tensor& expert_offsets,
    const torch::Tensor& problem_sizes, const torch::Tensor& a_strides,
    const torch::Tensor& b_strides, const torch::Tensor& c_strides,
    const torch::Tensor& group_scale_strides, const std::string& schedule) {
  // Every instantiation exposes grouped_mm with this exact signature.
  using GroupedMmFn = void (*)(
      torch::Tensor&, const torch::Tensor&, const torch::Tensor&,
      const torch::Tensor&, const torch::Tensor&, const torch::Tensor&,
      int64_t, const torch::Tensor&, const torch::Tensor&,
      const torch::Tensor&, const torch::Tensor&, const torch::Tensor&,
      const torch::Tensor&);

  struct ScheduleEntry {
    const char* name;
    GroupedMmFn launch;
  };

  // Schedule name -> launcher, in the same order the names are matched.
  static const ScheduleEntry kSchedules[] = {
      {"Kernel_128x16_1x1x1_Coop", &Kernel_128x16_1x1x1_Coop::grouped_mm},
      {"Kernel_128x16_2x1x1_Coop", &Kernel_128x16_2x1x1_Coop::grouped_mm},
      {"Kernel_256x16_1x1x1_Coop", &Kernel_256x16_1x1x1_Coop::grouped_mm},
      {"Kernel_256x16_2x1x1_Coop", &Kernel_256x16_2x1x1_Coop::grouped_mm},
      {"Kernel_256x32_1x1x1_Coop", &Kernel_256x32_1x1x1_Coop::grouped_mm},
      {"Kernel_256x32_2x1x1_Coop", &Kernel_256x32_2x1x1_Coop::grouped_mm},
      {"Kernel_256x64_1x1x1_Coop", &Kernel_256x64_1x1x1_Coop::grouped_mm},
      {"Kernel_256x64_2x1x1_Coop", &Kernel_256x64_2x1x1_Coop::grouped_mm},
      {"Kernel_256x128_1x1x1_Coop", &Kernel_256x128_1x1x1_Coop::grouped_mm},
      {"Kernel_256x128_2x1x1_Coop", &Kernel_256x128_2x1x1_Coop::grouped_mm},
      {"Kernel_128x256_2x1x1_Coop", &Kernel_128x256_2x1x1_Coop::grouped_mm},
  };

  for (const ScheduleEntry& entry : kSchedules) {
    if (schedule == entry.name) {
      entry.launch(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                   b_group_scales, b_group_size, expert_offsets, problem_sizes,
                   a_strides, b_strides, c_strides, group_scale_strides);
      return;
    }
  }

  TORCH_CHECK(false,
              "cutlass_w4a8_moe_mm: unknown schedule string: ", schedule);
}
|
||||
|
||||
// Entry point for the grouped W4A8 MoE GEMM.
//
// If `maybe_schedule` holds a value, that schedule name is dispatched as is.
// Otherwise a schedule is chosen from the average number of rows per expert
// (a_tensors.size(0) / b_tensors.size(0)), i.e. assuming tokens are spread
// uniformly over the experts. All other arguments are forwarded unchanged to
// mm_dispatch.
//
// Fails with TORCH_CHECK when the heuristic is needed but b_tensors has no
// experts (size(0) == 0), which would otherwise divide by zero.
void mm(torch::Tensor& out_tensors, const torch::Tensor& a_tensors,
        const torch::Tensor& b_tensors, const torch::Tensor& a_scales,
        const torch::Tensor& b_scales, const torch::Tensor& b_group_scales,
        const int64_t b_group_size, const torch::Tensor& expert_offsets,
        const torch::Tensor& problem_sizes, const torch::Tensor& a_strides,
        const torch::Tensor& b_strides, const torch::Tensor& c_strides,
        const torch::Tensor& group_scale_strides,
        std::optional<std::string> maybe_schedule) {
  // user has specified a schedule
  if (maybe_schedule) {
    mm_dispatch(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                b_group_scales, b_group_size, expert_offsets, problem_sizes,
                a_strides, b_strides, c_strides, group_scale_strides,
                *maybe_schedule);
    return;
  }

  // use heuristic; sizes stay 64-bit so large inputs are not truncated
  const int64_t m_full = a_tensors.size(0);
  const int64_t num_experts = b_tensors.size(0);
  TORCH_CHECK(num_experts > 0,
              "cutlass_w4a8_moe_mm: b_tensors must contain at least one "
              "expert to select a schedule");

  // per-expert batch size assuming uniform distribution
  const int64_t m_expert = m_full / num_experts;

  std::string schedule;
  if (m_expert <= 16) {
    schedule = "Kernel_128x16_2x1x1_Coop";
  } else if (m_expert <= 32) {
    schedule = "Kernel_256x32_1x1x1_Coop";
  } else if (m_expert <= 64) {
    schedule = "Kernel_256x64_1x1x1_Coop";
  } else if (m_expert <= 128) {
    schedule = "Kernel_256x128_2x1x1_Coop";
  } else {  // m_expert > 128
    schedule = "Kernel_128x256_2x1x1_Coop";
  }

  mm_dispatch(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
              b_group_scales, b_group_size, expert_offsets, problem_sizes,
              a_strides, b_strides, c_strides, group_scale_strides, schedule);
}
|
||||
|
||||
// Encodes and reorders packed int4 weights for the grouped W4A8 GEMM.
//
// b_tensors must be a contiguous 3-D int32 CUDA tensor (experts, n, k), where
// the last dimension holds PackFactor int4 values per int32. The logical k
// (size(2) * PackFactor) must be divisible by 256 and n by 16.
//
// Returns {b_tensors_packed, packed_layout}:
//   b_tensors_packed - same shape/dtype as b_tensors (allocated with
//                      empty_like), holding the encoded and reordered weights.
//   packed_layout    - int32 tensor of shape (num_experts, layout_width) on
//                      b_tensors' device; each row is a byte copy of the
//                      LayoutB_Reordered object used for the reordering.
std::tuple<torch::Tensor, torch::Tensor> encode_and_reorder_int4b(
    torch::Tensor const& b_tensors) {
  TORCH_CHECK(b_tensors.dtype() == torch::kInt32,
              "b_tensors must have dtype int32");
  TORCH_CHECK(b_tensors.dim() == 3,
              "b_tensors must be 3-D: (experts, n, k)");
  TORCH_CHECK(b_tensors.is_contiguous(), "b_tensors must be contiguous");
  TORCH_CHECK(b_tensors.is_cuda(), "b_tensors must be a CUDA tensor");

  int n = static_cast<int>(b_tensors.size(1));
  int k = static_cast<int>(b_tensors.size(2)) * PackFactor;  // logical k

  // CUTLASS reorder_tensor requires k % 256 == 0 and n % 16 == 0.
  // These misalignments cause silent OOB unless run under Compute Sanitizer.
  TORCH_CHECK(k % 256 == 0, "logical k must be divisible by 256");
  TORCH_CHECK(n % 16 == 0, "n must be divisible by 16");

  // we will store the layout to an int32 tensor;
  // this is the number of elements we need per layout
  constexpr size_t layout_width = sizeof(LayoutB_Reordered) / sizeof(int32_t);

  torch::Tensor b_tensors_packed = torch::empty_like(b_tensors);
  int num_experts = static_cast<int>(b_tensors.size(0));

  auto b_ptr = static_cast<QuantType const*>(b_tensors.const_data_ptr());
  auto b_packed_ptr = static_cast<QuantType*>(b_tensors_packed.data_ptr());

  // multiply by ull so result does not overflow int32
  size_t num_int4_elems = 1ull * num_experts * n * k;
  bool ok = vllm::cutlass_w4a8_utils::unified_encode_int4b(b_ptr, b_packed_ptr,
                                                           num_int4_elems);
  TORCH_CHECK(ok, "unified_encode_int4b failed");

  // construct the layout once; assumes each expert has the same layout
  using LayoutType = LayoutB_Reordered;
  auto stride_B = cutlass::make_cute_packed_stride(StrideB{}, {n, k, Int<1>{}});
  auto shape_B = cute::make_shape(n, k, Int<1>{});
  auto layout_B = make_layout(shape_B, stride_B);
  LayoutType layout_B_reordered = tile_to_shape(LayoutAtomQuant{}, shape_B);

  // reorder weights for each expert
  for (int i = 0; i < num_experts; i++) {
    // since the storage type of int4b is 1 byte but one element is 4 bits
    // we need to adjust the offset
    int64_t offset =
        1ull * i * n * k * cutlass::sizeof_bits<QuantType>::value / 8;
    cutlass::reorder_tensor(b_packed_ptr + offset, layout_B,
                            layout_B_reordered);
  }

  // save the packed layout to torch tensor so we can re-use it
  auto cpu_opts =
      torch::TensorOptions().dtype(torch::kInt32).device(torch::kCPU);
  torch::Tensor layout_cpu =
      torch::empty({num_experts, layout_width}, cpu_opts);

  // every expert row receives the same layout bytes
  int32_t* layout_data = layout_cpu.data_ptr<int32_t>();
  for (int i = 0; i < num_experts; ++i) {
    std::memcpy(layout_data + i * layout_width,  // dst (int32*)
                &layout_B_reordered,             // src (LayoutType*)
                sizeof(LayoutType));             // number of bytes
  }

  torch::Tensor packed_layout =
      layout_cpu.to(b_tensors.device(), /*non_blocking=*/false);

  return {b_tensors_packed, packed_layout};
}
|
||||
|
||||
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
|
||||
m.impl("cutlass_w4a8_moe_mm", &mm);
|
||||
m.impl("cutlass_encode_and_reorder_int4b_grouped", &encode_and_reorder_int4b);
|
||||
}
|
||||
|
||||
} // namespace vllm::cutlass_w4a8_moe
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@ -7,6 +7,7 @@
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <torch/all.h>
|
||||
#include "cutlass_extensions/torch_utils.hpp"
|
||||
#include "w4a8_utils.cuh"
|
||||
|
||||
#include "core/registration.h"
|
||||
|
||||
@ -395,71 +396,6 @@ torch::Tensor pack_scale_fp8(torch::Tensor const& scales) {
|
||||
return packed_scales;
|
||||
}
|
||||
|
||||
/*
|
||||
GPU-accelerated implementation of cutlass::unified_encode_int4b.
|
||||
Constructs a lookup table in constant memory to map 8 bits
|
||||
(two 4-bit values) at a time. Assumes memory is contiguous
|
||||
and pointers are 16-byte aligned.
|
||||
*/
|
||||
__constant__ uint8_t kNibbleLUT[256];
|
||||
|
||||
__global__ void unified_encode_int4b_device(const uint8_t* in, uint8_t* out,
|
||||
size_t nbytes) {
|
||||
constexpr size_t V = sizeof(uint4); // 16 bytes
|
||||
const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const size_t nthreads = size_t(gridDim.x) * blockDim.x;
|
||||
const size_t nvec = nbytes / V;
|
||||
|
||||
// 1-D grid-stride loop over 16-byte chunks
|
||||
for (size_t vec = tid; vec < nvec; vec += nthreads) {
|
||||
uint4 v = reinterpret_cast<const uint4*>(in)[vec];
|
||||
uint8_t* b = reinterpret_cast<uint8_t*>(&v);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < int(V); ++i) b[i] = kNibbleLUT[b[i]];
|
||||
reinterpret_cast<uint4*>(out)[vec] = v;
|
||||
}
|
||||
}
|
||||
|
||||
static bool upload_lut() {
|
||||
std::array<uint8_t, 256> lut{};
|
||||
auto map_nib = [](uint8_t v) -> uint8_t {
|
||||
// 1..7 -> (8 - v); keep 0 and 8..15
|
||||
return (v == 0 || (v & 0x8)) ? v : uint8_t(8 - v);
|
||||
};
|
||||
for (int b = 0; b < 256; ++b) {
|
||||
uint8_t lo = b & 0xF;
|
||||
uint8_t hi = (b >> 4) & 0xF;
|
||||
lut[b] = uint8_t((map_nib(hi) << 4) | map_nib(lo));
|
||||
}
|
||||
cudaError_t e = cudaMemcpyToSymbol(kNibbleLUT, lut.data(), lut.size(),
|
||||
/*offset=*/0, cudaMemcpyHostToDevice);
|
||||
|
||||
return (e == cudaSuccess);
|
||||
}
|
||||
|
||||
static bool unified_encode_int4b(cutlass::int4b_t const* in,
|
||||
cutlass::int4b_t* out, size_t num_int4_elems) {
|
||||
// Build/upload LUT
|
||||
if (!upload_lut()) return false;
|
||||
|
||||
static_assert(sizeof(typename cutlass::int4b_t::Storage) == 1,
|
||||
"int4 storage must be 1 byte");
|
||||
const size_t nbytes = num_int4_elems >> 1;
|
||||
|
||||
auto* in_bytes = reinterpret_cast<uint8_t const*>(in);
|
||||
auto* out_bytes = reinterpret_cast<uint8_t*>(out);
|
||||
|
||||
// kernel launch params
|
||||
constexpr int block = 256;
|
||||
const size_t nvec = nbytes / sizeof(uint4); // # of 16B vectors
|
||||
int grid = int((nvec + block - 1) / block);
|
||||
if (grid == 0) grid = 1; // ensure we still cover the tail in the kernel
|
||||
|
||||
unified_encode_int4b_device<<<grid, block>>>(in_bytes, out_bytes, nbytes);
|
||||
cudaError_t err = cudaGetLastError();
|
||||
return (err == cudaSuccess);
|
||||
}
|
||||
|
||||
torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) {
|
||||
TORCH_CHECK(B.dtype() == torch::kInt32);
|
||||
TORCH_CHECK(B.dim() == 2);
|
||||
@ -477,8 +413,8 @@ torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) {
|
||||
LayoutB_Reordered layout_B_reordered =
|
||||
cute::tile_to_shape(LayoutAtomQuant{}, shape_B);
|
||||
|
||||
bool ok =
|
||||
vllm::cutlass_w4a8::unified_encode_int4b(B_ptr, B_packed_ptr, n * k);
|
||||
bool ok = vllm::cutlass_w4a8_utils::unified_encode_int4b(B_ptr, B_packed_ptr,
|
||||
n * k);
|
||||
TORCH_CHECK(ok, "unified_encode_int4b failed");
|
||||
cutlass::reorder_tensor(B_packed_ptr, layout_B, layout_B_reordered);
|
||||
|
||||
|
||||
90
csrc/quantization/cutlass_w4a8/w4a8_utils.cu
Normal file
90
csrc/quantization/cutlass_w4a8/w4a8_utils.cu
Normal file
@ -0,0 +1,90 @@
|
||||
#include "w4a8_utils.cuh"
|
||||
|
||||
#include <array>
|
||||
#include <cuda_runtime.h>
|
||||
#include <cstdio>
|
||||
|
||||
namespace vllm::cutlass_w4a8_utils {
|
||||
|
||||
/*
|
||||
GPU-accelerated implementation of cutlass::unified_encode_int4b.
|
||||
Constructs a lookup table in constant memory to map 8 bits
|
||||
(two 4-bit values) at a time. Assumes memory is contiguous
|
||||
and pointers are 16-byte aligned.
|
||||
*/
|
||||
__constant__ uint8_t kNibbleLUT[256];
|
||||
|
||||
// Maps every byte of `in` through kNibbleLUT and stores the result in `out`.
//
//   in, out - byte buffers of length nbytes; the vectorised path reads and
//             writes them as uint4, so both are assumed to be 16-byte aligned.
//   nbytes  - number of bytes to translate.
//
// Whole 16-byte chunks are processed in a grid-stride loop; any remaining
// (nbytes % 16) bytes are then translated one byte at a time so that no part
// of `out` is left unwritten.
__global__ void unified_encode_int4b_device(const uint8_t* in, uint8_t* out,
                                            size_t nbytes) {
  constexpr size_t V = sizeof(uint4);  // 16 bytes
  // Widen before multiplying so the global index cannot wrap in 32 bits.
  const size_t tid = size_t(blockIdx.x) * blockDim.x + threadIdx.x;
  const size_t nthreads = size_t(gridDim.x) * blockDim.x;
  const size_t nvec = nbytes / V;

  // 1-D grid-stride loop over 16-byte chunks
  for (size_t vec = tid; vec < nvec; vec += nthreads) {
    uint4 v = reinterpret_cast<const uint4*>(in)[vec];
    uint8_t* b = reinterpret_cast<uint8_t*>(&v);
#pragma unroll
    for (int i = 0; i < int(V); ++i) b[i] = kNibbleLUT[b[i]];
    reinterpret_cast<uint4*>(out)[vec] = v;
  }

  // Tail: bytes past the last full 16-byte chunk (at most 15 of them).
  for (size_t i = nvec * V + tid; i < nbytes; i += nthreads) {
    out[i] = kNibbleLUT[in[i]];
  }
}
|
||||
|
||||
static bool upload_lut() {
|
||||
std::array<uint8_t, 256> lut{};
|
||||
auto map_nib = [](uint8_t v) -> uint8_t {
|
||||
// 1..7 -> (8 - v); keep 0 and 8..15
|
||||
return (v == 0 || (v & 0x8)) ? v : uint8_t(8 - v);
|
||||
};
|
||||
for (int b = 0; b < 256; ++b) {
|
||||
uint8_t lo = b & 0xF;
|
||||
uint8_t hi = (b >> 4) & 0xF;
|
||||
lut[b] = uint8_t((map_nib(hi) << 4) | map_nib(lo));
|
||||
}
|
||||
cudaError_t e = cudaMemcpyToSymbol(kNibbleLUT, lut.data(), lut.size(),
|
||||
/*offset=*/0, cudaMemcpyHostToDevice);
|
||||
|
||||
return (e == cudaSuccess);
|
||||
}
|
||||
|
||||
// Host wrapper: uploads the nibble lookup table, launches
// unified_encode_int4b_device over num_int4_elems / 2 bytes, and waits for the
// device to finish.
//
//   in, out        - int4 buffers, passed to the kernel as raw bytes.
//   num_int4_elems - number of 4-bit elements; two elements share one byte.
//
// Returns false if the table upload, the kernel launch, or the following
// device synchronisation reports a CUDA error; a diagnostic is written to
// stderr for the latter two. Returns true otherwise.
bool unified_encode_int4b(cutlass::int4b_t const* in, cutlass::int4b_t* out,
                          size_t num_int4_elems) {
  // Build/upload LUT
  if (!upload_lut()) return false;

  static_assert(sizeof(typename cutlass::int4b_t::Storage) == 1,
                "int4 storage must be 1 byte");
  const size_t nbytes = num_int4_elems >> 1;

  auto* in_bytes = reinterpret_cast<uint8_t const*>(in);
  auto* out_bytes = reinterpret_cast<uint8_t*>(out);

  // kernel launch params
  constexpr unsigned int block = 256;
  // Upper bound on the block count: keeps grid * block representable in
  // 32 bits. The kernel strides over the input, so a clamped grid still
  // visits every 16-byte vector.
  constexpr size_t max_grid = 0xFFFFFFFFu / block;
  const size_t nvec = nbytes / sizeof(uint4);  // # of 16B vectors
  size_t blocks = (nvec + block - 1) / block;
  if (blocks == 0) blocks = 1;  // always launch at least one block
  if (blocks > max_grid) blocks = max_grid;
  const unsigned int grid = static_cast<unsigned int>(blocks);

  unified_encode_int4b_device<<<grid, block>>>(in_bytes, out_bytes, nbytes);

  // launch errors
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    fprintf(stderr, "unified_encode_int4b_device launch error: %s (%d)\n",
            cudaGetErrorString(err), static_cast<int>(err));
    return false;
  }

  // runtime errors
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess) {
    fprintf(stderr, "unified_encode_int4b_device runtime error: %s (%d)\n",
            cudaGetErrorString(err), static_cast<int>(err));
    return false;
  }

  return true;
}
|
||||
|
||||
} // namespace vllm::cutlass_w4a8_utils
|
||||
11
csrc/quantization/cutlass_w4a8/w4a8_utils.cuh
Normal file
11
csrc/quantization/cutlass_w4a8/w4a8_utils.cuh
Normal file
@ -0,0 +1,11 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
namespace vllm::cutlass_w4a8_utils {
|
||||
|
||||
bool unified_encode_int4b(cutlass::int4b_t const* in, cutlass::int4b_t* out,
|
||||
size_t num_int4_elems);
|
||||
|
||||
} // namespace vllm::cutlass_w4a8_utils
|
||||
@ -31,14 +31,15 @@ __device__ void rms_norm_dynamic_per_token_quant_vec(
|
||||
|
||||
// RMS Norm + Quant
|
||||
if constexpr (std::is_same_v<scalar_out_t, int8_t>) {
|
||||
token_scale = 1.0f / token_scale;
|
||||
vllm::vectorized::norm_and_quant<scalar_t, scalar_out_t, true,
|
||||
has_residual>(
|
||||
out, input, weight, rms, 1.0f / token_scale, hidden_size, residual);
|
||||
out, input, weight, rms, &token_scale, hidden_size, residual);
|
||||
} else {
|
||||
// FP8 - Do not invert token_scale for exact match with FBGemm
|
||||
vllm::vectorized::norm_and_quant<scalar_t, scalar_out_t, false,
|
||||
has_residual>(
|
||||
out, input, weight, rms, token_scale, hidden_size, residual);
|
||||
out, input, weight, rms, &token_scale, hidden_size, residual);
|
||||
}
|
||||
}
|
||||
|
||||
@ -75,14 +76,52 @@ __global__ void rms_norm_dynamic_per_token_quant_kernel(
|
||||
|
||||
// RMS Norm + Quant
|
||||
if constexpr (std::is_same_v<scalar_out_t, int8_t>) {
|
||||
token_scale = 1.0f / token_scale;
|
||||
vllm::norm_and_quant<scalar_t, scalar_out_t, true, has_residual>(
|
||||
out, input, weight, rms, 1.0f / token_scale, hidden_size, residual);
|
||||
out, input, weight, rms, &token_scale, hidden_size, residual);
|
||||
} else {
|
||||
// FP8 - Do not invert s_token_scale for exact match with FBGemm
|
||||
vllm::norm_and_quant<scalar_t, scalar_out_t, false, has_residual>(
|
||||
out, input, weight, rms, token_scale, hidden_size, residual);
|
||||
out, input, weight, rms, &token_scale, hidden_size, residual);
|
||||
}
|
||||
}
|
||||
|
||||
// RMS norm + quant kernel
|
||||
template <typename scalar_t, typename scalar_out_t, bool has_residual = false,
|
||||
bool is_scale_transposed = false, int32_t group_size = 0>
|
||||
__global__ void rms_norm_per_block_quant_kernel(
|
||||
scalar_out_t* __restrict__ out, // [..., hidden_size]
|
||||
float* __restrict__ scales, // [num_tokens, hidden_size / group_size]
|
||||
// or
|
||||
// [hidden_size / group_size, num_tokens]
|
||||
scalar_t const* __restrict__ input, // [..., hidden_size]
|
||||
scalar_t const* __restrict__ weight, // [hidden_size]
|
||||
float const* scale_ub, float const var_epsilon, int32_t const hidden_size,
|
||||
scalar_t* __restrict__ residual = nullptr) {
|
||||
float rms;
|
||||
// Compute RMS
|
||||
// Always able to vectorize due to constraints on hidden_size
|
||||
vllm::vectorized::compute_rms<scalar_t, has_residual>(
|
||||
&rms, input, hidden_size, var_epsilon, residual);
|
||||
|
||||
// Compute Scale
|
||||
// Always able to vectorize due to constraints on hidden_size and group_size
|
||||
vllm::vectorized::compute_dynamic_per_token_scales<
|
||||
scalar_t, scalar_out_t, has_residual, is_scale_transposed, group_size>(
|
||||
nullptr, scales, input, weight, rms, scale_ub, hidden_size, residual);
|
||||
|
||||
// RMS Norm + Quant
|
||||
// Always able to vectorize due to constraints on hidden_size
|
||||
// For int8, don't invert token_scale here: do it inside the norm_and_quant
|
||||
// kernel. We do it because particular elements of token_scale can be shared
|
||||
// between multiple threads, so this way, we avoid extra synchronization
|
||||
// overhead.
|
||||
vllm::vectorized::norm_and_quant<
|
||||
scalar_t, scalar_out_t, std::is_same_v<scalar_out_t, int8_t>,
|
||||
has_residual, is_scale_transposed, group_size>(
|
||||
out, input, weight, rms, scales, hidden_size, residual);
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
|
||||
// Residual add + RMS norm + dynamic per token
|
||||
@ -103,30 +142,19 @@ void rms_norm_dynamic_per_token_quant_dispatch(
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
if (residual.has_value()) {
|
||||
VLLM_DISPATCH_BOOL(residual.has_value(), has_residual, [&] {
|
||||
VLLM_DISPATCH_QUANT_TYPES(
|
||||
out.scalar_type(), "rms_norm_dynamic_per_token_quant_kernel", [&] {
|
||||
vllm::rms_norm_dynamic_per_token_quant_kernel<scalar_in_t, scalar_t,
|
||||
true>
|
||||
has_residual>
|
||||
<<<grid, block, 0, stream>>>(
|
||||
out.data_ptr<scalar_t>(), scales.data_ptr<float>(),
|
||||
input.data_ptr<scalar_in_t>(), weight.data_ptr<scalar_in_t>(),
|
||||
scale_ub.has_value() ? scale_ub->data_ptr<float>() : nullptr,
|
||||
var_epsilon, hidden_size, residual->data_ptr<scalar_in_t>());
|
||||
var_epsilon, hidden_size,
|
||||
has_residual ? residual->data_ptr<scalar_in_t>() : nullptr);
|
||||
});
|
||||
|
||||
} else {
|
||||
VLLM_DISPATCH_QUANT_TYPES(
|
||||
out.scalar_type(), "rms_norm_dynamic_per_token_quant_kernel", [&] {
|
||||
vllm::rms_norm_dynamic_per_token_quant_kernel<scalar_in_t, scalar_t,
|
||||
false>
|
||||
<<<grid, block, 0, stream>>>(
|
||||
out.data_ptr<scalar_t>(), scales.data_ptr<float>(),
|
||||
input.data_ptr<scalar_in_t>(), weight.data_ptr<scalar_in_t>(),
|
||||
scale_ub.has_value() ? scale_ub->data_ptr<float>() : nullptr,
|
||||
var_epsilon, hidden_size, nullptr);
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
void rms_norm_dynamic_per_token_quant(
|
||||
@ -157,3 +185,79 @@ void rms_norm_dynamic_per_token_quant(
|
||||
out, input, weight, scales, var_epsilon, scale_ub, residual);
|
||||
});
|
||||
}
|
||||
|
||||
// Residual add + RMS norm + dynamic per token
|
||||
void rms_norm_per_block_quant_dispatch(
|
||||
torch::Tensor& out, // [..., hidden_size]
|
||||
torch::Tensor const& input, // [..., hidden_size]
|
||||
torch::Tensor const& weight, // [hidden_size]
|
||||
torch::Tensor& scales, // [num_tokens, hidden_size / group_size] or
|
||||
// [hidden_size / group_size, num_tokens]
|
||||
int32_t group_size,
|
||||
double const var_epsilon, // Variance epsilon used in norm calculation
|
||||
std::optional<at::Tensor> const& scale_ub,
|
||||
std::optional<at::Tensor>& residual, bool is_scale_transposed) {
|
||||
int32_t hidden_size = input.size(-1);
|
||||
auto num_tokens = input.numel() / hidden_size;
|
||||
|
||||
dim3 grid(num_tokens);
|
||||
const int max_block_size = (num_tokens <= 256) ? 512 : 256;
|
||||
dim3 block(std::min(hidden_size, max_block_size));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
VLLM_DISPATCH_FLOATING_TYPES(
|
||||
input.scalar_type(), "rms_norm_per_block_quant_fp_dispatch", [&] {
|
||||
using scalar_in_t = scalar_t;
|
||||
VLLM_DISPATCH_GROUP_SIZE(group_size, gs, [&] {
|
||||
VLLM_DISPATCH_BOOL(residual.has_value(), has_residual, [&] {
|
||||
VLLM_DISPATCH_BOOL(is_scale_transposed, transpose_scale, [&] {
|
||||
VLLM_DISPATCH_QUANT_TYPES(
|
||||
out.scalar_type(), "rms_norm_per_block_quant_kernel", [&] {
|
||||
vllm::rms_norm_per_block_quant_kernel<scalar_in_t, scalar_t,
|
||||
has_residual,
|
||||
transpose_scale, gs>
|
||||
<<<grid, block, 0, stream>>>(
|
||||
out.data_ptr<scalar_t>(), scales.data_ptr<float>(),
|
||||
input.data_ptr<scalar_in_t>(),
|
||||
weight.data_ptr<scalar_in_t>(),
|
||||
scale_ub.has_value() ? scale_ub->data_ptr<float>()
|
||||
: nullptr,
|
||||
var_epsilon, hidden_size,
|
||||
has_residual ? residual->data_ptr<scalar_in_t>()
|
||||
: nullptr);
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
void rms_norm_per_block_quant(torch::Tensor& out, torch::Tensor const& input,
|
||||
torch::Tensor const& weight,
|
||||
torch::Tensor& scales, double const var_epsilon,
|
||||
std::optional<torch::Tensor> scale_ub,
|
||||
std::optional<torch::Tensor> residual,
|
||||
int64_t group_size, bool is_scale_transposed) {
|
||||
static c10::ScalarType kFp8Type = is_fp8_ocp()
|
||||
? c10::ScalarType::Float8_e4m3fn
|
||||
: c10::ScalarType::Float8_e4m3fnuz;
|
||||
TORCH_CHECK(out.dtype() == kFp8Type || out.dtype() == torch::kInt8);
|
||||
TORCH_CHECK(out.is_contiguous() && input.is_contiguous());
|
||||
|
||||
if (scale_ub.has_value()) {
|
||||
TORCH_CHECK(out.dtype() == kFp8Type);
|
||||
}
|
||||
TORCH_CHECK(weight.dtype() == input.dtype());
|
||||
TORCH_CHECK(scales.dtype() == torch::kFloat32);
|
||||
if (residual) {
|
||||
TORCH_CHECK(residual->scalar_type() == input.scalar_type());
|
||||
}
|
||||
|
||||
TORCH_CHECK(group_size == 128 || group_size == 64,
|
||||
"Unsupported group size: ", group_size);
|
||||
|
||||
rms_norm_per_block_quant_dispatch(out, input, weight, scales, group_size,
|
||||
var_epsilon, scale_ub, residual,
|
||||
is_scale_transposed);
|
||||
}
|
||||
@ -9,6 +9,7 @@
|
||||
#include "quant_conversions.cuh"
|
||||
|
||||
#include "../../cub_helpers.h"
|
||||
#include "../../cuda_compat.h"
|
||||
|
||||
namespace vllm {
|
||||
|
||||
@ -43,62 +44,150 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
|
||||
*rms = s_rms;
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename scalar_out_t, bool has_residual = false>
|
||||
__device__ float warpReduceMaxSpecialized(volatile float* val, int64_t tid,
|
||||
int64_t thread_in_warp,
|
||||
int64_t reduced_elems) {
|
||||
static_assert(WARP_SIZE == 32 || WARP_SIZE == 64);
|
||||
if constexpr (WARP_SIZE == 64) {
|
||||
if (thread_in_warp + 64 < reduced_elems)
|
||||
val[tid] = fmaxf(val[tid], val[tid + 64]);
|
||||
}
|
||||
if (thread_in_warp + 32 < reduced_elems)
|
||||
val[tid] = fmaxf(val[tid], val[tid + 32]);
|
||||
if (thread_in_warp + 16 < reduced_elems)
|
||||
val[tid] = fmaxf(val[tid], val[tid + 16]);
|
||||
if (thread_in_warp + 8 < reduced_elems)
|
||||
val[tid] = fmaxf(val[tid], val[tid + 8]);
|
||||
if (thread_in_warp + 4 < reduced_elems)
|
||||
val[tid] = fmaxf(val[tid], val[tid + 4]);
|
||||
if (thread_in_warp + 2 < reduced_elems)
|
||||
val[tid] = fmaxf(val[tid], val[tid + 2]);
|
||||
if (thread_in_warp + 1 < reduced_elems)
|
||||
val[tid] = fmaxf(val[tid], val[tid + 1]);
|
||||
return val[tid];
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename scalar_out_t, bool has_residual = false,
|
||||
bool is_scale_transposed = false>
|
||||
__device__ void compute_dynamic_per_token_scales(
|
||||
float* __restrict__ token_scale, float* __restrict__ all_token_scales,
|
||||
scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight,
|
||||
float const rms, float const* __restrict__ scale_ub,
|
||||
int32_t const hidden_size,
|
||||
scalar_t const* __restrict__ residual = nullptr) {
|
||||
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
|
||||
;
|
||||
constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};
|
||||
|
||||
int32_t const hidden_size, scalar_t const* __restrict__ residual = nullptr,
|
||||
int32_t const group_size = 0) {
|
||||
float block_absmax_val_maybe = 0.0f;
|
||||
for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
|
||||
float x = static_cast<float>(input[token_offset + i]);
|
||||
if constexpr (has_residual) {
|
||||
x += static_cast<float>(residual[token_offset + i]);
|
||||
}
|
||||
|
||||
x = static_cast<float>(static_cast<scalar_t>(x * rms) * weight[i]);
|
||||
block_absmax_val_maybe = fmaxf(block_absmax_val_maybe, fabsf(x));
|
||||
}
|
||||
|
||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
block_absmax_val_maybe =
|
||||
BlockReduce(reduceStore)
|
||||
.Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x);
|
||||
|
||||
__shared__ float s_token_scale;
|
||||
if (threadIdx.x == 0) {
|
||||
float scale = 0.0f;
|
||||
if (scale_ub) {
|
||||
scale = min(block_absmax_val_maybe, *scale_ub);
|
||||
} else {
|
||||
scale = block_absmax_val_maybe;
|
||||
}
|
||||
// token scale computation
|
||||
scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
|
||||
s_token_scale = scale; // Shared memory store
|
||||
all_token_scales[blockIdx.x] = scale; // Global output store
|
||||
}
|
||||
constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};
|
||||
__syncthreads();
|
||||
if (group_size > 0) {
|
||||
__shared__ float s_max_vals[1024];
|
||||
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
|
||||
int64_t num_groups = hidden_size / group_size;
|
||||
int64_t const threads_per_group = blockDim.x / num_groups;
|
||||
int64_t const thread_in_group = threadIdx.x % threads_per_group;
|
||||
int64_t const group_offset = threadIdx.x / threads_per_group * group_size;
|
||||
int64_t const thread_offset = group_offset + thread_in_group;
|
||||
int64_t const thread_end =
|
||||
min(group_offset + group_size, static_cast<int64_t>(hidden_size));
|
||||
for (auto i = thread_offset; i < thread_end; i += threads_per_group) {
|
||||
float x = static_cast<float>(input[token_offset + i]);
|
||||
if constexpr (has_residual) {
|
||||
x += static_cast<float>(residual[token_offset + i]);
|
||||
}
|
||||
x = static_cast<float>(static_cast<scalar_t>(x * rms) * weight[i]);
|
||||
block_absmax_val_maybe = fmaxf(block_absmax_val_maybe, fabsf(x));
|
||||
}
|
||||
s_max_vals[threadIdx.x] = block_absmax_val_maybe;
|
||||
__syncthreads();
|
||||
|
||||
*token_scale = s_token_scale;
|
||||
int64_t const warp_size = WARP_SIZE;
|
||||
int64_t const num_warps = blockDim.x / warp_size;
|
||||
int64_t const warp_id = threadIdx.x / warp_size;
|
||||
int64_t const thread_in_warp = threadIdx.x % warp_size;
|
||||
int64_t const groups_per_warp = (num_groups + num_warps - 1) / num_warps;
|
||||
for (auto i = 0; i < groups_per_warp; ++i) {
|
||||
int64_t const group_id = i * num_warps + warp_id;
|
||||
if (group_id < num_groups) {
|
||||
int64_t warp_start = group_id * threads_per_group;
|
||||
int64_t const start = warp_start + thread_in_warp;
|
||||
int64_t const warp_end = min(warp_start + threads_per_group,
|
||||
static_cast<int64_t>(hidden_size));
|
||||
for (auto j = start; j + warp_size < warp_end; j += warp_size) {
|
||||
s_max_vals[start] =
|
||||
fmaxf(s_max_vals[start], s_max_vals[j + warp_size]);
|
||||
}
|
||||
warpReduceMaxSpecialized(s_max_vals, start, thread_in_warp,
|
||||
min(warp_end - warp_start, warp_size));
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (thread_in_group == 0 && thread_offset < thread_end) {
|
||||
block_absmax_val_maybe = s_max_vals[threadIdx.x];
|
||||
float scale = 0.0f;
|
||||
if (scale_ub) {
|
||||
scale = min(block_absmax_val_maybe, *scale_ub);
|
||||
} else {
|
||||
scale = block_absmax_val_maybe;
|
||||
}
|
||||
// token scale computation
|
||||
scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
|
||||
// Global output store
|
||||
if constexpr (is_scale_transposed) {
|
||||
all_token_scales[(threadIdx.x / threads_per_group) * gridDim.x +
|
||||
blockIdx.x] = scale;
|
||||
} else {
|
||||
all_token_scales[blockIdx.x * num_groups +
|
||||
threadIdx.x / threads_per_group] = scale;
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
} else {
|
||||
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
|
||||
|
||||
for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
|
||||
float x = static_cast<float>(input[token_offset + i]);
|
||||
if constexpr (has_residual) {
|
||||
x += static_cast<float>(residual[token_offset + i]);
|
||||
}
|
||||
|
||||
x = static_cast<float>(static_cast<scalar_t>(x * rms) * weight[i]);
|
||||
block_absmax_val_maybe = fmaxf(block_absmax_val_maybe, fabsf(x));
|
||||
}
|
||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
block_absmax_val_maybe =
|
||||
BlockReduce(reduceStore)
|
||||
.Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x);
|
||||
|
||||
__shared__ float s_token_scale;
|
||||
if (threadIdx.x == 0) {
|
||||
float scale = 0.0f;
|
||||
if (scale_ub) {
|
||||
scale = min(block_absmax_val_maybe, *scale_ub);
|
||||
} else {
|
||||
scale = block_absmax_val_maybe;
|
||||
}
|
||||
// token scale computation
|
||||
scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
|
||||
s_token_scale = scale; // Shared memory store
|
||||
all_token_scales[blockIdx.x] = scale; // Global output store
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
*token_scale = s_token_scale;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename scalar_out_t, bool is_scale_inverted,
|
||||
bool has_residual = false>
|
||||
bool has_residual = false, bool is_scale_transposed = false>
|
||||
__device__ void norm_and_quant(scalar_out_t* __restrict__ output,
|
||||
scalar_t const* __restrict__ input,
|
||||
scalar_t const* __restrict__ weight,
|
||||
float const rms, float const scale,
|
||||
float const rms, float* const scale,
|
||||
int32_t const hidden_size,
|
||||
scalar_t* __restrict__ residual = nullptr) {
|
||||
scalar_t* __restrict__ residual = nullptr,
|
||||
int32_t const group_size = 0) {
|
||||
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
|
||||
;
|
||||
|
||||
for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
|
||||
float x = static_cast<float>(input[token_offset + i]);
|
||||
@ -109,8 +198,21 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
|
||||
// Norm
|
||||
x = static_cast<float>(static_cast<scalar_t>(x * rms) * weight[i]);
|
||||
// Quant
|
||||
// If groupwise is_scale_inverted is true, so we invert the scale here.
|
||||
int64_t scale_idx = 0;
|
||||
if (group_size > 0) {
|
||||
if constexpr (is_scale_transposed) {
|
||||
scale_idx = (i / group_size) * gridDim.x + blockIdx.x;
|
||||
} else {
|
||||
scale_idx = blockIdx.x * (hidden_size / group_size) + i / group_size;
|
||||
}
|
||||
}
|
||||
auto scale_val =
|
||||
(group_size > 0
|
||||
? (is_scale_inverted ? 1.0f / scale[scale_idx] : scale[scale_idx])
|
||||
: *scale);
|
||||
output[token_offset + i] =
|
||||
ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(x, scale);
|
||||
ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(x, scale_val);
|
||||
}
|
||||
}
|
||||
|
||||
@ -178,95 +280,191 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
|
||||
|
||||
// Vectorized version of vllm::compute_dynamic_per_token_scales
|
||||
// hidden_size must be a multiple of 4
|
||||
template <typename scalar_t, typename scalar_out_t, bool has_residual = false>
|
||||
template <typename scalar_t, typename scalar_out_t, bool has_residual = false,
|
||||
bool is_scale_transposed = false, int32_t group_size = 0>
|
||||
__device__ void compute_dynamic_per_token_scales(
|
||||
float* __restrict__ token_scale, float* __restrict__ all_token_scales,
|
||||
scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight,
|
||||
float const rms, float const* __restrict__ scale_ub,
|
||||
int32_t const hidden_size,
|
||||
scalar_t const* __restrict__ residual = nullptr) {
|
||||
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
|
||||
;
|
||||
|
||||
// Vectorized input/weight/residual to better utilize memory bandwidth.
|
||||
vec4_t<scalar_t> const* vec_input =
|
||||
reinterpret_cast<vec4_t<scalar_t> const*>(&input[token_offset]);
|
||||
vec4_t<scalar_t> const* vec_weight =
|
||||
reinterpret_cast<vec4_t<scalar_t> const*>(weight);
|
||||
vec4_t<scalar_t> const* vec_residual = nullptr;
|
||||
if constexpr (has_residual) {
|
||||
vec_residual =
|
||||
reinterpret_cast<vec4_t<scalar_t> const*>(&residual[token_offset]);
|
||||
}
|
||||
|
||||
constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};
|
||||
|
||||
const int VEC_SIZE = 4;
|
||||
int32_t const num_vec_elems = hidden_size >> 2;
|
||||
float block_absmax_val_maybe = 0.0f;
|
||||
|
||||
#pragma unroll 4
|
||||
for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
|
||||
vec4_t<scalar_t> in = vec_input[i];
|
||||
vec4_t<scalar_t> const w = vec_weight[i];
|
||||
// Vectorized input/weight/residual to better utilize memory bandwidth.
|
||||
vec4_t<scalar_t> const* vec_input = nullptr;
|
||||
vec4_t<scalar_t> const* vec_weight = nullptr;
|
||||
vec4_t<scalar_t> const* vec_residual = nullptr;
|
||||
|
||||
vec4_t<float> x;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < VEC_SIZE; ++j) {
|
||||
x.val[j] = static_cast<float>(in.val[j]);
|
||||
}
|
||||
if constexpr (group_size > 0) {
|
||||
__shared__ float s_max_vals[1024];
|
||||
|
||||
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
|
||||
int64_t const num_groups = hidden_size / group_size;
|
||||
int64_t const threads_per_group = blockDim.x / num_groups;
|
||||
int64_t const thread_in_group = threadIdx.x % threads_per_group;
|
||||
int64_t const group_offset =
|
||||
threadIdx.x / threads_per_group * (group_size >> 2);
|
||||
int64_t const thread_offset = group_offset + thread_in_group;
|
||||
int64_t const thread_end = min(group_offset + (group_size >> 2),
|
||||
static_cast<int64_t>(hidden_size >> 2));
|
||||
vec_input = reinterpret_cast<vec4_t<scalar_t> const*>(&input[token_offset]);
|
||||
vec_weight = reinterpret_cast<vec4_t<scalar_t> const*>(weight);
|
||||
if constexpr (has_residual) {
|
||||
vec4_t<scalar_t> r = vec_residual[i];
|
||||
vec_residual =
|
||||
reinterpret_cast<vec4_t<scalar_t> const*>(&residual[token_offset]);
|
||||
}
|
||||
int32_t const num_vec_elems = thread_end;
|
||||
|
||||
#pragma unroll 4
|
||||
for (auto i = thread_offset; i < num_vec_elems; i += threads_per_group) {
|
||||
vec4_t<scalar_t> in = vec_input[i];
|
||||
vec4_t<scalar_t> const w = vec_weight[i];
|
||||
|
||||
vec4_t<float> x;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < VEC_SIZE; ++j) {
|
||||
x.val[j] += static_cast<float>(r.val[j]);
|
||||
x.val[j] = static_cast<float>(in.val[j]);
|
||||
}
|
||||
|
||||
if constexpr (has_residual) {
|
||||
vec4_t<scalar_t> r = vec_residual[i];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < VEC_SIZE; ++j) {
|
||||
x.val[j] += static_cast<float>(r.val[j]);
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < VEC_SIZE; ++j) {
|
||||
block_absmax_val_maybe =
|
||||
fmaxf(block_absmax_val_maybe,
|
||||
fabs(static_cast<scalar_t>(x.val[j] * rms) * w.val[j]));
|
||||
}
|
||||
}
|
||||
|
||||
s_max_vals[threadIdx.x] = block_absmax_val_maybe;
|
||||
__syncthreads();
|
||||
|
||||
int64_t const warp_size = WARP_SIZE;
|
||||
int64_t const num_warps = blockDim.x / warp_size;
|
||||
int64_t const warp_id = threadIdx.x / warp_size;
|
||||
int64_t const thread_in_warp = threadIdx.x % warp_size;
|
||||
int64_t const groups_per_warp = (num_groups + num_warps - 1) / num_warps;
|
||||
for (auto i = 0; i < groups_per_warp; ++i) {
|
||||
int64_t const group_id = i * num_warps + warp_id;
|
||||
if (group_id < num_groups) {
|
||||
int64_t warp_start = group_id * threads_per_group;
|
||||
int64_t const start = warp_start + thread_in_warp;
|
||||
int64_t const warp_end = min(warp_start + threads_per_group,
|
||||
static_cast<int64_t>(hidden_size));
|
||||
for (auto j = start; j + warp_size < warp_end; j += warp_size) {
|
||||
s_max_vals[start] =
|
||||
fmaxf(s_max_vals[start], s_max_vals[j + warp_size]);
|
||||
}
|
||||
warpReduceMaxSpecialized(s_max_vals, start, thread_in_warp,
|
||||
min(warp_end - warp_start, warp_size));
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (thread_in_group == 0 && thread_offset < thread_end) {
|
||||
block_absmax_val_maybe = s_max_vals[threadIdx.x];
|
||||
float scale = 0.0f;
|
||||
if (scale_ub) {
|
||||
scale = min(block_absmax_val_maybe, *scale_ub);
|
||||
} else {
|
||||
scale = block_absmax_val_maybe;
|
||||
}
|
||||
// token scale computation
|
||||
scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
|
||||
// Global output store
|
||||
if constexpr (is_scale_transposed) {
|
||||
all_token_scales[(threadIdx.x / threads_per_group) * gridDim.x +
|
||||
blockIdx.x] = scale;
|
||||
} else {
|
||||
all_token_scales[blockIdx.x * num_groups +
|
||||
threadIdx.x / threads_per_group] = scale;
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
} else {
|
||||
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
|
||||
vec_input = reinterpret_cast<vec4_t<scalar_t> const*>(&input[token_offset]);
|
||||
vec_weight = reinterpret_cast<vec4_t<scalar_t> const*>(weight);
|
||||
if constexpr (has_residual) {
|
||||
vec_residual =
|
||||
reinterpret_cast<vec4_t<scalar_t> const*>(&residual[token_offset]);
|
||||
}
|
||||
|
||||
int32_t const num_vec_elems = (hidden_size >> 2);
|
||||
|
||||
#pragma unroll 4
|
||||
for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
|
||||
vec4_t<scalar_t> in = vec_input[i];
|
||||
vec4_t<scalar_t> const w = vec_weight[i];
|
||||
|
||||
vec4_t<float> x;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < VEC_SIZE; ++j) {
|
||||
block_absmax_val_maybe =
|
||||
fmaxf(block_absmax_val_maybe,
|
||||
fabs(static_cast<scalar_t>(x.val[j] * rms) * w.val[j]));
|
||||
for (int j = 0; j < VEC_SIZE; ++j) {
|
||||
x.val[j] = static_cast<float>(in.val[j]);
|
||||
}
|
||||
|
||||
if constexpr (has_residual) {
|
||||
vec4_t<scalar_t> r = vec_residual[i];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < VEC_SIZE; ++j) {
|
||||
x.val[j] += static_cast<float>(r.val[j]);
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < VEC_SIZE; ++j) {
|
||||
block_absmax_val_maybe =
|
||||
fmaxf(block_absmax_val_maybe,
|
||||
fabs(static_cast<scalar_t>(x.val[j] * rms) * w.val[j]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
block_absmax_val_maybe =
|
||||
BlockReduce(reduceStore)
|
||||
.Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x);
|
||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
block_absmax_val_maybe =
|
||||
BlockReduce(reduceStore)
|
||||
.Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x);
|
||||
|
||||
__shared__ float s_token_scale;
|
||||
if (threadIdx.x == 0) {
|
||||
float scale = 0.0f;
|
||||
if (scale_ub) {
|
||||
scale = min(block_absmax_val_maybe, *scale_ub);
|
||||
} else {
|
||||
scale = block_absmax_val_maybe;
|
||||
__shared__ float s_token_scale;
|
||||
if (threadIdx.x == 0) {
|
||||
float scale = 0.0f;
|
||||
if (scale_ub) {
|
||||
scale = min(block_absmax_val_maybe, *scale_ub);
|
||||
} else {
|
||||
scale = block_absmax_val_maybe;
|
||||
}
|
||||
// token scale computation
|
||||
scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
|
||||
s_token_scale = scale; // shared memory store
|
||||
all_token_scales[blockIdx.x] = scale; // global output store
|
||||
}
|
||||
// token scale computation
|
||||
scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
|
||||
s_token_scale = scale; // shared memory store
|
||||
all_token_scales[blockIdx.x] = scale; // global output store
|
||||
}
|
||||
__syncthreads();
|
||||
__syncthreads();
|
||||
|
||||
*token_scale = s_token_scale;
|
||||
*token_scale = s_token_scale;
|
||||
}
|
||||
}
|
||||
|
||||
// hidden_size must be a multiple of 4
|
||||
template <typename scalar_t, typename scalar_out_t, bool is_scale_inverted,
|
||||
bool has_residual = false>
|
||||
bool has_residual = false, bool is_scale_transposed = false,
|
||||
int32_t group_size = 0>
|
||||
__device__ void norm_and_quant(scalar_out_t* __restrict__ output,
|
||||
scalar_t const* __restrict__ input,
|
||||
scalar_t const* __restrict__ weight,
|
||||
float const rms, float const scale,
|
||||
float const rms, float* const scale,
|
||||
int32_t const hidden_size,
|
||||
scalar_t* __restrict__ residual = nullptr) {
|
||||
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
|
||||
;
|
||||
|
||||
// Vectorized input/output/weight/residual to better utilize memory bandwidth.
|
||||
vec4_t<scalar_t> const* vec_input =
|
||||
@ -311,10 +509,26 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
|
||||
}
|
||||
|
||||
q8x4_t<scalar_out_t> out;
|
||||
|
||||
float scale_val;
|
||||
|
||||
if constexpr (group_size > 0) {
|
||||
int64_t const num_groups = hidden_size / group_size;
|
||||
int64_t scale_idx = 0;
|
||||
if constexpr (is_scale_transposed) {
|
||||
scale_idx = (i * VEC_SIZE / group_size) * gridDim.x + blockIdx.x;
|
||||
} else {
|
||||
scale_idx = blockIdx.x * num_groups + i * VEC_SIZE / group_size;
|
||||
}
|
||||
scale_val =
|
||||
is_scale_inverted ? 1.0f / scale[scale_idx] : scale[scale_idx];
|
||||
} else {
|
||||
scale_val = *scale;
|
||||
}
|
||||
#pragma unroll
|
||||
for (int j = 0; j < VEC_SIZE; ++j) {
|
||||
out.val[j] = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
|
||||
static_cast<scalar_t>(x.val[j] * rms) * w.val[j], scale);
|
||||
static_cast<scalar_t>(x.val[j] * rms) * w.val[j], scale_val);
|
||||
}
|
||||
vec_output[i] = out;
|
||||
}
|
||||
|
||||
@ -617,7 +617,7 @@ struct MacheteCollectiveMma {
|
||||
|
||||
// Same as upstream, should be kept the same when possible, not formatted for
|
||||
// easier comparison
|
||||
// with `SwapAB ? N : M -> M` since we dont support SwapAB
|
||||
// with `SwapAB ? N : M -> M` since we don't support SwapAB
|
||||
// clang-format off
|
||||
template<class ProblemShape>
|
||||
static bool
|
||||
|
||||
@ -136,15 +136,17 @@ inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids,
|
||||
void get_cutlass_moe_mm_problem_sizes_caller(
|
||||
const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
|
||||
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
|
||||
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets) {
|
||||
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
|
||||
std::optional<bool> force_swap_ab = std::nullopt) {
|
||||
auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index());
|
||||
auto options_int32 =
|
||||
torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device());
|
||||
torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32);
|
||||
|
||||
// Swap-AB should be disabled for FP4 path
|
||||
bool may_swap_ab = (!blockscale_offsets.has_value()) &&
|
||||
(topk_ids.numel() <= SWAP_AB_THRESHOLD);
|
||||
bool may_swap_ab =
|
||||
force_swap_ab.value_or((!blockscale_offsets.has_value()) &&
|
||||
(topk_ids.numel() <= SWAP_AB_THRESHOLD));
|
||||
|
||||
launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2,
|
||||
atomic_buffer, num_experts, n, k, stream,
|
||||
|
||||
@ -80,7 +80,8 @@ void get_cutlass_moe_mm_data_caller(
|
||||
void get_cutlass_moe_mm_problem_sizes_caller(
|
||||
const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
|
||||
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
|
||||
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets);
|
||||
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
|
||||
std::optional<bool> force_swap_ab = std::nullopt);
|
||||
|
||||
void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
|
||||
torch::Tensor& problem_sizes1,
|
||||
@ -303,14 +304,15 @@ void get_cutlass_moe_mm_data(
|
||||
void get_cutlass_moe_mm_problem_sizes(
|
||||
const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
|
||||
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
|
||||
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets) {
|
||||
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
|
||||
std::optional<bool> force_swap_ab = std::nullopt) {
|
||||
int32_t version_num = get_sm_version_num();
|
||||
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
|
||||
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
|
||||
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
|
||||
get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1,
|
||||
problem_sizes2, num_experts, n, k,
|
||||
blockscale_offsets);
|
||||
blockscale_offsets, force_swap_ab);
|
||||
return;
|
||||
#endif
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||
|
||||
@ -22,6 +22,62 @@ __device__ __forceinline__ float GroupReduceMax(float val) {
|
||||
return val;
|
||||
}
|
||||
|
||||
template <typename T, bool SCALE_UE8M0>
|
||||
__device__ __forceinline__ float ComputeGroupScale(
|
||||
const T* __restrict__ group_input, T* __restrict__ smem_group,
|
||||
const int group_size, const int lane_id, const int threads_per_group,
|
||||
const float eps, const float max_8bit) {
|
||||
float local_absmax = eps;
|
||||
|
||||
constexpr int vec_size = 16 / sizeof(T);
|
||||
|
||||
// copy global -> shared & compute absmax
|
||||
auto scalar_op_cache = [&] __device__(T & dst, const T& src) {
|
||||
float abs_v = fabsf(static_cast<float>(src));
|
||||
local_absmax = fmaxf(local_absmax, abs_v);
|
||||
dst = src;
|
||||
};
|
||||
|
||||
vllm::vectorize_with_alignment<vec_size>(
|
||||
group_input, // in
|
||||
smem_group, // out (shared)
|
||||
group_size, // elements per group
|
||||
lane_id, // thread id
|
||||
threads_per_group, // stride in group
|
||||
scalar_op_cache); // scalar handler
|
||||
|
||||
local_absmax = GroupReduceMax(local_absmax);
|
||||
|
||||
float y_s = local_absmax / max_8bit;
|
||||
if constexpr (SCALE_UE8M0) {
|
||||
y_s = exp2f(ceilf(log2f(fmaxf(fabsf(y_s), 1e-10f))));
|
||||
}
|
||||
|
||||
return y_s;
|
||||
}
|
||||
|
||||
template <typename T, typename DST_DTYPE>
|
||||
__device__ __forceinline__ void QuantizeGroup(
|
||||
const T* __restrict__ smem_group, DST_DTYPE* __restrict__ group_output,
|
||||
const int group_size, const int lane_id, const int threads_per_group,
|
||||
const float y_s, const float min_8bit, const float max_8bit) {
|
||||
constexpr int vec_size = 16 / sizeof(T);
|
||||
|
||||
// quantize shared -> global 8-bit
|
||||
auto scalar_op_quant = [&] __device__(DST_DTYPE & dst, const T& src) {
|
||||
float q = fminf(fmaxf(static_cast<float>(src) / y_s, min_8bit), max_8bit);
|
||||
dst = DST_DTYPE(q);
|
||||
};
|
||||
|
||||
vllm::vectorize_with_alignment<vec_size>(
|
||||
smem_group, // in (shared)
|
||||
group_output, // out (global quant tensor)
|
||||
group_size, // elements
|
||||
lane_id, // tid
|
||||
threads_per_group, // stride
|
||||
scalar_op_quant); // scalar handler
|
||||
}
|
||||
|
||||
template <typename T, typename DST_DTYPE, bool IS_COLUMN_MAJOR = false,
|
||||
bool SCALE_UE8M0 = false, typename scale_packed_t = float>
|
||||
__global__ void per_token_group_quant_8bit_kernel(
|
||||
@ -38,8 +94,6 @@ __global__ void per_token_group_quant_8bit_kernel(
|
||||
const int64_t global_group_id = block_group_id + local_group_id;
|
||||
const int64_t block_group_offset = global_group_id * group_size;
|
||||
|
||||
float local_absmax = eps;
|
||||
|
||||
using scale_element_t = float;
|
||||
static_assert(sizeof(scale_packed_t) % sizeof(scale_element_t) == 0);
|
||||
|
||||
@ -68,30 +122,9 @@ __global__ void per_token_group_quant_8bit_kernel(
|
||||
T* smem = reinterpret_cast<T*>(smem_raw);
|
||||
T* smem_group = smem + local_group_id * group_size;
|
||||
|
||||
constexpr int vec_size = 16 / sizeof(T);
|
||||
using vec_t = vllm::vec_n_t<T, vec_size>;
|
||||
|
||||
// copy global -> shared & compute absmax
|
||||
auto scalar_op_cache = [&] __device__(T & dst, const T& src) {
|
||||
float abs_v = fabsf(static_cast<float>(src));
|
||||
local_absmax = fmaxf(local_absmax, abs_v);
|
||||
dst = src;
|
||||
};
|
||||
|
||||
vllm::vectorize_with_alignment<vec_size>(
|
||||
group_input, // in
|
||||
smem_group, // out (shared)
|
||||
group_size, // elements per group
|
||||
lane_id, // thread id
|
||||
threads_per_group, // stride in group
|
||||
scalar_op_cache); // scalar handler
|
||||
|
||||
local_absmax = GroupReduceMax(local_absmax);
|
||||
|
||||
float y_s = local_absmax / max_8bit;
|
||||
if constexpr (SCALE_UE8M0) {
|
||||
y_s = exp2f(ceilf(log2f(fmaxf(fabsf(y_s), 1e-10f))));
|
||||
}
|
||||
const float y_s = ComputeGroupScale<T, SCALE_UE8M0>(
|
||||
group_input, smem_group, group_size, lane_id, threads_per_group, eps,
|
||||
max_8bit);
|
||||
|
||||
scale_element_t y_s_quant = y_s;
|
||||
|
||||
@ -101,19 +134,24 @@ __global__ void per_token_group_quant_8bit_kernel(
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// quantize shared -> global 8-bit
|
||||
auto scalar_op_quant = [&] __device__(DST_DTYPE & dst, const T& src) {
|
||||
float q = fminf(fmaxf(static_cast<float>(src) / y_s, min_8bit), max_8bit);
|
||||
dst = DST_DTYPE(q);
|
||||
};
|
||||
QuantizeGroup<T, DST_DTYPE>(smem_group, group_output, group_size, lane_id,
|
||||
threads_per_group, y_s, min_8bit, max_8bit);
|
||||
}
|
||||
|
||||
vllm::vectorize_with_alignment<vec_size>(
|
||||
smem_group, // in (shared)
|
||||
group_output, // out (global quant tensor)
|
||||
group_size, // elements
|
||||
lane_id, // tid
|
||||
threads_per_group, // stride
|
||||
scalar_op_quant); // scalar handler
|
||||
inline int GetGroupsPerBlock(int64_t num_groups) {
|
||||
if (num_groups % 16 == 0) {
|
||||
return 16;
|
||||
}
|
||||
if (num_groups % 8 == 0) {
|
||||
return 8;
|
||||
}
|
||||
if (num_groups % 4 == 0) {
|
||||
return 4;
|
||||
}
|
||||
if (num_groups % 2 == 0) {
|
||||
return 2;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
void per_token_group_quant_8bit(const torch::Tensor& input,
|
||||
@ -133,17 +171,7 @@ void per_token_group_quant_8bit(const torch::Tensor& input,
|
||||
|
||||
constexpr int THREADS_PER_GROUP = 16;
|
||||
|
||||
int groups_per_block = 1;
|
||||
|
||||
if (num_groups % 16 == 0) {
|
||||
groups_per_block = 16;
|
||||
} else if (num_groups % 8 == 0) {
|
||||
groups_per_block = 8;
|
||||
} else if (num_groups % 4 == 0) {
|
||||
groups_per_block = 4;
|
||||
} else if (num_groups % 2 == 0) {
|
||||
groups_per_block = 2;
|
||||
}
|
||||
const int groups_per_block = GetGroupsPerBlock(num_groups);
|
||||
|
||||
auto dst_type = output_q.scalar_type();
|
||||
const int num_blocks = num_groups / groups_per_block;
|
||||
@ -206,6 +234,148 @@ void per_token_group_quant_8bit(const torch::Tensor& input,
|
||||
#undef LAUNCH_KERNEL
|
||||
}
|
||||
|
||||
template <typename T, typename DST_DTYPE>
|
||||
__global__ void per_token_group_quant_8bit_packed_kernel(
|
||||
const T* __restrict__ input, void* __restrict__ output_q,
|
||||
unsigned int* __restrict__ output_s_packed, const int group_size,
|
||||
const int num_groups, const int groups_per_block, const int groups_per_row,
|
||||
const int mn, const int tma_aligned_mn, const float eps,
|
||||
const float min_8bit, const float max_8bit) {
|
||||
const int threads_per_group = 16;
|
||||
const int64_t local_group_id = threadIdx.x / threads_per_group;
|
||||
const int lane_id = threadIdx.x % threads_per_group;
|
||||
|
||||
const int64_t block_group_id = blockIdx.x * groups_per_block;
|
||||
const int64_t global_group_id = block_group_id + local_group_id;
|
||||
if (global_group_id >= num_groups) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t block_group_offset = global_group_id * group_size;
|
||||
|
||||
const T* group_input = input + block_group_offset;
|
||||
DST_DTYPE* group_output =
|
||||
static_cast<DST_DTYPE*>(output_q) + block_group_offset;
|
||||
|
||||
// shared memory to cache each group's data to avoid double DRAM reads.
|
||||
extern __shared__ __align__(16) char smem_raw[];
|
||||
T* smem = reinterpret_cast<T*>(smem_raw);
|
||||
T* smem_group = smem + local_group_id * group_size;
|
||||
const float y_s =
|
||||
ComputeGroupScale<T, true>(group_input, smem_group, group_size, lane_id,
|
||||
threads_per_group, eps, max_8bit);
|
||||
|
||||
// pack 4 scales into a uint32
|
||||
if (lane_id == 0) {
|
||||
// map flat group id to 2D indices (mn_idx, sf_k_idx)
|
||||
const int sf_k_idx = static_cast<int>(global_group_id % groups_per_row);
|
||||
const int mn_idx = static_cast<int>(global_group_id / groups_per_row);
|
||||
|
||||
if (mn_idx < mn) {
|
||||
// each uint32 in output_s_packed stores 4 packed scales
|
||||
const int sf_k_pack_idx = sf_k_idx / 4;
|
||||
const int pos = sf_k_idx % 4;
|
||||
|
||||
// reinterpret the UE8M0 scale y_s as IEEE bits, extract the 8-bit
|
||||
// exponent, and place it into the correct byte of the 32-bit word.
|
||||
const unsigned int bits = __float_as_uint(y_s);
|
||||
const unsigned int exponent = (bits >> 23u) & 0xffu;
|
||||
const unsigned int contrib = exponent << (pos * 8u);
|
||||
|
||||
const int out_idx = sf_k_pack_idx * tma_aligned_mn + mn_idx;
|
||||
// atomically OR 8-bit exponent into the packed scales buffer
|
||||
atomicOr(output_s_packed + out_idx, contrib);
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
QuantizeGroup<T, DST_DTYPE>(smem_group, group_output, group_size, lane_id,
|
||||
threads_per_group, y_s, min_8bit, max_8bit);
|
||||
}
|
||||
|
||||
// Host-side launcher for per_token_group_quant_8bit_packed_kernel.
//
// Quantizes `input` in groups of `group_size` elements along its last
// dimension into `output_q` (FP8 e4m3 or INT8) and writes the per-group scale
// exponents, packed four per int32, into `output_s_packed`.
//
// Parameters:
//   input            contiguous floating-point tensor; its last dimension
//                    must be divisible by `group_size`.
//   output_q         contiguous tensor of dtype Float8_e4m3fn or Char.
//   output_s_packed  2D int32 tensor of shape
//                    [mn, ceil(groups_per_row / 4)], where
//                    mn = input.numel() / input.size(-1). It is zeroed here
//                    because the kernel accumulates exponents with atomicOr.
//   group_size       number of elements per quantization group (> 0).
//   eps, min_8bit, max_8bit  forwarded to the kernel as floats.
//
// Raises (TORCH_CHECK) on non-contiguous input/output, a non-positive
// `group_size`, a last dimension not divisible by `group_size`, a wrongly
// shaped or typed `output_s_packed`, or an unsupported `output_q` dtype.
// When the input contains no groups the function validates its arguments,
// zeroes `output_s_packed` and returns without launching anything.
void per_token_group_quant_8bit_packed(const torch::Tensor& input,
                                       torch::Tensor& output_q,
                                       torch::Tensor& output_s_packed,
                                       int64_t group_size, double eps,
                                       double min_8bit, double max_8bit) {
  TORCH_CHECK(input.is_contiguous());
  TORCH_CHECK(output_q.is_contiguous());
  // group_size is used as a divisor below; reject zero/negative values with a
  // clear error instead of dividing by them.
  TORCH_CHECK(group_size > 0, "group_size must be positive, but got ",
              group_size, ".");

  const int64_t k = input.size(-1);
  TORCH_CHECK(k % group_size == 0, "Last dimension (", k,
              ") must be divisible by group_size (", group_size, ").");

  const int64_t mn = input.numel() / k;
  const int64_t groups_per_row = k / group_size;
  const int64_t num_groups = mn * groups_per_row;

  TORCH_CHECK(output_s_packed.dim() == 2,
              "output_s_packed must be 2D, got dim=", output_s_packed.dim(),
              ".");

  // Four scales are packed per int32, and rows are padded to a multiple of 4
  // for the column stride used by the kernel.
  const int64_t k_num_packed_sfk = (groups_per_row + 3) / 4;
  const int64_t tma_aligned_mn = ((mn + 3) / 4) * 4;

  TORCH_CHECK(output_s_packed.scalar_type() == at::ScalarType::Int,
              "output_s_packed must have dtype int32 for UE8M0-packed scales.");
  // DeepGEMM expects SFA scales in MN-major form with shape
  // [mn, ceil_div(K, 128 * 4)] and TMA-aligned stride on the last
  // dimension.
  TORCH_CHECK(output_s_packed.size(0) == mn &&
                  output_s_packed.size(1) == k_num_packed_sfk,
              "output_s_packed shape must be [", mn, ", ", k_num_packed_sfk,
              "], but got [", output_s_packed.size(0), ", ",
              output_s_packed.size(1), "].");

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  constexpr int THREADS_PER_GROUP = 16;

  const int groups_per_block = GetGroupsPerBlock(num_groups);

  auto dst_type = output_q.scalar_type();
  const int num_blocks = num_groups / groups_per_block;
  const int num_threads = groups_per_block * THREADS_PER_GROUP;

  // zero-initialize packed scales, since we use atomicOr to accumulate
  // exponents from different groups.
  output_s_packed.zero_();

  // With no groups, num_blocks is 0 and a zero-sized grid is an invalid
  // launch configuration; there is nothing to quantize, so stop here.
  if (num_groups == 0) {
    return;
  }

#define LAUNCH_PACKED_KERNEL(T, DST_DTYPE)                                 \
  do {                                                                     \
    dim3 grid(num_blocks);                                                 \
    dim3 block(num_threads);                                               \
    size_t smem_bytes =                                                    \
        static_cast<size_t>(groups_per_block) * group_size * sizeof(T);    \
    per_token_group_quant_8bit_packed_kernel<T, DST_DTYPE>                 \
        <<<grid, block, smem_bytes, stream>>>(                             \
            static_cast<const T*>(input.data_ptr()), output_q.data_ptr(),  \
            reinterpret_cast<unsigned int*>(output_s_packed.data_ptr()),   \
            static_cast<int>(group_size), static_cast<int>(num_groups),    \
            groups_per_block, static_cast<int>(groups_per_row),            \
            static_cast<int>(mn), static_cast<int>(tma_aligned_mn),        \
            static_cast<float>(eps), static_cast<float>(min_8bit),         \
            static_cast<float>(max_8bit));                                 \
  } while (0)

  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "per_token_group_quant_8bit_packed", ([&] {
        if (dst_type == at::ScalarType::Float8_e4m3fn) {
          LAUNCH_PACKED_KERNEL(scalar_t, __nv_fp8_e4m3);
        } else if (dst_type == at::ScalarType::Char) {
          LAUNCH_PACKED_KERNEL(scalar_t, int8_t);
        } else {
          TORCH_CHECK(
              false,
              "per_token_group_quant_8bit_packed only supports FP8/INT8 "
              "outputs.");
        }
      }));

#undef LAUNCH_PACKED_KERNEL
}
|
||||
|
||||
void per_token_group_quant_fp8(const torch::Tensor& input,
|
||||
torch::Tensor& output_q, torch::Tensor& output_s,
|
||||
int64_t group_size, double eps, double fp8_min,
|
||||
|
||||
@ -1241,33 +1241,16 @@ __global__ void wvSplitK_hf_big_(const int K, const int M, const int Bx,
|
||||
}
|
||||
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
|
||||
// Find the min val of div2 that doesn't increase N/(div1*div2)
|
||||
int mindiv(int N, int div1, int div2) {
|
||||
int nPrRnd = div1 * div2;
|
||||
int rnds0 = N / nPrRnd;
|
||||
nPrRnd -= div1 * 3;
|
||||
int rnds3 = N / nPrRnd;
|
||||
nPrRnd -= div1;
|
||||
int rnds4 = N / nPrRnd;
|
||||
nPrRnd -= div1;
|
||||
int rnds5 = N / nPrRnd;
|
||||
nPrRnd -= div1;
|
||||
int rnds6 = N / nPrRnd;
|
||||
nPrRnd -= div1;
|
||||
int rnds7 = N / nPrRnd;
|
||||
nPrRnd -= div1;
|
||||
int rnds8 = N / nPrRnd;
|
||||
nPrRnd -= div1;
|
||||
int rnds9 = N / nPrRnd;
|
||||
nPrRnd -= div1;
|
||||
int rtn = div2;
|
||||
if (rnds0 == rnds3) rtn = div2 - 3;
|
||||
if (rnds0 == rnds4) rtn = div2 - 4;
|
||||
if (rnds0 == rnds5) rtn = div2 - 5;
|
||||
if (rnds0 == rnds6) rtn = div2 - 6;
|
||||
if (rnds0 == rnds7) rtn = div2 - 7;
|
||||
if (rnds0 == rnds8) rtn = div2 - 8;
|
||||
if (rnds0 == rnds9) rtn = div2 - 9;
|
||||
return rtn;
|
||||
int rnds[13];
|
||||
for (int i = 0; i < 13; i++) {
|
||||
rnds[i] = (N + nPrRnd - 1) / nPrRnd;
|
||||
nPrRnd -= div1;
|
||||
}
|
||||
for (int i = 12; i >= 0; i--)
|
||||
if (rnds[0] == rnds[i]) return (div2 - i);
|
||||
}
|
||||
|
||||
torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
@ -1300,26 +1283,37 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
const int max_lds_len = get_lds_size() / 2;
|
||||
|
||||
#define WVSPLITK(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \
|
||||
_N) \
|
||||
{ \
|
||||
dim3 block(64, _WvPrGrp); \
|
||||
if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \
|
||||
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \
|
||||
wvSplitK_hf_sml_<fptype, 64, _YTILEs, _WvPrGrp, 8, _UNRLs, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
|
||||
biasf4, c, __wvPrGrp, CuCount); \
|
||||
} else if (K_in * N_in <= max_lds_len * 1.2) { \
|
||||
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \
|
||||
wvSplitK_hf_<fptype, 64, _YTILEm, _WvPrGrp, 8, _UNRLm, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
|
||||
biasf4, c, __wvPrGrp, CuCount); \
|
||||
} else { \
|
||||
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEb, _WvPrGrp); \
|
||||
wvSplitK_hf_big_<fptype, 64, _YTILEb, _WvPrGrp, 8, _UNRLb, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
|
||||
biasf4, c, __wvPrGrp, CuCount); \
|
||||
} \
|
||||
#define WVSPLITK(_YTILE, _UNRL, _N) \
|
||||
{ \
|
||||
dim3 block(64, 16); \
|
||||
int __wvPrGrp = mindiv(M_in, CuCount * _YTILE, 16); \
|
||||
if ((K_in * N_in <= max_lds_len) && (M_in % _YTILE == 0)) \
|
||||
wvSplitK_hf_sml_<fptype, 64, _YTILE, 16, 8, _UNRL, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
|
||||
biasf4, c, __wvPrGrp, CuCount); \
|
||||
else if (K_in * N_in <= max_lds_len * 1.2) \
|
||||
wvSplitK_hf_<fptype, 64, _YTILE, 16, 8, _UNRL, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
|
||||
biasf4, c, __wvPrGrp, CuCount); \
|
||||
else \
|
||||
wvSplitK_hf_big_<fptype, 64, _YTILE, 16, 8, _UNRL, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
|
||||
biasf4, c, __wvPrGrp, CuCount); \
|
||||
}
|
||||
|
||||
#define WVSPLIT_TILE(_sYT, __N) \
|
||||
{ \
|
||||
bool fit_lds = (K_in * N_in <= max_lds_len); \
|
||||
if (_sYT <= 1) \
|
||||
WVSPLITK(1, 4, __N) \
|
||||
else if ((__N == 1) || (!fit_lds) || (_sYT <= 4 * 2)) \
|
||||
WVSPLITK(2, 2, __N) \
|
||||
else if (_sYT <= 4 * 3) \
|
||||
WVSPLITK(3, 2, __N) \
|
||||
else if (__N == 4) \
|
||||
WVSPLITK(4, 1, __N) \
|
||||
else \
|
||||
WVSPLITK(4, 2, __N) \
|
||||
}
|
||||
|
||||
AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "wvSplitK", [&] {
|
||||
@ -1331,18 +1325,23 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
? reinterpret_cast<const fptype*>(in_bias->data_ptr())
|
||||
: nullptr;
|
||||
fptype* c = reinterpret_cast<fptype*>(out_c.data_ptr());
|
||||
|
||||
// first shoot for biggest tile-size that keeps all simd busy,
|
||||
// then cut the active waves to balance their distribution...
|
||||
int sYT = (M_in + CuCount * 4 - 1) / (CuCount * 4);
|
||||
|
||||
switch (N_in) {
|
||||
case 1:
|
||||
WVSPLITK(16, 2, 2, 2, 2, 2, 2, 1)
|
||||
WVSPLIT_TILE(sYT, 1)
|
||||
break;
|
||||
case 2:
|
||||
WVSPLITK(16, 2, 2, 2, 2, 2, 2, 2)
|
||||
WVSPLIT_TILE(sYT, 2)
|
||||
break;
|
||||
case 3:
|
||||
WVSPLITK(16, 4, 7, 7, 1, 1, 1, 3)
|
||||
WVSPLIT_TILE(sYT, 3)
|
||||
break;
|
||||
case 4:
|
||||
WVSPLITK(16, 4, 7, 7, 1, 1, 1, 4)
|
||||
WVSPLIT_TILE(sYT, 4)
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error(
|
||||
|
||||
743
csrc/sampler.cu
743
csrc/sampler.cu
@ -44,41 +44,300 @@ __global__ void apply_repetition_penalties_kernel(
|
||||
}
|
||||
}
|
||||
|
||||
static inline __device__ uint16_t extractBinIdx(float x) {
|
||||
union {
|
||||
__half h;
|
||||
uint16_t u16;
|
||||
} tmp;
|
||||
tmp.h = __float2half_rn(x);
|
||||
tmp.u16 = (x < 0.f) ? (~tmp.u16 & 0xffff) : (tmp.u16 | 0x8000);
|
||||
return 511 - (tmp.u16 >> 7);
|
||||
// Converts a float into a 32-bit key derived from its IEEE-754 bit pattern.
//
// Values with the sign bit set keep their raw bits; all other values have
// their bits inverted with the top bit cleared. As a result, non-negative
// inputs map below 0x80000000 and negative inputs at or above it, and within
// each half a larger float yields a smaller key.
__device__ __forceinline__ auto convert_to_uint32(float x) -> uint32_t {
  const uint32_t raw = __float_as_uint(x);
  if (raw & 0x80000000) {
    return raw;
  }
  return ~raw & 0x7fffffff;
}
|
||||
|
||||
template <int kNumThreadsPerBlock = 512, int kNumBins = 512, int kTopK = 2048>
|
||||
__device__ void topKPerRowJob(const float* logits, const int rowStart,
|
||||
const int rowEnd, const int rowIdx,
|
||||
int* outIndices, int stride0, int stride1) {
|
||||
// The number of elements per thread for the final top-k sort.
|
||||
static constexpr int kNumTopKItemsPerThread = kTopK / kNumThreadsPerBlock;
|
||||
// The class to sort the elements during the final top-k sort.
|
||||
using TopKSort = cub::BlockRadixSort<float, kNumThreadsPerBlock,
|
||||
kNumTopKItemsPerThread, int>;
|
||||
template <int step>
|
||||
static inline __device__ uint32_t extractBinIdx(float x) {
|
||||
if constexpr (step == 0) {
|
||||
__half hx = __float2half(x);
|
||||
uint16_t bits = __half_as_ushort(hx);
|
||||
bits = (bits & 0x8000) ? bits : ~bits & 0x7fff;
|
||||
return bits >> 5;
|
||||
} else {
|
||||
uint32_t bits = __float_as_uint(x);
|
||||
bits = (bits & 0x80000000) ? bits : ~bits & 0x7fffffff;
|
||||
|
||||
if constexpr (step == 1) {
|
||||
return bits >> 21;
|
||||
} else if constexpr (step == 2) {
|
||||
return (bits >> 10) & 0x7ff;
|
||||
} else if constexpr (step == 3) {
|
||||
return bits & 0x3ff;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int shift>
|
||||
static inline __device__ bool isPartialMatch(float x, uint32_t pattern) {
|
||||
if constexpr (shift == 0) {
|
||||
return true;
|
||||
}
|
||||
uint32_t bits = __float_as_uint(x);
|
||||
bits = (bits & 0x80000000) ? bits : ~bits & 0x7fffffff;
|
||||
return (bits ^ pattern) >> shift == 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Map a Func over the input data, using vectorized load instructions if
|
||||
* possible.
|
||||
*
|
||||
* @tparam T element type
|
||||
* @tparam IdxT indexing type
|
||||
* @tparam Func void (T x, IdxT idx)
|
||||
*
|
||||
* @param thread_rank rank of the calling thread among all participating threads
|
||||
* @param num_threads number of the threads that participate in processing
|
||||
* @param in the input data
|
||||
* @param len the number of elements to read
|
||||
* @param f the lambda taking two arguments (T x, IdxT idx)
|
||||
*/
|
||||
template <typename T, typename idxT, typename Func>
|
||||
__device__ void vectorized_process(size_t thread_rank, size_t num_threads,
|
||||
const T* in, idxT len, Func f) {
|
||||
constexpr int WARP_SIZE = 32;
|
||||
using WideT = float4;
|
||||
if constexpr (sizeof(T) >= sizeof(WideT)) {
|
||||
for (idxT i = thread_rank; i < len; i += num_threads) {
|
||||
f(in[i], i);
|
||||
}
|
||||
} else {
|
||||
static_assert(sizeof(WideT) % sizeof(T) == 0);
|
||||
constexpr int items_per_scalar = sizeof(WideT) / sizeof(T);
|
||||
// TODO: it's UB
|
||||
union {
|
||||
WideT scalar;
|
||||
T array[items_per_scalar];
|
||||
} wide;
|
||||
|
||||
int skip_cnt =
|
||||
(reinterpret_cast<size_t>(in) % sizeof(WideT))
|
||||
? ((sizeof(WideT) - reinterpret_cast<size_t>(in) % sizeof(WideT)) /
|
||||
sizeof(T))
|
||||
: 0;
|
||||
if (skip_cnt > len) {
|
||||
skip_cnt = len;
|
||||
}
|
||||
const WideT* in_cast = reinterpret_cast<decltype(in_cast)>(in + skip_cnt);
|
||||
const idxT len_cast = (len - skip_cnt) / items_per_scalar;
|
||||
|
||||
for (idxT i = thread_rank; i < len_cast; i += num_threads) {
|
||||
wide.scalar = in_cast[i];
|
||||
const idxT real_i = skip_cnt + i * items_per_scalar;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < items_per_scalar; ++j) {
|
||||
f(wide.array[j], real_i + j);
|
||||
}
|
||||
}
|
||||
|
||||
static_assert(WARP_SIZE >= items_per_scalar);
|
||||
// and because items_per_scalar > skip_cnt, WARP_SIZE > skip_cnt
|
||||
// no need to use loop
|
||||
if (thread_rank < skip_cnt) {
|
||||
f(in[thread_rank], thread_rank);
|
||||
}
|
||||
// because len_cast = (len - skip_cnt) / items_per_scalar,
|
||||
// len_cast * items_per_scalar + items_per_scalar > len - skip_cnt;
|
||||
// and so
|
||||
// len - (skip_cnt + len_cast * items_per_scalar) < items_per_scalar <=
|
||||
// WARP_SIZE no need to use loop
|
||||
const idxT remain_i = skip_cnt + len_cast * items_per_scalar + thread_rank;
|
||||
if (remain_i < len) {
|
||||
f(in[remain_i], remain_i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int step, int kNumThreadsPerBlock, int kNumBins, int kNumFinalItems,
|
||||
bool multipleBlocksPerRow, bool mergeBlocks, typename SmemFinalType,
|
||||
typename SmemOutputType>
|
||||
__device__ bool processHistogramStep(
|
||||
const int* indices, const float* logits, int rowEnd, uint32_t& logitPattern,
|
||||
int& thresholdBinIdx, SmemOutputType& smemOutput, int* smemThresholdBinIdx,
|
||||
int* smemFinalDstIdx, int* smemFinalBinSize, int* smemFoundTopKValues,
|
||||
SmemFinalType& smemFinal, int stride1, int rowStart, int topK) {
|
||||
// Clear the histogram.
|
||||
#pragma unroll
|
||||
for (int idx = threadIdx.x; idx < kNumBins; idx += kNumThreadsPerBlock) {
|
||||
smemFinal.histo.data[idx] = 0;
|
||||
}
|
||||
|
||||
// Make sure the histogram is ready.
|
||||
__syncthreads();
|
||||
|
||||
// Update pattern
|
||||
constexpr auto patternShift = step < 2 ? 0 : step == 2 ? 21 : 10;
|
||||
if constexpr (step == 2) {
|
||||
logitPattern = static_cast<uint32_t>(thresholdBinIdx & 0x7ff)
|
||||
<< patternShift;
|
||||
} else if constexpr (step == 3) {
|
||||
logitPattern |= static_cast<uint32_t>(thresholdBinIdx & 0x7ff)
|
||||
<< patternShift;
|
||||
}
|
||||
|
||||
auto distributeToBins = [&](float logit, int /* idx */ = 0) {
|
||||
if (isPartialMatch<patternShift>(logit, logitPattern)) {
|
||||
uint32_t binIdx = extractBinIdx<step>(logit);
|
||||
atomicAdd(&smemFinal.histo.data[binIdx], 1);
|
||||
}
|
||||
};
|
||||
|
||||
// Distribute the elements to the histogram bins.
|
||||
if (stride1 == 1) {
|
||||
vectorized_process(threadIdx.x, kNumThreadsPerBlock, logits + rowStart,
|
||||
rowEnd - rowStart, distributeToBins);
|
||||
} else {
|
||||
for (int idx = rowStart + threadIdx.x; idx < rowEnd;
|
||||
idx += kNumThreadsPerBlock) {
|
||||
float logit = logits[idx * stride1];
|
||||
distributeToBins(logit, idx);
|
||||
}
|
||||
}
|
||||
// Make sure the histogram is ready.
|
||||
__syncthreads();
|
||||
|
||||
// Reads the value of the starting position in the smemOutput array
|
||||
int lastValue = smemFoundTopKValues[0];
|
||||
|
||||
for (int round = 0; round < kNumBins / kNumThreadsPerBlock; round++) {
|
||||
// Read the values from SMEM.
|
||||
int idx = threadIdx.x + kNumThreadsPerBlock * round;
|
||||
int binCount{0};
|
||||
binCount = smemFinal.histo.data[idx];
|
||||
|
||||
// Make sure each thread has read its value.
|
||||
__syncthreads();
|
||||
|
||||
// Compute the prefix sum.
|
||||
int prefixSum{0}, totalSum{0};
|
||||
using Scan = cub::BlockScan<int, kNumThreadsPerBlock>;
|
||||
Scan(smemFinal.histo.scan).ExclusiveSum(binCount, prefixSum, totalSum);
|
||||
|
||||
// Update the histogram with the prefix sums.
|
||||
prefixSum += lastValue;
|
||||
totalSum += lastValue;
|
||||
smemFinal.histo.data[idx] = prefixSum;
|
||||
|
||||
// Make sure the data is in shared memory.
|
||||
__syncthreads();
|
||||
|
||||
// Find the last valid bin.
|
||||
bool foundThreshold = false;
|
||||
if (prefixSum < topK) {
|
||||
int nextPrefixSum = threadIdx.x == kNumThreadsPerBlock - 1
|
||||
? totalSum
|
||||
: smemFinal.histo.data[idx + 1];
|
||||
|
||||
if (nextPrefixSum >= topK) {
|
||||
smemThresholdBinIdx[0] = idx;
|
||||
smemFinalBinSize[0] = nextPrefixSum - prefixSum;
|
||||
foundThreshold = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Early exit: if any thread found the threshold, we can skip remaining
|
||||
// rounds
|
||||
if (__syncthreads_or(foundThreshold)) {
|
||||
break;
|
||||
}
|
||||
|
||||
lastValue = totalSum;
|
||||
}
|
||||
|
||||
// Make sure the data is in shared memory.
|
||||
__syncthreads();
|
||||
|
||||
// The threshold bin.
|
||||
thresholdBinIdx = smemThresholdBinIdx[0];
|
||||
|
||||
auto processBins = [&](float logit, int idx) {
|
||||
if (isPartialMatch<patternShift>(logit, logitPattern)) {
|
||||
uint32_t binIdx = extractBinIdx<step>(logit);
|
||||
if (binIdx < thresholdBinIdx) {
|
||||
// The element is part of the top-k selection
|
||||
int dstIdx = atomicAdd(&smemFoundTopKValues[0], 1);
|
||||
|
||||
if constexpr (mergeBlocks) {
|
||||
smemOutput[dstIdx] = indices[idx];
|
||||
} else if constexpr (multipleBlocksPerRow) {
|
||||
smemOutput[dstIdx] = idx + rowStart;
|
||||
reinterpret_cast<float*>(smemOutput + topK)[dstIdx] = logit;
|
||||
} else {
|
||||
smemOutput[dstIdx] = idx;
|
||||
}
|
||||
}
|
||||
if constexpr (step < 3) {
|
||||
// Only fill the final items for sorting if the threshold bin fits
|
||||
if (binIdx == thresholdBinIdx &&
|
||||
smemFinalBinSize[0] <= kNumFinalItems) {
|
||||
int dstIdx = atomicAdd(&smemFinalDstIdx[0], 1);
|
||||
smemFinal.items.logits[dstIdx] = logit;
|
||||
if constexpr (mergeBlocks) {
|
||||
smemFinal.items.indices[dstIdx] = indices[idx];
|
||||
} else if constexpr (multipleBlocksPerRow) {
|
||||
smemFinal.items.indices[dstIdx] = idx + rowStart;
|
||||
} else {
|
||||
smemFinal.items.indices[dstIdx] = idx;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (binIdx == thresholdBinIdx) {
|
||||
// The elements in the threshold bin share the same 32 bits at step 3
|
||||
int dstIdx = atomicAdd(&smemFinal.histo.data[binIdx], 1);
|
||||
if (dstIdx < topK) {
|
||||
if constexpr (mergeBlocks) {
|
||||
smemOutput[dstIdx] = indices[idx];
|
||||
} else if constexpr (multipleBlocksPerRow) {
|
||||
smemOutput[dstIdx] = idx + rowStart;
|
||||
reinterpret_cast<float*>(smemOutput + topK)[dstIdx] = logit;
|
||||
} else {
|
||||
smemOutput[dstIdx] = idx;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
if (stride1 == 1) {
|
||||
vectorized_process(threadIdx.x, kNumThreadsPerBlock, logits + rowStart,
|
||||
rowEnd - rowStart, processBins);
|
||||
} else {
|
||||
for (int idx = rowStart + threadIdx.x; idx < rowEnd;
|
||||
idx += kNumThreadsPerBlock) {
|
||||
float logit = logits[idx * stride1];
|
||||
processBins(logit, idx);
|
||||
}
|
||||
}
|
||||
|
||||
// Make sure the elements are in shared memory.
|
||||
__syncthreads();
|
||||
|
||||
// Check if we should continue to next step
|
||||
return smemFinalBinSize[0] > kNumFinalItems;
|
||||
}
|
||||
|
||||
// Follows half - 11 - 11 - 10 bit iterations
|
||||
template <int kNumThreadsPerBlock, int kNumBins, bool useRadixSort,
|
||||
bool multipleBlocksPerRow = false, bool mergeBlocks = false>
|
||||
static __device__ void topKPerRowJob(const int* indices, const float* logits,
|
||||
int rowStart, int rowEnd, int* outIndices,
|
||||
float* outLogits, int stride1, int topK) {
|
||||
// The number of slots for the final pass.
|
||||
static constexpr int kNumFinalItems = 3072;
|
||||
static constexpr int kNumFinalItems = 2048;
|
||||
// The number of elements per thread for the final sort.
|
||||
static constexpr int kNumFinalItemsPerThread =
|
||||
kNumFinalItems / kNumThreadsPerBlock;
|
||||
// The class to sort the elements during the final pass.
|
||||
using FinalSort = cub::BlockRadixSort<float, kNumThreadsPerBlock,
|
||||
kNumFinalItemsPerThread, int>;
|
||||
|
||||
using FinalSortTempStorage =
|
||||
std::conditional_t<useRadixSort, typename FinalSort::TempStorage, int>;
|
||||
// The class to compute the inclusive prefix-sum over the histogram.
|
||||
using Scan = cub::BlockScan<int, kNumThreadsPerBlock>;
|
||||
|
||||
// Shared memory to compute the block scan.
|
||||
__shared__ typename Scan::TempStorage smemScan;
|
||||
|
||||
// The structure to store the final items (for the final pass).
|
||||
struct FinalItems {
|
||||
// Shared memory to store the indices for the final pass.
|
||||
@ -87,200 +346,225 @@ __device__ void topKPerRowJob(const float* logits, const int rowStart,
|
||||
float logits[kNumFinalItems];
|
||||
};
|
||||
|
||||
struct Histogram {
|
||||
typename Scan::TempStorage scan;
|
||||
int data[kNumBins];
|
||||
};
|
||||
|
||||
// Shared memory to compute the block sort.
|
||||
__shared__ union {
|
||||
FinalItems items;
|
||||
typename FinalSort::TempStorage finalSort;
|
||||
typename TopKSort::TempStorage topKSort;
|
||||
FinalSortTempStorage finalSort;
|
||||
Histogram histo;
|
||||
} smemFinal;
|
||||
|
||||
// Shared memory to store the histogram.
|
||||
__shared__ int smemHistogram[kNumBins];
|
||||
// Shared memory to store the selected indices.
|
||||
__shared__ int smemIndices[kTopK];
|
||||
// If we are processing using multiple blocks, we need to store the logits and
|
||||
// indices.
|
||||
extern __shared__ int32_t smemOutput[];
|
||||
|
||||
// Shared memory to store the threshold bin.
|
||||
__shared__ int smemThresholdBinIdx[1];
|
||||
// Shared memory counter to register the candidates for the final phase.
|
||||
__shared__ int smemFinalDstIdx[1];
|
||||
// Shared memory to determine if the threshold bin fits in the final items.
|
||||
__shared__ int smemFinalBinSize[1];
|
||||
// Shared memory to keep track of the top-k values found so far by the
|
||||
// previous iterations
|
||||
__shared__ int smemFoundTopKValues[1];
|
||||
|
||||
// The length of the row.
|
||||
int rowLen = rowEnd - rowStart;
|
||||
|
||||
// Shortcut if the length of the row is smaller than Top-K. Indices are not
|
||||
// sorted by their corresponding logit.
|
||||
if (rowLen <= kTopK) {
|
||||
if (rowLen <= topK) {
|
||||
for (int rowIt = threadIdx.x; rowIt < rowLen;
|
||||
rowIt += kNumThreadsPerBlock) {
|
||||
int idx = rowStart + rowIt;
|
||||
outIndices[rowIdx * kTopK + rowIt] = idx - rowStart;
|
||||
if constexpr (multipleBlocksPerRow) {
|
||||
outIndices[rowIt] = rowIt + rowStart;
|
||||
outLogits[rowIt] = logits[rowIt + rowStart];
|
||||
} else {
|
||||
outIndices[rowIt] = rowIt;
|
||||
}
|
||||
}
|
||||
for (int rowIt = rowLen + threadIdx.x; rowIt < kTopK;
|
||||
for (int rowIt = rowLen + threadIdx.x; rowIt < topK;
|
||||
rowIt += kNumThreadsPerBlock) {
|
||||
outIndices[rowIdx * kTopK + rowIt] = -1;
|
||||
outIndices[rowIt] = -1;
|
||||
if constexpr (multipleBlocksPerRow) {
|
||||
outLogits[rowIt] = -FLT_MAX;
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// Clear the histogram.
|
||||
if (threadIdx.x < kNumBins) {
|
||||
smemHistogram[threadIdx.x] = 0;
|
||||
}
|
||||
|
||||
// Make sure the histogram is ready.
|
||||
__syncthreads();
|
||||
|
||||
// Fetch elements one-by-one.
|
||||
for (int rowIt = rowStart + threadIdx.x; rowIt < rowEnd;
|
||||
rowIt += kNumThreadsPerBlock) {
|
||||
uint16_t idx = extractBinIdx(logits[rowIdx * stride0 + rowIt * stride1]);
|
||||
atomicAdd(&smemHistogram[idx], 1);
|
||||
}
|
||||
|
||||
// Make sure the histogram is ready.
|
||||
__syncthreads();
|
||||
|
||||
// Read the values from SMEM.
|
||||
int binCount{0};
|
||||
if (threadIdx.x < kNumBins) {
|
||||
binCount = smemHistogram[threadIdx.x];
|
||||
}
|
||||
|
||||
// Make sure each thread has read its value.
|
||||
__syncthreads();
|
||||
|
||||
// Compute the prefix sum.
|
||||
int prefixSum{0}, totalSum{0};
|
||||
Scan(smemScan).ExclusiveSum(binCount, prefixSum, totalSum);
|
||||
|
||||
// Update the histogram with the prefix sums.
|
||||
if (threadIdx.x < kNumBins) {
|
||||
smemHistogram[threadIdx.x] = prefixSum;
|
||||
}
|
||||
|
||||
// Make sure the data is in shared memory.
|
||||
__syncthreads();
|
||||
|
||||
// Find the last valid bin.
|
||||
if (threadIdx.x < kNumBins) {
|
||||
int nextPrefixSum =
|
||||
threadIdx.x == kNumBins - 1 ? totalSum : smemHistogram[threadIdx.x + 1];
|
||||
if (prefixSum < kTopK && nextPrefixSum >= kTopK) {
|
||||
smemThresholdBinIdx[0] = threadIdx.x;
|
||||
}
|
||||
}
|
||||
|
||||
// Clear the counter to store the items for the final phase.
|
||||
// Initialize values
|
||||
if (threadIdx.x == 0) {
|
||||
smemFinalDstIdx[0] = 0;
|
||||
smemFoundTopKValues[0] = 0;
|
||||
}
|
||||
__syncthreads();
|
||||
int thresholdBinIdx = -1;
|
||||
uint32_t logitPattern = 0;
|
||||
|
||||
// Step 0: Process first 11 bits of half representation
|
||||
bool continueToNextStep =
|
||||
processHistogramStep<0, kNumThreadsPerBlock, kNumBins, kNumFinalItems,
|
||||
multipleBlocksPerRow, mergeBlocks>(
|
||||
indices, logits, rowEnd, logitPattern, thresholdBinIdx, smemOutput,
|
||||
smemThresholdBinIdx, smemFinalDstIdx, smemFinalBinSize,
|
||||
smemFoundTopKValues, smemFinal, stride1, rowStart, topK);
|
||||
|
||||
if (continueToNextStep) {
|
||||
// Step 1: Process next 11 bits
|
||||
continueToNextStep =
|
||||
processHistogramStep<1, kNumThreadsPerBlock, kNumBins, kNumFinalItems,
|
||||
multipleBlocksPerRow, mergeBlocks>(
|
||||
indices, logits, rowEnd, logitPattern, thresholdBinIdx, smemOutput,
|
||||
smemThresholdBinIdx, smemFinalDstIdx, smemFinalBinSize,
|
||||
smemFoundTopKValues, smemFinal, stride1, rowStart, topK);
|
||||
}
|
||||
|
||||
// Make sure the data is in shared memory.
|
||||
__syncthreads();
|
||||
if (continueToNextStep) {
|
||||
// Step 2: Process next 11 bits
|
||||
continueToNextStep =
|
||||
processHistogramStep<2, kNumThreadsPerBlock, kNumBins, kNumFinalItems,
|
||||
multipleBlocksPerRow, mergeBlocks>(
|
||||
indices, logits, rowEnd, logitPattern, thresholdBinIdx, smemOutput,
|
||||
smemThresholdBinIdx, smemFinalDstIdx, smemFinalBinSize,
|
||||
smemFoundTopKValues, smemFinal, stride1, rowStart, topK);
|
||||
}
|
||||
|
||||
// The threshold bin.
|
||||
int thresholdBinIdx = smemThresholdBinIdx[0];
|
||||
if (continueToNextStep) {
|
||||
// Step 3: Process last 10 bits
|
||||
processHistogramStep<3, kNumThreadsPerBlock, kNumBins, kNumFinalItems,
|
||||
multipleBlocksPerRow, mergeBlocks>(
|
||||
indices, logits, rowEnd, logitPattern, thresholdBinIdx, smemOutput,
|
||||
smemThresholdBinIdx, smemFinalDstIdx, smemFinalBinSize,
|
||||
smemFoundTopKValues, smemFinal, stride1, rowStart, topK);
|
||||
}
|
||||
|
||||
// Fetch elements one-by-one and populate the shared memory buffers.
|
||||
for (int rowIt = rowStart + threadIdx.x; rowIt < rowEnd;
|
||||
rowIt += kNumThreadsPerBlock) {
|
||||
float logit = logits[rowIdx * stride0 + rowIt * stride1];
|
||||
uint16_t idx = extractBinIdx(logit);
|
||||
if (idx < thresholdBinIdx) {
|
||||
int dstIdx = atomicAdd(&smemHistogram[idx], 1);
|
||||
smemIndices[dstIdx] = rowIt;
|
||||
} else if (idx == thresholdBinIdx) {
|
||||
int dstIdx = atomicAdd(&smemFinalDstIdx[0], 1);
|
||||
if (dstIdx < kNumFinalItems) {
|
||||
smemFinal.items.logits[dstIdx] = logit;
|
||||
smemFinal.items.indices[dstIdx] = rowIt;
|
||||
if (!continueToNextStep) {
|
||||
// The histogram did not proceed to the final 10 bits, therefore we need to
|
||||
// sort the final items The logits of the elements to be sorted in the final
|
||||
// pass.
|
||||
if constexpr (useRadixSort) {
|
||||
// Sorting with radix sort
|
||||
float finalLogits[kNumFinalItemsPerThread];
|
||||
// The indices of the elements to be sorted in the final pass.
|
||||
int finalIndices[kNumFinalItemsPerThread];
|
||||
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
|
||||
finalLogits[ii] = -FLT_MAX;
|
||||
}
|
||||
|
||||
// Read the elements from SMEM.
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
|
||||
int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
|
||||
if (srcIdx < smemFinalDstIdx[0]) {
|
||||
finalLogits[ii] = smemFinal.items.logits[srcIdx];
|
||||
finalIndices[ii] = smemFinal.items.indices[srcIdx];
|
||||
}
|
||||
}
|
||||
// Make sure the shared memory has been read.
|
||||
__syncthreads();
|
||||
|
||||
// Sort the elements.
|
||||
FinalSort(smemFinal.finalSort)
|
||||
.SortDescendingBlockedToStriped(finalLogits, finalIndices);
|
||||
|
||||
// Copy the data back to the shared memory storage.
|
||||
int baseIdx = smemFoundTopKValues[0];
|
||||
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
|
||||
int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
|
||||
int dstIdx = baseIdx + srcIdx;
|
||||
|
||||
if (dstIdx < topK) {
|
||||
smemOutput[dstIdx] = finalIndices[ii];
|
||||
if constexpr (multipleBlocksPerRow) {
|
||||
reinterpret_cast<float*>(smemOutput + topK)[dstIdx] =
|
||||
finalLogits[ii];
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Sorting with insertion sort
|
||||
auto baseIdx = smemFoundTopKValues[0];
|
||||
for (int i = threadIdx.x; i < smemFinalDstIdx[0];
|
||||
i += kNumThreadsPerBlock) {
|
||||
int outIndex = 0;
|
||||
auto logit = smemFinal.items.logits[i];
|
||||
for (int j = 0; j < smemFinalDstIdx[0]; j++) {
|
||||
auto otherLogit = smemFinal.items.logits[j];
|
||||
if (logit < otherLogit || (logit == otherLogit && i < j)) {
|
||||
outIndex++;
|
||||
}
|
||||
}
|
||||
// Store if outIndex is in bounds
|
||||
if (outIndex + baseIdx < topK) {
|
||||
smemOutput[outIndex + baseIdx] = smemFinal.items.indices[i];
|
||||
if constexpr (multipleBlocksPerRow) {
|
||||
reinterpret_cast<float*>(smemOutput + topK)[outIndex + baseIdx] =
|
||||
smemFinal.items.logits[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// Store to global memory.
|
||||
for (int i = threadIdx.x; i < topK; i += kNumThreadsPerBlock) {
|
||||
if constexpr (multipleBlocksPerRow) {
|
||||
outIndices[i] = smemOutput[i];
|
||||
outLogits[i] = reinterpret_cast<float*>(smemOutput + topK)[i];
|
||||
} else {
|
||||
if (stride1 == 1) {
|
||||
// stride1 == 1 uses vectorized_process, whose indices are already
|
||||
// relative to rowStart, so no offset is subtracted here.
|
||||
outIndices[i] = smemOutput[i];
|
||||
} else {
|
||||
outIndices[i] = smemOutput[i] - rowStart;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Make sure the elements are in shared memory.
|
||||
__syncthreads();
|
||||
|
||||
// The logits of the elements to be sorted in the final pass.
|
||||
float finalLogits[kNumFinalItemsPerThread];
|
||||
// The indices of the elements to be sorted in the final pass.
|
||||
int finalIndices[kNumFinalItemsPerThread];
|
||||
|
||||
// Init.
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
|
||||
finalLogits[ii] = -FLT_MAX;
|
||||
}
|
||||
|
||||
// Read the elements from SMEM.
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
|
||||
int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
|
||||
if (srcIdx < smemFinalDstIdx[0]) {
|
||||
finalLogits[ii] = smemFinal.items.logits[srcIdx];
|
||||
finalIndices[ii] = smemFinal.items.indices[srcIdx];
|
||||
}
|
||||
}
|
||||
|
||||
// Make sure the shared memory has been read.
|
||||
__syncthreads();
|
||||
|
||||
// Sort the elements.
|
||||
FinalSort(smemFinal.finalSort)
|
||||
.SortDescendingBlockedToStriped(finalLogits, finalIndices);
|
||||
|
||||
// Copy the data back to the shared memory storage.
|
||||
int baseIdx = thresholdBinIdx > 0 ? smemHistogram[thresholdBinIdx - 1] : 0;
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
|
||||
int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
|
||||
int dstIdx = baseIdx + srcIdx;
|
||||
if (dstIdx < kTopK) {
|
||||
smemIndices[dstIdx] = finalIndices[ii];
|
||||
}
|
||||
}
|
||||
|
||||
// Make sure the data is in shared memory.
|
||||
__syncthreads();
|
||||
|
||||
// Store to global memory.
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < kNumTopKItemsPerThread; ++ii) {
|
||||
int offset = rowIdx * kTopK + ii * kNumThreadsPerBlock + threadIdx.x;
|
||||
outIndices[offset] =
|
||||
smemIndices[ii * kNumThreadsPerBlock + threadIdx.x] - rowStart;
|
||||
}
|
||||
}
|
||||
|
||||
template <int kNumThreadsPerBlock = 512>
|
||||
static __global__ void topKPerRow(const float* logits, const int* rowStarts,
|
||||
const int* rowEnds, int* outIndices,
|
||||
int stride0, int stride1) {
|
||||
template <int kNumThreadsPerBlock, bool useRadixSort>
|
||||
static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowPrefill(
|
||||
const float* logits, const int* rowStarts, const int* rowEnds,
|
||||
int* outIndices, int stride0, int stride1, const int topK,
|
||||
const int offsetIndex) {
|
||||
// The number of bins in the histogram.
|
||||
static constexpr int kNumBins = 512;
|
||||
|
||||
// The top-k width.
|
||||
static constexpr int kTopK = 2048;
|
||||
static constexpr int kNumBins = 2048;
|
||||
|
||||
// The row computed by this block.
|
||||
int rowIdx = blockIdx.x;
|
||||
int rowIdx = blockIdx.x + offsetIndex;
|
||||
|
||||
// The range of logits within the row.
|
||||
int rowStart = rowStarts[rowIdx];
|
||||
int rowEnd = rowEnds[rowIdx];
|
||||
|
||||
topKPerRowJob<kNumThreadsPerBlock, kNumBins, kTopK>(
|
||||
logits, rowStart, rowEnd, rowIdx, outIndices, stride0, stride1);
|
||||
// Local pointers to this block
|
||||
outIndices += rowIdx * topK;
|
||||
logits += rowIdx * stride0;
|
||||
|
||||
topKPerRowJob<kNumThreadsPerBlock, kNumBins, useRadixSort>(
|
||||
nullptr, logits, rowStart, rowEnd, outIndices, nullptr, stride1, topK);
|
||||
}
|
||||
|
||||
template <int kNumThreadsPerBlock = 512>
|
||||
static __global__ void topKPerRowDecode(const float* logits, const int* seqLens,
|
||||
int* outIndices, int stride0,
|
||||
int stride1, int next_n) {
|
||||
template <int kNumThreadsPerBlock, bool useRadixSort,
|
||||
bool multipleBlocksPerRow = false, bool mergeBlocks = false>
|
||||
static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowDecode(
|
||||
const float* logits, const int* seqLens, int* outIndices, int stride0,
|
||||
int stride1, const int topK, int next_n, float* outLogits = nullptr,
|
||||
const int numBlocksToMerge = 0, const int* indices = nullptr) {
|
||||
// The number of bins in the histogram.
|
||||
static constexpr int kNumBins = 512;
|
||||
|
||||
// The top-k width.
|
||||
static constexpr int kTopK = 2048;
|
||||
static constexpr int kNumBins = 2048;
|
||||
|
||||
// The row computed by this block.
|
||||
int rowIdx = blockIdx.x;
|
||||
@ -290,8 +574,25 @@ static __global__ void topKPerRowDecode(const float* logits, const int* seqLens,
|
||||
int seq_len = seqLens[rowIdx / next_n];
|
||||
int rowEnd = seq_len - next_n + (rowIdx % next_n) + 1;
|
||||
|
||||
topKPerRowJob<kNumThreadsPerBlock, kNumBins, kTopK>(
|
||||
logits, rowStart, rowEnd, rowIdx, outIndices, stride0, stride1);
|
||||
// Local pointers to this block
|
||||
if constexpr (!multipleBlocksPerRow && !mergeBlocks) {
|
||||
outIndices += rowIdx * topK;
|
||||
} else if constexpr (multipleBlocksPerRow) {
|
||||
const auto blockSize = rowEnd / gridDim.y; // 16384 / 2 = 8192
|
||||
rowStart = blockSize * blockIdx.y; // 8192 * 1 = 8192
|
||||
rowEnd = gridDim.y == blockIdx.y + 1 ? rowEnd : rowStart + blockSize;
|
||||
outIndices += rowIdx * gridDim.y * topK + blockIdx.y * topK;
|
||||
outLogits += rowIdx * gridDim.y * topK + blockIdx.y * topK;
|
||||
} else if constexpr (mergeBlocks) {
|
||||
rowEnd = numBlocksToMerge * topK;
|
||||
indices += rowIdx * numBlocksToMerge * topK;
|
||||
outIndices += rowIdx * topK;
|
||||
}
|
||||
logits += rowIdx * stride0;
|
||||
|
||||
topKPerRowJob<kNumThreadsPerBlock, kNumBins, useRadixSort,
|
||||
multipleBlocksPerRow, mergeBlocks>(
|
||||
indices, logits, rowStart, rowEnd, outIndices, outLogits, stride1, topK);
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
@ -339,28 +640,84 @@ void apply_repetition_penalties_(
|
||||
|
||||
void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
|
||||
const torch::Tensor& seqLens, torch::Tensor& indices,
|
||||
int64_t numRows, int64_t stride0, int64_t stride1) {
|
||||
// Compute the results on the device.
|
||||
int64_t numRows, int64_t stride0, int64_t stride1,
|
||||
int64_t topK) {
|
||||
constexpr int kSortingAlgorithmThreshold = 12288;
|
||||
constexpr int kSplitWorkThreshold = 200 * 1000;
|
||||
constexpr int kNumThreadsPerBlock = 512;
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
const auto numColumns = logits.size(1);
|
||||
|
||||
if (numColumns < kSortingAlgorithmThreshold) {
|
||||
// Use insertion sort
|
||||
vllm::topKPerRowDecode<kNumThreadsPerBlock, false>
|
||||
<<<numRows, kNumThreadsPerBlock, topK * sizeof(int32_t), stream>>>(
|
||||
logits.data_ptr<float>(), seqLens.data_ptr<int>(),
|
||||
indices.data_ptr<int>(), static_cast<int>(stride0),
|
||||
static_cast<int>(stride1), static_cast<int>(topK),
|
||||
static_cast<int>(next_n));
|
||||
} else if (numColumns < kSplitWorkThreshold) {
|
||||
// From this threshold, use radix sort instead
|
||||
vllm::topKPerRowDecode<kNumThreadsPerBlock, true>
|
||||
<<<numRows, kNumThreadsPerBlock, topK * sizeof(int32_t), stream>>>(
|
||||
logits.data_ptr<float>(), seqLens.data_ptr<int>(),
|
||||
indices.data_ptr<int>(), static_cast<int>(stride0),
|
||||
static_cast<int>(stride1), static_cast<int>(topK),
|
||||
static_cast<int>(next_n));
|
||||
} else {
|
||||
// Long sequences are run in two steps
|
||||
constexpr auto multipleBlocksPerRowConfig = 10;
|
||||
|
||||
const auto outIndicesAux =
|
||||
torch::empty({numRows, multipleBlocksPerRowConfig, topK},
|
||||
torch::dtype(torch::kInt32).device(logits.device()));
|
||||
const auto outLogitsAux =
|
||||
torch::empty({numRows, multipleBlocksPerRowConfig, topK},
|
||||
torch::dtype(torch::kFloat).device(logits.device()));
|
||||
|
||||
vllm::topKPerRowDecode<kNumThreadsPerBlock, true, true>
|
||||
<<<dim3(numRows, multipleBlocksPerRowConfig), kNumThreadsPerBlock,
|
||||
2 * topK * sizeof(int32_t), stream>>>(
|
||||
logits.data_ptr<float>(), seqLens.data_ptr<int>(),
|
||||
outIndicesAux.data_ptr<int>(), static_cast<int>(stride0),
|
||||
static_cast<int>(stride1), static_cast<int>(topK),
|
||||
static_cast<int>(next_n), outLogitsAux.data_ptr<float>());
|
||||
|
||||
constexpr int kNumThreadsPerBlockMerge = 1024;
|
||||
vllm::topKPerRowDecode<kNumThreadsPerBlockMerge, true, false, true>
|
||||
<<<numRows, kNumThreadsPerBlockMerge, topK * sizeof(int32_t), stream>>>(
|
||||
outLogitsAux.data_ptr<float>(), seqLens.data_ptr<int>(),
|
||||
indices.data_ptr<int>(), multipleBlocksPerRowConfig * topK, 1,
|
||||
static_cast<int>(topK), static_cast<int>(next_n), nullptr,
|
||||
multipleBlocksPerRowConfig, outIndicesAux.data_ptr<int>());
|
||||
}
|
||||
}
|
||||
|
||||
void top_k_per_row_prefill(const torch::Tensor& logits,
|
||||
const torch::Tensor& rowStarts,
|
||||
const torch::Tensor& rowEnds, torch::Tensor& indices,
|
||||
int64_t numRows, int64_t stride0, int64_t stride1,
|
||||
int64_t topK) {
|
||||
constexpr int kSortingAlgorithmThreshold = 12288;
|
||||
constexpr int kNumThreadsPerBlock = 512;
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
vllm::topKPerRowDecode<kNumThreadsPerBlock>
|
||||
<<<numRows, kNumThreadsPerBlock, 0, stream>>>(
|
||||
logits.data_ptr<float>(), seqLens.data_ptr<int>(),
|
||||
indices.data_ptr<int>(), static_cast<int>(stride0),
|
||||
static_cast<int>(stride1), static_cast<int>(next_n));
|
||||
}
|
||||
int numInsertionBlocks =
|
||||
std::min(static_cast<int>(numRows), kSortingAlgorithmThreshold);
|
||||
vllm::topKPerRowPrefill<kNumThreadsPerBlock, false>
|
||||
<<<numInsertionBlocks, kNumThreadsPerBlock, topK * sizeof(int32_t),
|
||||
stream>>>(logits.data_ptr<float>(), rowStarts.data_ptr<int>(),
|
||||
rowEnds.data_ptr<int>(), indices.data_ptr<int>(),
|
||||
static_cast<int>(stride0), static_cast<int>(stride1),
|
||||
static_cast<int>(topK), 0);
|
||||
|
||||
void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
|
||||
const torch::Tensor& rowEnds, torch::Tensor& indices,
|
||||
int64_t numRows, int64_t stride0, int64_t stride1) {
|
||||
// Compute the results on the device.
|
||||
constexpr int kNumThreadsPerBlock = 512;
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
vllm::topKPerRow<kNumThreadsPerBlock>
|
||||
<<<numRows, kNumThreadsPerBlock, 0, stream>>>(
|
||||
logits.data_ptr<float>(), rowStarts.data_ptr<int>(),
|
||||
rowEnds.data_ptr<int>(), indices.data_ptr<int>(),
|
||||
static_cast<int>(stride0), static_cast<int>(stride1));
|
||||
if (numRows > kSortingAlgorithmThreshold) {
|
||||
int numRadixBlocks = numRows - kSortingAlgorithmThreshold;
|
||||
vllm::topKPerRowPrefill<kNumThreadsPerBlock, true>
|
||||
<<<numRadixBlocks, kNumThreadsPerBlock, topK * sizeof(int32_t),
|
||||
stream>>>(logits.data_ptr<float>(), rowStarts.data_ptr<int>(),
|
||||
rowEnds.data_ptr<int>(), indices.data_ptr<int>(),
|
||||
static_cast<int>(stride0), static_cast<int>(stride1),
|
||||
static_cast<int>(topK), kSortingAlgorithmThreshold);
|
||||
}
|
||||
}
|
||||
|
||||
@ -179,15 +179,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
|
||||
// Optimized top-k per row operation
|
||||
ops.def(
|
||||
"top_k_per_row(Tensor logits, Tensor rowStarts, Tensor rowEnds, "
|
||||
"top_k_per_row_prefill(Tensor logits, Tensor rowStarts, Tensor rowEnds, "
|
||||
"Tensor! indices, int numRows, int stride0, "
|
||||
"int stride1) -> ()");
|
||||
ops.impl("top_k_per_row", torch::kCUDA, &top_k_per_row);
|
||||
"int stride1, int topK) -> ()");
|
||||
ops.impl("top_k_per_row_prefill", torch::kCUDA, &top_k_per_row_prefill);
|
||||
|
||||
ops.def(
|
||||
"top_k_per_row_decode(Tensor logits, int next_n, "
|
||||
"Tensor seq_lens, Tensor! indices, int numRows, "
|
||||
"int stride0, int stride1) -> ()");
|
||||
"Tensor seq_lens, Tensor! indices, "
|
||||
"int numRows, int stride0, int stride1, int topK) -> ()");
|
||||
ops.impl("top_k_per_row_decode", torch::kCUDA, &top_k_per_row_decode);
|
||||
|
||||
// Layernorm-quant
|
||||
@ -215,6 +215,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
ops.impl("rms_norm_dynamic_per_token_quant", torch::kCUDA,
|
||||
&rms_norm_dynamic_per_token_quant);
|
||||
|
||||
// Fused Layernorm + Block quant kernels
|
||||
ops.def(
|
||||
"rms_norm_per_block_quant(Tensor! result, Tensor input, "
|
||||
"Tensor weight, Tensor! scale, float epsilon, "
|
||||
"Tensor? scale_ub, Tensor!? residual, int group_size, "
|
||||
"bool is_scale_transposed) -> ()");
|
||||
ops.impl("rms_norm_per_block_quant", torch::kCUDA, &rms_norm_per_block_quant);
|
||||
|
||||
// Rotary embedding
|
||||
// Apply GPT-NeoX or GPT-J style rotary embedding to query and key.
|
||||
ops.def(
|
||||
@ -342,6 +350,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
ops.def("cutlass_encode_and_reorder_int4b(Tensor B) -> Tensor");
|
||||
// conditionally compiled so impl registration is in source file
|
||||
|
||||
// CUTLASS w4a8 grouped GEMM
|
||||
ops.def(
|
||||
"cutlass_w4a8_moe_mm("
|
||||
" Tensor! out_tensors,"
|
||||
" Tensor a_tensors,"
|
||||
" Tensor b_tensors,"
|
||||
" Tensor a_scales,"
|
||||
" Tensor b_scales,"
|
||||
" Tensor b_group_scales,"
|
||||
" int b_group_size,"
|
||||
" Tensor expert_offsets,"
|
||||
" Tensor problem_sizes,"
|
||||
" Tensor a_strides,"
|
||||
" Tensor b_strides,"
|
||||
" Tensor c_strides,"
|
||||
" Tensor group_scale_strides,"
|
||||
" str? maybe_schedule"
|
||||
") -> ()");
|
||||
ops.def(
|
||||
"cutlass_encode_and_reorder_int4b_grouped(Tensor b_tensors) -> (Tensor, "
|
||||
"Tensor)");
|
||||
// conditionally compiled so impl registration is in source file
|
||||
|
||||
#endif
|
||||
|
||||
// Dequantization for GGML.
|
||||
@ -458,7 +489,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
" Tensor! problem_sizes1, "
|
||||
" Tensor! problem_sizes2, "
|
||||
" int num_experts, int n, int k, "
|
||||
" Tensor? blockscale_offsets) -> ()");
|
||||
" Tensor? blockscale_offsets, "
|
||||
" bool? force_swap_ab) -> ()");
|
||||
ops.impl("get_cutlass_moe_mm_problem_sizes", torch::kCUDA,
|
||||
&get_cutlass_moe_mm_problem_sizes);
|
||||
|
||||
@ -617,6 +649,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
ops.impl("per_token_group_fp8_quant", torch::kCUDA,
|
||||
&per_token_group_quant_fp8);
|
||||
|
||||
// Compute per-token-group 8-bit quantized tensor and UE8M0-packed,
|
||||
// TMA-aligned scales for DeepGEMM.
|
||||
ops.def(
|
||||
"per_token_group_fp8_quant_packed(Tensor input, Tensor! output_q, "
|
||||
"Tensor! output_s_packed, int group_size, float eps, float fp8_min, "
|
||||
"float fp8_max) -> ()");
|
||||
ops.impl("per_token_group_fp8_quant_packed", torch::kCUDA,
|
||||
&per_token_group_quant_8bit_packed);
|
||||
|
||||
// Compute per-token-group INT8 quantized tensor and scaling factor.
|
||||
ops.def(
|
||||
"per_token_group_quant_int8(Tensor input, Tensor! output_q, Tensor! "
|
||||
@ -713,6 +754,13 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
||||
"Tensor cu_seq_lens, int batch_size, Tensor? seq_starts) -> ()");
|
||||
cache_ops.impl("cp_gather_cache", torch::kCUDA, &cp_gather_cache);
|
||||
|
||||
cache_ops.def(
|
||||
"cp_gather_and_upconvert_fp8_kv_cache(Tensor src_cache, Tensor! dst, "
|
||||
"Tensor block_table, Tensor seq_lens, Tensor workspace_starts, int "
|
||||
"batch_size) -> ()");
|
||||
cache_ops.impl("cp_gather_and_upconvert_fp8_kv_cache", torch::kCUDA,
|
||||
&cp_gather_and_upconvert_fp8_kv_cache);
|
||||
|
||||
cache_ops.def(
|
||||
"indexer_k_quant_and_cache(Tensor k, Tensor! kv_cache, Tensor "
|
||||
"slot_mapping, "
|
||||
|
||||
@ -32,7 +32,7 @@ ARG DEADSNAKES_GPGKEY_URL
|
||||
|
||||
# The PyPA get-pip.py script is a self contained script+zip file, that provides
|
||||
# both the installer script and the pip base85-encoded zip archive. This allows
|
||||
# bootstrapping pip in environment where a dsitribution package does not exist.
|
||||
# bootstrapping pip in environment where a distribution package does not exist.
|
||||
#
|
||||
# By parameterizing the URL for get-pip.py installation script, we allow
|
||||
# third-party to use their own copy of the script stored in a private mirror.
|
||||
@ -73,15 +73,13 @@ ARG INSTALL_KV_CONNECTORS=false
|
||||
#################### BASE BUILD IMAGE ####################
|
||||
# prepare basic build environment
|
||||
FROM ${BUILD_BASE_IMAGE} AS base
|
||||
|
||||
ARG CUDA_VERSION
|
||||
ARG PYTHON_VERSION
|
||||
ARG TARGETPLATFORM
|
||||
ARG INSTALL_KV_CONNECTORS=false
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
ARG GET_PIP_URL
|
||||
|
||||
# Install system dependencies and uv, then create Python virtual environment
|
||||
# Install system dependencies including build tools
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
@ -107,32 +105,30 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& ln -s /opt/venv/bin/pip /usr/bin/pip \
|
||||
&& python3 --version && python3 -m pip --version
|
||||
|
||||
ARG PIP_INDEX_URL UV_INDEX_URL
|
||||
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
|
||||
ARG PYTORCH_CUDA_INDEX_BASE_URL
|
||||
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
|
||||
|
||||
# Activate virtual environment and add uv to PATH
|
||||
ENV PATH="/opt/venv/bin:/root/.local/bin:$PATH"
|
||||
ENV VIRTUAL_ENV="/opt/venv"
|
||||
|
||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||
# Environment for uv
|
||||
ENV UV_HTTP_TIMEOUT=500
|
||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
# Use copy mode to avoid hardlink failures with Docker cache mounts
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
RUN <<EOF
|
||||
gcc --version
|
||||
EOF
|
||||
# Verify GCC version
|
||||
RUN gcc --version
|
||||
|
||||
# Workaround for https://github.com/openai/triton/issues/2507 and
|
||||
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
|
||||
# this won't be needed for future versions of this docker image
|
||||
# or future versions of triton.
|
||||
# Workaround for triton/pytorch issues
|
||||
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
|
||||
|
||||
# ============================================================
|
||||
# SLOW-CHANGING DEPENDENCIES BELOW
|
||||
# These are the expensive layers that we want to cache
|
||||
# ============================================================
|
||||
|
||||
# Install PyTorch and core CUDA dependencies
|
||||
# This is ~2GB and rarely changes
|
||||
ARG PYTORCH_CUDA_INDEX_BASE_URL
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
# install build and runtime dependencies
|
||||
@ -142,13 +138,12 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
|
||||
# cuda arch list used by torch
|
||||
# can be useful for both `dev` and `test`
|
||||
# explicitly set the list to avoid issues with torch 2.2
|
||||
# see https://github.com/pytorch/pytorch/pull/123243
|
||||
# CUDA arch list used by torch
|
||||
# Explicitly set the list to avoid issues with torch 2.2
|
||||
# See https://github.com/pytorch/pytorch/pull/123243
|
||||
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
|
||||
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
|
||||
#################### BASE BUILD IMAGE ####################
|
||||
#################### BUILD BASE IMAGE ####################
|
||||
|
||||
#################### CSRC BUILD IMAGE ####################
|
||||
FROM base AS csrc-build
|
||||
@ -196,6 +191,7 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0
|
||||
|
||||
# Flag to control whether to use pre-built vLLM wheels
|
||||
ARG VLLM_USE_PRECOMPILED=""
|
||||
ARG VLLM_MERGE_BASE_COMMIT=""
|
||||
ARG VLLM_MAIN_CUDA_VERSION=""
|
||||
|
||||
# Use dummy version for csrc-build wheel (only .so files are extracted, version doesn't matter)
|
||||
@ -216,6 +212,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
&& export SCCACHE_IDLE_TIMEOUT=0 \
|
||||
&& export CMAKE_BUILD_TYPE=Release \
|
||||
&& export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
|
||||
&& export VLLM_PRECOMPILED_WHEEL_COMMIT="${VLLM_MERGE_BASE_COMMIT}" \
|
||||
&& export VLLM_MAIN_CUDA_VERSION="${VLLM_MAIN_CUDA_VERSION}" \
|
||||
&& export VLLM_DOCKER_BUILD_CONTEXT=1 \
|
||||
&& sccache --show-stats \
|
||||
@ -233,11 +230,54 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
rm -rf .deps && \
|
||||
mkdir -p .deps && \
|
||||
export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
|
||||
export VLLM_PRECOMPILED_WHEEL_COMMIT="${VLLM_MERGE_BASE_COMMIT}" && \
|
||||
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
|
||||
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
|
||||
fi
|
||||
#################### CSRC BUILD IMAGE ####################
|
||||
|
||||
#################### EXTENSIONS BUILD IMAGE ####################
|
||||
# Build DeepGEMM, pplx-kernels, DeepEP - runs in PARALLEL with csrc-build
|
||||
# This stage is independent and doesn't affect csrc cache
|
||||
FROM base AS extensions-build
|
||||
ARG CUDA_VERSION
|
||||
|
||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||
ENV UV_HTTP_TIMEOUT=500
|
||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
# Build DeepGEMM wheel
|
||||
ARG DEEPGEMM_GIT_REF
|
||||
COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
mkdir -p /tmp/deepgemm/dist && \
|
||||
VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh \
|
||||
--cuda-version "${CUDA_VERSION}" \
|
||||
${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} \
|
||||
--wheel-dir /tmp/deepgemm/dist || \
|
||||
echo "DeepGEMM build skipped (CUDA version requirement not met)"
|
||||
|
||||
# Ensure the wheel dir exists so COPY won't fail when DeepGEMM is skipped
|
||||
RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
|
||||
|
||||
# Build pplx-kernels and DeepEP wheels
|
||||
COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
|
||||
ARG PPLX_COMMIT_HASH
|
||||
ARG DEEPEP_COMMIT_HASH
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
mkdir -p /tmp/ep_kernels_workspace/dist && \
|
||||
export TORCH_CUDA_ARCH_LIST='9.0a 10.0a' && \
|
||||
/tmp/install_python_libraries.sh \
|
||||
--workspace /tmp/ep_kernels_workspace \
|
||||
--mode wheel \
|
||||
${PPLX_COMMIT_HASH:+--pplx-ref "$PPLX_COMMIT_HASH"} \
|
||||
${DEEPEP_COMMIT_HASH:+--deepep-ref "$DEEPEP_COMMIT_HASH"} && \
|
||||
find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete
|
||||
#################### EXTENSIONS BUILD IMAGE ####################
|
||||
|
||||
#################### WHEEL BUILD IMAGE ####################
|
||||
FROM base AS build
|
||||
ARG TARGETPLATFORM
|
||||
@ -262,6 +302,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
# Copy pre-built csrc wheel directly
|
||||
COPY --from=csrc-build /workspace/dist /precompiled-wheels
|
||||
|
||||
COPY . .
|
||||
@ -283,27 +324,9 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
fi && \
|
||||
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
|
||||
|
||||
# Install DeepGEMM from source
|
||||
ARG DEEPGEMM_GIT_REF
|
||||
COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} --wheel-dir /tmp/deepgemm/dist
|
||||
|
||||
# Ensure the wheel dir exists so later-stage COPY won't fail when DeepGEMM is skipped
|
||||
RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
|
||||
|
||||
COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
|
||||
# Install EP kernels(pplx-kernels and DeepEP)
|
||||
ARG PPLX_COMMIT_HASH
|
||||
ARG DEEPEP_COMMIT_HASH
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
export TORCH_CUDA_ARCH_LIST='9.0a 10.0a' && \
|
||||
/tmp/install_python_libraries.sh \
|
||||
--workspace /tmp/ep_kernels_workspace \
|
||||
--mode wheel \
|
||||
${PPLX_COMMIT_HASH:+--pplx-ref "$PPLX_COMMIT_HASH"} \
|
||||
${DEEPEP_COMMIT_HASH:+--deepep-ref "$DEEPEP_COMMIT_HASH"} && \
|
||||
find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete
|
||||
# Copy extension wheels from extensions-build stage for later use
|
||||
COPY --from=extensions-build /tmp/deepgemm/dist /tmp/deepgemm/dist
|
||||
COPY --from=extensions-build /tmp/ep_kernels_workspace/dist /tmp/ep_kernels_workspace/dist
|
||||
|
||||
# Check the size of the wheel if RUN_WHEEL_CHECK is true
|
||||
COPY .buildkite/check-wheel-size.py check-wheel-size.py
|
||||
@ -341,32 +364,25 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
#################### DEV IMAGE ####################
|
||||
|
||||
#################### vLLM installation IMAGE ####################
|
||||
# image with vLLM installed
|
||||
FROM ${FINAL_BASE_IMAGE} AS vllm-base
|
||||
|
||||
ARG CUDA_VERSION
|
||||
ARG PYTHON_VERSION
|
||||
ARG INSTALL_KV_CONNECTORS=false
|
||||
WORKDIR /vllm-workspace
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ARG TARGETPLATFORM
|
||||
|
||||
# TODO (huydhn): There is no prebuilt gdrcopy package on 12.9 at the moment
|
||||
ARG GDRCOPY_CUDA_VERSION=12.8
|
||||
# Keep in line with FINAL_BASE_IMAGE
|
||||
ARG GDRCOPY_OS_VERSION=Ubuntu22_04
|
||||
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
|
||||
ARG DEADSNAKES_MIRROR_URL
|
||||
ARG DEADSNAKES_GPGKEY_URL
|
||||
ARG GET_PIP_URL
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
WORKDIR /vllm-workspace
|
||||
|
||||
|
||||
# Python version string for paths (e.g., "312" for 3.12)
|
||||
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
|
||||
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
|
||||
|
||||
# Install Python and other dependencies
|
||||
# Install Python and system dependencies
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
@ -405,63 +421,104 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
|
||||
&& python3 --version && python3 -m pip --version
|
||||
|
||||
# Install CUDA development tools and build essentials for runtime JIT compilation
|
||||
# Install CUDA development tools for runtime JIT compilation
|
||||
# (FlashInfer, DeepGEMM, EP kernels all require compilation at runtime)
|
||||
RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && \
|
||||
apt-get update -y && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
cuda-nvcc-${CUDA_VERSION_DASH} \
|
||||
cuda-cudart-${CUDA_VERSION_DASH} \
|
||||
cuda-nvrtc-${CUDA_VERSION_DASH} \
|
||||
cuda-cuobjdump-${CUDA_VERSION_DASH} \
|
||||
# https://github.com/vllm-project/vllm/issues/29590
|
||||
libcurand-dev-${CUDA_VERSION_DASH} \
|
||||
libcublas-${CUDA_VERSION_DASH} \
|
||||
# Fixes nccl_allocator requiring nccl.h at runtime
|
||||
# https://github.com/vllm-project/vllm/blob/1336a1ea244fa8bfd7e72751cabbdb5b68a0c11a/vllm/distributed/device_communicators/pynccl_allocator.py#L22
|
||||
libnccl-dev && \
|
||||
cuda-nvcc-${CUDA_VERSION_DASH} \
|
||||
cuda-cudart-${CUDA_VERSION_DASH} \
|
||||
cuda-nvrtc-${CUDA_VERSION_DASH} \
|
||||
cuda-cuobjdump-${CUDA_VERSION_DASH} \
|
||||
libcurand-dev-${CUDA_VERSION_DASH} \
|
||||
libcublas-${CUDA_VERSION_DASH} \
|
||||
# Fixes nccl_allocator requiring nccl.h at runtime
|
||||
# https://github.com/vllm-project/vllm/blob/1336a1ea244fa8bfd7e72751cabbdb5b68a0c11a/vllm/distributed/device_communicators/pynccl_allocator.py#L22
|
||||
libnccl-dev && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install uv for faster pip installs
|
||||
RUN python3 -m pip install uv
|
||||
|
||||
# Environment for uv
|
||||
ENV UV_HTTP_TIMEOUT=500
|
||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
# Workaround for triton/pytorch issues
|
||||
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
|
||||
|
||||
# ============================================================
|
||||
# SLOW-CHANGING DEPENDENCIES BELOW
|
||||
# These are the expensive layers that we want to cache
|
||||
# ============================================================
|
||||
|
||||
# Install PyTorch and core CUDA dependencies
|
||||
# This is ~2GB and rarely changes
|
||||
ARG PYTORCH_CUDA_INDEX_BASE_URL
|
||||
COPY requirements/common.txt /tmp/common.txt
|
||||
COPY requirements/cuda.txt /tmp/requirements-cuda.txt
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r /tmp/requirements-cuda.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') && \
|
||||
rm /tmp/requirements-cuda.txt /tmp/common.txt
|
||||
|
||||
# Install FlashInfer pre-compiled kernel cache and binaries
|
||||
# This is ~1.1GB and only changes when FlashInfer version bumps
|
||||
# https://docs.flashinfer.ai/installation.html
|
||||
ARG FLASHINFER_VERSION=0.5.3
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system flashinfer-cubin==${FLASHINFER_VERSION} \
|
||||
&& uv pip install --system flashinfer-jit-cache==${FLASHINFER_VERSION} \
|
||||
--extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
|
||||
&& flashinfer show-config
|
||||
|
||||
# ============================================================
|
||||
# OPENAI API SERVER DEPENDENCIES
|
||||
# Pre-install these to avoid reinstalling on every vLLM wheel rebuild
|
||||
# ============================================================
|
||||
|
||||
# Install gdrcopy (saves ~6s per build)
|
||||
# TODO (huydhn): There is no prebuilt gdrcopy package on 12.9 at the moment
|
||||
ARG GDRCOPY_CUDA_VERSION=12.8
|
||||
ARG GDRCOPY_OS_VERSION=Ubuntu22_04
|
||||
ARG TARGETPLATFORM
|
||||
COPY tools/install_gdrcopy.sh /tmp/install_gdrcopy.sh
|
||||
RUN set -eux; \
|
||||
case "${TARGETPLATFORM}" in \
|
||||
linux/arm64) UUARCH="aarch64" ;; \
|
||||
linux/amd64) UUARCH="x64" ;; \
|
||||
*) echo "Unsupported TARGETPLATFORM: ${TARGETPLATFORM}" >&2; exit 1 ;; \
|
||||
esac; \
|
||||
/tmp/install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}" && \
|
||||
rm /tmp/install_gdrcopy.sh
|
||||
|
||||
# Install vllm-openai dependencies (saves ~2.6s per build)
|
||||
# These are stable packages that don't depend on vLLM itself
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
||||
BITSANDBYTES_VERSION="0.42.0"; \
|
||||
else \
|
||||
BITSANDBYTES_VERSION="0.46.1"; \
|
||||
fi; \
|
||||
uv pip install --system accelerate hf_transfer modelscope \
|
||||
"bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.3'
|
||||
|
||||
# ============================================================
|
||||
# VLLM INSTALLATION (depends on build stage)
|
||||
# ============================================================
|
||||
|
||||
ARG PIP_INDEX_URL UV_INDEX_URL
|
||||
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
|
||||
ARG PYTORCH_CUDA_INDEX_BASE_URL
|
||||
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
|
||||
|
||||
# Install uv for faster pip installs
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
python3 -m pip install uv
|
||||
|
||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||
ENV UV_HTTP_TIMEOUT=500
|
||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
# Use copy mode to avoid hardlink failures with Docker cache mounts
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
# Workaround for https://github.com/openai/triton/issues/2507 and
|
||||
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
|
||||
# this won't be needed for future versions of this docker image
|
||||
# or future versions of triton.
|
||||
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
|
||||
|
||||
# Install vllm wheel first, so that torch etc will be installed.
|
||||
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system dist/*.whl --verbose \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
|
||||
# Install FlashInfer pre-compiled kernel cache and binaries
|
||||
# https://docs.flashinfer.ai/installation.html
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system flashinfer-cubin==0.5.3 \
|
||||
&& uv pip install --system flashinfer-jit-cache==0.5.3 \
|
||||
--extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
|
||||
&& flashinfer show-config
|
||||
|
||||
COPY examples examples
|
||||
COPY benchmarks benchmarks
|
||||
COPY ./vllm/collect_env.py .
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
. /etc/environment && \
|
||||
uv pip list
|
||||
@ -475,7 +532,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
echo "No DeepGEMM wheels to install; skipping."; \
|
||||
fi'
|
||||
|
||||
# Pytorch now installs NVSHMEM, setting LD_LIBRARY_PATH (https://github.com/pytorch/pytorch/blob/d38164a545b4a4e4e0cf73ce67173f70574890b6/.ci/manywheel/build_cuda.sh#L141C14-L141C36)
|
||||
# Pytorch now installs NVSHMEM, setting LD_LIBRARY_PATH
|
||||
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
|
||||
|
||||
# Install EP kernels wheels (pplx-kernels and DeepEP) that have been built in the `build` stage
|
||||
@ -484,23 +541,17 @@ RUN --mount=type=bind,from=build,src=/tmp/ep_kernels_workspace/dist,target=/vllm
|
||||
uv pip install --system ep_kernels/dist/*.whl --verbose \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
|
||||
RUN --mount=type=bind,source=tools/install_gdrcopy.sh,target=/tmp/install_gdrcopy.sh,ro \
|
||||
set -eux; \
|
||||
case "${TARGETPLATFORM}" in \
|
||||
linux/arm64) UUARCH="aarch64" ;; \
|
||||
linux/amd64) UUARCH="x64" ;; \
|
||||
*) echo "Unsupported TARGETPLATFORM: ${TARGETPLATFORM}" >&2; exit 1 ;; \
|
||||
esac; \
|
||||
/tmp/install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}"
|
||||
|
||||
# CUDA image changed from /usr/local/nvidia to /usr/local/cuda in 12.8 but will
|
||||
# return to /usr/local/nvidia in 13.0 to allow container providers to mount drivers
|
||||
# consistently from the host (see https://github.com/vllm-project/vllm/issues/18859).
|
||||
# Until then, add /usr/local/nvidia/lib64 before the image cuda path to allow override.
|
||||
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}
|
||||
|
||||
# Copy examples and benchmarks at the end to minimize cache invalidation
|
||||
COPY examples examples
|
||||
COPY benchmarks benchmarks
|
||||
COPY ./vllm/collect_env.py .
|
||||
#################### vLLM installation IMAGE ####################
|
||||
|
||||
#################### TEST IMAGE ####################
|
||||
# image to run unit testing suite
|
||||
# note that this uses vllm installed by `pip`
|
||||
@ -566,18 +617,12 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
|
||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||
ENV UV_HTTP_TIMEOUT=500
|
||||
|
||||
# install additional dependencies for openai api server
|
||||
# install kv_connectors if requested
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,source=requirements/kv_connectors.txt,target=/tmp/kv_connectors.txt,ro \
|
||||
if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \
|
||||
uv pip install --system -r /tmp/kv_connectors.txt; \
|
||||
fi; \
|
||||
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
||||
BITSANDBYTES_VERSION="0.42.0"; \
|
||||
else \
|
||||
BITSANDBYTES_VERSION="0.46.1"; \
|
||||
fi; \
|
||||
uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.0'
|
||||
fi
|
||||
|
||||
ENV VLLM_USAGE_SOURCE production-docker-image
|
||||
|
||||
|
||||
@ -76,6 +76,9 @@ RUN python3 -m pip install -e tests/vllm_test_utils
|
||||
ENV NIXL_VERSION=0.7.0
|
||||
RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
|
||||
|
||||
# The system PyJWT-2.7.0 interferes with some wheel behaviors; remove its dist-info to avoid conflicts
|
||||
RUN rm /usr/lib/python3/dist-packages/PyJWT-2.7.0.dist-info/ -rf
|
||||
|
||||
# remove torch bundled oneccl to avoid conflicts
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
pip uninstall oneccl oneccl-devel -y
|
||||
|
||||
@ -59,6 +59,7 @@ nav:
|
||||
- CLI Reference: cli
|
||||
- Community:
|
||||
- community/*
|
||||
- Governance: governance
|
||||
- Blog: https://blog.vllm.ai
|
||||
- Forum: https://discuss.vllm.ai
|
||||
- Slack: https://slack.vllm.ai
|
||||
|
||||
@ -15,6 +15,7 @@ API documentation for vLLM's configuration classes.
|
||||
- [vllm.config.MultiModalConfig][]
|
||||
- [vllm.config.PoolerConfig][]
|
||||
- [vllm.config.StructuredOutputsConfig][]
|
||||
- [vllm.config.ProfilerConfig][]
|
||||
- [vllm.config.ObservabilityConfig][]
|
||||
- [vllm.config.KVTransferConfig][]
|
||||
- [vllm.config.CompilationConfig][]
|
||||
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 174 KiB After Width: | Height: | Size: 205 KiB |
@ -84,7 +84,7 @@ Total input tokens: 1369
|
||||
Total generated tokens: 2212
|
||||
Request throughput (req/s): 1.73
|
||||
Output token throughput (tok/s): 382.89
|
||||
Total Token throughput (tok/s): 619.85
|
||||
Total token throughput (tok/s): 619.85
|
||||
---------------Time to First Token----------------
|
||||
Mean TTFT (ms): 71.54
|
||||
Median TTFT (ms): 73.88
|
||||
|
||||
@ -24,11 +24,13 @@ Compute Resources:
|
||||
- Databricks
|
||||
- DeepInfra
|
||||
- Google Cloud
|
||||
- IBM
|
||||
- Intel
|
||||
- Lambda Lab
|
||||
- Nebius
|
||||
- Novita AI
|
||||
- NVIDIA
|
||||
- Red Hat
|
||||
- Replicate
|
||||
- Roblox
|
||||
- RunPod
|
||||
|
||||
@ -7,7 +7,7 @@ This guide covers optimization strategies and performance tuning for vLLM V1.
|
||||
|
||||
## Preemption
|
||||
|
||||
Due to the auto-regressive nature of transformer architecture, there are times when KV cache space is insufficient to handle all batched requests.
|
||||
Due to the autoregressive nature of the transformer architecture, there are times when KV cache space is insufficient to handle all batched requests.
|
||||
In such cases, vLLM can preempt requests to free up KV cache space for other requests. Preempted requests are recomputed when sufficient KV cache space becomes
|
||||
available again. When this occurs, you may see the following warning:
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user