Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2026-05-18 00:46:59 +08:00)

Commit a8afa2d5ce: Merge branch 'main' into tpopp/fix_aiter_triton_rope
.buildkite/ci_config.yaml — 24 lines — Normal file
@@ -0,0 +1,24 @@
name: vllm_ci
job_dirs:
  - ".buildkite/test_areas"
  - ".buildkite/image_build"
run_all_patterns:
  - "docker/Dockerfile"
  - "CMakeLists.txt"
  - "requirements/common.txt"
  - "requirements/cuda.txt"
  - "requirements/build.txt"
  - "requirements/test.txt"
  - "setup.py"
  - "csrc/"
  - "cmake/"
run_all_exclude_patterns:
  - "docker/Dockerfile."
  - "csrc/cpu/"
  - "csrc/rocm/"
  - "cmake/hipify.py"
  - "cmake/cpu_extension.cmake"
registries: public.ecr.aws/q9t5s3a7
repositories:
  main: "vllm-ci-postmerge-repo"
  premerge: "vllm-ci-test-repo"
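Aside: this commit only adds the pattern config; the matching logic lives elsewhere in the CI tooling. A minimal Python sketch (not from this commit; the helper name and the prefix-matching rule are assumptions) of how run_all_patterns and run_all_exclude_patterns might gate a full pipeline run:

def needs_full_run(changed_files, patterns, exclude_patterns):
    def hit(path, pats):
        # Assumed prefix matching: "csrc/" covers the whole tree and
        # "docker/Dockerfile." covers Dockerfile.cpu, Dockerfile.rocm, ...
        return any(path.startswith(p) for p in pats)

    return any(hit(f, patterns) and not hit(f, exclude_patterns) for f in changed_files)

print(needs_full_run(["csrc/cpu/a.cc"], ["csrc/"], ["csrc/cpu/"]))  # False: excluded subtree
print(needs_full_run(["csrc/ops.cu"], ["csrc/"], ["csrc/cpu/"]))    # True: triggers a full run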
@@ -1,46 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
import os

template = """<!DOCTYPE html>
<html>
<body>
<h1>Links for vLLM</h1/>
<a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
<a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
</body>
</html>
"""

parser = argparse.ArgumentParser()
parser.add_argument("--wheel", help="The wheel path.", required=True)
args = parser.parse_args()

filename = os.path.basename(args.wheel)

with open("index.html", "w") as f:
    print(f"Generated index.html for {args.wheel}")
    # sync the abi tag with .buildkite/scripts/upload-wheels.sh
    if "x86_64" in filename:
        x86_wheel = filename
        arm_wheel = filename.replace("x86_64", "aarch64").replace(
            "manylinux1", "manylinux2014"
        )
    elif "aarch64" in filename:
        x86_wheel = filename.replace("aarch64", "x86_64").replace(
            "manylinux2014", "manylinux1"
        )
        arm_wheel = filename
    else:
        raise ValueError(f"Unsupported wheel: {filename}")
    # cloudfront requires escaping the '+' character
    f.write(
        template.format(
            x86_wheel=x86_wheel,
            x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
            arm_wheel=arm_wheel,
            arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
        )
    )
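The '+' escaping in the removed script is worth noting: wheel filenames with a local version segment contain '+', which the in-code comment says CloudFront requires escaping. A standalone illustration (the wheel name below is invented for the example):

name = "vllm-0.6.0+cu124-cp38-abi3-manylinux1_x86_64.whl"  # hypothetical filename
print(name.replace("+", "%2B"))
# vllm-0.6.0%2Bcu124-cp38-abi3-manylinux1_x86_64.whl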
.buildkite/image_build/image_build.sh — 56 lines — Executable file
@@ -0,0 +1,56 @@
#!/bin/bash
set -e

if [[ $# -lt 8 ]]; then
    echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
    exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
BRANCH=$4
VLLM_USE_PRECOMPILED=$5
VLLM_MERGE_BASE_COMMIT=$6
CACHE_FROM=$7
CACHE_TO=$8

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com

# docker buildx
docker buildx create --name vllm-builder --driver docker-container --use
docker buildx inspect --bootstrap
docker buildx ls

# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
    echo "Image not found, proceeding with build..."
else
    echo "Image found"
    exit 0
fi

if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
    merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
else
    merge_base_commit_build_args=""
fi

# build
docker buildx build --file docker/Dockerfile \
    --build-arg max_jobs=16 \
    --build-arg buildkite_commit=$BUILDKITE_COMMIT \
    --build-arg USE_SCCACHE=1 \
    --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
    --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
    --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
    ${merge_base_commit_build_args} \
    --cache-from type=registry,ref=${CACHE_FROM},mode=max \
    --cache-to type=registry,ref=${CACHE_TO},mode=max \
    --tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
    $( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
    --push \
    --target test \
    --progress plain .
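The skip-if-exists guard above relies on `docker manifest inspect` producing no output for a missing tag. The same check as a minimal Python sketch (not part of this commit; assumes the docker CLI is installed and already logged in to the registry):

import subprocess

def image_exists(ref: str) -> bool:
    # `docker manifest inspect` prints the manifest and exits 0 only when
    # the tag exists in the registry; otherwise it errors with empty stdout.
    result = subprocess.run(
        ["docker", "manifest", "inspect", ref],
        capture_output=True,
        text=True,
    )
    return result.returncode == 0 and bool(result.stdout.strip())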
.buildkite/image_build/image_build.yaml — 57 lines — Normal file
@@ -0,0 +1,57 @@
group: Abuild
steps:
  - label: ":docker: Build image"
    key: image-build
    depends_on: []
    commands:
      - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2

  - label: ":docker: Build CPU image"
    key: image-build-cpu
    depends_on: []
    commands:
      - .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2

  - label: ":docker: Build HPU image"
    soft_fail: true
    depends_on: []
    key: image-build-hpu
    commands:
      - .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2

  - label: ":docker: Build CPU arm64 image"
    key: cpu-arm64-image-build
    depends_on: []
    optional: true
    commands:
      - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2
.buildkite/image_build/image_build_cpu.sh — 36 lines — Executable file
@@ -0,0 +1,36 @@
#!/bin/bash
set -e

if [[ $# -lt 3 ]]; then
    echo "Usage: $0 <registry> <repo> <commit>"
    exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY

# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
    echo "Image not found, proceeding with build..."
else
    echo "Image found"
    exit 0
fi

# build
docker build --file docker/Dockerfile.cpu \
    --build-arg max_jobs=16 \
    --build-arg buildkite_commit=$BUILDKITE_COMMIT \
    --build-arg VLLM_CPU_AVX512BF16=true \
    --build-arg VLLM_CPU_AVX512VNNI=true \
    --build-arg VLLM_CPU_AMXBF16=true \
    --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
    --target vllm-test \
    --progress plain .

# push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
.buildkite/image_build/image_build_cpu_arm64.sh — 33 lines — Executable file
@@ -0,0 +1,33 @@
#!/bin/bash
set -e

if [[ $# -lt 3 ]]; then
    echo "Usage: $0 <registry> <repo> <commit>"
    exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY

# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
    echo "Image not found, proceeding with build..."
else
    echo "Image found"
    exit 0
fi

# build
docker build --file docker/Dockerfile.cpu \
    --build-arg max_jobs=16 \
    --build-arg buildkite_commit=$BUILDKITE_COMMIT \
    --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
    --target vllm-test \
    --progress plain .

# push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
.buildkite/image_build/image_build_hpu.sh — 34 lines — Executable file
@@ -0,0 +1,34 @@
#!/bin/bash
set -e

if [[ $# -lt 3 ]]; then
    echo "Usage: $0 <registry> <repo> <commit>"
    exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY

# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
    echo "Image not found, proceeding with build..."
else
    echo "Image found"
    exit 0
fi

# build
docker build \
    --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
    --build-arg max_jobs=16 \
    --build-arg buildkite_commit=$BUILDKITE_COMMIT \
    --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
    --progress plain \
    https://github.com/vllm-project/vllm-gaudi.git

# push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu
@@ -8,3 +8,4 @@ tasks:
     value: 0.80
 limit: 250 # will run on 250 * 14 subjects = 3500 samples
 num_fewshot: 5
+rtol: 0.05
.buildkite/lm-eval-harness/configs/models-large-rocm.txt — 1 line — Normal file
@@ -0,0 +1 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
@@ -9,11 +9,40 @@ pytest -s -v test_lm_eval_correctness.py \
     --tp-size=1
 """
 
+import os
+from contextlib import contextmanager
+
 import lm_eval
 import numpy as np
 import yaml
 
-RTOL = 0.08
+DEFAULT_RTOL = 0.08
+
+
+@contextmanager
+def scoped_env_vars(new_env: dict[str, str]):
+    if not new_env:
+        # Fast path: nothing to do
+        yield
+        return
+
+    old_values = {}
+    new_keys = []
+
+    try:
+        for key, value in new_env.items():
+            if key in os.environ:
+                old_values[key] = os.environ[key]
+            else:
+                new_keys.append(key)
+            os.environ[key] = str(value)
+        yield
+    finally:
+        # Restore / clean up
+        for key, value in old_values.items():
+            os.environ[key] = value
+        for key in new_keys:
+            os.environ.pop(key, None)
 
 
 def launch_lm_eval(eval_config, tp_size):
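Usage of the new scoped_env_vars helper, illustrated with a throwaway key (the VLLM_FOO name below is invented for the example):

import os

with scoped_env_vars({"VLLM_FOO": "1"}):
    assert os.environ["VLLM_FOO"] == "1"  # set only inside the block
assert "VLLM_FOO" not in os.environ      # removed (or restored) on exit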
@@ -32,23 +61,26 @@ def launch_lm_eval(eval_config, tp_size):
         f"trust_remote_code={trust_remote_code},"
         f"max_model_len={max_model_len},"
     )
-    results = lm_eval.simple_evaluate(
-        model=backend,
-        model_args=model_args,
-        tasks=[task["name"] for task in eval_config["tasks"]],
-        num_fewshot=eval_config["num_fewshot"],
-        limit=eval_config["limit"],
-        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
-        # text models. however, this is regressing measured strict-match for
-        # existing text models in CI, so only apply it for mm, or explicitly set
-        apply_chat_template=eval_config.get(
-            "apply_chat_template", backend == "vllm-vlm"
-        ),
-        fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
-        # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
-        gen_kwargs=eval_config.get("gen_kwargs"),
-        batch_size=batch_size,
-    )
+    env_vars = eval_config.get("env_vars", None)
+    with scoped_env_vars(env_vars):
+        results = lm_eval.simple_evaluate(
+            model=backend,
+            model_args=model_args,
+            tasks=[task["name"] for task in eval_config["tasks"]],
+            num_fewshot=eval_config["num_fewshot"],
+            limit=eval_config["limit"],
+            # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
+            # text models. however, this is regressing measured strict-match for
+            # existing text models in CI, so only apply it for mm, or explicitly set
+            apply_chat_template=eval_config.get(
+                "apply_chat_template", backend == "vllm-vlm"
+            ),
+            fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
+            # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
+            gen_kwargs=eval_config.get("gen_kwargs"),
+            batch_size=batch_size,
+        )
     return results
 
 
@@ -57,6 +89,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
 
     results = launch_lm_eval(eval_config, tp_size)
 
+    rtol = eval_config.get("rtol", DEFAULT_RTOL)
+
     success = True
     for task in eval_config["tasks"]:
         for metric in task["metrics"]:
@@ -64,8 +98,9 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
             measured_value = results["results"][task["name"]][metric["name"]]
             print(
                 f"{task['name']} | {metric['name']}: "
-                f"ground_truth={ground_truth} | measured={measured_value}"
+                f"ground_truth={ground_truth:.3f} | "
+                f"measured={measured_value:.3f} | rtol={rtol}"
             )
-            success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
+            success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
 
     assert success
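For reference on the tolerance semantics used above (the values here are illustrative, not from any config):

import numpy as np

# np.isclose(a, b, rtol=r) tests |a - b| <= atol + r * |b|, with atol=1e-8 by default.
print(np.isclose(0.80, 0.77, rtol=0.05))  # True:  0.03 <= 0.05 * 0.77
print(np.isclose(0.80, 0.70, rtol=0.05))  # False: 0.10 >  0.05 * 0.70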
@@ -15,6 +15,21 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
+  - label: "Build arm64 wheel - CUDA 13.0"
+    depends_on: ~
+    id: build-wheel-arm64-cuda-13-0
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      # NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
+      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+    env:
+      DOCKER_BUILDKIT: "1"
+
 # aarch64 build
 - label: "Build arm64 CPU wheel"
   depends_on: ~
@@ -25,7 +40,7 @@ steps:
   - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
   - "mkdir artifacts"
   - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-  - "bash .buildkite/scripts/upload-wheels.sh"
+  - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
   env:
     DOCKER_BUILDKIT: "1"
@@ -39,7 +54,7 @@ steps:
   - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
   - "mkdir artifacts"
   - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-  - "bash .buildkite/scripts/upload-wheels.sh"
+  - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31"
   env:
     DOCKER_BUILDKIT: "1"
@@ -52,7 +67,21 @@ steps:
   - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
   - "mkdir artifacts"
   - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-  - "bash .buildkite/scripts/upload-wheels.sh"
+  - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+  env:
+    DOCKER_BUILDKIT: "1"
+
+# x86 CPU wheel build
+- label: "Build x86 CPU wheel"
+  depends_on: ~
+  id: build-wheel-x86-cpu
+  agents:
+    queue: cpu_queue_postmerge
+  commands:
+  - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
+  - "mkdir artifacts"
+  - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+  - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
   env:
     DOCKER_BUILDKIT: "1"
 
@@ -7,18 +7,21 @@
 
 import argparse
 import json
-import re
 import sys
 from dataclasses import asdict, dataclass
+from datetime import datetime
 from pathlib import Path
 from typing import Any
 from urllib.parse import quote
 
+import regex as re
+
 if not sys.version_info >= (3, 12):
     raise RuntimeError("This script requires Python 3.12 or higher.")
 
 INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
 <html>
+<!-- {comment} -->
 <meta name="pypi:repository-version" content="1.0">
 <body>
 {items}
@@ -89,7 +92,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:
     )
 
 
-def generate_project_list(subdir_names: list[str]) -> str:
+def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
     """
     Generate project list HTML content linking to each project & variant sub-directory.
     """
@@ -97,11 +100,14 @@ def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
     for name in sorted(subdir_names):
         name = name.strip("/").strip(".")
         href_tags.append(f' <a href="{name}/">{name}/</a><br/>')
-    return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
+    return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
 
 
 def generate_package_index_and_metadata(
-    wheel_files: list[WheelFileInfo], wheel_base_dir: Path, index_base_dir: Path
+    wheel_files: list[WheelFileInfo],
+    wheel_base_dir: Path,
+    index_base_dir: Path,
+    comment: str = "",
 ) -> tuple[str, str]:
     """
     Generate package index HTML content for a specific package, linking to actual wheel files.
@@ -119,7 +125,7 @@ def generate_package_index_and_metadata(
         file_meta = asdict(file)
         file_meta["path"] = file_path_quoted
         metadata.append(file_meta)
-    index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
+    index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
     metadata_str = json.dumps(metadata, indent=2)
     return index_str, metadata_str
 
@@ -130,6 +136,7 @@ def generate_index_and_metadata(
     index_base_dir: Path,
     default_variant: str | None = None,
     alias_to_default: str | None = None,
+    comment: str = "",
 ):
     """
     Generate index for all wheel files.
@@ -140,6 +147,7 @@ def generate_index_and_metadata(
         index_base_dir (Path): Base directory to store index files.
         default_variant (str | None): The default variant name, if any.
         alias_to_default (str | None): Alias variant name for the default variant, if any.
+        comment (str | None): Optional comment to include in the generated HTML files.
 
     First, parse all wheel files to extract metadata.
     We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
@@ -233,6 +241,10 @@ def generate_index_and_metadata(
         variant_to_files[alias_to_default] = variant_to_files["default"].copy()
         print(f"Alias variant '{alias_to_default}' created for default variant.")
 
+    # Generate comment in HTML header
+    comment_str = f" ({comment})" if comment else ""
+    comment_tmpl = f"Generated on {datetime.now().isoformat()}{comment_str}"
+
     # Generate index for each variant
     subdir_names = set()
     for variant, files in variant_to_files.items():
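The net effect of threading comment_tmpl into the template is an HTML comment in the header of each generated index page; a standalone illustration with the template trimmed to the relevant lines (the commit string is made up):

from datetime import datetime

TEMPLATE = "<html>\n<!-- {comment} -->\n<body>\n{items}\n</body>\n</html>"
comment_tmpl = f"Generated on {datetime.now().isoformat()} (commit abc123)"
print(TEMPLATE.format(comment=comment_tmpl, items=' <a href="vllm/">vllm/</a><br/>'))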
@@ -252,7 +264,7 @@ def generate_index_and_metadata(
             subdir_names = subdir_names.union(packages)
         else:
             # generate project list for this variant directly
-            project_list_str = generate_project_list(sorted(packages))
+            project_list_str = generate_project_list(sorted(packages), comment_tmpl)
             with open(variant_dir / "index.html", "w") as f:
                 f.write(project_list_str)
 
@@ -262,7 +274,7 @@ def generate_index_and_metadata(
             package_dir = variant_dir / package
             package_dir.mkdir(parents=True, exist_ok=True)
             index_str, metadata_str = generate_package_index_and_metadata(
-                package_files, wheel_base_dir, package_dir
+                package_files, wheel_base_dir, package_dir, comment
             )
             with open(package_dir / "index.html", "w") as f:
                 f.write(index_str)
@@ -270,7 +282,7 @@ def generate_index_and_metadata(
                 f.write(metadata_str)
 
     # Generate top-level project list index
-    project_list_str = generate_project_list(sorted(subdir_names))
+    project_list_str = generate_project_list(sorted(subdir_names), comment_tmpl)
     with open(index_base_dir / "index.html", "w") as f:
         f.write(project_list_str)
 
@@ -282,6 +294,7 @@ if __name__ == "__main__":
     --current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
     --output-dir <output_directory> : directory to store generated index files
     --alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
+    --comment <comment_string> : (optional) comment string to include in generated HTML files
     """
 
     parser = argparse.ArgumentParser(
@@ -311,6 +324,12 @@ if __name__ == "__main__":
         default=None,
         help="Alias variant name for the default variant",
     )
+    parser.add_argument(
+        "--comment",
+        type=str,
+        default="",
+        help="Optional comment string to include in generated HTML files",
+    )
 
     args = parser.parse_args()
 
@@ -353,6 +372,17 @@ if __name__ == "__main__":
 
     print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
 
+    # keep only "official" files for a non-nightly version (specified by cli args)
+    PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
+    if PY_VERSION_RE.match(version):
+        # upload-wheels.sh ensures no "dev" is in args.version
+        wheel_files = list(
+            filter(lambda x: version in x and "dev" not in x, wheel_files)
+        )
+        print(f"Non-nightly version detected, wheel files used: {wheel_files}")
+    else:
+        print("Nightly version detected, keeping all wheel files.")
+
     # Generate index and metadata, assuming wheels and indices are stored as:
     # s3://vllm-wheels/{version}/<wheel files>
     # s3://vllm-wheels/<anything>/<index files>
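Behavior of the version gate above for a few representative strings (the example versions are mine; per the in-code comment, "dev" versions never reach this path, which is why the extra "dev" filter is applied to the wheel filenames instead):

import re  # the committed file imports "regex as re"; stdlib re behaves identically here

PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
print(bool(PY_VERSION_RE.match("0.11.0")))    # True  -> release: keep only matching wheels
print(bool(PY_VERSION_RE.match("0.11.0rc1"))) # True  -> pre-releases pass the gate too
print(bool(PY_VERSION_RE.match("nightly")))   # False -> nightly: keep all wheel files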
@@ -365,5 +395,6 @@ if __name__ == "__main__":
         index_base_dir=index_base_dir,
         default_variant=None,
         alias_to_default=args.alias_to_default,
+        comment=args.comment.strip(),
     )
     print(f"Successfully generated index and metadata in {output_dir}")
@@ -36,11 +36,17 @@ function cpu_tests() {
     set -e
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
 
+  # Run model tests
+  docker exec cpu-test bash -c "
+    set -e
+    pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model"
+
   # Run kernel tests
   docker exec cpu-test bash -c "
     set -e
     pytest -x -v -s tests/kernels/test_onednn.py
-    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"
+    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
+    pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic"
 
   # basic online serving
   docker exec cpu-test bash -c '
@@ -74,6 +74,7 @@ FROM ${BASE_IMAGE_NAME}
 
 # Define environments
 ENV DEBIAN_FRONTEND=noninteractive
+ENV SOC_VERSION="ascend910b1"
 
 RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
     pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
@@ -38,6 +38,7 @@ docker run \
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+    python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
     VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
     cd tests
     pytest -v -s v1/core
@@ -46,6 +47,6 @@ docker run \
     pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
     pytest -v -s v1/structured_output
     pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
-    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
+    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
     pytest -v -s v1/test_serial_utils.py
     '
@@ -12,6 +12,11 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
 PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
 PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
 
+if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
+  echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
+  exit 0
+fi
+
 echo "Setting up Prime-RL integration test environment..."
 
 # Clean up any existing Prime-RL directory
@@ -1,73 +0,0 @@
#!/usr/bin/env bash
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8030}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

wait_for_server() {
  local port=$1
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
      sleep 1
    done'
}

MODEL="deepseek-ai/DeepSeek-V2-lite"

# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi

cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" 2>/dev/null || true
    for _ in {1..20}; do
      kill -0 "${SERVER_PID}" 2>/dev/null || break
      sleep 0.5
    done
    kill -9 "${SERVER_PID}" 2>/dev/null || true
  fi
}
trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \
    --data-parallel-size 2 \
    --enable-expert-parallel \
    --enable-eplb \
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT

  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}_async_eplb.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
done
@@ -50,7 +50,6 @@ for BACK in "${BACKENDS[@]}"; do
     --data-parallel-size 2 \
     --enable-expert-parallel \
     --enable-eplb \
-    --eplb-config '{"window_size":200,"step_interval":600}' \
     --trust-remote-code \
     --max-model-len 2048 \
     --port $PORT &
@@ -34,9 +34,10 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
 fi
 wheel="${wheel_files[0]}"
 
-# current build image uses ubuntu 20.04, which corresponds to manylinux_2_31
+# default build image uses ubuntu 20.04, which corresponds to manylinux_2_31
+# we also accept params as manylinux tag
 # refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
-manylinux_version="manylinux_2_31"
+manylinux_version="${1:-manylinux_2_31}"
 
 # Rename 'linux' to the appropriate manylinux version in the wheel filename
 if [[ "$wheel" != *"linux"* ]]; then
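The rename that manylinux_version feeds (per the script's own comment) turns a plain 'linux' platform tag into the requested manylinux tag; a rough Python sketch of the idea (the filename and the simple replace are illustrative, not the script's exact logic):

manylinux_version = "manylinux_2_35"  # now taken from $1, defaulting to manylinux_2_31
wheel = "vllm-0.11.0-cp38-abi3-linux_x86_64.whl"  # hypothetical filename
print(wheel.replace("linux", manylinux_version))
# vllm-0.11.0-cp38-abi3-manylinux_2_35_x86_64.whl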
@@ -81,7 +82,10 @@ else
     alias_arg=""
 fi
 
-$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg
+# HACK: we do not need regex module here, but it is required by pre-commit hook
+# To avoid any external dependency, we simply replace it back to the stdlib re module
+sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
+$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
 
 # copy indices to /<commit>/ unconditionally
 echo "Uploading indices to $S3_COMMIT_PREFIX"
@@ -93,8 +97,11 @@ if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]];
     aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
 fi
 
-# copy to /<pure_version>/ only if it does not have "dev" in the version
+# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
 if [[ "$version" != *"dev"* ]]; then
-    echo "Uploading indices to overwrite /$pure_version/"
+    echo "Re-generating indices for /$pure_version/"
+    rm -rf "$INDICES_OUTPUT_DIR/*"
+    mkdir -p "$INDICES_OUTPUT_DIR"
+    $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
     aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
 fi
@@ -61,8 +61,8 @@ steps:
   - pytest -v -s -m 'not cpu_test' multimodal
   - pytest -v -s utils_
 
-- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
-  timeout_in_minutes: 20
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
+  timeout_in_minutes: 30
   mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
   agent_pool: mi325_1
   grade: Blocking
@@ -73,6 +73,7 @@ steps:
   - tests/multimodal
   - tests/standalone_tests/lazy_imports.py
   - tests/tokenizers_
+  - tests/tool_parsers
   - tests/transformers_utils
   - tests/config
   no_gpu: true
@@ -82,6 +83,7 @@ steps:
   - pytest -v -s test_outputs.py
   - pytest -v -s -m 'cpu_test' multimodal
   - pytest -v -s tokenizers_
+  - pytest -v -s tool_parsers
   - pytest -v -s transformers_utils
   - pytest -v -s config
 
@@ -326,10 +328,10 @@ steps:
   commands:
   - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
 
-- label: V1 Test e2e + engine # 30min
-  timeout_in_minutes: 45
+- label: V1 Test e2e + engine # 65min
+  timeout_in_minutes: 90
   mirror_hardwares: [amdexperimental]
-  agent_pool: mi325_1
+  agent_pool: mi325_4
   # grade: Blocking
   source_file_dependencies:
   - vllm/
@@ -398,7 +400,8 @@ steps:
   timeout_in_minutes: 25
   gpu: h100
   source_file_dependencies:
-  - vllm/
+  - vllm/v1/attention
+  - vllm/model_executor/layers
   - tests/v1/determinism/
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -434,29 +437,34 @@ steps:
 
 - label: Examples Test # 30min
   timeout_in_minutes: 45
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   working_dir: "/vllm-workspace/examples"
   source_file_dependencies:
   - vllm/entrypoints
+  - vllm/multimodal
   - examples/
   commands:
   - pip install tensorizer # for tensorizer test
+  # for basic
+  - python3 offline_inference/basic/chat.py
   - python3 offline_inference/basic/generate.py --model facebook/opt-125m
   - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-  - python3 offline_inference/basic/chat.py
-  - python3 offline_inference/prefix_caching.py
-  - python3 offline_inference/llm_engine_example.py
-  - python3 offline_inference/audio_language.py --seed 0
-  - python3 offline_inference/vision_language.py --seed 0
-  - python3 offline_inference/vision_language_pooling.py --seed 0
-  - python3 offline_inference/vision_language_multi_image.py --seed 0
-  - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-  - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
   - python3 offline_inference/basic/classify.py
   - python3 offline_inference/basic/embed.py
   - python3 offline_inference/basic/score.py
+  # for multi-modal models
+  - python3 offline_inference/audio_language.py --seed 0
+  - python3 offline_inference/vision_language.py --seed 0
+  - python3 offline_inference/vision_language_multi_image.py --seed 0
+  - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+  # for pooling models
+  - python3 pooling/pooling/vision_language_pooling.py --seed 0
+  # for features demo
+  - python3 offline_inference/prefix_caching.py
+  - python3 offline_inference/llm_engine_example.py
+  - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
   # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
   - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@@ -718,14 +726,15 @@ steps:
   - uv pip install --system conch-triton-kernels
   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 
-- label: LM Eval Small Models # 15min
-  timeout_in_minutes: 20
-  mirror_hardwares: [amdexperimental, amdproduction]
+- label: LM Eval Small Models # 53min
+  timeout_in_minutes: 75
+  mirror_hardwares: [amdexperimental]
   agent_pool: mi325_1
   # grade: Blocking
   source_file_dependencies:
   - csrc/
   - vllm/model_executor/layers/quantization
+  autorun_on_main: true
   commands:
   - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
@@ -738,7 +747,7 @@ steps:
   - csrc/
   - vllm/entrypoints/openai/
   - vllm/model_executor/models/whisper.py
-  commands: # LMEval
+  commands: # LMEval+Transcription WER check
   # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
   - pytest -s entrypoints/openai/correctness/
 
@@ -752,19 +761,7 @@ steps:
   - vllm/
   - tests/tool_use
   commands:
-  - pytest -v -s -m 'not cpu_test' tool_use
-
-- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
-  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_1
-  # grade: Blocking
-  timeout_in_minutes: 10
-  source_file_dependencies:
-  - vllm/
-  - tests/tool_use
-  no_gpu: true
-  commands:
-  - pytest -v -s -m 'cpu_test' tool_use
+  - pytest -v -s tool_use
 
 ##### models test #####
 
@@ -974,8 +971,8 @@ steps:
   - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
   - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
 
-- label: Multi-Modal Accuracy Eval (Small Models) # 10min
-  timeout_in_minutes: 70
+- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min
+  timeout_in_minutes: 180
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
@@ -987,7 +984,8 @@ steps:
   commands:
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
 
-- label: Multi-Modal Models Test (Extended) 1
+- label: Multi-Modal Models Test (Extended) 1 # 60min
+  timeout_in_minutes: 120
   mirror_hardwares: [amdexperimental]
   agent_pool: mi325_1
   # grade: Blocking
@@ -1011,7 +1009,8 @@ steps:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
   - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
 
-- label: Multi-Modal Models Test (Extended) 3
+- label: Multi-Modal Models Test (Extended) 3 # 75min
+  timeout_in_minutes: 150
   mirror_hardwares: [amdexperimental]
   agent_pool: mi325_1
   # grade: Blocking
@@ -1120,7 +1119,6 @@ steps:
   - vllm/model_executor/layers/layernorm.py
   - vllm/model_executor/layers/activation.py
   - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - vllm/model_executor/layers/fused_moe/layer.py
   - tests/compile/test_fusion_attn.py
   - tests/compile/test_silu_mul_quant_fusion.py
   - tests/compile/distributed/test_fusion_all_reduce.py
@@ -1154,17 +1152,15 @@ steps:
   - vllm/model_executor/layers/activation.py
   - vllm/model_executor/layers/quantization/input_quant_fp8.py
   - tests/compile/distributed/test_fusions_e2e.py
-  - tests/compile/fullgraph/test_full_graph.py
   commands:
   - nvidia-smi
   # Run all e2e fusion tests
   - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
 
-- label: ROCm GPT-OSS Eval
+- label: Blackwell GPT-OSS Eval
   timeout_in_minutes: 60
   working_dir: "/vllm-workspace/"
-  agent_pool: mi325_1
-  mirror_hardwares: [amdexperimental, amdproduction]
+  gpu: b200
   optional: true # run on nightlies
   source_file_dependencies:
   - tests/evals/gpt_oss
@@ -1173,7 +1169,7 @@ steps:
   - vllm/v1/attention/backends/flashinfer.py
   commands:
   - uv pip install --system 'gpt-oss[eval]==0.0.5'
-  - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+  - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
 
 - label: Blackwell Quantized MoE Test
   timeout_in_minutes: 60
@@ -1444,12 +1440,13 @@ steps:
   - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
   - pytest -v -s -x lora/test_mixtral.py
 
+
 - label: LM Eval Large Models # optional
-  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_4
-  # grade: Blocking
   gpu: a100
   optional: true
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
   num_gpus: 4
   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
   source_file_dependencies:
@@ -1461,11 +1458,11 @@ steps:
 
 ##### H100 test #####
 - label: LM Eval Large Models (H100) # optional
-  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_4
-  # grade: Blocking
   gpu: h100
   optional: true
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
   num_gpus: 4
   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
   source_file_dependencies:
@@ -1475,6 +1472,7 @@ steps:
   - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
 
+
 ##### H200 test #####
 - label: Distributed Tests (H200) # optional
   mirror_hardwares: [amdexperimental]
@@ -1506,6 +1504,57 @@ steps:
   - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
   - pytest -v -s tests/v1/distributed/test_dbo.py
 
+##### E2E Eval Tests #####
+- label: LM Eval Small Models (1 Card) # 15min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+
+- label: LM Eval Large Models (4 Card)
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
+  gpu: a100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+
+- label: ROCm LM Eval Large Models (8 Card)
+  mirror_hardwares: [amdproduction]
+  agent_pool: mi325_8
+  num_gpus: 8
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8
+
+- label: ROCm GPT-OSS Eval
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  agent_pool: mi325_1
+  mirror_hardwares: [amdexperimental, amdproduction]
+  optional: true # run on nightlies
+  source_file_dependencies:
+  - tests/evals/gpt_oss
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+  - uv pip install --system 'gpt-oss[eval]==0.0.5'
+  - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
 ##### RL Integration Tests #####
 - label: Prime-RL Integration Test # 15min
   mirror_hardwares: [amdexperimental]
@@ -1520,7 +1569,6 @@ steps:
   - .buildkite/scripts/run-prime-rl-test.sh
   commands:
   - bash .buildkite/scripts/run-prime-rl-test.sh
-
 - label: DeepSeek V2-Lite Accuracy
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_4
@@ -1553,3 +1601,26 @@ steps:
   working_dir: "/vllm-workspace"
   commands:
   - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+
+- label: DeepSeek V2-Lite Async EPLB Accuracy
+  timeout_in_minutes: 60
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
+
+- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
+  timeout_in_minutes: 60
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
@@ -57,8 +57,8 @@ steps:
   - pytest -v -s -m 'not cpu_test' multimodal
   - pytest -v -s utils_
 
-- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
-  timeout_in_minutes: 20
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
+  timeout_in_minutes: 30
   source_file_dependencies:
   - vllm/
   - tests/test_inputs.py
@@ -66,6 +66,7 @@ steps:
   - tests/multimodal
   - tests/standalone_tests/lazy_imports.py
   - tests/tokenizers_
+  - tests/tool_parsers
   - tests/transformers_utils
   - tests/config
   no_gpu: true
@@ -75,6 +76,7 @@ steps:
   - pytest -v -s test_outputs.py
   - pytest -v -s -m 'cpu_test' multimodal
   - pytest -v -s tokenizers_
+  - pytest -v -s tool_parsers
   - pytest -v -s transformers_utils
   - pytest -v -s config
 
@@ -350,7 +352,8 @@ steps:
   timeout_in_minutes: 25
   gpu: h100
   source_file_dependencies:
-  - vllm/
+  - vllm/v1/attention
+  - vllm/model_executor/layers
   - tests/v1/determinism/
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -387,6 +390,7 @@ steps:
   working_dir: "/vllm-workspace/examples"
   source_file_dependencies:
   - vllm/entrypoints
+  - vllm/multimodal
   - examples/
   commands:
   - pip install tensorizer # for tensorizer test
@@ -466,7 +470,9 @@ steps:
   # tests covered elsewhere.
   # Use `find` to launch multiple instances of pytest so that
   # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
-  - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;"
+  # However, find does not normally propagate error codes, so we combine it with xargs
+  # (using -0 for proper path handling)
+  - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
 
 - label: PyTorch Fullgraph Smoke Test # 15min
   timeout_in_minutes: 30
@@ -480,7 +486,9 @@ steps:
   # as it is a heavy test that is covered in other steps.
   # Use `find` to launch multiple instances of pytest so that
   # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
-  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
+  # However, find does not normally propagate error codes, so we combine it with xargs
+  # (using -0 for proper path handling)
+  - "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
 
 - label: PyTorch Fullgraph Test # 27min
   timeout_in_minutes: 40
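The switch from `-exec` to a `find ... -print0 | xargs -0` pipeline in the two hunks above is about exit-code propagation. A minimal sketch of the difference, assuming GNU find and xargs (the failing test file is a stand-in):

    mkdir -p demo && printf 'import sys; sys.exit(1)\n' > demo/test_fail.py

    # find's exit status reflects find itself, so the failing command is swallowed:
    find demo -name 'test_*.py' -exec python3 {} \;
    echo $?   # 0, even though the "test" failed

    # xargs exits with 123 when any invocation fails, so the CI step fails as intended:
    find demo -name 'test_*.py' -print0 | xargs -0 -n1 python3
    echo $?   # 123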
@@ -666,16 +674,7 @@ steps:
   - vllm/
   - tests/tool_use
   commands:
-  - pytest -v -s -m 'not cpu_test' tool_use
+  - pytest -v -s tool_use
 
-- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
-  timeout_in_minutes: 10
-  source_file_dependencies:
-  - vllm/
-  - tests/tool_use
-  no_gpu: true
-  commands:
-  - pytest -v -s -m 'cpu_test' tool_use
-
 ##### models test #####
 
@@ -686,6 +685,7 @@ steps:
   source_file_dependencies:
   - vllm/
   - tests/models/test_initialization.py
+  - tests/models/registry.py
   commands:
   # Run a subset of model initialization tests
   - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
@@ -698,6 +698,7 @@ steps:
   - vllm/model_executor/models/
   - vllm/transformers_utils/
   - tests/models/test_initialization.py
+  - tests/models/registry.py
   commands:
   # Only when vLLM model source is modified - test initialization of a large
   # subset of supported models (the complement of the small subset in the above
@@ -830,7 +831,7 @@ steps:
   - tests/models/multimodal
   no_gpu: true
   commands:
-  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+  - "pip install git+https://github.com/TIGER-AI-Lab/Mantis.git || echo 'Mantis installation skipped (decord not available on CPU-only environment)'"
   - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
 
 - label: Multi-Modal Processor Test
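The quoted `pip install ... || echo ...` form above makes the Mantis install best-effort: in a shell, `a || b` runs `b` only when `a` fails, and the overall exit status becomes that of `b` (echo, which succeeds). A quick sketch of the pattern, with `false` standing in for an install that fails on CPU-only hosts:

    false || echo 'Mantis installation skipped'
    echo $?   # 0, so the remaining commands in the step still run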
@@ -1340,6 +1341,7 @@ steps:
 - label: Prime-RL Integration Test # 15min
   timeout_in_minutes: 30
   optional: true
+  soft_fail: true
   num_gpus: 2
   working_dir: "/vllm-workspace"
   source_file_dependencies:
@@ -1374,21 +1376,3 @@ steps:
   working_dir: "/vllm-workspace"
   commands:
   - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
-
-- label: DeepSeek V2-Lite Async EPLB Accuracy
-  timeout_in_minutes: 60
-  gpu: h100
-  optional: true
-  num_gpus: 4
-  working_dir: "/vllm-workspace"
-  commands:
-  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
-
-- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
-  timeout_in_minutes: 60
-  gpu: h100
-  optional: true
-  num_gpus: 4
-  working_dir: "/vllm-workspace"
-  commands:
-  - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
21
.buildkite/test_areas/attention.yaml
Normal file
@@ -0,0 +1,21 @@
+group: Attention
+depends_on:
+- image-build
+steps:
+- label: V1 attention (H100)
+  timeout_in_minutes: 30
+  gpu: h100
+  source_file_dependencies:
+  - vllm/v1/attention
+  - tests/v1/attention
+  commands:
+  - pytest -v -s v1/attention
+
+- label: V1 attention (B200)
+  timeout_in_minutes: 30
+  gpu: b200
+  source_file_dependencies:
+  - vllm/v1/attention
+  - tests/v1/attention
+  commands:
+  - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
16
.buildkite/test_areas/basic_correctness.yaml
Normal file
@@ -0,0 +1,16 @@
+group: Basic Correctness
+depends_on:
+- image-build
+steps:
+- label: Basic Correctness
+  timeout_in_minutes: 30
+  source_file_dependencies:
+  - vllm/
+  - tests/basic_correctness/test_basic_correctness
+  - tests/basic_correctness/test_cpu_offload
+  - tests/basic_correctness/test_cumem.py
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s basic_correctness/test_cumem.py
+  - pytest -v -s basic_correctness/test_basic_correctness.py
+  - pytest -v -s basic_correctness/test_cpu_offload.py
19
.buildkite/test_areas/benchmarks.yaml
Normal file
@@ -0,0 +1,19 @@
+group: Benchmarks
+depends_on:
+- image-build
+steps:
+- label: Benchmarks
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/.buildkite"
+  source_file_dependencies:
+  - benchmarks/
+  commands:
+  - bash scripts/run-benchmarks.sh
+
+- label: Benchmarks CLI Test
+  timeout_in_minutes: 20
+  source_file_dependencies:
+  - vllm/
+  - tests/benchmarks/
+  commands:
+  - pytest -v -s benchmarks/
57
.buildkite/test_areas/compile.yaml
Normal file
@@ -0,0 +1,57 @@
+group: Compile
+depends_on:
+- image-build
+steps:
+- label: Fusion and Compile Tests (B200)
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/worker/
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/test_fusion_attn.py
+  - tests/compile/test_silu_mul_quant_fusion.py
+  - tests/compile/distributed/test_fusion_all_reduce.py
+  - tests/compile/distributed/test_fusions_e2e.py
+  - tests/compile/fullgraph/test_full_graph.py
+  commands:
+  - nvidia-smi
+  - pytest -v -s tests/compile/test_fusion_attn.py
+  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+  # this runner has 2 GPUs available even though num_gpus=2 is not set
+  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+  # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+  # Wrap with quotes to escape yaml
+  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+  - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
+
+- label: Fusion E2E (2 GPUs)(B200)
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/distributed/test_fusions_e2e.py
+  commands:
+  - nvidia-smi
+  # Run all e2e fusion tests
+  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
+
22
.buildkite/test_areas/cuda.yaml
Normal file
@@ -0,0 +1,22 @@
+group: CUDA
+depends_on:
+- image-build
+steps:
+- label: Platform Tests (CUDA)
+  timeout_in_minutes: 15
+  source_file_dependencies:
+  - vllm/
+  - tests/cuda
+  commands:
+  - pytest -v -s cuda/test_cuda_context.py
+
+- label: Cudagraph
+  timeout_in_minutes: 20
+  source_file_dependencies:
+  - tests/v1/cudagraph
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/config/compilation.py
+  - vllm/compilation
+  commands:
+  - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
+  - pytest -v -s v1/cudagraph/test_cudagraph_mode.py
199
.buildkite/test_areas/distributed.yaml
Normal file
@@ -0,0 +1,199 @@
+group: Distributed
+depends_on:
+- image-build
+steps:
+- label: Distributed Comm Ops
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/distributed
+  - tests/distributed
+  commands:
+  - pytest -v -s distributed/test_comm_ops.py
+  - pytest -v -s distributed/test_shm_broadcast.py
+  - pytest -v -s distributed/test_shm_buffer.py
+  - pytest -v -s distributed/test_shm_storage.py
+
+- label: Distributed (2 GPUs)
+  timeout_in_minutes: 90
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/compilation/
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/worker/worker_base.py
+  - vllm/v1/engine/
+  - vllm/v1/worker/
+  - tests/compile/fullgraph/test_basic_correctness.py
+  - tests/compile/test_wrapper.py
+  - tests/distributed/
+  - tests/entrypoints/llm/test_collective_rpc.py
+  - tests/v1/distributed
+  - tests/v1/entrypoints/openai/test_multi_api_servers.py
+  - tests/v1/shutdown
+  - tests/v1/worker/test_worker_memory_snapshot.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
+  - pytest -v -s entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
+  - pytest -v -s ./compile/test_wrapper.py
+  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - pytest -v -s distributed/test_sequence_parallel.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+  - pytest -v -s v1/worker/test_worker_memory_snapshot.py
+
+- label: Distributed Tests (4 GPUs)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/
+  - tests/distributed/test_utils
+  - tests/distributed/test_pynccl
+  - tests/distributed/test_events
+  - tests/compile/fullgraph/test_basic_correctness.py
+  - examples/offline_inference/rlhf.py
+  - examples/offline_inference/rlhf_colocate.py
+  - tests/examples/offline_inference/data_parallel.py
+  - tests/v1/distributed
+  - tests/v1/engine/test_engine_core_client.py
+  - tests/distributed/test_symm_mem_allreduce.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and external_dp=2
+  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  # test with torchrun tp=2 and pp=2
+  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  # test with torchrun tp=4 and dp=1
+  - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=2, pp=2 and dp=1
+  - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=1 and dp=4 with ep
+  - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=2 and dp=2 with ep
+  - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with internal dp
+  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
+  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
+  - pytest -v -s distributed/test_utils.py
+  - pytest -v -s compile/fullgraph/test_basic_correctness.py
+  - pytest -v -s distributed/test_pynccl.py
+  - pytest -v -s distributed/test_events.py
+  - pytest -v -s distributed/test_symm_mem_allreduce.py
+  # TODO: create a dedicated test section for multi-GPU example tests
+  # when we have multiple distributed example tests
+  - cd ../examples/offline_inference
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+
+- label: Distributed Tests (8 GPUs)(H100)
+  timeout_in_minutes: 10
+  gpu: h100
+  num_gpus: 8
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - examples/offline_inference/torchrun_dp_example.py
+  - vllm/config/parallel.py
+  - vllm/distributed/
+  - vllm/v1/engine/llm_engine.py
+  - vllm/v1/executor/uniproc_executor.py
+  - vllm/v1/worker/gpu_worker.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and dp=4 with ep
+  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+
+- label: Distributed Tests (4 GPUs)(A100)
+  gpu: a100
+  optional: true
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/
+  commands:
+  # NOTE: don't test llama model here, it seems hf implementation is buggy
+  # see https://github.com/vllm-project/vllm/pull/5689 for details
+  - pytest -v -s distributed/test_custom_all_reduce.py
+  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
+  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+  - pytest -v -s -x lora/test_mixtral.py
+
+- label: Distributed Tests (2 GPUs)(H200)
+  gpu: h200
+  optional: true
+  working_dir: "/vllm-workspace/"
+  num_gpus: 2
+  commands:
+  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
+  - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
+  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
+  - pytest -v -s tests/distributed/test_context_parallel.py
+  - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+  - pytest -v -s tests/v1/distributed/test_dbo.py
+
+- label: Distributed Tests (2 GPUs)(B200)
+  gpu: b200
+  optional: true
+  working_dir: "/vllm-workspace/"
+  num_gpus: 2
+  commands:
+  - pytest -v -s tests/distributed/test_context_parallel.py
+  - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+  - pytest -v -s tests/v1/distributed/test_dbo.py
+
+- label: 2 Node Test (4 GPUs)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  num_nodes: 2
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/model_executor/models/
+  - tests/distributed/
+  - tests/examples/offline_inference/data_parallel.py
+  commands:
+  - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
+
+- label: Distributed NixlConnector PD accuracy (4 GPUs)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+  - tests/v1/kv_connector/nixl_integration/
+  commands:
+  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+  - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
+
+- label: Pipeline + Context Parallelism (4 GPUs))
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/model_executor/models/
+  - tests/distributed/
+  commands:
+  - pytest -v -s distributed/test_pp_cudagraph.py
+  - pytest -v -s distributed/test_pipeline_parallel.py
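Two conventions in the commands above are worth spelling out. First, piping a torchrun invocation into grep turns a log message into an assertion: if 'Same node test passed' is never printed, grep exits non-zero and the step fails. Second, run-multi-node-test.sh appears to take positional arguments of the form (working dir, node count, GPUs per node, image, command for node 0, command for node 1); reformatted for readability, the 2 Node Test invocation is roughly the sketch below, with the quoted command strings abbreviated as placeholders:

    # args: <working dir> <num nodes> <GPUs per node> <image> <node-0 cmd> <node-1 cmd>
    ./.buildkite/scripts/run-multi-node-test.sh \
        /vllm-workspace/tests 2 2 \
        public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 \
        "<node 0 commands>" \
        "<node 1 commands>"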
59
.buildkite/test_areas/e2e_integration.yaml
Normal file
@@ -0,0 +1,59 @@
+group: E2E Integration
+depends_on:
+- image-build
+steps:
+- label: DeepSeek V2-Lite Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+  timeout_in_minutes: 60
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+
+- label: Prime-RL Integration (2 GPUs)
+  timeout_in_minutes: 30
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  source_file_dependencies:
+  - vllm/
+  - .buildkite/scripts/run-prime-rl-test.sh
+  commands:
+  - bash .buildkite/scripts/run-prime-rl-test.sh
+
+- label: DeepSeek V2-Lite Async EPLB Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
+
+- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
26
.buildkite/test_areas/engine.yaml
Normal file
@@ -0,0 +1,26 @@
+group: Engine
+depends_on:
+- image-build
+steps:
+- label: Engine
+  timeout_in_minutes: 15
+  source_file_dependencies:
+  - vllm/
+  - tests/engine
+  - tests/test_sequence
+  - tests/test_config
+  - tests/test_logger
+  - tests/test_vllm_port
+  commands:
+  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
+
+- label: V1 e2e + engine
+  timeout_in_minutes: 45
+  source_file_dependencies:
+  - vllm/
+  - tests/v1
+  commands:
+  # TODO: accuracy does not match, whether setting
+  # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
+  - pytest -v -s v1/e2e
+  - pytest -v -s v1/engine
68
.buildkite/test_areas/entrypoints.yaml
Normal file
@@ -0,0 +1,68 @@
+group: Entrypoints
+depends_on:
+- image-build
+steps:
+- label: Entrypoints Unit Tests
+  timeout_in_minutes: 10
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/entrypoints
+  - tests/entrypoints/
+  commands:
+  - pytest -v -s entrypoints/openai/tool_parsers
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
+
+- label: Entrypoints Integration (LLM)
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/llm
+  - tests/entrypoints/offline_mode
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
+  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+
+- label: Entrypoints Integration (API Server)
+  timeout_in_minutes: 130
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/openai
+  - tests/entrypoints/test_chat_utils
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
+  - pytest -v -s entrypoints/test_chat_utils.py
+
+
+- label: Entrypoints Integration (Pooling)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/pooling
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/pooling
+
+
+- label: Entrypoints V1
+  timeout_in_minutes: 50
+  source_file_dependencies:
+  - vllm/
+  - tests/v1
+  commands:
+  - pytest -v -s v1/entrypoints
+
+- label: OpenAI API Correctness
+  timeout_in_minutes: 30
+  source_file_dependencies:
+  - csrc/
+  - vllm/entrypoints/openai/
+  - vllm/model_executor/models/whisper.py
+  commands: # LMEval+Transcription WER check
+  - pytest -s entrypoints/openai/correctness/
23
.buildkite/test_areas/expert_parallelism.yaml
Normal file
@@ -0,0 +1,23 @@
+group: Expert Parallelism
+depends_on:
+- image-build
+steps:
+- label: EPLB Algorithm
+  timeout_in_minutes: 15
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/distributed/eplb
+  - tests/distributed/test_eplb_algo.py
+  commands:
+  - pytest -v -s distributed/test_eplb_algo.py
+
+- label: EPLB Execution
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/eplb
+  - tests/distributed/test_eplb_execute.py
+  commands:
+  - pytest -v -s distributed/test_eplb_execute.py
+  - pytest -v -s distributed/test_eplb_spec_decode.py
117
.buildkite/test_areas/kernels.yaml
Normal file
@@ -0,0 +1,117 @@
+group: Kernels
+depends_on:
+- image-build
+steps:
+- label: Kernels Core Operation Test
+  timeout_in_minutes: 75
+  source_file_dependencies:
+  - csrc/
+  - tests/kernels/core
+  - tests/kernels/test_top_k_per_row.py
+  commands:
+  - pytest -v -s kernels/core kernels/test_top_k_per_row.py
+
+- label: Kernels Attention Test %N
+  timeout_in_minutes: 35
+  source_file_dependencies:
+  - csrc/attention/
+  - vllm/attention
+  - vllm/v1/attention
+  - tests/kernels/attention
+  commands:
+  - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
+
+- label: Kernels Quantization Test %N
+  timeout_in_minutes: 90
+  source_file_dependencies:
+  - csrc/quantization/
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization
+  commands:
+  - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
+
+- label: Kernels MoE Test %N
+  timeout_in_minutes: 60
+  source_file_dependencies:
+  - csrc/quantization/cutlass_w8a8/moe/
+  - csrc/moe/
+  - tests/kernels/moe
+  - vllm/model_executor/layers/fused_moe/
+  - vllm/distributed/device_communicators/
+  - vllm/envs.py
+  - vllm/config
+  commands:
+  - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
+
+- label: Kernels Mamba Test
+  timeout_in_minutes: 45
+  source_file_dependencies:
+  - csrc/mamba/
+  - tests/kernels/mamba
+  - vllm/model_executor/layers/mamba/ops
+  commands:
+  - pytest -v -s kernels/mamba
+
+- label: Kernels DeepGEMM Test (H100)
+  timeout_in_minutes: 45
+  gpu: h100
+  num_gpus: 1
+  source_file_dependencies:
+  - tools/install_deepgemm.sh
+  - vllm/utils/deep_gemm.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization/test_block_fp8.py
+  - tests/kernels/moe/test_deepgemm.py
+  - tests/kernels/moe/test_batched_deepgemm.py
+  - tests/kernels/attention/test_deepgemm_attention.py
+  commands:
+  - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
+  - pytest -v -s kernels/moe/test_deepgemm.py
+  - pytest -v -s kernels/moe/test_batched_deepgemm.py
+  - pytest -v -s kernels/attention/test_deepgemm_attention.py
+
+- label: Kernels (B200)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  # optional: true
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - csrc/attention/mla/
+  - csrc/quantization/cutlass_w8a8/moe/
+  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/attention/backends/mla/cutlass_mla.py
+  - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/platforms/cuda.py
+  - vllm/attention/selector.py
+  commands:
+  - nvidia-smi
+  - python3 examples/offline_inference/basic/chat.py
+  # Attention
+  # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+  - pytest -v -s tests/kernels/attention/test_attention_selector.py
+  - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
+  - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
+  - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
+  - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
+  # Quantization
+  - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
+  - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
+  - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
+  - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
+  - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
+  - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+  - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
+  - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
+  - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
+  - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
+  - pytest -v -s tests/kernels/moe/test_flashinfer.py
+  - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
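The sharded kernel steps above rely on two mechanisms. The doubled dollar sign (`$$BUILDKITE_PARALLEL_JOB`) escapes Buildkite's own interpolation so that a literal `$BUILDKITE_PARALLEL_JOB` reaches the shell, where the agent sets it at runtime; with `parallelism: 2` the step runs twice with job indices 0 and 1. The `--shard-id`/`--num-shards` flags come from the pytest-shard plugin, which deterministically partitions the collected tests. A minimal sketch of reproducing one shard locally, assuming pytest-shard is installed:

    # run the first of two shards of the attention kernel tests
    BUILDKITE_PARALLEL_JOB=0 BUILDKITE_PARALLEL_JOB_COUNT=2 \
    pytest -v -s kernels/attention \
      --shard-id=$BUILDKITE_PARALLEL_JOB --num-shards=$BUILDKITE_PARALLEL_JOB_COUNT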
46
.buildkite/test_areas/lm_eval.yaml
Normal file
@@ -0,0 +1,46 @@
+group: LM Eval
+depends_on:
+- image-build
+steps:
+- label: LM Eval Small Models
+  timeout_in_minutes: 75
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  autorun_on_main: true
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+
+- label: LM Eval Large Models (4 GPUs)(A100)
+  gpu: a100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+
+- label: LM Eval Large Models (4 GPUs)(H100)
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
+- label: LM Eval Small Models (B200)
+  timeout_in_minutes: 120
+  gpu: b200
+  optional: true
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
31
.buildkite/test_areas/lora.yaml
Normal file
@@ -0,0 +1,31 @@
+group: LoRA
+depends_on:
+- image-build
+steps:
+- label: LoRA %N
+  timeout_in_minutes: 30
+  source_file_dependencies:
+  - vllm/lora
+  - tests/lora
+  commands:
+  - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
+  parallelism: 4
+
+
+- label: LoRA TP (Distributed)
+  timeout_in_minutes: 30
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/lora
+  - tests/lora
+  commands:
+  # FIXIT: find out which code initialize cuda before running the test
+  # before the fix, we need to use spawn to test it
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  # There is some Tensor Parallelism related processing logic in LoRA that
+  # requires multi-GPU testing for validation.
+  - pytest -v -s -x lora/test_chatglm3_tp.py
+  - pytest -v -s -x lora/test_llama_tp.py
+  - pytest -v -s -x lora/test_llm_with_multi_loras.py
+  - pytest -v -s -x lora/test_olmoe_tp.py
+  - pytest -v -s -x lora/test_gptoss_tp.py
165
.buildkite/test_areas/misc.yaml
Normal file
@@ -0,0 +1,165 @@
+group: Miscellaneous
+depends_on:
+- image-build
+steps:
+- label: V1 Others
+  timeout_in_minutes: 60
+  source_file_dependencies:
+  - vllm/
+  - tests/v1
+  commands:
+  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+  # split the test to avoid interference
+  - pytest -v -s -m 'not cpu_test' v1/core
+  - pytest -v -s v1/executor
+  - pytest -v -s v1/kv_offload
+  - pytest -v -s v1/sample
+  - pytest -v -s v1/logits_processors
+  - pytest -v -s v1/worker
+  - pytest -v -s v1/spec_decode
+  - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
+  - pytest -v -s -m 'not cpu_test' v1/metrics
+  - pytest -v -s v1/test_oracle.py
+  - pytest -v -s v1/test_request.py
+  - pytest -v -s v1/test_outputs.py
+  # Integration test for streaming correctness (requires special branch).
+  - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
+  - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
+
+- label: V1 Others (CPU)
+  depends_on: ~
+  source_file_dependencies:
+  - vllm/
+  - tests/v1
+  no_gpu: true
+  commands:
+  # split the test to avoid interference
+  - pytest -v -s -m 'cpu_test' v1/core
+  - pytest -v -s v1/structured_output
+  - pytest -v -s v1/test_serial_utils.py
+  - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
+  - pytest -v -s -m 'cpu_test' v1/metrics
+
+- label: Regression
+  timeout_in_minutes: 20
+  source_file_dependencies:
+  - vllm/
+  - tests/test_regression
+  commands:
+  - pip install modelscope
+  - pytest -v -s test_regression.py
+  working_dir: "/vllm-workspace/tests" # optional
+
+- label: Examples
+  timeout_in_minutes: 45
+  working_dir: "/vllm-workspace/examples"
+  source_file_dependencies:
+  - vllm/entrypoints
+  - vllm/multimodal
+  - examples/
+  commands:
+  - pip install tensorizer # for tensorizer test
+  - python3 offline_inference/basic/chat.py # for basic
+  - python3 offline_inference/basic/generate.py --model facebook/opt-125m
+  - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+  - python3 offline_inference/basic/classify.py
+  - python3 offline_inference/basic/embed.py
+  - python3 offline_inference/basic/score.py
+  # for multi-modal models
+  - python3 offline_inference/audio_language.py --seed 0
+  - python3 offline_inference/vision_language.py --seed 0
+  - python3 offline_inference/vision_language_multi_image.py --seed 0
+  - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+  # for pooling models
+  - python3 pooling/pooling/vision_language_pooling.py --seed 0
+  # for features demo
+  - python3 offline_inference/prefix_caching.py
+  - python3 offline_inference/llm_engine_example.py
+  - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+  - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+  # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+  - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
+
+- label: Metrics, Tracing (2 GPUs)
+  timeout_in_minutes: 20
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/
+  - tests/v1/tracing
+  commands:
+  - "pip install \
+    'opentelemetry-sdk>=1.26.0' \
+    'opentelemetry-api>=1.26.0' \
+    'opentelemetry-exporter-otlp>=1.26.0' \
+    'opentelemetry-semantic-conventions-ai>=0.4.1'"
+  - pytest -v -s v1/tracing
+
+- label: Python-only Installation
+  depends_on: ~
+  timeout_in_minutes: 20
+  source_file_dependencies:
+  - tests/standalone_tests/python_only_compile.sh
+  - setup.py
+  commands:
+  - bash standalone_tests/python_only_compile.sh
+
+- label: Async Engine, Inputs, Utils, Worker
+  timeout_in_minutes: 50
+  source_file_dependencies:
+  - vllm/
+  - tests/multimodal
+  - tests/utils_
+  commands:
+  - pytest -v -s -m 'not cpu_test' multimodal
+  - pytest -v -s utils_
+
+- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
+  depends_on: ~
+  timeout_in_minutes: 30
+  source_file_dependencies:
+  - vllm/
+  - tests/test_inputs.py
+  - tests/test_outputs.py
+  - tests/multimodal
+  - tests/standalone_tests/lazy_imports.py
+  - tests/tokenizers_
+  - tests/tool_parsers
+  - tests/transformers_utils
+  - tests/config
+  no_gpu: true
+  commands:
+  - python3 standalone_tests/lazy_imports.py
+  - pytest -v -s test_inputs.py
+  - pytest -v -s test_outputs.py
+  - pytest -v -s -m 'cpu_test' multimodal
+  - pytest -v -s tokenizers_
+  - pytest -v -s tool_parsers
+  - pytest -v -s transformers_utils
+  - pytest -v -s config
+
+- label: GPT-OSS Eval (B200)
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true
+  source_file_dependencies:
+  - tests/evals/gpt_oss
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+  - uv pip install --system 'gpt-oss[eval]==0.0.5'
+  - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
+- label: Batch Invariance (H100)
+  timeout_in_minutes: 25
+  gpu: h100
+  source_file_dependencies:
+  - vllm/v1/attention
+  - vllm/model_executor/layers
+  - tests/v1/determinism/
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pip install pytest-timeout pytest-forked
+  - pytest -v -s v1/determinism/test_batch_invariance.py
+  - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
17
.buildkite/test_areas/model_executor.yaml
Normal file
@@ -0,0 +1,17 @@
group: Model Executor
depends_on:
- image-build
steps:
- label: Model Executor
  timeout_in_minutes: 35
  source_file_dependencies:
  - vllm/engine/arg_utils.py
  - vllm/config/model.py
  - vllm/model_executor
  - tests/model_executor
  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
  commands:
  - apt-get update && apt-get install -y curl libsodium23
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s model_executor
  - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
62
.buildkite/test_areas/models_basic.yaml
Normal file
@@ -0,0 +1,62 @@
group: Models - Basic
depends_on:
- image-build
steps:
- label: Basic Models Tests (Initialization)
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/test_initialization.py
  commands:
  # Run a subset of model initialization tests
  - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset

- label: Basic Models Tests (Extra Initialization) %N
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/model_executor/models/
  - tests/models/test_initialization.py
  commands:
  # Only when vLLM model source is modified - test initialization of a large
  # subset of supported models (the complement of the small subset in the above
  # test.) Also run if model initialization test file is modified
  - pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
  parallelism: 2

- label: Basic Models Tests (Other)
  timeout_in_minutes: 45
  source_file_dependencies:
  - vllm/
  - tests/models/test_transformers.py
  - tests/models/test_registry.py
  commands:
  - pytest -v -s models/test_transformers.py models/test_registry.py

- label: Basic Models Test (Other CPU) # 5min
  timeout_in_minutes: 10
  source_file_dependencies:
  - vllm/
  - tests/models/test_utils.py
  - tests/models/test_vision.py
  no_gpu: true
  commands:
  - pytest -v -s models/test_utils.py models/test_vision.py

- label: Transformers Nightly Models
  working_dir: "/vllm-workspace/"
  optional: true
  soft_fail: true
  commands:
  - pip install --upgrade git+https://github.com/huggingface/transformers
  - pytest -v -s tests/models/test_initialization.py
  - pytest -v -s tests/models/test_transformers.py
  - pytest -v -s tests/models/multimodal/processing/
  - pytest -v -s tests/models/multimodal/test_mapping.py
  - python3 examples/offline_inference/basic/chat.py
  - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
  # Whisper needs spawn method to avoid deadlock
  - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
22
.buildkite/test_areas/models_distributed.yaml
Normal file
@@ -0,0 +1,22 @@
group: Models - Distributed
depends_on:
- image-build
steps:
- label: Distributed Model Tests (2 GPUs)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/model_executor/model_loader/sharded_state_loader.py
  - vllm/model_executor/models/
  - tests/basic_correctness/
  - tests/model_executor/model_loader/test_sharded_state_loader.py
  - tests/models/
  commands:
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
  # Avoid importing model tests that cause CUDA reinitialization error
  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
  - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
91
.buildkite/test_areas/models_language.yaml
Normal file
@@ -0,0 +1,91 @@
group: Models - Language
depends_on:
- image-build
steps:
- label: Language Models Tests (Standard)
  timeout_in_minutes: 25
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/language
  commands:
  # Test standard language models, excluding a subset of slow tests
  - pip freeze | grep -E 'torch'
  - pytest -v -s models/language -m 'core_model and (not slow_test)'

- label: Language Models Tests (Extra Standard) %N
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/model_executor/models/
  - tests/models/language/pooling/test_embedding.py
  - tests/models/language/generation/test_common.py
  - tests/models/language/pooling/test_classification.py
  commands:
  # Shard slow subset of standard language models tests. Only run when model
  # source is modified, or when specified test files are modified
  - pip freeze | grep -E 'torch'
  - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
  parallelism: 2

- label: Language Models Tests (Hybrid) %N
  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation
  commands:
  # Install fast path packages for testing against transformers
  # Note: also needed to run plamo2 model in vLLM
  - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
  - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
  # Shard hybrid language model tests
  - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
  parallelism: 2

- label: Language Models Test (Extended Generation) # 80min
  timeout_in_minutes: 110
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation
  commands:
  # Install fast path packages for testing against transformers
  # Note: also needed to run plamo2 model in vLLM
  - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
  - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
  - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'

- label: Language Models Test (PPL)
  timeout_in_minutes: 110
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation_ppl_test
  commands:
  - pytest -v -s models/language/generation_ppl_test

- label: Language Models Test (Extended Pooling) # 36min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/pooling
  commands:
  - pytest -v -s models/language/pooling -m 'not core_model'

- label: Language Models Test (MTEB)
  timeout_in_minutes: 110
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/pooling_mteb_test
  commands:
  - pytest -v -s models/language/pooling_mteb_test
79
.buildkite/test_areas/models_multimodal.yaml
Normal file
@@ -0,0 +1,79 @@
group: Models - Multimodal
depends_on:
- image-build
steps:
- label: Multi-Modal Models (Standard) # 60min
  timeout_in_minutes: 80
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pip freeze | grep -E 'torch'
  - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
  - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work

- label: Multi-Modal Processor Test (CPU)
  timeout_in_minutes: 60
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  no_gpu: true
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py

- label: Multi-Modal Processor # 44min
  timeout_in_minutes: 60
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/processing/test_tensor_schema.py

- label: Multi-Modal Accuracy Eval (Small Models) # 50min
  timeout_in_minutes: 70
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - vllm/multimodal/
  - vllm/inputs/
  - vllm/v1/core/
  commands:
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1

- label: Multi-Modal Models (Extended) 1
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing

- label: Multi-Modal Models (Extended) 2
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'

- label: Multi-Modal Models (Extended) 3
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'

# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models
  optional: true
  commands:
  - echo 'Testing custom models...'
  # PR authors can temporarily add commands below to test individual models
  # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
  # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
34
.buildkite/test_areas/plugins.yaml
Normal file
@@ -0,0 +1,34 @@
group: Plugins
depends_on:
- image-build
steps:
- label: Plugin Tests (2 GPUs)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/plugins/
  - tests/plugins/
  commands:
  # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
  - pip install -e ./plugins/vllm_add_dummy_platform
  - pytest -v -s plugins_tests/test_platform_plugins.py
  - pip uninstall vllm_add_dummy_platform -y
  # end platform plugin tests
  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
  - pip install -e ./plugins/prithvi_io_processor_plugin
  - pytest -v -s plugins_tests/test_io_processor_plugins.py
  - pip uninstall prithvi_io_processor_plugin -y
  # end io_processor plugins test
  # begin stat_logger plugins test
  - pip install -e ./plugins/vllm_add_dummy_stat_logger
  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
  - pip uninstall dummy_stat_logger -y
  # end stat_logger plugins test
  # other tests continue here:
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s distributed/test_distributed_oot.py
  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
  - pytest -v -s models/test_oot_registration.py # it needs a clean process
  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
50
.buildkite/test_areas/pytorch.yaml
Normal file
@@ -0,0 +1,50 @@
group: PyTorch
depends_on:
- image-build
steps:
- label: PyTorch Compilation Unit Tests
  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  # Run unit tests defined directly under compile/,
  # not including subdirectories, which are usually heavier
  # tests covered elsewhere.
  # Use `find` to launch multiple instances of pytest so that
  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
  - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;"

- label: PyTorch Fullgraph Smoke Test
  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  # Run smoke tests under the fullgraph directory, except test_full_graph.py,
  # as it is a heavy test that is covered in other steps.
  # Use `find` to launch multiple instances of pytest so that
  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"

- label: PyTorch Fullgraph
  timeout_in_minutes: 40
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  # fp8 kv scales not supported on sm89, tested on Blackwell instead
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
  # Limit to no custom ops to reduce running time
  # Wrap with quotes to escape yaml and avoid starting the -k string with a -
  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"

- label: Pytorch Nightly Dependency Override Check # 2min
  # if this test fails, it means the nightly torch version is not compatible with some
  # of the dependencies. Please check the error message and add the package to the whitelist
  # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
  soft_fail: true
  source_file_dependencies:
  - requirements/nightly_torch_test.txt
  commands:
  - bash standalone_tests/pytorch_nightly_dependency.sh
46
.buildkite/test_areas/quantization.yaml
Normal file
@@ -0,0 +1,46 @@
group: Quantization
depends_on:
- image-build
steps:
- label: Quantization
  timeout_in_minutes: 90
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - tests/quantization
  commands:
  # temporary install here since we need nightly, will move to requirements/test.in
  # after torchao 0.12 release, and pin a working version of torchao nightly here
  # since torchao nightly is only compatible with torch nightly currently
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
  # TODO(jerryzh168): resolve the above comment
  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
  - uv pip install --system conch-triton-kernels
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

- label: Quantized MoE Test (B200)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  gpu: b200
  source_file_dependencies:
  - tests/quantization/test_blackwell_moe.py
  - vllm/model_executor/models/deepseek_v2.py
  - vllm/model_executor/models/gpt_oss.py
  - vllm/model_executor/models/llama4.py
  - vllm/model_executor/layers/fused_moe
  - vllm/model_executor/layers/quantization/compressed_tensors
  - vllm/model_executor/layers/quantization/modelopt.py
  - vllm/model_executor/layers/quantization/mxfp4.py
  - vllm/v1/attention/backends/flashinfer.py
  commands:
  - pytest -s -v tests/quantization/test_blackwell_moe.py

- label: Quantized Models Test
  timeout_in_minutes: 60
  source_file_dependencies:
  - vllm/model_executor/layers/quantization
  - tests/models/quantization
  commands:
  - pytest -v -s models/quantization
14
.buildkite/test_areas/samplers.yaml
Normal file
@@ -0,0 +1,14 @@
group: Samplers
depends_on:
- image-build
steps:
- label: Samplers Test
  timeout_in_minutes: 75
  source_file_dependencies:
  - vllm/model_executor/layers
  - vllm/sampling_metadata.py
  - tests/samplers
  - tests/conftest.py
  commands:
  - pytest -v -s samplers
  - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
13
.buildkite/test_areas/tool_use.yaml
Normal file
@@ -0,0 +1,13 @@
group: Tool use
depends_on:
- image-build
steps:
- label: OpenAI-Compatible Tool Use
  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental]
  fast_check: false
  source_file_dependencies:
  - vllm/
  - tests/tool_use
  commands:
  - pytest -v -s tool_use
25
.buildkite/test_areas/weight_loading.yaml
Normal file
@@ -0,0 +1,25 @@
group: Weight Loading
depends_on:
- image-build
steps:
- label: Weight Loading Multiple GPU # 33min
  timeout_in_minutes: 45
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt

- label: Weight Loading Multiple GPU - Large Models # optional
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  gpu: a100
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
48
.github/mergify.yml
vendored
@@ -14,6 +14,52 @@ pull_request_rules:
       comment:
         message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"

+  - name: comment-pre-commit-failure
+    description: Comment on PR when pre-commit check fails
+    conditions:
+      - status-failure=pre-commit
+      - -closed
+      - -draft
+    actions:
+      comment:
+        message: |
+          Hi @{{author}}, the pre-commit checks have failed. Please run:
+
+          ```bash
+          uv pip install pre-commit
+          pre-commit install
+          pre-commit run --all-files
+          ```
+
+          Then, commit the changes and push to your branch.
+
+          For future commits, `pre-commit` will run automatically on changed files before each commit.
+
+          > [!TIP]
+          > <details>
+          > <summary>Is <code>mypy</code> or <code>markdownlint</code> failing?</summary>
+          > <br/>
+          > <code>mypy</code> and <code>markdownlint</code> are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
+          >
+          > ```bash
+          > # For mypy (substitute "3.10" with the failing version if needed)
+          > pre-commit run --hook-stage manual mypy-3.10
+          > # For markdownlint
+          > pre-commit run --hook-stage manual markdownlint
+          > ```
+          > </details>
+
+  - name: comment-dco-failure
+    description: Comment on PR when DCO check fails
+    conditions:
+      - status-failure=dco
+      - -closed
+      - -draft
+    actions:
+      comment:
+        message: |
+          Hi @{{author}}, the DCO check has failed. Please click on DCO in the Checks section for instructions on how to resolve this.
+
   - name: label-ci-build
     description: Automatically apply ci/build label
     conditions:
@@ -140,7 +186,7 @@ pull_request_rules:
       - files~=^tests/entrypoints/test_context.py
       - files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
       - files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
-      - files~=^vllm/entrypoints/harmony_utils.py
+      - files~=^vllm/entrypoints/openai/parser/harmony_utils.py
       - files~=^vllm/entrypoints/tool_server.py
       - files~=^vllm/entrypoints/tool.py
       - files~=^vllm/entrypoints/context.py
2
.github/workflows/cleanup_pr_body.yml
vendored
@@ -13,7 +13,7 @@ jobs:

     steps:
       - name: Checkout repository
-        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1

       - name: Set up Python
         uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
2
.github/workflows/macos-smoke-test.yml
vendored
@@ -12,7 +12,7 @@ jobs:
     timeout-minutes: 30

     steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v6.0.1

       - uses: astral-sh/setup-uv@v7
         with:
2
.github/workflows/pre-commit.yml
vendored
@@ -16,7 +16,7 @@ jobs:
   pre-commit:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
       - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
         with:
           python-version: "3.12"
4
.github/workflows/stale.yml
vendored
@@ -7,13 +7,15 @@ on:

 jobs:
   close-issues-and-pull-requests:
+    # Prevents triggering on forks or other repos
+    if: github.repository == 'vllm-project/vllm'
     permissions:
       issues: write
       pull-requests: write
       actions: write
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
+      - uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
         with:
           # Increasing this value ensures that changes to this workflow
           # propagate to all issues and PRs in days rather than months
@@ -384,7 +384,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
       OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
     execute_process(
       COMMAND ${CMAKE_COMMAND} -E env
-      PYTHONPATH=$PYTHONPATH
+      PYTHONPATH=$ENV{PYTHONPATH}
       ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
       RESULT_VARIABLE marlin_generation_result
       OUTPUT_VARIABLE marlin_generation_result
@@ -822,7 +822,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
       OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
     execute_process(
       COMMAND ${CMAKE_COMMAND} -E env
-      PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
+      PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$ENV{PYTHONPATH}
       ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
       RESULT_VARIABLE machete_generation_result
       OUTPUT_VARIABLE machete_generation_output
@@ -874,7 +874,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
     set(SRCS
-      "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")
+      "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu"
+      "csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu"
+      "csrc/quantization/cutlass_w4a8/w4a8_utils.cu"
+    )

     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
@@ -944,7 +947,6 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
 set(VLLM_MOE_EXT_SRC
   "csrc/moe/torch_bindings.cpp"
   "csrc/moe/moe_align_sum_kernels.cu"
-  "csrc/moe/moe_lora_align_sum_kernels.cu"
   "csrc/moe/topk_softmax_kernels.cu")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
@@ -1002,7 +1004,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
       OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
     execute_process(
       COMMAND ${CMAKE_COMMAND} -E env
-      PYTHONPATH=$PYTHONPATH
+      PYTHONPATH=$ENV{PYTHONPATH}
       ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
       RESULT_VARIABLE moe_marlin_generation_result
       OUTPUT_VARIABLE moe_marlin_generation_output
@@ -143,11 +143,13 @@ Compute Resources:
 - Databricks
 - DeepInfra
 - Google Cloud
+- IBM
 - Intel
 - Lambda Lab
 - Nebius
 - Novita AI
 - NVIDIA
+- Red Hat
 - Replicate
 - Roblox
 - RunPod
@@ -18,6 +18,11 @@ MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
 MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
 NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
 NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
+HOSTNAME=$(hostname)
+if [[ -z "$HOSTNAME" ]]; then
+    echo "Error: Failed to determine hostname." >&2
+    exit 1
+fi

 LOG_FOLDER="$BASE/auto-benchmark/$TAG"
 RESULT="$LOG_FOLDER/result.txt"
@@ -82,6 +87,7 @@ start_server() {
         "$MODEL"
         "--disable-log-requests"
         "--port" "8004"
+        "--host" "$HOSTNAME"
         "--gpu-memory-utilization" "$gpu_memory_utilization"
         "--max-num-seqs" "$max_num_seqs"
         "--max-num-batched-tokens" "$max_num_batched_tokens"
@@ -96,8 +102,9 @@ start_server() {
     # This correctly passes each element as a separate argument.
     if [[ -n "$profile_dir" ]]; then
         # Start server with profiling enabled
-        VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
-        vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
+        local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}"
+        VLLM_SERVER_DEV_MODE=1 \
+        vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 &
     else
         # Start server without profiling
         VLLM_SERVER_DEV_MODE=1 \
@@ -112,7 +119,7 @@ start_server() {
         # since that we should always have permission to send signal to the server process.
         kill -0 $server_pid 2> /dev/null || break

-        RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
+        RESPONSE=$(curl -s -X GET "http://${HOSTNAME}:8004/health" -w "%{http_code}" -o /dev/stdout)
         STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
         if [[ "$STATUS_CODE" -eq 200 ]]; then
             server_started=1
@@ -172,6 +179,7 @@ run_benchmark() {
         --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
         --num-prompts 1000 \
         --random-prefix-len $prefix_len \
+        --host "$HOSTNAME" \
         --port 8004 &> "$bm_log"
     throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
     e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
@@ -187,7 +195,7 @@ run_benchmark() {
     request_rate=$((${throughput%.*} + 1))
     while ((request_rate > 0)); do
         # clear prefix cache
-        curl -X POST http://0.0.0.0:8004/reset_prefix_cache
+        curl -X POST http://${HOSTNAME}:8004/reset_prefix_cache
         sleep 5
         bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
         vllm bench serve \
@@ -203,6 +211,7 @@ run_benchmark() {
         --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
         --num-prompts 100 \
         --random-prefix-len $prefix_len \
+        --host "$HOSTNAME" \
         --port 8004 &> "$bm_log"
     throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
     e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
@@ -303,6 +312,7 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
         --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
         --num-prompts 100 \
         --random-prefix-len $prefix_len \
+        --host "$HOSTNAME" \
         --port 8004 \
         --profile &> "$bm_log"
 else
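For reference, the profiling change above replaces the `VLLM_TORCH_PROFILER_DIR` environment variable with a JSON blob passed through `--profiler-config`. If that JSON is generated programmatically rather than hand-escaped as in the script, a sketch like the following avoids quoting mistakes (the directory value here is a hypothetical placeholder):

```python
import json
import shlex

# Hypothetical output directory; the script derives this from its arguments.
profile_dir = "/tmp/vllm_profile"
profile_config = {"profiler": "torch", "torch_profiler_dir": profile_dir}

# shlex.quote yields a shell-safe argument equivalent to the script's
# hand-escaped JSON string above.
arg = shlex.quote(json.dumps(profile_config))
print(f"VLLM_SERVER_DEV_MODE=1 vllm serve --profiler-config {arg} ...")
```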
@@ -620,7 +620,7 @@ def get_tokenizer(
         kwargs["use_fast"] = False
     if tokenizer_mode == "mistral":
         try:
-            from vllm.tokenizers import MistralTokenizer
+            from vllm.tokenizers.mistral import MistralTokenizer
         except ImportError as e:
             raise ImportError(
                 "MistralTokenizer requires vllm package.\n"
@@ -32,12 +32,11 @@ def benchmark_propose(args):

     model_config = ModelConfig(
         model="facebook/opt-125m",
-        task="generate",
         max_model_len=args.num_token + args.num_spec_token,
         tokenizer="facebook/opt-125m",
         tokenizer_mode="auto",
         dtype="auto",
-        seed=None,
+        seed=0,
         trust_remote_code=False,
     )
     proposer = NgramProposer(
@@ -574,7 +574,7 @@ async def benchmark(
     )
     print(
         "{:<40} {:<10.2f}".format(
-            "Total Token throughput (tok/s):", metrics.total_token_throughput
+            "Total token throughput (tok/s):", metrics.total_token_throughput
         )
     )

@@ -963,8 +963,7 @@ def create_argument_parser():
     parser.add_argument(
         "--profile",
         action="store_true",
-        help="Use Torch Profiler. The endpoint must be launched with "
-        "VLLM_TORCH_PROFILER_DIR to enable profiler.",
+        help="Use vLLM Profiling. --profiler-config must be provided on the server.",
     )
     parser.add_argument(
         "--result-dir",
@@ -14,6 +14,9 @@ from tqdm import tqdm

 import vllm._custom_ops as ops
 from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    per_token_group_quant_fp8,
+)


 @dataclass
@@ -22,6 +25,7 @@ class bench_params_t:
     hidden_size: int
     add_residual: bool
     dtype: torch.dtype
+    group_size: list[int]

     def description(self):
         return (
@@ -29,6 +33,7 @@ class bench_params_t:
             f"x D {self.hidden_size} "
             f"x R {self.add_residual} "
             f"x DT {self.dtype}"
+            f"x GS {self.group_size}"
         )


@@ -38,10 +43,11 @@ def get_bench_params() -> list[bench_params_t]:
     HIDDEN_SIZES = list(range(1024, 8129, 1024))
     ADD_RESIDUAL = [True, False]
     DTYPES = [torch.bfloat16, torch.float]
+    GROUP_SIZES = [[1, 64], [1, 128]]

-    combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES)
+    combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES, GROUP_SIZES)
     bench_params = list(
-        map(lambda x: bench_params_t(x[0], x[1], x[2], x[3]), combinations)
+        map(lambda x: bench_params_t(x[0], x[1], x[2], x[3], x[4]), combinations)
     )
     return bench_params

@@ -52,6 +58,7 @@ def unfused_int8_impl(
     x: torch.Tensor,
     residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
+    group_size: list[int],
 ):
     # Norm
     torch_out = None
@@ -69,6 +76,7 @@ def unfused_fp8_impl(
     x: torch.Tensor,
     residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
+    group_size: list[int],
 ):
     # Norm
     torch_out = None
@@ -81,23 +89,63 @@ def unfused_fp8_impl(
     torch_out, _ = ops.scaled_fp8_quant(torch_out)


+def unfused_groupwise_fp8_impl(
+    rms_norm_layer: RMSNorm,
+    x: torch.Tensor,
+    residual: torch.Tensor | None,
+    quant_dtype: torch.dtype,
+    group_size: list[int],
+):
+    # Norm
+    torch_out = None
+    if residual is None:
+        torch_out = rms_norm_layer.forward_cuda(x, residual)
+    else:
+        torch_out, _ = rms_norm_layer.forward_cuda(x, residual)
+
+    # Quant
+    torch_out, _ = per_token_group_quant_fp8(
+        torch_out, group_size=group_size[1], use_ue8m0=False
+    )
+
+
 def fused_impl(
     rms_norm_layer: RMSNorm,  # this stores the weights
     x: torch.Tensor,
     residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
+    group_size: list[int],
 ):
     out, _ = ops.rms_norm_dynamic_per_token_quant(
         x, rms_norm_layer.weight, 1e-6, quant_dtype, residual=residual
     )


+def fused_groupwise_impl(
+    rms_norm_layer: RMSNorm,  # this stores the weights
+    x: torch.Tensor,
+    residual: torch.Tensor | None,
+    quant_dtype: torch.dtype,
+    group_size: list[int],
+):
+    out, _ = ops.rms_norm_per_block_quant(
+        x,
+        rms_norm_layer.weight,
+        1e-6,
+        quant_dtype,
+        group_size,
+        residual=residual,
+        is_scale_transposed=True,
+    )
+
+
 # Bench functions
 def bench_fn(
     rms_norm_layer: RMSNorm,
     x: torch.Tensor,
     residual: torch.Tensor,
     quant_dtype: torch.dtype,
+    group_size: list[int],
     label: str,
     sub_label: str,
     fn: Callable,
@@ -110,10 +158,11 @@ def bench_fn(
         "x": x,
         "residual": residual,
         "quant_dtype": quant_dtype,
+        "group_size": group_size,
         "fn": fn,
     }
     return TBenchmark.Timer(
-        stmt="fn(rms_norm_layer, x, residual, quant_dtype)",
+        stmt="fn(rms_norm_layer, x, residual, quant_dtype, group_size)",
         globals=globals,
         label=label,
         sub_label=sub_label,
@@ -147,6 +196,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
             x,
             residual,
             torch.int8,
+            params.group_size,
             label,
             sub_label,
             unfused_int8_impl,
@@ -161,6 +211,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
             x,
             residual,
             torch.float8_e4m3fn,
+            params.group_size,
             label,
             sub_label,
             unfused_fp8_impl,
@@ -175,6 +226,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
             x,
             residual,
             torch.int8,
+            params.group_size,
             label,
             sub_label,
             fused_impl,
@@ -189,6 +241,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
             x,
             residual,
             torch.float8_e4m3fn,
+            params.group_size,
             label,
             sub_label,
             fused_impl,
@@ -196,6 +249,36 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
         )
     )

+    # unfused groupwise fp8 impl.
+    timers.append(
+        bench_fn(
+            layer,
+            x,
+            residual,
+            torch.float8_e4m3fn,
+            params.group_size,
+            label,
+            sub_label,
+            unfused_groupwise_fp8_impl,
+            "unfused_groupwise_fp8_impl",
+        )
+    )
+
+    # fused groupwise fp8 impl.
+    timers.append(
+        bench_fn(
+            layer,
+            x,
+            residual,
+            torch.float8_e4m3fn,
+            params.group_size,
+            label,
+            sub_label,
+            fused_groupwise_impl,
+            "fused_groupwise_fp8_impl",
+        )
+    )
+
     print_timers(timers)

     return timers
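The groupwise implementations above quantize each row in contiguous groups of `group_size[1]` elements, producing one FP8 scale per group rather than one per token. A rough sketch of that scaling scheme (an illustration of the math only, not the fused kernel; sizes are hypothetical, and 448 is the finite maximum of `float8_e4m3fn`):

```python
import torch

x = torch.randn(4, 256)    # [num_tokens, hidden_size], hypothetical sizes
group = 128                # plays the role of group_size[1] above
xg = x.view(4, -1, group)  # [tokens, groups_per_row, group]

# One scale per group, chosen so the group's max magnitude maps onto the FP8 range.
scales = xg.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12) / 448.0
x_q = (xg / scales).to(torch.float8_e4m3fn)
```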
150
benchmarks/kernels/benchmark_mla_k_concat.py
Normal file
@@ -0,0 +1,150 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark script comparing torch.cat vs direct copy for k_nope/k_pe concatenation
in MLA (Multi-head Latent Attention) prefill.

This validates that the optimization from commit 8d4142bd is beneficial across
various batch sizes, not just the originally tested batch size of 32768.
"""

import time
from collections.abc import Callable

import torch

# DeepSeek-V3 MLA dimensions
NUM_HEADS = 128
QK_NOPE_HEAD_DIM = 128
PE_DIM = 64


def cat_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
    """Original torch.cat approach with expand."""
    return torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)


def direct_copy_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
    """Optimized direct copy approach (avoids expand + cat overhead)."""
    k = torch.empty(
        (*k_nope.shape[:-1], k_nope.shape[-1] + k_pe.shape[-1]),
        dtype=k_nope.dtype,
        device=k_nope.device,
    )
    k[..., : k_nope.shape[-1]] = k_nope
    k[..., k_nope.shape[-1] :] = k_pe
    return k


def benchmark_method(
    method: Callable,
    k_nope: torch.Tensor,
    k_pe: torch.Tensor,
    num_warmup: int = 10,
    num_iters: int = 100,
) -> float:
    """Benchmark a concatenation method and return mean latency in ms."""
    # Warmup
    for _ in range(num_warmup):
        _ = method(k_nope, k_pe)
    torch.cuda.synchronize()

    # Benchmark
    start = time.perf_counter()
    for _ in range(num_iters):
        _ = method(k_nope, k_pe)
    torch.cuda.synchronize()
    end = time.perf_counter()

    return (end - start) / num_iters * 1000  # Convert to ms


@torch.inference_mode()
def run_benchmark(dtype: torch.dtype, dtype_name: str):
    """Run benchmark for a specific dtype."""
    torch.set_default_device("cuda")

    # Batch sizes to test (powers of 2 from 32 to 65536)
    batch_sizes = [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536]

    print("=" * 80)
    print("Benchmark: torch.cat vs direct copy for MLA k_nope/k_pe concatenation")
    print("=" * 80)
    print(
        f"Tensor shapes: k_nope=[B, {NUM_HEADS}, {QK_NOPE_HEAD_DIM}], "
        f"k_pe=[B, 1, {PE_DIM}]"
    )
    print(f"dtype: {dtype_name}")
    print()
    print(
        f"{'Batch Size':>12} | {'cat (ms)':>10} | {'direct (ms)':>12} | "
        f"{'Speedup':>8} | {'Reduction':>10}"
    )
    print("-" * 70)

    results = []
    for batch_size in batch_sizes:
        # Create input tensors (generate in float32 then convert for FP8 compatibility)
        k_nope = torch.randn(
            batch_size, NUM_HEADS, QK_NOPE_HEAD_DIM, dtype=torch.float32, device="cuda"
        ).to(dtype)
        k_pe = torch.randn(
            batch_size, 1, PE_DIM, dtype=torch.float32, device="cuda"
        ).to(dtype)

        # Benchmark both methods
        cat_time = benchmark_method(cat_method, k_nope, k_pe)
        direct_time = benchmark_method(direct_copy_method, k_nope, k_pe)

        speedup = cat_time / direct_time
        reduction = (1 - direct_time / cat_time) * 100

        results.append((batch_size, cat_time, direct_time, speedup, reduction))

        print(
            f"{batch_size:>12} | {cat_time:>10.3f} | {direct_time:>12.3f} | "
            f"{speedup:>7.2f}x | {reduction:>9.1f}%"
        )

    print("=" * 80)

    # Summary statistics
    speedups = [r[3] for r in results]
    print("\nSpeedup summary:")
    print(f"  Min:  {min(speedups):.2f}x")
    print(f"  Max:  {max(speedups):.2f}x")
    print(f"  Mean: {sum(speedups) / len(speedups):.2f}x")

    # Find crossover point
    crossover_batch = None
    for batch_size, _, _, speedup, _ in results:
        if speedup >= 1.0:
            crossover_batch = batch_size
            break

    print("\nConclusion:")
    if crossover_batch:
        print(f"  - Direct copy becomes beneficial at batch size >= {crossover_batch}")
    # Filter for large batches (>= 512 which is typical for prefill)
    large_batch_speedups = [r[3] for r in results if r[0] >= 512]
    if large_batch_speedups:
        avg_large = sum(large_batch_speedups) / len(large_batch_speedups)
        print(f"  - For batch sizes >= 512: avg speedup = {avg_large:.2f}x")
    print("  - MLA prefill typically uses large batches, so optimization is effective")

    return results


@torch.inference_mode()
def main():
    # Test bfloat16
    print("\n")
    run_benchmark(torch.bfloat16, "bfloat16")

    # Test float8_e4m3fn
    print("\n")
    run_benchmark(torch.float8_e4m3fn, "float8_e4m3fn")


if __name__ == "__main__":
    main()
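The next hunk extends the `moe_align_block_size` benchmark with an expert-parallel (EP) mapping: `e_map` translates global expert ids into local ids, with `-1` marking experts not hosted on the current rank. A minimal sketch of what that map encodes, using hypothetical sizes:

```python
import torch

num_experts, ep_size = 8, 2
local_e = num_experts // ep_size               # experts hosted per rank
e_ids = torch.randperm(num_experts)[:local_e]  # global ids owned by this rank
e_map = torch.full((num_experts,), -1, dtype=torch.int32)
e_map[e_ids] = torch.arange(local_e, dtype=torch.int32)
# e_map[g] is the local slot of global expert g, or -1 if it lives on another rank.
```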
@ -24,12 +24,15 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
 num_tokens_range = [1, 16, 256, 4096]
 num_experts_range = [16, 64, 224, 256, 280, 512]
 topk_range = [1, 2, 8]
-configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
+ep_size_range = [1, 8]
+configs = list(
+    itertools.product(num_tokens_range, num_experts_range, topk_range, ep_size_range)
+)
 
 
 @triton.testing.perf_report(
     triton.testing.Benchmark(
-        x_names=["num_tokens", "num_experts", "topk"],
+        x_names=["num_tokens", "num_experts", "topk", "ep_size"],
         x_vals=configs,
         line_arg="provider",
         line_vals=["vllm"],
@ -38,16 +41,26 @@ configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range
         args={},
     )
 )
-def benchmark(num_tokens, num_experts, topk, provider):
+def benchmark(num_tokens, num_experts, topk, ep_size, provider):
     """Benchmark function for Triton."""
     block_size = 256
+    torch.cuda.manual_seed_all(0)
     topk_ids = get_topk_ids(num_tokens, num_experts, topk)
 
+    e_map = None
+    if ep_size != 1:
+        local_e = num_experts // ep_size
+        e_ids = torch.randperm(num_experts, device="cuda", dtype=torch.int32)[:local_e]
+        e_map = torch.full((num_experts,), -1, device="cuda", dtype=torch.int32)
+        e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32)
+
     quantiles = [0.5, 0.2, 0.8]
 
     if provider == "vllm":
         ms, min_ms, max_ms = triton.testing.do_bench(
-            lambda: moe_align_block_size(topk_ids, block_size, num_experts),
+            lambda: moe_align_block_size(
+                topk_ids, block_size, num_experts, e_map, ignore_invalid_experts=True
+            ),
            quantiles=quantiles,
        )
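For reference, the e_map built above encodes expert parallelism: each EP rank owns num_experts // ep_size experts, whose global ids map to dense local ids, while every other expert maps to -1 so the alignment kernel can drop its tokens. A minimal standalone sketch of the same construction:

import torch

def build_expert_map(num_experts: int, ep_size: int, seed: int = 0) -> torch.Tensor:
    # Pick this rank's experts at random and assign them dense local ids;
    # all experts owned by other ranks map to -1.
    g = torch.Generator().manual_seed(seed)
    local_e = num_experts // ep_size
    e_ids = torch.randperm(num_experts, generator=g)[:local_e]
    e_map = torch.full((num_experts,), -1, dtype=torch.int32)
    e_map[e_ids] = torch.arange(local_e, dtype=torch.int32)
    return e_map

e_map = build_expert_map(num_experts=8, ep_size=4)
assert (e_map != -1).sum() == 2  # this rank owns 2 of the 8 experts
assert sorted(e_map[e_map != -1].tolist()) == [0, 1]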
@ -99,7 +99,6 @@ def benchmark_mrope(
     # the parameters to compute the q k v size based on tp_size
     mrope_helper_class = get_rope(
         head_size=head_dim,
-        rotary_dim=head_dim,
         max_position=max_position,
         is_neox_style=is_neox_style,
         rope_parameters=rope_parameters,
@ -32,8 +32,8 @@ def get_benchmark(head_size, rotary_dim, is_neox_style, device):
     def benchmark(batch_size, seq_len, num_heads, provider):
         dtype = torch.bfloat16
         max_position = 8192
-        base = 10000
-        rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
+        rope_parameters = {"partial_rotary_factor": rotary_dim / head_size}
+        rope = get_rope(head_size, max_position, is_neox_style, rope_parameters)
         rope = rope.to(dtype=dtype, device=device)
         cos_sin_cache = rope.cos_sin_cache.to(dtype=torch.float, device=device)
 
@ -251,17 +251,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
     endif()
 
     # Build ACL with CMake
-    set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF")
-    set(CMAKE_BUILD_TYPE "Release")
-    set(ARM_COMPUTE_ARCH "armv8.2-a")
-    set(ARM_COMPUTE_ENABLE_ASSERTS "OFF")
-    set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF")
-    set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
-    set(ARM_COMPUTE_ENABLE_OPENMP "ON")
-    set(ARM_COMPUTE_ENABLE_WERROR "OFF")
-    set(ARM_COMPUTE_BUILD_EXAMPLES "OFF")
-    set(ARM_COMPUTE_BUILD_TESTING "OFF")
-
     set(_cmake_config_cmd
         ${CMAKE_COMMAND} -G Ninja -B build
         -DARM_COMPUTE_BUILD_SHARED_LIB=OFF
@ -35,16 +35,21 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
 # sm90a
 
 set(SUPPORT_ARCHS)
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3)
-    list(APPEND SUPPORT_ARCHS 9.0a)
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3)
+    list(APPEND SUPPORT_ARCHS "9.0a")
 endif()
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8)
-    list(APPEND SUPPORT_ARCHS 10.0a)
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9)
+    # CUDA 12.9 has introduced "Family-Specific Architecture Features"
+    # this supports all compute_10x family
+    list(APPEND SUPPORT_ARCHS "10.0f")
+elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+    list(APPEND SUPPORT_ARCHS "10.0a")
 endif()
 
 
 cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
 if(FLASH_MLA_ARCHS)
+    message(STATUS "FlashMLA CUDA architectures: ${FLASH_MLA_ARCHS}")
     set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
     list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")
 
@ -126,7 +131,8 @@ if(FLASH_MLA_ARCHS)
         $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
         $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
 else()
-    # Create empty targets for setup.py when not targeting sm90a systems
+    message(STATUS "FlashMLA will not compile: unsupported CUDA architecture ${CUDA_ARCHS}")
+    # Create empty targets for setup.py on unsupported systems
     add_custom_target(_flashmla_C)
     add_custom_target(_flashmla_extension_C)
 endif()
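The same version policy, restated outside CMake as an illustration only (this sketch uses the third-party packaging library and is not part of the diff):

from packaging.version import Version

def flashmla_support_archs(cuda_version: str) -> list[str]:
    # Mirrors the CMake logic above: 9.0a needs CUDA >= 12.3; CUDA >= 12.9
    # can use the family-specific "10.0f" target (covers all compute_10x),
    # while CUDA 12.8 falls back to the architecture-specific "10.0a".
    v = Version(cuda_version)
    archs: list[str] = []
    if v >= Version("12.3"):
        archs.append("9.0a")
    if v >= Version("12.9"):
        archs.append("10.0f")
    elif v >= Version("12.8"):
        archs.append("10.0a")
    return archs

assert flashmla_support_archs("12.4") == ["9.0a"]
assert flashmla_support_archs("12.8") == ["9.0a", "10.0a"]
assert flashmla_support_archs("13.0") == ["9.0a", "10.0f"]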
@ -140,16 +140,21 @@ function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR)
   run_python(_VLLM_TORCH_GOMP_PATH
     "
 import os, glob
-try:
-    import torch
-    torch_pkg = os.path.dirname(torch.__file__)
-    site_root = os.path.dirname(torch_pkg)
-    torch_libs = os.path.join(site_root, 'torch.libs')
-    print(glob.glob(os.path.join(torch_libs, 'libgomp-*.so*'))[0])
-except:
-    print('')
+import torch
+torch_pkg = os.path.dirname(torch.__file__)
+site_root = os.path.dirname(torch_pkg)
+
+# Search both torch.libs and torch/lib
+roots = [os.path.join(site_root, 'torch.libs'), os.path.join(torch_pkg, 'lib')]
+candidates = []
+for root in roots:
+    if not os.path.isdir(root):
+        continue
+    candidates.extend(glob.glob(os.path.join(root, 'libgomp*.so*')))
+
+print(candidates[0] if candidates else '')
 "
-    "failed to probe torch.libs for libgomp")
+    "failed to probe for libgomp")
 
   if(_VLLM_TORCH_GOMP_PATH STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}")
     return()
10
csrc/cache.h
@ -1,6 +1,7 @@
 #pragma once
 
 #include <torch/all.h>
+#include <c10/util/Optional.h>
 
 #include <map>
 #include <vector>
@ -58,6 +59,15 @@ void cp_gather_cache(
     torch::Tensor const& cu_seq_lens,  // [BATCH+1]
     int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);
 
+// Gather and upconvert FP8 KV cache to BF16 workspace
+void cp_gather_and_upconvert_fp8_kv_cache(
+    torch::Tensor const& src_cache,         // [NUM_BLOCKS, BLOCK_SIZE, 656]
+    torch::Tensor const& dst,               // [TOT_TOKENS, 576]
+    torch::Tensor const& block_table,       // [BATCH, BLOCK_INDICES]
+    torch::Tensor const& seq_lens,          // [BATCH]
+    torch::Tensor const& workspace_starts,  // [BATCH]
+    int64_t batch_size);
+
 // Indexer K quantization and cache function
 void indexer_k_quant_and_cache(
     torch::Tensor& k,  // [num_tokens, head_dim]
@ -2,6 +2,7 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <c10/cuda/CUDAException.h>
+#include <c10/util/Optional.h>
 
 #include "cuda_utils.h"
 #include "cuda_compat.h"
@ -514,7 +515,8 @@ __global__ void indexer_k_quant_and_cache_kernel(
     const int quant_block_size,  // quantization block size
     const int cache_block_size,  // cache block size
     const int cache_stride,      // stride for each token in kv_cache
-    const bool use_ue8m0  // use ue8m0 scale format
+    const bool use_ue8m0  // use ue8m0 scale format
 ) {
   constexpr int VEC_SIZE = 4;
   const int64_t token_idx = blockIdx.x;
@ -1061,6 +1063,82 @@ void gather_and_maybe_dequant_cache(
 }
 
 namespace vllm {
+
+// Gather and upconvert FP8 KV cache tokens to BF16 workspace
+// Similar to cp_gather_cache but specifically for FP8->BF16 conversion
+__global__ void cp_gather_and_upconvert_fp8_kv_cache(
+    const uint8_t* __restrict__ src_cache,  // [NUM_BLOCKS, BLOCK_SIZE, 656]
+    __nv_bfloat16* __restrict__ dst,        // [TOT_TOKENS, 576]
+    const int32_t* __restrict__ block_table,       // [BATCH, BLOCK_INDICES]
+    const int32_t* __restrict__ seq_lens,          // [BATCH]
+    const int32_t* __restrict__ workspace_starts,  // [BATCH]
+    const int32_t block_size, const int32_t head_dim,
+    const int64_t block_table_stride, const int64_t cache_block_stride,
+    const int64_t cache_entry_stride, const int64_t dst_entry_stride) {
+  const int64_t bid = blockIdx.x;  // Batch ID
+  const int32_t num_splits = gridDim.y;
+  const int32_t split = blockIdx.y;
+  const int32_t seq_start = workspace_starts[bid];
+  const int32_t seq_len = seq_lens[bid];
+  const int32_t tot_slots = seq_len;
+  const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits);
+
+  const int32_t split_start = split * split_slots;
+  const int32_t split_end = min((split + 1) * split_slots, tot_slots);
+
+  const bool is_active_split = (split_start < tot_slots);
+
+  if (!is_active_split) return;
+
+  // Adjust the pointer for the block_table for this batch
+  const int32_t batch_offset = bid * block_table_stride;
+  int32_t offset = split_start;
+  int32_t offset_div = offset / block_size;
+  offset = offset % block_size;
+  const int32_t* batch_block_table = block_table + batch_offset;
+
+  // Adjust dst pointer based on the cumulative sequence lengths
+  dst += seq_start * dst_entry_stride;
+
+  const int tid = threadIdx.x;
+
+  // Process each token in this split
+  for (int pid = split_start; pid < split_end; ++pid) {
+    auto block_id = batch_block_table[offset_div];
+    const uint8_t* token_ptr =
+        src_cache + block_id * cache_block_stride + offset * cache_entry_stride;
+    __nv_bfloat16* dst_ptr = dst + pid * dst_entry_stride;
+
+    // FP8 format: 512 bytes fp8 + 16 bytes scales + 128 bytes rope (64 bf16)
+    const uint8_t* no_pe_ptr = token_ptr;
+    const float* scales_ptr = reinterpret_cast<const float*>(token_ptr + 512);
+    const __nv_bfloat16* rope_ptr =
+        reinterpret_cast<const __nv_bfloat16*>(token_ptr + 512 + 16);
+
+    // Parallelize fp8 dequant (512 elements) and rope copy (64 elements)
+    if (tid < 512) {
+      // FP8 dequantization
+      const int tile = tid >> 7;  // each tile is 128 elements
+      const float scale = scales_ptr[tile];
+      const uint8_t val = no_pe_ptr[tid];
+      dst_ptr[tid] =
+          fp8::scaled_convert<__nv_bfloat16, uint8_t,
+                              vllm::Fp8KVCacheDataType::kFp8E4M3>(val, scale);
+    } else if (tid < 576) {
+      // Rope copy (64 bf16 elements)
+      const int rope_idx = tid - 512;
+      dst_ptr[512 + rope_idx] = rope_ptr[rope_idx];
+    }
+
+    // Move to next token
+    offset += 1;
+    if (offset == block_size) {
+      offset_div += 1;
+      offset = 0;
+    }
+  }
+}
+
 template <typename scalar_t>
 // Note(hc): The cp_gather_cache allows seq_starts to no longer be divisible by
 // block_size.
@ -1202,6 +1280,57 @@ void cp_gather_cache(
   }
 }
 
+void cp_gather_and_upconvert_fp8_kv_cache(
+    torch::Tensor const& src_cache,         // [NUM_BLOCKS, BLOCK_SIZE, 656]
+    torch::Tensor const& dst,               // [TOT_TOKENS, 576]
+    torch::Tensor const& block_table,       // [BATCH, BLOCK_INDICES]
+    torch::Tensor const& seq_lens,          // [BATCH]
+    torch::Tensor const& workspace_starts,  // [BATCH]
+    int64_t batch_size) {
+  at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  int32_t block_size = src_cache.size(1);
+  int32_t head_dim = dst.size(1);
+
+  TORCH_CHECK(block_table.dtype() == torch::kInt32,
+              "block_table must be int32");
+  TORCH_CHECK(seq_lens.dtype() == torch::kInt32, "seq_lens must be int32");
+  TORCH_CHECK(workspace_starts.dtype() == torch::kInt32,
+              "workspace_starts must be int32");
+
+  TORCH_CHECK(src_cache.device() == dst.device(),
+              "src_cache and dst must be on the same device");
+  TORCH_CHECK(src_cache.device() == block_table.device(),
+              "src_cache and block_table must be on the same device");
+  TORCH_CHECK(src_cache.device() == seq_lens.device(),
+              "src_cache and seq_lens must be on the same device");
+  TORCH_CHECK(src_cache.device() == workspace_starts.device(),
+              "src_cache and workspace_starts must be on the same device");
+
+  TORCH_CHECK(src_cache.dtype() == torch::kUInt8, "src_cache must be uint8");
+  TORCH_CHECK(dst.dtype() == torch::kBFloat16, "dst must be bfloat16");
+  TORCH_CHECK(head_dim == 576, "head_dim must be 576 for MLA");
+
+  int64_t block_table_stride = block_table.stride(0);
+  int64_t cache_block_stride = src_cache.stride(0);
+  int64_t cache_entry_stride = src_cache.stride(1);
+  int64_t dst_entry_stride = dst.stride(0);
+
+  // Decide on the number of splits based on the batch size
+  int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
+  dim3 grid(batch_size, num_splits);
+  dim3 block(576);
+
+  vllm::cp_gather_and_upconvert_fp8_kv_cache<<<grid, block, 0, stream>>>(
+      src_cache.data_ptr<uint8_t>(),
+      reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()),
+      block_table.data_ptr<int32_t>(), seq_lens.data_ptr<int32_t>(),
+      workspace_starts.data_ptr<int32_t>(), block_size, head_dim,
+      block_table_stride, cache_block_stride, cache_entry_stride,
+      dst_entry_stride);
+}
+
 // Macro to dispatch the kernel based on the data type.
 #define CALL_INDEXER_K_QUANT_AND_CACHE(KV_T, CACHE_T, KV_DTYPE)  \
   vllm::indexer_k_quant_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE> \
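For orientation, each cached token decoded by the kernel above is 656 bytes: 512 FP8 values, four float32 per-tile scales (16 bytes, one per 128-element tile), then 64 bf16 rope values (128 bytes), producing 576 output elements. A NumPy sketch of that per-token layout; the E4M3 decoder here is a plain bias-7 illustration that ignores the NaN encoding and is not the fp8::scaled_convert the kernel calls:

import numpy as np

def fp8_e4m3_to_float(b: np.ndarray) -> np.ndarray:
    # Decode FP8 E4M3 (sign 1, exponent 4 with bias 7, mantissa 3).
    sign = np.where(b & 0x80, -1.0, 1.0)
    exp = ((b >> 3) & 0xF).astype(np.int32)
    man = (b & 0x07).astype(np.float64)
    val = np.where(exp == 0,
                   man / 8.0 * 2.0 ** -6,                # subnormals
                   (1.0 + man / 8.0) * 2.0 ** (exp - 7))  # normals
    return (sign * val).astype(np.float32)

def upconvert_fp8_token(token: np.ndarray) -> np.ndarray:
    # token layout: [0:512] fp8, [512:528] four float32 scales,
    # [528:656] 64 bf16 rope values. Output: 576 floats.
    assert token.shape == (656,) and token.dtype == np.uint8
    scales = token[512:528].copy().view(np.float32)
    rope_bits = token[528:656].copy().view(np.uint16)
    out = np.empty(576, dtype=np.float32)
    for tile in range(4):  # dequantize each 128-element tile by its scale
        sl = slice(tile * 128, (tile + 1) * 128)
        out[sl] = fp8_e4m3_to_float(token[sl]) * scales[tile]
    # bf16 -> float32: the 16 bits become the high half of the float32
    out[512:] = (rope_bits.astype(np.uint32) << 16).view(np.float32)
    return out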
@ -117,7 +117,6 @@ torch::Tensor get_scheduler_metadata(
   input.casual = casual;
   input.isa = isa;
   input.enable_kv_split = enable_kv_split;
-  TORCH_CHECK(casual, "Only supports casual mask for now.");
 
   VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() {
     CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
@ -186,7 +186,7 @@ struct AttentionMetadata {
 // - Intermediate outputs: q_tile_size * head_dim * output_buffer_elem_size + 2
 // * q_tile_size * 4, partial output, max + sum (float)
 // Reduction scratchpad contains:
-// - flags: bool array to indicate wether the split is finished
+// - flags: bool array to indicate whether the split is finished
 // - outputs: split_num * q_tile_size * head_dim * output_buffer_elem_size
 // - max, sum: 2 * split_num * q_tile_size * 4
 class AttentionScratchPad {
@ -1246,14 +1246,8 @@ class AttentionMainLoop {
       // rescale sum and partial outputs
       if (need_rescale) {
        // compute rescale factor
-#ifdef DEFINE_FAST_EXP
-        vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
-        rescale_factor_vec = fast_exp(rescale_factor_vec);
-        rescale_factor = rescale_factor_vec.get_last_elem();
-#else
         rescale_factor = std::exp(rescale_factor);
         vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
-#endif
 
         // rescale sum
         new_sum_val += rescale_factor * init_sum_val;
@ -1889,15 +1883,8 @@ class AttentionMainLoop {
             : curr_output_buffer;
         float rescale_factor = final_max > curr_max ? curr_max - final_max
                                                     : final_max - curr_max;
-
-#ifdef DEFINE_FAST_EXP
-        vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
-        rescale_factor_vec = fast_exp(rescale_factor_vec);
-        rescale_factor = rescale_factor_vec.get_last_elem();
-#else
         rescale_factor = std::exp(rescale_factor);
         vec_op::FP32Vec16 rescale_factor_vec(rescale_factor);
-#endif
 
         local_sum[head_idx] = final_max > curr_max
                                   ? final_sum + rescale_factor * curr_sum
@ -60,4 +60,54 @@
 
 #endif
 
+#ifdef __aarch64__
+// Implementation copied from Arm Optimized Routines (expf AdvSIMD)
+// https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c
+#include <limits>
+#define DEFINE_FAST_EXP \
+  const float32x4_t inv_ln2 = vdupq_n_f32(0x1.715476p+0f); \
+  const float ln2_hi = 0x1.62e4p-1f; \
+  const float ln2_lo = 0x1.7f7d1cp-20f; \
+  const float c0 = 0x1.0e4020p-7f; \
+  const float c2 = 0x1.555e66p-3f; \
+  const float32x4_t ln2_c02 = {ln2_hi, ln2_lo, c0, c2}; \
+  const uint32x4_t exponent_bias = vdupq_n_u32(0x3f800000); \
+  const float32x4_t c1 = vdupq_n_f32(0x1.573e2ep-5f); \
+  const float32x4_t c3 = vdupq_n_f32(0x1.fffdb6p-2f); \
+  const float32x4_t c4 = vdupq_n_f32(0x1.ffffecp-1f); \
+  const float32x4_t pos_special_bound = vdupq_n_f32(0x1.5d5e2ap+6f); \
+  const float32x4_t neg_special_bound = vnegq_f32(pos_special_bound); \
+  const float32x4_t inf = \
+      vdupq_n_f32(std::numeric_limits<float>::infinity()); \
+  const float32x4_t zero = vdupq_n_f32(0.0f); \
+  auto neon_expf = [&](float32x4_t values) __attribute__((always_inline)) { \
+    float32x4_t n = vrndaq_f32(vmulq_f32(values, inv_ln2)); \
+    float32x4_t r = vfmsq_laneq_f32(values, n, ln2_c02, 0); \
+    r = vfmsq_laneq_f32(r, n, ln2_c02, 1); \
+    uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_s32(vcvtq_s32_f32(n)), 23); \
+    float32x4_t scale = vreinterpretq_f32_u32(vaddq_u32(e, exponent_bias)); \
+    float32x4_t r2 = vmulq_f32(r, r); \
+    float32x4_t p = vfmaq_laneq_f32(c1, r, ln2_c02, 2); \
+    float32x4_t q = vfmaq_laneq_f32(c3, r, ln2_c02, 3); \
+    q = vfmaq_f32(q, p, r2); \
+    p = vmulq_f32(c4, r); \
+    float32x4_t poly = vfmaq_f32(p, q, r2); \
+    poly = vfmaq_f32(scale, poly, scale); \
+    const uint32x4_t hi_mask = vcgeq_f32(values, pos_special_bound); \
+    const uint32x4_t lo_mask = vcleq_f32(values, neg_special_bound); \
+    poly = vbslq_f32(hi_mask, inf, poly); \
+    return vbslq_f32(lo_mask, zero, poly); \
+  }; \
+  auto fast_exp = [&](vec_op::FP32Vec16& vec) \
+      __attribute__((always_inline)) { \
+    float32x4x4_t result; \
+    result.val[0] = neon_expf(vec.reg.val[0]); \
+    result.val[1] = neon_expf(vec.reg.val[1]); \
+    result.val[2] = neon_expf(vec.reg.val[2]); \
+    result.val[3] = neon_expf(vec.reg.val[3]); \
+    return vec_op::FP32Vec16(result); \
+  };
+
+#endif  // __aarch64__
+
 #endif
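The macro implements the standard expf range reduction: write x = n*ln2 + r with n rounded to the nearest integer and |r| <= ln2/2, evaluate a degree-5 polynomial for e^r, then scale by 2^n assembled directly in the float exponent. A scalar Python sketch of the same scheme, reusing the macro's coefficients:

import math

inv_ln2 = float.fromhex("0x1.715476p+0")
ln2_hi = float.fromhex("0x1.62e4p-1")
ln2_lo = float.fromhex("0x1.7f7d1cp-20")
c0 = float.fromhex("0x1.0e4020p-7")
c1 = float.fromhex("0x1.573e2ep-5")
c2 = float.fromhex("0x1.555e66p-3")
c3 = float.fromhex("0x1.fffdb6p-2")
c4 = float.fromhex("0x1.ffffecp-1")

def fast_expf(x: float) -> float:
    n = round(x * inv_ln2)
    # two-step subtraction (hi/lo split of ln2) keeps r accurate
    r = (x - n * ln2_hi) - n * ln2_lo
    # e^r - 1 ~= c4*r + c3*r^2 + c2*r^3 + c1*r^4 + c0*r^5 on |r| <= ln2/2
    poly = ((((c0 * r + c1) * r + c2) * r + c3) * r + c4) * r
    return math.ldexp(1.0 + poly, n)  # exact scale by 2^n

for x in (-10.0, -1.0, 0.0, 0.5, 3.0):
    assert abs(fast_expf(x) - math.exp(x)) <= 1e-6 * math.exp(x) + 1e-12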
@ -118,6 +118,24 @@
   }                                        \
   }
 
+#define VLLM_DISPATCH_BOOL(expr, const_expr, ...) \
+  if (expr) {                                     \
+    constexpr bool const_expr = true;             \
+    __VA_ARGS__();                                \
+  } else {                                        \
+    constexpr bool const_expr = false;            \
+    __VA_ARGS__();                                \
+  }
+
+#define VLLM_DISPATCH_GROUP_SIZE(group_size, const_group_size, ...) \
+  if (group_size == 128) {                                          \
+    constexpr int const_group_size = 128;                           \
+    __VA_ARGS__();                                                  \
+  } else if (group_size == 64) {                                    \
+    constexpr int const_group_size = 64;                            \
+    __VA_ARGS__();                                                  \
+  }
+
 #define VLLM_DISPATCH_RANK234(NUM_DIMS, ...) \
   switch (NUM_DIMS) {                        \
     case 2: {                                \
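VLLM_DISPATCH_BOOL compiles the body once per constant value, so the runtime branch is hoisted out of the kernel entirely. A loose Python analogue of that dispatch-once pattern (illustrative names only):

def make_kernel(use_expert_map: bool):
    # Each specialization is branch-free inside the per-token loop;
    # the flag is resolved exactly once, at construction time.
    if use_expert_map:
        def kernel(token_experts, expert_map):
            return [expert_map[e] for e in token_experts if expert_map[e] != -1]
    else:
        def kernel(token_experts, expert_map):
            return list(token_experts)
    return kernel

kernel = make_kernel(use_expert_map=True)  # dispatch happens once, here
assert kernel([0, 2, 3], {0: 0, 1: -1, 2: 1, 3: -1}) == [0, 1]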
@ -444,23 +444,27 @@ __device__ inline T apply_sigmoid(T val) {
   return cuda_cast<T, float>(sigmoid_accurate(f));
 }
 
-template <typename T>
+template <ScoringFunc SF, typename T>
+__device__ inline T apply_scoring(T val) {
+  if constexpr (SF == SCORING_SIGMOID) {
+    return apply_sigmoid(val);
+  } else {
+    return val;
+  }
+}
+
+template <typename T, ScoringFunc SF>
 __device__ void topk_with_k2(T* output, T const* input, T const* bias,
                              cg::thread_block_tile<32> const& tile,
                              int32_t const lane_id,
-                             int const num_experts_per_group,
-                             int const scoring_func) {
+                             int const num_experts_per_group) {
   // Get the top2 per thread
   T largest = neg_inf<T>();
   T second_largest = neg_inf<T>();
 
   if (num_experts_per_group > WARP_SIZE) {
     for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
-      T value = input[i];
-      // Apply scoring function if needed
-      if (scoring_func == SCORING_SIGMOID) {
-        value = apply_sigmoid(value);
-      }
+      T value = apply_scoring<SF>(input[i]);
       value = value + bias[i];
 
       if (value > largest) {
@ -472,17 +476,11 @@ __device__ void topk_with_k2(T* output, T const* input, T const* bias,
     }
   } else {
     for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
-      T value = input[i];
-      // Apply scoring function if needed
-      if (scoring_func == SCORING_SIGMOID) {
-        value = apply_sigmoid(value);
-      }
+      T value = apply_scoring<SF>(input[i]);
       value = value + bias[i];
       largest = value;
     }
   }
 
-  __syncwarp();  // Ensure all threads have valid data before reduction
   // Get the top2 warpwise
   T max1 = cg::reduce(tile, largest, cg::greater<T>());
 
@ -501,13 +499,12 @@ __device__ void topk_with_k2(T* output, T const* input, T const* bias,
   }
 }
 
-template <typename T>
+template <typename T, ScoringFunc SF>
 __global__ void topk_with_k2_kernel(T* output, T* input, T const* bias,
                                     int64_t const num_tokens,
                                     int64_t const num_cases,
                                     int64_t const n_group,
-                                    int64_t const num_experts_per_group,
-                                    int const scoring_func) {
+                                    int64_t const num_experts_per_group) {
   int32_t warp_id = threadIdx.x / WARP_SIZE;
   int32_t lane_id = threadIdx.x % WARP_SIZE;
 
@ -525,21 +522,21 @@ __global__ void topk_with_k2_kernel(T* output, T* input, T const* bias,
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
     asm volatile("griddepcontrol.wait;");
 #endif
-    topk_with_k2(output, input, group_bias, tile, lane_id,
-                 num_experts_per_group, scoring_func);
+    topk_with_k2<T, SF>(output, input, group_bias, tile, lane_id,
+                        num_experts_per_group);
   }
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
   asm volatile("griddepcontrol.launch_dependents;");
 #endif
 }
 
-template <typename T, typename IdxT>
+template <typename T, typename IdxT, ScoringFunc SF, int NGroup = -1>
 __global__ void group_idx_and_topk_idx_kernel(
     T* scores, T const* group_scores, float* topk_values, IdxT* topk_indices,
     T const* bias, int64_t const num_tokens, int64_t const n_group,
     int64_t const topk_group, int64_t const topk, int64_t const num_experts,
     int64_t const num_experts_per_group, bool renormalize,
-    double routed_scaling_factor, int scoring_func) {
+    double routed_scaling_factor) {
   int32_t warp_id = threadIdx.x / WARP_SIZE;
   int32_t lane_id = threadIdx.x % WARP_SIZE;
   int32_t case_id =
@ -549,6 +546,11 @@ __global__ void group_idx_and_topk_idx_kernel(
   topk_values += case_id * topk;
   topk_indices += case_id * topk;
 
+  constexpr bool kUseStaticNGroup = (NGroup > 0);
+  // use int32 to avoid implicit conversion
+  int32_t const n_group_i32 =
+      kUseStaticNGroup ? NGroup : static_cast<int32_t>(n_group);
+
   int32_t align_num_experts_per_group =
       warp_topk::round_up_to_multiple_of<WARP_SIZE>(num_experts_per_group);
 
@ -574,17 +576,17 @@ __global__ void group_idx_and_topk_idx_kernel(
 
   if (case_id < num_tokens) {
     // calculate group_idx
-    int32_t target_num_min = WARP_SIZE - n_group + topk_group;
+    int32_t target_num_min =
+        WARP_SIZE - n_group_i32 + static_cast<int32_t>(topk_group);
     // The check is necessary to avoid abnormal input
-    if (lane_id < n_group && is_finite(group_scores[lane_id])) {
+    if (lane_id < n_group_i32 && is_finite(group_scores[lane_id])) {
       value = group_scores[lane_id];
     }
 
-    int count_equal_to_top_value = WARP_SIZE - n_group;
+    int count_equal_to_top_value = WARP_SIZE - n_group_i32;
     int pre_count_equal_to_top_value = 0;
     // Use loop to find the largset top_group
     while (count_equal_to_top_value < target_num_min) {
-      __syncwarp();  // Ensure all threads have valid data before reduction
       topk_group_value = cg::reduce(tile, value, cg::greater<T>());
       if (value == topk_group_value) {
         value = neg_inf<T>();
@ -604,7 +606,7 @@ __global__ void group_idx_and_topk_idx_kernel(
   int count_equalto_topkth_group = 0;
   bool if_proceed_next_topk = topk_group_value != neg_inf<T>();
   if (case_id < num_tokens && if_proceed_next_topk) {
-    for (int i_group = 0; i_group < n_group; i_group++) {
+    auto process_group = [&](int i_group) {
       if ((group_scores[i_group] > topk_group_value) ||
           ((group_scores[i_group] == topk_group_value) &&
            (count_equalto_topkth_group < num_equalto_topkth_group))) {
@ -613,11 +615,10 @@ __global__ void group_idx_and_topk_idx_kernel(
              i += WARP_SIZE) {
           T candidates = neg_inf<T>();
           if (i < num_experts_per_group) {
-            // Apply scoring function (if any) and add bias
+            // apply scoring function (if any) and add bias
            T input = scores[offset + i];
            if (is_finite(input)) {
-              T score = (scoring_func == SCORING_SIGMOID) ? apply_sigmoid(input)
-                                                          : input;
+              T score = apply_scoring<SF>(input);
              candidates = score + bias[offset + i];
            }
          }
@ -627,12 +628,21 @@ __global__ void group_idx_and_topk_idx_kernel(
         count_equalto_topkth_group++;
       }
     }
-    }
+    };
+
+    if constexpr (kUseStaticNGroup) {
+#pragma unroll
+      for (int i_group = 0; i_group < NGroup; ++i_group) {
+        process_group(i_group);
+      }
+    } else {
+      for (int i_group = 0; i_group < n_group_i32; ++i_group) {
+        process_group(i_group);
+      }
+    }
     queue.done();
-    __syncwarp();
     // Get the topk_idx
     queue.dumpIdx(s_topk_idx);
-    __syncwarp();
   }
 
   // Load the valid score value
@ -646,12 +656,13 @@ __global__ void group_idx_and_topk_idx_kernel(
       if (i < topk) {
         // Load the score value (without bias) for normalization
         T input = scores[s_topk_idx[i]];
-        value =
-            (scoring_func == SCORING_SIGMOID) ? apply_sigmoid(input) : input;
+        value = apply_scoring<SF>(input);
         s_topk_value[i] = value;
       }
-      topk_sum +=
-          cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
+      if (renormalize) {
+        topk_sum +=
+            cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
+      }
     }
   }
 
@ -660,13 +671,9 @@ __global__ void group_idx_and_topk_idx_kernel(
   if (case_id < num_tokens) {
     if (if_proceed_next_topk) {
       for (int i = lane_id; i < topk; i += WARP_SIZE) {
-        float value;
-        if (renormalize) {
-          value = cuda_cast<float, T>(s_topk_value[i]) / topk_sum *
-                  routed_scaling_factor;
-        } else {
-          value = cuda_cast<float, T>(s_topk_value[i]) * routed_scaling_factor;
-        }
+        float base = cuda_cast<float, T>(s_topk_value[i]);
+        float value = renormalize ? (base / topk_sum * routed_scaling_factor)
+                                  : (base * routed_scaling_factor);
         topk_indices[i] = s_topk_idx[i];
         topk_values[i] = value;
       }
@ -684,6 +691,45 @@ __global__ void group_idx_and_topk_idx_kernel(
 #endif
 }
 
+template <typename T, typename IdxT, ScoringFunc SF>
+inline void launch_group_idx_and_topk_kernel(
+    cudaLaunchConfig_t const& config, T* scores, T* group_scores,
+    float* topk_values, IdxT* topk_indices, T const* bias,
+    int64_t const num_tokens, int64_t const n_group, int64_t const topk_group,
+    int64_t const topk, int64_t const num_experts,
+    int64_t const num_experts_per_group, bool const renormalize,
+    double const routed_scaling_factor) {
+  auto launch = [&](auto* kernel_instance2) {
+    cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
+                       topk_values, topk_indices, bias, num_tokens, n_group,
+                       topk_group, topk, num_experts, num_experts_per_group,
+                       renormalize, routed_scaling_factor);
+  };
+
+  switch (n_group) {
+    case 4: {
+      launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 4>);
+      break;
+    }
+    case 8: {
+      launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 8>);
+      break;
+    }
+    case 16: {
+      launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 16>);
+      break;
+    }
+    case 32: {
+      launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF, 32>);
+      break;
+    }
+    default: {
+      launch(&group_idx_and_topk_idx_kernel<T, IdxT, SF>);
+      break;
+    }
+  }
+}
+
 template <typename T, typename IdxT>
 void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
                    IdxT* topk_indices, T const* bias, int64_t const num_tokens,
@ -694,7 +740,6 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
                    cudaStream_t const stream = 0) {
   int64_t num_cases = num_tokens * n_group;
   int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
-  auto* kernel_instance1 = &topk_with_k2_kernel<T>;
   cudaLaunchConfig_t config;
   config.gridDim = topk_with_k2_num_blocks;
   config.blockDim = BLOCK_SIZE;
@ -705,16 +750,33 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
   attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
   config.numAttrs = 1;
   config.attrs = attrs;
-  cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias,
-                     num_tokens, num_cases, n_group, num_experts / n_group,
-                     scoring_func);
+  auto const sf = static_cast<ScoringFunc>(scoring_func);
+  int64_t const num_experts_per_group = num_experts / n_group;
+  auto launch_topk_with_k2 = [&](auto* kernel_instance1) {
+    cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias,
+                       num_tokens, num_cases, n_group, num_experts_per_group);
+  };
+  switch (sf) {
+    case SCORING_NONE: {
+      auto* kernel_instance1 = &topk_with_k2_kernel<T, SCORING_NONE>;
+      launch_topk_with_k2(kernel_instance1);
+      break;
+    }
+    case SCORING_SIGMOID: {
+      auto* kernel_instance1 = &topk_with_k2_kernel<T, SCORING_SIGMOID>;
+      launch_topk_with_k2(kernel_instance1);
+      break;
+    }
+    default:
+      // should be guarded by higher level checks.
+      TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc");
+  }
 
   int64_t topk_with_k_group_num_blocks =
       (num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
   size_t dynamic_smem_in_bytes =
       warp_topk::calc_smem_size_for_block_wide<T, int32_t>(NUM_WARPS_PER_BLOCK,
                                                            topk);
-  auto* kernel_instance2 = &group_idx_and_topk_idx_kernel<T, IdxT>;
   config.gridDim = topk_with_k_group_num_blocks;
   config.blockDim = BLOCK_SIZE;
   config.dynamicSmemBytes = dynamic_smem_in_bytes;
@ -723,10 +785,24 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
   attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
   config.numAttrs = 1;
   config.attrs = attrs;
-  cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
-                     topk_values, topk_indices, bias, num_tokens, n_group,
-                     topk_group, topk, num_experts, num_experts / n_group,
-                     renormalize, routed_scaling_factor, scoring_func);
+  switch (sf) {
+    case SCORING_NONE: {
+      launch_group_idx_and_topk_kernel<T, IdxT, SCORING_NONE>(
+          config, scores, group_scores, topk_values, topk_indices, bias,
+          num_tokens, n_group, topk_group, topk, num_experts,
+          num_experts_per_group, renormalize, routed_scaling_factor);
+      break;
+    }
+    case SCORING_SIGMOID: {
+      launch_group_idx_and_topk_kernel<T, IdxT, SCORING_SIGMOID>(
+          config, scores, group_scores, topk_values, topk_indices, bias,
+          num_tokens, n_group, topk_group, topk, num_experts,
+          num_experts_per_group, renormalize, routed_scaling_factor);
+      break;
+    }
+    default:
+      TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc");
+  }
 }
 
 #define INSTANTIATE_NOAUX_TC(T, IdxT)                                     \
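Taken together, these kernels implement the no-aux grouped top-k routing: score experts (optionally with sigmoid), add a per-expert bias, rank groups by the sum of their top-2 biased scores, keep topk_group groups, pick topk experts inside them, and report weights from the unbiased scores (optionally renormalized) times routed_scaling_factor. A rough PyTorch reference of that flow; tie-breaking and non-finite handling differ from the kernel:

import torch

def noaux_grouped_topk(scores, bias, n_group, topk_group, topk,
                       routed_scaling_factor=1.0, renormalize=True,
                       sigmoid=True):
    num_tokens, num_experts = scores.shape
    s = torch.sigmoid(scores) if sigmoid else scores
    biased = s + bias                                       # [T, E]
    grouped = biased.view(num_tokens, n_group, -1)
    # group score = sum of the top-2 biased scores in the group
    group_scores = grouped.topk(2, dim=-1).values.sum(-1)   # [T, G]
    kept_groups = group_scores.topk(topk_group, dim=-1).indices
    mask = torch.zeros_like(group_scores, dtype=torch.bool)
    mask.scatter_(1, kept_groups, True)
    experts_per_group = num_experts // n_group
    expert_mask = mask.repeat_interleave(experts_per_group, dim=1)
    masked = biased.masked_fill(~expert_mask, float("-inf"))
    topk_idx = masked.topk(topk, dim=-1).indices
    # weights come from the unbiased scores of the chosen experts
    weights = s.gather(1, topk_idx)
    if renormalize:
        weights = weights / weights.sum(-1, keepdim=True)
    return topk_idx, weights * routed_scaling_factor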
@ -14,7 +14,6 @@
 
 namespace vllm {
 namespace moe {
 
 namespace batched_moe_align_block_size {
 
 // Note num_threads needs to be 1024 for BlockScan Reduction in the kernel.
@ -80,17 +79,32 @@ __global__ void batched_moe_align_block_size_kernel(
 }  // namespace batched_moe_align_block_size
 
 template <typename scalar_t>
-__global__ void moe_align_block_size_kernel(
+__device__ void _moe_align_block_size(
     const scalar_t* __restrict__ topk_ids,
     int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
-    int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
+    int32_t* __restrict__ total_tokens_post_pad,
+    int32_t* __restrict__ expert_map, int32_t num_experts,
     int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size,
-    size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded) {
+    size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded,
+    int32_t max_num_m_blocks, int32_t model_offset, int32_t inactive_expert_id,
+    int32_t topk_num, int32_t* token_mask, bool has_expert_map) {
   extern __shared__ int32_t shared_counts[];
 
-  // Initialize sorted_token_ids with numel
-  for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
-    sorted_token_ids[it] = numel;
+  // Compute input buffer offsets. Typically these will all be 0, except when
+  // using Multi LoRA.
+  int sorted_token_ids_offset = max_num_tokens_padded * model_offset;
+  int expert_ids_offset = max_num_m_blocks * model_offset;
+  int cumsum_offset = (num_experts + 1) * model_offset;
+
+  // Use separate threadblocks to fill sorted_token_ids.
+  // This is safe since the current kernel does not use sorted_token_ids.
+  if (blockIdx.x % 2) {
+    // Initialize sorted_token_ids with numel
+    for (size_t it = threadIdx.x; it < max_num_tokens_padded;
+         it += blockDim.x) {
+      sorted_token_ids[sorted_token_ids_offset + it] = numel;
+    }
+    return;
   }
 
   const int warp_id = threadIdx.x / WARP_SIZE;
@ -112,9 +126,16 @@ __global__ void moe_align_block_size_kernel(
     if (expert_id >= num_experts) {
       continue;
     }
+    if (has_expert_map) {
+      expert_id = expert_map[expert_id];
+      // filter invalid experts
+      if (expert_id == -1) continue;
+    }
     int warp_idx = expert_id / experts_per_warp;
     int expert_offset = expert_id % experts_per_warp;
-    atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1);
+    int mask = token_mask == nullptr ? 1 : token_mask[i / topk_num];
+    atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset],
+              mask);
   }
 
   __syncthreads();
@ -135,48 +156,196 @@ __global__ void moe_align_block_size_kernel(
   int cumsum_val;
   BlockScan(temp_storage).ExclusiveSum(expert_count, cumsum_val);
   if (expert_id <= num_experts) {
-    cumsum[expert_id] = cumsum_val;
+    cumsum[cumsum_offset + expert_id] = cumsum_val;
   }
 
   if (expert_id == num_experts) {
-    *total_tokens_post_pad = cumsum_val;
+    total_tokens_post_pad[model_offset] = cumsum_val;
   }
 
   __syncthreads();
 
   if (threadIdx.x < num_experts) {
-    for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
-         i += block_size) {
-      expert_ids[i / block_size] = threadIdx.x;
+    for (int i = cumsum[cumsum_offset + threadIdx.x];
+         i < cumsum[cumsum_offset + threadIdx.x + 1]; i += block_size) {
+      expert_ids[expert_ids_offset + i / block_size] = threadIdx.x;
     }
   }
 
   // Fill remaining expert_ids with 0
-  const size_t fill_start_idx = cumsum[num_experts] / block_size + threadIdx.x;
-  const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size);
-  for (size_t i = fill_start_idx; i < expert_ids_size; i += blockDim.x) {
-    expert_ids[i] = 0;
+  const size_t fill_start_idx =
+      cumsum[cumsum_offset + num_experts] / block_size + threadIdx.x;
+  for (size_t i = fill_start_idx; i < max_num_m_blocks; i += blockDim.x) {
+    expert_ids[expert_ids_offset + i] = inactive_expert_id;
+  }
+}
+
+template <typename scalar_t, int32_t fill_threads>
+__device__ void _moe_align_block_size_small_batch_expert(
+    const scalar_t* __restrict__ topk_ids,
+    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
+    int32_t* __restrict__ total_tokens_post_pad,
+    int32_t* __restrict__ expert_map, int32_t num_experts, int32_t block_size,
+    size_t numel, int32_t max_num_tokens_padded, int32_t max_num_m_blocks,
+    int32_t inactive_expert_id, int32_t model_offset, int32_t topk_num,
+    int32_t* token_mask, bool has_expert_map) {
+  // Compute input buffer offsets. Typically these will all be 0, except when
+  // using Multi LoRA.
+  int sorted_token_ids_offset = max_num_tokens_padded * model_offset;
+  int expert_ids_offset = max_num_m_blocks * model_offset;
+
+  // Use an additional group of threads to fill sorted_token_ids.
+  // Since the current kernel will use sorted_token_ids afterward,
+  // we fill sorted_token_ids within the same threadblock to make
+  // synchronization easier.
+  if (threadIdx.x < fill_threads) {
+    // Initialize sorted_token_ids with numel
+    for (size_t it = threadIdx.x; it < max_num_tokens_padded;
+         it += fill_threads) {
+      sorted_token_ids[sorted_token_ids_offset + it] = numel;
+    }
+    // Three __syncthreads() corresponding to the other threads
+    __syncthreads();
+    __syncthreads();
+    __syncthreads();
+    return;
+  }
+
+  const size_t tid = threadIdx.x - fill_threads;
+  const size_t stride = blockDim.x - fill_threads;
+
+  extern __shared__ int32_t shared_mem[];
+  int32_t* cumsum = shared_mem;
+  int32_t* tokens_cnts = (int32_t*)(shared_mem + num_experts + 1);
+
+  for (int i = 0; i < num_experts; ++i) {
+    tokens_cnts[(tid + 1) * num_experts + i] = 0;
+  }
+
+  for (size_t i = tid; i < numel; i += stride) {
+    int32_t expert_id = topk_ids[i];
+    if (has_expert_map) {
+      expert_id = expert_map[expert_id];
+      // filter invalid expert
+      if (expert_id == -1) continue;
+    }
+    int mask = token_mask == nullptr ? 1 : token_mask[i / topk_num];
+    tokens_cnts[(tid + 1) * num_experts + expert_id] += mask;
+  }
+
+  __syncthreads();
+
+  if (tid < num_experts) {
+    tokens_cnts[tid] = 0;
+    for (int i = 1; i <= stride; ++i) {
+      tokens_cnts[i * num_experts + tid] +=
+          tokens_cnts[(i - 1) * num_experts + tid];
+    }
+  }
+
+  __syncthreads();
+
+  if (tid == 0) {
+    cumsum[0] = 0;
+    for (int i = 1; i <= num_experts; ++i) {
+      cumsum[i] =
+          cumsum[i - 1] +
+          CEILDIV(tokens_cnts[stride * num_experts + i - 1], block_size) *
+              block_size;
+    }
+    total_tokens_post_pad[model_offset] =
+        static_cast<int32_t>(cumsum[num_experts]);
+  }
+
+  __syncthreads();
+
+  if (tid < num_experts) {
+    for (int i = cumsum[tid]; i < cumsum[tid + 1]; i += block_size) {
+      expert_ids[expert_ids_offset + i / block_size] = tid;
+    }
+  }
+
+  // Fill remaining expert_ids with 0
+  const size_t fill_start_idx = cumsum[num_experts] / block_size + tid;
+  for (size_t i = fill_start_idx; i < max_num_m_blocks; i += stride) {
+    expert_ids[expert_ids_offset + i] = inactive_expert_id;
+  }
+
+  for (size_t i = tid; i < numel; i += stride) {
+    int32_t expert_id = topk_ids[i];
+    if (has_expert_map) {
+      expert_id = expert_map[expert_id];
+      // filter invalid expert
+      if (expert_id == -1) continue;
+    }
+    int32_t rank_post_pad =
+        tokens_cnts[tid * num_experts + expert_id] + cumsum[expert_id];
+
+    if (token_mask == nullptr || token_mask[i / topk_num]) {
+      sorted_token_ids[sorted_token_ids_offset + rank_post_pad] = i;
+      ++tokens_cnts[tid * num_experts + expert_id];
+    }
   }
 }
 
 template <typename scalar_t>
-__global__ void count_and_sort_expert_tokens_kernel(
+__device__ void _count_and_sort_expert_tokens(
     const scalar_t* __restrict__ topk_ids,
     int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
-    size_t numel, int32_t num_experts) {
-  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-  const size_t stride = blockDim.x * gridDim.x;
+    int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
+    int32_t max_num_tokens_padded, int32_t* __restrict__ token_mask,
+    int32_t model_offset, int32_t topk_num, bool has_expert_map) {
+  const size_t tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const size_t stride = blockDim.x * gridDim.y;
 
   for (size_t i = tid; i < numel; i += stride) {
     int32_t expert_id = topk_ids[i];
     if (expert_id >= num_experts) {
       continue;
     }
-    int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1);
-    sorted_token_ids[rank_post_pad] = i;
+
+    if (has_expert_map) {
+      expert_id = expert_map[expert_id];
+      // filter invalid experts
+      if (expert_id == -1) continue;
+    }
+
+    if (token_mask == nullptr || token_mask[i / topk_num]) {
+      int32_t rank_post_pad = atomicAdd(
+          &cumsum_buffer[(model_offset * (num_experts + 1)) + expert_id], 1);
+      sorted_token_ids[max_num_tokens_padded * model_offset + rank_post_pad] =
+          i;
+    }
   }
 }
 
+template <typename scalar_t>
+__global__ void moe_align_block_size_kernel(
+    const scalar_t* __restrict__ topk_ids,
+    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
+    int32_t* __restrict__ total_tokens_post_pad,
+    int32_t* __restrict__ expert_map, int32_t num_experts,
+    int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size,
+    size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded,
+    int32_t topk_num, bool has_expert_map) {
+  _moe_align_block_size(
+      topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
+      num_experts, padded_num_experts, experts_per_warp, block_size, numel,
+      cumsum, max_num_tokens_padded, CEILDIV(max_num_tokens_padded, block_size),
+      0, 0, topk_num, nullptr, has_expert_map);
+}
+
+template <typename scalar_t>
+__global__ void count_and_sort_expert_tokens_kernel(
+    const scalar_t* __restrict__ topk_ids,
+    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
+    int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
+    int32_t max_num_tokens_padded, int32_t topk_num, bool has_expert_map) {
+  _count_and_sort_expert_tokens(
+      topk_ids, sorted_token_ids, cumsum_buffer, expert_map, numel, num_experts,
+      max_num_tokens_padded, nullptr, 0, topk_num, has_expert_map);
+}
+
 template <typename scalar_t, int TOPK>
 __global__ void moe_sum_kernel(
     scalar_t* __restrict__ out,  // [..., d]
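Before the final hunk, a compact pure-Python reference of what moe_align_block_size computes: tokens are bucketed by (mapped) expert, each bucket is padded to a multiple of block_size, padding slots hold numel, and one expert id is recorded per block. This sketch omits the token masks and the inactive-expert fill used for LoRA:

import math

def moe_align_block_size_ref(topk_ids, num_experts, block_size,
                             expert_map=None):
    numel = len(topk_ids)
    buckets = {e: [] for e in range(num_experts)}
    for i, e in enumerate(topk_ids):
        if expert_map is not None:
            e = expert_map[e]  # global -> local expert id
            if e == -1:        # not owned by this EP rank: drop the token
                continue
        buckets[e].append(i)
    sorted_token_ids, expert_ids = [], []
    for e in range(num_experts):
        toks = buckets[e]
        padded = math.ceil(len(toks) / block_size) * block_size
        sorted_token_ids += toks + [numel] * (padded - len(toks))
        expert_ids += [e] * (padded // block_size)
    return sorted_token_ids, expert_ids, len(sorted_token_ids)

ids, experts, total = moe_align_block_size_ref(
    [0, 2, 0, 1], num_experts=4, block_size=4)
assert experts == [0, 1, 2] and total == 12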
@ -193,78 +362,111 @@ __global__ void moe_sum_kernel(
  }
}

-template <typename scalar_t>
+template <typename scalar_t, int32_t fill_threads>
__global__ void moe_align_block_size_small_batch_expert_kernel(
    const scalar_t* __restrict__ topk_ids,
    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
-    int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
-    int32_t block_size, size_t numel, int32_t max_num_tokens_padded) {
-  // Initialize sorted_token_ids with numel
-  for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
-    sorted_token_ids[it] = numel;
+    int32_t* __restrict__ total_tokens_post_pad,
+    int32_t* __restrict__ expert_map, int32_t num_experts, int32_t block_size,
+    size_t numel, int32_t max_num_tokens_padded, int32_t topk_num,
+    bool has_expert_map) {
+  _moe_align_block_size_small_batch_expert<scalar_t, fill_threads>(
+      topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
+      num_experts, block_size, numel, max_num_tokens_padded,
+      CEILDIV(max_num_tokens_padded, block_size), 0, 0, topk_num, nullptr,
+      has_expert_map);
+}
+
+template <typename scalar_t>
+__global__ void moe_lora_align_block_size_kernel(
+    scalar_t* __restrict__ topk_ids, int32_t* __restrict__ token_lora_mapping,
+    int64_t block_size, int32_t* __restrict__ expert_map, int num_experts,
+    int max_loras, size_t numel, int max_num_tokens_padded,
+    int max_num_m_blocks, int32_t* __restrict__ sorted_token_ids,
+    int32_t* __restrict__ expert_ids, int32_t topk_num,
+    int32_t* total_tokens_post_pad, int32_t* adapter_enabled,
+    int32_t* __restrict__ cumsum, int32_t experts_per_warp,
+    int32_t padded_num_experts, int32_t* lora_ids,
+    int32_t* __restrict__ token_mask, bool has_expert_map) {
+  int lora_idx = blockIdx.x / 2;
+  int lora_id = lora_ids[lora_idx];
+  if (lora_id == -1 || adapter_enabled[lora_id] == 0) {
+    return;
  }

-  const size_t tid = threadIdx.x;
-  const size_t stride = blockDim.x;
-
-  extern __shared__ int32_t shared_mem[];
-  int32_t* cumsum = shared_mem;
-  int32_t* tokens_cnts = (int32_t*)(shared_mem + num_experts + 1);
-
-  for (int i = 0; i < num_experts; ++i) {
-    tokens_cnts[(threadIdx.x + 1) * num_experts + i] = 0;
-  }
-
-  for (size_t i = tid; i < numel; i += stride) {
-    ++tokens_cnts[(threadIdx.x + 1) * num_experts + topk_ids[i]];
-  }
-
-  __syncthreads();
-
-  if (threadIdx.x < num_experts) {
-    tokens_cnts[threadIdx.x] = 0;
-    for (int i = 1; i <= blockDim.x; ++i) {
-      tokens_cnts[i * num_experts + threadIdx.x] +=
-          tokens_cnts[(i - 1) * num_experts + threadIdx.x];
-    }
-  }
-
-  __syncthreads();
+  // Populate the token_mask based on the token-LoRA mapping
+  int num_tokens = numel / topk_num;

  if (threadIdx.x == 0) {
-    cumsum[0] = 0;
-    for (int i = 1; i <= num_experts; ++i) {
-      cumsum[i] =
-          cumsum[i - 1] +
-          CEILDIV(tokens_cnts[blockDim.x * num_experts + i - 1], block_size) *
-              block_size;
+    total_tokens_post_pad[lora_id] = 0;
+    for (int i = 0; i < num_tokens; i++) {
+      token_mask[(lora_id * num_tokens) + i] =
+          (int)token_lora_mapping[i] == lora_id;
    }
-    *total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]);
  }

  __syncthreads();

-  if (threadIdx.x < num_experts) {
-    for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
-         i += block_size) {
-      expert_ids[i / block_size] = threadIdx.x;
-    }
-  }
+  _moe_align_block_size(
+      topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
+      num_experts, padded_num_experts, experts_per_warp, block_size, numel,
+      cumsum, max_num_tokens_padded, max_num_m_blocks, lora_id, -1, topk_num,
+      &token_mask[(lora_id * num_tokens)], has_expert_map);
+}
+
+template <typename scalar_t>
+__global__ void lora_count_and_sort_expert_tokens_kernel(
+    const scalar_t* __restrict__ topk_ids,
+    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
+    int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts,
+    int32_t max_num_tokens_padded, int32_t topk_num, int32_t* token_mask,
+    int32_t* lora_ids, bool has_expert_map) {
+  int lora_idx = blockIdx.x;
+  int lora_id = lora_ids[lora_idx];
+  if (lora_id == -1) {
+    return;
+  }
+
+  int num_tokens = numel / topk_num;
+
+  _count_and_sort_expert_tokens(
+      topk_ids, sorted_token_ids, cumsum_buffer, expert_map, numel, num_experts,
+      max_num_tokens_padded, &token_mask[(lora_id * num_tokens)], lora_id,
+      topk_num, has_expert_map);
+}
+
+template <typename scalar_t, int32_t fill_threads>
+__global__ void moe_lora_align_block_size_small_batch_expert_kernel(
+    scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping,
+    int64_t block_size, int32_t* __restrict__ expert_map, int num_experts,
+    int max_loras, size_t numel, int max_num_tokens_padded,
+    int max_num_m_blocks, int32_t* __restrict__ sorted_token_ids,
+    int32_t* __restrict__ expert_ids, int topk_num,
+    int32_t* total_tokens_post_pad, int32_t* adapter_enabled, int32_t* lora_ids,
+    int32_t* token_mask, bool has_expert_map) {
+  int lora_idx = blockIdx.x;
+  int lora_id = lora_ids[lora_idx];
+  if (lora_id == -1 || adapter_enabled[lora_id] == 0) {
+    return;
+  }
+
+  int num_tokens = numel / topk_num;
+  if (threadIdx.x == 0) {
+    total_tokens_post_pad[lora_id] = 0;
+
+    for (int i = 0; i < num_tokens; i++) {
+      token_mask[(lora_id * num_tokens) + i] =
+          (int)token_lora_mapping[i] == lora_id;
+    }
+  }

-  // Fill remaining expert_ids with 0
-  const size_t fill_start_idx = cumsum[num_experts] / block_size + threadIdx.x;
-  const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size);
-  for (size_t i = fill_start_idx; i < expert_ids_size; i += blockDim.x) {
-    expert_ids[i] = 0;
-  }
-
-  for (size_t i = tid; i < numel; i += stride) {
-    int32_t expert_id = topk_ids[i];
-    int32_t rank_post_pad =
-        tokens_cnts[threadIdx.x * num_experts + expert_id] + cumsum[expert_id];
-    sorted_token_ids[rank_post_pad] = i;
-    ++tokens_cnts[threadIdx.x * num_experts + expert_id];
-  }
+  __syncthreads();
+
+  _moe_align_block_size_small_batch_expert<scalar_t, fill_threads>(
+      topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map,
+      num_experts, block_size, numel, max_num_tokens_padded, max_num_m_blocks,
+      -1, lora_id, topk_num, &token_mask[(lora_id * num_tokens)],
+      has_expert_map);
}

}  // namespace moe
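For reference, the per-LoRA token_mask written by the kernels above has simple semantics: row lora_id of the mask flags which tokens are routed through that adapter. A minimal host-side C++ restatement (an illustrative standalone helper, not part of this change):

    #include <cstdint>
    #include <vector>

    // Reference semantics of the token_mask buffer built on-device above:
    // mask[lora_id * num_tokens + i] is 1 iff token i maps to lora_id.
    std::vector<int32_t> build_token_mask(
        const std::vector<int32_t>& token_lora_mapping, int max_loras) {
      int num_tokens = static_cast<int>(token_lora_mapping.size());
      std::vector<int32_t> mask(
          static_cast<size_t>(max_loras) * num_tokens, 0);
      for (int lora_id = 0; lora_id < max_loras; ++lora_id) {
        for (int i = 0; i < num_tokens; ++i) {
          mask[lora_id * num_tokens + i] =
              (token_lora_mapping[i] == lora_id) ? 1 : 0;
        }
      }
      return mask;
    }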
@ -275,7 +477,8 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(
void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                          int64_t block_size, torch::Tensor sorted_token_ids,
                          torch::Tensor experts_ids,
-                          torch::Tensor num_tokens_post_pad) {
+                          torch::Tensor num_tokens_post_pad,
+                          std::optional<torch::Tensor> maybe_expert_map) {
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  int64_t padded_num_experts =
@ -287,14 +490,19 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
  // BlockScan uses 1024 threads and assigns one thread per expert.
  TORCH_CHECK(padded_num_experts < 1024,
              "padded_num_experts must be less than 1024");
+  auto options_int =
+      torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
+  bool has_expert_map = maybe_expert_map.has_value();
+  torch::Tensor expert_map;
+  if (has_expert_map) {
+    expert_map = maybe_expert_map.value();
+  } else {
+    expert_map = torch::empty({0}, options_int);
+  }

  VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
      topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
        // calc needed amount of shared mem for `cumsum` tensors
-        auto options_int =
-            torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
-        torch::Tensor cumsum_buffer =
-            torch::empty({num_experts + 1}, options_int);
        bool small_batch_expert_mode =
            (topk_ids.numel() < 1024) && (num_experts <= 64);
@ -304,43 +512,58 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
              ((threads + 1) * num_experts + (num_experts + 1)) *
              sizeof(int32_t);

+          // threadIdx.x >= fill_threads: counting experts and aligning
+          // threadIdx.x < fill_threads: filling sorted_token_ids
+          constexpr int32_t fill_threads = 256;
          auto small_batch_expert_kernel =
              vllm::moe::moe_align_block_size_small_batch_expert_kernel<
-                  scalar_t>;
-          small_batch_expert_kernel<<<1, threads, shared_mem_size, stream>>>(
+                  scalar_t, fill_threads>;
+          small_batch_expert_kernel<<<1, fill_threads + threads,
+                                      shared_mem_size, stream>>>(
              topk_ids.data_ptr<scalar_t>(),
              sorted_token_ids.data_ptr<int32_t>(),
              experts_ids.data_ptr<int32_t>(),
-              num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
-              topk_ids.numel(), sorted_token_ids.size(0));
+              num_tokens_post_pad.data_ptr<int32_t>(),
+              expert_map.data_ptr<int32_t>(), num_experts, block_size,
+              topk_ids.numel(), sorted_token_ids.size(0), topk_ids.size(1),
+              has_expert_map);
        } else {
+          torch::Tensor cumsum_buffer =
+              torch::empty({num_experts + 1}, options_int);
          auto align_kernel = vllm::moe::moe_align_block_size_kernel<scalar_t>;

          size_t num_warps = CEILDIV(padded_num_experts, experts_per_warp);
          size_t shared_mem_size =
              num_warps * experts_per_warp * sizeof(int32_t);

-          align_kernel<<<1, threads, shared_mem_size, stream>>>(
+          // launch two threadblocks
+          // blockIdx.x == 0: counting experts and aligning
+          // blockIdx.x == 1: filling sorted_token_ids
+          align_kernel<<<2, threads, shared_mem_size, stream>>>(
              topk_ids.data_ptr<scalar_t>(),
              sorted_token_ids.data_ptr<int32_t>(),
              experts_ids.data_ptr<int32_t>(),
-              num_tokens_post_pad.data_ptr<int32_t>(), num_experts,
-              padded_num_experts, experts_per_warp, block_size,
-              topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>(),
-              sorted_token_ids.size(0));
+              num_tokens_post_pad.data_ptr<int32_t>(),
+              expert_map.data_ptr<int32_t>(), num_experts, padded_num_experts,
+              experts_per_warp, block_size, topk_ids.numel(),
+              cumsum_buffer.data_ptr<int32_t>(), sorted_token_ids.size(0),
+              topk_ids.size(1), has_expert_map);

          const int block_threads = std::min(256, (int)threads);
          const int num_blocks =
              (topk_ids.numel() + block_threads - 1) / block_threads;
          const int max_blocks = 65535;
          const int actual_blocks = std::min(num_blocks, max_blocks);
+          dim3 gridDims(1, actual_blocks);

          auto sort_kernel =
              vllm::moe::count_and_sort_expert_tokens_kernel<scalar_t>;
-          sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
+          sort_kernel<<<gridDims, block_threads, 0, stream>>>(
              topk_ids.data_ptr<scalar_t>(),
              sorted_token_ids.data_ptr<int32_t>(),
-              cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel(), num_experts);
+              cumsum_buffer.data_ptr<int32_t>(), expert_map.data_ptr<int32_t>(),
+              topk_ids.numel(), num_experts, sorted_token_ids.size(0),
+              topk_ids.size(1), has_expert_map);
        }
      });
}
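A minimal host-side sketch of calling the updated entry point with its new trailing optional argument. The worst-case padding bound used for sorted_token_ids is an assumption based on the usual alignment math (each expert can waste up to block_size - 1 slots), not something stated in this diff:

    #include <torch/all.h>

    void align_example(torch::Tensor topk_ids, int64_t num_experts,
                       int64_t block_size,
                       std::optional<torch::Tensor> expert_map) {
      auto opts = torch::TensorOptions()
                      .dtype(torch::kInt)
                      .device(topk_ids.device());
      // assumed worst-case bound: every expert pads up to one extra block
      int64_t max_num_tokens_padded =
          topk_ids.numel() + num_experts * (block_size - 1);
      torch::Tensor sorted_token_ids =
          torch::empty({max_num_tokens_padded}, opts);
      torch::Tensor experts_ids = torch::empty(
          {(max_num_tokens_padded + block_size - 1) / block_size}, opts);
      torch::Tensor num_tokens_post_pad = torch::empty({1}, opts);
      // std::nullopt preserves the old behaviour; a populated expert_map
      // remaps expert ids and filters them via -1 entries during sorting.
      moe_align_block_size(topk_ids, num_experts, block_size, sorted_token_ids,
                           experts_ids, num_tokens_post_pad, expert_map);
    }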
@ -414,3 +637,123 @@ void moe_sum(torch::Tensor& input,  // [num_tokens, topk, hidden_size]
      break;
  }
}
+
+void moe_lora_align_block_size(
+    torch::Tensor topk_ids, torch::Tensor token_lora_mapping,
+    int64_t num_experts, int64_t block_size, int64_t max_loras,
+    int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
+    torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
+    torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
+    torch::Tensor lora_ids, std::optional<torch::Tensor> maybe_expert_map) {
+  const int topk_num = topk_ids.size(1);
+
+  TORCH_CHECK(block_size > 0, "block_size should be greater than 0. ");
+
+  int device_max_shared_mem;
+  auto dev = topk_ids.get_device();
+  cudaDeviceGetAttribute(&device_max_shared_mem,
+                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  int64_t padded_num_experts =
+      ((num_experts + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
+
+  // BlockScan uses 1024 threads and assigns one thread per expert.
+  TORCH_CHECK(padded_num_experts < 1024,
+              "padded_num_experts must be less than 1024");
+
+  auto options_int =
+      torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
+  torch::Tensor token_mask =
+      torch::empty({max_loras * topk_ids.size(0)}, options_int);
+  bool has_expert_map = maybe_expert_map.has_value();
+  torch::Tensor expert_map;
+  if (has_expert_map) {
+    expert_map = maybe_expert_map.value();
+  } else {
+    expert_map = torch::empty({0}, options_int);
+  }
+
+  VLLM_DISPATCH_INTEGRAL_TYPES(
+      topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] {
+        bool small_batch_expert_mode =
+            (topk_ids.numel() < 1024) && (num_experts <= 64);
+
+        if (small_batch_expert_mode) {
+          const int32_t num_thread = max((int32_t)num_experts, 128);
+          const int32_t shared_mem =
+              (num_thread + 1) * num_experts * sizeof(int32_t) +
+              (num_experts + 1) * sizeof(int32_t);
+          if (shared_mem > device_max_shared_mem) {
+            TORCH_CHECK(false, "Shared memory usage exceeds device limit.");
+          }
+
+          // threadIdx.x >= fill_threads: counting experts and aligning
+          // threadIdx.x < fill_threads: filling sorted_token_ids
+          constexpr int32_t fill_threads = 256;
+
+          dim3 blockDim(num_thread + fill_threads);
+          auto kernel =
+              vllm::moe::moe_lora_align_block_size_small_batch_expert_kernel<
+                  scalar_t, fill_threads>;
+          AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
+              (void*)kernel, shared_mem));
+          kernel<<<max_loras, blockDim, shared_mem, stream>>>(
+              topk_ids.data_ptr<scalar_t>(),
+              token_lora_mapping.data_ptr<int32_t>(), block_size,
+              expert_map.data_ptr<int32_t>(), num_experts, max_loras,
+              topk_ids.numel(), max_num_tokens_padded, max_num_m_blocks,
+              sorted_token_ids.data_ptr<int32_t>(),
+              expert_ids.data_ptr<int32_t>(), topk_num,
+              num_tokens_post_pad.data_ptr<int32_t>(),
+              adapter_enabled.data_ptr<int32_t>(), lora_ids.data_ptr<int32_t>(),
+              token_mask.data_ptr<int32_t>(), has_expert_map);
+        } else {
+          int num_thread = 1024;
+          dim3 blockDim(num_thread);
+          size_t num_warps = CEILDIV(padded_num_experts, WARP_SIZE);
+
+          size_t shared_mem_size = num_warps * WARP_SIZE * sizeof(int32_t);
+
+          // cumsum buffer
+          torch::Tensor cumsum =
+              torch::zeros({max_loras * (num_experts + 1)}, options_int);
+
+          auto align_kernel =
+              vllm::moe::moe_lora_align_block_size_kernel<scalar_t>;
+
+          // launch two threadblocks for each lora
+          // blockIdx.x % 2 == 0: counting experts and aligning
+          // blockIdx.x % 2 == 1: filling sorted_token_ids
+          align_kernel<<<max_loras * 2, blockDim, shared_mem_size, stream>>>(
+              topk_ids.data_ptr<scalar_t>(),
+              token_lora_mapping.data_ptr<int32_t>(), block_size,
+              expert_map.data_ptr<int32_t>(), num_experts, max_loras,
+              topk_ids.numel(), max_num_tokens_padded, max_num_m_blocks,
+              sorted_token_ids.data_ptr<int32_t>(),
+              expert_ids.data_ptr<int32_t>(), topk_num,
+              num_tokens_post_pad.data_ptr<int32_t>(),
+              adapter_enabled.data_ptr<int32_t>(), cumsum.data_ptr<int32_t>(),
+              WARP_SIZE, padded_num_experts, lora_ids.data_ptr<int32_t>(),
+              token_mask.data_ptr<int32_t>(), has_expert_map);
+
+          const int block_threads = std::min(256, (int)num_thread);
+          const int num_blocks =
+              (topk_ids.numel() + block_threads - 1) / block_threads;
+
+          const int max_blocks = 65535;
+          const int actual_blocks = std::min(num_blocks, max_blocks);
+
+          dim3 gridDims(max_loras, actual_blocks);
+          auto sort_kernel =
+              vllm::moe::lora_count_and_sort_expert_tokens_kernel<scalar_t>;
+
+          sort_kernel<<<gridDims, block_threads, 0, stream>>>(
+              topk_ids.data_ptr<scalar_t>(),
+              sorted_token_ids.data_ptr<int32_t>(), cumsum.data_ptr<int32_t>(),
+              expert_map.data_ptr<int32_t>(), topk_ids.numel(), num_experts,
+              max_num_tokens_padded, topk_num, token_mask.data_ptr<int32_t>(),
+              lora_ids.data_ptr<int32_t>(), has_expert_map);
+        }
+      });
+}
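The non-small-batch path above launches max_loras * 2 blocks and splits the work by block parity. A purely illustrative CUDA restatement of that grid convention (these helpers are not part of the change):

    // Each consecutive pair of blocks serves one LoRA slot: the even block
    // counts experts and aligns, the odd block fills sorted_token_ids.
    __device__ __forceinline__ int lora_slot() { return blockIdx.x / 2; }
    __device__ __forceinline__ bool is_fill_block() {
      return (blockIdx.x % 2) == 1;
    }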
@ -1,174 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>

#include <ATen/ATen.h>
#include <ATen/cuda/Atomic.cuh>

#include "../cuda_compat.h"
#include "../dispatch_utils.h"
#include "core/math.hpp"

namespace {

__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
                                         int32_t col) {
  return row * total_col + col;
}

}  // namespace

// TODO: Refactor common parts with moe_align_sum_kernels
template <typename scalar_t, typename token_cnts_t>
__global__ void moe_lora_align_sum_kernel(
    scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping,
    int64_t block_size, int num_experts, int max_loras, size_t numel,
    int max_num_tokens_padded, int max_num_m_blocks,
    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
    int topk_num, int32_t* total_tokens_post_pad, int32_t* adapter_enabled,
    int32_t* lora_ids) {
  const size_t tokens_per_thread = div_ceil(numel, blockDim.x);
  const size_t start_idx = threadIdx.x * tokens_per_thread;

  int lora_idx = blockIdx.x;
  int lora_id = lora_ids[lora_idx];
  if (lora_id == -1 || adapter_enabled[lora_id] == 0) {
    return;
  }
  extern __shared__ int32_t shared_mem[];
  int32_t* cumsum = shared_mem;
  token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + num_experts + 1);

  // Initialize sorted_token_ids with numel
  for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
    sorted_token_ids[lora_id * max_num_tokens_padded + it] = numel;
  }

  // Initialize expert_ids with -1
  for (size_t it = threadIdx.x; it < max_num_m_blocks; it += blockDim.x) {
    expert_ids[lora_id * max_num_m_blocks + it] = -1;
  }

  // Initialize total_tokens_post_pad with 0
  if (threadIdx.x == 0) {
    total_tokens_post_pad[lora_id] = 0;
  }

  for (int i = 0; i < num_experts; ++i) {
    tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
  }

  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
    int mask = token_lora_mapping[i / topk_num] == lora_id;
    int idx = index(num_experts, threadIdx.x + 1, topk_ids[i]);
    tokens_cnts[idx] += mask;
  }

  __syncthreads();

  // For each expert we accumulate the token counts from the different threads.
  if (threadIdx.x < num_experts) {
    tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
    for (int i = 1; i <= blockDim.x; ++i) {
      tokens_cnts[index(num_experts, i, threadIdx.x)] +=
          tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
    }
  }

  __syncthreads();

  // We accumulate the token counts of all experts in thread 0.
  if (threadIdx.x == 0) {
    cumsum[0] = 0;
    for (int i = 1; i <= num_experts; ++i) {
      cumsum[i] = cumsum[i - 1] +
                  div_ceil(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
                           block_size) *
                      block_size;
    }
    total_tokens_post_pad[lora_id] = static_cast<int32_t>(cumsum[num_experts]);
  }

  __syncthreads();

  /**
   * For each expert, each thread processes the tokens of the corresponding
   * blocks and stores the corresponding expert_id for each block.
   */
  if (threadIdx.x < num_experts) {
    for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
         i += block_size) {
      expert_ids[index(max_num_m_blocks, lora_id, i / block_size)] =
          threadIdx.x;
    }
  }

  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
    int32_t expert_id = topk_ids[i];
    /** The cumsum[expert_id] stores the starting index of the tokens that the
     * expert with expert_id needs to process, and
     * tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
     * processed by the expert with expert_id within the current thread's token
     * shard.
     */
    int32_t rank_post_pad =
        tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
        cumsum[expert_id];

    int mask = (int)token_lora_mapping[i / topk_num] == lora_id;
    atomicAdd(
        &sorted_token_ids[index(max_num_tokens_padded, lora_id, rank_post_pad)],
        (i - numel) * mask);
    tokens_cnts[index(num_experts, threadIdx.x, expert_id)] += mask;
  }
}

void moe_lora_align_block_size(
    torch::Tensor topk_ids, torch::Tensor token_lora_mapping,
    int64_t num_experts, int64_t block_size, int64_t max_loras,
    int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
    torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
    torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
    torch::Tensor lora_ids) {
  const int topk_num = topk_ids.size(1);

  TORCH_CHECK(block_size > 0, "block_size should be greater than 0. ");

  int device_max_shared_mem;
  auto dev = topk_ids.get_device();
  cudaDeviceGetAttribute(&device_max_shared_mem,
                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  const int32_t num_thread = max((int32_t)num_experts, 128);  // WARP_SIZE,
  TORCH_CHECK(num_thread <= 1024,
              "num_thread must be less than 1024, "
              "and fallback is not implemented yet.");
  const int32_t shared_mem = (num_thread + 1) * num_experts * sizeof(int32_t) +
                             (num_experts + 1) * sizeof(int32_t);

  if (shared_mem > device_max_shared_mem) {
    TORCH_CHECK(false,
                "Shared memory usage exceeds device limit, and global memory "
                "fallback is not implemented yet.");
  }

  VLLM_DISPATCH_INTEGRAL_TYPES(
      topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] {
        dim3 blockDim(num_thread);
        auto kernel = moe_lora_align_sum_kernel<scalar_t, int32_t>;
        AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
            (void*)kernel, shared_mem));
        kernel<<<max_loras, blockDim, shared_mem, stream>>>(
            topk_ids.data_ptr<scalar_t>(),
            token_lora_mapping.data_ptr<int32_t>(), block_size, num_experts,
            max_loras, topk_ids.numel(), max_num_tokens_padded,
            max_num_m_blocks, sorted_token_ids.data_ptr<int32_t>(),
            expert_ids.data_ptr<int32_t>(), topk_num,
            num_tokens_post_pad.data_ptr<int32_t>(),
            adapter_enabled.data_ptr<int32_t>(), lora_ids.data_ptr<int32_t>());
      });
}
@ -11,7 +11,8 @@ void moe_sum(torch::Tensor& input, torch::Tensor& output);
void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                          int64_t block_size, torch::Tensor sorted_token_ids,
                          torch::Tensor experts_ids,
-                          torch::Tensor num_tokens_post_pad);
+                          torch::Tensor num_tokens_post_pad,
+                          std::optional<torch::Tensor> maybe_expert_map);

void batched_moe_align_block_size(int64_t max_tokens_per_batch,
                                  int64_t block_size,
@ -26,7 +27,7 @@ void moe_lora_align_block_size(
    int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
    torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
    torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
-    torch::Tensor lora_ids);
+    torch::Tensor lora_ids, std::optional<torch::Tensor> maybe_expert_map);
#ifndef USE_ROCM
torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                             torch::Tensor b_qweight, torch::Tensor b_scales,
@ -19,7 +19,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
      "moe_align_block_size(Tensor topk_ids, int num_experts,"
      " int block_size, Tensor! sorted_token_ids,"
      " Tensor! experts_ids,"
-      " Tensor! num_tokens_post_pad) -> ()");
+      " Tensor! num_tokens_post_pad,"
+      " Tensor? maybe_expert_map) -> ()");
  m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);

  // Aligning the number of tokens to be processed by each expert such
@ -46,7 +47,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
      " Tensor !experts_ids,"
      " Tensor !num_tokens_post_pad,"
      " Tensor !adapter_enabled,"
-      " Tensor !lora_ids) -> () ");
+      " Tensor !lora_ids,"
+      " Tensor? maybe_expert_map) -> () ");
  m.impl("moe_lora_align_block_size", torch::kCUDA, &moe_lora_align_block_size);

#ifndef USE_ROCM
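In the schema language above, "Tensor?" maps to std::optional<torch::Tensor> on the C++ side, which is why existing callers can keep passing None. A minimal, self-contained sketch of that registration pattern with a hypothetical op (names here are illustrative, not vLLM's):

    #include <torch/library.h>

    // Hypothetical op taking an optional trailing tensor.
    void my_op(torch::Tensor x, std::optional<torch::Tensor> maybe_map) {
      if (maybe_map.has_value()) {
        // use *maybe_map
      }
    }

    TORCH_LIBRARY(example_ns, m) {
      m.def("my_op(Tensor x, Tensor? maybe_map) -> ()");
      m.impl("my_op", torch::kCPU, &my_op);
    }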
31
csrc/ops.h
@ -102,13 +102,16 @@ void apply_repetition_penalties_(torch::Tensor& logits,
                                 const torch::Tensor& output_mask,
                                 const torch::Tensor& repetition_penalties);

-void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
-                   const torch::Tensor& rowEnds, torch::Tensor& indices,
-                   int64_t numRows, int64_t stride0, int64_t stride1);
+void top_k_per_row_prefill(const torch::Tensor& logits,
+                           const torch::Tensor& rowStarts,
+                           const torch::Tensor& rowEnds, torch::Tensor& indices,
+                           int64_t numRows, int64_t stride0, int64_t stride1,
+                           int64_t topK);

void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
-                          const torch::Tensor& seq_lens, torch::Tensor& indices,
-                          int64_t numRows, int64_t stride0, int64_t stride1);
+                          const torch::Tensor& seqLens, torch::Tensor& indices,
+                          int64_t numRows, int64_t stride0, int64_t stride1,
+                          int64_t topK);

void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
                               torch::Tensor& weight, torch::Tensor& scale,
@ -128,6 +131,13 @@ void rms_norm_dynamic_per_token_quant(torch::Tensor& out,
                                      std::optional<torch::Tensor> scale_ub,
                                      std::optional<torch::Tensor> residual);

+void rms_norm_per_block_quant(torch::Tensor& out, torch::Tensor const& input,
+                              torch::Tensor const& weight,
+                              torch::Tensor& scales, double const epsilon,
+                              std::optional<torch::Tensor> scale_ub,
+                              std::optional<torch::Tensor> residual,
+                              int64_t group_size, bool is_scale_transposed);
+
void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
                      std::optional<torch::Tensor> key, int64_t head_size,
                      torch::Tensor& cos_sin_cache, bool is_neox);
@ -252,7 +262,8 @@ void get_cutlass_moe_mm_data(
void get_cutlass_moe_mm_problem_sizes(
    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
-    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets);
+    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
+    std::optional<bool> force_swap_ab = std::nullopt);

void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
                                  torch::Tensor& problem_sizes1,
@ -299,6 +310,14 @@ void per_token_group_quant_int8(const torch::Tensor& input,
                                torch::Tensor& output_q,
                                torch::Tensor& output_s, int64_t group_size,
                                double eps, double int8_min, double int8_max);

+// Fused activation quantisation + DeepGEMM-compatible UE8M0-packed scales.
+void per_token_group_quant_8bit_packed(const torch::Tensor& input,
+                                       torch::Tensor& output_q,
+                                       torch::Tensor& output_s_packed,
+                                       int64_t group_size, double eps,
+                                       double min_8bit, double max_8bit);
+
#endif

void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
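For the group-quantised entry points declared above, the per-group scale tensors are sized by how many group_size slices fit along the quantised dimension. A tiny sketch of that shape arithmetic (an assumption inferred from the group_size parameters, not stated in this diff):

    #include <cstdint>

    // One scale per contiguous group_size slice of the hidden dimension.
    int64_t num_groups(int64_t hidden_size, int64_t group_size) {
      return (hidden_size + group_size - 1) / group_size;  // ceil division
    }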
104
csrc/quantization/cutlass_w4a8/get_group_starts.cuh
Normal file
@ -0,0 +1,104 @@
// see csrc/quantization/w8a8/cutlass/moe/get_group_starts.cuh
#pragma once

#include <cuda.h>
#include <torch/all.h>
#include <c10/cuda/CUDAStream.h>

#include "core/scalar_type.hpp"
#include "cutlass/bfloat16.h"
#include "cutlass/float8.h"

// ElementB is int32 (packed int4)
// ElementGroupScale is cutlass::Array<cutlass::float_e4m3_t, 8> (packed fp8)
template <typename ElementA, typename ElementB, typename ElementC,
          typename ElementAccumulator, typename ElementGroupScale>
__global__ void get_group_gemm_starts(
    int64_t* expert_offsets, ElementA** a_offsets, ElementB** b_offsets,
    ElementC** out_offsets, ElementAccumulator** a_scales_offsets,
    ElementAccumulator** b_scales_offsets,
    ElementGroupScale** b_group_scales_offsets, ElementA* a_base_as_int,
    ElementB* b_base_as_int, ElementC* out_base_as_int,
    ElementAccumulator* a_scales_base_as_int,
    ElementAccumulator* b_scales_base_as_int,
    ElementGroupScale* b_group_scales_base_as_int, int64_t n, int64_t k,
    int64_t scale_k) {
  int expert_id = threadIdx.x;

  int64_t expert_offset = expert_offsets[expert_id];

  // same as w8a8
  a_offsets[expert_id] = a_base_as_int + expert_offset * k;
  out_offsets[expert_id] = out_base_as_int + expert_offset * n;
  a_scales_offsets[expert_id] = a_scales_base_as_int + expert_offset;
  b_scales_offsets[expert_id] = b_scales_base_as_int + (n * expert_id);

  // w4a8 specific
  constexpr int pack_factor = 8;  // pack 8 int4 into int32
  b_offsets[expert_id] = b_base_as_int + (expert_id * k * n / pack_factor);
  b_group_scales_offsets[expert_id] =
      b_group_scales_base_as_int + (expert_id * scale_k * n);
}

#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE)                  \
  else if (out_tensors.dtype() == TENSOR_C_TYPE) {                       \
    get_group_gemm_starts<cutlass::float_e4m3_t, int32_t, C_TYPE, float, \
                          cutlass::Array<cutlass::float_e4m3_t, 8>>      \
        <<<1, num_experts, 0, stream>>>(                                 \
            static_cast<int64_t*>(expert_offsets.data_ptr()),            \
            static_cast<cutlass::float_e4m3_t**>(a_ptrs.data_ptr()),     \
            static_cast<int32_t**>(b_ptrs.data_ptr()),                   \
            static_cast<C_TYPE**>(out_ptrs.data_ptr()),                  \
            static_cast<float**>(a_scales_ptrs.data_ptr()),              \
            static_cast<float**>(b_scales_ptrs.data_ptr()),              \
            static_cast<cutlass::Array<cutlass::float_e4m3_t, 8>**>(     \
                b_group_scales_ptrs.data_ptr()),                         \
            static_cast<cutlass::float_e4m3_t*>(a_tensors.data_ptr()),   \
            static_cast<int32_t*>(b_tensors.data_ptr()),                 \
            static_cast<C_TYPE*>(out_tensors.data_ptr()),                \
            static_cast<float*>(a_scales.data_ptr()),                    \
            static_cast<float*>(b_scales.data_ptr()),                    \
            static_cast<cutlass::Array<cutlass::float_e4m3_t, 8>*>(      \
                b_group_scales.data_ptr()),                              \
            n, k, scale_k);                                              \
  }

namespace {

void run_get_group_gemm_starts(
    torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs,
    torch::Tensor& b_ptrs, torch::Tensor& out_ptrs,
    torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs,
    torch::Tensor& b_group_scales_ptrs, torch::Tensor const& a_tensors,
    torch::Tensor const& b_tensors, torch::Tensor& out_tensors,
    torch::Tensor const& a_scales, torch::Tensor const& b_scales,
    torch::Tensor const& b_group_scales, const int64_t b_group_size) {
  TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn);
  TORCH_CHECK(b_tensors.dtype() == torch::kInt32);  // int4 8x packed into int32
  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
  TORCH_CHECK(b_group_scales.dtype() ==
              torch::kFloat8_e4m3fn);  // the underlying torch type is e4m3
  TORCH_CHECK(out_tensors.dtype() ==
              torch::kBFloat16);  // only support bf16 for now
  // expect int64_t to avoid overflow during offset calculations
  TORCH_CHECK(expert_offsets.dtype() == torch::kInt64);

  int num_experts = static_cast<int>(expert_offsets.size(0));
  // logical k, n
  int64_t n = out_tensors.size(1);
  int64_t k = a_tensors.size(1);
  int64_t scale_k = cutlass::ceil_div(k, b_group_size);

  auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());

  if (false) {
  }
  __CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t)
  __CALL_GET_STARTS_KERNEL(torch::kFloat16, half)
  else {
    TORCH_CHECK(false, "Invalid output type (must be float16 or bfloat16)");
  }
}

}  // namespace
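The kernel above computes one pointer per expert from dense base tensors. A host-side restatement of the same offset arithmetic, useful for sanity-checking buffer layouts (illustrative helper only; the struct and function are not part of the change):

    #include <cstdint>

    struct ExpertOffsets {
      int64_t a_off, out_off, b_off, b_group_scale_off;
    };

    // Mirrors get_group_gemm_starts: A and the output advance by the running
    // token offset; B and its group scales are densely packed per expert.
    ExpertOffsets expert_offsets_for(int expert_id, int64_t expert_offset,
                                     int64_t n, int64_t k, int64_t scale_k) {
      constexpr int pack_factor = 8;  // 8 int4 values per int32 word
      return ExpertOffsets{
          expert_offset * k,                // A rows start at the token offset
          expert_offset * n,                // output rows likewise
          expert_id * k * n / pack_factor,  // packed B weights per expert
          expert_id * scale_k * n,          // one scale per group per column
      };
    }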
483
csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu
Normal file
@ -0,0 +1,483 @@
|
|||||||
|
#include <vector>
|
||||||
|
#include <tuple>
|
||||||
|
|
||||||
|
#include "cutlass/cutlass.h"
|
||||||
|
|
||||||
|
#include "cute/tensor.hpp"
|
||||||
|
#include "cutlass/gemm/dispatch_policy.hpp"
|
||||||
|
#include "cutlass/gemm/group_array_problem_shape.hpp"
|
||||||
|
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||||
|
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||||
|
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||||
|
|
||||||
|
#include "cutlass/util/packed_stride.hpp"
|
||||||
|
#include "cutlass/util/mixed_dtype_utils.hpp"
|
||||||
|
|
||||||
|
// vllm includes
|
||||||
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
|
#include <torch/all.h>
|
||||||
|
#include "cutlass_extensions/torch_utils.hpp"
|
||||||
|
#include "cutlass_extensions/common.hpp"
|
||||||
|
|
||||||
|
#include "core/registration.h"
|
||||||
|
#include "get_group_starts.cuh"
|
||||||
|
#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
|
||||||
|
#include "w4a8_utils.cuh"
|
||||||
|
|
||||||
|
namespace vllm::cutlass_w4a8_moe {
|
||||||
|
|
||||||
|
using namespace cute;
|
||||||
|
|
||||||
|
// -------------------------------------------------------------------------------------
|
||||||
|
// Static configuration shared across all instantiations
|
||||||
|
// -------------------------------------------------------------------------------------
|
||||||
|
using ProblemShape =
|
||||||
|
cutlass::gemm::GroupProblemShape<Shape<int, int, int>>; // <M,N,K> per
|
||||||
|
// group
|
||||||
|
using MmaType = cutlass::float_e4m3_t;
|
||||||
|
using QuantType = cutlass::int4b_t;
|
||||||
|
|
||||||
|
constexpr int TileShapeK = 128 * 8 / sizeof_bits<MmaType>::value;
|
||||||
|
static int constexpr PackFactor = 8; // 8 int4 packed into int32
|
||||||
|
|
||||||
|
// A matrix configuration
|
||||||
|
using ElementA = MmaType;
|
||||||
|
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
|
||||||
|
constexpr int AlignmentA =
|
||||||
|
128 /
|
||||||
|
cutlass::sizeof_bits<ElementA>::value; // Alignment of A matrix in units of
|
||||||
|
// elements (up to 16 bytes)
|
||||||
|
|
||||||
|
// B matrix configuration
|
||||||
|
using ElementB = QuantType; // Element type for B matrix operand
|
||||||
|
using LayoutB =
|
||||||
|
cutlass::layout::ColumnMajor; // Layout type for B matrix operand
|
||||||
|
constexpr int AlignmentB =
|
||||||
|
128 / cutlass::sizeof_bits<
|
||||||
|
ElementB>::value; // Memory access granularity/alignment of B
|
||||||
|
// matrix in units of elements (up to 16 bytes)
|
||||||
|
|
||||||
|
// This example manually swaps and transposes, so keep transpose of input
|
||||||
|
// layouts
|
||||||
|
using LayoutA_Transpose =
|
||||||
|
typename cutlass::layout::LayoutTranspose<LayoutA>::type;
|
||||||
|
using LayoutB_Transpose =
|
||||||
|
typename cutlass::layout::LayoutTranspose<LayoutB>::type;
|
||||||
|
|
||||||
|
// Need to pass a pointer type to make the 3rd dimension of Stride be _0
|
||||||
|
using StrideA =
|
||||||
|
cute::remove_pointer_t<cutlass::detail::TagToStrideA_t<LayoutA*>>;
|
||||||
|
using StrideB =
|
||||||
|
cute::remove_pointer_t<cutlass::detail::TagToStrideB_t<LayoutB*>>;
|
||||||
|
|
||||||
|
// Define the CuTe layout for reoredered quantized tensor B
|
||||||
|
// LayoutAtomQuant places values that will be read by the same thread in
|
||||||
|
// contiguous locations in global memory. It specifies the reordering within a
|
||||||
|
// single warp's fragment
|
||||||
|
using LayoutAtomQuant =
|
||||||
|
decltype(cutlass::compute_memory_reordering_atom<MmaType>());
|
||||||
|
using LayoutB_Reordered = decltype(cute::tile_to_shape(
|
||||||
|
LayoutAtomQuant{}, Layout<Shape<int, int, Int<1>>, StrideB>{}));
|
||||||
|
|
||||||
|
using ElementScale = cutlass::float_e4m3_t;
|
||||||
|
using LayoutScale = cutlass::layout::RowMajor;
|
||||||
|
|
||||||
|
// C/D matrix configuration
|
||||||
|
using ElementC =
|
||||||
|
cutlass::bfloat16_t; // Element type for C and D matrix operands
|
||||||
|
using LayoutC =
|
||||||
|
cutlass::layout::RowMajor; // Layout type for C and D matrix operands
|
||||||
|
constexpr int AlignmentC =
|
||||||
|
128 / cutlass::sizeof_bits<
|
||||||
|
ElementC>::value; // Memory access granularity/alignment of C
|
||||||
|
// matrix in units of elements (up to 16 bytes)
|
||||||
|
|
||||||
|
// D matrix configuration
|
||||||
|
using ElementD = ElementC;
|
||||||
|
using LayoutD = LayoutC;
|
||||||
|
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
|
||||||
|
|
||||||
|
// Core kernel configurations
|
||||||
|
using ElementAccumulator = float; // Element type for internal accumulation
|
||||||
|
using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that
|
||||||
|
// supports the intended feature
|
||||||
|
using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
|
||||||
|
using StageCountType =
|
||||||
|
cutlass::gemm::collective::StageCountAuto; // Stage count maximized based
|
||||||
|
// on the tile size
|
||||||
|
|
||||||
|
// per-channel and per-token scales for epilogue
|
||||||
|
using ElementSChannel = float;
|
||||||
|
|
||||||
|
template <class TileShape_MN, class ClusterShape_MNK, class KernelSchedule,
|
||||||
|
class EpilogueSchedule>
|
||||||
|
struct W4A8GroupedGemmKernel {
|
||||||
|
using TileShape =
|
||||||
|
decltype(cute::append(TileShape_MN{}, cute::Int<TileShapeK>{}));
|
||||||
|
using ClusterShape = ClusterShape_MNK;
|
||||||
|
|
||||||
|
// per-channel, per-token scales epilogue
|
||||||
|
using ChTokScalesEpilogue =
|
||||||
|
typename vllm::c3x::ScaledEpilogueArray<ElementAccumulator, ElementD,
|
||||||
|
TileShape>;
|
||||||
|
using EVTCompute = typename ChTokScalesEpilogue::EVTCompute;
|
||||||
|
using CollectiveEpilogue =
|
||||||
|
typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||||
|
ArchTag, OperatorClass, TileShape, ClusterShape,
|
||||||
|
cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
|
||||||
|
ElementSChannel, ElementC,
|
||||||
|
typename cutlass::layout::LayoutTranspose<LayoutC>::type*, AlignmentC,
|
||||||
|
ElementD, typename cutlass::layout::LayoutTranspose<LayoutD>::type*,
|
||||||
|
AlignmentD, EpilogueSchedule, EVTCompute>::CollectiveOp;
|
||||||
|
|
||||||
|
// =========================================================== MIXED INPUT
|
||||||
|
// WITH SCALES
|
||||||
|
// ===========================================================================
|
||||||
|
// The Scale information must get paired with the operand that will be scaled.
|
||||||
|
// In this example, B is scaled so we make a tuple of B's information and the
|
||||||
|
// scale information.
|
||||||
|
using CollectiveMainloopShuffled =
|
||||||
|
typename cutlass::gemm::collective::CollectiveBuilder<
|
||||||
|
ArchTag, OperatorClass,
|
||||||
|
cute::tuple<ElementB, cutlass::Array<ElementScale, 8>>,
|
||||||
|
LayoutB_Reordered*, AlignmentB, ElementA, LayoutA_Transpose*,
|
||||||
|
AlignmentA, ElementAccumulator, TileShape, ClusterShape,
|
||||||
|
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
|
||||||
|
sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||||
|
KernelSchedule>::CollectiveOp;
|
||||||
|
|
||||||
|
using GemmKernelShuffled = cutlass::gemm::kernel::GemmUniversal<
|
||||||
|
ProblemShape, CollectiveMainloopShuffled, CollectiveEpilogue>;
|
||||||
|
|
||||||
|
using GemmShuffled =
|
||||||
|
cutlass::gemm::device::GemmUniversalAdapter<GemmKernelShuffled>;
|
||||||
|
|
||||||
|
using StrideC = typename GemmKernelShuffled::InternalStrideC;
|
||||||
|
using StrideD = typename GemmKernelShuffled::InternalStrideD;
|
||||||
|
|
||||||
|
using StrideC_ref = cutlass::detail::TagToStrideC_t<LayoutC>;
|
||||||
|
using StrideD_ref = cutlass::detail::TagToStrideC_t<LayoutD>;
|
||||||
|
using StrideS = typename CollectiveMainloopShuffled::StrideScale;
|
||||||
|
using StrideS_ref = cutlass::detail::TagToStrideB_t<LayoutScale>;
|
||||||
|
|
||||||
|
// static asserts for passing in strides/layouts
|
||||||
|
// pack to 2x int64
|
||||||
|
static_assert(sizeof(StrideS) == 2 * sizeof(int64_t));
|
||||||
|
// pack to 3xint32,
|
||||||
|
static_assert(sizeof(LayoutB_Reordered) % sizeof(int32_t) == 0,
|
||||||
|
"LayoutB_Reordered size must be divisible by 4 bytes");
|
||||||
|
|
||||||
|
static void grouped_mm(
|
||||||
|
torch::Tensor& out_tensors, const torch::Tensor& a_tensors,
|
||||||
|
const torch::Tensor& b_tensors, const torch::Tensor& a_scales,
|
||||||
|
const torch::Tensor& b_scales, const torch::Tensor& b_group_scales,
|
||||||
|
const int64_t b_group_size, const torch::Tensor& expert_offsets,
|
||||||
|
const torch::Tensor& problem_sizes_torch, const torch::Tensor& a_strides,
|
||||||
|
const torch::Tensor& b_strides, const torch::Tensor& c_strides,
|
||||||
|
const torch::Tensor& group_scale_strides) {
|
||||||
|
auto device = a_tensors.device();
|
||||||
|
auto device_id = device.index();
|
||||||
|
const at::cuda::OptionalCUDAGuard device_guard(device);
|
||||||
|
auto stream = at::cuda::getCurrentCUDAStream(device_id);
|
||||||
|
|
||||||
|
int num_experts = static_cast<int>(expert_offsets.size(0));
|
||||||
|
int n = static_cast<int>(b_tensors.size(1));
|
||||||
|
int k = static_cast<int>(b_tensors.size(2)) * PackFactor;
|
||||||
|
|
||||||
|
auto options_int =
|
||||||
|
torch::TensorOptions().dtype(torch::kInt64).device(device);
|
||||||
|
torch::Tensor a_ptrs = torch::empty(num_experts, options_int);
|
||||||
|
torch::Tensor b_ptrs = torch::empty(num_experts, options_int);
|
||||||
|
torch::Tensor out_ptrs = torch::empty(num_experts, options_int);
|
||||||
|
torch::Tensor a_scales_ptrs = torch::empty(num_experts, options_int);
|
||||||
|
torch::Tensor b_scales_ptrs = torch::empty(num_experts, options_int);
|
||||||
|
torch::Tensor b_group_scales_ptrs = torch::empty(num_experts, options_int);
|
||||||
|
|
||||||
|
// get the correct offsets to pass to gemm
|
||||||
|
run_get_group_gemm_starts(expert_offsets, a_ptrs, b_ptrs, out_ptrs,
|
||||||
|
a_scales_ptrs, b_scales_ptrs, b_group_scales_ptrs,
|
||||||
|
a_tensors, b_tensors, out_tensors, a_scales,
|
||||||
|
b_scales, b_group_scales, b_group_size);
|
||||||
|
|
||||||
|
// construct args
|
||||||
|
using Args = typename GemmShuffled::Arguments;
|
||||||
|
using MainloopArguments = typename GemmKernelShuffled::MainloopArguments;
|
||||||
|
using EpilogueArguments = typename GemmKernelShuffled::EpilogueArguments;
|
||||||
|
Args arguments;
|
||||||
|
|
||||||
|
ProblemShape::UnderlyingProblemShape* problem_sizes_as_shapes =
|
||||||
|
static_cast<ProblemShape::UnderlyingProblemShape*>(
|
||||||
|
problem_sizes_torch.data_ptr());
|
||||||
|
ProblemShape prob_shape{num_experts, problem_sizes_as_shapes, nullptr};
|
||||||
|
|
||||||
|
// SwapAB so B operands come first
|
||||||
|
MainloopArguments mainloop_arguments{
|
||||||
|
static_cast<const QuantType**>(b_ptrs.data_ptr()),
|
||||||
|
static_cast<LayoutB_Reordered*>(b_strides.data_ptr()),
|
||||||
|
static_cast<const MmaType**>(a_ptrs.data_ptr()),
|
||||||
|
static_cast<StrideA*>(a_strides.data_ptr()),
|
||||||
|
static_cast<const cutlass::Array<ElementScale, 8>**>(
|
||||||
|
b_group_scales_ptrs.data_ptr()),
|
||||||
|
static_cast<StrideS*>(group_scale_strides.data_ptr()),
|
||||||
|
static_cast<int>(b_group_size)};
|
||||||
|
|
||||||
|
EpilogueArguments epilogue_arguments{
|
||||||
|
// since we are doing SwapAB the channel scales comes first, then token
|
||||||
|
// scales
|
||||||
|
ChTokScalesEpilogue::prepare_args( // see ScaledEpilogueArray
|
||||||
|
static_cast<const ElementAccumulator**>(
|
||||||
|
b_scales_ptrs.data_ptr()), // per-channel
|
||||||
|
static_cast<const ElementAccumulator**>(
|
||||||
|
a_scales_ptrs.data_ptr()), // per-token
|
||||||
|
true, true),
|
||||||
|
nullptr, // C
|
||||||
|
static_cast<StrideC*>(c_strides.data_ptr()), // C
|
||||||
|
static_cast<ElementD**>(out_ptrs.data_ptr()), // D
|
||||||
|
static_cast<StrideC*>(c_strides.data_ptr()) // D
|
||||||
|
};
|
||||||
|
|
||||||
|
static const cutlass::KernelHardwareInfo hw_info{
|
||||||
|
device_id,
|
||||||
|
cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
|
||||||
|
device_id)};
|
||||||
|
|
||||||
|
arguments = Args{cutlass::gemm::GemmUniversalMode::kGrouped, prob_shape,
|
||||||
|
mainloop_arguments, epilogue_arguments, hw_info};
|
||||||
|
|
||||||
|
// Allocate workspace
|
||||||
|
size_t workspace_size = GemmShuffled::get_workspace_size(arguments);
|
||||||
|
torch::Tensor workspace =
|
||||||
|
torch::empty(workspace_size,
|
||||||
|
torch::TensorOptions().dtype(torch::kU8).device(device));
|
||||||
|
|
||||||
|
// Run GEMM
|
||||||
|
GemmShuffled gemm;
|
||||||
|
CUTLASS_CHECK(gemm.can_implement(arguments));
|
||||||
|
CUTLASS_CHECK(gemm.initialize(arguments, workspace.data_ptr(), stream));
|
||||||
|
CUTLASS_CHECK(gemm.run(stream));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------------
|
||||||
|
// Kernel instantiations and dispatch logic
|
||||||
|
// ----------------------------------------------------------------------------
|
||||||
|
using Coop = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperative;
|
||||||
|
using CoopEpi = cutlass::epilogue::PtrArrayTmaWarpSpecializedCooperative;
|
||||||
|
|
||||||
|
// Kernel_TileShape_ClusterShape_Schedule
|
||||||
|
using Kernel_128x16_1x1x1_Coop =
|
||||||
|
W4A8GroupedGemmKernel<Shape<_128, _16>, Shape<_1, _1, _1>, Coop, CoopEpi>;
|
||||||
|
using Kernel_128x16_2x1x1_Coop =
|
||||||
|
W4A8GroupedGemmKernel<Shape<_128, _16>, Shape<_2, _1, _1>, Coop, CoopEpi>;
|
||||||
|
|
||||||
|
using Kernel_256x16_1x1x1_Coop =
|
||||||
|
W4A8GroupedGemmKernel<Shape<_256, _16>, Shape<_1, _1, _1>, Coop, CoopEpi>;
|
||||||
|
using Kernel_256x16_2x1x1_Coop =
|
||||||
|
W4A8GroupedGemmKernel<Shape<_256, _16>, Shape<_2, _1, _1>, Coop, CoopEpi>;
|
||||||
|
|
||||||
|
using Kernel_256x32_1x1x1_Coop =
|
||||||
|
W4A8GroupedGemmKernel<Shape<_256, _32>, Shape<_1, _1, _1>, Coop, CoopEpi>;
|
||||||
|
using Kernel_256x32_2x1x1_Coop =
|
||||||
|
W4A8GroupedGemmKernel<Shape<_256, _32>, Shape<_2, _1, _1>, Coop, CoopEpi>;
|
||||||
|
|
||||||
|
using Kernel_256x64_1x1x1_Coop =
|
||||||
|
W4A8GroupedGemmKernel<Shape<_256, _64>, Shape<_1, _1, _1>, Coop, CoopEpi>;
|
||||||
|
using Kernel_256x64_2x1x1_Coop =
|
||||||
|
W4A8GroupedGemmKernel<Shape<_256, _64>, Shape<_2, _1, _1>, Coop, CoopEpi>;
|
||||||
|
|
||||||
|
using Kernel_256x128_1x1x1_Coop =
|
||||||
|
W4A8GroupedGemmKernel<Shape<_256, _128>, Shape<_1, _1, _1>, Coop, CoopEpi>;
|
||||||
|
using Kernel_256x128_2x1x1_Coop =
|
||||||
|
W4A8GroupedGemmKernel<Shape<_256, _128>, Shape<_2, _1, _1>, Coop, CoopEpi>;
|
||||||
|
|
||||||
|
using Kernel_128x256_2x1x1_Coop =
|
||||||
|
W4A8GroupedGemmKernel<Shape<_128, _256>, Shape<_2, _1, _1>, Coop, CoopEpi>;
|
||||||
|
|
||||||
|
void mm_dispatch(
    torch::Tensor& out_tensors, const torch::Tensor& a_tensors,
    const torch::Tensor& b_tensors, const torch::Tensor& a_scales,
    const torch::Tensor& b_scales, const torch::Tensor& b_group_scales,
    const int64_t b_group_size, const torch::Tensor& expert_offsets,
    const torch::Tensor& problem_sizes, const torch::Tensor& a_strides,
    const torch::Tensor& b_strides, const torch::Tensor& c_strides,
    const torch::Tensor& group_scale_strides, const std::string& schedule) {
  if (schedule == "Kernel_128x16_1x1x1_Coop") {
    Kernel_128x16_1x1x1_Coop::grouped_mm(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
        c_strides, group_scale_strides);
  } else if (schedule == "Kernel_128x16_2x1x1_Coop") {
    Kernel_128x16_2x1x1_Coop::grouped_mm(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
        c_strides, group_scale_strides);
  } else if (schedule == "Kernel_256x16_1x1x1_Coop") {
    Kernel_256x16_1x1x1_Coop::grouped_mm(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
        c_strides, group_scale_strides);
  } else if (schedule == "Kernel_256x16_2x1x1_Coop") {
    Kernel_256x16_2x1x1_Coop::grouped_mm(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
        c_strides, group_scale_strides);
  } else if (schedule == "Kernel_256x32_1x1x1_Coop") {
    Kernel_256x32_1x1x1_Coop::grouped_mm(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
        c_strides, group_scale_strides);
  } else if (schedule == "Kernel_256x32_2x1x1_Coop") {
    Kernel_256x32_2x1x1_Coop::grouped_mm(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
        c_strides, group_scale_strides);
  } else if (schedule == "Kernel_256x64_1x1x1_Coop") {
    Kernel_256x64_1x1x1_Coop::grouped_mm(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
        c_strides, group_scale_strides);
  } else if (schedule == "Kernel_256x64_2x1x1_Coop") {
    Kernel_256x64_2x1x1_Coop::grouped_mm(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
        c_strides, group_scale_strides);
  } else if (schedule == "Kernel_256x128_1x1x1_Coop") {
    Kernel_256x128_1x1x1_Coop::grouped_mm(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
        c_strides, group_scale_strides);
  } else if (schedule == "Kernel_256x128_2x1x1_Coop") {
    Kernel_256x128_2x1x1_Coop::grouped_mm(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
        c_strides, group_scale_strides);
  } else if (schedule == "Kernel_128x256_2x1x1_Coop") {
    Kernel_128x256_2x1x1_Coop::grouped_mm(
        out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales,
        b_group_size, expert_offsets, problem_sizes, a_strides, b_strides,
        c_strides, group_scale_strides);
  } else {
    TORCH_CHECK(false,
                "cutlass_w4a8_moe_mm: unknown schedule string: ", schedule);
  }
}
void mm(torch::Tensor& out_tensors, const torch::Tensor& a_tensors,
        const torch::Tensor& b_tensors, const torch::Tensor& a_scales,
        const torch::Tensor& b_scales, const torch::Tensor& b_group_scales,
        const int64_t b_group_size, const torch::Tensor& expert_offsets,
        const torch::Tensor& problem_sizes, const torch::Tensor& a_strides,
        const torch::Tensor& b_strides, const torch::Tensor& c_strides,
        const torch::Tensor& group_scale_strides,
        std::optional<std::string> maybe_schedule) {
  // user has specified a schedule
  if (maybe_schedule) {
    mm_dispatch(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                b_group_scales, b_group_size, expert_offsets, problem_sizes,
                a_strides, b_strides, c_strides, group_scale_strides,
                *maybe_schedule);
    return;
  }

  // use heuristic
  int m_full = a_tensors.size(0);
  int n = b_tensors.size(1);
  int k = b_tensors.size(2) * PackFactor;  // logical k
  int num_experts = b_tensors.size(0);
  // per-expert batch size assuming uniform distribution
  int m_expert = m_full / num_experts;

  std::string schedule;
  if (m_expert <= 16) {
    schedule = "Kernel_128x16_2x1x1_Coop";
  } else if (m_expert <= 32) {
    schedule = "Kernel_256x32_1x1x1_Coop";
  } else if (m_expert <= 64) {
    schedule = "Kernel_256x64_1x1x1_Coop";
  } else if (m_expert <= 128) {
    schedule = "Kernel_256x128_2x1x1_Coop";
  } else {  // m_expert > 128
    schedule = "Kernel_128x256_2x1x1_Coop";
  }

  mm_dispatch(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
              b_group_scales, b_group_size, expert_offsets, problem_sizes,
              a_strides, b_strides, c_strides, group_scale_strides, schedule);
}
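For quick reference, the schedule heuristic in `mm` can be mirrored as a standalone host helper; a minimal sketch, assuming uniform token distribution across experts (`pick_schedule` is a hypothetical name, not part of this change):

#include <string>

static std::string pick_schedule(int m_full, int num_experts) {
  // Per-expert batch size under the uniform-distribution assumption.
  int m_expert = m_full / num_experts;
  if (m_expert <= 16) return "Kernel_128x16_2x1x1_Coop";
  if (m_expert <= 32) return "Kernel_256x32_1x1x1_Coop";
  if (m_expert <= 64) return "Kernel_256x64_1x1x1_Coop";
  if (m_expert <= 128) return "Kernel_256x128_2x1x1_Coop";
  return "Kernel_128x256_2x1x1_Coop";  // m_expert > 128
}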
std::tuple<torch::Tensor, torch::Tensor> encode_and_reorder_int4b(
    torch::Tensor const& b_tensors) {
  TORCH_CHECK(b_tensors.dtype() == torch::kInt32);
  TORCH_CHECK(b_tensors.dim() == 3);  // (experts, n, k)
  TORCH_CHECK(b_tensors.is_contiguous());
  TORCH_CHECK(b_tensors.is_cuda());

  int n = static_cast<int>(b_tensors.size(1));
  int k = static_cast<int>(b_tensors.size(2)) * PackFactor;  // logical k

  // CUTLASS reorder_tensor requires k % 256 == 0 and n % 16 == 0.
  // These misalignments cause silent OOB unless run under Compute Sanitizer.
  TORCH_CHECK(k % 256 == 0, "logical k must be divisible by 256");
  TORCH_CHECK(n % 16 == 0, "n must be divisible by 16");

  // we will store the layout to an int32 tensor;
  // this is the number of elements we need per layout
  constexpr size_t layout_width = sizeof(LayoutB_Reordered) / sizeof(int32_t);

  torch::Tensor b_tensors_packed = torch::empty_like(b_tensors);
  int num_experts = static_cast<int>(b_tensors.size(0));

  auto b_ptr = static_cast<QuantType const*>(b_tensors.const_data_ptr());
  auto b_packed_ptr = static_cast<QuantType*>(b_tensors_packed.data_ptr());

  // multiply by ull so result does not overflow int32
  size_t num_int4_elems = 1ull * num_experts * n * k;
  bool ok = vllm::cutlass_w4a8_utils::unified_encode_int4b(b_ptr, b_packed_ptr,
                                                           num_int4_elems);
  TORCH_CHECK(ok, "unified_encode_int4b failed");

  // construct the layout once; assumes each expert has the same layout
  using LayoutType = LayoutB_Reordered;
  std::vector<LayoutType> layout_B_reordered_host(num_experts);
  auto stride_B = cutlass::make_cute_packed_stride(StrideB{}, {n, k, Int<1>{}});
  auto shape_B = cute::make_shape(n, k, Int<1>{});
  auto layout_B = make_layout(shape_B, stride_B);
  LayoutType layout_B_reordered = tile_to_shape(LayoutAtomQuant{}, shape_B);

  // reorder weights for each expert
  for (int i = 0; i < num_experts; i++) {
    // since the storage type of int4b is 1 byte but one element is 4 bits
    // we need to adjust the offset
    int64_t offset =
        1ull * i * n * k * cutlass::sizeof_bits<QuantType>::value / 8;
    cutlass::reorder_tensor(b_packed_ptr + offset, layout_B,
                            layout_B_reordered);
  }

  // save the packed layout to torch tensor so we can re-use it
  auto cpu_opts =
      torch::TensorOptions().dtype(torch::kInt32).device(torch::kCPU);
  torch::Tensor layout_cpu =
      torch::empty({num_experts, layout_width}, cpu_opts);

  int32_t* layout_data = layout_cpu.data_ptr<int32_t>();
  for (int i = 0; i < num_experts; ++i) {
    std::memcpy(layout_data + i * layout_width,  // dst (int32*)
                &layout_B_reordered,             // src (LayoutType*)
                sizeof(LayoutType));             // number of bytes
  }

  torch::Tensor packed_layout =
      layout_cpu.to(b_tensors.device(), /*non_blocking=*/false);

  return {b_tensors_packed, packed_layout};
}
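A hedged sketch of the consumer-side round trip: reading one expert's `LayoutB_Reordered` back out of the packed int32 tensor. `load_layout` is a hypothetical helper mirroring the `memcpy` serialization above, not code from this change:

#include <cstring>

static LayoutB_Reordered load_layout(torch::Tensor const& packed_layout_cpu,
                                     int expert_idx) {
  constexpr size_t layout_width = sizeof(LayoutB_Reordered) / sizeof(int32_t);
  LayoutB_Reordered layout;
  // Inverse of the serialization loop: one layout per expert row.
  std::memcpy(&layout,
              packed_layout_cpu.data_ptr<int32_t>() + expert_idx * layout_width,
              sizeof(LayoutB_Reordered));
  return layout;
}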

TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
  m.impl("cutlass_w4a8_moe_mm", &mm);
  m.impl("cutlass_encode_and_reorder_int4b_grouped", &encode_and_reorder_int4b);
}

}  // namespace vllm::cutlass_w4a8_moe
/////////////////////////////////////////////////////////////////////////////////////////////////
@@ -7,6 +7,7 @@
 #include <c10/cuda/CUDAGuard.h>
 #include <torch/all.h>
 #include "cutlass_extensions/torch_utils.hpp"
+#include "w4a8_utils.cuh"

 #include "core/registration.h"
@@ -395,71 +396,6 @@ torch::Tensor pack_scale_fp8(torch::Tensor const& scales) {
   return packed_scales;
 }

-/*
-  GPU-accelerated implementation of cutlass::unified_encode_int4b.
-  Constructs a lookup table in constant memory to map 8 bits
-  (two 4-bit values) at a time. Assumes memory is contiguous
-  and pointers are 16-byte aligned.
-*/
-__constant__ uint8_t kNibbleLUT[256];
-
-__global__ void unified_encode_int4b_device(const uint8_t* in, uint8_t* out,
-                                            size_t nbytes) {
-  constexpr size_t V = sizeof(uint4);  // 16 bytes
-  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-  const size_t nthreads = size_t(gridDim.x) * blockDim.x;
-  const size_t nvec = nbytes / V;
-
-  // 1-D grid-stride loop over 16-byte chunks
-  for (size_t vec = tid; vec < nvec; vec += nthreads) {
-    uint4 v = reinterpret_cast<const uint4*>(in)[vec];
-    uint8_t* b = reinterpret_cast<uint8_t*>(&v);
-#pragma unroll
-    for (int i = 0; i < int(V); ++i) b[i] = kNibbleLUT[b[i]];
-    reinterpret_cast<uint4*>(out)[vec] = v;
-  }
-}
-
-static bool upload_lut() {
-  std::array<uint8_t, 256> lut{};
-  auto map_nib = [](uint8_t v) -> uint8_t {
-    // 1..7 -> (8 - v); keep 0 and 8..15
-    return (v == 0 || (v & 0x8)) ? v : uint8_t(8 - v);
-  };
-  for (int b = 0; b < 256; ++b) {
-    uint8_t lo = b & 0xF;
-    uint8_t hi = (b >> 4) & 0xF;
-    lut[b] = uint8_t((map_nib(hi) << 4) | map_nib(lo));
-  }
-  cudaError_t e = cudaMemcpyToSymbol(kNibbleLUT, lut.data(), lut.size(),
-                                     /*offset=*/0, cudaMemcpyHostToDevice);
-
-  return (e == cudaSuccess);
-}
-
-static bool unified_encode_int4b(cutlass::int4b_t const* in,
-                                 cutlass::int4b_t* out, size_t num_int4_elems) {
-  // Build/upload LUT
-  if (!upload_lut()) return false;
-
-  static_assert(sizeof(typename cutlass::int4b_t::Storage) == 1,
-                "int4 storage must be 1 byte");
-  const size_t nbytes = num_int4_elems >> 1;
-
-  auto* in_bytes = reinterpret_cast<uint8_t const*>(in);
-  auto* out_bytes = reinterpret_cast<uint8_t*>(out);
-
-  // kernel launch params
-  constexpr int block = 256;
-  const size_t nvec = nbytes / sizeof(uint4);  // # of 16B vectors
-  int grid = int((nvec + block - 1) / block);
-  if (grid == 0) grid = 1;  // ensure we still cover the tail in the kernel
-
-  unified_encode_int4b_device<<<grid, block>>>(in_bytes, out_bytes, nbytes);
-  cudaError_t err = cudaGetLastError();
-  return (err == cudaSuccess);
-}
-
 torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) {
   TORCH_CHECK(B.dtype() == torch::kInt32);
   TORCH_CHECK(B.dim() == 2);
@@ -477,8 +413,8 @@ torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) {
   LayoutB_Reordered layout_B_reordered =
       cute::tile_to_shape(LayoutAtomQuant{}, shape_B);

-  bool ok =
-      vllm::cutlass_w4a8::unified_encode_int4b(B_ptr, B_packed_ptr, n * k);
+  bool ok = vllm::cutlass_w4a8_utils::unified_encode_int4b(B_ptr, B_packed_ptr,
+                                                           n * k);
   TORCH_CHECK(ok, "unified_encode_int4b failed");
   cutlass::reorder_tensor(B_packed_ptr, layout_B, layout_B_reordered);
90  csrc/quantization/cutlass_w4a8/w4a8_utils.cu  Normal file
@@ -0,0 +1,90 @@
#include "w4a8_utils.cuh"

#include <array>
#include <cuda_runtime.h>
#include <cstdio>

namespace vllm::cutlass_w4a8_utils {

/*
  GPU-accelerated implementation of cutlass::unified_encode_int4b.
  Constructs a lookup table in constant memory to map 8 bits
  (two 4-bit values) at a time. Assumes memory is contiguous
  and pointers are 16-byte aligned.
*/
__constant__ uint8_t kNibbleLUT[256];

__global__ void unified_encode_int4b_device(const uint8_t* in, uint8_t* out,
                                            size_t nbytes) {
  constexpr size_t V = sizeof(uint4);  // 16 bytes
  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  const size_t nthreads = size_t(gridDim.x) * blockDim.x;
  const size_t nvec = nbytes / V;

  // 1-D grid-stride loop over 16-byte chunks
  for (size_t vec = tid; vec < nvec; vec += nthreads) {
    uint4 v = reinterpret_cast<const uint4*>(in)[vec];
    uint8_t* b = reinterpret_cast<uint8_t*>(&v);
#pragma unroll
    for (int i = 0; i < int(V); ++i) b[i] = kNibbleLUT[b[i]];
    reinterpret_cast<uint4*>(out)[vec] = v;
  }
}

static bool upload_lut() {
  std::array<uint8_t, 256> lut{};
  auto map_nib = [](uint8_t v) -> uint8_t {
    // 1..7 -> (8 - v); keep 0 and 8..15
    return (v == 0 || (v & 0x8)) ? v : uint8_t(8 - v);
  };
  for (int b = 0; b < 256; ++b) {
    uint8_t lo = b & 0xF;
    uint8_t hi = (b >> 4) & 0xF;
    lut[b] = uint8_t((map_nib(hi) << 4) | map_nib(lo));
  }
  cudaError_t e = cudaMemcpyToSymbol(kNibbleLUT, lut.data(), lut.size(),
                                     /*offset=*/0, cudaMemcpyHostToDevice);

  return (e == cudaSuccess);
}

bool unified_encode_int4b(cutlass::int4b_t const* in, cutlass::int4b_t* out,
                          size_t num_int4_elems) {
  // Build/upload LUT
  if (!upload_lut()) return false;

  static_assert(sizeof(typename cutlass::int4b_t::Storage) == 1,
                "int4 storage must be 1 byte");
  const size_t nbytes = num_int4_elems >> 1;

  auto* in_bytes = reinterpret_cast<uint8_t const*>(in);
  auto* out_bytes = reinterpret_cast<uint8_t*>(out);

  // kernel launch params
  constexpr int block = 256;
  const size_t nvec = nbytes / sizeof(uint4);  // # of 16B vectors
  int grid = int((nvec + block - 1) / block);
  if (grid == 0) grid = 1;  // ensure we still cover the tail in the kernel

  unified_encode_int4b_device<<<grid, block>>>(in_bytes, out_bytes, nbytes);

  // launch errors
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    printf("unified_encode_int4b_device launch error: %s (%d)\n",
           cudaGetErrorString(err), err);
    return false;
  }

  // runtime errors
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess) {
    printf("unified_encode_int4b_device runtime error: %s (%d)\n",
           cudaGetErrorString(err), err);
    return false;
  }

  return true;
}

}  // namespace vllm::cutlass_w4a8_utils
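The nibble remap can be checked against a host model; an illustrative snippet (not part of this file), assuming the same mapping as `upload_lut` above — `1..7 -> 8 - v`, with 0 and 8..15 kept:

#include <cassert>
#include <cstdint>

// Host model of kNibbleLUT: remap both nibbles of one byte.
static uint8_t encode_byte(uint8_t b) {
  auto map_nib = [](uint8_t v) -> uint8_t {
    return (v == 0 || (v & 0x8)) ? v : uint8_t(8 - v);
  };
  return uint8_t((map_nib((b >> 4) & 0xF) << 4) | map_nib(b & 0xF));
}

int main() {
  assert(encode_byte(0x00) == 0x00);  // zero nibbles unchanged
  assert(encode_byte(0x17) == 0x71);  // 1 -> 7 and 7 -> 1
  assert(encode_byte(0x9F) == 0x9F);  // nibbles with the high bit set kept
  return 0;
}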
11  csrc/quantization/cutlass_w4a8/w4a8_utils.cuh  Normal file
@@ -0,0 +1,11 @@
#pragma once

#include <cstddef>
#include "cutlass/numeric_types.h"

namespace vllm::cutlass_w4a8_utils {

bool unified_encode_int4b(cutlass::int4b_t const* in, cutlass::int4b_t* out,
                          size_t num_int4_elems);

}  // namespace vllm::cutlass_w4a8_utils
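A minimal usage sketch for the helper declared above (an illustration, not part of this change); buffer sizes are arbitrary assumptions:

#include <cuda_runtime.h>
#include "cutlass/numeric_types.h"
#include "w4a8_utils.cuh"

int main() {
  // 1024 int4 values pack into 512 bytes of storage.
  size_t num_int4_elems = 1024;
  cutlass::int4b_t *in = nullptr, *out = nullptr;
  cudaMalloc(&in, num_int4_elems / 2);
  cudaMalloc(&out, num_int4_elems / 2);
  // Re-encode device-side, mirroring cutlass::unified_encode_int4b.
  bool ok = vllm::cutlass_w4a8_utils::unified_encode_int4b(in, out,
                                                           num_int4_elems);
  cudaFree(in);
  cudaFree(out);
  return ok ? 0 : 1;
}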
@@ -31,14 +31,15 @@ __device__ void rms_norm_dynamic_per_token_quant_vec(

   // RMS Norm + Quant
   if constexpr (std::is_same_v<scalar_out_t, int8_t>) {
+    token_scale = 1.0f / token_scale;
     vllm::vectorized::norm_and_quant<scalar_t, scalar_out_t, true,
                                      has_residual>(
-        out, input, weight, rms, 1.0f / token_scale, hidden_size, residual);
+        out, input, weight, rms, &token_scale, hidden_size, residual);
   } else {
     // FP8 - Do not invert token_scale for exact match with FBGemm
     vllm::vectorized::norm_and_quant<scalar_t, scalar_out_t, false,
                                      has_residual>(
-        out, input, weight, rms, token_scale, hidden_size, residual);
+        out, input, weight, rms, &token_scale, hidden_size, residual);
   }
 }
@@ -75,14 +76,52 @@ __global__ void rms_norm_dynamic_per_token_quant_kernel(

   // RMS Norm + Quant
   if constexpr (std::is_same_v<scalar_out_t, int8_t>) {
+    token_scale = 1.0f / token_scale;
     vllm::norm_and_quant<scalar_t, scalar_out_t, true, has_residual>(
-        out, input, weight, rms, 1.0f / token_scale, hidden_size, residual);
+        out, input, weight, rms, &token_scale, hidden_size, residual);
   } else {
     // FP8 - Do not invert s_token_scale for exact match with FBGemm
     vllm::norm_and_quant<scalar_t, scalar_out_t, false, has_residual>(
-        out, input, weight, rms, token_scale, hidden_size, residual);
+        out, input, weight, rms, &token_scale, hidden_size, residual);
   }
 }

+// RMS norm + quant kernel
+template <typename scalar_t, typename scalar_out_t, bool has_residual = false,
+          bool is_scale_transposed = false, int32_t group_size = 0>
+__global__ void rms_norm_per_block_quant_kernel(
+    scalar_out_t* __restrict__ out,  // [..., hidden_size]
+    float* __restrict__ scales,  // [num_tokens, hidden_size / group_size]
+                                 // or
+                                 // [hidden_size / group_size, num_tokens]
+    scalar_t const* __restrict__ input,   // [..., hidden_size]
+    scalar_t const* __restrict__ weight,  // [hidden_size]
+    float const* scale_ub, float const var_epsilon, int32_t const hidden_size,
+    scalar_t* __restrict__ residual = nullptr) {
+  float rms;
+  // Compute RMS
+  // Always able to vectorize due to constraints on hidden_size
+  vllm::vectorized::compute_rms<scalar_t, has_residual>(
+      &rms, input, hidden_size, var_epsilon, residual);
+
+  // Compute Scale
+  // Always able to vectorize due to constraints on hidden_size and group_size
+  vllm::vectorized::compute_dynamic_per_token_scales<
+      scalar_t, scalar_out_t, has_residual, is_scale_transposed, group_size>(
+      nullptr, scales, input, weight, rms, scale_ub, hidden_size, residual);
+
+  // RMS Norm + Quant
+  // Always able to vectorize due to constraints on hidden_size
+  // For int8, don't invert token_scale here: do it inside the norm_and_quant
+  // kernel. We do it because particular elements of token_scale can be shared
+  // between multiple threads, so this way, we avoid extra synchronization
+  // overhead.
+  vllm::vectorized::norm_and_quant<
+      scalar_t, scalar_out_t, std::is_same_v<scalar_out_t, int8_t>,
+      has_residual, is_scale_transposed, group_size>(
+      out, input, weight, rms, scales, hidden_size, residual);
+}
+
 }  // namespace vllm

 // Residual add + RMS norm + dynamic per token
@@ -103,30 +142,19 @@ void rms_norm_dynamic_per_token_quant_dispatch(
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

-  if (residual.has_value()) {
-    VLLM_DISPATCH_QUANT_TYPES(
-        out.scalar_type(), "rms_norm_dynamic_per_token_quant_kernel", [&] {
-          vllm::rms_norm_dynamic_per_token_quant_kernel<scalar_in_t, scalar_t,
-                                                        true>
-              <<<grid, block, 0, stream>>>(
-                  out.data_ptr<scalar_t>(), scales.data_ptr<float>(),
-                  input.data_ptr<scalar_in_t>(), weight.data_ptr<scalar_in_t>(),
-                  scale_ub.has_value() ? scale_ub->data_ptr<float>() : nullptr,
-                  var_epsilon, hidden_size, residual->data_ptr<scalar_in_t>());
-        });
-  } else {
-    VLLM_DISPATCH_QUANT_TYPES(
-        out.scalar_type(), "rms_norm_dynamic_per_token_quant_kernel", [&] {
-          vllm::rms_norm_dynamic_per_token_quant_kernel<scalar_in_t, scalar_t,
-                                                        false>
-              <<<grid, block, 0, stream>>>(
-                  out.data_ptr<scalar_t>(), scales.data_ptr<float>(),
-                  input.data_ptr<scalar_in_t>(), weight.data_ptr<scalar_in_t>(),
-                  scale_ub.has_value() ? scale_ub->data_ptr<float>() : nullptr,
-                  var_epsilon, hidden_size, nullptr);
-        });
-  }
+  VLLM_DISPATCH_BOOL(residual.has_value(), has_residual, [&] {
+    VLLM_DISPATCH_QUANT_TYPES(
+        out.scalar_type(), "rms_norm_dynamic_per_token_quant_kernel", [&] {
+          vllm::rms_norm_dynamic_per_token_quant_kernel<scalar_in_t, scalar_t,
+                                                        has_residual>
+              <<<grid, block, 0, stream>>>(
+                  out.data_ptr<scalar_t>(), scales.data_ptr<float>(),
+                  input.data_ptr<scalar_in_t>(), weight.data_ptr<scalar_in_t>(),
+                  scale_ub.has_value() ? scale_ub->data_ptr<float>() : nullptr,
+                  var_epsilon, hidden_size,
+                  has_residual ? residual->data_ptr<scalar_in_t>() : nullptr);
+        });
+  });
 }
@@ -157,3 +185,79 @@ void rms_norm_dynamic_per_token_quant(
         out, input, weight, scales, var_epsilon, scale_ub, residual);
   });
 }
+
+// Residual add + RMS norm + dynamic per token
+void rms_norm_per_block_quant_dispatch(
+    torch::Tensor& out,           // [..., hidden_size]
+    torch::Tensor const& input,   // [..., hidden_size]
+    torch::Tensor const& weight,  // [hidden_size]
+    torch::Tensor& scales,  // [num_tokens, hidden_size / group_size] or
+                            // [hidden_size / group_size, num_tokens]
+    int32_t group_size,
+    double const var_epsilon,  // Variance epsilon used in norm calculation
+    std::optional<at::Tensor> const& scale_ub,
+    std::optional<at::Tensor>& residual, bool is_scale_transposed) {
+  int32_t hidden_size = input.size(-1);
+  auto num_tokens = input.numel() / hidden_size;
+
+  dim3 grid(num_tokens);
+  const int max_block_size = (num_tokens <= 256) ? 512 : 256;
+  dim3 block(std::min(hidden_size, max_block_size));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  VLLM_DISPATCH_FLOATING_TYPES(
+      input.scalar_type(), "rms_norm_per_block_quant_fp_dispatch", [&] {
+        using scalar_in_t = scalar_t;
+        VLLM_DISPATCH_GROUP_SIZE(group_size, gs, [&] {
+          VLLM_DISPATCH_BOOL(residual.has_value(), has_residual, [&] {
+            VLLM_DISPATCH_BOOL(is_scale_transposed, transpose_scale, [&] {
+              VLLM_DISPATCH_QUANT_TYPES(
+                  out.scalar_type(), "rms_norm_per_block_quant_kernel", [&] {
+                    vllm::rms_norm_per_block_quant_kernel<scalar_in_t, scalar_t,
+                                                          has_residual,
+                                                          transpose_scale, gs>
+                        <<<grid, block, 0, stream>>>(
+                            out.data_ptr<scalar_t>(), scales.data_ptr<float>(),
+                            input.data_ptr<scalar_in_t>(),
+                            weight.data_ptr<scalar_in_t>(),
+                            scale_ub.has_value() ? scale_ub->data_ptr<float>()
+                                                 : nullptr,
+                            var_epsilon, hidden_size,
+                            has_residual ? residual->data_ptr<scalar_in_t>()
+                                         : nullptr);
+                  });
+            });
+          });
+        });
+      });
+}
+
+void rms_norm_per_block_quant(torch::Tensor& out, torch::Tensor const& input,
+                              torch::Tensor const& weight,
+                              torch::Tensor& scales, double const var_epsilon,
+                              std::optional<torch::Tensor> scale_ub,
+                              std::optional<torch::Tensor> residual,
+                              int64_t group_size, bool is_scale_transposed) {
+  static c10::ScalarType kFp8Type = is_fp8_ocp()
+                                        ? c10::ScalarType::Float8_e4m3fn
+                                        : c10::ScalarType::Float8_e4m3fnuz;
+  TORCH_CHECK(out.dtype() == kFp8Type || out.dtype() == torch::kInt8);
+  TORCH_CHECK(out.is_contiguous() && input.is_contiguous());
+
+  if (scale_ub.has_value()) {
+    TORCH_CHECK(out.dtype() == kFp8Type);
+  }
+  TORCH_CHECK(weight.dtype() == input.dtype());
+  TORCH_CHECK(scales.dtype() == torch::kFloat32);
+  if (residual) {
+    TORCH_CHECK(residual->scalar_type() == input.scalar_type());
+  }
+
+  TORCH_CHECK(group_size == 128 || group_size == 64,
+              "Unsupported group size: ", group_size);
+
+  rms_norm_per_block_quant_dispatch(out, input, weight, scales, group_size,
+                                    var_epsilon, scale_ub, residual,
+                                    is_scale_transposed);
+}
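An illustrative call sketch for the new entry point (not from this change), assuming group_size 128 and row-major scales; tensor shapes follow the comments in the dispatch above:

void rms_norm_per_block_quant_example() {
  int64_t num_tokens = 64, hidden = 4096, group_size = 128;
  auto in_opts = torch::dtype(torch::kFloat16).device(torch::kCUDA);
  auto input = torch::randn({num_tokens, hidden}, in_opts);
  auto weight = torch::ones({hidden}, in_opts);
  auto out = torch::empty({num_tokens, hidden},
                          torch::dtype(torch::kInt8).device(torch::kCUDA));
  // scales: [num_tokens, hidden / group_size] when not transposed.
  auto scales =
      torch::empty({num_tokens, hidden / group_size},
                   torch::dtype(torch::kFloat32).device(torch::kCUDA));
  std::optional<torch::Tensor> scale_ub, residual;
  rms_norm_per_block_quant(out, input, weight, scales, /*var_epsilon=*/1e-6,
                           scale_ub, residual, group_size,
                           /*is_scale_transposed=*/false);
}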
@@ -9,6 +9,7 @@
 #include "quant_conversions.cuh"

 #include "../../cub_helpers.h"
+#include "../../cuda_compat.h"

 namespace vllm {

@@ -43,62 +44,150 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
   *rms = s_rms;
 }

-template <typename scalar_t, typename scalar_out_t, bool has_residual = false>
+__device__ float warpReduceMaxSpecialized(volatile float* val, int64_t tid,
+                                          int64_t thread_in_warp,
+                                          int64_t reduced_elems) {
+  static_assert(WARP_SIZE == 32 || WARP_SIZE == 64);
+  if constexpr (WARP_SIZE == 64) {
+    if (thread_in_warp + 64 < reduced_elems)
+      val[tid] = fmaxf(val[tid], val[tid + 64]);
+  }
+  if (thread_in_warp + 32 < reduced_elems)
+    val[tid] = fmaxf(val[tid], val[tid + 32]);
+  if (thread_in_warp + 16 < reduced_elems)
+    val[tid] = fmaxf(val[tid], val[tid + 16]);
+  if (thread_in_warp + 8 < reduced_elems)
+    val[tid] = fmaxf(val[tid], val[tid + 8]);
+  if (thread_in_warp + 4 < reduced_elems)
+    val[tid] = fmaxf(val[tid], val[tid + 4]);
+  if (thread_in_warp + 2 < reduced_elems)
+    val[tid] = fmaxf(val[tid], val[tid + 2]);
+  if (thread_in_warp + 1 < reduced_elems)
+    val[tid] = fmaxf(val[tid], val[tid + 1]);
+  return val[tid];
+}
+
+template <typename scalar_t, typename scalar_out_t, bool has_residual = false,
+          bool is_scale_transposed = false>
 __device__ void compute_dynamic_per_token_scales(
     float* __restrict__ token_scale, float* __restrict__ all_token_scales,
     scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight,
     float const rms, float const* __restrict__ scale_ub,
-    int32_t const hidden_size,
-    scalar_t const* __restrict__ residual = nullptr) {
-  int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
-  ;
-  constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};
-
+    int32_t const hidden_size, scalar_t const* __restrict__ residual = nullptr,
+    int32_t const group_size = 0) {
   float block_absmax_val_maybe = 0.0f;
-  for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
-    float x = static_cast<float>(input[token_offset + i]);
-    if constexpr (has_residual) {
-      x += static_cast<float>(residual[token_offset + i]);
-    }
-
-    x = static_cast<float>(static_cast<scalar_t>(x * rms) * weight[i]);
-    block_absmax_val_maybe = fmaxf(block_absmax_val_maybe, fabsf(x));
-  }
-
-  using BlockReduce = cub::BlockReduce<float, 1024>;
-  __shared__ typename BlockReduce::TempStorage reduceStore;
-  block_absmax_val_maybe =
-      BlockReduce(reduceStore)
-          .Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x);
-
-  __shared__ float s_token_scale;
-  if (threadIdx.x == 0) {
-    float scale = 0.0f;
-    if (scale_ub) {
-      scale = min(block_absmax_val_maybe, *scale_ub);
-    } else {
-      scale = block_absmax_val_maybe;
-    }
-    // token scale computation
-    scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
-    s_token_scale = scale;                 // Shared memory store
-    all_token_scales[blockIdx.x] = scale;  // Global output store
-  }
-  __syncthreads();
-
-  *token_scale = s_token_scale;
+  constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};
+
+  if (group_size > 0) {
+    __shared__ float s_max_vals[1024];
+    int64_t const token_offset =
+        blockIdx.x * static_cast<int64_t>(hidden_size);
+    int64_t num_groups = hidden_size / group_size;
+    int64_t const threads_per_group = blockDim.x / num_groups;
+    int64_t const thread_in_group = threadIdx.x % threads_per_group;
+    int64_t const group_offset = threadIdx.x / threads_per_group * group_size;
+    int64_t const thread_offset = group_offset + thread_in_group;
+    int64_t const thread_end =
+        min(group_offset + group_size, static_cast<int64_t>(hidden_size));
+    for (auto i = thread_offset; i < thread_end; i += threads_per_group) {
+      float x = static_cast<float>(input[token_offset + i]);
+      if constexpr (has_residual) {
+        x += static_cast<float>(residual[token_offset + i]);
+      }
+      x = static_cast<float>(static_cast<scalar_t>(x * rms) * weight[i]);
+      block_absmax_val_maybe = fmaxf(block_absmax_val_maybe, fabsf(x));
+    }
+    s_max_vals[threadIdx.x] = block_absmax_val_maybe;
+    __syncthreads();
+
+    int64_t const warp_size = WARP_SIZE;
+    int64_t const num_warps = blockDim.x / warp_size;
+    int64_t const warp_id = threadIdx.x / warp_size;
+    int64_t const thread_in_warp = threadIdx.x % warp_size;
+    int64_t const groups_per_warp = (num_groups + num_warps - 1) / num_warps;
+    for (auto i = 0; i < groups_per_warp; ++i) {
+      int64_t const group_id = i * num_warps + warp_id;
+      if (group_id < num_groups) {
+        int64_t warp_start = group_id * threads_per_group;
+        int64_t const start = warp_start + thread_in_warp;
+        int64_t const warp_end = min(warp_start + threads_per_group,
+                                     static_cast<int64_t>(hidden_size));
+        for (auto j = start; j + warp_size < warp_end; j += warp_size) {
+          s_max_vals[start] =
+              fmaxf(s_max_vals[start], s_max_vals[j + warp_size]);
+        }
+        warpReduceMaxSpecialized(s_max_vals, start, thread_in_warp,
+                                 min(warp_end - warp_start, warp_size));
+      }
+    }
+    __syncthreads();
+
+    if (thread_in_group == 0 && thread_offset < thread_end) {
+      block_absmax_val_maybe = s_max_vals[threadIdx.x];
+      float scale = 0.0f;
+      if (scale_ub) {
+        scale = min(block_absmax_val_maybe, *scale_ub);
+      } else {
+        scale = block_absmax_val_maybe;
+      }
+      // token scale computation
+      scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
+      // Global output store
+      if constexpr (is_scale_transposed) {
+        all_token_scales[(threadIdx.x / threads_per_group) * gridDim.x +
+                         blockIdx.x] = scale;
+      } else {
+        all_token_scales[blockIdx.x * num_groups +
+                         threadIdx.x / threads_per_group] = scale;
+      }
+    }
+    __syncthreads();
+  } else {
+    int64_t const token_offset =
+        blockIdx.x * static_cast<int64_t>(hidden_size);
+
+    for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
+      float x = static_cast<float>(input[token_offset + i]);
+      if constexpr (has_residual) {
+        x += static_cast<float>(residual[token_offset + i]);
+      }
+
+      x = static_cast<float>(static_cast<scalar_t>(x * rms) * weight[i]);
+      block_absmax_val_maybe = fmaxf(block_absmax_val_maybe, fabsf(x));
+    }
+    using BlockReduce = cub::BlockReduce<float, 1024>;
+    __shared__ typename BlockReduce::TempStorage reduceStore;
+    block_absmax_val_maybe =
+        BlockReduce(reduceStore)
+            .Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x);
+
+    __shared__ float s_token_scale;
+    if (threadIdx.x == 0) {
+      float scale = 0.0f;
+      if (scale_ub) {
+        scale = min(block_absmax_val_maybe, *scale_ub);
+      } else {
+        scale = block_absmax_val_maybe;
+      }
+      // token scale computation
+      scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
+      s_token_scale = scale;                 // Shared memory store
+      all_token_scales[blockIdx.x] = scale;  // Global output store
+    }
+    __syncthreads();
+
+    *token_scale = s_token_scale;
+  }
 }

 template <typename scalar_t, typename scalar_out_t, bool is_scale_inverted,
-          bool has_residual = false>
+          bool has_residual = false, bool is_scale_transposed = false>
 __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
                                scalar_t const* __restrict__ input,
                                scalar_t const* __restrict__ weight,
-                               float const rms, float const scale,
+                               float const rms, float* const scale,
                                int32_t const hidden_size,
-                               scalar_t* __restrict__ residual = nullptr) {
+                               scalar_t* __restrict__ residual = nullptr,
+                               int32_t const group_size = 0) {
   int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
-  ;

   for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
     float x = static_cast<float>(input[token_offset + i]);
@@ -109,8 +198,21 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
     // Norm
     x = static_cast<float>(static_cast<scalar_t>(x * rms) * weight[i]);
     // Quant
+    // For group-wise quant, is_scale_inverted is true, so we invert the
+    // scale here.
+    int64_t scale_idx = 0;
+    if (group_size > 0) {
+      if constexpr (is_scale_transposed) {
+        scale_idx = (i / group_size) * gridDim.x + blockIdx.x;
+      } else {
+        scale_idx = blockIdx.x * (hidden_size / group_size) + i / group_size;
+      }
+    }
+    auto scale_val =
+        (group_size > 0
+             ? (is_scale_inverted ? 1.0f / scale[scale_idx] : scale[scale_idx])
+             : *scale);
     output[token_offset + i] =
-        ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(x, scale);
+        ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(x, scale_val);
   }
 }
@@ -178,95 +280,191 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,

 // Vectorized version of vllm::compute_dynamic_per_token_scales
 // hidden_size must be a multiple of 4
-template <typename scalar_t, typename scalar_out_t, bool has_residual = false>
+template <typename scalar_t, typename scalar_out_t, bool has_residual = false,
+          bool is_scale_transposed = false, int32_t group_size = 0>
 __device__ void compute_dynamic_per_token_scales(
     float* __restrict__ token_scale, float* __restrict__ all_token_scales,
     scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight,
     float const rms, float const* __restrict__ scale_ub,
     int32_t const hidden_size,
     scalar_t const* __restrict__ residual = nullptr) {
-  int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
-  ;
-
-  // Vectorized input/weight/residual to better utilize memory bandwidth.
-  vec4_t<scalar_t> const* vec_input =
-      reinterpret_cast<vec4_t<scalar_t> const*>(&input[token_offset]);
-  vec4_t<scalar_t> const* vec_weight =
-      reinterpret_cast<vec4_t<scalar_t> const*>(weight);
-  vec4_t<scalar_t> const* vec_residual = nullptr;
-  if constexpr (has_residual) {
-    vec_residual =
-        reinterpret_cast<vec4_t<scalar_t> const*>(&residual[token_offset]);
-  }
-
   constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};

   const int VEC_SIZE = 4;
-  int32_t const num_vec_elems = hidden_size >> 2;
   float block_absmax_val_maybe = 0.0f;

-#pragma unroll 4
-  for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
-    vec4_t<scalar_t> in = vec_input[i];
-    vec4_t<scalar_t> const w = vec_weight[i];
-
-    vec4_t<float> x;
-#pragma unroll
-    for (int j = 0; j < VEC_SIZE; ++j) {
-      x.val[j] = static_cast<float>(in.val[j]);
-    }
-
-    if constexpr (has_residual) {
-      vec4_t<scalar_t> r = vec_residual[i];
-#pragma unroll
-      for (int j = 0; j < VEC_SIZE; ++j) {
-        x.val[j] += static_cast<float>(r.val[j]);
-      }
-    }
-
-#pragma unroll
-    for (int j = 0; j < VEC_SIZE; ++j) {
-      block_absmax_val_maybe =
-          fmaxf(block_absmax_val_maybe,
-                fabs(static_cast<scalar_t>(x.val[j] * rms) * w.val[j]));
-    }
-  }
-
-  using BlockReduce = cub::BlockReduce<float, 1024>;
-  __shared__ typename BlockReduce::TempStorage reduceStore;
-  block_absmax_val_maybe =
-      BlockReduce(reduceStore)
-          .Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x);
-
-  __shared__ float s_token_scale;
-  if (threadIdx.x == 0) {
-    float scale = 0.0f;
-    if (scale_ub) {
-      scale = min(block_absmax_val_maybe, *scale_ub);
-    } else {
-      scale = block_absmax_val_maybe;
-    }
-    // token scale computation
-    scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
-    s_token_scale = scale;                 // shared memory store
-    all_token_scales[blockIdx.x] = scale;  // global output store
-  }
-  __syncthreads();
-
-  *token_scale = s_token_scale;
+  // Vectorized input/weight/residual to better utilize memory bandwidth.
+  vec4_t<scalar_t> const* vec_input = nullptr;
+  vec4_t<scalar_t> const* vec_weight = nullptr;
+  vec4_t<scalar_t> const* vec_residual = nullptr;
+
+  if constexpr (group_size > 0) {
+    __shared__ float s_max_vals[1024];
+
+    int64_t const token_offset =
+        blockIdx.x * static_cast<int64_t>(hidden_size);
+    int64_t const num_groups = hidden_size / group_size;
+    int64_t const threads_per_group = blockDim.x / num_groups;
+    int64_t const thread_in_group = threadIdx.x % threads_per_group;
+    int64_t const group_offset =
+        threadIdx.x / threads_per_group * (group_size >> 2);
+    int64_t const thread_offset = group_offset + thread_in_group;
+    int64_t const thread_end = min(group_offset + (group_size >> 2),
+                                   static_cast<int64_t>(hidden_size >> 2));
+    vec_input =
+        reinterpret_cast<vec4_t<scalar_t> const*>(&input[token_offset]);
+    vec_weight = reinterpret_cast<vec4_t<scalar_t> const*>(weight);
+    if constexpr (has_residual) {
+      vec_residual =
+          reinterpret_cast<vec4_t<scalar_t> const*>(&residual[token_offset]);
+    }
+    int32_t const num_vec_elems = thread_end;
+
+#pragma unroll 4
+    for (auto i = thread_offset; i < num_vec_elems; i += threads_per_group) {
+      vec4_t<scalar_t> in = vec_input[i];
+      vec4_t<scalar_t> const w = vec_weight[i];
+
+      vec4_t<float> x;
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; ++j) {
+        x.val[j] = static_cast<float>(in.val[j]);
+      }
+
+      if constexpr (has_residual) {
+        vec4_t<scalar_t> r = vec_residual[i];
+#pragma unroll
+        for (int j = 0; j < VEC_SIZE; ++j) {
+          x.val[j] += static_cast<float>(r.val[j]);
+        }
+      }
+
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; ++j) {
+        block_absmax_val_maybe =
+            fmaxf(block_absmax_val_maybe,
+                  fabs(static_cast<scalar_t>(x.val[j] * rms) * w.val[j]));
+      }
+    }
+
+    s_max_vals[threadIdx.x] = block_absmax_val_maybe;
+    __syncthreads();
+
+    int64_t const warp_size = WARP_SIZE;
+    int64_t const num_warps = blockDim.x / warp_size;
+    int64_t const warp_id = threadIdx.x / warp_size;
+    int64_t const thread_in_warp = threadIdx.x % warp_size;
+    int64_t const groups_per_warp = (num_groups + num_warps - 1) / num_warps;
+    for (auto i = 0; i < groups_per_warp; ++i) {
+      int64_t const group_id = i * num_warps + warp_id;
+      if (group_id < num_groups) {
+        int64_t warp_start = group_id * threads_per_group;
+        int64_t const start = warp_start + thread_in_warp;
+        int64_t const warp_end = min(warp_start + threads_per_group,
+                                     static_cast<int64_t>(hidden_size));
+        for (auto j = start; j + warp_size < warp_end; j += warp_size) {
+          s_max_vals[start] =
+              fmaxf(s_max_vals[start], s_max_vals[j + warp_size]);
+        }
+        warpReduceMaxSpecialized(s_max_vals, start, thread_in_warp,
+                                 min(warp_end - warp_start, warp_size));
+      }
+    }
+    __syncthreads();
+
+    if (thread_in_group == 0 && thread_offset < thread_end) {
+      block_absmax_val_maybe = s_max_vals[threadIdx.x];
+      float scale = 0.0f;
+      if (scale_ub) {
+        scale = min(block_absmax_val_maybe, *scale_ub);
+      } else {
+        scale = block_absmax_val_maybe;
+      }
+      // token scale computation
+      scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
+      // Global output store
+      if constexpr (is_scale_transposed) {
+        all_token_scales[(threadIdx.x / threads_per_group) * gridDim.x +
+                         blockIdx.x] = scale;
+      } else {
+        all_token_scales[blockIdx.x * num_groups +
+                         threadIdx.x / threads_per_group] = scale;
+      }
+    }
+    __syncthreads();
+
+  } else {
+    int64_t const token_offset =
+        blockIdx.x * static_cast<int64_t>(hidden_size);
+    vec_input =
+        reinterpret_cast<vec4_t<scalar_t> const*>(&input[token_offset]);
+    vec_weight = reinterpret_cast<vec4_t<scalar_t> const*>(weight);
+    if constexpr (has_residual) {
+      vec_residual =
+          reinterpret_cast<vec4_t<scalar_t> const*>(&residual[token_offset]);
+    }
+
+    int32_t const num_vec_elems = (hidden_size >> 2);
+
+#pragma unroll 4
+    for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
+      vec4_t<scalar_t> in = vec_input[i];
+      vec4_t<scalar_t> const w = vec_weight[i];
+
+      vec4_t<float> x;
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; ++j) {
+        x.val[j] = static_cast<float>(in.val[j]);
+      }
+
+      if constexpr (has_residual) {
+        vec4_t<scalar_t> r = vec_residual[i];
+#pragma unroll
+        for (int j = 0; j < VEC_SIZE; ++j) {
+          x.val[j] += static_cast<float>(r.val[j]);
+        }
+      }
+
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; ++j) {
+        block_absmax_val_maybe =
+            fmaxf(block_absmax_val_maybe,
+                  fabs(static_cast<scalar_t>(x.val[j] * rms) * w.val[j]));
+      }
+    }
+
+    using BlockReduce = cub::BlockReduce<float, 1024>;
+    __shared__ typename BlockReduce::TempStorage reduceStore;
+    block_absmax_val_maybe =
+        BlockReduce(reduceStore)
+            .Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x);
+
+    __shared__ float s_token_scale;
+    if (threadIdx.x == 0) {
+      float scale = 0.0f;
+      if (scale_ub) {
+        scale = min(block_absmax_val_maybe, *scale_ub);
+      } else {
+        scale = block_absmax_val_maybe;
+      }
+      // token scale computation
+      scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
+      s_token_scale = scale;                 // shared memory store
+      all_token_scales[blockIdx.x] = scale;  // global output store
+    }
+    __syncthreads();
+
+    *token_scale = s_token_scale;
+  }
 }

 // hidden_size must be a multiple of 4
 template <typename scalar_t, typename scalar_out_t, bool is_scale_inverted,
-          bool has_residual = false>
+          bool has_residual = false, bool is_scale_transposed = false,
+          int32_t group_size = 0>
 __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
                                scalar_t const* __restrict__ input,
                                scalar_t const* __restrict__ weight,
-                               float const rms, float const scale,
+                               float const rms, float* const scale,
                                int32_t const hidden_size,
                                scalar_t* __restrict__ residual = nullptr) {
   int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
-  ;

   // Vectorized input/output/weight/residual to better utilize memory bandwidth.
   vec4_t<scalar_t> const* vec_input =
@@ -311,10 +509,26 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
     }

     q8x4_t<scalar_out_t> out;
+
+    float scale_val;
+
+    if constexpr (group_size > 0) {
+      int64_t const num_groups = hidden_size / group_size;
+      int64_t scale_idx = 0;
+      if constexpr (is_scale_transposed) {
+        scale_idx = (i * VEC_SIZE / group_size) * gridDim.x + blockIdx.x;
+      } else {
+        scale_idx = blockIdx.x * num_groups + i * VEC_SIZE / group_size;
+      }
+      scale_val =
+          is_scale_inverted ? 1.0f / scale[scale_idx] : scale[scale_idx];
+    } else {
+      scale_val = *scale;
+    }
 #pragma unroll
     for (int j = 0; j < VEC_SIZE; ++j) {
       out.val[j] = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
-          static_cast<scalar_t>(x.val[j] * rms) * w.val[j], scale);
+          static_cast<scalar_t>(x.val[j] * rms) * w.val[j], scale_val);
     }
     vec_output[i] = out;
   }
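For intuition, the two scale layouts indexed above can be modeled on the host; an illustrative helper (not from this change), where `token` stands for blockIdx.x and `num_tokens` for gridDim.x:

#include <cstdint>

static int64_t scale_index(bool transposed, int64_t token, int64_t elem,
                           int64_t hidden_size, int64_t group_size,
                           int64_t num_tokens) {
  int64_t const group = elem / group_size;
  int64_t const num_groups = hidden_size / group_size;
  // Transposed: scales[group][token]; otherwise scales[token][group].
  return transposed ? group * num_tokens + token : token * num_groups + group;
}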
@@ -617,7 +617,7 @@ struct MacheteCollectiveMma {

 // Same as upstream, should be kept the same when possible, not formatted for
 // easier comparison
-// with `SwapAB ? N : M -> M` since we dont support SwapAB
+// with `SwapAB ? N : M -> M` since we don't support SwapAB
 // clang-format off
 template<class ProblemShape>
 static bool
@@ -136,15 +136,17 @@ inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids,
 void get_cutlass_moe_mm_problem_sizes_caller(
     const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
     torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
-    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets) {
+    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
+    std::optional<bool> force_swap_ab = std::nullopt) {
   auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index());
   auto options_int32 =
       torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device());
   torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32);

   // Swap-AB should be disabled for FP4 path
-  bool may_swap_ab = (!blockscale_offsets.has_value()) &&
-                     (topk_ids.numel() <= SWAP_AB_THRESHOLD);
+  bool may_swap_ab =
+      force_swap_ab.value_or((!blockscale_offsets.has_value()) &&
+                             (topk_ids.numel() <= SWAP_AB_THRESHOLD));

   launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2,
                                atomic_buffer, num_experts, n, k, stream,
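A small illustration (not from this change) of the value_or semantics introduced above: an explicit force_swap_ab overrides the size-based default:

#include <cstdint>
#include <optional>

static bool resolve_swap_ab(std::optional<bool> force_swap_ab, bool fp4_path,
                            int64_t topk_numel, int64_t threshold) {
  // An explicit force_swap_ab wins; otherwise fall back to the size check.
  return force_swap_ab.value_or(!fp4_path && topk_numel <= threshold);
}
// resolve_swap_ab(std::nullopt, false, 100, 1000) -> true  (heuristic)
// resolve_swap_ab(false, false, 100, 1000)        -> false (forced off)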
@@ -80,7 +80,8 @@ void get_cutlass_moe_mm_data_caller(
 void get_cutlass_moe_mm_problem_sizes_caller(
     const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
     torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
-    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets);
+    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
+    std::optional<bool> force_swap_ab = std::nullopt);

 void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
                                          torch::Tensor& problem_sizes1,
@@ -303,14 +304,15 @@ void get_cutlass_moe_mm_data(
 void get_cutlass_moe_mm_problem_sizes(
     const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
     torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
-    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets) {
+    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
+    std::optional<bool> force_swap_ab = std::nullopt) {
   int32_t version_num = get_sm_version_num();
 #if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) ||   \
     (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
     (defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
   get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1,
                                           problem_sizes2, num_experts, n, k,
-                                          blockscale_offsets);
+                                          blockscale_offsets, force_swap_ab);
   return;
 #endif
   TORCH_CHECK_NOT_IMPLEMENTED(
@ -22,6 +22,62 @@ __device__ __forceinline__ float GroupReduceMax(float val) {
|
|||||||
return val;
|
return val;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename T, bool SCALE_UE8M0>
|
||||||
|
__device__ __forceinline__ float ComputeGroupScale(
|
||||||
|
const T* __restrict__ group_input, T* __restrict__ smem_group,
|
||||||
|
const int group_size, const int lane_id, const int threads_per_group,
|
||||||
|
const float eps, const float max_8bit) {
|
||||||
|
float local_absmax = eps;
|
||||||
|
|
||||||
|
constexpr int vec_size = 16 / sizeof(T);
|
||||||
|
|
||||||
|
// copy global -> shared & compute absmax
|
||||||
|
auto scalar_op_cache = [&] __device__(T & dst, const T& src) {
|
||||||
|
float abs_v = fabsf(static_cast<float>(src));
|
||||||
|
local_absmax = fmaxf(local_absmax, abs_v);
|
||||||
|
dst = src;
|
||||||
|
};
|
||||||
|
|
||||||
|
vllm::vectorize_with_alignment<vec_size>(
|
||||||
|
group_input, // in
|
||||||
|
smem_group, // out (shared)
|
||||||
|
group_size, // elements per group
|
||||||
|
lane_id, // thread id
|
||||||
|
threads_per_group, // stride in group
|
||||||
|
scalar_op_cache); // scalar handler
|
||||||
|
|
||||||
|
local_absmax = GroupReduceMax(local_absmax);
|
||||||
|
|
||||||
|
float y_s = local_absmax / max_8bit;
|
||||||
|
if constexpr (SCALE_UE8M0) {
|
||||||
|
y_s = exp2f(ceilf(log2f(fmaxf(fabsf(y_s), 1e-10f))));
|
||||||
|
}
|
||||||
|
|
||||||
|
return y_s;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, typename DST_DTYPE>
|
||||||
|
__device__ __forceinline__ void QuantizeGroup(
|
||||||
|
const T* __restrict__ smem_group, DST_DTYPE* __restrict__ group_output,
|
||||||
|
const int group_size, const int lane_id, const int threads_per_group,
|
||||||
|
const float y_s, const float min_8bit, const float max_8bit) {
|
||||||
|
constexpr int vec_size = 16 / sizeof(T);
|
||||||
|
|
||||||
|
// quantize shared -> global 8-bit
|
||||||
|
auto scalar_op_quant = [&] __device__(DST_DTYPE & dst, const T& src) {
|
||||||
|
float q = fminf(fmaxf(static_cast<float>(src) / y_s, min_8bit), max_8bit);
|
||||||
|
dst = DST_DTYPE(q);
|
||||||
|
};
|
||||||
|
|
||||||
|
vllm::vectorize_with_alignment<vec_size>(
|
||||||
|
smem_group, // in (shared)
|
||||||
|
group_output, // out (global quant tensor)
|
||||||
|
group_size, // elements
|
||||||
|
lane_id, // tid
|
||||||
|
threads_per_group, // stride
|
||||||
|
scalar_op_quant); // scalar handler
|
||||||
|
}
|
||||||
|
|
||||||
template <typename T, typename DST_DTYPE, bool IS_COLUMN_MAJOR = false,
|
template <typename T, typename DST_DTYPE, bool IS_COLUMN_MAJOR = false,
|
||||||
bool SCALE_UE8M0 = false, typename scale_packed_t = float>
|
bool SCALE_UE8M0 = false, typename scale_packed_t = float>
|
||||||
__global__ void per_token_group_quant_8bit_kernel(
|
__global__ void per_token_group_quant_8bit_kernel(
|
||||||
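ComputeGroupScale's SCALE_UE8M0 branch rounds the scale up to the next power of two so only an 8-bit exponent must be stored later. A standalone sketch of that rounding, illustrative only:

#include <cassert>
#include <cmath>

// Host-side mirror of the kernel's exp2f(ceilf(log2f(...))) chain.
static float ue8m0_round(float y_s) {
  return std::exp2(std::ceil(std::log2(std::fmax(std::fabs(y_s), 1e-10f))));
}

int main() {
  assert(ue8m0_round(0.75f) == 1.0f);  // log2(0.75) < 0, ceil -> 0, 2^0 = 1
  assert(ue8m0_round(1.0f) == 1.0f);   // already a power of two
  assert(ue8m0_round(3.0f) == 4.0f);   // rounds up to 2^2
}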
@@ -38,8 +94,6 @@ __global__ void per_token_group_quant_8bit_kernel(
   const int64_t global_group_id = block_group_id + local_group_id;
   const int64_t block_group_offset = global_group_id * group_size;
 
-  float local_absmax = eps;
-
   using scale_element_t = float;
   static_assert(sizeof(scale_packed_t) % sizeof(scale_element_t) == 0);
 
@@ -68,30 +122,9 @@ __global__ void per_token_group_quant_8bit_kernel(
   T* smem = reinterpret_cast<T*>(smem_raw);
   T* smem_group = smem + local_group_id * group_size;
 
-  constexpr int vec_size = 16 / sizeof(T);
-  using vec_t = vllm::vec_n_t<T, vec_size>;
-
-  // copy global -> shared & compute absmax
-  auto scalar_op_cache = [&] __device__(T & dst, const T& src) {
-    float abs_v = fabsf(static_cast<float>(src));
-    local_absmax = fmaxf(local_absmax, abs_v);
-    dst = src;
-  };
-
-  vllm::vectorize_with_alignment<vec_size>(
-      group_input,        // in
-      smem_group,         // out (shared)
-      group_size,         // elements per group
-      lane_id,            // thread id
-      threads_per_group,  // stride in group
-      scalar_op_cache);   // scalar handler
-
-  local_absmax = GroupReduceMax(local_absmax);
-
-  float y_s = local_absmax / max_8bit;
-  if constexpr (SCALE_UE8M0) {
-    y_s = exp2f(ceilf(log2f(fmaxf(fabsf(y_s), 1e-10f))));
-  }
+  const float y_s = ComputeGroupScale<T, SCALE_UE8M0>(
+      group_input, smem_group, group_size, lane_id, threads_per_group, eps,
+      max_8bit);
 
   scale_element_t y_s_quant = y_s;
@@ -101,19 +134,24 @@ __global__ void per_token_group_quant_8bit_kernel(
 
   __syncthreads();
 
-  // quantize shared -> global 8-bit
-  auto scalar_op_quant = [&] __device__(DST_DTYPE & dst, const T& src) {
-    float q = fminf(fmaxf(static_cast<float>(src) / y_s, min_8bit), max_8bit);
-    dst = DST_DTYPE(q);
-  };
-
-  vllm::vectorize_with_alignment<vec_size>(
-      smem_group,         // in (shared)
-      group_output,       // out (global quant tensor)
-      group_size,         // elements
-      lane_id,            // tid
-      threads_per_group,  // stride
-      scalar_op_quant);   // scalar handler
+  QuantizeGroup<T, DST_DTYPE>(smem_group, group_output, group_size, lane_id,
+                              threads_per_group, y_s, min_8bit, max_8bit);
+}
+
+inline int GetGroupsPerBlock(int64_t num_groups) {
+  if (num_groups % 16 == 0) {
+    return 16;
+  }
+  if (num_groups % 8 == 0) {
+    return 8;
+  }
+  if (num_groups % 4 == 0) {
+    return 4;
+  }
+  if (num_groups % 2 == 0) {
+    return 2;
+  }
+  return 1;
 }
 
 void per_token_group_quant_8bit(const torch::Tensor& input,
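GetGroupsPerBlock picks the largest power of two up to 16 that divides num_groups, so the launch that consumes it never leaves a remainder block. A standalone replay of the resulting launch geometry; the values are made up for illustration:

#include <cassert>
#include <cstdint>

// Host-side mirror of GetGroupsPerBlock's divisibility ladder.
static int groups_per_block_of(int64_t num_groups) {
  for (int g = 16; g > 1; g /= 2) {
    if (num_groups % g == 0) return g;
  }
  return 1;
}

int main() {
  constexpr int kThreadsPerGroup = 16;
  int64_t num_groups = 24;  // e.g. 3 rows with 8 quant groups each
  int gpb = groups_per_block_of(num_groups);            // 24 % 16 != 0 -> 8
  int num_blocks = static_cast<int>(num_groups) / gpb;  // 3 blocks, exact
  int num_threads = gpb * kThreadsPerGroup;             // 128 threads/block
  assert(gpb == 8 && num_blocks == 3 && num_threads == 128);
}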
@@ -133,17 +171,7 @@ void per_token_group_quant_8bit(const torch::Tensor& input,
 
   constexpr int THREADS_PER_GROUP = 16;
 
-  int groups_per_block = 1;
-
-  if (num_groups % 16 == 0) {
-    groups_per_block = 16;
-  } else if (num_groups % 8 == 0) {
-    groups_per_block = 8;
-  } else if (num_groups % 4 == 0) {
-    groups_per_block = 4;
-  } else if (num_groups % 2 == 0) {
-    groups_per_block = 2;
-  }
+  const int groups_per_block = GetGroupsPerBlock(num_groups);
 
   auto dst_type = output_q.scalar_type();
   const int num_blocks = num_groups / groups_per_block;
@@ -206,6 +234,148 @@ void per_token_group_quant_8bit(const torch::Tensor& input,
 #undef LAUNCH_KERNEL
 }
 
+template <typename T, typename DST_DTYPE>
+__global__ void per_token_group_quant_8bit_packed_kernel(
+    const T* __restrict__ input, void* __restrict__ output_q,
+    unsigned int* __restrict__ output_s_packed, const int group_size,
+    const int num_groups, const int groups_per_block, const int groups_per_row,
+    const int mn, const int tma_aligned_mn, const float eps,
+    const float min_8bit, const float max_8bit) {
+  const int threads_per_group = 16;
+  const int64_t local_group_id = threadIdx.x / threads_per_group;
+  const int lane_id = threadIdx.x % threads_per_group;
+
+  const int64_t block_group_id = blockIdx.x * groups_per_block;
+  const int64_t global_group_id = block_group_id + local_group_id;
+  if (global_group_id >= num_groups) {
+    return;
+  }
+
+  const int64_t block_group_offset = global_group_id * group_size;
+
+  const T* group_input = input + block_group_offset;
+  DST_DTYPE* group_output =
+      static_cast<DST_DTYPE*>(output_q) + block_group_offset;
+
+  // shared memory to cache each group's data to avoid double DRAM reads.
+  extern __shared__ __align__(16) char smem_raw[];
+  T* smem = reinterpret_cast<T*>(smem_raw);
+  T* smem_group = smem + local_group_id * group_size;
+  const float y_s =
+      ComputeGroupScale<T, true>(group_input, smem_group, group_size, lane_id,
+                                 threads_per_group, eps, max_8bit);
+
+  // pack 4 scales into a uint32
+  if (lane_id == 0) {
+    // map flat group id to 2D indices (mn_idx, sf_k_idx)
+    const int sf_k_idx = static_cast<int>(global_group_id % groups_per_row);
+    const int mn_idx = static_cast<int>(global_group_id / groups_per_row);
+
+    if (mn_idx < mn) {
+      // each uint32 in output_s_packed stores 4 packed scales
+      const int sf_k_pack_idx = sf_k_idx / 4;
+      const int pos = sf_k_idx % 4;
+
+      // reinterpret the UE8M0 scale y_s as IEEE bits, extract the 8-bit
+      // exponent, and place it into the correct byte of the 32-bit word.
+      const unsigned int bits = __float_as_uint(y_s);
+      const unsigned int exponent = (bits >> 23u) & 0xffu;
+      const unsigned int contrib = exponent << (pos * 8u);
+
+      const int out_idx = sf_k_pack_idx * tma_aligned_mn + mn_idx;
+      // atomically OR 8-bit exponent into the packed scales buffer
+      atomicOr(output_s_packed + out_idx, contrib);
+    }
+  }
+
+  __syncthreads();
+
+  QuantizeGroup<T, DST_DTYPE>(smem_group, group_output, group_size, lane_id,
+                              threads_per_group, y_s, min_8bit, max_8bit);
+}
+
+void per_token_group_quant_8bit_packed(const torch::Tensor& input,
+                                       torch::Tensor& output_q,
+                                       torch::Tensor& output_s_packed,
+                                       int64_t group_size, double eps,
+                                       double min_8bit, double max_8bit) {
+  TORCH_CHECK(input.is_contiguous());
+  TORCH_CHECK(output_q.is_contiguous());
+
+  const int64_t k = input.size(-1);
+  TORCH_CHECK(k % group_size == 0, "Last dimension (", k,
+              ") must be divisible by group_size (", group_size, ").");
+
+  const int64_t mn = input.numel() / k;
+  const int64_t groups_per_row = k / group_size;
+  const int64_t num_groups = mn * groups_per_row;
+
+  TORCH_CHECK(output_s_packed.dim() == 2,
+              "output_s_packed must be 2D, got dim=", output_s_packed.dim(),
+              ".");
+
+  const int64_t k_num_packed_sfk = (groups_per_row + 3) / 4;
+  const int64_t tma_aligned_mn = ((mn + 3) / 4) * 4;
+
+  TORCH_CHECK(output_s_packed.scalar_type() == at::ScalarType::Int,
+              "output_s_packed must have dtype int32 for UE8M0-packed scales.");
+  // DeepGEMM expects SFA scales in MN-major form with shape
+  // [mn, ceil_div(K, 128 * 4)] and TMA-aligned stride on the last
+  // dimension.
+  TORCH_CHECK(output_s_packed.size(0) == mn &&
+                  output_s_packed.size(1) == k_num_packed_sfk,
+              "output_s_packed shape must be [", mn, ", ", k_num_packed_sfk,
+              "], but got [", output_s_packed.size(0), ", ",
+              output_s_packed.size(1), "].");
+
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  constexpr int THREADS_PER_GROUP = 16;
+
+  const int groups_per_block = GetGroupsPerBlock(num_groups);
+
+  auto dst_type = output_q.scalar_type();
+  const int num_blocks = num_groups / groups_per_block;
+  const int num_threads = groups_per_block * THREADS_PER_GROUP;
+
+  // zero-initialize packed scales, since we use atomicOr to accumulate
+  // exponents from different groups.
+  output_s_packed.zero_();
+
+#define LAUNCH_PACKED_KERNEL(T, DST_DTYPE)                                \
+  do {                                                                    \
+    dim3 grid(num_blocks);                                                \
+    dim3 block(num_threads);                                              \
+    size_t smem_bytes =                                                   \
+        static_cast<size_t>(groups_per_block) * group_size * sizeof(T);   \
+    per_token_group_quant_8bit_packed_kernel<T, DST_DTYPE>                \
+        <<<grid, block, smem_bytes, stream>>>(                            \
+            static_cast<const T*>(input.data_ptr()), output_q.data_ptr(), \
+            reinterpret_cast<unsigned int*>(output_s_packed.data_ptr()),  \
+            static_cast<int>(group_size), static_cast<int>(num_groups),   \
+            groups_per_block, static_cast<int>(groups_per_row),           \
+            static_cast<int>(mn), static_cast<int>(tma_aligned_mn),       \
+            static_cast<float>(eps), static_cast<float>(min_8bit),        \
+            static_cast<float>(max_8bit));                                \
+  } while (0)
+
+  VLLM_DISPATCH_FLOATING_TYPES(
+      input.scalar_type(), "per_token_group_quant_8bit_packed", ([&] {
+        if (dst_type == at::ScalarType::Float8_e4m3fn) {
+          LAUNCH_PACKED_KERNEL(scalar_t, __nv_fp8_e4m3);
+        } else if (dst_type == at::ScalarType::Char) {
+          LAUNCH_PACKED_KERNEL(scalar_t, int8_t);
+        } else {
+          TORCH_CHECK(
+              false,
+              "per_token_group_quant_8bit_packed only supports FP8/INT8 "
+              "outputs.");
+        }
+      }));
+
+#undef LAUNCH_PACKED_KERNEL
+}
+
 void per_token_group_quant_fp8(const torch::Tensor& input,
                                torch::Tensor& output_q, torch::Tensor& output_s,
                                int64_t group_size, double eps, double fp8_min,
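The atomicOr path above deposits each group's UE8M0 exponent into one byte of a shared uint32. A standalone host-side replay of that packing, with illustrative scale values:

#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t float_bits(float x) {
  uint32_t b;
  std::memcpy(&b, &x, sizeof(b));
  return b;
}

int main() {
  const float scales[4] = {1.0f, 2.0f, 0.5f, 4.0f};  // all powers of two
  uint32_t packed = 0;
  for (int pos = 0; pos < 4; ++pos) {
    uint32_t exponent = (float_bits(scales[pos]) >> 23) & 0xffu;
    packed |= exponent << (pos * 8);  // same placement as the atomicOr path
  }
  // biased exponents: 1.0 -> 127, 2.0 -> 128, 0.5 -> 126, 4.0 -> 129
  assert(packed == (127u | (128u << 8) | (126u << 16) | (129u << 24)));
}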
@@ -1241,33 +1241,16 @@ __global__ void wvSplitK_hf_big_(const int K, const int M, const int Bx,
 }
 #endif  // defined(__HIP__GFX9__) TODO: Add NAVI support
 
+// Find the min val of div2 that doesn't increase N/(div1*div2)
 int mindiv(int N, int div1, int div2) {
   int nPrRnd = div1 * div2;
-  int rnds0 = N / nPrRnd;
-  nPrRnd -= div1 * 3;
-  int rnds3 = N / nPrRnd;
-  nPrRnd -= div1;
-  int rnds4 = N / nPrRnd;
-  nPrRnd -= div1;
-  int rnds5 = N / nPrRnd;
-  nPrRnd -= div1;
-  int rnds6 = N / nPrRnd;
-  nPrRnd -= div1;
-  int rnds7 = N / nPrRnd;
-  nPrRnd -= div1;
-  int rnds8 = N / nPrRnd;
-  nPrRnd -= div1;
-  int rnds9 = N / nPrRnd;
-  nPrRnd -= div1;
-  int rtn = div2;
-  if (rnds0 == rnds3) rtn = div2 - 3;
-  if (rnds0 == rnds4) rtn = div2 - 4;
-  if (rnds0 == rnds5) rtn = div2 - 5;
-  if (rnds0 == rnds6) rtn = div2 - 6;
-  if (rnds0 == rnds7) rtn = div2 - 7;
-  if (rnds0 == rnds8) rtn = div2 - 8;
-  if (rnds0 == rnds9) rtn = div2 - 9;
-  return rtn;
+  int rnds[13];
+  for (int i = 0; i < 13; i++) {
+    rnds[i] = (N + nPrRnd - 1) / nPrRnd;
+    nPrRnd -= div1;
+  }
+  for (int i = 12; i >= 0; i--)
+    if (rnds[0] == rnds[i]) return (div2 - i);
 }
 
 torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
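The rewritten mindiv searches downward for the smallest wave count div2 - i that still needs the same number of rounds as the full div1 * div2 configuration. A standalone replay with made-up numbers:

#include <cassert>

static int mindiv_ref(int N, int div1, int div2) {
  int nPrRnd = div1 * div2;
  int rnds[13];
  for (int i = 0; i < 13; i++) {
    rnds[i] = (N + nPrRnd - 1) / nPrRnd;  // ceil-divide, as in the new loop
    nPrRnd -= div1;
  }
  for (int i = 12; i >= 0; i--)
    if (rnds[0] == rnds[i]) return (div2 - i);
  return div2;  // unreachable: the i == 0 comparison always matches
}

int main() {
  // N = 100 rows of work, div1 = 8 tiles, div2 = 16 waves:
  // 8 * 16 = 128 slots cover N in one round, and so does 8 * 13 = 104,
  // so the waves can be cut from 16 down to 16 - 3 = 13.
  assert(mindiv_ref(100, 8, 16) == 13);
}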
@@ -1300,26 +1283,37 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   const int max_lds_len = get_lds_size() / 2;
 
-#define WVSPLITK(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \
-                 _N)                                                          \
-  {                                                                           \
-    dim3 block(64, _WvPrGrp);                                                 \
-    if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) {              \
-      int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp);              \
-      wvSplitK_hf_sml_<fptype, 64, _YTILEs, _WvPrGrp, 8, _UNRLs, _N>          \
-          <<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4,    \
-                                       biasf4, c, __wvPrGrp, CuCount);        \
-    } else if (K_in * N_in <= max_lds_len * 1.2) {                            \
-      int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp);              \
-      wvSplitK_hf_<fptype, 64, _YTILEm, _WvPrGrp, 8, _UNRLm, _N>              \
-          <<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4,    \
-                                       biasf4, c, __wvPrGrp, CuCount);        \
-    } else {                                                                  \
-      int __wvPrGrp = mindiv(M_in, CuCount * _YTILEb, _WvPrGrp);              \
-      wvSplitK_hf_big_<fptype, 64, _YTILEb, _WvPrGrp, 8, _UNRLb, _N>          \
-          <<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4,    \
-                                       biasf4, c, __wvPrGrp, CuCount);        \
-    }                                                                         \
+#define WVSPLITK(_YTILE, _UNRL, _N)                                        \
+  {                                                                        \
+    dim3 block(64, 16);                                                    \
+    int __wvPrGrp = mindiv(M_in, CuCount * _YTILE, 16);                    \
+    if ((K_in * N_in <= max_lds_len) && (M_in % _YTILE == 0))              \
+      wvSplitK_hf_sml_<fptype, 64, _YTILE, 16, 8, _UNRL, _N>               \
+          <<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
+                                       biasf4, c, __wvPrGrp, CuCount);     \
+    else if (K_in * N_in <= max_lds_len * 1.2)                             \
+      wvSplitK_hf_<fptype, 64, _YTILE, 16, 8, _UNRL, _N>                   \
+          <<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
+                                       biasf4, c, __wvPrGrp, CuCount);     \
+    else                                                                   \
+      wvSplitK_hf_big_<fptype, 64, _YTILE, 16, 8, _UNRL, _N>               \
+          <<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
+                                       biasf4, c, __wvPrGrp, CuCount);     \
+  }
+
+#define WVSPLIT_TILE(_sYT, __N)                           \
+  {                                                       \
+    bool fit_lds = (K_in * N_in <= max_lds_len);          \
+    if (_sYT <= 1)                                        \
+      WVSPLITK(1, 4, __N)                                 \
+    else if ((__N == 1) || (!fit_lds) || (_sYT <= 4 * 2)) \
+      WVSPLITK(2, 2, __N)                                 \
+    else if (_sYT <= 4 * 3)                               \
+      WVSPLITK(3, 2, __N)                                 \
+    else if (__N == 4)                                    \
+      WVSPLITK(4, 1, __N)                                 \
+    else                                                  \
+      WVSPLITK(4, 2, __N)                                 \
   }
 
   AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "wvSplitK", [&] {
@@ -1331,18 +1325,23 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
             ? reinterpret_cast<const fptype*>(in_bias->data_ptr())
             : nullptr;
     fptype* c = reinterpret_cast<fptype*>(out_c.data_ptr());
 
+    // first shoot for biggest tile-size that keeps all simd busy,
+    // then cut the active waves to balance their distribution...
+    int sYT = (M_in + CuCount * 4 - 1) / (CuCount * 4);
+
     switch (N_in) {
       case 1:
-        WVSPLITK(16, 2, 2, 2, 2, 2, 2, 1)
+        WVSPLIT_TILE(sYT, 1)
        break;
       case 2:
-        WVSPLITK(16, 2, 2, 2, 2, 2, 2, 2)
+        WVSPLIT_TILE(sYT, 2)
        break;
       case 3:
-        WVSPLITK(16, 4, 7, 7, 1, 1, 1, 3)
+        WVSPLIT_TILE(sYT, 3)
        break;
       case 4:
-        WVSPLITK(16, 4, 7, 7, 1, 1, 1, 4)
+        WVSPLIT_TILE(sYT, 4)
        break;
       default:
        throw std::runtime_error(
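The new dispatch first sizes a per-CU tile from M and the CU count, then WVSPLIT_TILE maps that size and the N width onto a (_YTILE, _UNRL) pair. A host-side replay of the branch order; the function name is hypothetical:

#include <cassert>
#include <utility>

static std::pair<int, int> pick_tile(int sYT, int N, bool fit_lds) {
  if (sYT <= 1) return {1, 4};
  if (N == 1 || !fit_lds || sYT <= 4 * 2) return {2, 2};
  if (sYT <= 4 * 3) return {3, 2};
  if (N == 4) return {4, 1};
  return {4, 2};
}

int main() {
  int M_in = 4096, CuCount = 80;
  int sYT = (M_in + CuCount * 4 - 1) / (CuCount * 4);  // = 13
  // 13 exceeds every small-tile threshold, so the widest tile is chosen.
  assert(pick_tile(sYT, /*N=*/2, /*fit_lds=*/true) == std::make_pair(4, 2));
}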
743
csrc/sampler.cu
@@ -44,41 +44,300 @@ __global__ void apply_repetition_penalties_kernel(
   }
 }
 
-static inline __device__ uint16_t extractBinIdx(float x) {
-  union {
-    __half h;
-    uint16_t u16;
-  } tmp;
-  tmp.h = __float2half_rn(x);
-  tmp.u16 = (x < 0.f) ? (~tmp.u16 & 0xffff) : (tmp.u16 | 0x8000);
-  return 511 - (tmp.u16 >> 7);
+__device__ __forceinline__ auto convert_to_uint32(float x) -> uint32_t {
+  uint32_t bits = __float_as_uint(x);
+  return (bits & 0x80000000) ? bits : ~bits & 0x7fffffff;
 }
 
-template <int kNumThreadsPerBlock = 512, int kNumBins = 512, int kTopK = 2048>
-__device__ void topKPerRowJob(const float* logits, const int rowStart,
-                              const int rowEnd, const int rowIdx,
-                              int* outIndices, int stride0, int stride1) {
-  // The number of elements per thread for the final top-k sort.
-  static constexpr int kNumTopKItemsPerThread = kTopK / kNumThreadsPerBlock;
-  // The class to sort the elements during the final top-k sort.
-  using TopKSort = cub::BlockRadixSort<float, kNumThreadsPerBlock,
-                                       kNumTopKItemsPerThread, int>;
+template <int step>
+static inline __device__ uint32_t extractBinIdx(float x) {
+  if constexpr (step == 0) {
+    __half hx = __float2half(x);
+    uint16_t bits = __half_as_ushort(hx);
+    bits = (bits & 0x8000) ? bits : ~bits & 0x7fff;
+    return bits >> 5;
+  } else {
+    uint32_t bits = __float_as_uint(x);
+    bits = (bits & 0x80000000) ? bits : ~bits & 0x7fffffff;
+
+    if constexpr (step == 1) {
+      return bits >> 21;
+    } else if constexpr (step == 2) {
+      return (bits >> 10) & 0x7ff;
+    } else if constexpr (step == 3) {
+      return bits & 0x3ff;
+    }
+  }
+}
+
+template <int shift>
+static inline __device__ bool isPartialMatch(float x, uint32_t pattern) {
+  if constexpr (shift == 0) {
+    return true;
+  }
+  uint32_t bits = __float_as_uint(x);
+  bits = (bits & 0x80000000) ? bits : ~bits & 0x7fffffff;
+  return (bits ^ pattern) >> shift == 0;
+}
+
+/**
+ * Map a Func over the input data, using vectorized load instructions if
+ * possible.
+ *
+ * @tparam T element type
+ * @tparam IdxT indexing type
+ * @tparam Func void (T x, IdxT idx)
+ *
+ * @param thread_rank rank of the calling thread among all participating threads
+ * @param num_threads number of the threads that participate in processing
+ * @param in the input data
+ * @param len the number of elements to read
+ * @param f the lambda taking two arguments (T x, IdxT idx)
+ */
+template <typename T, typename idxT, typename Func>
+__device__ void vectorized_process(size_t thread_rank, size_t num_threads,
+                                   const T* in, idxT len, Func f) {
+  constexpr int WARP_SIZE = 32;
+  using WideT = float4;
+  if constexpr (sizeof(T) >= sizeof(WideT)) {
+    for (idxT i = thread_rank; i < len; i += num_threads) {
+      f(in[i], i);
+    }
+  } else {
+    static_assert(sizeof(WideT) % sizeof(T) == 0);
+    constexpr int items_per_scalar = sizeof(WideT) / sizeof(T);
+    // TODO: it's UB
+    union {
+      WideT scalar;
+      T array[items_per_scalar];
+    } wide;
+
+    int skip_cnt =
+        (reinterpret_cast<size_t>(in) % sizeof(WideT))
+            ? ((sizeof(WideT) - reinterpret_cast<size_t>(in) % sizeof(WideT)) /
+               sizeof(T))
+            : 0;
+    if (skip_cnt > len) {
+      skip_cnt = len;
+    }
+    const WideT* in_cast = reinterpret_cast<decltype(in_cast)>(in + skip_cnt);
+    const idxT len_cast = (len - skip_cnt) / items_per_scalar;
+
+    for (idxT i = thread_rank; i < len_cast; i += num_threads) {
+      wide.scalar = in_cast[i];
+      const idxT real_i = skip_cnt + i * items_per_scalar;
+#pragma unroll
+      for (int j = 0; j < items_per_scalar; ++j) {
+        f(wide.array[j], real_i + j);
+      }
+    }
+
+    static_assert(WARP_SIZE >= items_per_scalar);
+    // and because items_per_scalar > skip_cnt, WARP_SIZE > skip_cnt
+    // no need to use loop
+    if (thread_rank < skip_cnt) {
+      f(in[thread_rank], thread_rank);
+    }
+    // because len_cast = (len - skip_cnt) / items_per_scalar,
+    // len_cast * items_per_scalar + items_per_scalar > len - skip_cnt;
+    // and so
+    // len - (skip_cnt + len_cast * items_per_scalar) < items_per_scalar <=
+    // WARP_SIZE no need to use loop
+    const idxT remain_i = skip_cnt + len_cast * items_per_scalar + thread_rank;
+    if (remain_i < len) {
+      f(in[remain_i], remain_i);
+    }
+  }
+}
+
+template <int step, int kNumThreadsPerBlock, int kNumBins, int kNumFinalItems,
+          bool multipleBlocksPerRow, bool mergeBlocks, typename SmemFinalType,
+          typename SmemOutputType>
+__device__ bool processHistogramStep(
+    const int* indices, const float* logits, int rowEnd, uint32_t& logitPattern,
+    int& thresholdBinIdx, SmemOutputType& smemOutput, int* smemThresholdBinIdx,
+    int* smemFinalDstIdx, int* smemFinalBinSize, int* smemFoundTopKValues,
+    SmemFinalType& smemFinal, int stride1, int rowStart, int topK) {
+  // Clear the histogram.
+#pragma unroll
+  for (int idx = threadIdx.x; idx < kNumBins; idx += kNumThreadsPerBlock) {
+    smemFinal.histo.data[idx] = 0;
+  }
+
+  // Make sure the histogram is ready.
+  __syncthreads();
+
+  // Update pattern
+  constexpr auto patternShift = step < 2 ? 0 : step == 2 ? 21 : 10;
+  if constexpr (step == 2) {
+    logitPattern = static_cast<uint32_t>(thresholdBinIdx & 0x7ff)
+                   << patternShift;
+  } else if constexpr (step == 3) {
+    logitPattern |= static_cast<uint32_t>(thresholdBinIdx & 0x7ff)
+                    << patternShift;
+  }
+
+  auto distributeToBins = [&](float logit, int /* idx */ = 0) {
+    if (isPartialMatch<patternShift>(logit, logitPattern)) {
+      uint32_t binIdx = extractBinIdx<step>(logit);
+      atomicAdd(&smemFinal.histo.data[binIdx], 1);
+    }
+  };
+
+  // Distribute the elements to the histogram bins.
+  if (stride1 == 1) {
+    vectorized_process(threadIdx.x, kNumThreadsPerBlock, logits + rowStart,
+                       rowEnd - rowStart, distributeToBins);
+  } else {
+    for (int idx = rowStart + threadIdx.x; idx < rowEnd;
+         idx += kNumThreadsPerBlock) {
+      float logit = logits[idx * stride1];
+      distributeToBins(logit, idx);
+    }
+  }
+  // Make sure the histogram is ready.
+  __syncthreads();
+
+  // Reads the value of the starting position in the smemOutput array
+  int lastValue = smemFoundTopKValues[0];
+
+  for (int round = 0; round < kNumBins / kNumThreadsPerBlock; round++) {
+    // Read the values from SMEM.
+    int idx = threadIdx.x + kNumThreadsPerBlock * round;
+    int binCount{0};
+    binCount = smemFinal.histo.data[idx];
+
+    // Make sure each thread has read its value.
+    __syncthreads();
+
+    // Compute the prefix sum.
+    int prefixSum{0}, totalSum{0};
+    using Scan = cub::BlockScan<int, kNumThreadsPerBlock>;
+    Scan(smemFinal.histo.scan).ExclusiveSum(binCount, prefixSum, totalSum);
+
+    // Update the histogram with the prefix sums.
+    prefixSum += lastValue;
+    totalSum += lastValue;
+    smemFinal.histo.data[idx] = prefixSum;
+
+    // Make sure the data is in shared memory.
+    __syncthreads();
+
+    // Find the last valid bin.
+    bool foundThreshold = false;
+    if (prefixSum < topK) {
+      int nextPrefixSum = threadIdx.x == kNumThreadsPerBlock - 1
+                              ? totalSum
+                              : smemFinal.histo.data[idx + 1];
+
+      if (nextPrefixSum >= topK) {
+        smemThresholdBinIdx[0] = idx;
+        smemFinalBinSize[0] = nextPrefixSum - prefixSum;
+        foundThreshold = true;
+      }
+    }
+
+    // Early exit: if any thread found the threshold, we can skip remaining
+    // rounds
+    if (__syncthreads_or(foundThreshold)) {
+      break;
+    }
+
+    lastValue = totalSum;
+  }
+
+  // Make sure the data is in shared memory.
+  __syncthreads();
+
+  // The threshold bin.
+  thresholdBinIdx = smemThresholdBinIdx[0];
+
+  auto processBins = [&](float logit, int idx) {
+    if (isPartialMatch<patternShift>(logit, logitPattern)) {
+      uint32_t binIdx = extractBinIdx<step>(logit);
+      if (binIdx < thresholdBinIdx) {
+        // The element is part of the top-k selection
+        int dstIdx = atomicAdd(&smemFoundTopKValues[0], 1);
+
+        if constexpr (mergeBlocks) {
+          smemOutput[dstIdx] = indices[idx];
+        } else if constexpr (multipleBlocksPerRow) {
+          smemOutput[dstIdx] = idx + rowStart;
+          reinterpret_cast<float*>(smemOutput + topK)[dstIdx] = logit;
+        } else {
+          smemOutput[dstIdx] = idx;
+        }
+      }
+      if constexpr (step < 3) {
+        // Only fill the final items for sorting if the threshold bin fits
+        if (binIdx == thresholdBinIdx &&
+            smemFinalBinSize[0] <= kNumFinalItems) {
+          int dstIdx = atomicAdd(&smemFinalDstIdx[0], 1);
+          smemFinal.items.logits[dstIdx] = logit;
+          if constexpr (mergeBlocks) {
+            smemFinal.items.indices[dstIdx] = indices[idx];
+          } else if constexpr (multipleBlocksPerRow) {
+            smemFinal.items.indices[dstIdx] = idx + rowStart;
+          } else {
+            smemFinal.items.indices[dstIdx] = idx;
+          }
+        }
+      } else {
+        if (binIdx == thresholdBinIdx) {
+          // The elements in the threshold bin share the same 32 bits at step 3
+          int dstIdx = atomicAdd(&smemFinal.histo.data[binIdx], 1);
+          if (dstIdx < topK) {
+            if constexpr (mergeBlocks) {
+              smemOutput[dstIdx] = indices[idx];
+            } else if constexpr (multipleBlocksPerRow) {
+              smemOutput[dstIdx] = idx + rowStart;
+              reinterpret_cast<float*>(smemOutput + topK)[dstIdx] = logit;
+            } else {
+              smemOutput[dstIdx] = idx;
+            }
+          }
+        }
+      }
+    }
+  };
+
+  if (stride1 == 1) {
+    vectorized_process(threadIdx.x, kNumThreadsPerBlock, logits + rowStart,
+                       rowEnd - rowStart, processBins);
+  } else {
+    for (int idx = rowStart + threadIdx.x; idx < rowEnd;
+         idx += kNumThreadsPerBlock) {
+      float logit = logits[idx * stride1];
+      processBins(logit, idx);
+    }
+  }
+
+  // Make sure the elements are in shared memory.
+  __syncthreads();
+
+  // Check if we should continue to next step
+  return smemFinalBinSize[0] > kNumFinalItems;
+}
+
+// Follows half - 11 - 11 - 10 bit iterations
+template <int kNumThreadsPerBlock, int kNumBins, bool useRadixSort,
+          bool multipleBlocksPerRow = false, bool mergeBlocks = false>
+static __device__ void topKPerRowJob(const int* indices, const float* logits,
+                                     int rowStart, int rowEnd, int* outIndices,
+                                     float* outLogits, int stride1, int topK) {
   // The number of slots for the final pass.
-  static constexpr int kNumFinalItems = 3072;
+  static constexpr int kNumFinalItems = 2048;
   // The number of elements per thread for the final sort.
   static constexpr int kNumFinalItemsPerThread =
       kNumFinalItems / kNumThreadsPerBlock;
   // The class to sort the elements during the final pass.
   using FinalSort = cub::BlockRadixSort<float, kNumThreadsPerBlock,
                                         kNumFinalItemsPerThread, int>;
+  using FinalSortTempStorage =
+      std::conditional_t<useRadixSort, typename FinalSort::TempStorage, int>;
   // The class to compute the inclusive prefix-sum over the histogram.
   using Scan = cub::BlockScan<int, kNumThreadsPerBlock>;
 
-  // Shared memory to compute the block scan.
-  __shared__ typename Scan::TempStorage smemScan;
 
   // The structure to store the final items (for the final pass).
   struct FinalItems {
     // Shared memory to store the indices for the final pass.
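The staged histograms rest on an order-preserving integer key: flipping the bits of non-negative floats (and keeping negative ones as-is) makes larger logits compare as smaller uint32 keys, so bin 0 always holds the current maxima; steps 1-3 then consume the key 11 + 11 + 10 bits at a time. A standalone host-side check of both properties:

#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t key_of(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  return (bits & 0x80000000u) ? bits : ~bits & 0x7fffffffu;
}

int main() {
  assert(key_of(8.0f) < key_of(2.0f));   // larger logit -> smaller key
  assert(key_of(2.0f) < key_of(-1.0f));  // positives sort before negatives
  uint32_t k = key_of(2.0f);
  uint32_t step1 = k >> 21;            // top 11 bits
  uint32_t step2 = (k >> 10) & 0x7ff;  // middle 11 bits
  uint32_t step3 = k & 0x3ff;          // last 10 bits
  assert(((step1 << 21) | (step2 << 10) | step3) == k);  // lossless split
}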
@@ -87,200 +346,225 @@ __device__ void topKPerRowJob(const float* logits, const int rowStart,
     float logits[kNumFinalItems];
   };
 
+  struct Histogram {
+    typename Scan::TempStorage scan;
+    int data[kNumBins];
+  };
+
   // Shared memory to compute the block sort.
   __shared__ union {
     FinalItems items;
-    typename FinalSort::TempStorage finalSort;
-    typename TopKSort::TempStorage topKSort;
+    FinalSortTempStorage finalSort;
+    Histogram histo;
   } smemFinal;
 
-  // Shared memory to store the histogram.
-  __shared__ int smemHistogram[kNumBins];
   // Shared memory to store the selected indices.
-  __shared__ int smemIndices[kTopK];
+  // If we are processing using multiple blocks, we need to store the logits and
+  // indices.
+  extern __shared__ int32_t smemOutput[];
 
   // Shared memory to store the threshold bin.
   __shared__ int smemThresholdBinIdx[1];
   // Shared memory counter to register the candidates for the final phase.
   __shared__ int smemFinalDstIdx[1];
+  // Shared memory to determine if the threshold bin fits in the final items.
+  __shared__ int smemFinalBinSize[1];
+  // Shared memory to keep track of the top-k values found so far by the
+  // previous iterations
+  __shared__ int smemFoundTopKValues[1];
 
   // The length of the row.
   int rowLen = rowEnd - rowStart;
 
   // Shortcut if the length of the row is smaller than Top-K. Indices are not
   // sorted by their corresponding logit.
-  if (rowLen <= kTopK) {
+  if (rowLen <= topK) {
     for (int rowIt = threadIdx.x; rowIt < rowLen;
          rowIt += kNumThreadsPerBlock) {
-      int idx = rowStart + rowIt;
-      outIndices[rowIdx * kTopK + rowIt] = idx - rowStart;
+      if constexpr (multipleBlocksPerRow) {
+        outIndices[rowIt] = rowIt + rowStart;
+        outLogits[rowIt] = logits[rowIt + rowStart];
+      } else {
+        outIndices[rowIt] = rowIt;
+      }
     }
-    for (int rowIt = rowLen + threadIdx.x; rowIt < kTopK;
+    for (int rowIt = rowLen + threadIdx.x; rowIt < topK;
          rowIt += kNumThreadsPerBlock) {
-      outIndices[rowIdx * kTopK + rowIt] = -1;
+      outIndices[rowIt] = -1;
+      if constexpr (multipleBlocksPerRow) {
+        outLogits[rowIt] = -FLT_MAX;
+      }
     }
 
     return;
   }
+  // Initialize values
-  // Clear the histogram.
-  if (threadIdx.x < kNumBins) {
-    smemHistogram[threadIdx.x] = 0;
-  }
-
-  // Make sure the histogram is ready.
-  __syncthreads();
-
-  // Fetch elements one-by-one.
-  for (int rowIt = rowStart + threadIdx.x; rowIt < rowEnd;
-       rowIt += kNumThreadsPerBlock) {
-    uint16_t idx = extractBinIdx(logits[rowIdx * stride0 + rowIt * stride1]);
-    atomicAdd(&smemHistogram[idx], 1);
-  }
-
-  // Make sure the histogram is ready.
-  __syncthreads();
-
-  // Read the values from SMEM.
-  int binCount{0};
-  if (threadIdx.x < kNumBins) {
-    binCount = smemHistogram[threadIdx.x];
-  }
-
-  // Make sure each thread has read its value.
-  __syncthreads();
-
-  // Compute the prefix sum.
-  int prefixSum{0}, totalSum{0};
-  Scan(smemScan).ExclusiveSum(binCount, prefixSum, totalSum);
-
-  // Update the histogram with the prefix sums.
-  if (threadIdx.x < kNumBins) {
-    smemHistogram[threadIdx.x] = prefixSum;
-  }
-
-  // Make sure the data is in shared memory.
-  __syncthreads();
-
-  // Find the last valid bin.
-  if (threadIdx.x < kNumBins) {
-    int nextPrefixSum =
-        threadIdx.x == kNumBins - 1 ? totalSum : smemHistogram[threadIdx.x + 1];
-    if (prefixSum < kTopK && nextPrefixSum >= kTopK) {
-      smemThresholdBinIdx[0] = threadIdx.x;
-    }
-  }
-
-  // Clear the counter to store the items for the final phase.
   if (threadIdx.x == 0) {
     smemFinalDstIdx[0] = 0;
+    smemFoundTopKValues[0] = 0;
+  }
+  __syncthreads();
+  int thresholdBinIdx = -1;
+  uint32_t logitPattern = 0;
+
+  // Step 0: Process first 11 bits of half representation
+  bool continueToNextStep =
+      processHistogramStep<0, kNumThreadsPerBlock, kNumBins, kNumFinalItems,
+                           multipleBlocksPerRow, mergeBlocks>(
+          indices, logits, rowEnd, logitPattern, thresholdBinIdx, smemOutput,
+          smemThresholdBinIdx, smemFinalDstIdx, smemFinalBinSize,
+          smemFoundTopKValues, smemFinal, stride1, rowStart, topK);
+
+  if (continueToNextStep) {
+    // Step 1: Process next 11 bits
+    continueToNextStep =
+        processHistogramStep<1, kNumThreadsPerBlock, kNumBins, kNumFinalItems,
+                             multipleBlocksPerRow, mergeBlocks>(
+            indices, logits, rowEnd, logitPattern, thresholdBinIdx, smemOutput,
+            smemThresholdBinIdx, smemFinalDstIdx, smemFinalBinSize,
+            smemFoundTopKValues, smemFinal, stride1, rowStart, topK);
   }
 
-  // Make sure the data is in shared memory.
-  __syncthreads();
+  if (continueToNextStep) {
+    // Step 2: Process next 11 bits
+    continueToNextStep =
+        processHistogramStep<2, kNumThreadsPerBlock, kNumBins, kNumFinalItems,
+                             multipleBlocksPerRow, mergeBlocks>(
+            indices, logits, rowEnd, logitPattern, thresholdBinIdx, smemOutput,
+            smemThresholdBinIdx, smemFinalDstIdx, smemFinalBinSize,
+            smemFoundTopKValues, smemFinal, stride1, rowStart, topK);
+  }
 
-  // The threshold bin.
-  int thresholdBinIdx = smemThresholdBinIdx[0];
+  if (continueToNextStep) {
+    // Step 3: Process last 10 bits
+    processHistogramStep<3, kNumThreadsPerBlock, kNumBins, kNumFinalItems,
+                         multipleBlocksPerRow, mergeBlocks>(
+        indices, logits, rowEnd, logitPattern, thresholdBinIdx, smemOutput,
+        smemThresholdBinIdx, smemFinalDstIdx, smemFinalBinSize,
+        smemFoundTopKValues, smemFinal, stride1, rowStart, topK);
+  }
 
-  // Fetch elements one-by-one and populate the shared memory buffers.
-  for (int rowIt = rowStart + threadIdx.x; rowIt < rowEnd;
-       rowIt += kNumThreadsPerBlock) {
-    float logit = logits[rowIdx * stride0 + rowIt * stride1];
-    uint16_t idx = extractBinIdx(logit);
-    if (idx < thresholdBinIdx) {
-      int dstIdx = atomicAdd(&smemHistogram[idx], 1);
-      smemIndices[dstIdx] = rowIt;
-    } else if (idx == thresholdBinIdx) {
-      int dstIdx = atomicAdd(&smemFinalDstIdx[0], 1);
-      if (dstIdx < kNumFinalItems) {
-        smemFinal.items.logits[dstIdx] = logit;
-        smemFinal.items.indices[dstIdx] = rowIt;
+  if (!continueToNextStep) {
+    // The histogram did not proceed to the final 10 bits, therefore we need to
+    // sort the final items The logits of the elements to be sorted in the final
+    // pass.
+    if constexpr (useRadixSort) {
+      // Sorting with radix sort
+      float finalLogits[kNumFinalItemsPerThread];
+      // The indices of the elements to be sorted in the final pass.
+      int finalIndices[kNumFinalItemsPerThread];
+
+#pragma unroll
+      for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
+        finalLogits[ii] = -FLT_MAX;
+      }
+
+      // Read the elements from SMEM.
+#pragma unroll
+      for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
+        int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
+        if (srcIdx < smemFinalDstIdx[0]) {
+          finalLogits[ii] = smemFinal.items.logits[srcIdx];
+          finalIndices[ii] = smemFinal.items.indices[srcIdx];
+        }
+      }
+      // Make sure the shared memory has been read.
+      __syncthreads();
+
+      // Sort the elements.
+      FinalSort(smemFinal.finalSort)
+          .SortDescendingBlockedToStriped(finalLogits, finalIndices);
+
+      // Copy the data back to the shared memory storage.
+      int baseIdx = smemFoundTopKValues[0];
+
+#pragma unroll
+      for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
+        int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
+        int dstIdx = baseIdx + srcIdx;
+
+        if (dstIdx < topK) {
+          smemOutput[dstIdx] = finalIndices[ii];
+          if constexpr (multipleBlocksPerRow) {
+            reinterpret_cast<float*>(smemOutput + topK)[dstIdx] =
+                finalLogits[ii];
+          }
+        }
+      }
+    } else {
+      // Sorting with insertion sort
+      auto baseIdx = smemFoundTopKValues[0];
+      for (int i = threadIdx.x; i < smemFinalDstIdx[0];
+           i += kNumThreadsPerBlock) {
+        int outIndex = 0;
+        auto logit = smemFinal.items.logits[i];
+        for (int j = 0; j < smemFinalDstIdx[0]; j++) {
+          auto otherLogit = smemFinal.items.logits[j];
+          if (logit < otherLogit || (logit == otherLogit && i < j)) {
+            outIndex++;
+          }
+        }
+        // Store if outIndex is in bounds
+        if (outIndex + baseIdx < topK) {
+          smemOutput[outIndex + baseIdx] = smemFinal.items.indices[i];
+          if constexpr (multipleBlocksPerRow) {
+            reinterpret_cast<float*>(smemOutput + topK)[outIndex + baseIdx] =
+                smemFinal.items.logits[i];
+          }
+        }
+      }
+    }
+    __syncthreads();
+  }
+
+  // Store to global memory.
+  for (int i = threadIdx.x; i < topK; i += kNumThreadsPerBlock) {
+    if constexpr (multipleBlocksPerRow) {
+      outIndices[i] = smemOutput[i];
+      outLogits[i] = reinterpret_cast<float*>(smemOutput + topK)[i];
+    } else {
+      if (stride1 == 1) {
+        // stride1 == 1 will use vectorized_process, which indexes already skip
+        // the rowStart.
+        outIndices[i] = smemOutput[i];
+      } else {
+        outIndices[i] = smemOutput[i] - rowStart;
       }
     }
   }
-
-  // Make sure the elements are in shared memory.
-  __syncthreads();
-
-  // The logits of the elements to be sorted in the final pass.
-  float finalLogits[kNumFinalItemsPerThread];
-  // The indices of the elements to be sorted in the final pass.
-  int finalIndices[kNumFinalItemsPerThread];
-
-  // Init.
-#pragma unroll
-  for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
-    finalLogits[ii] = -FLT_MAX;
-  }
-
-  // Read the elements from SMEM.
-#pragma unroll
-  for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
-    int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
-    if (srcIdx < smemFinalDstIdx[0]) {
-      finalLogits[ii] = smemFinal.items.logits[srcIdx];
-      finalIndices[ii] = smemFinal.items.indices[srcIdx];
-    }
-  }
-
-  // Make sure the shared memory has been read.
-  __syncthreads();
-
-  // Sort the elements.
-  FinalSort(smemFinal.finalSort)
-      .SortDescendingBlockedToStriped(finalLogits, finalIndices);
-
-  // Copy the data back to the shared memory storage.
-  int baseIdx = thresholdBinIdx > 0 ? smemHistogram[thresholdBinIdx - 1] : 0;
-#pragma unroll
-  for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
-    int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
-    int dstIdx = baseIdx + srcIdx;
-    if (dstIdx < kTopK) {
-      smemIndices[dstIdx] = finalIndices[ii];
-    }
-  }
-
-  // Make sure the data is in shared memory.
-  __syncthreads();
-
-  // Store to global memory.
-#pragma unroll
-  for (int ii = 0; ii < kNumTopKItemsPerThread; ++ii) {
-    int offset = rowIdx * kTopK + ii * kNumThreadsPerBlock + threadIdx.x;
-    outIndices[offset] =
-        smemIndices[ii * kNumThreadsPerBlock + threadIdx.x] - rowStart;
-  }
 }
 
-template <int kNumThreadsPerBlock = 512>
-static __global__ void topKPerRow(const float* logits, const int* rowStarts,
-                                  const int* rowEnds, int* outIndices,
-                                  int stride0, int stride1) {
+template <int kNumThreadsPerBlock, bool useRadixSort>
+static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowPrefill(
+    const float* logits, const int* rowStarts, const int* rowEnds,
+    int* outIndices, int stride0, int stride1, const int topK,
+    const int offsetIndex) {
   // The number of bins in the histogram.
-  static constexpr int kNumBins = 512;
-
-  // The top-k width.
-  static constexpr int kTopK = 2048;
+  static constexpr int kNumBins = 2048;
 
   // The row computed by this block.
-  int rowIdx = blockIdx.x;
+  int rowIdx = blockIdx.x + offsetIndex;
 
   // The range of logits within the row.
   int rowStart = rowStarts[rowIdx];
   int rowEnd = rowEnds[rowIdx];
 
-  topKPerRowJob<kNumThreadsPerBlock, kNumBins, kTopK>(
-      logits, rowStart, rowEnd, rowIdx, outIndices, stride0, stride1);
+  // Local pointers to this block
+  outIndices += rowIdx * topK;
+  logits += rowIdx * stride0;
+
+  topKPerRowJob<kNumThreadsPerBlock, kNumBins, useRadixSort>(
+      nullptr, logits, rowStart, rowEnd, outIndices, nullptr, stride1, topK);
 }
 
-template <int kNumThreadsPerBlock = 512>
-static __global__ void topKPerRowDecode(const float* logits, const int* seqLens,
-                                        int* outIndices, int stride0,
-                                        int stride1, int next_n) {
+template <int kNumThreadsPerBlock, bool useRadixSort,
+          bool multipleBlocksPerRow = false, bool mergeBlocks = false>
+static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowDecode(
+    const float* logits, const int* seqLens, int* outIndices, int stride0,
+    int stride1, const int topK, int next_n, float* outLogits = nullptr,
+    const int numBlocksToMerge = 0, const int* indices = nullptr) {
   // The number of bins in the histogram.
-  static constexpr int kNumBins = 512;
-
-  // The top-k width.
-  static constexpr int kTopK = 2048;
+  static constexpr int kNumBins = 2048;
 
   // The row computed by this block.
   int rowIdx = blockIdx.x;
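The dynamic shared buffer declared as extern __shared__ int32_t smemOutput[] holds topK indices; in the multipleBlocksPerRow variant a float view of the second half holds the matching logits, which is why that launch requests twice the bytes. A standalone sizing check, illustrative only:

#include <cassert>
#include <cstdint>

int main() {
  int topK = 2048;
  size_t indices_only = topK * sizeof(int32_t);  // single-block launches
  size_t with_logits = 2 * topK * sizeof(int32_t);  // split-row launches
  static_assert(sizeof(float) == sizeof(int32_t), "views must line up");
  assert(indices_only == 8192 && with_logits == 16384);
}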
@@ -290,8 +574,25 @@ static __global__ void topKPerRowDecode(const float* logits, const int* seqLens,
   int seq_len = seqLens[rowIdx / next_n];
   int rowEnd = seq_len - next_n + (rowIdx % next_n) + 1;
 
-  topKPerRowJob<kNumThreadsPerBlock, kNumBins, kTopK>(
-      logits, rowStart, rowEnd, rowIdx, outIndices, stride0, stride1);
+  // Local pointers to this block
+  if constexpr (!multipleBlocksPerRow && !mergeBlocks) {
+    outIndices += rowIdx * topK;
+  } else if constexpr (multipleBlocksPerRow) {
+    const auto blockSize = rowEnd / gridDim.y;  // 16384 / 2 = 8192
+    rowStart = blockSize * blockIdx.y;          // 8192 * 1 = 8192
+    rowEnd = gridDim.y == blockIdx.y + 1 ? rowEnd : rowStart + blockSize;
+    outIndices += rowIdx * gridDim.y * topK + blockIdx.y * topK;
+    outLogits += rowIdx * gridDim.y * topK + blockIdx.y * topK;
+  } else if constexpr (mergeBlocks) {
+    rowEnd = numBlocksToMerge * topK;
+    indices += rowIdx * numBlocksToMerge * topK;
+    outIndices += rowIdx * topK;
+  }
+  logits += rowIdx * stride0;
+
+  topKPerRowJob<kNumThreadsPerBlock, kNumBins, useRadixSort,
+                multipleBlocksPerRow, mergeBlocks>(
+      indices, logits, rowStart, rowEnd, outIndices, outLogits, stride1, topK);
 }
 
 }  // namespace vllm
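How the multipleBlocksPerRow path carves one long row across gridDim.y blocks, replayed on the host with the same 16384 / 2 figures the diff's comments use:

#include <cassert>

int main() {
  int rowEnd = 16384, gridDimY = 2;
  int blockSize = rowEnd / gridDimY;  // 8192 elements per block
  for (int blockIdxY = 0; blockIdxY < gridDimY; ++blockIdxY) {
    int start = blockSize * blockIdxY;
    int end = (gridDimY == blockIdxY + 1) ? rowEnd : start + blockSize;
    if (blockIdxY == 0) assert(start == 0 && end == 8192);
    if (blockIdxY == 1) assert(start == 8192 && end == 16384);  // tail block
  }
}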
@@ -339,28 +640,84 @@ void apply_repetition_penalties_(
 
 void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
                           const torch::Tensor& seqLens, torch::Tensor& indices,
-                          int64_t numRows, int64_t stride0, int64_t stride1) {
-  // Compute the results on the device.
+                          int64_t numRows, int64_t stride0, int64_t stride1,
+                          int64_t topK) {
+  constexpr int kSortingAlgorithmThreshold = 12288;
+  constexpr int kSplitWorkThreshold = 200 * 1000;
+  constexpr int kNumThreadsPerBlock = 512;
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  const auto numColumns = logits.size(1);
+
+  if (numColumns < kSortingAlgorithmThreshold) {
+    // Use insertion sort
+    vllm::topKPerRowDecode<kNumThreadsPerBlock, false>
+        <<<numRows, kNumThreadsPerBlock, topK * sizeof(int32_t), stream>>>(
+            logits.data_ptr<float>(), seqLens.data_ptr<int>(),
+            indices.data_ptr<int>(), static_cast<int>(stride0),
+            static_cast<int>(stride1), static_cast<int>(topK),
+            static_cast<int>(next_n));
+  } else if (numColumns < kSplitWorkThreshold) {
+    // From this threshold, use radix sort instead
+    vllm::topKPerRowDecode<kNumThreadsPerBlock, true>
+        <<<numRows, kNumThreadsPerBlock, topK * sizeof(int32_t), stream>>>(
+            logits.data_ptr<float>(), seqLens.data_ptr<int>(),
+            indices.data_ptr<int>(), static_cast<int>(stride0),
+            static_cast<int>(stride1), static_cast<int>(topK),
+            static_cast<int>(next_n));
+  } else {
+    // Long sequences are run in two steps
+    constexpr auto multipleBlocksPerRowConfig = 10;
+
+    const auto outIndicesAux =
+        torch::empty({numRows, multipleBlocksPerRowConfig, topK},
+                     torch::dtype(torch::kInt32).device(logits.device()));
+    const auto outLogitsAux =
+        torch::empty({numRows, multipleBlocksPerRowConfig, topK},
+                     torch::dtype(torch::kFloat).device(logits.device()));
+
+    vllm::topKPerRowDecode<kNumThreadsPerBlock, true, true>
+        <<<dim3(numRows, multipleBlocksPerRowConfig), kNumThreadsPerBlock,
+           2 * topK * sizeof(int32_t), stream>>>(
+            logits.data_ptr<float>(), seqLens.data_ptr<int>(),
+            outIndicesAux.data_ptr<int>(), static_cast<int>(stride0),
+            static_cast<int>(stride1), static_cast<int>(topK),
+            static_cast<int>(next_n), outLogitsAux.data_ptr<float>());
+
+    constexpr int kNumThreadsPerBlockMerge = 1024;
+    vllm::topKPerRowDecode<kNumThreadsPerBlockMerge, true, false, true>
+        <<<numRows, kNumThreadsPerBlockMerge, topK * sizeof(int32_t), stream>>>(
+            outLogitsAux.data_ptr<float>(), seqLens.data_ptr<int>(),
+            indices.data_ptr<int>(), multipleBlocksPerRowConfig * topK, 1,
+            static_cast<int>(topK), static_cast<int>(next_n), nullptr,
+            multipleBlocksPerRowConfig, outIndicesAux.data_ptr<int>());
+  }
+}
+
+void top_k_per_row_prefill(const torch::Tensor& logits,
+                           const torch::Tensor& rowStarts,
+                           const torch::Tensor& rowEnds, torch::Tensor& indices,
+                           int64_t numRows, int64_t stride0, int64_t stride1,
+                           int64_t topK) {
+  constexpr int kSortingAlgorithmThreshold = 12288;
   constexpr int kNumThreadsPerBlock = 512;
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
-  vllm::topKPerRowDecode<kNumThreadsPerBlock>
-      <<<numRows, kNumThreadsPerBlock, 0, stream>>>(
-          logits.data_ptr<float>(), seqLens.data_ptr<int>(),
-          indices.data_ptr<int>(), static_cast<int>(stride0),
-          static_cast<int>(stride1), static_cast<int>(next_n));
-}
-
-void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
-                   const torch::Tensor& rowEnds, torch::Tensor& indices,
-                   int64_t numRows, int64_t stride0, int64_t stride1) {
-  // Compute the results on the device.
-  constexpr int kNumThreadsPerBlock = 512;
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
-  vllm::topKPerRow<kNumThreadsPerBlock>
-      <<<numRows, kNumThreadsPerBlock, 0, stream>>>(
-          logits.data_ptr<float>(), rowStarts.data_ptr<int>(),
-          rowEnds.data_ptr<int>(), indices.data_ptr<int>(),
-          static_cast<int>(stride0), static_cast<int>(stride1));
+  int numInsertionBlocks =
+      std::min(static_cast<int>(numRows), kSortingAlgorithmThreshold);
+  vllm::topKPerRowPrefill<kNumThreadsPerBlock, false>
+      <<<numInsertionBlocks, kNumThreadsPerBlock, topK * sizeof(int32_t),
+         stream>>>(logits.data_ptr<float>(), rowStarts.data_ptr<int>(),
+                   rowEnds.data_ptr<int>(), indices.data_ptr<int>(),
+                   static_cast<int>(stride0), static_cast<int>(stride1),
+                   static_cast<int>(topK), 0);
+
+  if (numRows > kSortingAlgorithmThreshold) {
+    int numRadixBlocks = numRows - kSortingAlgorithmThreshold;
+    vllm::topKPerRowPrefill<kNumThreadsPerBlock, true>
+        <<<numRadixBlocks, kNumThreadsPerBlock, topK * sizeof(int32_t),
+           stream>>>(logits.data_ptr<float>(), rowStarts.data_ptr<int>(),
+                     rowEnds.data_ptr<int>(), indices.data_ptr<int>(),
+                     static_cast<int>(stride0), static_cast<int>(stride1),
+                     static_cast<int>(topK), kSortingAlgorithmThreshold);
+  }
 }
|
|||||||
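The long-sequence decode path above is a split-then-merge top-k: each of the `multipleBlocksPerRowConfig` blocks assigned to a row computes a partial top-k over its slice into the `*Aux` buffers, and a second merge kernel reduces the `10 * topK` candidates down to the final `topK`. A minimal NumPy sketch of the same two-phase idea (names and shapes are illustrative, not the kernel's actual interface):

```python
import numpy as np

def two_step_top_k(row: np.ndarray, top_k: int, num_blocks: int = 10) -> np.ndarray:
    """Split-then-merge top-k over one long row (illustrative sketch only)."""
    # Step 1: each "block" computes a partial top-k over its slice of the row.
    cand_vals, cand_idxs, offset = [], [], 0
    for chunk in np.array_split(row, num_blocks):
        k = min(top_k, chunk.size)
        part = np.argpartition(chunk, -k)[-k:]  # unordered top-k of this slice
        cand_idxs.append(part + offset)         # map back to row coordinates
        cand_vals.append(chunk[part])
        offset += chunk.size
    cand_vals = np.concatenate(cand_vals)       # at most num_blocks * top_k candidates
    cand_idxs = np.concatenate(cand_idxs)
    # Step 2: a merge pass reduces the candidates to the final top-k.
    top = np.argpartition(cand_vals, -top_k)[-top_k:]
    order = np.argsort(-cand_vals[top])         # sort descending by logit
    return cand_idxs[top][order]

logits = np.random.randn(200_000).astype(np.float32)
assert set(two_step_top_k(logits, 8)) == set(np.argsort(-logits)[:8])
```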
@@ -179,15 +179,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {

   // Optimized top-k per row operation
   ops.def(
-      "top_k_per_row(Tensor logits, Tensor rowStarts, Tensor rowEnds, "
+      "top_k_per_row_prefill(Tensor logits, Tensor rowStarts, Tensor rowEnds, "
       "Tensor! indices, int numRows, int stride0, "
-      "int stride1) -> ()");
-  ops.impl("top_k_per_row", torch::kCUDA, &top_k_per_row);
+      "int stride1, int topK) -> ()");
+  ops.impl("top_k_per_row_prefill", torch::kCUDA, &top_k_per_row_prefill);

   ops.def(
       "top_k_per_row_decode(Tensor logits, int next_n, "
-      "Tensor seq_lens, Tensor! indices, int numRows, "
-      "int stride0, int stride1) -> ()");
+      "Tensor seq_lens, Tensor! indices, "
+      "int numRows, int stride0, int stride1, int topK) -> ()");
   ops.impl("top_k_per_row_decode", torch::kCUDA, &top_k_per_row_decode);

   // Layernorm-quant
@@ -215,6 +215,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.impl("rms_norm_dynamic_per_token_quant", torch::kCUDA,
            &rms_norm_dynamic_per_token_quant);

+  // Fused Layernorm + Block quant kernels
+  ops.def(
+      "rms_norm_per_block_quant(Tensor! result, Tensor input, "
+      "Tensor weight, Tensor! scale, float epsilon, "
+      "Tensor? scale_ub, Tensor!? residual, int group_size, "
+      "bool is_scale_transposed) -> ()");
+  ops.impl("rms_norm_per_block_quant", torch::kCUDA, &rms_norm_per_block_quant);
+
   // Rotary embedding
   // Apply GPT-NeoX or GPT-J style rotary embedding to query and key.
   ops.def(
@@ -342,6 +350,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.def("cutlass_encode_and_reorder_int4b(Tensor B) -> Tensor");
   // conditionally compiled so impl registration is in source file

+  // CUTLASS w4a8 grouped GEMM
+  ops.def(
+      "cutlass_w4a8_moe_mm("
+      "   Tensor! out_tensors,"
+      "   Tensor a_tensors,"
+      "   Tensor b_tensors,"
+      "   Tensor a_scales,"
+      "   Tensor b_scales,"
+      "   Tensor b_group_scales,"
+      "   int b_group_size,"
+      "   Tensor expert_offsets,"
+      "   Tensor problem_sizes,"
+      "   Tensor a_strides,"
+      "   Tensor b_strides,"
+      "   Tensor c_strides,"
+      "   Tensor group_scale_strides,"
+      "   str? maybe_schedule"
+      ") -> ()");
+  ops.def(
+      "cutlass_encode_and_reorder_int4b_grouped(Tensor b_tensors) -> (Tensor, "
+      "Tensor)");
+  // conditionally compiled so impl registration is in source file
+
 #endif

   // Dequantization for GGML.
@@ -458,7 +489,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       " Tensor! problem_sizes1, "
       " Tensor! problem_sizes2, "
       " int num_experts, int n, int k, "
-      " Tensor? blockscale_offsets) -> ()");
+      " Tensor? blockscale_offsets, "
+      " bool? force_swap_ab) -> ()");
   ops.impl("get_cutlass_moe_mm_problem_sizes", torch::kCUDA,
            &get_cutlass_moe_mm_problem_sizes);

@@ -617,6 +649,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.impl("per_token_group_fp8_quant", torch::kCUDA,
            &per_token_group_quant_fp8);

+  // Compute per-token-group 8-bit quantized tensor and UE8M0-packed,
+  // TMA-aligned scales for DeepGEMM.
+  ops.def(
+      "per_token_group_fp8_quant_packed(Tensor input, Tensor! output_q, "
+      "Tensor! output_s_packed, int group_size, float eps, float fp8_min, "
+      "float fp8_max) -> ()");
+  ops.impl("per_token_group_fp8_quant_packed", torch::kCUDA,
+           &per_token_group_quant_8bit_packed);
+
   // Compute per-token-group INT8 quantized tensor and scaling factor.
   ops.def(
       "per_token_group_quant_int8(Tensor input, Tensor! output_q, Tensor! "
@@ -713,6 +754,13 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
       "Tensor cu_seq_lens, int batch_size, Tensor? seq_starts) -> ()");
   cache_ops.impl("cp_gather_cache", torch::kCUDA, &cp_gather_cache);

+  cache_ops.def(
+      "cp_gather_and_upconvert_fp8_kv_cache(Tensor src_cache, Tensor! dst, "
+      "Tensor block_table, Tensor seq_lens, Tensor workspace_starts, int "
+      "batch_size) -> ()");
+  cache_ops.impl("cp_gather_and_upconvert_fp8_kv_cache", torch::kCUDA,
+                 &cp_gather_and_upconvert_fp8_kv_cache);
+
   cache_ops.def(
       "indexer_k_quant_and_cache(Tensor k, Tensor! kv_cache, Tensor "
       "slot_mapping, "
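The renamed schema also changes the Python-side call. A hypothetical invocation through `torch.ops` (assuming the extension registers under the `_C` namespace, with illustrative shapes; not taken from this commit):

```python
import torch

# Hypothetical call matching the new top_k_per_row_prefill schema above.
num_rows, vocab, top_k = 4, 1024, 8
logits = torch.randn(num_rows, vocab, device="cuda")
row_starts = torch.zeros(num_rows, dtype=torch.int32, device="cuda")
row_ends = torch.full((num_rows,), vocab, dtype=torch.int32, device="cuda")
indices = torch.empty(num_rows, top_k, dtype=torch.int32, device="cuda")
torch.ops._C.top_k_per_row_prefill(
    logits, row_starts, row_ends, indices,
    num_rows, logits.stride(0), logits.stride(1), top_k,
)
```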
@@ -32,7 +32,7 @@ ARG DEADSNAKES_GPGKEY_URL

 # The PyPA get-pip.py script is a self contained script+zip file, that provides
 # both the installer script and the pip base85-encoded zip archive. This allows
-# bootstrapping pip in environment where a dsitribution package does not exist.
+# bootstrapping pip in environment where a distribution package does not exist.
 #
 # By parameterizing the URL for get-pip.py installation script, we allow
 # third-party to use their own copy of the script stored in a private mirror.
@@ -73,15 +73,13 @@ ARG INSTALL_KV_CONNECTORS=false
 #################### BASE BUILD IMAGE ####################
 # prepare basic build environment
 FROM ${BUILD_BASE_IMAGE} AS base

 ARG CUDA_VERSION
 ARG PYTHON_VERSION
-ARG TARGETPLATFORM
-ARG INSTALL_KV_CONNECTORS=false
 ENV DEBIAN_FRONTEND=noninteractive

-ARG GET_PIP_URL
+# Install system dependencies including build tools

-# Install system dependencies and uv, then create Python virtual environment
 RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
     && apt-get update -y \
@@ -107,32 +105,30 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && ln -s /opt/venv/bin/pip /usr/bin/pip \
     && python3 --version && python3 -m pip --version

-ARG PIP_INDEX_URL UV_INDEX_URL
-ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
-ARG PYTORCH_CUDA_INDEX_BASE_URL
-ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER

 # Activate virtual environment and add uv to PATH
 ENV PATH="/opt/venv/bin:/root/.local/bin:$PATH"
 ENV VIRTUAL_ENV="/opt/venv"

-# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
-# Reference: https://github.com/astral-sh/uv/pull/1694
+# Environment for uv
 ENV UV_HTTP_TIMEOUT=500
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
-# Use copy mode to avoid hardlink failures with Docker cache mounts
 ENV UV_LINK_MODE=copy

-RUN <<EOF
-gcc --version
-EOF
+# Verify GCC version
+RUN gcc --version

-# Workaround for https://github.com/openai/triton/issues/2507 and
-# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
-# this won't be needed for future versions of this docker image
-# or future versions of triton.
+# Workaround for triton/pytorch issues
 RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

+# ============================================================
+# SLOW-CHANGING DEPENDENCIES BELOW
+# These are the expensive layers that we want to cache
+# ============================================================
+
+# Install PyTorch and core CUDA dependencies
+# This is ~2GB and rarely changes
+ARG PYTORCH_CUDA_INDEX_BASE_URL
+
 WORKDIR /workspace

 # install build and runtime dependencies
@@ -142,13 +138,12 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
     --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

-# cuda arch list used by torch
-# can be useful for both `dev` and `test`
-# explicitly set the list to avoid issues with torch 2.2
-# see https://github.com/pytorch/pytorch/pull/123243
+# CUDA arch list used by torch
+# Explicitly set the list to avoid issues with torch 2.2
+# See https://github.com/pytorch/pytorch/pull/123243
 ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
-#################### BASE BUILD IMAGE ####################
+#################### BUILD BASE IMAGE ####################

 #################### CSRC BUILD IMAGE ####################
 FROM base AS csrc-build
@@ -196,6 +191,7 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0

 # Flag to control whether to use pre-built vLLM wheels
 ARG VLLM_USE_PRECOMPILED=""
+ARG VLLM_MERGE_BASE_COMMIT=""
 ARG VLLM_MAIN_CUDA_VERSION=""

 # Use dummy version for csrc-build wheel (only .so files are extracted, version doesn't matter)
@@ -216,6 +212,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     && export SCCACHE_IDLE_TIMEOUT=0 \
     && export CMAKE_BUILD_TYPE=Release \
     && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
+    && export VLLM_PRECOMPILED_WHEEL_COMMIT="${VLLM_MERGE_BASE_COMMIT}" \
     && export VLLM_MAIN_CUDA_VERSION="${VLLM_MAIN_CUDA_VERSION}" \
     && export VLLM_DOCKER_BUILD_CONTEXT=1 \
     && sccache --show-stats \
@@ -233,11 +230,54 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
         rm -rf .deps && \
         mkdir -p .deps && \
         export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
+        export VLLM_PRECOMPILED_WHEEL_COMMIT="${VLLM_MERGE_BASE_COMMIT}" && \
         export VLLM_DOCKER_BUILD_CONTEXT=1 && \
         python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
     fi
 #################### CSRC BUILD IMAGE ####################

+#################### EXTENSIONS BUILD IMAGE ####################
+# Build DeepGEMM, pplx-kernels, DeepEP - runs in PARALLEL with csrc-build
+# This stage is independent and doesn't affect csrc cache
+FROM base AS extensions-build
+ARG CUDA_VERSION
+
+# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
+ENV UV_HTTP_TIMEOUT=500
+ENV UV_INDEX_STRATEGY="unsafe-best-match"
+ENV UV_LINK_MODE=copy
+
+WORKDIR /workspace
+
+# Build DeepGEMM wheel
+ARG DEEPGEMM_GIT_REF
+COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
+RUN --mount=type=cache,target=/root/.cache/uv \
+    mkdir -p /tmp/deepgemm/dist && \
+    VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh \
+        --cuda-version "${CUDA_VERSION}" \
+        ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} \
+        --wheel-dir /tmp/deepgemm/dist || \
+    echo "DeepGEMM build skipped (CUDA version requirement not met)"
+
+# Ensure the wheel dir exists so COPY won't fail when DeepGEMM is skipped
+RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
+
+# Build pplx-kernels and DeepEP wheels
+COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
+ARG PPLX_COMMIT_HASH
+ARG DEEPEP_COMMIT_HASH
+RUN --mount=type=cache,target=/root/.cache/uv \
+    mkdir -p /tmp/ep_kernels_workspace/dist && \
+    export TORCH_CUDA_ARCH_LIST='9.0a 10.0a' && \
+    /tmp/install_python_libraries.sh \
+        --workspace /tmp/ep_kernels_workspace \
+        --mode wheel \
+        ${PPLX_COMMIT_HASH:+--pplx-ref "$PPLX_COMMIT_HASH"} \
+        ${DEEPEP_COMMIT_HASH:+--deepep-ref "$DEEPEP_COMMIT_HASH"} && \
+    find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete
+#################### EXTENSIONS BUILD IMAGE ####################
+
 #################### WHEEL BUILD IMAGE ####################
 FROM base AS build
 ARG TARGETPLATFORM
@@ -262,6 +302,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \

 WORKDIR /workspace

+# Copy pre-built csrc wheel directly
 COPY --from=csrc-build /workspace/dist /precompiled-wheels

 COPY . .
@@ -283,27 +324,9 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     fi && \
     python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38

-# Install DeepGEMM from source
-ARG DEEPGEMM_GIT_REF
-COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
-RUN --mount=type=cache,target=/root/.cache/uv \
-    VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} --wheel-dir /tmp/deepgemm/dist
-
-# Ensure the wheel dir exists so later-stage COPY won't fail when DeepGEMM is skipped
-RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
-
-COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
-# Install EP kernels(pplx-kernels and DeepEP)
-ARG PPLX_COMMIT_HASH
-ARG DEEPEP_COMMIT_HASH
-RUN --mount=type=cache,target=/root/.cache/uv \
-    export TORCH_CUDA_ARCH_LIST='9.0a 10.0a' && \
-    /tmp/install_python_libraries.sh \
-        --workspace /tmp/ep_kernels_workspace \
-        --mode wheel \
-        ${PPLX_COMMIT_HASH:+--pplx-ref "$PPLX_COMMIT_HASH"} \
-        ${DEEPEP_COMMIT_HASH:+--deepep-ref "$DEEPEP_COMMIT_HASH"} && \
-    find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete
+# Copy extension wheels from extensions-build stage for later use
+COPY --from=extensions-build /tmp/deepgemm/dist /tmp/deepgemm/dist
+COPY --from=extensions-build /tmp/ep_kernels_workspace/dist /tmp/ep_kernels_workspace/dist

 # Check the size of the wheel if RUN_WHEEL_CHECK is true
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
@@ -341,32 +364,25 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \
     --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 #################### DEV IMAGE ####################

 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
 FROM ${FINAL_BASE_IMAGE} AS vllm-base

 ARG CUDA_VERSION
 ARG PYTHON_VERSION
-ARG INSTALL_KV_CONNECTORS=false
-WORKDIR /vllm-workspace
-ENV DEBIAN_FRONTEND=noninteractive
-ARG TARGETPLATFORM
-
-# TODO (huydhn): There is no prebuilt gdrcopy package on 12.9 at the moment
-ARG GDRCOPY_CUDA_VERSION=12.8
-# Keep in line with FINAL_BASE_IMAGE
-ARG GDRCOPY_OS_VERSION=Ubuntu22_04
-
-SHELL ["/bin/bash", "-c"]
-
 ARG DEADSNAKES_MIRROR_URL
 ARG DEADSNAKES_GPGKEY_URL
 ARG GET_PIP_URL

+ENV DEBIAN_FRONTEND=noninteractive
+WORKDIR /vllm-workspace
+
+# Python version string for paths (e.g., "312" for 3.12)
 RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
     echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment

-# Install Python and other dependencies
+# Install Python and system dependencies
 RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
     && apt-get update -y \
@@ -405,63 +421,104 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
     && python3 --version && python3 -m pip --version

-# Install CUDA development tools and build essentials for runtime JIT compilation
+# Install CUDA development tools for runtime JIT compilation
 # (FlashInfer, DeepGEMM, EP kernels all require compilation at runtime)
 RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && \
     apt-get update -y && \
     apt-get install -y --no-install-recommends \
         cuda-nvcc-${CUDA_VERSION_DASH} \
         cuda-cudart-${CUDA_VERSION_DASH} \
         cuda-nvrtc-${CUDA_VERSION_DASH} \
         cuda-cuobjdump-${CUDA_VERSION_DASH} \
-        # https://github.com/vllm-project/vllm/issues/29590
         libcurand-dev-${CUDA_VERSION_DASH} \
         libcublas-${CUDA_VERSION_DASH} \
         # Fixes nccl_allocator requiring nccl.h at runtime
         # https://github.com/vllm-project/vllm/blob/1336a1ea244fa8bfd7e72751cabbdb5b68a0c11a/vllm/distributed/device_communicators/pynccl_allocator.py#L22
         libnccl-dev && \
     rm -rf /var/lib/apt/lists/*

+# Install uv for faster pip installs
+RUN python3 -m pip install uv
+
+# Environment for uv
+ENV UV_HTTP_TIMEOUT=500
+ENV UV_INDEX_STRATEGY="unsafe-best-match"
+ENV UV_LINK_MODE=copy
+
+# Workaround for triton/pytorch issues
+RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
+
+# ============================================================
+# SLOW-CHANGING DEPENDENCIES BELOW
+# These are the expensive layers that we want to cache
+# ============================================================
+
+# Install PyTorch and core CUDA dependencies
+# This is ~2GB and rarely changes
+ARG PYTORCH_CUDA_INDEX_BASE_URL
+COPY requirements/common.txt /tmp/common.txt
+COPY requirements/cuda.txt /tmp/requirements-cuda.txt
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system -r /tmp/requirements-cuda.txt \
+    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') && \
+    rm /tmp/requirements-cuda.txt /tmp/common.txt
+
+# Install FlashInfer pre-compiled kernel cache and binaries
+# This is ~1.1GB and only changes when FlashInfer version bumps
+# https://docs.flashinfer.ai/installation.html
+ARG FLASHINFER_VERSION=0.5.3
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system flashinfer-cubin==${FLASHINFER_VERSION} \
+    && uv pip install --system flashinfer-jit-cache==${FLASHINFER_VERSION} \
+        --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
+    && flashinfer show-config
+
+# ============================================================
+# OPENAI API SERVER DEPENDENCIES
+# Pre-install these to avoid reinstalling on every vLLM wheel rebuild
+# ============================================================
+
+# Install gdrcopy (saves ~6s per build)
+# TODO (huydhn): There is no prebuilt gdrcopy package on 12.9 at the moment
+ARG GDRCOPY_CUDA_VERSION=12.8
+ARG GDRCOPY_OS_VERSION=Ubuntu22_04
+ARG TARGETPLATFORM
+COPY tools/install_gdrcopy.sh /tmp/install_gdrcopy.sh
+RUN set -eux; \
+    case "${TARGETPLATFORM}" in \
+        linux/arm64) UUARCH="aarch64" ;; \
+        linux/amd64) UUARCH="x64" ;; \
+        *) echo "Unsupported TARGETPLATFORM: ${TARGETPLATFORM}" >&2; exit 1 ;; \
+    esac; \
+    /tmp/install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}" && \
+    rm /tmp/install_gdrcopy.sh
+
+# Install vllm-openai dependencies (saves ~2.6s per build)
+# These are stable packages that don't depend on vLLM itself
+RUN --mount=type=cache,target=/root/.cache/uv \
+    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+        BITSANDBYTES_VERSION="0.42.0"; \
+    else \
+        BITSANDBYTES_VERSION="0.46.1"; \
+    fi; \
+    uv pip install --system accelerate hf_transfer modelscope \
+        "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.3'
+
+# ============================================================
+# VLLM INSTALLATION (depends on build stage)
+# ============================================================
+
 ARG PIP_INDEX_URL UV_INDEX_URL
 ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
 ARG PYTORCH_CUDA_INDEX_BASE_URL
 ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER

-# Install uv for faster pip installs
-RUN --mount=type=cache,target=/root/.cache/uv \
-    python3 -m pip install uv
-
-# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
-# Reference: https://github.com/astral-sh/uv/pull/1694
-ENV UV_HTTP_TIMEOUT=500
-ENV UV_INDEX_STRATEGY="unsafe-best-match"
-# Use copy mode to avoid hardlink failures with Docker cache mounts
-ENV UV_LINK_MODE=copy
-
-# Workaround for https://github.com/openai/triton/issues/2507 and
-# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
-# this won't be needed for future versions of this docker image
-# or future versions of triton.
-RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
-
 # Install vllm wheel first, so that torch etc will be installed.
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
     --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system dist/*.whl --verbose \
     --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

-# Install FlashInfer pre-compiled kernel cache and binaries
-# https://docs.flashinfer.ai/installation.html
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system flashinfer-cubin==0.5.3 \
-    && uv pip install --system flashinfer-jit-cache==0.5.3 \
-        --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
-    && flashinfer show-config
-
-COPY examples examples
-COPY benchmarks benchmarks
-COPY ./vllm/collect_env.py .
-
 RUN --mount=type=cache,target=/root/.cache/uv \
     . /etc/environment && \
     uv pip list
@@ -475,7 +532,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
         echo "No DeepGEMM wheels to install; skipping."; \
     fi'

-# Pytorch now installs NVSHMEM, setting LD_LIBRARY_PATH (https://github.com/pytorch/pytorch/blob/d38164a545b4a4e4e0cf73ce67173f70574890b6/.ci/manywheel/build_cuda.sh#L141C14-L141C36)
+# Pytorch now installs NVSHMEM, setting LD_LIBRARY_PATH
 ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH

 # Install EP kernels wheels (pplx-kernels and DeepEP) that have been built in the `build` stage
@@ -484,23 +541,17 @@ RUN --mount=type=bind,from=build,src=/tmp/ep_kernels_workspace/dist,target=/vllm
     uv pip install --system ep_kernels/dist/*.whl --verbose \
     --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

-RUN --mount=type=bind,source=tools/install_gdrcopy.sh,target=/tmp/install_gdrcopy.sh,ro \
-    set -eux; \
-    case "${TARGETPLATFORM}" in \
-        linux/arm64) UUARCH="aarch64" ;; \
-        linux/amd64) UUARCH="x64" ;; \
-        *) echo "Unsupported TARGETPLATFORM: ${TARGETPLATFORM}" >&2; exit 1 ;; \
-    esac; \
-    /tmp/install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}"
-
 # CUDA image changed from /usr/local/nvidia to /usr/local/cuda in 12.8 but will
 # return to /usr/local/nvidia in 13.0 to allow container providers to mount drivers
 # consistently from the host (see https://github.com/vllm-project/vllm/issues/18859).
 # Until then, add /usr/local/nvidia/lib64 before the image cuda path to allow override.
 ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}

+# Copy examples and benchmarks at the end to minimize cache invalidation
+COPY examples examples
+COPY benchmarks benchmarks
+COPY ./vllm/collect_env.py .
 #################### vLLM installation IMAGE ####################

 #################### TEST IMAGE ####################
 # image to run unit testing suite
 # note that this uses vllm installed by `pip`
@@ -566,18 +617,12 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500

-# install additional dependencies for openai api server
+# install kv_connectors if requested
 RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,source=requirements/kv_connectors.txt,target=/tmp/kv_connectors.txt,ro \
     if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \
         uv pip install --system -r /tmp/kv_connectors.txt; \
-    fi; \
-    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        BITSANDBYTES_VERSION="0.42.0"; \
-    else \
-        BITSANDBYTES_VERSION="0.46.1"; \
-    fi; \
-    uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.0'
+    fi

 ENV VLLM_USAGE_SOURCE production-docker-image

@@ -76,6 +76,9 @@ RUN python3 -m pip install -e tests/vllm_test_utils
 ENV NIXL_VERSION=0.7.0
 RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py

+# PyJWT-2.7.0 will influence some wheel behaviors, remove its dist-info to avoid conflicts
+RUN rm /usr/lib/python3/dist-packages/PyJWT-2.7.0.dist-info/ -rf
+
 # remove torch bundled oneccl to avoid conflicts
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip uninstall oneccl oneccl-devel -y
@@ -59,6 +59,7 @@ nav:
   - CLI Reference: cli
   - Community:
     - community/*
+    - Governance: governance
   - Blog: https://blog.vllm.ai
   - Forum: https://discuss.vllm.ai
  - Slack: https://slack.vllm.ai
@@ -15,6 +15,7 @@ API documentation for vLLM's configuration classes.
 - [vllm.config.MultiModalConfig][]
 - [vllm.config.PoolerConfig][]
 - [vllm.config.StructuredOutputsConfig][]
+- [vllm.config.ProfilerConfig][]
 - [vllm.config.ObservabilityConfig][]
 - [vllm.config.KVTransferConfig][]
 - [vllm.config.CompilationConfig][]
Binary file not shown. (Image changed: 174 KiB before, 205 KiB after.)
@@ -84,7 +84,7 @@ Total input tokens: 1369
 Total generated tokens: 2212
 Request throughput (req/s): 1.73
 Output token throughput (tok/s): 382.89
-Total Token throughput (tok/s): 619.85
+Total token throughput (tok/s): 619.85
 ---------------Time to First Token----------------
 Mean TTFT (ms): 71.54
 Median TTFT (ms): 73.88
@@ -24,11 +24,13 @@ Compute Resources:
 - Databricks
 - DeepInfra
 - Google Cloud
+- IBM
 - Intel
 - Lambda Lab
 - Nebius
 - Novita AI
 - NVIDIA
+- Red Hat
 - Replicate
 - Roblox
 - RunPod
@@ -7,7 +7,7 @@ This guide covers optimization strategies and performance tuning for vLLM V1.

 ## Preemption

-Due to the auto-regressive nature of transformer architecture, there are times when KV cache space is insufficient to handle all batched requests.
+Due to the autoregressive nature of transformer architecture, there are times when KV cache space is insufficient to handle all batched requests.
 In such cases, vLLM can preempt requests to free up KV cache space for other requests. Preempted requests are recomputed when sufficient KV cache space becomes
 available again. When this occurs, you may see the following warning:
160
docs/contributing/ci/nightly_builds.md
Normal file
@@ -0,0 +1,160 @@
# Nightly Builds of vLLM Wheels

vLLM maintains a per-commit wheel repository (commonly referred to as "nightly") at `https://wheels.vllm.ai` that provides pre-built wheels for every commit on the `main` branch since `v0.5.3`. This document explains how the nightly wheel index mechanism works.

## Build and Upload Process on CI

### Wheel Building

Wheels are built in the `Release` pipeline (`.buildkite/release-pipeline.yaml`) after a PR is merged into the main branch, with multiple variants:

- **Backend variants**: `cpu` and `cuXXX` (e.g., `cu129`, `cu130`).
- **Architecture variants**: `x86_64` and `aarch64`.

Each build step:

1. Builds the wheel in a Docker container.
2. Renames the wheel filename to use the correct manylinux tag (currently `manylinux_2_31`) for PEP 600 compliance (see the sketch after this list).
3. Uploads the wheel to S3 bucket `vllm-wheels` under `/{commit_hash}/`.
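The retagging in step 2 is a plain filename rewrite. A minimal Python sketch (illustrative only; the real logic lives in `.buildkite/scripts/upload-wheels.sh`):

```python
import re

def retag_manylinux(wheel_name: str, tag: str = "manylinux_2_31") -> str:
    # Swap the legacy platform tag (manylinux1, manylinux2014, ...) for the
    # PEP 600 tag. Illustrative sketch, not the CI script's actual code.
    return re.sub(r"manylinux[0-9_]*[0-9]", tag, wheel_name)

print(retag_manylinux("vllm-0.11.2.dev278+gdbc3d9991-cp38-abi3-manylinux1_x86_64.whl"))
# vllm-0.11.2.dev278+gdbc3d9991-cp38-abi3-manylinux_2_31_x86_64.whl
```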
### Index Generation

After uploading each wheel, the `.buildkite/scripts/upload-wheels.sh` script:

1. **Lists all existing wheels** in the commit directory from S3
2. **Generates indices** using `.buildkite/scripts/generate-nightly-index.py`:
    - Parses wheel filenames to extract metadata (version, variant, platform tags).
    - Creates HTML index files (`index.html`) for PyPI compatibility.
    - Generates machine-readable `metadata.json` files.
3. **Uploads indices** to multiple locations (overriding existing ones):
    - `/{commit_hash}/` - Always uploaded for commit-specific access.
    - `/nightly/` - Only for commits on the `main` branch (not PRs).
    - `/{version}/` - Only for release wheels (no `dev` in the version).

!!! tip "Handling Concurrent Builds"

    The index generation script can handle multiple variants being built concurrently by always listing all wheels in the commit directory before generating indices, avoiding race conditions.

## Directory Structure

The S3 bucket structure follows this pattern:

```text
s3://vllm-wheels/
├── {commit_hash}/              # Commit-specific wheels and indices
│   ├── vllm-*.whl              # All wheel files
│   ├── index.html              # Project list (default variant)
│   ├── vllm/
│   │   ├── index.html          # Package index (default variant)
│   │   └── metadata.json       # Metadata (default variant)
│   ├── cu129/                  # Variant subdirectory
│   │   ├── index.html          # Project list (cu129 variant)
│   │   └── vllm/
│   │       ├── index.html      # Package index (cu129 variant)
│   │       └── metadata.json   # Metadata (cu129 variant)
│   ├── cu130/                  # Variant subdirectory
│   ├── cpu/                    # Variant subdirectory
│   └── .../                    # More variant subdirectories
├── nightly/                    # Latest main branch wheels (mirror of latest commit)
└── {version}/                  # Release version indices (e.g., 0.11.2)
```

All built wheels are stored in `/{commit_hash}/`, and the different indices simply reference them, which avoids duplicating the wheel files themselves.

For example, you can specify the following URLs to use different indices:

- `https://wheels.vllm.ai/nightly/cu130` for the latest main branch wheels built with CUDA 13.0.
- `https://wheels.vllm.ai/{commit_hash}` for wheels built at a specific commit (default variant).
- `https://wheels.vllm.ai/0.12.0/cpu` for 0.12.0 release wheels built for the CPU variant.

Please note that not all variants are present on every commit. The available variants are subject to change over time, e.g., changing cu130 to cu131.
### Variant Organization

Indices are organized by variant:

- **Default variant**: Wheels without a variant suffix (i.e., built with the current `VLLM_MAIN_CUDA_VERSION`) are placed in the root.
- **Variant subdirectories**: Wheels with variant suffixes (e.g., `+cu130`, `.cpu`) are organized into subdirectories.
- **Alias to default**: The default variant can have an alias (e.g., `cu129` for now) for consistency and convenience.

The variant is extracted from the wheel filename (as described in the [file name convention](https://packaging.python.org/en/latest/specifications/binary-distribution-format/#file-name-convention)):

- The variant is encoded in the local version identifier (e.g. `+cu129` or `dev<N>+g<hash>.cu130`).
- Examples:
    - `vllm-0.11.2.dev278+gdbc3d9991-cp38-abi3-manylinux1_x86_64.whl` → default variant
    - `vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl` → `cu129` variant
    - `vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl` → `cu130` variant
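A rough sketch of that variant extraction (illustrative; the actual parsing in `generate-nightly-index.py` may differ in detail):

```python
import re

VARIANT = re.compile(r"^(cu\d+|cpu)$")

def wheel_variant(filename: str) -> str:
    """Extract the build variant from a wheel filename (sketch only)."""
    version = filename.split("-")[1]     # vllm-<version>-<py>-<abi>-<plat>.whl
    if "+" not in version:
        return "default"
    local = version.split("+", 1)[1]     # local version identifier
    for part in local.split("."):
        if VARIANT.match(part):
            return part
    return "default"

assert wheel_variant("vllm-0.11.2.dev278+gdbc3d9991-cp38-abi3-manylinux1_x86_64.whl") == "default"
assert wheel_variant("vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl") == "cu129"
assert wheel_variant("vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl") == "cu130"
```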
## Index Generation Details

The `generate-nightly-index.py` script performs the following:

1. **Parses wheel filenames** using regex to extract:
    - Package name
    - Version (with the variant extracted)
    - Python tag, ABI tag, platform tag
    - Build tag (if present)
2. **Groups wheels by variant**, then by package name:
    - Currently only `vllm` is built, but the structure supports multiple packages in the future.
3. **Generates HTML indices** (compliant with the [Simple repository API](https://packaging.python.org/en/latest/specifications/simple-repository-api/#simple-repository-api)):
    - Top-level `index.html`: Lists all packages and variant subdirectories
    - Package-level `index.html`: Lists all wheel files for that package
    - Uses relative paths to wheel files for portability
4. **Generates metadata.json** (a condensed sketch of steps 3–4 follows this list):
    - Machine-readable JSON containing all wheel metadata
    - Includes a `path` field with the URL-encoded relative path to the wheel file
    - Used by `setup.py` to locate compatible pre-compiled wheels during Python-only builds
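A condensed Python sketch of steps 3–4 (a hypothetical helper; the metadata field names here are illustrative, and the actual script differs in structure):

```python
import html
import json
from urllib.parse import quote

def write_package_index(wheel_names: list[str], out_dir: str = ".") -> None:
    """Emit a minimal Simple-API index.html plus metadata.json (sketch only)."""
    # Relative, URL-encoded links so the index works behind any URL prefix.
    links = "\n".join(
        f'<a href="../{quote(name)}">{html.escape(name)}</a><br/>'
        for name in wheel_names
    )
    with open(f"{out_dir}/index.html", "w") as f:
        f.write(f"<!DOCTYPE html>\n<html>\n<body>\n{links}\n</body>\n</html>\n")
    # Machine-readable companion used to locate compatible wheels.
    metadata = [{"filename": n, "path": "../" + quote(n)} for n in wheel_names]
    with open(f"{out_dir}/metadata.json", "w") as f:
        json.dump(metadata, f, indent=2)

write_package_index(["vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl"])
```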
### Special Handling for AWS Services

The wheels and indices are stored directly on AWS S3, and we use AWS CloudFront as a CDN in front of the S3 bucket.

Since S3 does not provide proper directory listing, to support PyPI-compatible simple repository API behavior, we deploy a CloudFront Function that:

- redirects any URL that does not end with `/` and does not look like a file (i.e., does not contain a dot `.` in the last path segment) to the same URL with a trailing `/`
- appends `/index.html` to any URL that ends with `/`

For example, the following requests would be handled as:

- `/nightly` -> `/nightly/index.html`
- `/nightly/cu130/` -> `/nightly/cu130/index.html`
- `/nightly/index.html` or `/nightly/vllm.whl` -> unchanged
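CloudFront Functions are written in JavaScript, but the routing rule is small enough to restate as a Python sketch (the `/nightly` case resolves in two hops: a redirect to `/nightly/`, then the index append on the follow-up request):

```python
def rewrite(uri: str) -> str:
    # One CloudFront invocation: append index.html to directory-style URLs,
    # redirect extension-less paths to their trailing-slash form.
    if uri.endswith("/"):
        return uri + "index.html"
    if "." not in uri.rsplit("/", 1)[-1]:
        return uri + "/"  # served as an HTTP redirect; the next request hits index.html
    return uri

assert rewrite("/nightly") == "/nightly/"
assert rewrite("/nightly/cu130/") == "/nightly/cu130/index.html"
assert rewrite("/nightly/vllm.whl") == "/nightly/vllm.whl"
```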
!!! note "AWS S3 Filename Escaping"

    S3 will automatically escape filenames upon upload according to its [naming rule](https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html). The direct impact on vLLM is that `+` in filenames will be converted to `%2B`. We take special care in the index generation script to escape filenames properly when generating the HTML indices and JSON metadata, to ensure the URLs are correct and can be used directly.
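The escaping is the standard percent-encoding, e.g.:

```python
from urllib.parse import quote

# "+" in a wheel filename must become "%2B" in generated links.
print(quote("vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl"))
# vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl
```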
## Usage of precompiled wheels in `setup.py` {#precompiled-wheels-usage}

When installing vLLM with `VLLM_USE_PRECOMPILED=1`, the `setup.py` script:

1. **Determines the wheel location** via `precompiled_wheel_utils.determine_wheel_url()`:
    - The env var `VLLM_PRECOMPILED_WHEEL_LOCATION` (a user-specified URL/path) always takes precedence and skips all other steps.
    - Determines the variant from `VLLM_MAIN_CUDA_VERSION` (can be overridden with the env var `VLLM_PRECOMPILED_WHEEL_VARIANT`); the default variant will also be tried as a fallback.
    - Determines the _base commit_ (explained later) of this branch (can be overridden with the env var `VLLM_PRECOMPILED_WHEEL_COMMIT`).
2. **Fetches metadata** from `https://wheels.vllm.ai/{commit}/vllm/metadata.json` (for the default variant) or `https://wheels.vllm.ai/{commit}/{variant}/vllm/metadata.json` (for a specific variant).
3. **Selects a compatible wheel** based on:
    - Package name (`vllm`)
    - Platform tag (architecture match)
4. **Downloads and extracts** precompiled binaries from the wheel:
    - C++ extension modules (`.so` files)
    - Flash Attention Python modules
    - Triton kernel Python files
5. **Patches package_data** to include the extracted files in the installation
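Steps 2–3 boil down to a metadata lookup plus a platform filter. A rough sketch (a hypothetical helper; `precompiled_wheel_utils` in `setup.py` is the authoritative implementation, and the `path` joining here is simplified):

```python
import json
import platform
from urllib.request import urlopen

def find_wheel_url(commit: str, variant: str | None = None) -> str:
    base = f"https://wheels.vllm.ai/{commit}" + (f"/{variant}" if variant else "")
    with urlopen(f"{base}/vllm/metadata.json") as resp:
        wheels = json.load(resp)
    arch = platform.machine()  # e.g. "x86_64" or "aarch64"
    for wheel in wheels:       # assumes a list of entries with a "path" field
        if arch in wheel["path"]:
            return f"{base}/vllm/{wheel['path']}"
    raise RuntimeError(f"no precompiled wheel for {arch} at commit {commit}")
```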
!!! note "What is the base commit?"

    The base commit is determined by finding the merge-base between the current branch and upstream `main`, ensuring compatibility between source code and precompiled binaries.
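In git terms, that is (a sketch; how `setup.py` resolves the upstream remote may differ):

```python
import subprocess

# Sketch: the merge-base with upstream main is the commit whose prebuilt
# binaries are expected to match this branch's native code.
base_commit = subprocess.check_output(
    ["git", "merge-base", "HEAD", "origin/main"], text=True
).strip()
print(f"https://wheels.vllm.ai/{base_commit}/vllm/metadata.json")
```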
_Note: it is the user's responsibility to ensure there are no native code (e.g., C++ or CUDA) changes before using precompiled wheels._

## Implementation Files

Key files involved in the nightly wheel mechanism:

- **`.buildkite/release-pipeline.yaml`**: CI pipeline that builds the wheels
- **`.buildkite/scripts/upload-wheels.sh`**: Script that uploads wheels and generates indices
- **`.buildkite/scripts/generate-nightly-index.py`**: Python script that generates PyPI-compatible indices
- **`setup.py`**: Contains the `precompiled_wheel_utils` class for fetching and using precompiled wheels