Add FlashInfer wheel building capability to buildkite pipeline

Signed-off-by: mgoin <michael@neuralmagic.com>
This commit is contained in:
mgoin 2025-07-30 11:48:04 -04:00
parent 176bbce1db
commit f626cc9300
5 changed files with 184 additions and 25 deletions

View File

@ -41,6 +41,40 @@ steps:
env:
DOCKER_BUILDKIT: "1"
# Manual gate: FlashInfer wheels are only built after a user unblocks this step.
- block: "Build FlashInfer wheels"
key: block-build-flashinfer-wheels
depends_on: ~
# One wheel-build step per supported CUDA release; each invokes the same
# script with the full CUDA toolkit version as its single argument.
- label: "Build FlashInfer wheels - CUDA 12.8"
depends_on: block-build-flashinfer-wheels
id: build-flashinfer-wheel-cuda-12-8
agents:
queue: cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/build-flashinfer-wheel.sh 12.8.1"
env:
DOCKER_BUILDKIT: "1"
- label: "Build FlashInfer wheels - CUDA 12.6"
depends_on: block-build-flashinfer-wheels
id: build-flashinfer-wheel-cuda-12-6
agents:
queue: cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/build-flashinfer-wheel.sh 12.6.3"
env:
DOCKER_BUILDKIT: "1"
- label: "Build FlashInfer wheels - CUDA 11.8"
depends_on: block-build-flashinfer-wheels
id: build-flashinfer-wheel-cuda-11-8
agents:
queue: cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/build-flashinfer-wheel.sh 11.8.0"
env:
DOCKER_BUILDKIT: "1"
- block: "Build release image"
depends_on: ~
key: block-release-image-build

View File

@ -0,0 +1,30 @@
#!/usr/bin/env bash
# Build a FlashInfer wheel for one CUDA version using the vLLM Dockerfile's
# dedicated `flashinfer-wheel-builder` stage, extract it to artifacts/dist/,
# and hand off to the upload script.
#
# Usage: build-flashinfer-wheel.sh [CUDA_VERSION]
#   $1 / CUDA_VERSION    CUDA toolkit version to build against (default: 12.8.1)
#   FLASHINFER_VERSION   FlashInfer git ref to build (env var, default: v0.2.9rc2)
#
# -u catches unset variables, pipefail surfaces failures inside pipelines;
# -x keeps the CI log traceable.
set -euxo pipefail

CUDA_VERSION="${1:-12.8.1}"
FLASHINFER_VERSION="${FLASHINFER_VERSION:-v0.2.9rc2}"

echo "Building FlashInfer wheel for CUDA ${CUDA_VERSION} using vLLM Dockerfile"

# Build the FlashInfer wheel using the existing Dockerfile stage.
# The tag is quoted so an unexpected CUDA_VERSION can't word-split the command.
DOCKER_BUILDKIT=1 docker build \
    --build-arg max_jobs=16 \
    --build-arg USE_SCCACHE=1 \
    --build-arg CUDA_VERSION="${CUDA_VERSION}" \
    --build-arg FLASHINFER_GIT_REF="${FLASHINFER_VERSION}" \
    --tag "flashinfer-wheel-builder:${CUDA_VERSION}" \
    --target flashinfer-wheel-builder \
    --progress plain \
    -f docker/Dockerfile .

# Extract the wheel from /output inside the builder image into artifacts/dist
# on the host. "$(pwd)" is quoted so checkout paths with spaces still mount.
mkdir -p artifacts/dist
docker run --rm -v "$(pwd)/artifacts:/output_host" "flashinfer-wheel-builder:${CUDA_VERSION}" \
    bash -c 'cp /output/*.whl /output_host/dist/ && chmod -R a+rw /output_host'

# Upload the wheel (and its pip index page) to S3.
bash .buildkite/scripts/upload-flashinfer-wheels.sh

echo "FlashInfer wheel built and uploaded successfully for CUDA ${CUDA_VERSION}"
ls -la artifacts/dist/

View File

@ -0,0 +1,52 @@
#!/usr/bin/env bash
# Upload a freshly built FlashInfer wheel (expected in artifacts/dist/) to the
# vllm-wheels S3 bucket: rename its platform tag to manylinux1 for pip
# compatibility and publish a minimal PEP 503-style index.html next to it.
set -euxo pipefail

# Assume wheels are in artifacts/dist/*.whl.
wheel_files=(artifacts/dist/*.whl)

# Check that exactly one wheel is found. With nullglob unset an unmatched glob
# stays as the literal pattern (count 1), so also verify the entry is a real
# file instead of failing later with a confusing error.
if [[ ${#wheel_files[@]} -ne 1 || ! -f "${wheel_files[0]}" ]]; then
    echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
    exit 1
fi

# Get the single wheel file.
wheel="${wheel_files[0]}"
echo "Processing FlashInfer wheel: $wheel"

# Rename 'linux' to 'manylinux1' in the wheel *filename* only — substituting
# on the full path would corrupt it if a parent directory contained 'linux'.
wheel_dir="$(dirname "$wheel")"
wheel_base="$(basename "$wheel")"
new_wheel="${wheel_dir}/${wheel_base/linux/manylinux1}"
if [[ "$wheel" != "$new_wheel" ]]; then
    mv -- "$wheel" "$new_wheel"
    wheel="$new_wheel"
    echo "Renamed wheel to: $wheel"
fi

# Extract the package version from the wheel's METADATA file.
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
echo "FlashInfer version: $version"

# Upload the wheel to S3 under the flashinfer directory.
aws s3 cp "$wheel" "s3://vllm-wheels/flashinfer/"

# Generate a simple index.html for the package (following the pip index
# pattern). NOTE(review): this overwrites any previously uploaded index, so
# only the most recent wheel stays linked — confirm that is intended.
wheel_name=$(basename "$wheel")
cat > flashinfer_index.html << EOF
<!DOCTYPE html>
<html>
<head><title>Links for flashinfer-python</title></head>
<body>
<h1>Links for flashinfer-python</h1>
<a href="$wheel_name">$wheel_name</a><br/>
</body>
</html>
EOF
aws s3 cp flashinfer_index.html "s3://vllm-wheels/flashinfer/index.html"

# Clean up the temporary index file.
rm -f flashinfer_index.html

echo "Successfully uploaded FlashInfer wheel $wheel_name (version $version)"

View File

@ -268,6 +268,17 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
else \
echo "Skipping wheel size check."; \
fi
#################### FLASHINFER WHEEL BUILD IMAGE ####################
# Standalone stage used by the buildkite wheel pipeline: runs the shared
# build script in wheel mode so the resulting .whl lands in /output, from
# where the CI script copies it out of the image.
FROM base AS flashinfer-wheel-builder
# CUDA toolkit version; build-flashinfer.sh selects the arch list from it.
ARG CUDA_VERSION
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
ARG FLASHINFER_GIT_REF="v0.2.9rc2"
COPY tools/build-flashinfer.sh /tmp/build-flashinfer.sh
# Cache uv downloads across builds; /etc/environment supplies build-time env
# (ARGs above are also visible to the script as environment variables).
RUN --mount=type=cache,target=/root/.cache/uv \
. /etc/environment && \
BUILD_WHEEL=true /tmp/build-flashinfer.sh
#################### EXTENSION Build IMAGE ####################
#################### DEV IMAGE ####################
@ -395,31 +406,10 @@ ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt
# We use `--force-reinstall --no-deps` to avoid issues with the existing FlashInfer wheel.
ARG FLASHINFER_GIT_REF="v0.2.9rc2"
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
. /etc/environment
git clone --depth 1 --recursive --shallow-submodules \
--branch ${FLASHINFER_GIT_REF} \
${FLASHINFER_GIT_REPO} flashinfer
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
# TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
if [[ "${CUDA_VERSION}" == 11.* ]]; then
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
else
# CUDA 12.8+ supports 10.0a and 12.0
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
fi
echo "🏗️ Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
# Needed to build AOT kernels
pushd flashinfer
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
python3 -m flashinfer.aot
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
uv pip install --system --no-build-isolation --force-reinstall --no-deps .
popd
rm -rf flashinfer
BASH
# Build and install FlashInfer via the shared helper script (install mode:
# BUILD_WHEEL defaults to false, so it installs into this image directly).
COPY tools/build-flashinfer.sh /tmp/build-flashinfer.sh
RUN --mount=type=cache,target=/root/.cache/uv \
. /etc/environment && \
/tmp/build-flashinfer.sh
COPY examples examples
COPY benchmarks benchmarks
COPY ./vllm/collect_env.py .

53
tools/build-flashinfer.sh Executable file
View File

@ -0,0 +1,53 @@
#!/usr/bin/env bash
# Build FlashInfer with AOT kernels.
# Shared by the Dockerfile (install into the image) and the standalone wheel
# pipeline (BUILD_WHEEL=true drops the wheel into /output).
#
# Environment variables:
#   FLASHINFER_GIT_REPO  repository to clone (default: upstream flashinfer)
#   FLASHINFER_GIT_REF   git tag/branch to build (default: v0.2.9rc2)
#   CUDA_VERSION         CUDA toolkit version; selects the arch list (default: 12.8.1)
#   BUILD_WHEEL          "true" to build a wheel into /output, else install directly
#
# -u catches unset variables, pipefail surfaces pipeline failures, -x traces.
set -euxo pipefail

FLASHINFER_GIT_REPO="${FLASHINFER_GIT_REPO:-https://github.com/flashinfer-ai/flashinfer.git}"
FLASHINFER_GIT_REF="${FLASHINFER_GIT_REF:-v0.2.9rc2}"
CUDA_VERSION="${CUDA_VERSION:-12.8.1}"
BUILD_WHEEL="${BUILD_WHEEL:-false}"

echo "🏗️ Building FlashInfer ${FLASHINFER_GIT_REF} for CUDA ${CUDA_VERSION}"

# Clone FlashInfer (shallow, with the submodules the build needs). Ref and
# repo are quoted so unusual values cannot word-split or glob.
git clone --depth 1 --recursive --shallow-submodules \
    --branch "${FLASHINFER_GIT_REF}" \
    "${FLASHINFER_GIT_REPO}" flashinfer

# Set CUDA arch list based on CUDA version.
# Exclude arches unsupported by older toolkits (11.x and 12.0-12.7).
if [[ "${CUDA_VERSION}" == 11.* ]]; then
    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
    # NOTE(review): this pattern also matches 12.10+ — confirm before CUDA 12.10 exists.
    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
else
    # CUDA 12.8+ additionally supports 10.0a and 12.0.
    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
fi
echo "🏗️ Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}"

# Build AOT kernels, then either build a wheel or install in place.
pushd flashinfer
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
    python3 -m flashinfer.aot
if [[ "${BUILD_WHEEL}" == "true" ]]; then
    # Build a wheel for distribution; callers copy it out of /output.
    TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
        uv pip wheel --no-deps --wheel-dir /wheels .
    mkdir -p /output && cp /wheels/*.whl /output/
    echo "✅ FlashInfer wheel built successfully"
else
    # Install directly into the current environment (Dockerfile path).
    TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
        uv pip install --system --no-build-isolation --force-reinstall --no-deps .
    echo "✅ FlashInfer installed successfully"
fi
popd

# Cleanup: the source tree is no longer needed once built/installed.
rm -rf flashinfer