diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 6314afd652340..47866b209e69a 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -41,6 +41,40 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
+  - block: "Build FlashInfer wheels"
+    key: block-build-flashinfer-wheels
+    depends_on: ~
+
+  - label: "Build FlashInfer wheels - CUDA 12.8"
+    depends_on: block-build-flashinfer-wheels
+    id: build-flashinfer-wheel-cuda-12-8
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "bash .buildkite/scripts/build-flashinfer-wheel.sh 12.8.1"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  - label: "Build FlashInfer wheels - CUDA 12.6"
+    depends_on: block-build-flashinfer-wheels
+    id: build-flashinfer-wheel-cuda-12-6
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "bash .buildkite/scripts/build-flashinfer-wheel.sh 12.6.3"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  - label: "Build FlashInfer wheels - CUDA 11.8"
+    depends_on: block-build-flashinfer-wheels
+    id: build-flashinfer-wheel-cuda-11-8
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "bash .buildkite/scripts/build-flashinfer-wheel.sh 11.8.0"
+    env:
+      DOCKER_BUILDKIT: "1"
+
   - block: "Build release image"
     depends_on: ~
     key: block-release-image-build
diff --git a/.buildkite/scripts/build-flashinfer-wheel.sh b/.buildkite/scripts/build-flashinfer-wheel.sh
new file mode 100755
index 0000000000000..c941bff0fc3e4
--- /dev/null
+++ b/.buildkite/scripts/build-flashinfer-wheel.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+
+set -ex
+
+CUDA_VERSION="${1:-12.8.1}"
+FLASHINFER_VERSION="${FLASHINFER_VERSION:-v0.2.9rc2}"
+
+echo "Building FlashInfer wheel for CUDA ${CUDA_VERSION} using vLLM Dockerfile"
+
+# Build the FlashInfer wheel using the existing Dockerfile stage
+DOCKER_BUILDKIT=1 docker build \
+  --build-arg max_jobs=16 \
+  --build-arg USE_SCCACHE=1 \
+  --build-arg CUDA_VERSION="${CUDA_VERSION}" \
+  --build-arg FLASHINFER_GIT_REF="${FLASHINFER_VERSION}" \
+  --tag "flashinfer-wheel-builder:${CUDA_VERSION}" \
+  --target flashinfer-wheel-builder \
+  --progress plain \
+  -f docker/Dockerfile .
+
+# Extract the wheel
+mkdir -p artifacts/dist
+docker run --rm -v "$(pwd)/artifacts:/output_host" "flashinfer-wheel-builder:${CUDA_VERSION}" \
+  bash -c 'cp /output/*.whl /output_host/dist/ && chmod -R a+rw /output_host'
+
+# Upload the wheel
+bash .buildkite/scripts/upload-flashinfer-wheels.sh
+
+echo "FlashInfer wheel built and uploaded successfully for CUDA ${CUDA_VERSION}"
+ls -la artifacts/dist/
diff --git a/.buildkite/scripts/upload-flashinfer-wheels.sh b/.buildkite/scripts/upload-flashinfer-wheels.sh
new file mode 100755
index 0000000000000..495c20fec2e6e
--- /dev/null
+++ b/.buildkite/scripts/upload-flashinfer-wheels.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+
+set -ex
+
+# Assume wheels are in artifacts/dist/*.whl
+wheel_files=(artifacts/dist/*.whl)
+
+# Check that exactly one wheel is found
+if [[ ${#wheel_files[@]} -ne 1 ]]; then
+  echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
+  exit 1
+fi
+
+# Get the single wheel file
+wheel="${wheel_files[0]}"
+
+echo "Processing FlashInfer wheel: $wheel"
+
+# Rename 'linux' to 'manylinux1' in the wheel filename for compatibility
+new_wheel="${wheel/linux/manylinux1}"
+if [[ "$wheel" != "$new_wheel" ]]; then
+  mv -- "$wheel" "$new_wheel"
+  wheel="$new_wheel"
+  echo "Renamed wheel to: $wheel"
+fi
+
+# Extract the version from the wheel
+version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
+echo "FlashInfer version: $version"
+
+# Upload the wheel to S3 under flashinfer directory
+aws s3 cp "$wheel" "s3://vllm-wheels/flashinfer/"
+
+# Generate simple index.html for the package (following pip index pattern)
+wheel_name=$(basename "$wheel")
+cat > flashinfer_index.html << EOF
+<!DOCTYPE html>
+<html>
+<head><title>Links for flashinfer-python</title></head>
+<body>
+<h1>Links for flashinfer-python</h1>
+<a href="$wheel_name">$wheel_name</a><br/>
+</body>
+</html>
+EOF
+
+aws s3 cp flashinfer_index.html "s3://vllm-wheels/flashinfer/index.html"
+
+# Clean up
+rm -f flashinfer_index.html
+
+echo "Successfully uploaded FlashInfer wheel $wheel_name (version $version)"
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 75b5ab0230c87..9d8c024126fa9 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -268,6 +268,17 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
     else \
         echo "Skipping wheel size check."; \
     fi
+#################### FLASHINFER WHEEL BUILD IMAGE ####################
+FROM base AS flashinfer-wheel-builder
+ARG CUDA_VERSION
+ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
+ARG FLASHINFER_GIT_REF="v0.2.9rc2"
+
+COPY tools/build-flashinfer.sh /tmp/build-flashinfer.sh
+RUN --mount=type=cache,target=/root/.cache/uv \
+    . /etc/environment && \
+    BUILD_WHEEL=true /tmp/build-flashinfer.sh
+
 #################### EXTENSION Build IMAGE ####################
 
 #################### DEV IMAGE ####################
@@ -395,31 +406,10 @@ ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
 # Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt
 # We use `--force-reinstall --no-deps` to avoid issues with the existing FlashInfer wheel.
 ARG FLASHINFER_GIT_REF="v0.2.9rc2"
-RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
-    . /etc/environment
-    git clone --depth 1 --recursive --shallow-submodules \
-        --branch ${FLASHINFER_GIT_REF} \
-        ${FLASHINFER_GIT_REPO} flashinfer
-    # Exclude CUDA arches for older versions (11.x and 12.0-12.7)
-    # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
-    if [[ "${CUDA_VERSION}" == 11.* ]]; then
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
-    elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
-    else
-        # CUDA 12.8+ supports 10.0a and 12.0
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
-    fi
-    echo "🏗️ Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
-    # Needed to build AOT kernels
-    pushd flashinfer
-        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-            python3 -m flashinfer.aot
-        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-            uv pip install --system --no-build-isolation --force-reinstall --no-deps .
-    popd
-    rm -rf flashinfer
-BASH
+COPY tools/build-flashinfer.sh /tmp/build-flashinfer.sh
+RUN --mount=type=cache,target=/root/.cache/uv \
+    . /etc/environment && \
+    /tmp/build-flashinfer.sh
 COPY examples examples
 COPY benchmarks benchmarks
 COPY ./vllm/collect_env.py .
diff --git a/tools/build-flashinfer.sh b/tools/build-flashinfer.sh
new file mode 100755
index 0000000000000..4182bc22bf067
--- /dev/null
+++ b/tools/build-flashinfer.sh
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+
+set -ex
+
+# Build FlashInfer with AOT kernels
+# This script is used by both the Dockerfile and standalone wheel building
+
+FLASHINFER_GIT_REPO="${FLASHINFER_GIT_REPO:-https://github.com/flashinfer-ai/flashinfer.git}"
+FLASHINFER_GIT_REF="${FLASHINFER_GIT_REF:-v0.2.9rc2}"
+CUDA_VERSION="${CUDA_VERSION:-12.8.1}"
+BUILD_WHEEL="${BUILD_WHEEL:-false}"
+
+echo "🏗️ Building FlashInfer ${FLASHINFER_GIT_REF} for CUDA ${CUDA_VERSION}"
+
+# Clone FlashInfer
+git clone --depth 1 --recursive --shallow-submodules \
+    --branch "${FLASHINFER_GIT_REF}" \
+    "${FLASHINFER_GIT_REPO}" flashinfer
+
+# Set CUDA arch list based on CUDA version
+# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
+if [[ "${CUDA_VERSION}" == 11.* ]]; then
+    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
+elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
+    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
+else
+    # CUDA 12.8+ supports 10.0a and 12.0
+    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
+fi
+
+echo "🏗️ Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
+
+# Build AOT kernels and install/build wheel
+pushd flashinfer
+    TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
+        python3 -m flashinfer.aot
+
+    if [[ "${BUILD_WHEEL}" == "true" ]]; then
+        # Build wheel for distribution
+        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
+            uv pip wheel --no-deps --wheel-dir /wheels .
+        mkdir -p /output && cp /wheels/*.whl /output/
+        echo "✅ FlashInfer wheel built successfully"
+    else
+        # Install directly (for Dockerfile)
+        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
+            uv pip install --system --no-build-isolation --force-reinstall --no-deps .
+        echo "✅ FlashInfer installed successfully"
+    fi
+popd
+
+# Cleanup
+rm -rf flashinfer