Mirror of https://git.datalinker.icu/vllm-project/vllm.git, last synced 2025-12-24 07:25:02 +08:00.
Add flashinfer-build.sh and register precompiled cu128 wheel in Dockerfile (#25782)
Signed-off-by: mgoin <mgoin64@gmail.com> Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
parent
dbb7782d5b
commit
55971f85c9
@ -391,18 +391,28 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
|
|||||||
git clone --depth 1 --recursive --shallow-submodules \
|
git clone --depth 1 --recursive --shallow-submodules \
|
||||||
--branch ${FLASHINFER_GIT_REF} \
|
--branch ${FLASHINFER_GIT_REF} \
|
||||||
${FLASHINFER_GIT_REPO} flashinfer
|
${FLASHINFER_GIT_REPO} flashinfer
|
||||||
|
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
|
||||||
|
# TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
|
||||||
|
if [[ "${CUDA_VERSION}" == 11.* ]]; then
|
||||||
|
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
|
||||||
|
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
|
||||||
|
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
|
||||||
|
else
|
||||||
|
# CUDA 12.8+ supports 10.0a and 12.0
|
||||||
|
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
|
||||||
|
fi
|
||||||
pushd flashinfer
|
pushd flashinfer
|
||||||
if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
|
if [[ "${CUDA_VERSION}" == 12.8.* ]] && [ "$TARGETPLATFORM" = "linux/amd64" ]; then
|
||||||
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
|
# NOTE: To make new precompiled wheels, see tools/flashinfer-build.sh
|
||||||
# TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
|
echo "🏗️ Installing FlashInfer from pre-compiled wheel"
|
||||||
if [[ "${CUDA_VERSION}" == 11.* ]]; then
|
uv pip install --system https://wheels.vllm.ai/flashinfer-python/flashinfer_python-0.3.1-cp39-abi3-manylinux1_x86_64.whl \
|
||||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
|
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||||
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
|
if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
|
||||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
|
# Download pre-compiled cubins
|
||||||
else
|
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
|
||||||
# CUDA 12.8+ supports 10.0a and 12.0
|
python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
|
||||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
|
|
||||||
fi
|
fi
|
||||||
|
elif [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
|
||||||
echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
|
echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
|
||||||
export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
|
export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
|
||||||
# HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future
|
# HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future
|
||||||
|
|||||||
63
tools/flashinfer-build.sh
Normal file
63
tools/flashinfer-build.sh
Normal file
@ -0,0 +1,63 @@
|
|||||||
|
#!/usr/bin/env bash
# Build FlashInfer with AOT (ahead-of-time) compiled kernels, either as a
# distributable wheel or installed directly into the current environment.
#
# Required environment variables:
#   FLASHINFER_GIT_REF - git tag/branch of FlashInfer to build
#   CUDA_VERSION       - CUDA toolkit version (e.g. "12.8.1"); selects arch list
# Optional:
#   BUILD_WHEEL        - "true" (default) builds a wheel into flashinfer-dist/;
#                        anything else installs directly (for Dockerfile use)

set -ex

# FlashInfer configuration
FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
FLASHINFER_GIT_REF="${FLASHINFER_GIT_REF}"
CUDA_VERSION="${CUDA_VERSION}"
BUILD_WHEEL="${BUILD_WHEEL:-true}"

if [[ -z "${FLASHINFER_GIT_REF}" ]]; then
    echo "❌ FLASHINFER_GIT_REF must be specified" >&2
    exit 1
fi

if [[ -z "${CUDA_VERSION}" ]]; then
    echo "❌ CUDA_VERSION must be specified" >&2
    exit 1
fi

echo "🏗️ Building FlashInfer ${FLASHINFER_GIT_REF} for CUDA ${CUDA_VERSION}"

# Clone FlashInfer (shallow clone; submodules needed for vendored kernels)
git clone --depth 1 --recursive --shallow-submodules \
    --branch "${FLASHINFER_GIT_REF}" \
    "${FLASHINFER_GIT_REPO}" flashinfer

# Set CUDA arch list based on CUDA version
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
if [[ "${CUDA_VERSION}" == 11.* ]]; then
    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
else
    # CUDA 12.8+ supports 10.0a and 12.0
    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
fi

echo "🏗️ Building FlashInfer AOT for arches: ${FI_TORCH_CUDA_ARCH_LIST}"

pushd flashinfer
    # Make sure the wheel is built for the correct CUDA version:
    # "12.8.1" -> "cu128" (major.minor with the dot stripped)
    export UV_TORCH_BACKEND="cu$(echo "${CUDA_VERSION}" | cut -d. -f1,2 | tr -d '.')"

    # Build AOT kernels
    export TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
    export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
    python3 -m flashinfer.aot

    if [[ "${BUILD_WHEEL}" == "true" ]]; then
        # Build wheel for distribution
        uv build --no-build-isolation --wheel --out-dir ../flashinfer-dist .
        echo "✅ FlashInfer wheel built successfully in flashinfer-dist/"
    else
        # Install directly (for Dockerfile)
        uv pip install --system --no-build-isolation --force-reinstall .
        echo "✅ FlashInfer installed successfully"
    fi
popd

# Cleanup the build tree (`--` guards against option-like names)
rm -rf -- flashinfer
|
||||||
Loading…
x
Reference in New Issue
Block a user