diff --git a/docker/Dockerfile b/docker/Dockerfile index c0f55a7eeba07..fad62be798a1e 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -391,18 +391,28 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' git clone --depth 1 --recursive --shallow-submodules \ --branch ${FLASHINFER_GIT_REF} \ ${FLASHINFER_GIT_REPO} flashinfer + # Exclude CUDA arches for older versions (11.x and 12.0-12.7) + # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg. + if [[ "${CUDA_VERSION}" == 11.* ]]; then + FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9" + elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then + FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a" + else + # CUDA 12.8+ supports 10.0a and 12.0 + FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0" + fi pushd flashinfer - if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then - # Exclude CUDA arches for older versions (11.x and 12.0-12.7) - # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg. - if [[ "${CUDA_VERSION}" == 11.* ]]; then - FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9" - elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then - FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a" - else - # CUDA 12.8+ supports 10.0a and 12.0 - FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0" + if [[ "${CUDA_VERSION}" == 12.8.* ]] && [ "$TARGETPLATFORM" = "linux/amd64" ]; then + # NOTE: To make new precompiled wheels, see tools/flashinfer-build.sh + echo "🏗️ Installing FlashInfer from pre-compiled wheel" + uv pip install --system https://wheels.vllm.ai/flashinfer-python/flashinfer_python-0.3.1-cp39-abi3-manylinux1_x86_64.whl \ + --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') + if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then + # Download pre-compiled cubins + TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ + python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins." fi + elif [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}" export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" # HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future diff --git a/tools/flashinfer-build.sh b/tools/flashinfer-build.sh new file mode 100644 index 0000000000000..6c14d87348c3a --- /dev/null +++ b/tools/flashinfer-build.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +# This script is used to build FlashInfer wheels with AOT kernels + +set -ex + +# FlashInfer configuration +FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" +FLASHINFER_GIT_REF="${FLASHINFER_GIT_REF}" +CUDA_VERSION="${CUDA_VERSION}" +BUILD_WHEEL="${BUILD_WHEEL:-true}" + +if [[ -z "${FLASHINFER_GIT_REF}" ]]; then + echo "❌ FLASHINFER_GIT_REF must be specified" >&2 + exit 1 +fi + +if [[ -z "${CUDA_VERSION}" ]]; then + echo "❌ CUDA_VERSION must be specified" >&2 + exit 1 +fi + +echo "🏗️ Building FlashInfer ${FLASHINFER_GIT_REF} for CUDA ${CUDA_VERSION}" + +# Clone FlashInfer +git clone --depth 1 --recursive --shallow-submodules \ + --branch ${FLASHINFER_GIT_REF} \ + ${FLASHINFER_GIT_REPO} flashinfer + +# Set CUDA arch list based on CUDA version +# Exclude CUDA arches for older versions (11.x and 12.0-12.7) +if [[ "${CUDA_VERSION}" == 11.* ]]; then + FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9" +elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then + FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a" +else + # CUDA 12.8+ supports 10.0a and 12.0 + FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0" +fi + +echo "🏗️ Building FlashInfer AOT for arches: ${FI_TORCH_CUDA_ARCH_LIST}" + +pushd flashinfer + # Make sure the wheel is built for the correct CUDA version + export UV_TORCH_BACKEND=cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') + + # Build AOT kernels + export TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" + export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" + python3 -m flashinfer.aot + + if [[ "${BUILD_WHEEL}" == "true" ]]; then + # Build wheel for distribution + uv build --no-build-isolation --wheel --out-dir ../flashinfer-dist . + echo "✅ FlashInfer wheel built successfully in flashinfer-dist/" + else + # Install directly (for Dockerfile) + uv pip install --system --no-build-isolation --force-reinstall . + echo "✅ FlashInfer installed successfully" + fi +popd + +# Cleanup +rm -rf flashinfer \ No newline at end of file