[Docker] Allow FlashInfer to be built in the ARM CUDA Dockerfile (#21013)

Signed-off-by: mgoin <mgoin64@gmail.com>
2026-05-24 09:51:19 +08:00 · 2025-07-16 22:37:13 -04:00 · 2025-07-16 22:37:13 -04:00 · a50d918225
commit a50d918225
parent c9ba8104ed
1 changed files with 27 additions and 41 deletions
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -388,48 +388,33 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 # -rw-rw-r-- 1 mgoin mgoin 205M Jun  9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
 # $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
-# Allow specifying a version, Git revision or local .whl file
+# Install FlashInfer from source
 ARG FLASHINFER_CUDA128_INDEX_URL="https://download.pytorch.org/whl/cu128/flashinfer"
 ARG FLASHINFER_CUDA128_WHEEL="flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl"
 ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
 ARG FLASHINFER_GIT_REF="v0.2.8rc1"
 # Flag to control whether to use pre-built FlashInfer wheels (set to false to force build from source)
 # TODO: Currently disabled because the pre-built wheels are not available for FLASHINFER_GIT_REF
 ARG USE_FLASHINFER_PREBUILT_WHEEL=false
 RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
  . /etc/environment
-  if [ "$TARGETPLATFORM" != "linux/arm64" ]; then
+    git clone --depth 1 --recursive --shallow-submodules \
-      # FlashInfer already has a wheel for PyTorch 2.7.0 and CUDA 12.8. This is enough for CI use
+        --branch ${FLASHINFER_GIT_REF} \
-      if [[ "$CUDA_VERSION" == 12.8* ]] && [[ "$USE_FLASHINFER_PREBUILT_WHEEL" == "true" ]]; then
+        ${FLASHINFER_GIT_REPO} flashinfer
-          uv pip install --system ${FLASHINFER_CUDA128_INDEX_URL}/${FLASHINFER_CUDA128_WHEEL}
+    # Exclude CUDA arches for older versions (11.x and 12.0-12.7)
-      else
+    # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
-          # Exclude CUDA arches for older versions (11.x and 12.0-12.7)
+    if [[ "${CUDA_VERSION}" == 11.* ]]; then
-          # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
+        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
-          if [[ "${CUDA_VERSION}" == 11.* ]]; then
+    elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
-              FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
+        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
-          elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
+    else
-              FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
+        # CUDA 12.8+ supports 10.0a and 12.0
-          else
+        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
-              # CUDA 12.8+ supports 10.0a and 12.0
+    fi
-              FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
+    echo "🏗️  Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
-          fi
+    # Needed to build AOT kernels
-          echo "🏗️  Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
+    pushd flashinfer
-
+        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-          git clone --depth 1 --recursive --shallow-submodules \
+            python3 -m flashinfer.aot
-            --branch ${FLASHINFER_GIT_REF} \
+        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-            ${FLASHINFER_GIT_REPO} flashinfer
+            uv pip install --system --no-build-isolation .
-
+    popd
-          # Needed to build AOT kernels
+    rm -rf flashinfer
-          pushd flashinfer
-            TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-              python3 -m flashinfer.aot
-            TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-              uv pip install --system --no-build-isolation .
-          popd
-          rm -rf flashinfer
-      fi \
-  fi
 BASH
 COPY examples examples
 COPY benchmarks benchmarks
@@ -521,10 +506,11 @@ RUN --mount=type=cache,target=/root/.cache/uv \
        uv pip install --system -r requirements/kv_connectors.txt; \
    fi; \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
+        BITSANDBYTES_VERSION="0.42.0"; \
    else \
-        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.46.1' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
+        BITSANDBYTES_VERSION="0.46.1"; \
-    fi
+    fi; \
+    uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]
 ENV VLLM_USAGE_SOURCE production-docker-image