mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-24 09:51:19 +08:00
[Docker] Allow FlashInfer to be built in the ARM CUDA Dockerfile (#21013)
Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
parent
c9ba8104ed
commit
a50d918225
@ -388,48 +388,33 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
|
|||||||
# -rw-rw-r-- 1 mgoin mgoin 205M Jun 9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
|
# -rw-rw-r-- 1 mgoin mgoin 205M Jun 9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
|
||||||
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
|
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
|
||||||
|
|
||||||
# Allow specifying a version, Git revision or local .whl file
|
# Install FlashInfer from source
|
||||||
ARG FLASHINFER_CUDA128_INDEX_URL="https://download.pytorch.org/whl/cu128/flashinfer"
|
|
||||||
ARG FLASHINFER_CUDA128_WHEEL="flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl"
|
|
||||||
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
|
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
|
||||||
ARG FLASHINFER_GIT_REF="v0.2.8rc1"
|
ARG FLASHINFER_GIT_REF="v0.2.8rc1"
|
||||||
# Flag to control whether to use pre-built FlashInfer wheels (set to false to force build from source)
|
|
||||||
# TODO: Currently disabled because the pre-built wheels are not available for FLASHINFER_GIT_REF
|
|
||||||
ARG USE_FLASHINFER_PREBUILT_WHEEL=false
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
|
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
|
||||||
. /etc/environment
|
. /etc/environment
|
||||||
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then
|
git clone --depth 1 --recursive --shallow-submodules \
|
||||||
# FlashInfer already has a wheel for PyTorch 2.7.0 and CUDA 12.8. This is enough for CI use
|
--branch ${FLASHINFER_GIT_REF} \
|
||||||
if [[ "$CUDA_VERSION" == 12.8* ]] && [[ "$USE_FLASHINFER_PREBUILT_WHEEL" == "true" ]]; then
|
${FLASHINFER_GIT_REPO} flashinfer
|
||||||
uv pip install --system ${FLASHINFER_CUDA128_INDEX_URL}/${FLASHINFER_CUDA128_WHEEL}
|
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
|
||||||
else
|
# TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
|
||||||
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
|
if [[ "${CUDA_VERSION}" == 11.* ]]; then
|
||||||
# TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
|
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
|
||||||
if [[ "${CUDA_VERSION}" == 11.* ]]; then
|
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
|
||||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
|
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
|
||||||
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
|
else
|
||||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
|
# CUDA 12.8+ supports 10.0a and 12.0
|
||||||
else
|
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
|
||||||
# CUDA 12.8+ supports 10.0a and 12.0
|
fi
|
||||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
|
echo "🏗️ Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
|
||||||
fi
|
# Needed to build AOT kernels
|
||||||
echo "🏗️ Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
|
pushd flashinfer
|
||||||
|
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
|
||||||
git clone --depth 1 --recursive --shallow-submodules \
|
python3 -m flashinfer.aot
|
||||||
--branch ${FLASHINFER_GIT_REF} \
|
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
|
||||||
${FLASHINFER_GIT_REPO} flashinfer
|
uv pip install --system --no-build-isolation .
|
||||||
|
popd
|
||||||
# Needed to build AOT kernels
|
rm -rf flashinfer
|
||||||
pushd flashinfer
|
|
||||||
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
|
|
||||||
python3 -m flashinfer.aot
|
|
||||||
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
|
|
||||||
uv pip install --system --no-build-isolation .
|
|
||||||
popd
|
|
||||||
|
|
||||||
rm -rf flashinfer
|
|
||||||
fi \
|
|
||||||
fi
|
|
||||||
BASH
|
BASH
|
||||||
COPY examples examples
|
COPY examples examples
|
||||||
COPY benchmarks benchmarks
|
COPY benchmarks benchmarks
|
||||||
@ -521,10 +506,11 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
uv pip install --system -r requirements/kv_connectors.txt; \
|
uv pip install --system -r requirements/kv_connectors.txt; \
|
||||||
fi; \
|
fi; \
|
||||||
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
||||||
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
|
BITSANDBYTES_VERSION="0.42.0"; \
|
||||||
else \
|
else \
|
||||||
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.46.1' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
|
BITSANDBYTES_VERSION="0.46.1"; \
|
||||||
fi
|
fi; \
|
||||||
|
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]
|
||||||
|
|
||||||
ENV VLLM_USAGE_SOURCE production-docker-image
|
ENV VLLM_USAGE_SOURCE production-docker-image
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user