mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-09 10:26:15 +08:00
Update to flashinfer-python==0.2.12 and disable AOT compile for non-release image (#23129)
Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
parent
68fcd3fa73
commit
50df09fe13
@ -68,7 +68,7 @@ steps:
|
|||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||||
|
|
||||||
- label: "Annotate release workflow"
|
- label: "Annotate release workflow"
|
||||||
|
|||||||
@ -372,31 +372,45 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
|
|||||||
|
|
||||||
# Install FlashInfer from source
|
# Install FlashInfer from source
|
||||||
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
|
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
|
||||||
# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt
|
# Keep this in sync with "flashinfer" extra in setup.py
|
||||||
# We use `--force-reinstall --no-deps` to avoid issues with the existing FlashInfer wheel.
|
ARG FLASHINFER_GIT_REF="v0.2.12"
|
||||||
ARG FLASHINFER_GIT_REF="v0.2.11"
|
# Flag to control whether to compile FlashInfer AOT kernels
|
||||||
|
# Set to "true" to enable AOT compilation:
|
||||||
|
# docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
|
||||||
|
ARG FLASHINFER_AOT_COMPILE=false
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
|
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
|
||||||
. /etc/environment
|
. /etc/environment
|
||||||
git clone --depth 1 --recursive --shallow-submodules \
|
git clone --depth 1 --recursive --shallow-submodules \
|
||||||
--branch ${FLASHINFER_GIT_REF} \
|
--branch ${FLASHINFER_GIT_REF} \
|
||||||
${FLASHINFER_GIT_REPO} flashinfer
|
${FLASHINFER_GIT_REPO} flashinfer
|
||||||
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
|
|
||||||
# TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
|
|
||||||
if [[ "${CUDA_VERSION}" == 11.* ]]; then
|
|
||||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
|
|
||||||
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
|
|
||||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
|
|
||||||
else
|
|
||||||
# CUDA 12.8+ supports 10.0a and 12.0
|
|
||||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
|
|
||||||
fi
|
|
||||||
echo "🏗️ Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
|
|
||||||
# Needed to build AOT kernels
|
|
||||||
pushd flashinfer
|
pushd flashinfer
|
||||||
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
|
if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
|
||||||
python3 -m flashinfer.aot
|
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
|
||||||
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
|
# TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
|
||||||
uv pip install --system --no-build-isolation --force-reinstall --no-deps .
|
if [[ "${CUDA_VERSION}" == 11.* ]]; then
|
||||||
|
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
|
||||||
|
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
|
||||||
|
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
|
||||||
|
else
|
||||||
|
# CUDA 12.8+ supports 10.0a and 12.0
|
||||||
|
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
|
||||||
|
fi
|
||||||
|
echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
|
||||||
|
# Build AOT kernels
|
||||||
|
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
|
||||||
|
python3 -m flashinfer.aot
|
||||||
|
# Install with no-build-isolation since we already built AOT kernels
|
||||||
|
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
|
||||||
|
uv pip install --system --no-build-isolation . \
|
||||||
|
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||||
|
# Download pre-compiled cubins
|
||||||
|
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
|
||||||
|
python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
|
||||||
|
else
|
||||||
|
echo "🏗️ Installing FlashInfer without AOT compilation in JIT mode"
|
||||||
|
uv pip install --system . \
|
||||||
|
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||||
|
fi
|
||||||
popd
|
popd
|
||||||
rm -rf flashinfer
|
rm -rf flashinfer
|
||||||
BASH
|
BASH
|
||||||
|
|||||||
2
setup.py
2
setup.py
@ -685,7 +685,7 @@ setup(
|
|||||||
"mistral_common[audio]"], # Required for audio processing
|
"mistral_common[audio]"], # Required for audio processing
|
||||||
"video": [], # Kept for backwards compatibility
|
"video": [], # Kept for backwards compatibility
|
||||||
# FlashInfer should be updated together with the Dockerfile
|
# FlashInfer should be updated together with the Dockerfile
|
||||||
"flashinfer": ["flashinfer-python==0.2.11"],
|
"flashinfer": ["flashinfer-python==0.2.12"],
|
||||||
},
|
},
|
||||||
cmdclass=cmdclass,
|
cmdclass=cmdclass,
|
||||||
package_data=package_data,
|
package_data=package_data,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user