diff --git a/.buildkite/scripts/hardware_ci/run-gh200-test.sh b/.buildkite/scripts/hardware_ci/run-gh200-test.sh index 8c64e14606d3b..f69e4b06680f5 100644 --- a/.buildkite/scripts/hardware_ci/run-gh200-test.sh +++ b/.buildkite/scripts/hardware_ci/run-gh200-test.sh @@ -16,8 +16,7 @@ DOCKER_BUILDKIT=1 docker build . \ --build-arg max_jobs=66 \ --build-arg nvcc_threads=2 \ --build-arg RUN_WHEEL_CHECK=false \ - --build-arg torch_cuda_arch_list="9.0+PTX" \ - --build-arg vllm_fa_cmake_gpu_arches="90-real" + --build-arg torch_cuda_arch_list="9.0+PTX" # Setup cleanup remove_docker_container() { docker rm -f gh200-test || true; } diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh index 0f010832b465d..c69ebbb42da5a 100644 --- a/.github/workflows/scripts/build.sh +++ b/.github/workflows/scripts/build.sh @@ -15,7 +15,6 @@ $python_executable -m pip install -r requirements/build.txt -r requirements/cuda export MAX_JOBS=1 # Make sure release wheels are built for the following architectures export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" -export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real" bash tools/check_repo.sh diff --git a/docker/Dockerfile b/docker/Dockerfile index 75b5ab0230c87..43522ef8fb8dd 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -164,9 +164,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # see https://github.com/pytorch/pytorch/pull/123243 ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0' ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} -# Override the arch list for flash-attn to reduce the binary size -ARG vllm_fa_cmake_gpu_arches='80-real;90-real' -ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches} #################### BASE BUILD IMAGE #################### #################### WHEEL BUILD IMAGE #################### diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch index 8d43de77aad59..e147b97f0e056 100644 --- a/docker/Dockerfile.nightly_torch +++ b/docker/Dockerfile.nightly_torch @@ -114,9 +114,6 @@ RUN cat torch_build_versions.txt # explicitly set the list to avoid issues with torch 2.2 # see https://github.com/pytorch/pytorch/pull/123243 -# Override the arch list for flash-attn to reduce the binary size -ARG vllm_fa_cmake_gpu_arches='80-real;90-real' -ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches} #################### BASE BUILD IMAGE #################### #################### WHEEL BUILD IMAGE #################### diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md index 5f6cfcb00a37a..1f19f2fecfab1 100644 --- a/docs/deployment/docker.md +++ b/docs/deployment/docker.md @@ -106,8 +106,7 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `-- -t vllm/vllm-gh200-openai:latest \ --build-arg max_jobs=66 \ --build-arg nvcc_threads=2 \ - --build-arg torch_cuda_arch_list="9.0 10.0+PTX" \ - --build-arg vllm_fa_cmake_gpu_arches="90-real" + --build-arg torch_cuda_arch_list="9.0 10.0+PTX" ``` !!! note