diff --git a/docker/Dockerfile b/docker/Dockerfile
index c49b5da2714c9..c11d74fa89d00 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -164,6 +164,11 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # see https://github.com/pytorch/pytorch/pull/123243
 ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
+# Compress the embedded fatbin (optimizing for size) to reduce the binary size.
+# NOTE: -compress-mode needs an nvcc that supports it (CUDA 12.8+, TODO confirm);
+# for older toolkits override with --build-arg torch_nvcc_flags='-Xfatbin -compress-all'
+ARG torch_nvcc_flags='-Xfatbin -compress-all -compress-mode=size'
+ENV TORCH_NVCC_FLAGS="${torch_nvcc_flags}"
 # Override the arch list for flash-attn to reduce the binary size
 ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
 ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}