Try nvcc compress-mode to reduce binary size

Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
mgoin 2025-07-09 12:28:06 -04:00
parent 4ac9c33f78
commit 0204263598

View File

@@ -164,6 +164,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# see https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list="7.0 7.5 8.0 8.9 9.0 10.0 12.0"
# Publish the selected arch list as an environment variable for later steps.
ENV TORCH_CUDA_ARCH_LIST="${torch_cuda_arch_list}"
# Extra nvcc flags requesting compressed fatbins (goal: smaller binaries).
# Exposed as a build arg, following the ARG + ENV pattern of the neighbouring
# arch-list settings, so a build can override or clear the flags with
# `--build-arg torch_nvcc_flags=...` without editing this file.
# NOTE(review): -compress-mode is presumably only accepted by recent CUDA
# toolkits - confirm the minimum version against the base image in use.
ARG torch_nvcc_flags='-Xfatbin -compress-all -compress-mode=size'
ENV TORCH_NVCC_FLAGS="${torch_nvcc_flags}"
# Use a narrower GPU arch list for flash-attn to keep the binary size down
ARG vllm_fa_cmake_gpu_arches="80-real;90-real"
ENV VLLM_FA_CMAKE_GPU_ARCHES="${vllm_fa_cmake_gpu_arches}"