diff --git a/CMakeLists.txt b/CMakeLists.txt index 5e36742dddb3c..c3719526c4ae0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -171,6 +171,13 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}") endif() +# +# Set nvcc fatbin compression. +# +if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND VLLM_GPU_LANG STREQUAL "CUDA") + list(APPEND VLLM_GPU_FLAGS "-Xfatbin" "-compress-all" "-compress-mode=size") +endif() + # # Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process. @@ -392,7 +399,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require # CUDA 12.0 or later cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS) + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS) set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu" @@ -408,7 +415,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}") message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}") else() - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS) + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS) message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is " "not >= 12.0, we recommend upgrading to CUDA 12.0 or " "later if you intend on running FP8 quantized models on " @@ -423,7 +430,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require # CUDA 12.8 or later cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}") - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS) + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu" @@ -437,7 +444,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}") message(STATUS "Building scaled_mm_c3x_sm120 for archs: ${SCALED_MM_ARCHS}") else() - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS) + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) message(STATUS "Not building scaled_mm_c3x_sm120 as CUDA Compiler version is " "not >= 12.8, we recommend upgrading to CUDA 12.8 or " "later if you intend on running FP8 quantized models on " @@ -452,7 +459,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x) # require CUDA 12.8 or later cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}") - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS) + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu" @@ -467,7 +474,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}") message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}") else() - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS) + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is " "not >= 12.8, we recommend upgrading to CUDA 12.8 or " "later if you intend on running FP8 quantized models on " @@ -510,7 +517,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor # require CUDA 12.2 or later (and only work on Hopper). cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS) + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.2 AND SCALED_MM_ARCHS) set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" @@ -519,7 +526,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1") message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}") else() - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS) + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.2 AND SCALED_MM_ARCHS) message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is " "not >= 12.2, we recommend upgrading to CUDA 12.2 or later " "if you intend on running FP8 sparse quantized models on Hopper.") @@ -531,7 +538,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # FP4 Archs and flags cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}") - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS) + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) set(SRCS "csrc/quantization/fp4/nvfp4_quant_kernels.cu" "csrc/quantization/fp4/nvfp4_experts_quant.cu" @@ -552,7 +559,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # CUTLASS MLA Archs and flags cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}") - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS) + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS) set(SRCS "csrc/attention/mla/cutlass_mla_kernels.cu") set_gencode_flags_for_srcs( @@ -641,7 +648,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # The machete kernels only work on hopper and require CUDA 12.0 or later. # Only build Machete kernels if we are building for something compatible with sm90a cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}") - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS) + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND MACHETE_ARCHS) # # For the Machete kernels we automatically generate sources for various # preselected input type pairs and schedules. @@ -693,7 +700,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}") else() - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND MACHETE_ARCHS) message(STATUS "Not building Machete kernels as CUDA Compiler version is " "not >= 12.0, we recommend upgrading to CUDA 12.0 or "