diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3c5856fc59097..35baea9417564 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -168,7 +168,6 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()
 
-
 #
 # Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
 # setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
@@ -308,148 +307,70 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # Keep building Marlin for 9.0 as there are some group sizes and shapes that
   # are not supported by Machete yet.
   # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
-  if (MARLIN_ARCHS)
+  # Marlin kernels: generate and build for supported architectures
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
+  optional_cuda_sources(
+    NAME Marlin
+    ARCHS "${MARLIN_ARCHS}"
+    GEN_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py"
+    GEN_GLOB "csrc/quantization/gptq_marlin/kernel_*.cu"
+    SRCS
+      "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
+      "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
+      "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
+      "csrc/quantization/gptq_marlin/gptq_marlin.cu"
+      "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
+      "csrc/quantization/gptq_marlin/awq_marlin_repack.cu"
+    NO_ARCH_MSG
+      "Not building Marlin kernels as no compatible archs found in CUDA target architectures"
+  )
 
-    #
-    # For the Marlin kernels we automatically generate sources for various
-    # preselected input type pairs and schedules. 
- # Generate sources: - set(MARLIN_GEN_SCRIPT - ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py) - file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH) - - message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}") - message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}") - - if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH} - OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH}) - execute_process( - COMMAND ${CMAKE_COMMAND} -E env - PYTHONPATH=$PYTHONPATH - ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} - RESULT_VARIABLE marlin_generation_result - OUTPUT_VARIABLE marlin_generation_result - OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log - ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log - ) - - if (NOT marlin_generation_result EQUAL 0) - message(FATAL_ERROR "Marlin generation failed." - " Result: \"${marlin_generation_result}\"" - "\nCheck the log for details: " - "${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log") - else() - set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH} - CACHE STRING "Last run Marlin generate script hash" FORCE) - message(STATUS "Marlin generation completed successfully.") - endif() - else() - message(STATUS "Marlin generation script has not changed, skipping generation.") - endif() - - file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu") - set_gencode_flags_for_srcs( - SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}" - CUDA_ARCHS "${MARLIN_ARCHS}") - - list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC}) - - set(MARLIN_SRCS - "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu" - "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" - "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu" - "csrc/quantization/gptq_marlin/gptq_marlin.cu" - "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" - "csrc/quantization/gptq_marlin/awq_marlin_repack.cu") - set_gencode_flags_for_srcs( - SRCS 
"${MARLIN_SRCS}" - CUDA_ARCHS "${MARLIN_ARCHS}") - list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}") - message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}") - else() - message(STATUS "Not building Marlin kernels as no compatible archs found" - " in CUDA target architectures") - endif() - - # Only build AllSpark kernels if we are building for at least some compatible archs. - cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}") - if (ALLSPARK_ARCHS) - set(ALLSPARK_SRCS - "csrc/quantization/gptq_allspark/allspark_repack.cu" - "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu") - set_gencode_flags_for_srcs( - SRCS "${ALLSPARK_SRCS}" - CUDA_ARCHS "${ALLSPARK_ARCHS}") - list(APPEND VLLM_EXT_SRC "${ALLSPARK_SRCS}") - message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}") - else() - message(STATUS "Not building AllSpark kernels as no compatible archs found" - " in CUDA target architectures") - endif() + # AllSpark kernels + optional_cuda_sources( + NAME AllSpark + ARCHS "8.0;8.6;8.7;8.9" + SRCS + "csrc/quantization/gptq_allspark/allspark_repack.cu" + "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu" + NO_ARCH_MSG "Not building AllSpark kernels as no compatible archs found in CUDA target architectures" + ) set(SCALED_MM_3X_ARCHS) - # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. 
CUTLASS 3.x) require - # CUDA 12.0 or later - cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS) - set(SRCS - "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu") - set_gencode_flags_for_srcs( - SRCS "${SRCS}" - CUDA_ARCHS "${SCALED_MM_ARCHS}") - list(APPEND VLLM_EXT_SRC "${SRCS}") - list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM90=1") - # Let scaled_mm_c2x know it doesn't need to build these arches - list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}") - message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}") - else() - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS) - message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is " - "not >= 12.0, we recommend upgrading to CUDA 12.0 or " - "later if you intend on running FP8 quantized models on " - "Hopper.") - else() - message(STATUS "Not building scaled_mm_c3x_sm90 as no compatible archs found " - "in CUDA target architectures") - endif() - endif() + # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. 
CUTLASS 3.x) require CUDA 12.0 or later + optional_cuda_sources( + NAME scaled_mm_c3x_sm90 + MIN_VERSION 12.0 + ARCHS "9.0a" + SRCS + "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu" + FLAGS "-DENABLE_SCALED_MM_SM90=1" + APPEND_ARCHS SCALED_MM_3X_ARCHS + VERSION_MSG + "Not building scaled_mm_c3x_sm90: CUDA Compiler version is not >= 12.0.\n" + "Please upgrade to CUDA 12.0 or later to run FP8 quantized models on Hopper." + ) - # The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require - # CUDA 12.8 or later - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}") - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS) - set(SRCS + # The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. 
CUTLASS 3.x) require CUDA 12.8 or later + optional_cuda_sources( + NAME scaled_mm_c3x_sm100 + MIN_VERSION 12.8 + ARCHS "10.0a;10.1a;12.0a" + SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu" - ) - set_gencode_flags_for_srcs( - SRCS "${SRCS}" - CUDA_ARCHS "${SCALED_MM_ARCHS}") - list(APPEND VLLM_EXT_SRC "${SRCS}") - list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM100=1") - # Let scaled_mm_c2x know it doesn't need to build these arches - list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}") - message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}") - else() - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS) - message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is " - "not >= 12.8, we recommend upgrading to CUDA 12.8 or " - "later if you intend on running FP8 quantized models on " - "Blackwell.") - else() - message(STATUS "Not building scaled_mm_c3x_100 as no compatible archs found " - "in CUDA target architectures") - endif() - endif() + FLAGS "-DENABLE_SCALED_MM_SM100=1" + APPEND_ARCHS SCALED_MM_3X_ARCHS + VERSION_MSG + "Not building scaled_mm_c3x_sm100: CUDA Compiler version is not >= 12.8.\n" + "Please upgrade to CUDA 12.8 or later to run FP8 quantized models on Blackwell." + ) - # # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x) # kernels for the remaining archs that are not already built for 3x. 
# (Build 8.9 for FP8) @@ -457,184 +378,87 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "7.5;8.0;8.9+PTX" "${CUDA_ARCHS}") # subtract out the archs that are already built for 3x list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) - if (SCALED_MM_2X_ARCHS) - set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu") - set_gencode_flags_for_srcs( - SRCS "${SRCS}" - CUDA_ARCHS "${SCALED_MM_2X_ARCHS}") - list(APPEND VLLM_EXT_SRC "${SRCS}") - list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C2X=1") - message(STATUS "Building scaled_mm_c2x for archs: ${SCALED_MM_2X_ARCHS}") - else() - if (SCALED_MM_3X_ARCHS) - message(STATUS "Not building scaled_mm_c2x as all archs are already built" - " for and covered by scaled_mm_c3x") - else() - message(STATUS "Not building scaled_mm_c2x as no compatible archs found " - "in CUDA target architectures") - endif() - endif() + optional_cuda_sources( + NAME scaled_mm_c2x + ARCHS "${SCALED_MM_2X_ARCHS}" + SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu" + FLAGS "-DENABLE_SCALED_MM_C2X=1" + NO_ARCH_MSG "Not building scaled_mm_c2x as no compatible archs found in CUDA target architectures, " + "or is already covered by scaled_mm_c3x." + ) # # 2:4 Sparse Kernels - - # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor - # require CUDA 12.2 or later (and only work on Hopper). 
- cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS) - set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu") - set_gencode_flags_for_srcs( - SRCS "${SRCS}" - CUDA_ARCHS "${SCALED_MM_ARCHS}") - list(APPEND VLLM_EXT_SRC "${SRCS}") - list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1") - message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}") - else() - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS) - message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is " - "not >= 12.2, we recommend upgrading to CUDA 12.2 or later " - "if you intend on running FP8 sparse quantized models on Hopper.") - else() - message(STATUS "Not building sparse_scaled_mm_c3x as no compatible archs found " - "in CUDA target architectures") - endif() - endif() + optional_cuda_sources( + NAME sparse_scaled_mm_c3x + MIN_VERSION 12.2 + ARCHS "9.0a;" + SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu" + FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1" + VERSION_MSG + "Not building sparse_scaled_mm_c3x: CUDA Compiler version is not >= 12.2.\n" + "Please upgrade to CUDA 12.2 or later to run FP8 sparse quantized models on Hopper." 
+ ) # FP4 Archs and flags - cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}") - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS) - set(SRCS + optional_cuda_sources( + NAME NVFP4 + MIN_VERSION 12.8 + ARCHS "10.0a" + SRCS "csrc/quantization/fp4/nvfp4_quant_kernels.cu" "csrc/quantization/fp4/nvfp4_experts_quant.cu" "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu" - "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu") - set_gencode_flags_for_srcs( - SRCS "${SRCS}" - CUDA_ARCHS "${FP4_ARCHS}") - list(APPEND VLLM_EXT_SRC "${SRCS}") - list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1") - message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}") - else() - message(STATUS "Not building NVFP4 as no compatible archs were found.") - # clear FP4_ARCHS - set(FP4_ARCHS) - endif() + "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu" + FLAGS "-DENABLE_NVFP4=1" + NO_ARCH_MSG "Not building NVFP4 as no compatible archs were found." + ) # CUTLASS MLA Archs and flags - cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}") - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS) - set(SRCS - "csrc/attention/mla/cutlass_mla_kernels.cu") - set_gencode_flags_for_srcs( - SRCS "${SRCS}" - CUDA_ARCHS "${MLA_ARCHS}") - list(APPEND VLLM_EXT_SRC "${SRCS}") - list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1") - # Add MLA-specific include directories only to MLA source files - set_source_files_properties(${SRCS} - PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common") - message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}") - else() - message(STATUS "Not building CUTLASS MLA as no compatible archs were found.") - # clear MLA_ARCHS - set(MLA_ARCHS) - endif() + optional_cuda_sources( + NAME CUTLASS_MLA + MIN_VERSION 12.8 + ARCHS "10.0a" + SRCS "csrc/attention/mla/cutlass_mla_kernels.cu" + FLAGS "-DENABLE_CUTLASS_MLA=1" + NO_ARCH_MSG "Not building CUTLASS MLA as no compatible 
archs were found." + ) + # Add MLA-specific include directories only to MLA source files + set_source_files_properties( + "csrc/attention/mla/cutlass_mla_kernels.cu" + PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common" + ) # CUTLASS MoE kernels - - # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works - # on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible - # to compile MoE kernels that use its output. - cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}") - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) - set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu" - "csrc/quantization/cutlass_w8a8/moe/moe_data.cu") - set_gencode_flags_for_srcs( - SRCS "${SRCS}" - CUDA_ARCHS "${SCALED_MM_ARCHS}") - list(APPEND VLLM_EXT_SRC "${SRCS}") - list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM90=1") - message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}") - else() - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) - message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is " - "not >= 12.3, we recommend upgrading to CUDA 12.3 or later " - "if you intend on running FP8 quantized MoE models on Hopper.") - else() - message(STATUS "Not building grouped_mm_c3x as no compatible archs found " - "in CUDA target architectures") - endif() - endif() + optional_cuda_sources( + NAME grouped_mm_c3x + MIN_VERSION 12.3 + ARCHS "9.0a;10.0a" + SRCS + "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu" + "csrc/quantization/cutlass_w8a8/moe/moe_data.cu" + FLAGS "-DENABLE_CUTLASS_MOE_SM90=1" + VERSION_MSG "Not building grouped_mm_c3x kernels as CUDA Compiler version is not >= 12.3, we recommend upgrading to CUDA 12.3 or later if you intend on running FP8 quantized MoE models on Hopper." 
+ ) # # Machete kernels - # The machete kernels only work on hopper and require CUDA 12.0 or later. - # Only build Machete kernels if we are building for something compatible with sm90a + # Machete kernels: generate and build for supported architectures cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}") - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS) - # - # For the Machete kernels we automatically generate sources for various - # preselected input type pairs and schedules. - # Generate sources: - set(MACHETE_GEN_SCRIPT - ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py) - file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH) - - message(STATUS "Machete generation script hash: ${MACHETE_GEN_SCRIPT_HASH}") - message(STATUS "Last run machete generate script hash: $CACHE{MACHETE_GEN_SCRIPT_HASH}") - - if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH} - OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH}) - execute_process( - COMMAND ${CMAKE_COMMAND} -E env - PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH - ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT} - RESULT_VARIABLE machete_generation_result - OUTPUT_VARIABLE machete_generation_output - OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log - ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log - ) - - if (NOT machete_generation_result EQUAL 0) - message(FATAL_ERROR "Machete generation failed." 
- " Result: \"${machete_generation_result}\"" - "\nCheck the log for details: " - "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log") - else() - set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH} - CACHE STRING "Last run machete generate script hash" FORCE) - message(STATUS "Machete generation completed successfully.") - endif() - else() - message(STATUS "Machete generation script has not changed, skipping generation.") - endif() - - # Add machete generated sources - file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu") - list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES}) - - # forward compatible - set_gencode_flags_for_srcs( - SRCS "${MACHETE_GEN_SOURCES}" - CUDA_ARCHS "${MACHETE_ARCHS}") - - list(APPEND VLLM_EXT_SRC - csrc/quantization/machete/machete_pytorch.cu) - - message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}") - else() - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 - AND MACHETE_ARCHS) - message(STATUS "Not building Machete kernels as CUDA Compiler version is " - "not >= 12.0, we recommend upgrading to CUDA 12.0 or " - "later if you intend on running w4a16 quantized models on " - "Hopper.") - else() - message(STATUS "Not building Machete kernels as no compatible archs " - "found in CUDA target architectures") - endif() - endif() + optional_cuda_sources( + NAME Machete + MIN_VERSION 12.0 + ARCHS "${MACHETE_ARCHS}" + GEN_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py" + GEN_GLOB "csrc/quantization/machete/generated/*.cu" + SRCS "csrc/quantization/machete/machete_pytorch.cu" + VERSION_MSG + "Not building Machete kernels as CUDA Compiler version is less than 12.0." + "We recommend upgrading to CUDA 12.0 or later to run w4a16 quantized models on Hopper." 
+ NO_ARCH_MSG + "Not building Machete kernels as no compatible archs found in CUDA target architectures" + ) # if CUDA endif endif() @@ -666,76 +490,22 @@ set(VLLM_MOE_EXT_SRC "csrc/moe/moe_align_sum_kernels.cu" "csrc/moe/topk_softmax_kernels.cu") -if(VLLM_GPU_LANG STREQUAL "CUDA") - list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu") -endif() - +# Apply gencode flags to base MOE extension sources set_gencode_flags_for_srcs( SRCS "${VLLM_MOE_EXT_SRC}" CUDA_ARCHS "${CUDA_ARCHS}") +## Marlin MOE kernels: generate and include for supported architectures if(VLLM_GPU_LANG STREQUAL "CUDA") - set(VLLM_MOE_WNA16_SRC - "csrc/moe/moe_wna16.cu") - - set_gencode_flags_for_srcs( - SRCS "${VLLM_MOE_WNA16_SRC}" - CUDA_ARCHS "${CUDA_ARCHS}") - - list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}") - # 9.0 for latest bf16 atomicAdd PTX - cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}") - if (MARLIN_MOE_ARCHS) - - # - # For the Marlin MOE kernels we automatically generate sources for various - # preselected input type pairs and schedules. 
- # Generate sources: - set(MOE_MARLIN_GEN_SCRIPT - ${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py) - file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH) - - message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}") - message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}") - - if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} - OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH}) - execute_process( - COMMAND ${CMAKE_COMMAND} -E env - PYTHONPATH=$PYTHONPATH - ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} - RESULT_VARIABLE moe_marlin_generation_result - OUTPUT_VARIABLE moe_marlin_generation_output - OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log - ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log - ) - - if (NOT moe_marlin_generation_result EQUAL 0) - message(FATAL_ERROR "Marlin MOE generation failed." - " Result: \"${moe_marlin_generation_result}\"" - "\nCheck the log for details: " - "${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log") - else() - set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH} - CACHE STRING "Last run Marlin MOE generate script hash" FORCE) - message(STATUS "Marlin MOE generation completed successfully.") - endif() - else() - message(STATUS "Marlin MOE generation script has not changed, skipping generation.") - endif() - - file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu") - set_gencode_flags_for_srcs( - SRCS "${MOE_WNAA16_MARLIN_SRC}" - CUDA_ARCHS "${MARLIN_MOE_ARCHS}") - - list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC}) - - message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}") - else() - message(STATUS "Not building Marlin MOE kernels as no compatible archs found" - " in CUDA target architectures") - endif() + optional_cuda_sources( + NAME "Marlin MOE" + ARCHS "8.0;9.0+PTX" + GEN_SCRIPT 
"${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py"
+    GEN_GLOB "csrc/moe/marlin_moe_wna16/*.cu"
+    SRCS "csrc/moe/moe_wna16.cu"
+    NO_ARCH_MSG "Not building Marlin MOE kernels as no compatible archs found in CUDA target architectures"
+    OUT_SRCS_VAR VLLM_MOE_EXT_SRC
+  )
 endif()
 
 if(VLLM_GPU_LANG STREQUAL "CUDA")
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 12e4e39024f5d..2ef0eb43897da 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -39,6 +39,34 @@ function (run_python OUT EXPR ERR_MSG)
   set(${OUT} ${PYTHON_OUT} PARENT_SCOPE)
 endfunction()
 
+# Generate CUDA sources via a Python script with caching based on script hash
+function(generate_cuda_sources NAME SCRIPT GLOB OUT_SRCS)
+  string(TOUPPER "${NAME}" _UPPER_NAME)
+  set(_CACHE_VAR "${_UPPER_NAME}_GEN_SCRIPT_HASH")
+  file(MD5 "${SCRIPT}" _GEN_HASH)
+  message(STATUS "${NAME} generation script hash: ${_GEN_HASH}")
+  message(STATUS "Last run ${NAME} generation script hash: $CACHE{${_CACHE_VAR}}")
+  if(NOT DEFINED CACHE{${_CACHE_VAR}} OR NOT $CACHE{${_CACHE_VAR}} STREQUAL "${_GEN_HASH}")
+    execute_process(
+      COMMAND ${CMAKE_COMMAND} -E env
+      PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$ENV{PYTHONPATH}
+      ${Python_EXECUTABLE} "${SCRIPT}"
+      RESULT_VARIABLE _GEN_RESULT
+      OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${NAME}_generation.log"
+      ERROR_FILE "${CMAKE_CURRENT_BINARY_DIR}/${NAME}_generation.log"
+    )
+    if(NOT _GEN_RESULT EQUAL 0)
+      message(FATAL_ERROR "${NAME} generation failed. Result: \"${_GEN_RESULT}\"\nCheck the log for details: ${CMAKE_CURRENT_BINARY_DIR}/${NAME}_generation.log")
+    else()
+      set(${_CACHE_VAR} "${_GEN_HASH}" CACHE STRING "Last run ${NAME} generation script hash" FORCE)
+      message(STATUS "${NAME} generation completed successfully.")
+    endif()
+  else()
+    message(STATUS "${NAME} generation script has not changed, skipping generation.")
+  endif()
+  file(GLOB ${OUT_SRCS} "${GLOB}")
+endfunction()
+
 # Run `EXPR` in python after importing `PKG`. 
Use the result of this to extend
# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported.
macro (append_cmake_prefix_path PKG EXPR)
@@ -86,6 +114,78 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
   set(${OUT_SRCS} ${HIP_SRCS} PARENT_SCOPE)
 endfunction()
 
+## Macro to conditionally include CUDA sources based on architecture and CUDA
+## compiler version, optionally generating sources via a Python script.
+## Usage:
+## optional_cuda_sources(
+##   NAME <name>
+##   [MIN_VERSION <version>]
+##   ARCHS <arch-list>
+##   SRCS <src> [<src> ...]
+##   [FLAGS <flag> ...]
+##   [VERSION_MSG <msg> [<msg> ...]]
+##   [NO_ARCH_MSG <msg> [<msg> ...]]
+##   [GEN_SCRIPT <script>]
+##   [GEN_GLOB <glob>]
+##   [APPEND_ARCHS <var>] [OUT_SRCS_VAR <var>])
+## This will run GEN_SCRIPT once when version and arch checks pass, globbing
+## sources matching GEN_GLOB and appending them alongside SRCS.
+macro(optional_cuda_sources)
+  set(oneValueArgs NAME MIN_VERSION APPEND_ARCHS GEN_SCRIPT GEN_GLOB OUT_SRCS_VAR)
+  set(multiValueArgs ARCHS SRCS FLAGS VERSION_MSG NO_ARCH_MSG)
+  cmake_parse_arguments(OCS "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  if(NOT OCS_NAME)
+    message(FATAL_ERROR "optional_cuda_sources: NAME is required")
+  endif()
+  if(NOT OCS_ARCHS)
+    message(FATAL_ERROR "optional_cuda_sources ${OCS_NAME}: ARCHS is required")
+  endif()
+  if(NOT OCS_SRCS)
+    message(FATAL_ERROR "optional_cuda_sources ${OCS_NAME}: SRCS is required")
+  endif()
+  if(NOT OCS_MIN_VERSION)
+    set(OCS_MIN_VERSION "0.0")
+  endif()
+  cuda_archs_loose_intersection(_OCS_ARCHS "${OCS_ARCHS}" "${CUDA_ARCHS}")
+  if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL ${OCS_MIN_VERSION})
+    if(_OCS_ARCHS)
+      set(_OCS_SRCS ${OCS_SRCS})
+      # Generate sources if a script is provided
+      if(OCS_GEN_SCRIPT AND OCS_GEN_GLOB)
+        generate_cuda_sources("${OCS_NAME}" "${OCS_GEN_SCRIPT}" "${OCS_GEN_GLOB}" _OCS_GEN_SRCS)
+        list(APPEND _OCS_SRCS ${_OCS_GEN_SRCS})
+      endif()
+      set_gencode_flags_for_srcs(SRCS "${_OCS_SRCS}" CUDA_ARCHS "${_OCS_ARCHS}")
+      if(OCS_OUT_SRCS_VAR)
+        list(APPEND ${OCS_OUT_SRCS_VAR} ${_OCS_SRCS})
+      else()
+        
list(APPEND VLLM_EXT_SRC ${_OCS_SRCS}) + endif() + if(OCS_FLAGS) + list(APPEND VLLM_GPU_FLAGS ${OCS_FLAGS}) + endif() + if(OCS_APPEND_ARCHS) + list(APPEND ${OCS_APPEND_ARCHS} ${_OCS_ARCHS}) + endif() + message(STATUS "Building ${OCS_NAME} for archs: ${_OCS_ARCHS}") + else() + if(OCS_NO_ARCH_MSG) + list(JOIN OCS_NO_ARCH_MSG "\n" _OCS_NO_ARCH_JOINED) + message(STATUS "${_OCS_NO_ARCH_JOINED}") + else() + message(STATUS "Not building ${OCS_NAME}: no compatible architectures found in CUDA target architectures") + endif() + endif() + else() + if(OCS_VERSION_MSG) + list(JOIN OCS_VERSION_MSG "\n" _OCS_VERSION_JOINED) + message(STATUS "${_OCS_VERSION_JOINED}") + else() + message(STATUS "Not building ${OCS_NAME}: CUDA Compiler version is less than ${OCS_MIN_VERSION}") + endif() + endif() +endmacro() + # # Get additional GPU compiler flags from torch. #