From bf75539f9e8130dd5272c9a01ba776c1c76c917c Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Fri, 16 May 2025 20:52:49 +0000 Subject: [PATCH] no need to append with the PTX change Signed-off-by: Lucas Wilkinson --- CMakeLists.txt | 14 ++------------ cmake/utils.cmake | 6 +----- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ce9343bb016b4..11e13053bde48 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -332,8 +332,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu" ) - - set(SCALED_MM_3X_ARCHS) # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require CUDA 12.0 or later optional_cuda_sources( NAME scaled_mm_c3x_sm90 @@ -346,7 +344,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu" FLAGS "-DENABLE_SCALED_MM_SM90=1" - APPEND_ARCHS SCALED_MM_3X_ARCHS VERSION_MSG "Not building scaled_mm_c3x_sm90: CUDA Compiler version is not >= 12.0.\n" "Please upgrade to CUDA 12.0 or later to run FP8 quantized models on Hopper." @@ -362,22 +359,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu" FLAGS "-DENABLE_SCALED_MM_SM100=1" - APPEND_ARCHS SCALED_MM_3X_ARCHS VERSION_MSG "Not building scaled_mm_c3x_sm100: CUDA Compiler version is not >= 12.8.\n" "Please upgrade to CUDA 12.8 or later to run FP8 quantized models on Blackwell." ) - # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x) - # kernels for the remaining archs that are not already built for 3x. 
- # (Build 8.9 for FP8) - cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS - "7.5;8.0;8.9+PTX" "${CUDA_ARCHS}") - # subtract out the archs that are already built for 3x - list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) + # The cutlass_scaled_mm kernels for pre-Hopper archs (c2x, i.e. CUTLASS 2.x) optional_cuda_sources( NAME scaled_mm_c2x - ARCHS "${SCALED_MM_2X_ARCHS}" + ARCHS "7.5;8.0;8.9+PTX" SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu" FLAGS "-DENABLE_SCALED_MM_C2X=1" NO_ARCH_MSG "Not building scaled_mm_c2x as no compatible archs found in CUDA target architectures, " diff --git a/cmake/utils.cmake b/cmake/utils.cmake index a736da24fc200..7303e07f63db4 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -127,11 +127,10 @@ endfunction() # [NO_ARCH_MSG [ ...]] # [GEN_SCRIPT ] # [GEN_GLOB ] -# [APPEND_ARCHS ] # This will run GEN_SCRIPT once when version and arch checks pass, globbing # sources matching GEN_GLOB and appending them alongside SRCS. macro(optional_cuda_sources) - set(oneValueArgs NAME MIN_VERSION APPEND_ARCHS GEN_SCRIPT GEN_GLOB OUT_SRCS_VAR) + set(oneValueArgs NAME MIN_VERSION GEN_SCRIPT GEN_GLOB OUT_SRCS_VAR) set(multiValueArgs ARCHS SRCS FLAGS VERSION_MSG NO_ARCH_MSG) cmake_parse_arguments(OCS "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) if(NOT OCS_NAME) @@ -164,9 +163,6 @@ macro(optional_cuda_sources) if(OCS_FLAGS) list(APPEND VLLM_GPU_FLAGS ${OCS_FLAGS}) endif() - if(OCS_APPEND_ARCHS) - list(APPEND ${OCS_APPEND_ARCHS} ${_OCS_ARCHS}) - endif() message(STATUS "Building ${OCS_NAME} for archs: ${_OCS_ARCHS}") else() if(OCS_NO_ARCH_MSG)