Remove APPEND_ARCHS: no need to append archs with the PTX change

Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
This commit is contained in:
Lucas Wilkinson 2025-05-16 20:52:49 +00:00
parent 7ebe64f94b
commit bf75539f9e
2 changed files with 3 additions and 17 deletions

View File

@ -332,8 +332,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu"
)
set(SCALED_MM_3X_ARCHS)
# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require CUDA 12.0 or later
optional_cuda_sources(
NAME scaled_mm_c3x_sm90
@ -346,7 +344,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu"
FLAGS "-DENABLE_SCALED_MM_SM90=1"
APPEND_ARCHS SCALED_MM_3X_ARCHS
VERSION_MSG
"Not building scaled_mm_c3x_sm90: CUDA Compiler version is not >= 12.0.\n"
"Please upgrade to CUDA 12.0 or later to run FP8 quantized models on Hopper."
@ -362,22 +359,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
FLAGS "-DENABLE_SCALED_MM_SM100=1"
APPEND_ARCHS SCALED_MM_3X_ARCHS
VERSION_MSG
"Not building scaled_mm_c3x_sm100: CUDA Compiler version is not >= 12.8.\n"
"Please upgrade to CUDA 12.8 or later to run FP8 quantized models on Blackwell."
)
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
# kernels for the remaining archs that are not already built for 3x.
# (Build 8.9 for FP8)
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
"7.5;8.0;8.9+PTX" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
# The cutlass_scaled_mm kernels for pre-Hopper archs (c2x, i.e. CUTLASS 2.x)
optional_cuda_sources(
NAME scaled_mm_c2x
ARCHS "${SCALED_MM_2X_ARCHS}"
ARCHS "7.5;8.0;8.9+PTX"
SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
FLAGS "-DENABLE_SCALED_MM_C2X=1"
NO_ARCH_MSG "Not building scaled_mm_c2x as no compatible archs found in CUDA target architectures, "

View File

@ -127,11 +127,10 @@ endfunction()
# [NO_ARCH_MSG <line1> [<line2> ...]]
# [GEN_SCRIPT <path/to/generate_script.py>]
# [GEN_GLOB <glob_pattern_for_generated_sources>]
# [APPEND_ARCHS <var_to_append_archs>]
# This will run GEN_SCRIPT once when version and arch checks pass, globbing
# sources matching GEN_GLOB and appending them alongside SRCS.
macro(optional_cuda_sources)
set(oneValueArgs NAME MIN_VERSION APPEND_ARCHS GEN_SCRIPT GEN_GLOB OUT_SRCS_VAR)
set(oneValueArgs NAME MIN_VERSION GEN_SCRIPT GEN_GLOB OUT_SRCS_VAR)
set(multiValueArgs ARCHS SRCS FLAGS VERSION_MSG NO_ARCH_MSG)
cmake_parse_arguments(OCS "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if(NOT OCS_NAME)
@ -164,9 +163,6 @@ macro(optional_cuda_sources)
if(OCS_FLAGS)
list(APPEND VLLM_GPU_FLAGS ${OCS_FLAGS})
endif()
if(OCS_APPEND_ARCHS)
list(APPEND ${OCS_APPEND_ARCHS} ${_OCS_ARCHS})
endif()
message(STATUS "Building ${OCS_NAME} for archs: ${_OCS_ARCHS}")
else()
if(OCS_NO_ARCH_MSG)