mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-12 06:17:03 +08:00
no need to append with the PTX change
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
This commit is contained in:
parent
7ebe64f94b
commit
bf75539f9e
@ -332,8 +332,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
"csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu"
|
||||
)
|
||||
|
||||
|
||||
set(SCALED_MM_3X_ARCHS)
|
||||
# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require CUDA 12.0 or later
|
||||
optional_cuda_sources(
|
||||
NAME scaled_mm_c3x_sm90
|
||||
@ -346,7 +344,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
|
||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu"
|
||||
FLAGS "-DENABLE_SCALED_MM_SM90=1"
|
||||
APPEND_ARCHS SCALED_MM_3X_ARCHS
|
||||
VERSION_MSG
|
||||
"Not building scaled_mm_c3x_sm90: CUDA Compiler version is not >= 12.0.\n"
|
||||
"Please upgrade to CUDA 12.0 or later to run FP8 quantized models on Hopper."
|
||||
@ -362,22 +359,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
|
||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
|
||||
FLAGS "-DENABLE_SCALED_MM_SM100=1"
|
||||
APPEND_ARCHS SCALED_MM_3X_ARCHS
|
||||
VERSION_MSG
|
||||
"Not building scaled_mm_c3x_sm100: CUDA Compiler version is not >= 12.8.\n"
|
||||
"Please upgrade to CUDA 12.8 or later to run FP8 quantized models on Blackwell."
|
||||
)
|
||||
|
||||
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
|
||||
# kernels for the remaining archs that are not already built for 3x.
|
||||
# (Build 8.9 for FP8)
|
||||
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
|
||||
"7.5;8.0;8.9+PTX" "${CUDA_ARCHS}")
|
||||
# subtract out the archs that are already built for 3x
|
||||
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
|
||||
# For the cutlass_scaled_mm kernels for Pre-hopper (c2x, i.e. CUTLASS 2.x)
|
||||
optional_cuda_sources(
|
||||
NAME scaled_mm_c2x
|
||||
ARCHS "${SCALED_MM_2X_ARCHS}"
|
||||
ARCHS "7.5;8.0;8.9+PTX"
|
||||
SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
|
||||
FLAGS "-DENABLE_SCALED_MM_C2X=1"
|
||||
NO_ARCH_MSG "Not building scaled_mm_c2x as no compatible archs found in CUDA target architectures, "
|
||||
|
||||
@ -127,11 +127,10 @@ endfunction()
|
||||
# [NO_ARCH_MSG <line1> [<line2> ...]]
|
||||
# [GEN_SCRIPT <path/to/generate_script.py>]
|
||||
# [GEN_GLOB <glob_pattern_for_generated_sources>]
|
||||
# [APPEND_ARCHS <var_to_append_archs>]
|
||||
# This will run GEN_SCRIPT once when version and arch checks pass, globbing
|
||||
# sources matching GEN_GLOB and appending them alongside SRCS.
|
||||
macro(optional_cuda_sources)
|
||||
set(oneValueArgs NAME MIN_VERSION APPEND_ARCHS GEN_SCRIPT GEN_GLOB OUT_SRCS_VAR)
|
||||
set(oneValueArgs NAME MIN_VERSION GEN_SCRIPT GEN_GLOB OUT_SRCS_VAR)
|
||||
set(multiValueArgs ARCHS SRCS FLAGS VERSION_MSG NO_ARCH_MSG)
|
||||
cmake_parse_arguments(OCS "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
|
||||
if(NOT OCS_NAME)
|
||||
@ -164,9 +163,6 @@ macro(optional_cuda_sources)
|
||||
if(OCS_FLAGS)
|
||||
list(APPEND VLLM_GPU_FLAGS ${OCS_FLAGS})
|
||||
endif()
|
||||
if(OCS_APPEND_ARCHS)
|
||||
list(APPEND ${OCS_APPEND_ARCHS} ${_OCS_ARCHS})
|
||||
endif()
|
||||
message(STATUS "Building ${OCS_NAME} for archs: ${_OCS_ARCHS}")
|
||||
else()
|
||||
if(OCS_NO_ARCH_MSG)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user