Remove APPEND_ARCHS: no need to append archs with the PTX change

Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2026-05-27 08:54:26 +08:00 · 2025-05-16 20:52:49 +00:00 · 2025-05-16 20:52:49 +00:00 · bf75539f9e
commit bf75539f9e
parent 7ebe64f94b
2 changed files with 3 additions and 17 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -332,8 +332,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu"
  )

-
-  set(SCALED_MM_3X_ARCHS)
  # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require CUDA 12.0 or later
  optional_cuda_sources(
    NAME scaled_mm_c3x_sm90
@ -346,7 +344,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu"
    FLAGS "-DENABLE_SCALED_MM_SM90=1"
-    APPEND_ARCHS SCALED_MM_3X_ARCHS
    VERSION_MSG
      "Not building scaled_mm_c3x_sm90: CUDA Compiler version is not >= 12.0.\n"
      "Please upgrade to CUDA 12.0 or later to run FP8 quantized models on Hopper."
@ -362,22 +359,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
    FLAGS "-DENABLE_SCALED_MM_SM100=1"
-    APPEND_ARCHS SCALED_MM_3X_ARCHS
    VERSION_MSG
      "Not building scaled_mm_c3x_sm100: CUDA Compiler version is not >= 12.8.\n"
      "Please upgrade to CUDA 12.8 or later to run FP8 quantized models on Blackwell."
  )

-  # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
-  # kernels for the remaining archs that are not already built for 3x.
-  # (Build 8.9 for FP8)
-  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
-    "7.5;8.0;8.9+PTX" "${CUDA_ARCHS}")
-  # subtract out the archs that are already built for 3x
-  list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
+  # Build the cutlass_scaled_mm kernels for pre-Hopper archs (c2x, i.e. CUTLASS 2.x)
  optional_cuda_sources(
    NAME scaled_mm_c2x
-    ARCHS "${SCALED_MM_2X_ARCHS}"
+    ARCHS "7.5;8.0;8.9+PTX"
    SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
    FLAGS "-DENABLE_SCALED_MM_C2X=1"
    NO_ARCH_MSG "Not building scaled_mm_c2x as no compatible archs found in CUDA target architectures, "
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@ -127,11 +127,10 @@ endfunction()
 #     [NO_ARCH_MSG <line1> [<line2> ...]]
 #     [GEN_SCRIPT <path/to/generate_script.py>]
 #     [GEN_GLOB <glob_pattern_for_generated_sources>]
-#     [APPEND_ARCHS <var_to_append_archs>]
 # This will run GEN_SCRIPT once when version and arch checks pass, globbing
 # sources matching GEN_GLOB and appending them alongside SRCS.
 macro(optional_cuda_sources)
-  set(oneValueArgs NAME MIN_VERSION APPEND_ARCHS GEN_SCRIPT GEN_GLOB OUT_SRCS_VAR)
+  set(oneValueArgs NAME MIN_VERSION GEN_SCRIPT GEN_GLOB OUT_SRCS_VAR)
  set(multiValueArgs ARCHS SRCS FLAGS VERSION_MSG NO_ARCH_MSG)
  cmake_parse_arguments(OCS "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
  if(NOT OCS_NAME)
@ -164,9 +163,6 @@ macro(optional_cuda_sources)
      if(OCS_FLAGS)
        list(APPEND VLLM_GPU_FLAGS ${OCS_FLAGS})
      endif()
-      if(OCS_APPEND_ARCHS)
-        list(APPEND ${OCS_APPEND_ARCHS} ${_OCS_ARCHS})
-      endif()
      message(STATUS "Building ${OCS_NAME} for archs: ${_OCS_ARCHS}")
    else()
      if(OCS_NO_ARCH_MSG)