Fix cuda_archs_loose_intersection when handling sm_*a (#20207)

Signed-off-by: Huy Do <huydhn@gmail.com>
2025-12-14 19:05:35 +08:00 · 2025-06-29 16:52:34 -07:00 · 2025-06-29 16:52:34 -07:00 · 6c9837a761
commit 6c9837a761
parent 6f2f53a82d
2 changed files with 26 additions and 21 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -562,7 +562,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
                     "if you intend on running FP8 quantized MoE models on Hopper.")
    else()
      message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
-                     "in CUDA target architectures")
+                     "in CUDA target architectures.")
    endif()
  endif()
@ -574,7 +574,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      SRCS "${SRCS}"
      CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
-  endif() 
+    message(STATUS "Building moe_data for archs: ${CUTLASS_MOE_DATA_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
      message(STATUS "Not building moe_data as CUDA Compiler version is "
                     "not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
                     "if you intend on running FP8 quantized MoE models on Hopper or Blackwell.")
    else()
      message(STATUS "Not building moe_data as no compatible archs found "
                     "in CUDA target architectures.")
    endif()
  endif()
  #
  # Machete kernels
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@ -265,8 +265,8 @@ macro(set_gencode_flags_for_srcs)
 endmacro()
 #
-# For the given `SRC_CUDA_ARCHS` list of gencode versions in the form 
+# For the given `SRC_CUDA_ARCHS` list of gencode versions in the form
-#  `<major>.<minor>[letter]` compute the "loose intersection" with the 
+#  `<major>.<minor>[letter]` compute the "loose intersection" with the
 #  `TGT_CUDA_ARCHS` list of gencodes. We also support the `+PTX` suffix in
 #  `SRC_CUDA_ARCHS` which indicates that the PTX code should be built when there
 #  is a CUDA_ARCH in `TGT_CUDA_ARCHS` that is equal to or larger than the
@ -278,7 +278,7 @@ endmacro()
 #  in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`.
 # We have special handling for x.0a, if x.0a is in `SRC_CUDA_ARCHS` and x.0 is
 #  in `TGT_CUDA_ARCHS` then we should remove x.0a from `SRC_CUDA_ARCHS` and add
-#  x.0a to the result (and remove x.0 from TGT_CUDA_ARCHS). 
+#  x.0a to the result (and remove x.0 from TGT_CUDA_ARCHS).
 # The result is stored in `OUT_CUDA_ARCHS`.
 #
 # Example:
@ -313,21 +313,16 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
  # if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
  # remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS
  set(_CUDA_ARCHS)
-  if ("9.0a" IN_LIST _SRC_CUDA_ARCHS)
+  foreach(_arch ${_SRC_CUDA_ARCHS})
-    list(REMOVE_ITEM _SRC_CUDA_ARCHS "9.0a")
+    if(_arch MATCHES "\\a$")
-    if ("9.0" IN_LIST TGT_CUDA_ARCHS)
+      list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
-      list(REMOVE_ITEM _TGT_CUDA_ARCHS "9.0")
+      string(REPLACE "a" "" _base "${_arch}")
-      set(_CUDA_ARCHS "9.0a")
+      if ("${_base}" IN_LIST TGT_CUDA_ARCHS)
        list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}")
        list(APPEND _CUDA_ARCHS "${_arch}")
      endif()
    endif()
-  endif()
+  endforeach()
  if ("10.0a" IN_LIST _SRC_CUDA_ARCHS)
    list(REMOVE_ITEM _SRC_CUDA_ARCHS "10.0a")
    if ("10.0" IN_LIST TGT_CUDA_ARCHS)
      list(REMOVE_ITEM _TGT_CUDA_ARCHS "10.0")
      set(_CUDA_ARCHS "10.0a")
    endif()
  endif()
  list(SORT _SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
@ -359,7 +354,7 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
  endforeach()
  list(REMOVE_DUPLICATES _CUDA_ARCHS)
-  
+
  # reapply +PTX suffix to architectures that requested PTX
  set(_FINAL_ARCHS)
  foreach(_arch ${_CUDA_ARCHS})
@ -370,7 +365,7 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
    endif()
  endforeach()
  set(_CUDA_ARCHS ${_FINAL_ARCHS})
-  
+
  set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE)
 endfunction()