From 84135b1489123f1a392a53069394a380f192b507 Mon Sep 17 00:00:00 2001 From: Jun Jiang Date: Fri, 3 Oct 2025 23:48:32 +0800 Subject: [PATCH] Fix undefined symbol: cutlass_moe_mm_sm100 (#26098) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Jun Jiang Co-authored-by: Luka Govedič Signed-off-by: yewentao256 --- CMakeLists.txt | 2 +- .../w8a8/cutlass/scaled_mm_entry.cu | 4 ++-- vllm/utils/__init__.py | 20 +++++++++++++++++++ 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4aa0f84fe2d74..66967b655a1a2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -668,7 +668,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") else() cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}") endif() diff --git a/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu b/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu index 04b64a35da376..1001af05ff003 100644 --- a/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu +++ b/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu @@ -254,7 +254,7 @@ void cutlass_moe_mm( bool per_act_token, bool per_out_ch) { int32_t version_num = get_sm_version_num(); #if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100 - if (version_num >= 100) { + if (version_num >= 100 && version_num < 110) { cutlass_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets, problem_sizes, a_strides, b_strides, c_strides, per_act_token, per_out_ch); @@ -262,7 +262,7 @@ void cutlass_moe_mm( } #endif #if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90 - if (version_num >= 90) { + if (version_num >= 90 && version_num < 100) { cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales, 
expert_offsets, problem_sizes, a_strides, b_strides, c_strides, per_act_token, per_out_ch); diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 8c69870b2bc35..6b208bca69869 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -2747,6 +2747,8 @@ class MemorySnapshot: self.measure() def measure(self): + from vllm.platforms import current_platform + # we measure the torch peak memory usage via allocated_bytes, # rather than `torch.cuda.memory_reserved()` . # After `torch.cuda.reset_peak_memory_stats()`, @@ -2756,6 +2758,24 @@ class MemorySnapshot: "allocated_bytes.all.peak", 0) self.free_memory, self.total_memory = torch.cuda.mem_get_info() + shared_sysmem_device_mem_sms = ( + (8, 7), (11, 0), (12, 1)) # Orin, Thor, Spark + if current_platform.is_cuda() and \ + current_platform.get_device_capability() in \ + shared_sysmem_device_mem_sms: + # On UMA (Orin, Thor and Spark) platforms, + # where both CPU and GPU rely on system memory, + # the cudaMemGetInfo function shows the amount of free system memory + # rather than what’s actually available. + # In this case, + # torch.cuda.mem_get_info() only reports "free" memory, + # which can be lower than what is actually + # available due to not including cache memory. + # There’s also a comprehensive reference page + # that explains how you can compute the proper value yourself. + # https://docs.nvidia.com/cuda/cuda-for-tegra-appnote/#estimating-total-allocatable-device-memory-on-an-integrated-gpu-device + self.free_memory = psutil.virtual_memory().available + self.cuda_memory = self.total_memory - self.free_memory # torch.cuda.memory_reserved() is how many bytes