From 84135b1489123f1a392a53069394a380f192b507 Mon Sep 17 00:00:00 2001 From: Jun Jiang Date: Fri, 3 Oct 2025 23:48:32 +0800 Subject: [PATCH] Fix undefined symbol: cutlass_moe_mm_sm100 (#26098) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Jun Jiang Co-authored-by: Luka Govedič Signed-off-by: yewentao256 --- CMakeLists.txt | 2 +- .../w8a8/cutlass/scaled_mm_entry.cu | 4 ++-- vllm/utils/__init__.py | 20 +++++++++++++++++++ 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4aa0f84fe2d74..66967b655a1a2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -668,7 +668,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") else() cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}") endif() diff --git a/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu b/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu index 04b64a35da376..1001af05ff003 100644 --- a/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu +++ b/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu @@ -254,7 +254,7 @@ void cutlass_moe_mm( bool per_act_token, bool per_out_ch) { int32_t version_num = get_sm_version_num(); #if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100 - if (version_num >= 100) { + if (version_num >= 100 && version_num < 110) { cutlass_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets, problem_sizes, a_strides, b_strides, c_strides, per_act_token, per_out_ch); @@ -262,7 +262,7 @@ void cutlass_moe_mm( } #endif #if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90 - if (version_num >= 90) { + if (version_num >= 90 && version_num < 100) { cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales, 
expert_offsets, problem_sizes, a_strides, b_strides, c_strides, per_act_token, per_out_ch); diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 8c69870b2bc35..6b208bca69869 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -2747,6 +2747,8 @@ class MemorySnapshot: self.measure() def measure(self): + from vllm.platforms import current_platform + # we measure the torch peak memory usage via allocated_bytes, # rather than `torch.cuda.memory_reserved()` . # After `torch.cuda.reset_peak_memory_stats()`, @@ -2756,6 +2758,24 @@ class MemorySnapshot: "allocated_bytes.all.peak", 0) self.free_memory, self.total_memory = torch.cuda.mem_get_info() + shared_sysmem_device_mem_sms = ( + (8, 7), (11, 0), (12, 1)) # Orin, Thor, Spark + if current_platform.is_cuda() and \ + current_platform.get_device_capability() in \ + shared_sysmem_device_mem_sms: + # On UMA (Orin, Thor and Spark) platforms, + # where both CPU and GPU rely on system memory, + # the cudaMemGetInfo function shows the amount of free system memory + # rather than what’s actually available. + # In this case, + # torch.cuda.mem_get_info() only reports "free" memory, + # which can be lower than what is actually + # available due to not including cache memory. + # There’s also a comprehensive reference page + # that explains how you can compute the proper value yourself. + # https://docs.nvidia.com/cuda/cuda-for-tegra-appnote/#estimating-total-allocatable-device-memory-on-an-integrated-gpu-device + self.free_memory = psutil.virtual_memory().available + self.cuda_memory = self.total_memory - self.free_memory # torch.cuda.memory_reserved() is how many bytes