mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-07 12:59:08 +08:00
Fix undefined symbol: cutlass_moe_mm_sm100 (#26098)
Signed-off-by: Jun Jiang <jasl9187@hotmail.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
This commit is contained in:
parent
d78fda7cda
commit
4f8f47e87e
@ -667,7 +667,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
|
||||||
else()
|
else()
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||||
endif()
|
endif()
|
||||||
|
|||||||
@ -254,7 +254,7 @@ void cutlass_moe_mm(
|
|||||||
bool per_act_token, bool per_out_ch) {
|
bool per_act_token, bool per_out_ch) {
|
||||||
int32_t version_num = get_sm_version_num();
|
int32_t version_num = get_sm_version_num();
|
||||||
#if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100
|
#if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100
|
||||||
if (version_num >= 100) {
|
if (version_num >= 100 && version_num < 110) {
|
||||||
cutlass_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
|
cutlass_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
|
||||||
expert_offsets, problem_sizes, a_strides, b_strides,
|
expert_offsets, problem_sizes, a_strides, b_strides,
|
||||||
c_strides, per_act_token, per_out_ch);
|
c_strides, per_act_token, per_out_ch);
|
||||||
@ -262,7 +262,7 @@ void cutlass_moe_mm(
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
|
#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
|
||||||
if (version_num >= 90) {
|
if (version_num >= 90 && version_num < 100) {
|
||||||
cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
|
cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
|
||||||
expert_offsets, problem_sizes, a_strides, b_strides,
|
expert_offsets, problem_sizes, a_strides, b_strides,
|
||||||
c_strides, per_act_token, per_out_ch);
|
c_strides, per_act_token, per_out_ch);
|
||||||
|
|||||||
@ -2747,6 +2747,8 @@ class MemorySnapshot:
|
|||||||
self.measure()
|
self.measure()
|
||||||
|
|
||||||
def measure(self):
|
def measure(self):
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
# we measure the torch peak memory usage via allocated_bytes,
|
# we measure the torch peak memory usage via allocated_bytes,
|
||||||
# rather than `torch.cuda.memory_reserved()` .
|
# rather than `torch.cuda.memory_reserved()` .
|
||||||
# After `torch.cuda.reset_peak_memory_stats()`,
|
# After `torch.cuda.reset_peak_memory_stats()`,
|
||||||
@ -2756,6 +2758,24 @@ class MemorySnapshot:
|
|||||||
"allocated_bytes.all.peak", 0)
|
"allocated_bytes.all.peak", 0)
|
||||||
|
|
||||||
self.free_memory, self.total_memory = torch.cuda.mem_get_info()
|
self.free_memory, self.total_memory = torch.cuda.mem_get_info()
|
||||||
|
shared_sysmem_device_mem_sms = (
|
||||||
|
(8, 7), (11, 0), (12, 1)) # Orin, Thor, Spark
|
||||||
|
if current_platform.is_cuda() and \
|
||||||
|
current_platform.get_device_capability() in \
|
||||||
|
shared_sysmem_device_mem_sms:
|
||||||
|
# On UMA platforms (Orin, Thor and Spark),
|
||||||
|
# where both CPU and GPU rely on system memory,
|
||||||
|
# the cudaMemGetInfo function shows the amount of free system memory
|
||||||
|
# rather than what’s actually available.
|
||||||
|
# In this case,
|
||||||
|
# torch.cuda.mem_get_info() only reports "free" memory,
|
||||||
|
# which can be lower than what is actually
|
||||||
|
# available due to not including cache memory.
|
||||||
|
# There’s also a comprehensive reference page
|
||||||
|
# that explains how you can compute the proper value yourself.
|
||||||
|
# https://docs.nvidia.com/cuda/cuda-for-tegra-appnote/#estimating-total-allocatable-device-memory-on-an-integrated-gpu-device
|
||||||
|
self.free_memory = psutil.virtual_memory().available
|
||||||
|
|
||||||
self.cuda_memory = self.total_memory - self.free_memory
|
self.cuda_memory = self.total_memory - self.free_memory
|
||||||
|
|
||||||
# torch.cuda.memory_reserved() is how many bytes
|
# torch.cuda.memory_reserved() is how many bytes
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user