diff --git a/csrc/launch_bounds_utils.h b/csrc/launch_bounds_utils.h
index d5a89690111bc..92d7ef802f97f 100644
--- a/csrc/launch_bounds_utils.h
+++ b/csrc/launch_bounds_utils.h
@@ -8,11 +8,37 @@
   #define VLLM_LAUNCH_BLOCKS_CAP 4
 #endif
 
-// compile-time estimate of max threads per SM for launch bounds.
+// Compile-time estimate of max threads per SM for launch bounds.
+// Listed compute capabilities map to 1024, 1536, or 2048 threads/SM.
 #ifndef VLLM_MAX_THREADS_PER_SM
-  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
-    #define VLLM_MAX_THREADS_PER_SM 1536
+  #ifdef __CUDA_ARCH__
+
+    /* 1024 thr/SM: Turing (sm_75) */
+    #if (__CUDA_ARCH__ == 750)
+      #define VLLM_MAX_THREADS_PER_SM 1024
+
+    /* 1536 thr/SM: Ampere GA10x (sm_86/87), Ada (sm_89),
+       GB20x consumer (sm_120/121), Thor (sm_101 or sm_110) */
+    #elif (__CUDA_ARCH__ == 860) || (__CUDA_ARCH__ == 870) || \
+        (__CUDA_ARCH__ == 890) || (__CUDA_ARCH__ == 1010) || \
+        (__CUDA_ARCH__ == 1100) || (__CUDA_ARCH__ == 1200) || \
+        (__CUDA_ARCH__ == 1210)
+      #define VLLM_MAX_THREADS_PER_SM 1536
+
+    /* 2048 thr/SM: Volta (sm_70/72), Ampere GA100 (sm_80),
+       Hopper (sm_90), Blackwell (sm_100/103) */
+    #elif (__CUDA_ARCH__ == 700) || (__CUDA_ARCH__ == 720) || \
+        (__CUDA_ARCH__ == 800) || (__CUDA_ARCH__ == 900) || \
+        (__CUDA_ARCH__ == 1000) || (__CUDA_ARCH__ == 1030)
+      #define VLLM_MAX_THREADS_PER_SM 2048
+
+    /* Fallback: any CC not listed above (older or newer) uses 2048 */
+    #else
+      #define VLLM_MAX_THREADS_PER_SM 2048
+    #endif
+
   #else
+    /* Host pass (__CUDA_ARCH__ not defined): neutral default of 2048 */
     #define VLLM_MAX_THREADS_PER_SM 2048
   #endif
 #endif