diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu index 064b76c9cd42..ea4ff67ef3e4 100644 --- a/csrc/moe/topk_softmax_kernels.cu +++ b/csrc/moe/topk_softmax_kernels.cu @@ -20,6 +20,7 @@ #include #include #include "../cuda_compat.h" +#include #ifndef USE_ROCM #include @@ -62,7 +63,7 @@ __launch_bounds__(TPB) __global__ const int thread_row_offset = blockIdx.x * num_cols; - cub::Sum sum; + cuda::std::plus sum; float threadData(-FLT_MAX); // Don't touch finished rows.