From cbbc904470668b9420e71595edeef76d673a2d59 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith
Date: Tue, 30 Jul 2024 13:50:42 -0400
Subject: [PATCH] [Kernel] Squash a few more warnings (#6914)

---
 csrc/attention/attention_kernels.cu               | 4 ++--
 csrc/quantization/aqlm/gemm_kernels.cu            | 2 --
 csrc/quantization/fp8/amd/quant_utils.cuh         | 2 ++
 csrc/quantization/fp8/nvidia/quant_utils.cuh      | 2 ++
 csrc/quantization/squeezellm/quant_cuda_kernel.cu | 3 ++-
 5 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu
index 875570a1e894..bcd170411e7c 100644
--- a/csrc/attention/attention_kernels.cu
+++ b/csrc/attention/attention_kernels.cu
@@ -706,7 +706,7 @@ void paged_attention_v1_launcher(
   int kv_block_stride = key_cache.stride(0);
   int kv_head_stride = key_cache.stride(1);
 
-  int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+  [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
   assert(head_size % thread_group_size == 0);
 
   // NOTE: alibi_slopes is optional.
@@ -865,7 +865,7 @@ void paged_attention_v2_launcher(
   int kv_block_stride = key_cache.stride(0);
   int kv_head_stride = key_cache.stride(1);
 
-  int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+  [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
   assert(head_size % thread_group_size == 0);
 
   // NOTE: alibi_slopes is optional.
diff --git a/csrc/quantization/aqlm/gemm_kernels.cu b/csrc/quantization/aqlm/gemm_kernels.cu
index 8fb985680086..22da5e4f08a1 100644
--- a/csrc/quantization/aqlm/gemm_kernels.cu
+++ b/csrc/quantization/aqlm/gemm_kernels.cu
@@ -273,8 +273,6 @@ __global__ void Code2x8Dequant(
   }
   __syncthreads();
 
-  float res = 0;
-
   int iters = (prob_k / 8 - 1) / (8 * 32) + 1;
   while (iters--) {
     if (pred && a_gl_rd < a_gl_end) {
diff --git a/csrc/quantization/fp8/amd/quant_utils.cuh b/csrc/quantization/fp8/amd/quant_utils.cuh
index 35123d7fc65d..eb66834222f3 100644
--- a/csrc/quantization/fp8/amd/quant_utils.cuh
+++ b/csrc/quantization/fp8/amd/quant_utils.cuh
@@ -526,6 +526,7 @@ __inline__ __device__ Tout convert(const Tin& x) {
   }
 #endif
   assert(false);
+  return {}; // Squash missing return statement warning
 }
 
 template <typename Tout, typename Tin>
@@ -536,6 +537,7 @@ __inline__ __device__ Tout scaled_convert(const Tin& x, const float scale) {
   }
 #endif
   assert(false);
+  return {}; // Squash missing return statement warning
 }
 
 // The following macro is used to dispatch the conversion function based on
diff --git a/csrc/quantization/fp8/nvidia/quant_utils.cuh b/csrc/quantization/fp8/nvidia/quant_utils.cuh
index cde26dbda18c..e32684eaed24 100644
--- a/csrc/quantization/fp8/nvidia/quant_utils.cuh
+++ b/csrc/quantization/fp8/nvidia/quant_utils.cuh
@@ -508,6 +508,7 @@ __inline__ __device__ Tout convert(const Tin& x) {
   }
 #endif
   assert(false);
+  return {}; // Squash missing return statement warning
 }
 
 template <typename Tout, typename Tin>
@@ -520,6 +521,7 @@ __inline__ __device__ Tout scaled_convert(const Tin& x, const float scale) {
   }
 #endif
   assert(false);
+  return {}; // Squash missing return statement warning
 }
 
 // The following macro is used to dispatch the conversion function based on
diff --git a/csrc/quantization/squeezellm/quant_cuda_kernel.cu b/csrc/quantization/squeezellm/quant_cuda_kernel.cu
index 714907428a1a..8ed918b3d7c2 100644
--- a/csrc/quantization/squeezellm/quant_cuda_kernel.cu
+++ b/csrc/quantization/squeezellm/quant_cuda_kernel.cu
@@ -203,7 +203,8 @@ void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
 #endif
       mat.data_ptr<int>(),
 #ifndef USE_ROCM
-      (half2*)mul.data<at::Half>(), (__half*)lookup_table.data_ptr<at::Half>(),
+      (half2*)mul.data_ptr<at::Half>(),
+      (__half*)lookup_table.data_ptr<at::Half>(),
 #else
       (float2*)mul.data_ptr<float>(),
       (__half*)lookup_table.data_ptr<at::Half>(),
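
Note on the warning-squashing idioms used above: the patch leans on two standard
C++17 techniques. A variable that is only read inside assert() becomes unused
when NDEBUG strips the assert, so [[maybe_unused]] silences -Wunused-variable;
likewise, assert(false) compiles away under NDEBUG, leaving a non-void function
that can fall off the end, so a value-initialized return {} keeps release builds
well-formed. The squeezellm change is separate: it replaces the deprecated
at::Tensor::data<T>() accessor with data_ptr<T>(). Below is a minimal,
self-contained host-side sketch of both idioms; the names check_head_size and
convert_or_die are illustrative, not from the vLLM sources.

#include <algorithm>
#include <cassert>
#include <type_traits>

// With NDEBUG defined, assert() expands to nothing, so thread_group_size would
// otherwise trigger -Wunused-variable in release builds; [[maybe_unused]]
// marks the "unused" case as intentional.
inline void check_head_size(int head_size, int warp_size, int block_size) {
  [[maybe_unused]] int thread_group_size = std::max(warp_size / block_size, 1);
  assert(head_size % thread_group_size == 0);
}

// assert(false) also compiles away under NDEBUG, leaving a control path that
// falls off the end of a non-void function ("missing return statement").
// Returning a value-initialized Tout keeps the function well-formed.
template <typename Tout, typename Tin>
inline Tout convert_or_die(const Tin& x) {
  if constexpr (std::is_convertible_v<Tin, Tout>) {
    return static_cast<Tout>(x);
  }
  assert(false);  // unreachable for supported conversions
  return {};      // squash missing-return warning, as in convert/scaled_convert
}

For example, convert_or_die<float>(3) returns 3.0f, while an unsupported
instantiation aborts in a debug build and returns a zero-initialized value
under NDEBUG, mirroring the assert(false); return {}; pattern the patch adds
to convert and scaled_convert in both fp8 quant_utils headers.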