From c6703d1e0d488a09dc76562c7335306f0f3486c0 Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Mon, 16 Jun 2025 11:05:28 +0800 Subject: [PATCH] [MISC] Remove unused variables in C++ (#19609) Signed-off-by: Lu Fang --- csrc/attention/paged_attention_v1.cu | 5 +---- csrc/attention/paged_attention_v2.cu | 5 +---- csrc/prepare_inputs/advance_step.cu | 1 - csrc/quantization/fp8/amd/quant_utils.cuh | 2 -- csrc/quantization/gptq/q_gemm.cu | 8 -------- csrc/rocm/attention.cu | 20 -------------------- 6 files changed, 2 insertions(+), 39 deletions(-) diff --git a/csrc/attention/paged_attention_v1.cu b/csrc/attention/paged_attention_v1.cu index 9b3a5c4b1014a..46108a32d719b 100644 --- a/csrc/attention/paged_attention_v1.cu +++ b/csrc/attention/paged_attention_v1.cu @@ -65,9 +65,6 @@ void paged_attention_v1_launcher( int kv_block_stride = key_cache.stride(0); int kv_head_stride = key_cache.stride(1); - [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1); - assert(head_size % thread_group_size == 0); - // NOTE: alibi_slopes is optional. 
const float* alibi_slopes_ptr = alibi_slopes @@ -203,4 +200,4 @@ void paged_attention_v2( #undef WARP_SIZE #undef MAX #undef MIN -#undef DIVIDE_ROUND_UP \ No newline at end of file +#undef DIVIDE_ROUND_UP diff --git a/csrc/prepare_inputs/advance_step.cu b/csrc/prepare_inputs/advance_step.cu index fea4bc2ca0d8f..3d5077d9de461 100644 --- a/csrc/prepare_inputs/advance_step.cu +++ b/csrc/prepare_inputs/advance_step.cu @@ -274,7 +274,6 @@ void advance_step_flashinfer( cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev); cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev); - [[maybe_unused]] int block_tables_stride = block_tables.stride(0); TORCH_CHECK((blocks * threads > num_queries), "multi-step: not enough threads to map to num_queries = ", num_queries, " block_tables.stride(0) = ", block_tables.stride(0), diff --git a/csrc/quantization/fp8/amd/quant_utils.cuh b/csrc/quantization/fp8/amd/quant_utils.cuh index c4ed1b4757928..e51a4e14e518f 100644 --- a/csrc/quantization/fp8/amd/quant_utils.cuh +++ b/csrc/quantization/fp8/amd/quant_utils.cuh @@ -446,8 +446,6 @@ scaled_vec_conversion(const uint8_t& a, float scale) { template <> __inline__ __device__ uint32_t scaled_vec_conversion(const uint16_t& a, float scale) { - [[maybe_unused]] __half2_raw h2r = - __hip_cvt_fp8x2_to_halfraw2(a, fp8_type::__default_interpret); union { __half2_raw h2r; uint32_t ui32; diff --git a/csrc/quantization/gptq/q_gemm.cu b/csrc/quantization/gptq/q_gemm.cu index 6fad16e196bbc..43b245530e950 100644 --- a/csrc/quantization/gptq/q_gemm.cu +++ b/csrc/quantization/gptq/q_gemm.cu @@ -206,8 +206,6 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel( auto offset_m = blockIdx.y * m_count; auto offset_k = blockIdx.z * BLOCK_KN_SIZE; - [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); - [[maybe_unused]] int end_m = min(offset_m + m_count, size_m); int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int n = offset_n + t * 4; @@ -344,8 +342,6 @@ 
__global__ void gemm_half_q_half_gptq_2bit_kernel( auto offset_m = blockIdx.y * m_count; auto offset_k = blockIdx.z * BLOCK_KN_SIZE; - [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); - [[maybe_unused]] int end_m = min(offset_m + m_count, size_m); int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int n = offset_n + t * 4; @@ -465,8 +461,6 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel( auto offset_m = blockIdx.y * m_count; auto offset_k = blockIdx.z * BLOCK_KN_SIZE; - [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); - [[maybe_unused]] int end_m = min(offset_m + m_count, size_m); int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int n = offset_n + t * 4; @@ -593,8 +587,6 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel( auto offset_m = blockIdx.y * m_count; auto offset_k = blockIdx.z * BLOCK_KN_SIZE; - [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); - [[maybe_unused]] int end_m = min(offset_m + m_count, size_m); int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int n = offset_n + t * 4; diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index f1e7da1641998..39997030751b8 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -136,11 +136,6 @@ __device__ __forceinline__ T from_float(const float& inp) { template __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) { - [[maybe_unused]] union tmpcvt { - uint16_t u; - _Float16 f; - __hip_bfloat16 b; - } t16; _B16x4 ret; if constexpr (std::is_same::value) { union h2cvt { @@ -169,11 +164,6 @@ __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) { template __device__ __forceinline__ _B16x4 addx4(const _B16x4& inp1, const _B16x4& inp2) { - [[maybe_unused]] union tmpcvt { - uint16_t u; - _Float16 f; - __hip_bfloat16 b; - } t1, t2, res; _B16x4 ret; if constexpr (std::is_same::value) { union h2cvt { @@ -325,8 +315,6 @@ __launch_bounds__(NUM_THREADS, 5) void 
paged_attention_ll4mi_QKV_mfma16_kernel( constexpr int GQA_RATIO4 = DIVIDE_ROUND_UP(GQA_RATIO, 4); - [[maybe_unused]] __shared__ float shared_qk_max[NWARPS][16 + 1]; - [[maybe_unused]] __shared__ float shared_exp_sum[NWARPS][16 + 1]; // shared_logits is used for multiple purposes __shared__ _B16x4 shared_logits[NWARPS][4][16][4]; @@ -444,8 +432,6 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride; const int klocal_token_idx = TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id; - [[maybe_unused]] const int kglobal_token_idx = - partition_start_token_idx + klocal_token_idx; const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE; const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX; @@ -1309,9 +1295,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( const int context_len = context_lens[seq_idx]; const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE); - [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; const auto warpid = threadIdx.x / WARP_SIZE; - [[maybe_unused]] const auto laneid = threadIdx.x % WARP_SIZE; __shared__ float shared_global_exp_sum; // max num partitions supported is warp_size * NPAR_LOOPS @@ -2080,9 +2064,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( const int context_len = context_lens[seq_idx]; const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE); - [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; const int warpid = threadIdx.x / WARP_SIZE; - [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE; __shared__ float shared_global_exp_sum; // max num partitions supported is warp_size * NPAR_LOOPS @@ -2816,9 +2798,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( const int context_len = context_lens[seq_idx]; const int num_partitions = DIVIDE_ROUND_UP(context_len, 
PARTITION_SIZE); - [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; const int warpid = threadIdx.x / WARP_SIZE; - [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE; __shared__ float shared_global_exp_sum; // max num partitions supported is warp_size * NPAR_LOOPS