From c6703d1e0d488a09dc76562c7335306f0f3486c0 Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Mon, 16 Jun 2025 11:05:28 +0800 Subject: [PATCH] [MISC] Remove unused variables in C++ (#19609) Signed-off-by: Lu Fang --- csrc/attention/paged_attention_v1.cu | 5 +---- csrc/attention/paged_attention_v2.cu | 5 +---- csrc/prepare_inputs/advance_step.cu | 1 - csrc/quantization/fp8/amd/quant_utils.cuh | 2 -- csrc/quantization/gptq/q_gemm.cu | 8 -------- csrc/rocm/attention.cu | 20 -------------------- 6 files changed, 2 insertions(+), 39 deletions(-) diff --git a/csrc/attention/paged_attention_v1.cu b/csrc/attention/paged_attention_v1.cu index 9b3a5c4b1014a..46108a32d719b 100644 --- a/csrc/attention/paged_attention_v1.cu +++ b/csrc/attention/paged_attention_v1.cu @@ -65,9 +65,6 @@ void paged_attention_v1_launcher( int kv_block_stride = key_cache.stride(0); int kv_head_stride = key_cache.stride(1); - [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1); - assert(head_size % thread_group_size == 0); - // NOTE: alibi_slopes is optional. 
const float* alibi_slopes_ptr = alibi_slopes @@ -203,4 +200,4 @@ void paged_attention_v2( #undef WARP_SIZE #undef MAX #undef MIN -#undef DIVIDE_ROUND_UP \ No newline at end of file +#undef DIVIDE_ROUND_UP diff --git a/csrc/prepare_inputs/advance_step.cu b/csrc/prepare_inputs/advance_step.cu index fea4bc2ca0d8f..3d5077d9de461 100644 --- a/csrc/prepare_inputs/advance_step.cu +++ b/csrc/prepare_inputs/advance_step.cu @@ -274,7 +274,6 @@ void advance_step_flashinfer( cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev); cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev); - [[maybe_unused]] int block_tables_stride = block_tables.stride(0); TORCH_CHECK((blocks * threads > num_queries), "multi-step: not enough threads to map to num_queries = ", num_queries, " block_tables.stride(0) = ", block_tables.stride(0), diff --git a/csrc/quantization/fp8/amd/quant_utils.cuh b/csrc/quantization/fp8/amd/quant_utils.cuh index c4ed1b4757928..e51a4e14e518f 100644 --- a/csrc/quantization/fp8/amd/quant_utils.cuh +++ b/csrc/quantization/fp8/amd/quant_utils.cuh @@ -446,8 +446,6 @@ scaled_vec_conversion(const uint8_t& a, float scale) { template <> __inline__ __device__ uint32_t scaled_vec_conversion(const uint16_t& a, float scale) { - [[maybe_unused]] __half2_raw h2r = - __hip_cvt_fp8x2_to_halfraw2(a, fp8_type::__default_interpret); union { __half2_raw h2r; uint32_t ui32; diff --git a/csrc/quantization/gptq/q_gemm.cu b/csrc/quantization/gptq/q_gemm.cu index 6fad16e196bbc..43b245530e950 100644 --- a/csrc/quantization/gptq/q_gemm.cu +++ b/csrc/quantization/gptq/q_gemm.cu @@ -206,8 +206,6 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel( auto offset_m = blockIdx.y * m_count; auto offset_k = blockIdx.z * BLOCK_KN_SIZE; - [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); - [[maybe_unused]] int end_m = min(offset_m + m_count, size_m); int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int n = offset_n + t * 4; @@ -344,8 +342,6 @@ 
__global__ void gemm_half_q_half_gptq_2bit_kernel( auto offset_m = blockIdx.y * m_count; auto offset_k = blockIdx.z * BLOCK_KN_SIZE; - [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); - [[maybe_unused]] int end_m = min(offset_m + m_count, size_m); int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int n = offset_n + t * 4; @@ -465,8 +461,6 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel( auto offset_m = blockIdx.y * m_count; auto offset_k = blockIdx.z * BLOCK_KN_SIZE; - [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); - [[maybe_unused]] int end_m = min(offset_m + m_count, size_m); int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int n = offset_n + t * 4; @@ -593,8 +587,6 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel( auto offset_m = blockIdx.y * m_count; auto offset_k = blockIdx.z * BLOCK_KN_SIZE; - [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); - [[maybe_unused]] int end_m = min(offset_m + m_count, size_m); int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int n = offset_n + t * 4; diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index f1e7da1641998..39997030751b8 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -136,11 +136,6 @@ __device__ __forceinline__ T from_float(const float& inp) { template __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) { - [[maybe_unused]] union tmpcvt { - uint16_t u; - _Float16 f; - __hip_bfloat16 b; - } t16; _B16x4 ret; if constexpr (std::is_same::value) { union h2cvt { @@ -169,11 +164,6 @@ __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) { template __device__ __forceinline__ _B16x4 addx4(const _B16x4& inp1, const _B16x4& inp2) { - [[maybe_unused]] union tmpcvt { - uint16_t u; - _Float16 f; - __hip_bfloat16 b; - } t1, t2, res; _B16x4 ret; if constexpr (std::is_same::value) { union h2cvt { @@ -325,8 +315,6 @@ __launch_bounds__(NUM_THREADS, 5) void 
paged_attention_ll4mi_QKV_mfma16_kernel( constexpr int GQA_RATIO4 = DIVIDE_ROUND_UP(GQA_RATIO, 4); - [[maybe_unused]] __shared__ float shared_qk_max[NWARPS][16 + 1]; - [[maybe_unused]] __shared__ float shared_exp_sum[NWARPS][16 + 1]; // shared_logits is used for multiple purposes __shared__ _B16x4 shared_logits[NWARPS][4][16][4]; @@ -444,8 +432,6 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride; const int klocal_token_idx = TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id; - [[maybe_unused]] const int kglobal_token_idx = - partition_start_token_idx + klocal_token_idx; const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE; const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX; @@ -1309,9 +1295,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( const int context_len = context_lens[seq_idx]; const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE); - [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; const auto warpid = threadIdx.x / WARP_SIZE; - [[maybe_unused]] const auto laneid = threadIdx.x % WARP_SIZE; __shared__ float shared_global_exp_sum; // max num partitions supported is warp_size * NPAR_LOOPS @@ -2080,9 +2064,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( const int context_len = context_lens[seq_idx]; const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE); - [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; const int warpid = threadIdx.x / WARP_SIZE; - [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE; __shared__ float shared_global_exp_sum; // max num partitions supported is warp_size * NPAR_LOOPS @@ -2816,9 +2798,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( const int context_len = context_lens[seq_idx]; const int num_partitions = DIVIDE_ROUND_UP(context_len, 
PARTITION_SIZE); - [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; const int warpid = threadIdx.x / WARP_SIZE; - [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE; __shared__ float shared_global_exp_sum; // max num partitions supported is warp_size * NPAR_LOOPS