From 79223704a1ec33e4147ab6f0d10934667cea7200 Mon Sep 17 00:00:00 2001 From: Anav Prasad Date: Mon, 1 Jun 2026 19:38:37 -0700 Subject: [PATCH] clean up unused variables warnings (llama/23975) --- ggml/src/ggml-cuda/fattn-mma-f16.cuh | 6 +++--- ggml/src/ggml-cuda/gated_delta_net.cu | 10 +++++----- ggml/src/ggml-cuda/mmf.cuh | 6 +++--- ggml/src/ggml-cuda/mmvf.cu | 13 ++++++------- ggml/src/ggml-cuda/mmvq.cu | 13 ++++--------- ggml/src/ggml-cuda/topk-moe.cu | 2 +- 6 files changed, 22 insertions(+), 28 deletions(-) diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh index 3c8b6eaaf..ac5abb133 100644 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -568,7 +568,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( constexpr bool Q_in_reg = ggml_cuda_fattn_mma_get_Q_in_reg (DKQ, DV, ncols); constexpr int nstages = ggml_cuda_fattn_mma_get_nstages (DKQ, DV, ncols1, ncols2); - constexpr int stride_tile_Q = DKQ/2 + 4; constexpr int stride_tile_K = nbatch_K2 + 4; constexpr int stride_tile_V = V_is_K_view ? stride_tile_K : nbatch_V2 + 4; @@ -604,9 +603,9 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( #pragma unroll for (int k0_start = (DKQ/2-1) - (DKQ/2-1) % nbatch_K2; k0_start >= 0; k0_start -= nbatch_K2) { const int k0_stop = k0_start + nbatch_K2 < DKQ/2 ? k0_start + nbatch_K2 : DKQ/2; - const int k0_diff = k0_stop - k0_start; if constexpr (nstages <= 1) { + const int k0_diff = k0_stop - k0_start; constexpr bool use_cp_async = nstages == 1; flash_attn_ext_f16_load_tile (K_h2 + int64_t(k_VKQ_0)*stride_K + k0_start, tile_K, k0_diff, stride_K, k_VKQ_sup); @@ -640,6 +639,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( } } } else { + constexpr int stride_tile_Q = DKQ/2 + 4; #pragma unroll for (int k_KQ_0 = k0_start; k_KQ_0 < k0_stop; k_KQ_0 += T_A_KQ::J) { load_ldmatrix(Q_B[0], tile_Q + (threadIdx.y / np)*(T_B_KQ::I*stride_tile_Q) + k_KQ_0, stride_tile_Q); @@ -954,9 +954,9 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( for (int i0_start = 0; i0_start < DV; i0_start += 2*nbatch_V2) { static_assert(DV % (2*nbatch_V2) == 0, "bad loop size"); const int i0_stop = i0_start + 2*nbatch_V2; - const int i0_diff = i0_stop - i0_start; if constexpr (nstages <= 1) { + const int i0_diff = i0_stop - i0_start; if (!V_is_K_view || i0_stop > 2*nbatch_K2) { constexpr bool use_cp_async = nstages == 1; flash_attn_ext_f16_load_tile diff --git a/ggml/src/ggml-cuda/gated_delta_net.cu b/ggml/src/ggml-cuda/gated_delta_net.cu index 018d5d37d..7cfda6523 100644 --- a/ggml/src/ggml-cuda/gated_delta_net.cu +++ b/ggml/src/ggml-cuda/gated_delta_net.cu @@ -43,7 +43,6 @@ gated_delta_net_cuda(const float * q, // output state layout (per-slot D * n_seqs) — same per-(seq,head) offset as before. const int64_t state_in_offset = sequence * K * H * S_v * S_v + h_idx * S_v * S_v; const int64_t state_out_offset = (sequence * H + h_idx) * S_v * S_v; - const int64_t state_size_per_token = S_v * S_v * H * n_seqs; // per-slot stride in output state += state_out_offset; curr_state += state_in_offset + col * S_v; attn_data += (sequence * n_tokens * H + h_idx) * S_v; @@ -61,10 +60,6 @@ gated_delta_net_cuda(const float * q, s_shard[r] = curr_state[i]; } - // slot mapping: target_slot = t - shift. When n_tokens < K only the last n_tokens slots - // are written; earlier slots are left untouched (caller-owned). - const int shift = (int) n_tokens - K; - for (int t = 0; t < n_tokens; t++) { const float * q_t = q + iq3 * sq3 + t * sq2 + iq1 * sq1; const float * k_t = k + iq3 * sq3 + t * sq2 + iq1 * sq1; @@ -148,6 +143,11 @@ gated_delta_net_cuda(const float * q, attn_data += S_v * H; if constexpr (keep_rs_t) { + // slot mapping: target_slot = t - shift. When n_tokens < K only the last n_tokens slots + // are written; earlier slots are left untouched (caller-owned). + const int shift = (int) n_tokens - K; + + const int64_t state_size_per_token = S_v * S_v * H * n_seqs; // per-slot stride in output const int target_slot = t - shift; if (target_slot >= 0 && target_slot < K) { float * curr_state = (dst + attn_score_elems) + target_slot * state_size_per_token + state_out_offset; diff --git a/ggml/src/ggml-cuda/mmf.cuh b/ggml/src/ggml-cuda/mmf.cuh index c2a8d54c9..d55cc1ec7 100644 --- a/ggml/src/ggml-cuda/mmf.cuh +++ b/ggml/src/ggml-cuda/mmf.cuh @@ -91,7 +91,7 @@ static __global__ void mul_mat_f( const int row0 = blockIdx.x * rows_per_block; int expert_idx = 0; - int col_base = 0; + [[maybe_unused]] int col_base = 0; const int channel_dst = has_ids ? 0 : blockIdx.y; @@ -122,12 +122,12 @@ static __global__ void mul_mat_f( ids += col_offset * stride_row_id; } - const float2 * y2 = (const float2 *) y; + [[maybe_unused]] const float2 * y2 = (const float2 *) y; extern __shared__ char data_mmv[]; char * shmem_base = data_mmv; - int * slot_map = (int *) shmem_base; + [[maybe_unused]] int * slot_map = (int *) shmem_base; char * compute_base = has_ids ? (shmem_base + GGML_PAD(cols_per_block, 16) * sizeof(int)) : shmem_base; tile_C C[ntA][ntB]; diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu index 09d95f309..3d6de64b7 100644 --- a/ggml/src/ggml-cuda/mmvf.cu +++ b/ggml/src/ggml-cuda/mmvf.cu @@ -80,9 +80,8 @@ static __global__ void mul_mat_vec_f( gate_x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row*stride_row; } - const int channel_bias = ids ? channel_x : channel_dst; - if constexpr (has_fusion) { + const int channel_bias = ids ? channel_x : channel_dst; if (use_bias) { x_bias += int64_t(sample_dst)*stride_sample_dst + channel_bias*stride_channel_dst; } @@ -95,7 +94,7 @@ static __global__ void mul_mat_vec_f( extern __shared__ char data_mmv[]; float * buf_iw = (float *) data_mmv; - float * buf_iw_gate = nullptr; + [[maybe_unused]] float * buf_iw_gate = nullptr; if constexpr (has_fusion) { buf_iw_gate = (float *) (data_mmv + warp_size*sizeof(float)); } @@ -123,7 +122,7 @@ static __global__ void mul_mat_vec_f( if constexpr (std::is_same_v) { const float2 * x2 = (const float2 *) x; - const float2 * gate_x2 = nullptr; + [[maybe_unused]] const float2 * gate_x2 = nullptr; if constexpr (has_fusion) { if (use_gate) { gate_x2 = (const float2 *) gate_x; @@ -155,7 +154,7 @@ static __global__ void mul_mat_vec_f( } } else if constexpr (std::is_same_v) { const half2 * x2 = (const half2 *) x; - const half2 * gate_x2 = nullptr; + [[maybe_unused]] const half2 * gate_x2 = nullptr; if constexpr (has_fusion) { if (use_gate) { gate_x2 = (const half2 *) gate_x; @@ -266,7 +265,7 @@ static __global__ void mul_mat_vec_f( } #else const nv_bfloat162 * x2 = (const nv_bfloat162 *) x; - const nv_bfloat162 * gate_x2 = nullptr; + [[maybe_unused]] const nv_bfloat162 * gate_x2 = nullptr; if constexpr (has_fusion) { if (use_gate) { gate_x2 = (const nv_bfloat162 *) gate_x; @@ -274,7 +273,7 @@ static __global__ void mul_mat_vec_f( } for (int col2 = tid; col2 < ncols2; col2 += block_size) { const nv_bfloat162 tmpx = x2[col2]; - nv_bfloat162 tmpx_gate; + [[maybe_unused]] nv_bfloat162 tmpx_gate; if constexpr (has_fusion) { if (use_gate) { tmpx_gate = gate_x2[col2]; diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index ecb6fdeda..86b4a4930 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -515,7 +515,7 @@ static __global__ void mul_mat_vec_q( bool use_gate = false; bool use_bias = false; bool use_gate_bias = false; - const void * vgate = nullptr; + [[maybe_unused]] const void * vgate = nullptr; const float * x_bias = nullptr; const float * gate_bias = nullptr; ggml_glu_op active_glu; @@ -531,8 +531,8 @@ static __global__ void mul_mat_vec_q( } - float x_biases[ncols_dst] = { 0.0f }; - float gate_biases[ncols_dst] = { 0.0f }; + [[maybe_unused]] float x_biases[ncols_dst] = { 0.0f }; + [[maybe_unused]] float gate_biases[ncols_dst] = { 0.0f }; if constexpr (has_fusion) { const uint32_t channel_bias = ids ? channel_x : channel_dst; if (use_bias) { @@ -589,12 +589,7 @@ static __global__ void mul_mat_vec_q( } __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_dst][rows_per_cuda_block][warp_size]; - __shared__ float tmp_shared_gate[(has_fusion && (nwarps-1 > 0)) ? nwarps-1 : 1][ncols_dst][rows_per_cuda_block][warp_size]; - if constexpr (!has_fusion) { - (void) tmp_shared_gate; - } else if (!use_gate) { - (void) tmp_shared_gate; - } + [[maybe_unused]] __shared__ float tmp_shared_gate[(has_fusion && (nwarps-1 > 0)) ? nwarps-1 : 1][ncols_dst][rows_per_cuda_block][warp_size]; if (threadIdx.y > 0) { #pragma unroll diff --git a/ggml/src/ggml-cuda/topk-moe.cu b/ggml/src/ggml-cuda/topk-moe.cu index da20c9aab..c4253bfa4 100644 --- a/ggml/src/ggml-cuda/topk-moe.cu +++ b/ggml/src/ggml-cuda/topk-moe.cu @@ -134,7 +134,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float * // selection_wt is only needed when bias is present (selection uses wt + bias) // when no bias, we use wt directly for both selection and weight values - float selection_wt[has_bias ? experts_per_thread : 1]; + [[maybe_unused]] float selection_wt[has_bias ? experts_per_thread : 1]; if constexpr (has_bias) { #pragma unroll