[MISC] Remove unused variables in C++ #19609

Merged
1 commit, merged Jun 16, 2025
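
The hunks below all remove the same kind of dead code: locals that were only read by an `assert` (and therefore carried `[[maybe_unused]]` to keep release builds warning-free) or that were never read at all, together with the asserts that referenced them. A minimal sketch of the pattern, using hypothetical function and parameter names in place of the real launcher and its `MAX`/`WARP_SIZE`/`BLOCK_SIZE` macros:

```cpp
#include <algorithm>
#include <cassert>

// Before: thread_group_size exists only to feed the assert. With NDEBUG defined,
// assert() expands to nothing, so the local would trigger -Wunused-variable
// without the [[maybe_unused]] attribute.
void launcher_before(int head_size, int block_size, int warp_size) {
  [[maybe_unused]] int thread_group_size = std::max(warp_size / block_size, 1);
  assert(head_size % thread_group_size == 0);
  // ... compute strides and launch the kernel; thread_group_size is never read again ...
}

// After: the PR drops both the local and the assert, so no attribute is needed.
void launcher_after(int head_size, int block_size, int warp_size) {
  (void)head_size; (void)block_size; (void)warp_size;  // parameters kept only for the sketch
  // ... compute strides and launch the kernel ...
}
```

The simpler removals follow the same reasoning: `end_n`/`end_m` in `q_gemm.cu` are bounds the kernel bodies evidently no longer read (only `end_k` is kept), and `block_tables_stride` in `advance_step.cu` duplicates a value the `TORCH_CHECK` below it already obtains from `block_tables.stride(0)` directly.
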
5 changes: 1 addition & 4 deletions csrc/attention/paged_attention_v1.cu
@@ -65,9 +65,6 @@ void paged_attention_v1_launcher(
int kv_block_stride = key_cache.stride(0);
int kv_head_stride = key_cache.stride(1);

[[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
assert(head_size % thread_group_size == 0);

// NOTE: alibi_slopes is optional.
const float* alibi_slopes_ptr =
alibi_slopes
@@ -193,4 +190,4 @@ void paged_attention_v1(
#undef WARP_SIZE
#undef MAX
#undef MIN
#undef DIVIDE_ROUND_UP
#undef DIVIDE_ROUND_UP
5 changes: 1 addition & 4 deletions csrc/attention/paged_attention_v2.cu
@@ -66,9 +66,6 @@ void paged_attention_v2_launcher(
int kv_block_stride = key_cache.stride(0);
int kv_head_stride = key_cache.stride(1);

[[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
assert(head_size % thread_group_size == 0);

// NOTE: alibi_slopes is optional.
const float* alibi_slopes_ptr =
alibi_slopes
@@ -203,4 +200,4 @@ void paged_attention_v2(
#undef WARP_SIZE
#undef MAX
#undef MIN
#undef DIVIDE_ROUND_UP
#undef DIVIDE_ROUND_UP
1 change: 0 additions & 1 deletion csrc/prepare_inputs/advance_step.cu
@@ -274,7 +274,6 @@ void advance_step_flashinfer(
cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev);

[[maybe_unused]] int block_tables_stride = block_tables.stride(0);
TORCH_CHECK((blocks * threads > num_queries),
"multi-step: not enough threads to map to num_queries = ",
num_queries, " block_tables.stride(0) = ", block_tables.stride(0),
2 changes: 0 additions & 2 deletions csrc/quantization/fp8/amd/quant_utils.cuh
@@ -446,8 +446,6 @@ scaled_vec_conversion<uint16_t, uint8_t>(const uint8_t& a, float scale) {
template <>
__inline__ __device__ uint32_t
scaled_vec_conversion<uint32_t, uint16_t>(const uint16_t& a, float scale) {
[[maybe_unused]] __half2_raw h2r =
__hip_cvt_fp8x2_to_halfraw2(a, fp8_type::__default_interpret);
union {
__half2_raw h2r;
uint32_t ui32;
8 changes: 0 additions & 8 deletions csrc/quantization/gptq/q_gemm.cu
@@ -206,8 +206,6 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
auto offset_m = blockIdx.y * m_count;
auto offset_k = blockIdx.z * BLOCK_KN_SIZE;

[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

int n = offset_n + t * 4;
@@ -344,8 +342,6 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
auto offset_m = blockIdx.y * m_count;
auto offset_k = blockIdx.z * BLOCK_KN_SIZE;

[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

int n = offset_n + t * 4;
@@ -465,8 +461,6 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(
auto offset_m = blockIdx.y * m_count;
auto offset_k = blockIdx.z * BLOCK_KN_SIZE;

[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

int n = offset_n + t * 4;
@@ -593,8 +587,6 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(
auto offset_m = blockIdx.y * m_count;
auto offset_k = blockIdx.z * BLOCK_KN_SIZE;

[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

int n = offset_n + t * 4;
20 changes: 0 additions & 20 deletions csrc/rocm/attention.cu
@@ -136,11 +136,6 @@ __device__ __forceinline__ T from_float(const float& inp) {

template <typename T>
__device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
[[maybe_unused]] union tmpcvt {
uint16_t u;
_Float16 f;
__hip_bfloat16 b;
} t16;
_B16x4 ret;
if constexpr (std::is_same<T, _Float16>::value) {
union h2cvt {
@@ -169,11 +164,6 @@ __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
template <typename T>
__device__ __forceinline__ _B16x4 addx4(const _B16x4& inp1,
const _B16x4& inp2) {
[[maybe_unused]] union tmpcvt {
uint16_t u;
_Float16 f;
__hip_bfloat16 b;
} t1, t2, res;
_B16x4 ret;
if constexpr (std::is_same<T, _Float16>::value) {
union h2cvt {
@@ -325,8 +315,6 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(

constexpr int GQA_RATIO4 = DIVIDE_ROUND_UP(GQA_RATIO, 4);

[[maybe_unused]] __shared__ float shared_qk_max[NWARPS][16 + 1];
[[maybe_unused]] __shared__ float shared_exp_sum[NWARPS][16 + 1];
// shared_logits is used for multiple purposes
__shared__ _B16x4 shared_logits[NWARPS][4][16][4];

@@ -444,8 +432,6 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride;
const int klocal_token_idx =
TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
[[maybe_unused]] const int kglobal_token_idx =
partition_start_token_idx + klocal_token_idx;
const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE;
const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX;

@@ -1309,9 +1295,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(

const int context_len = context_lens[seq_idx];
const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
[[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
const auto warpid = threadIdx.x / WARP_SIZE;
[[maybe_unused]] const auto laneid = threadIdx.x % WARP_SIZE;

__shared__ float shared_global_exp_sum;
// max num partitions supported is warp_size * NPAR_LOOPS
@@ -2080,9 +2064,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(

const int context_len = context_lens[seq_idx];
const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
[[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
const int warpid = threadIdx.x / WARP_SIZE;
[[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE;

__shared__ float shared_global_exp_sum;
// max num partitions supported is warp_size * NPAR_LOOPS
@@ -2816,9 +2798,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(

const int context_len = context_lens[seq_idx];
const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
[[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
const int warpid = threadIdx.x / WARP_SIZE;
[[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE;

__shared__ float shared_global_exp_sum;
// max num partitions supported is warp_size * NPAR_LOOPS
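
For `csrc/rocm/attention.cu`, the deleted `tmpcvt` unions were function-scope scratch that nothing consumed: each `if constexpr` branch of `from_floatx4` and `addx4` already declares its own conversion union (`h2cvt`, etc.), and the `NUM_WARPS`/`laneid` locals in the reduce kernels were likewise never read. A host-side sketch of the union pattern with simplified stand-in types (the real code uses HIP's `_Float16`, `__hip_bfloat16`, and `_B16x4`; the names here are hypothetical):

```cpp
#include <cstdint>
#include <type_traits>

// Hypothetical 16-bit element types standing in for _Float16 / __hip_bfloat16.
struct half_t { uint16_t bits; };
struct bf16_t { uint16_t bits; };

template <typename T>
uint16_t pack_one(uint16_t raw) {
  // Removed pattern: a function-scope scratch union that no branch ever reads.
  // [[maybe_unused]] union { uint16_t u; half_t f; bf16_t b; } scratch;
  if constexpr (std::is_same_v<T, half_t>) {
    union { half_t h; uint16_t u; } c;  // branch-local conversion buffer
    c.h = half_t{raw};                  // the real kernel fills this via HIP intrinsics
    return c.u;
  } else {
    union { bf16_t b; uint16_t u; } c;  // branch-local conversion buffer
    c.b = bf16_t{raw};
    return c.u;
  }
}
```
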