Skip to content

Commit 807cdf3

Browse files
jinzhen-linyangw-dev
authored and committed
[Bugfix] fix use_atomic_add support of marlin kernel when using v1 engine (vllm-project#15946)
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com> Signed-off-by: Yang Wang <elainewy@meta.com>
1 parent b76ef0c commit 807cdf3

File tree

2 files changed

+6
-2
lines changed

2 files changed

+6
-2
lines changed

csrc/quantization/gptq_marlin/gptq_marlin.cu

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1785,7 +1785,7 @@ __global__ void Marlin(
17851785
<<<blocks, NUM_THREADS, max_shared_mem, stream>>>( \
17861786
A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr, \
17871787
num_groups, prob_m, prob_n, prob_k, lda, locks, \
1788-
use_atomic_add, use_fp32_reduce); \
1788+
part_use_atomic_add, use_fp32_reduce); \
17891789
} \
17901790
}
17911791

@@ -2215,6 +2215,10 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
22152215
thread_m_blocks = exec_cfg.max_m_blocks;
22162216
}
22172217

2218+
// atomic add reduce have better performance only when m * n is small
2219+
bool part_use_atomic_add =
2220+
use_atomic_add && div_ceil(prob_m, 64) * prob_n <= 2048;
2221+
22182222
if (false) {
22192223
}
22202224
GPTQ_CALL_IF(vllm::kU4B8, 16, 4, 256)

vllm/model_executor/layers/quantization/utils/marlin_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,7 @@ def should_use_atomic_add_reduce(m: int, n: int, k: int, device: torch.device,
305305

306306
# the performance of atomicAdd is better than global reduce
307307
# only when m*n is small and k is large
308-
return max(m, 64) * n < 64 * 2048 and k >= 2048
308+
return n < 2048 and k >= 2048
309309

310310

311311
def apply_gptq_marlin_linear(

0 commit comments

Comments (0)