Skip to content

Commit 807cdf3

Browse files
jinzhen-linyangw-dev
authored and committed
[Bugfix] fix use_atomic_add support of marlin kernel when using v1 engine (vllm-project#15946)
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com> Signed-off-by: Yang Wang <elainewy@meta.com>
1 parent b76ef0c commit 807cdf3

File tree

2 files changed

+6
-2
lines changed

2 files changed

+6
-2
lines changed

csrc/quantization/gptq_marlin/gptq_marlin.cu

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1785,7 +1785,7 @@ __global__ void Marlin(
17851785
<<<blocks, NUM_THREADS, max_shared_mem, stream>>>( \
17861786
A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr, \
17871787
num_groups, prob_m, prob_n, prob_k, lda, locks, \
1788-
use_atomic_add, use_fp32_reduce); \
1788+
part_use_atomic_add, use_fp32_reduce); \
17891789
} \
17901790
}
17911791

@@ -2215,6 +2215,10 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
22152215
thread_m_blocks = exec_cfg.max_m_blocks;
22162216
}
22172217

2218+
// atomic add reduce have better performance only when m * n is small
2219+
bool part_use_atomic_add =
2220+
use_atomic_add && div_ceil(prob_m, 64) * prob_n <= 2048;
2221+
22182222
if (false) {
22192223
}
22202224
GPTQ_CALL_IF(vllm::kU4B8, 16, 4, 256)

vllm/model_executor/layers/quantization/utils/marlin_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,7 @@ def should_use_atomic_add_reduce(m: int, n: int, k: int, device: torch.device,
305305

306306
# the performance of atomicAdd is better than global reduce
307307
# only when m*n is small and k is large
308-
return max(m, 64) * n < 64 * 2048 and k >= 2048
308+
return n < 2048 and k >= 2048
309309

310310

311311
def apply_gptq_marlin_linear(

0 commit comments

Comments (0)