Skip to content

Commit b2e9831

Browse files
committed
Add fastdiv, use it in modulo and use modulo in rms_norm_f32
Fastdiv is much faster way to do integer division, which was identified as bottleneck in rms_norm_f32
1 parent b66df9d commit b2e9831

File tree

2 files changed

+254
-82
lines changed

2 files changed

+254
-82
lines changed

ggml/src/ggml-cuda/common.cuh

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -563,6 +563,33 @@ static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
563563
#endif // CUDART_VERSION >= 12050
564564
}
565565

566+
// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
567+
// Precompute mp (m' in the paper) and L such that division
568+
// can be computed using a multiply (high 32b of 64b result)
569+
// and a shift:
570+
//
571+
// n/d = (mulhi(n, mp) + n) >> L;
572+
static void init_fastdiv_values(uint32_t d, uint32_t & mp, uint32_t & L) {
573+
// compute L = ceil(log2(d));
574+
L = 0;
575+
while (L < 32 && (uint32_t{ 1 } << L) < d) {
576+
L++;
577+
}
578+
579+
mp = (uint32_t) ((uint64_t{ 1 } << 32) * ((uint64_t{ 1 } << L) - d) / d + 1);
580+
}
581+
582+
static __device__ __forceinline__ uint32_t fastdiv(uint32_t n, uint32_t mp, uint32_t L) {
583+
// Compute high 32 bits of n * mp
584+
uint32_t hi = __umulhi(n, mp);
585+
// Apply the formula
586+
return (hi + n) >> L;
587+
}
588+
589+
static __device__ __forceinline__ uint32_t modulo(uint32_t n, uint32_t divisor, int mp, uint32_t L) {
590+
return n - fastdiv(n, mp, L) * divisor;
591+
}
592+
566593
typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, float2 & v);
567594

568595
static __device__ __forceinline__ float get_alibi_slope(

0 commit comments

Comments
 (0)