Skip to content

Commit 7391cd2

Browse files
ikawrakow (Iwan Kawrakow)
authored and committed
iq4_nl: faster quantization (#76)
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
1 parent 1574016 commit 7391cd2

File tree

1 file changed

+21
-4
lines changed

1 file changed

+21
-4
lines changed

ggml/src/ggml-quants.c

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4636,6 +4636,23 @@ size_t quantize_iq1_m(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
46364636

46374637
// ============================ 4-bit non-linear quants
46384638

4639+
// Lookup table used by best_index_iq4nl(). It is indexed by
// (int)x - values[0], i.e. by the integer cell that x falls into.
//   - An entry k < 16 means every x in that cell maps to grid index k.
//   - An entry k >= 16 means the cell may contain the boundary between grid
//     indices k-16 and k-15, so the caller has to compare the two distances.
// NOTE(review): the table contents are tied to one specific 16-entry grid
// (presumably the iq4_nl values table) -- confirm against the callers.
static const int8_t iq4nl_index[241] = {
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 16, 16,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
     1, 17, 17,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, 18,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
     3,  3,  3,  3,  3,  3, 19,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4, 20,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
     5,  5, 21, 21,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6, 22,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7, 23, 23,  8,  8,  8,  8,
     8,  8,  8,  8,  8,  8, 24,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, 25, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 26, 26,
    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 27, 27, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 28, 13, 13, 13,
    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 29, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
    14, 14, 14, 14, 30, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15
};

// Table-driven search for the entry of `values` nearest to x.
//
// values: grid to search; values[0] is the table origin and entries up to
//         values[15] may be read.
// x:      value to quantize.
// Returns a grid index in 0..15. Inputs below the table range (and NaN)
// return 0, inputs above it return 15.
static inline int best_index_iq4nl(const int8_t * values, float x) {
    // Reject NaN and far-out-of-range inputs before the float -> int cast:
    // converting a float that does not fit in int is undefined behaviour.
    // values[0] is an int8_t, so anything below -256 lands below the table and
    // anything at or above 512 lands beyond its last entry; results for all
    // other inputs are unaffected by these two guards.
    if (!(x >= -256.0f)) return 0;   // written so that NaN also takes this branch
    if (x >= 512.0f) return 15;

    const int table_size = (int)(sizeof(iq4nl_index)/sizeof(iq4nl_index[0]));
    int ix = (int)x - values[0];
    if (ix < 0) return 0;
    if (ix >= table_size) return 15;

    ix = iq4nl_index[ix];
    if (ix < 16) return ix;          // whole cell belongs to a single grid point

    // Boundary cell: choose the closer of the two neighbouring grid points.
    return x - values[ix-16] < values[ix-15] - x ? ix-16 : ix-15;
}
4655+
46394656
static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * GGML_RESTRICT x,
46404657
ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
46414658
float * scales, float * weight, uint8_t * L,
@@ -4676,7 +4693,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
46764693
float sumqx = 0, sumq2 = 0;
46774694
for (int j = 0; j < block_size; ++j) {
46784695
float al = id*xb[j];
4679-
int l = best_index_int8(16, values, al);
4696+
int l = best_index_iq4nl(values, al);
46804697
Lb[j] = l;
46814698
float q = values[l];
46824699
float w = weight[j];
@@ -4690,7 +4707,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
46904707
sumqx = sumq2 = 0;
46914708
for (int j = 0; j < block_size; ++j) {
46924709
float al = id*xb[j];
4693-
int l = best_index_int8(16, values, al);
4710+
int l = best_index_iq4nl(values, al);
46944711
float q = values[l];
46954712
float w = weight[j];
46964713
sumqx += w*q*xb[j];
@@ -4721,7 +4738,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
47214738
uint8_t * Lb = L + ib*block_size;
47224739
const float * xb = x + ib*block_size;
47234740
for (int j = 0; j < block_size; ++j) {
4724-
Lb[j] = best_index_int8(16, values, idl*xb[j]);
4741+
Lb[j] = best_index_iq4nl(values, idl*xb[j]);
47254742
}
47264743
l += 32;
47274744
uint8_t l_l = l & 0xf;
@@ -4735,7 +4752,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
47354752
if (ntry > 0) {
47364753
float id = scales[0] ? 1/scales[0] : 0;
47374754
for (int j = 0; j < super_block_size; ++j) {
4738-
L[j] = best_index_int8(16, values, id*x[j]);
4755+
L[j] = best_index_iq4nl(values, id*x[j]);
47394756
}
47404757
}
47414758
}

0 commit comments

Comments
 (0)