Skip to content

Commit 2700d3a

Browse files
ikawrakow (Iwan Kawrakow) authored and committed
IQ4_K_R4 (#138)
* iq4_k_r4: WIP * iq4_k_r4: Zen4 and hopefully AVX2 On Zen4 we get PP-512(LLaMA-3.1-8B) = 232.6 t/s, up from 182.2 t/s for iq4_k. Applying the extra shift costs a ~6% performance penalty. * iq4_k_r4: AVX2 PP-512 = 227.60 t/s. The shifts are really costly. * iq4_k_r4: NEON We get PP-512(LLaMA-3.1-8B) = 108 t/s, up from 58.2 t/s for iq4_k. --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
1 parent aecc95c commit 2700d3a

File tree

10 files changed

+443
-16
lines changed

10 files changed

+443
-16
lines changed

examples/quantize/quantize.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
5757
{ "IQ3_K", LLAMA_FTYPE_MOSTLY_IQ3_K, " 3.44 bpw non-linear quantization", },
5858
{ "IQ3_KL", LLAMA_FTYPE_MOSTLY_IQ3_KL, " 4 bpw non-linear quantization mix",},
5959
{ "IQ4_K", LLAMA_FTYPE_MOSTLY_IQ4_K, " 4.5 bpw non-linear quantization", },
60+
{ "IQ4_K_R4", LLAMA_FTYPE_MOSTLY_IQ4_K_R4, "IQ4_K repacked", },
6061
{ "IQ5_K", LLAMA_FTYPE_MOSTLY_IQ5_K, " 5.5 bpw non-linear quantization", },
6162
{ "IQ6_K", LLAMA_FTYPE_MOSTLY_IQ6_K, " 6.6 bpw non-linear quantization", },
6263
{ "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },

ggml/include/ggml.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -421,6 +421,7 @@ extern "C" {
421421
GGML_TYPE_IQ4_XS_R4 = 223,
422422
GGML_TYPE_Q6_0_R4 = 233,
423423
GGML_TYPE_IQ2_BN_R4 = 335,
424+
GGML_TYPE_IQ4_K_R4 = 339,
424425
GGML_TYPE_COUNT,
425426
};
426427

@@ -492,6 +493,7 @@ extern "C" {
492493
GGML_FTYPE_MOSTLY_IQ4_XS_R4 = 222, // except 1d tensors
493494
GGML_FTYPE_MOSTLY_Q6_0_R4 = 227, // except 1d tensors
494495
GGML_FTYPE_MOSTLY_IQ2_BN_R4 = 329, // except 1d tensors
496+
GGML_FTYPE_MOSTLY_IQ4_K_R4 = 332, // except 1d tensors
495497
};
496498

497499
// available tensor operations:

ggml/src/ggml-common.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -541,6 +541,15 @@ typedef struct {
541541
} block_iq4_k;
542542
static_assert(sizeof(block_iq4_k) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/2 + 3*QK_K/64, "wrong iq4_k block size/padding");
543543

544+
typedef struct {
545+
ggml_half d[4];
546+
uint8_t extra[8];
547+
uint8_t scales_h[QK_K/16];
548+
uint8_t scales_l[QK_K/8];
549+
uint8_t qs[QK_K*2];
550+
} block_iq4_k_r4;
551+
static_assert(sizeof(block_iq4_k_r4) == 4*sizeof(block_iq4_k), "wrong iq4_k_r4 block size/padding");
552+
544553
typedef struct {
545554
ggml_half d;
546555
uint16_t extra;

ggml/src/ggml-quants.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15207,6 +15207,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
1520715207
case GGML_TYPE_Q4_K_R4: break;
1520815208
case GGML_TYPE_Q5_K_R4: break;
1520915209
case GGML_TYPE_Q6_K_R4: break;
15210+
case GGML_TYPE_IQ4_K_R4: break;
1521015211
case GGML_TYPE_Q4_0_4_4:
1521115212
case GGML_TYPE_Q4_0_4_8:
1521215213
{

ggml/src/ggml.c

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1313,6 +1313,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
13131313
.nrows = 1,
13141314
.row_meta_size = 0,
13151315
},
1316+
[GGML_TYPE_IQ4_K_R4] = {
1317+
.type_name = "iq4_k_r4",
1318+
.blck_size = QK_K,
1319+
.type_size = sizeof(block_iq4_k),
1320+
.is_quantized = true,
1321+
.to_float = (ggml_to_float_t) dequantize_row_iq4_k_r4,
1322+
.from_float = quantize_row_iq4_k_r4,
1323+
.from_float_ref = (ggml_from_float_t)quantize_row_iq4_k_r4_ref,
1324+
.vec_dot = vec_dot_iq4_k_r4_q8_k,
1325+
.vec_dot_type = GGML_TYPE_Q8_K,
1326+
.nrows = 1,
1327+
.row_meta_size = 0,
1328+
},
13161329
[GGML_TYPE_IQ5_K] = {
13171330
.type_name = "iq5_k",
13181331
.blck_size = QK_K,
@@ -4114,6 +4127,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
41144127
case GGML_FTYPE_MOSTLY_IQ2_KS: wtype = GGML_TYPE_IQ2_KS; break;
41154128
case GGML_FTYPE_MOSTLY_IQ3_K: wtype = GGML_TYPE_IQ3_K; break;
41164129
case GGML_FTYPE_MOSTLY_IQ4_K: wtype = GGML_TYPE_IQ4_K; break;
4130+
case GGML_FTYPE_MOSTLY_IQ4_K_R4: wtype = GGML_TYPE_IQ4_K_R4; break;
41174131
case GGML_FTYPE_MOSTLY_IQ5_K: wtype = GGML_TYPE_IQ5_K; break;
41184132
case GGML_FTYPE_MOSTLY_IQ6_K: wtype = GGML_TYPE_IQ6_K; break;
41194133
case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
@@ -10649,6 +10663,7 @@ static void ggml_compute_forward_add(
1064910663
case GGML_TYPE_IQ2_KS:
1065010664
case GGML_TYPE_IQ3_K:
1065110665
case GGML_TYPE_IQ4_K:
10666+
case GGML_TYPE_IQ4_K_R4:
1065210667
case GGML_TYPE_IQ5_K:
1065310668
case GGML_TYPE_IQ6_K:
1065410669
case GGML_TYPE_IQ3_S:
@@ -11103,6 +11118,7 @@ static void ggml_compute_forward_add1(
1110311118
case GGML_TYPE_IQ2_KS:
1110411119
case GGML_TYPE_IQ3_K:
1110511120
case GGML_TYPE_IQ4_K:
11121+
case GGML_TYPE_IQ4_K_R4:
1110611122
case GGML_TYPE_IQ5_K:
1110711123
case GGML_TYPE_IQ6_K:
1110811124
case GGML_TYPE_IQ3_S:
@@ -11254,6 +11270,7 @@ static void ggml_compute_forward_acc(
1125411270
case GGML_TYPE_IQ2_KS:
1125511271
case GGML_TYPE_IQ3_K:
1125611272
case GGML_TYPE_IQ4_K:
11273+
case GGML_TYPE_IQ4_K_R4:
1125711274
case GGML_TYPE_IQ5_K:
1125811275
case GGML_TYPE_IQ6_K:
1125911276
case GGML_TYPE_IQ3_S:
@@ -14451,6 +14468,7 @@ static void ggml_compute_forward_out_prod(
1445114468
case GGML_TYPE_IQ2_KS:
1445214469
case GGML_TYPE_IQ3_K:
1445314470
case GGML_TYPE_IQ4_K:
14471+
case GGML_TYPE_IQ4_K_R4:
1445414472
case GGML_TYPE_IQ5_K:
1445514473
case GGML_TYPE_IQ6_K:
1445614474
case GGML_TYPE_IQ3_S:
@@ -14842,6 +14860,7 @@ static void ggml_compute_forward_set(
1484214860
case GGML_TYPE_IQ2_KS:
1484314861
case GGML_TYPE_IQ3_K:
1484414862
case GGML_TYPE_IQ4_K:
14863+
case GGML_TYPE_IQ4_K_R4:
1484514864
case GGML_TYPE_IQ5_K:
1484614865
case GGML_TYPE_IQ6_K:
1484714866
case GGML_TYPE_IQ3_S:
@@ -15127,6 +15146,7 @@ static void ggml_compute_forward_get_rows(
1512715146
case GGML_TYPE_IQ2_KS:
1512815147
case GGML_TYPE_IQ3_K:
1512915148
case GGML_TYPE_IQ4_K:
15149+
case GGML_TYPE_IQ4_K_R4:
1513015150
case GGML_TYPE_IQ5_K:
1513115151
case GGML_TYPE_IQ6_K:
1513215152
case GGML_TYPE_IQ3_S:
@@ -15739,6 +15759,7 @@ static void ggml_compute_forward_clamp(
1573915759
case GGML_TYPE_IQ2_KS:
1574015760
case GGML_TYPE_IQ3_K:
1574115761
case GGML_TYPE_IQ4_K:
15762+
case GGML_TYPE_IQ4_K_R4:
1574215763
case GGML_TYPE_IQ5_K:
1574315764
case GGML_TYPE_IQ6_K:
1574415765
case GGML_TYPE_IQ3_S:
@@ -22581,6 +22602,7 @@ size_t ggml_quantize_chunk(
2258122602
case GGML_TYPE_IQ2_KS: result = quantize_iq2_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
2258222603
case GGML_TYPE_IQ3_K: result = quantize_iq3_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
2258322604
case GGML_TYPE_IQ4_K: result = quantize_iq4_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
22605+
case GGML_TYPE_IQ4_K_R4:result = quantize_iq4_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
2258422606
case GGML_TYPE_IQ5_K: result = quantize_iq5_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
2258522607
case GGML_TYPE_IQ6_K: result = quantize_iq6_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
2258622608
case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;

0 commit comments

Comments
 (0)