Skip to content

Commit eda94dd

Browse files
Nexesenex authored and Iwan Kawrakow committed
Adding Q6_0 (#77) Rev 20240807
* Adding q6_0 - basics + AVX2/Zen4 working * Adding q6_0: CUDA dequantize works, but not mmvq * Adding q6_0: CUDA mmvq works * Adding q6_0: CUDA cpy, so Q6_0 can be used for KV-cache * Add q6_0 to CPU flash attention Disappointing result: for LlaMA-3.2-1B, q6_0 K- and V-cache gives about the same PPL as q8_0 K-cache and q4_0 V-cache, while needing the exact same RAM. I.e., what was the point? * q6_0: slightly better kv-cache result Better than q8_0+q4_0, but not as good as q8_0+iq4_nl * q6_0: works on ARM_NEON * q6_0: dequantize works on Metal, but not vector dot product * q6_0: it now works on Metal Outperforms q5_0 by a significant margin. E.g. | model | size | params | backend | ngl | threads | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ------: | ------------: | ---------------: | | llama 8B Q6_0 | 6.08 GiB | 8.03 B | Metal | 100 | 4 | tg128 | 44.02 ± 0.08 | | llama 8B Q5_0 | 5.21 GiB | 8.03 B | Metal | 100 | 4 | tg128 | 40.13 ± 0.12 | | llama 8B Q6_0 | 6.08 GiB | 8.03 B | Metal | 100 | 4 | pp512 | 500.55 ± 0.32 | | llama 8B Q5_0 | 5.21 GiB | 8.03 B | Metal | 100 | 4 | pp512 | 448.02 ± 0.27 | * q6_0: can now be used for kv-cache on Metal -> skipped. --------- Adaptation to mainline by me! Co-Authored-By: Iwan Kawrakow <iwan.kawrakow@gmail.com>
1 parent 5935796 commit eda94dd

File tree

26 files changed

+9044
-5
lines changed

26 files changed

+9044
-5
lines changed

common/arg.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1297,6 +1297,7 @@ const std::vector<ggml_type> kv_cache_types = {
12971297
GGML_TYPE_IQ4_NL,
12981298
GGML_TYPE_Q5_0,
12991299
GGML_TYPE_Q5_1,
1300+
GGML_TYPE_Q6_0,
13001301
};
13011302

13021303
static ggml_type kv_cache_type_from_str(const std::string & s) {

ggml/include/ggml.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -423,7 +423,11 @@ extern "C" {
423423
// GGML_TYPE_IQ4_NL_4_8 = 37,
424424
// GGML_TYPE_IQ4_NL_8_8 = 38,
425425
GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
426-
GGML_TYPE_COUNT = 40,
426+
// GGML_TYPE_COUNT = 40,
427+
//
428+
GGML_TYPE_Q6_0 = 133,
429+
430+
GGML_TYPE_COUNT = 135,
427431
};
428432

429433
// precision
@@ -459,6 +463,12 @@ extern "C" {
459463
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
460464
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
461465
GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors
466+
467+
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 26, // except 1d tensors
468+
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 27, // except 1d tensors
469+
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 28, // except 1d tensors
470+
//
471+
GGML_FTYPE_MOSTLY_Q6_0 = 127, // except 1d tensors
462472
};
463473

464474
// available tensor operations:

ggml/src/ggml-common.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,9 @@ typedef sycl::half2 ggml_half2;
108108
#define QI5_1 (QK5_1 / (4 * QR5_1))
109109
#define QR5_1 2
110110

111+
#define QI6_0 (QK6_0 / (4 * QR6_0))
112+
#define QR6_0 2
113+
111114
#define QI8_0 (QK8_0 / (4 * QR8_0))
112115
#define QR8_0 1
113116

@@ -216,6 +219,14 @@ typedef struct {
216219
} block_q5_1;
217220
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_half) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
218221

222+
#define QK6_0 32
// Q6_0: 6-bit quantization, one fp16 scale per block of 32 values.
// Each quant is split across the two arrays: the low 4 bits live in a
// nibble of qs, and the top 2 bits are packed four-per-byte into qh
// (mirroring how q5_0 stores its single high bit).
typedef struct {
    ggml_half d;            // delta (per-block scale)
    uint8_t qh[QK6_0/4];    // bits 4..5 of the quants, 4 values per byte
    uint8_t qs[QK6_0/2];    // bits 0..3 of the quants, 2 values per byte (nibbles)
} block_q6_0;
static_assert(sizeof(block_q6_0) == sizeof(ggml_half) + QK6_0/2 + QK6_0/4, "wrong q6_0 block size/padding");
229+
219230
#define QK8_0 32
220231
typedef struct {
221232
ggml_half d; // delta

ggml/src/ggml-cpu/arch-fallback.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
1313
#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
1414
#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
15+
#define ggml_vec_dot_q6_0_q8_0_generic ggml_vec_dot_q6_0_q8_0
1516
#define ggml_vec_dot_q8_0_q8_0_generic ggml_vec_dot_q8_0_q8_0
1617
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
1718
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K

ggml/src/ggml-cpu/arch/x86/quants.c

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1009,6 +1009,21 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
10091009
#endif
10101010
}
10111011

1012+
// Dot product of one Q6_0-quantized row (vx) with one Q8_0-quantized row (vy).
//
// Generic scalar implementation. The previous stub set *s = 0, which silently
// produced wrong results, and its AVX2 branch declared GGML_TYPE_Q8_1 as the
// companion type even though type_traits_cpu registers GGML_TYPE_Q8_0 for this
// function — the implementation below matches the registered Q8_0 layout.
//
// n    : number of scalar elements (must be a multiple of QK8_0 == QK6_0 == 32)
// s    : output — the accumulated dot product
// bs/bx/by/nrc : block strides / row count; only nrc == 1 is supported here
void ggml_vec_dot_q6_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;   // QK6_0 == QK8_0 == 32
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    (void)nrc; (void)bs; (void)bx; (void)by;

    const block_q6_0 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    float sumf = 0.0f;

    for (int ib = 0; ib < nb; ++ib) {
        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            // qh packs the top 2 bits of four quants per byte; nibble select
            // by j/(qk/4), then OR them above the low nibble and re-center
            // the 6-bit value to the signed range [-32, 31].
            const uint8_t h = x[ib].qh[j % (qk/4)] >> 4*(j/(qk/4));
            const int32_t x0 = ((x[ib].qs[j] & 0x0F) | ((h << 4) & 0x30)) - 32;
            const int32_t x1 = ((x[ib].qs[j] >>   4) | ((h << 2) & 0x30)) - 32;

            sumi0 += x0 * y[ib].qs[j];
            sumi1 += x1 * y[ib].qs[j + qk/2];
        }

        sumf += GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) * (sumi0 + sumi1);
    }

    *s = sumf;
}
1026+
10121027
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
10131028
const int qk = QK8_0;
10141029
const int nb = n / qk;

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,12 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
242242
.vec_dot_type = GGML_TYPE_Q8_1,
243243
.nrows = 1,
244244
},
245+
// Q6_0: quantize via the reference (scalar) implementation; activations are
// quantized to Q8_0, so vec_dot_type must stay in sync with the companion
// type expected by ggml_vec_dot_q6_0_q8_0.
[GGML_TYPE_Q6_0] = {
    .from_float   = quantize_row_q6_0,
    .vec_dot      = ggml_vec_dot_q6_0_q8_0,
    .vec_dot_type = GGML_TYPE_Q8_0,
    .nrows        = 1,
},
245251
[GGML_TYPE_Q8_0] = {
246252
.from_float = quantize_row_q8_0,
247253
.vec_dot = ggml_vec_dot_q8_0_q8_0,

ggml/src/ggml-cpu/ops.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -666,6 +666,7 @@ void ggml_compute_forward_add(
666666
case GGML_TYPE_Q4_1:
667667
case GGML_TYPE_Q5_0:
668668
case GGML_TYPE_Q5_1:
669+
case GGML_TYPE_Q6_0:
669670
case GGML_TYPE_Q8_0:
670671
case GGML_TYPE_MXFP4:
671672
case GGML_TYPE_Q2_K:
@@ -1114,6 +1115,7 @@ void ggml_compute_forward_add1(
11141115
case GGML_TYPE_Q4_1:
11151116
case GGML_TYPE_Q5_0:
11161117
case GGML_TYPE_Q5_1:
1118+
case GGML_TYPE_Q6_0:
11171119
case GGML_TYPE_Q8_0:
11181120
case GGML_TYPE_Q8_1:
11191121
case GGML_TYPE_MXFP4:
@@ -1242,6 +1244,7 @@ void ggml_compute_forward_acc(
12421244
case GGML_TYPE_Q4_1:
12431245
case GGML_TYPE_Q5_0:
12441246
case GGML_TYPE_Q5_1:
1247+
case GGML_TYPE_Q6_0:
12451248
case GGML_TYPE_Q8_0:
12461249
case GGML_TYPE_Q8_1:
12471250
case GGML_TYPE_MXFP4:
@@ -4139,6 +4142,7 @@ void ggml_compute_forward_out_prod(
41394142
case GGML_TYPE_Q4_1:
41404143
case GGML_TYPE_Q5_0:
41414144
case GGML_TYPE_Q5_1:
4145+
case GGML_TYPE_Q6_0:
41424146
case GGML_TYPE_Q8_0:
41434147
case GGML_TYPE_MXFP4:
41444148
case GGML_TYPE_Q2_K:
@@ -4413,6 +4417,7 @@ void ggml_compute_forward_set(
44134417
case GGML_TYPE_Q4_1:
44144418
case GGML_TYPE_Q5_0:
44154419
case GGML_TYPE_Q5_1:
4420+
case GGML_TYPE_Q6_0:
44164421
case GGML_TYPE_Q8_0:
44174422
case GGML_TYPE_Q8_1:
44184423
case GGML_TYPE_MXFP4:
@@ -4675,6 +4680,7 @@ void ggml_compute_forward_get_rows(
46754680
case GGML_TYPE_Q4_1:
46764681
case GGML_TYPE_Q5_0:
46774682
case GGML_TYPE_Q5_1:
4683+
case GGML_TYPE_Q6_0:
46784684
case GGML_TYPE_Q8_0:
46794685
case GGML_TYPE_Q8_1:
46804686
case GGML_TYPE_MXFP4:
@@ -5399,6 +5405,7 @@ void ggml_compute_forward_clamp(
53995405
case GGML_TYPE_Q4_1:
54005406
case GGML_TYPE_Q5_0:
54015407
case GGML_TYPE_Q5_1:
5408+
case GGML_TYPE_Q6_0:
54025409
case GGML_TYPE_Q8_0:
54035410
case GGML_TYPE_Q8_1:
54045411
case GGML_TYPE_MXFP4:

ggml/src/ggml-cpu/quants.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@ void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
3838
quantize_row_q5_1_ref(x, y, k);
3939
}
4040

41+
// Quantize k floats from x into Q6_0 blocks at y. Delegates to the reference
// scalar implementation — no SIMD-specialized quantization path exists for
// q6_0 (only the dot product has arch-specific variants).
void quantize_row_q6_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
    quantize_row_q6_0_ref(x, y, k);
}
44+
4145
void quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
4246
quantize_row_q8_0_ref(x, y, k);
4347
}

ggml/src/ggml-cpu/quants.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
1616
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
1717
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
1818
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
19+
void quantize_row_q6_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
1920
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
2021
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
2122

@@ -39,6 +40,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
3940
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
4041
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
4142
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
43+
void ggml_vec_dot_q6_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
4244
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
4345

4446
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
@@ -70,6 +72,7 @@ void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
7072
void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
7173
void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
7274
void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
75+
void ggml_vec_dot_q6_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
7376
void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
7477

7578
void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

ggml/src/ggml-cuda/common.cuh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -717,6 +717,13 @@ struct ggml_cuda_type_traits<GGML_TYPE_Q5_1> {
717717
static constexpr int qi = QI5_1;
718718
};
719719

720+
template<>
721+
struct ggml_cuda_type_traits<GGML_TYPE_Q6_0> {
722+
static constexpr int qk = QK6_0;
723+
static constexpr int qr = QR6_0;
724+
static constexpr int qi = QI6_0;
725+
};
726+
720727
template<>
721728
struct ggml_cuda_type_traits<GGML_TYPE_Q8_0> {
722729
static constexpr int qk = QK8_0;

0 commit comments

Comments
 (0)