
Commit 8db50dc

Revert "Adding Q6_0 (#77)"

This reverts commit 1749e210d697d03f8a0d45e59257afa74b52f7f9.

1 parent: 3d8f5eb

File tree: 18 files changed, +4 -9193 lines


common/common.cpp
Lines changed: 0 additions & 3 deletions

@@ -1036,9 +1036,6 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
     if (s == "q5_1") {
         return GGML_TYPE_Q5_1;
     }
-    if (s == "q6_0") {
-        return GGML_TYPE_Q6_0;
-    }
 
     throw std::runtime_error("Invalid cache type: " + s);
 }

examples/quantize/quantize.cpp
Lines changed: 0 additions & 1 deletion

@@ -21,7 +21,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q4_1",    LLAMA_FTYPE_MOSTLY_Q4_1,    " 4.78G, +0.4511 ppl @ Llama-3-8B", },
     { "Q5_0",    LLAMA_FTYPE_MOSTLY_Q5_0,    " 5.21G, +0.1316 ppl @ Llama-3-8B", },
     { "Q5_1",    LLAMA_FTYPE_MOSTLY_Q5_1,    " 5.65G, +0.1062 ppl @ Llama-3-8B", },
-    { "Q6_0",    LLAMA_FTYPE_MOSTLY_Q6_0,    " 6.5 bpw quantization", },
     { "IQ2_XXS", LLAMA_FTYPE_MOSTLY_IQ2_XXS, " 2.06 bpw quantization", },
     { "IQ2_XS",  LLAMA_FTYPE_MOSTLY_IQ2_XS,  " 2.31 bpw quantization", },
     { "IQ2_S",   LLAMA_FTYPE_MOSTLY_IQ2_S,   " 2.5 bpw quantization", },

ggml/include/ggml.h
Lines changed: 0 additions & 4 deletions

@@ -397,8 +397,6 @@ extern "C" {
         GGML_TYPE_Q4_0_8_8 = 33,
         GGML_TYPE_TQ1_0    = 34,
         GGML_TYPE_TQ2_0    = 35,
-        //
-        GGML_TYPE_Q6_0     = 133,
         GGML_TYPE_COUNT,
     };

@@ -443,8 +441,6 @@ extern "C" {
         GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
-        //
-        GGML_FTYPE_MOSTLY_Q6_0     = 127, // except 1d tensors
     };
 
     // available tensor operations:

ggml/src/ggml-common.h
Lines changed: 0 additions & 11 deletions

@@ -88,9 +88,6 @@ typedef sycl::half2 ggml_half2;
 #define QI5_1 (QK5_1 / (4 * QR5_1))
 #define QR5_1 2
 
-#define QI6_0 (QK6_0 / (4 * QR6_0))
-#define QR6_0 2
-
 #define QI8_0 (QK8_0 / (4 * QR8_0))
 #define QR8_0 1
 
@@ -186,14 +183,6 @@ typedef struct {
 } block_q5_1;
 static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_half) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
 
-#define QK6_0 32
-typedef struct {
-    ggml_half d;           // delta
-    uint8_t   qh[QK6_0/4]; // 5+6-th bit of quants
-    uint8_t   qs[QK6_0/2]; // nibbles / quants
-} block_q6_0;
-static_assert(sizeof(block_q6_0) == sizeof(ggml_half) + QK6_0/2 + QK6_0/4, "wrong q6_0 block size/padding");
-
 #define QK8_0 32
 typedef struct {
     ggml_half d;       // delta
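
For orientation, here is a minimal host-side sketch of how one such block decodes, derived only from the struct above. The struct name q6_0_block_ref, the use of float in place of ggml_half, and the helper name are illustrative assumptions, not part of the reverted code:

    #include <cstdint>

    // Mirror of the removed block_q6_0 layout (float stands in for ggml_half).
    struct q6_0_block_ref {
        float   d;      // delta (scale)
        uint8_t qh[8];  // QK6_0/4: bits 4-5 of the quants, packed as nibbles
        uint8_t qs[16]; // QK6_0/2: bits 0-3 of the quants, two per byte
    };

    // Decode one block of 32 values: y[j] = d * (q[j] - 32), with q[j] in [0, 63].
    static void dequantize_q6_0_ref(const q6_0_block_ref * x, float * y) {
        for (int j = 0; j < 16; ++j) {
            // one qh nibble carries the high bits of the value pair (j, j+16)
            const uint8_t h    = (x->qh[j % 8] >> (4 * (j / 8))) & 0x0F;
            const int     q_lo = (x->qs[j] & 0x0F) | ((h << 4) & 0x30); // h bits 0-1
            const int     q_hi = (x->qs[j] >>   4) | ((h << 2) & 0x30); // h bits 2-3
            y[j]      = x->d * (q_lo - 32);
            y[j + 16] = x->d * (q_hi - 32);
        }
    }

This matches the static_assert: 2 bytes of scale plus 16 + 8 bytes of quants is 26 bytes per 32 values, i.e. the 6.5 bpw quoted in the removed QUANT_OPTIONS entry.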

ggml/src/ggml-cuda.cu
Lines changed: 0 additions & 4 deletions

@@ -3002,7 +3002,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q6_0:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q2_K:
         case GGML_TYPE_Q3_K:

@@ -3074,9 +3073,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_1) {
             return true;
         }
-        if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q6_0) {
-            return true;
-        }
         if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
             return true;
         }

ggml/src/ggml-cuda/common.cuh
Lines changed: 0 additions & 7 deletions

@@ -387,13 +387,6 @@ struct ggml_cuda_type_traits<GGML_TYPE_Q5_1> {
     static constexpr int qi = QI5_1;
 };
 
-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_Q6_0> {
-    static constexpr int qk = QK6_0;
-    static constexpr int qr = QR6_0;
-    static constexpr int qi = QI6_0;
-};
-
 template<>
 struct ggml_cuda_type_traits<GGML_TYPE_Q8_0> {
     static constexpr int qk = QK8_0;

ggml/src/ggml-cuda/convert.cu
Lines changed: 0 additions & 41 deletions

@@ -122,36 +122,6 @@ static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t
     }
 }
 
-template<typename dst_t>
-static __global__ void dequantize_block_q6_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
-
-    const int64_t i = blockIdx.x;
-
-    // assume 32 threads
-    const int64_t tid = threadIdx.x;
-    const int64_t il  = tid/8;
-    const int64_t ir  = tid%8;
-    const int64_t ib  = 8*i + ir;
-    if (ib >= nb32) {
-        return;
-    }
-
-    dst_t * y = yy + 256*i + 32*ir + 4*il;
-
-    const block_q6_0 * x = (const block_q6_0 *)vx + ib;
-    const float d = __half2float(x->d);
-    const float dm = -32*d;
-
-    const uint8_t * qs = x->qs + 4*il;
-    const uint8_t * qh = x->qh + 4*(il%2);
-
-    for (int l = 0; l < 4; ++l) {
-        const uint8_t h = qh[l] >> 4*(il/2);
-        y[l+ 0] = d * ((qs[l] & 0xF) | ((h << 4) & 0x30)) + dm;
-        y[l+16] = d * ((qs[l] >> 4) | ((h << 2) & 0x30)) + dm;
-    }
-}
-
 //================================== k-quants
 
 template<typename dst_t>

@@ -527,13 +497,6 @@ static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int64_t k
     dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
 }
 
-template<typename dst_t>
-static void dequantize_row_q6_0_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
-    const int nb32 = k / 32;
-    const int nb = (k + 255) / 256;
-    dequantize_block_q6_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
-}
-
 template<typename dst_t>
 static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;

@@ -635,8 +598,6 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
             return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
         case GGML_TYPE_Q5_1:
             return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
-        case GGML_TYPE_Q6_0:
-            return dequantize_row_q6_0_cuda;
         case GGML_TYPE_Q8_0:
             if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= CC_PASCAL) {
                 return dequantize_block_q8_0_f16_cuda;

@@ -687,8 +648,6 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
             return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
         case GGML_TYPE_Q5_1:
             return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
-        case GGML_TYPE_Q6_0:
-            return dequantize_row_q6_0_cuda;
         case GGML_TYPE_Q8_0:
             return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
         case GGML_TYPE_Q2_K:
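
A note on the removed launch arithmetic: dequantize_block_q6_0 gave each 32-thread CUDA block 256 values, i.e. eight q6_0 blocks, with each thread writing 8 values. A trivial sketch of that grid math (k = 4096 is an arbitrary illustrative row length):

    #include <cstdio>

    int main() {
        const int k    = 4096;            // row length in values (illustrative)
        const int nb32 = k / 32;          // q6_0 blocks in the row
        const int nb   = (k + 255) / 256; // grid size: 256 values per CUDA block
        // 32 threads x 8 values = 256 values per block; the ib >= nb32 check
        // in the kernel handled the tail when k was not a multiple of 256
        printf("quant blocks: %d, grid: %d blocks x 32 threads\n", nb32, nb);
        return 0;
    }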

ggml/src/ggml-cuda/cpy.cu
Lines changed: 0 additions & 50 deletions

@@ -225,41 +225,6 @@ static __device__ void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) {
     memcpy(dsti->qh, &qh, sizeof(qh));
 }
 
-static __device__ void cpy_blck_f32_q6_0(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    block_q6_0 * dsti = (block_q6_0 *) cdsti;
-
-    float amax = 0.0f;
-    float vmax = 0.0f;
-
-    for (int j = 0; j < QK6_0; ++j) {
-        const float v  = xi[j];
-        const float av = fabsf(xi[j]);
-        if (amax < av) {
-            amax = av;
-            vmax = v;
-        }
-    }
-
-    const float d  = vmax / -32;
-    const float id = d ? 1.0f/d : 0.0f;
-
-    dsti->d = d;
-    memset(dsti->qh, 0, QK6_0/4);
-
-    for (int j = 0; j < QK6_0/2; ++j) {
-        const float x0 = xi[0       + j]*id;
-        const float x1 = xi[QK4_0/2 + j]*id;
-
-        const uint8_t xi0 = min(63, (int8_t)(x0 + 32.5f));
-        const uint8_t xi1 = min(63, (int8_t)(x1 + 32.5f));
-
-        dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
-        const uint8_t h = (xi0 >> 4) | ((xi1 >> 4) << 2);
-        dsti->qh[j%(QK6_0/4)] |= (h << 4*(j/(QK6_0/4)));
-    }
-}
-
 static __device__ const int8_t iq4nl_index[241] = {
      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 16, 16,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
      1, 17, 17,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, 18,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,

@@ -462,17 +427,6 @@ static void ggml_cpy_f32_q5_1_cuda(
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
 
-static void ggml_cpy_f32_q6_0_cuda(
-    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
-
-    GGML_ASSERT(ne % QK6_0 == 0);
-    const int num_blocks = ne / QK6_0;
-    cpy_f32_q<cpy_blck_f32_q6_0, QK6_0><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-}
-
 static void ggml_cpy_f32_iq4_nl_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,

@@ -545,8 +499,6 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
         ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
         ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q6_0) {
-        ggml_cpy_f32_q6_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
         ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {

@@ -587,8 +539,6 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
         return (void*) cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
         return (void*) cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>;
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q6_0) {
-        return (void*) cpy_f32_q<cpy_blck_f32_q6_0, QK6_0>;
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
        return (void*) cpy_f32_f16<cpy_1_f32_f16>;
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
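
The quantization math in the removed cpy_blck_f32_q6_0 is worth spelling out: the scale is chosen as d = vmax / -32, where vmax is the signed value of largest magnitude, so vmax lands exactly on quant 0 and reconstructs without error, the same endpoint convention the other _0 copy kernels use. A host-side sketch under the q6_0_block_ref layout assumed earlier; the helper name is again hypothetical:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstring>

    // (q6_0_block_ref as defined in the earlier sketch)
    static void quantize_q6_0_ref(const float * xi, q6_0_block_ref * dst) {
        // find the signed value of largest magnitude
        float amax = 0.0f, vmax = 0.0f;
        for (int j = 0; j < 32; ++j) {
            const float av = std::fabs(xi[j]);
            if (av > amax) { amax = av; vmax = xi[j]; }
        }

        const float d  = vmax / -32;          // vmax maps to quant 0, value -32*d
        const float id = d ? 1.0f / d : 0.0f; // inverse scale; 0 for an all-zero block

        dst->d = d;
        std::memset(dst->qh, 0, sizeof(dst->qh));

        for (int j = 0; j < 16; ++j) {
            // x*id lies in [-32, 32]; +32.5f shifts to [0.5, 64.5] and rounds
            const int q0 = std::min(63, (int)(xi[j]      * id + 32.5f));
            const int q1 = std::min(63, (int)(xi[j + 16] * id + 32.5f));
            dst->qs[j] = (uint8_t)((q0 & 0x0F) | ((q1 & 0x0F) << 4)); // low nibbles
            const uint8_t h = (uint8_t)((q0 >> 4) | ((q1 >> 4) << 2)); // two high bits each
            dst->qh[j % 8] |= h << (4 * (j / 8));                      // packed as nibbles
        }
    }

(In the removed CUDA version the second read is written xi[QK4_0/2 + j]; QK4_0 and QK6_0 are both 32, so the offset is the same 16 used here.)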

ggml/src/ggml-cuda/mmvq.cu
Lines changed: 0 additions & 12 deletions

@@ -8,7 +8,6 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
     type == GGML_TYPE_Q4_1 ? vec_dot_q4_1_q8_1 :
     type == GGML_TYPE_Q5_0 ? vec_dot_q5_0_q8_1 :
     type == GGML_TYPE_Q5_1 ? vec_dot_q5_1_q8_1 :
-    type == GGML_TYPE_Q6_0 ? vec_dot_q6_0_q8_1 :
     type == GGML_TYPE_Q8_0 ? vec_dot_q8_0_q8_1 :
     type == GGML_TYPE_Q2_K ? vec_dot_q2_K_q8_1 :
     type == GGML_TYPE_Q3_K ? vec_dot_q3_K_q8_1 :

@@ -32,7 +31,6 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
     type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ :
     type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ :
     type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ :
-    type == GGML_TYPE_Q6_0 ? VDR_Q6_0_Q8_1_MMVQ :
     type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ :
     type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ :
     type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ :

@@ -231,13 +229,6 @@ static void mul_mat_vec_q5_1_q8_1_cuda(
     mul_mat_vec_q_cuda<GGML_TYPE_Q5_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }
 
-static void mul_mat_vec_q6_0_q8_1_cuda(
-    const void * vx, const void * vy, float * dst,
-    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
-
-    mul_mat_vec_q_cuda<GGML_TYPE_Q6_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
-}
-
 static void mul_mat_vec_q8_0_q8_1_cuda(
     const void * vx, const void * vy, float * dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

@@ -376,9 +367,6 @@ void ggml_cuda_op_mul_mat_vec_q(
         case GGML_TYPE_Q5_1:
             mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
             break;
-        case GGML_TYPE_Q6_0:
-            mul_mat_vec_q6_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
-            break;
         case GGML_TYPE_Q8_0:
             mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
             break;

ggml/src/ggml-cuda/vecdotq.cuh
Lines changed: 0 additions & 44 deletions

@@ -41,30 +41,6 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
     return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 }
 
-#define VDR_Q6_0_Q8_1_MMVQ 2
-#define VDR_Q6_0_Q8_1_MMQ  4
-
-template <int vdr> static __device__ __forceinline__ float vec_dot_q6_0_q8_1_impl(
-    const int * vl, const int * vh, const int * u, const float & d6, const half2 & ds8) {
-
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        const int vi0 = ((vl[i] >> 0) & 0x0F0F0F0F) | ((vh[i] << 4) & 0x30303030);
-        const int vi1 = ((vl[i] >> 4) & 0x0F0F0F0F) | ((vh[i] << 2) & 0x30303030);
-
-        // SIMD dot product of quantized values
-        sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi);
-        sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi);
-    }
-
-    const float2 ds8f = __half22float2(ds8);
-
-    // second part effectively subtracts 8 from each quant value
-    return d6 * (sumi * ds8f.x - (32.f*vdr/QI6_0) * ds8f.y);
-}
-
 #define VDR_Q4_1_Q8_1_MMVQ 2
 #define VDR_Q4_1_Q8_1_MMQ  4
 
@@ -566,26 +542,6 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
     return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
 }
 
-static __device__ __forceinline__ float vec_dot_q6_0_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
-
-    const block_q6_0 * bq6_0 = (const block_q6_0 *) vbq + kbx;
-
-    int vl[VDR_Q6_0_Q8_1_MMVQ];
-    int vh[VDR_Q6_0_Q8_1_MMVQ];
-    int  u[2*VDR_Q6_0_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q6_0_Q8_1_MMVQ; ++i) {
-        vl[i]    = get_int_b2(bq6_0->qs, iqs + i);
-        vh[i]    = get_int_b2(bq6_0->qh, i) >> 4*(iqs/2);
-        u[2*i+0] = get_int_b4(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_b4(bq8_1->qs, iqs + i + QI6_0);
-    }
-
-    return vec_dot_q6_0_q8_1_impl<VDR_Q6_0_Q8_1_MMVQ>(vl, vh, u, bq6_0->d, bq8_1->ds);
-}
-
 
 static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
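
The removed vec_dot path leans on a standard rearrangement: block_q8_1 stores ds8 = (d8, s8) with s8 = d8 * sum(q8), so the constant offset of 32 in every q6_0 quant can be pulled out of the inner loop instead of being subtracted per element (the inherited "subtracts 8" comment reflects its q4_0 origin; here the offset is 32). A scalar model of the identity, with illustrative names:

    #include <cstdint>

    //   sum_j d6*(q6[j] - 32) * d8*q8[j]
    // = d6 * ( d8 * sum_j q6[j]*q8[j]  -  32 * s8 )
    // The integer sum is what ggml_cuda_dp4a accumulates four bytes at a time.
    static float dot_q6_q8_ref(float d6, const uint8_t * q6,
                               float d8, const int8_t * q8, int n) {
        int   sumi = 0;
        float sum8 = 0.0f;
        for (int j = 0; j < n; ++j) {
            sumi += q6[j] * q8[j];
            sum8 += q8[j];
        }
        const float s8 = d8 * sum8; // what block_q8_1 precomputes in ds8.y
        return d6 * (sumi * d8 - 32.0f * s8);
    }

In the removed impl the subtraction is scaled by (32.f*vdr/QI6_0) because each call covers only 2*vdr*4 of the block's 32 quants, while ds8.y sums the whole q8_1 block.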
