Skip to content

Commit ed2e379

Browse files
ikawrakow (Iwan Kawrakow)
authored and committed
Adding Q6_0 (#77)
* Adding q6_0 - basics + AVX2/Zen4 working
* Adding q6_0: CUDA dequantize works, but not mmvq
* Adding q6_0: CUDA mmvq works
* Adding q6_0: CUDA cpy, so Q6_0 can be used for KV-cache
* Add q6_0 to CPU flash attention

  Disappointing result: for LLaMA-3.2-1B, q6_0 K- and V-cache gives about the same PPL as q8_0 K-cache and q4_0 V-cache, while needing the exact same RAM. I.e., what was the point?

* q6_0: slightly better kv-cache result

  Better than q8_0+q4_0, but not as good as q8_0+iq4_nl

* q6_0: works on ARM_NEON
* q6_0: dequantize works on Metal, but not vector dot product
* q6_0: it now works on Metal

  Outperforms q5_0 by a significant margin. E.g.

  | model         | size     | params | backend | ngl | threads | test  | t/s           |
  | ------------- | -------: | -----: | ------- | --: | ------: | ----: | ------------: |
  | llama 8B Q6_0 | 6.08 GiB | 8.03 B | Metal   | 100 |       4 | tg128 |  44.02 ± 0.08 |
  | llama 8B Q5_0 | 5.21 GiB | 8.03 B | Metal   | 100 |       4 | tg128 |  40.13 ± 0.12 |
  | llama 8B Q6_0 | 6.08 GiB | 8.03 B | Metal   | 100 |       4 | pp512 | 500.55 ± 0.32 |
  | llama 8B Q5_0 | 5.21 GiB | 8.03 B | Metal   | 100 |       4 | pp512 | 448.02 ± 0.27 |

* q6_0: can now be used for kv-cache on Metal

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
1 parent 03eb81b commit ed2e379

File tree

18 files changed

+9193
-4
lines changed

18 files changed

+9193
-4
lines changed

common/common.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1036,6 +1036,9 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
10361036
if (s == "q5_1") {
10371037
return GGML_TYPE_Q5_1;
10381038
}
1039+
if (s == "q6_0") {
1040+
return GGML_TYPE_Q6_0;
1041+
}
10391042

10401043
throw std::runtime_error("Invalid cache type: " + s);
10411044
}

examples/quantize/quantize.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
2121
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", },
2222
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", },
2323
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 5.65G, +0.1062 ppl @ Llama-3-8B", },
24+
{ "Q6_0", LLAMA_FTYPE_MOSTLY_Q6_0, " 6.5 bpw quantization", },
2425
{ "IQ2_XXS", LLAMA_FTYPE_MOSTLY_IQ2_XXS, " 2.06 bpw quantization", },
2526
{ "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
2627
{ "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },

ggml/include/ggml.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,8 @@ extern "C" {
397397
GGML_TYPE_Q4_0_8_8 = 33,
398398
GGML_TYPE_TQ1_0 = 34,
399399
GGML_TYPE_TQ2_0 = 35,
400+
//
401+
GGML_TYPE_Q6_0 = 133,
400402
GGML_TYPE_COUNT,
401403
};
402404

@@ -441,6 +443,8 @@ extern "C" {
441443
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
442444
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
443445
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
446+
//
447+
GGML_FTYPE_MOSTLY_Q6_0 = 127, // except 1d tensors
444448
};
445449

446450
// available tensor operations:

ggml/src/ggml-common.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,9 @@ typedef sycl::half2 ggml_half2;
8888
#define QI5_1 (QK5_1 / (4 * QR5_1))
8989
#define QR5_1 2
9090

91+
#define QI6_0 (QK6_0 / (4 * QR6_0))  // 32-bit ints of packed qs data per q6_0 block (= 4)
#define QR6_0 2                      // quant ratio: 2 quants are packed into each qs byte
93+
9194
#define QI8_0 (QK8_0 / (4 * QR8_0))
9295
#define QR8_0 1
9396

@@ -183,6 +186,14 @@ typedef struct {
183186
} block_q5_1;
184187
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_half) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
185188

189+
#define QK6_0 32
// 6-bit quantization block: QK6_0 = 32 quants plus one fp16 scale,
// i.e. 6.5 bits per weight. A quant q is dequantized as d*q - 32*d
// (quants are stored with a +32 offset; see the CUDA dequantize kernel).
typedef struct {
    ggml_half d;          // delta (scale)
    uint8_t qh[QK6_0/4];  // 5+6-th bit of quants: two high bits of four quants per byte
    uint8_t qs[QK6_0/2];  // nibbles / quants: low 4 bits of the quant pair (j, j+16)
} block_q6_0;
static_assert(sizeof(block_q6_0) == sizeof(ggml_half) + QK6_0/2 + QK6_0/4, "wrong q6_0 block size/padding");
196+
186197
#define QK8_0 32
187198
typedef struct {
188199
ggml_half d; // delta

ggml/src/ggml-cuda.cu

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3005,6 +3005,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
30053005
case GGML_TYPE_Q4_1:
30063006
case GGML_TYPE_Q5_0:
30073007
case GGML_TYPE_Q5_1:
3008+
case GGML_TYPE_Q6_0:
30083009
case GGML_TYPE_Q8_0:
30093010
case GGML_TYPE_Q2_K:
30103011
case GGML_TYPE_Q3_K:
@@ -3076,6 +3077,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
30763077
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_1) {
30773078
return true;
30783079
}
3080+
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q6_0) {
3081+
return true;
3082+
}
30793083
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
30803084
return true;
30813085
}

ggml/src/ggml-cuda/common.cuh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,13 @@ struct ggml_cuda_type_traits<GGML_TYPE_Q5_1> {
387387
static constexpr int qi = QI5_1;
388388
};
389389

390+
// CUDA type traits for q6_0: block size (qk), quant ratio (qr) and
// packed ints per block (qi), mirroring the QK6_0/QR6_0/QI6_0 macros
// from ggml-common.h.
template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q6_0> {
    static constexpr int qk = QK6_0;
    static constexpr int qr = QR6_0;
    static constexpr int qi = QI6_0;
};
396+
390397
template<>
391398
struct ggml_cuda_type_traits<GGML_TYPE_Q8_0> {
392399
static constexpr int qk = QK8_0;

ggml/src/ggml-cuda/convert.cu

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,36 @@ static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t
122122
}
123123
}
124124

125+
// Dequantize q6_0 blocks (32 quants of 6 bits each, one fp16 scale) into dst_t.
// Launch config: one CUDA block of 32 threads covers up to eight q6_0 blocks
// (256 output values); nb32 is the total number of q6_0 blocks.
template<typename dst_t>
static __global__ void dequantize_block_q6_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {

    const int64_t i = blockIdx.x;

    // assume 32 threads
    const int64_t tid = threadIdx.x;
    const int64_t il = tid/8;    // 4-value slice within a q6_0 block (0..3)
    const int64_t ir = tid%8;    // q6_0 block handled by this thread (0..7)
    const int64_t ib = 8*i + ir; // global q6_0 block index
    if (ib >= nb32) {
        return; // tail guard: the last CUDA block may cover fewer than 8 q6_0 blocks
    }

    dst_t * y = yy + 256*i + 32*ir + 4*il;

    const block_q6_0 * x = (const block_q6_0 *)vx + ib;
    const float d = __half2float(x->d);
    const float dm = -32*d; // quants carry a +32 offset; remove it after scaling

    const uint8_t * qs = x->qs + 4*il;     // low nibbles for this thread's two 4-value runs
    const uint8_t * qh = x->qh + 4*(il%2); // packed high (5th+6th) bits

    for (int l = 0; l < 4; ++l) {
        // il/2 selects the low or high nibble of the qh byte; h's bits 0-1
        // belong to quant l, bits 2-3 to quant l+16
        const uint8_t h = qh[l] >> 4*(il/2);
        y[l+ 0] = d * ((qs[l] & 0xF) | ((h << 4) & 0x30)) + dm;
        y[l+16] = d * ((qs[l] >> 4) | ((h << 2) & 0x30)) + dm;
    }
}
154+
125155
//================================== k-quants
126156

127157
template<typename dst_t>
@@ -497,6 +527,13 @@ static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int64_t k
497527
dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
498528
}
499529

530+
// Host-side launcher: dequantize k q6_0-quantized values into y on `stream`.
// Grid sizing: each 32-thread CUDA block covers up to 256 values (8 q6_0
// blocks); the kernel itself guards against the ragged tail.
template<typename dst_t>
static void dequantize_row_q6_0_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int n_q6_blocks   = k / 32;
    const int n_cuda_blocks = (k + 255) / 256;
    dequantize_block_q6_0<<<n_cuda_blocks, 32, 0, stream>>>(vx, y, n_q6_blocks);
}
536+
500537
template<typename dst_t>
501538
static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
502539
const int nb = k / QK_K;
@@ -598,6 +635,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
598635
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
599636
case GGML_TYPE_Q5_1:
600637
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
638+
case GGML_TYPE_Q6_0:
639+
return dequantize_row_q6_0_cuda;
601640
case GGML_TYPE_Q8_0:
602641
if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= CC_PASCAL) {
603642
return dequantize_block_q8_0_f16_cuda;
@@ -648,6 +687,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
648687
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
649688
case GGML_TYPE_Q5_1:
650689
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
690+
case GGML_TYPE_Q6_0:
691+
return dequantize_row_q6_0_cuda;
651692
case GGML_TYPE_Q8_0:
652693
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
653694
case GGML_TYPE_Q2_K:

ggml/src/ggml-cuda/cpy.cu

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,41 @@ static __device__ void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) {
225225
memcpy(dsti->qh, &qh, sizeof(qh));
226226
}
227227

228+
// Quantize one block of QK6_0 = 32 floats into a block_q6_0.
// Layout written: d = fp16 scale, qs[j] = low nibbles of the quant pair
// (j, j+16), qh = the two high bits of four quants per byte.
static __device__ void cpy_blck_f32_q6_0(const char * cxi, char * cdsti) {
    const float * xi = (const float *) cxi;
    block_q6_0 * dsti = (block_q6_0 *) cdsti;

    // Find the signed value with the largest magnitude; its sign fixes the
    // sign of the scale so that value maps onto the -32 end of the range.
    float amax = 0.0f;
    float vmax = 0.0f;

    for (int j = 0; j < QK6_0; ++j) {
        const float v = xi[j];
        const float av = fabsf(v); // was fabsf(xi[j]); v is already loaded
        if (amax < av) {
            amax = av;
            vmax = v;
        }
    }

    const float d  = vmax / -32;        // vmax*id == -32 -> stored quant 0 after the +32 offset
    const float id = d ? 1.0f/d : 0.0f; // all-zero block: avoid division by zero

    dsti->d = d;
    memset(dsti->qh, 0, QK6_0/4); // qh is OR-accumulated below, so it must start cleared

    for (int j = 0; j < QK6_0/2; ++j) {
        const float x0 = xi[0       + j]*id;
        const float x1 = xi[QK6_0/2 + j]*id; // fix: was QK4_0/2 — same value (32), wrong constant

        // +32 offset with round-to-nearest, clamped to the 6-bit range [0, 63]
        const uint8_t xi0 = min(63, (int8_t)(x0 + 32.5f));
        const uint8_t xi1 = min(63, (int8_t)(x1 + 32.5f));

        dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
        // two high bits of each quant of the pair, packed side by side
        const uint8_t h = (xi0 >> 4) | ((xi1 >> 4) << 2);
        // pairs 0..7 land in the low nibble of qh[j], pairs 8..15 in the high nibble
        dsti->qh[j%(QK6_0/4)] |= (h << 4*(j/(QK6_0/4)));
    }
}
262+
228263
static __device__ const int8_t iq4nl_index[241] = {
229264
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
230265
1, 17, 17, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 18, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
@@ -427,6 +462,17 @@ static void ggml_cpy_f32_q5_1_cuda(
427462
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
428463
}
429464

465+
// Launch the f32 -> q6_0 copy/quantization over `ne` elements on `stream`.
// `ne` must be divisible by QK6_0; the launch uses one thread per q6_0 block
// (blockDim = 1), with cpy_blck_f32_q6_0 quantizing a whole block.
static void ggml_cpy_f32_q6_0_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK6_0 == 0);
    const int n_blocks = ne / QK6_0;
    cpy_f32_q<cpy_blck_f32_q6_0, QK6_0><<<n_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
475+
430476
static void ggml_cpy_f32_iq4_nl_cuda(
431477
const char * cx, char * cdst, const int ne,
432478
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
@@ -499,6 +545,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
499545
ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
500546
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
501547
ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
548+
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q6_0) {
549+
ggml_cpy_f32_q6_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
502550
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
503551
ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
504552
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
@@ -539,6 +587,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
539587
return (void*) cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL>;
540588
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
541589
return (void*) cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>;
590+
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q6_0) {
591+
return (void*) cpy_f32_q<cpy_blck_f32_q6_0, QK6_0>;
542592
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
543593
return (void*) cpy_f32_f16<cpy_1_f32_f16>;
544594
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {

ggml/src/ggml-cuda/mmvq.cu

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
88
type == GGML_TYPE_Q4_1 ? vec_dot_q4_1_q8_1 :
99
type == GGML_TYPE_Q5_0 ? vec_dot_q5_0_q8_1 :
1010
type == GGML_TYPE_Q5_1 ? vec_dot_q5_1_q8_1 :
11+
type == GGML_TYPE_Q6_0 ? vec_dot_q6_0_q8_1 :
1112
type == GGML_TYPE_Q8_0 ? vec_dot_q8_0_q8_1 :
1213
type == GGML_TYPE_Q2_K ? vec_dot_q2_K_q8_1 :
1314
type == GGML_TYPE_Q3_K ? vec_dot_q3_K_q8_1 :
@@ -31,6 +32,7 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
3132
type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ :
3233
type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ :
3334
type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ :
35+
type == GGML_TYPE_Q6_0 ? VDR_Q6_0_Q8_1_MMVQ :
3436
type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ :
3537
type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ :
3638
type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ :
@@ -229,6 +231,13 @@ static void mul_mat_vec_q5_1_q8_1_cuda(
229231
mul_mat_vec_q_cuda<GGML_TYPE_Q5_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
230232
}
231233

234+
// Thin wrapper: launches the generic quantized mat-vec kernel instantiated
// for q6_0 weights against q8_1-quantized activations.
static void mul_mat_vec_q6_0_q8_1_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    mul_mat_vec_q_cuda<GGML_TYPE_Q6_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
240+
232241
static void mul_mat_vec_q8_0_q8_1_cuda(
233242
const void * vx, const void * vy, float * dst,
234243
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
@@ -367,6 +376,9 @@ void ggml_cuda_op_mul_mat_vec_q(
367376
case GGML_TYPE_Q5_1:
368377
mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
369378
break;
379+
case GGML_TYPE_Q6_0:
380+
mul_mat_vec_q6_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
381+
break;
370382
case GGML_TYPE_Q8_0:
371383
mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
372384
break;

ggml/src/ggml-cuda/vecdotq.cuh

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,30 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
4141
return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
4242
}
4343

44+
#define VDR_Q6_0_Q8_1_MMVQ 2
#define VDR_Q6_0_Q8_1_MMQ 4

// Integer dot product of 8*vdr q6_0 quants against q8_1 quants.
// vl[i]/vh[i] hold the packed low nibbles / high-bit pairs of 8 q6_0 quants,
// u holds the matching q8_1 quants; d6 is the q6_0 scale, ds8 the q8_1
// {scale, quant-sum} pair used for the offset correction below.
template <int vdr> static __device__ __forceinline__ float vec_dot_q6_0_q8_1_impl(
    const int * vl, const int * vh, const int * u, const float & d6, const half2 & ds8) {

    int sumi = 0;

#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        // reassemble four 6-bit quants per int: low nibble from vl, bits 4-5 from vh
        const int vi0 = ((vl[i] >> 0) & 0x0F0F0F0F) | ((vh[i] << 4) & 0x30303030);
        const int vi1 = ((vl[i] >> 4) & 0x0F0F0F0F) | ((vh[i] << 2) & 0x30303030);

        // SIMD dot product of quantized values
        sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi);
        sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi);
    }

    const float2 ds8f = __half22float2(ds8);

    // second part effectively subtracts 32 from each quant value
    // (q6_0 quants are stored with a +32 offset; the original comment said 8,
    // copied from the q4_0 variant)
    return d6 * (sumi * ds8f.x - (32.f*vdr/QI6_0) * ds8f.y);
}
67+
4468
#define VDR_Q4_1_Q8_1_MMVQ 2
4569
#define VDR_Q4_1_Q8_1_MMQ 4
4670

@@ -542,6 +566,26 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
542566
return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
543567
}
544568

569+
// mmvq entry point for q6_0 x q8_1: gathers VDR_Q6_0_Q8_1_MMVQ ints of low
// nibbles plus the matching high-bit ints and q8_1 quants for block kbx at
// offset iqs, then defers to vec_dot_q6_0_q8_1_impl.
static __device__ __forceinline__ float vec_dot_q6_0_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {

    const block_q6_0 * bq6_0 = (const block_q6_0 *) vbq + kbx;

    int vl[VDR_Q6_0_Q8_1_MMVQ];
    int vh[VDR_Q6_0_Q8_1_MMVQ];
    int u[2*VDR_Q6_0_Q8_1_MMVQ];

#pragma unroll
    for (int i = 0; i < VDR_Q6_0_Q8_1_MMVQ; ++i) {
        vl[i] = get_int_b2(bq6_0->qs, iqs + i);            // packed low nibbles
        vh[i] = get_int_b2(bq6_0->qh, i) >> 4*(iqs/2);     // high-bit pairs, shifted to this iqs slice
        u[2*i+0] = get_int_b4(bq8_1->qs, iqs + i);         // q8_1 quants for the low-nibble half
        u[2*i+1] = get_int_b4(bq8_1->qs, iqs + i + QI6_0); // q8_1 quants for the high-nibble half
    }

    return vec_dot_q6_0_q8_1_impl<VDR_Q6_0_Q8_1_MMVQ>(vl, vh, u, bq6_0->d, bq8_1->ds);
}
588+
545589

546590
static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
547591
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {

0 commit comments

Comments
 (0)