Skip to content

Commit 8ffad18

Browse files
ikawrakow and Iwan Kawrakow authored
MMQ implementation for IQ4_KS_R4 and IQ5_KS_R4 (#493)
* MMQ for iq4_ks_r4
* MMQ for iq5_ks_r4
* Add forgotten file
* Another forgotten file

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
1 parent 0b10f74 commit 8ffad18

File tree

6 files changed: +167 additions, -43 deletions

ggml/src/ggml-cuda/common.cuh

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -599,6 +599,13 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ4_KS> {
599599
static constexpr int qi = QI4_XS;
600600
};
601601

602+
template<>
603+
struct ggml_cuda_type_traits<GGML_TYPE_IQ4_KS_R4> {
604+
static constexpr int qk = QK_K;
605+
static constexpr int qr = QR4_XS;
606+
static constexpr int qi = QI4_XS;
607+
};
608+
602609
template<>
603610
struct ggml_cuda_type_traits<GGML_TYPE_IQ4_KSS> {
604611
static constexpr int qk = QK_K;
@@ -620,6 +627,13 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ5_KS> {
620627
static constexpr int qi = QI5_XS;
621628
};
622629

630+
template<>
631+
struct ggml_cuda_type_traits<GGML_TYPE_IQ5_KS_R4> {
632+
static constexpr int qk = QK_K;
633+
static constexpr int qr = QR5_XS;
634+
static constexpr int qi = QI5_XS;
635+
};
636+
623637
template<>
624638
struct ggml_cuda_type_traits<GGML_TYPE_IQ6_K> {
625639
static constexpr int qk = QK_K;

ggml/src/ggml-cuda/iqk_mmvq.cu

Lines changed: 0 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -36,21 +36,6 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ5_K_R4> {
3636
static constexpr int qi = QI5_XS;
3737
};
3838

39-
template<>
40-
struct ggml_cuda_type_traits<GGML_TYPE_IQ4_KS_R4> {
41-
static constexpr int qk = QK_K;
42-
static constexpr int qr = QR4_XS;
43-
static constexpr int qi = QI4_XS;
44-
};
45-
46-
template<>
47-
struct ggml_cuda_type_traits<GGML_TYPE_IQ5_KS_R4> {
48-
static constexpr int qk = QK_K;
49-
static constexpr int qr = QR5_XS;
50-
static constexpr int qi = QI5_XS;
51-
};
52-
53-
5439
// Reminder:
5540
// constexpr int qk = ggml_cuda_type_traits<type>::qk;
5641
// constexpr int qi = ggml_cuda_type_traits<type>::qi;

ggml/src/ggml-cuda/mmq.cu

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -97,9 +97,15 @@ void ggml_cuda_op_mul_mat_q(
9797
case GGML_TYPE_IQ4_KS:
9898
mul_mat_q_case<GGML_TYPE_IQ4_KS>(ctx, args, stream);
9999
break;
100+
case GGML_TYPE_IQ4_KS_R4:
101+
mul_mat_q_case<GGML_TYPE_IQ4_KS_R4>(ctx, args, stream);
102+
break;
100103
case GGML_TYPE_IQ5_KS:
101104
mul_mat_q_case<GGML_TYPE_IQ5_KS>(ctx, args, stream);
102105
break;
106+
case GGML_TYPE_IQ5_KS_R4:
107+
mul_mat_q_case<GGML_TYPE_IQ5_KS_R4>(ctx, args, stream);
108+
break;
103109
case GGML_TYPE_IQ2_KS:
104110
mul_mat_q_case<GGML_TYPE_IQ2_KS>(ctx, args, stream);
105111
break;
@@ -157,7 +163,9 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
157163
case GGML_TYPE_IQ4_XS:
158164
case GGML_TYPE_IQ4_NL:
159165
case GGML_TYPE_IQ4_KS:
166+
case GGML_TYPE_IQ4_KS_R4:
160167
case GGML_TYPE_IQ5_KS:
168+
case GGML_TYPE_IQ5_KS_R4:
161169
case GGML_TYPE_IQ2_KS:
162170
case GGML_TYPE_IQ2_K:
163171
case GGML_TYPE_IQ3_K:

0 commit comments

Comments
 (0)