@@ -528,6 +528,15 @@ typedef struct {
 } block_iq1_s;
 static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
 
+#define QK4_NL 32
+#define QR4_NL 2
+#define QI4_NL (QK4_NL / (4*QR4_NL))
+typedef struct {
+    half d;
+    uint8_t qs[QK4_NL/2];
+} block_iq4_nl;
+static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
+
 #define WARP_SIZE 32
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
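For orientation: a block_iq4_nl covers 32 weights with one fp16 scale plus 16 bytes of 4-bit table indices (18 bytes total, i.e. 4.5 bits per weight). A minimal host-side sketch of how one block decodes, assuming a host-visible copy of the kvalues_iq4nl table defined further down and a hypothetical fp16_to_float() helper:

// Reference decode of one block_iq4_nl (sketch only, not part of the patch).
// kvalues_iq4nl_host and fp16_to_float() are assumed stand-ins for the device
// table below and for whatever half -> float conversion the host uses.
static void decode_iq4_nl_block(const block_iq4_nl * b, float * out /* 32 values */) {
    const float d = fp16_to_float(b->d);                            // per-block scale
    for (int j = 0; j < QK4_NL/2; ++j) {                            // 16 packed bytes
        out[j]            = d * kvalues_iq4nl_host[b->qs[j] & 0xf]; // low nibbles -> weights 0..15
        out[j + QK4_NL/2] = d * kvalues_iq4nl_host[b->qs[j] >> 4];  // high nibbles -> weights 16..31
    }
}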
@@ -1987,6 +1996,26 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
 }
 
+static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int i   = blockIdx.x;
+    const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
+
+    const int tid = threadIdx.x;
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
+    const uint8_t * q4 = x[ib].qs + 4*il;
+    const float d = (float)x[ib].d;
+    for (int j = 0; j < 4; ++j) {
+        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
+        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
+    }
+
+}
 
 static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
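The kernel above is launched with 32 threads per block (see dequantize_row_iq4_nl_cuda further down); each CUDA block expands one QK_K = 256-value slice, i.e. QK_K/QK4_NL = 8 consecutive iq4_nl blocks. Thread tid picks sub-block ib = tid%8 and byte slice il = tid/8, so it reads 4 packed bytes and writes 4 low-nibble and 4 high-nibble outputs. A scalar rendering of the same mapping for one slice, as a sketch mirroring the kernel body:

// x: the 8 iq4_nl blocks of one 256-value slice; yy: 256 output values.
for (int tid = 0; tid < 32; ++tid) {
    const int il = tid/8;                            // which 4-byte group of qs
    const int ib = tid%8;                            // which 32-weight block
    const uint8_t * q4 = x[ib].qs + 4*il;
    const float d = (float)x[ib].d;
    float * y = yy + 32*ib + 4*il;
    for (int j = 0; j < 4; ++j) {
        y[j +  0] = d * kvalues_iq4nl[q4[j] & 0xf];  // low nibble
        y[j + 16] = d * kvalues_iq4nl[q4[j] >>  4];  // high nibble
    }
}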
@@ -4732,6 +4761,56 @@ static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
 #endif
 }
 
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values,
+        int & val1, int & val2) {
+
+    uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
+    aux32 = q4 & 0x0f0f0f0f;
+    uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
+    uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
+    val1 = v1 | (v2 << 16);
+    aux32 = (q4 >> 4) & 0x0f0f0f0f;
+    v1 = values[q8[0]] | (values[q8[1]] << 8);
+    v2 = values[q8[2]] | (values[q8[3]] << 8);
+    val2 = v1 | (v2 << 16);
+}
+#endif
+
+static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+    const block_iq4_nl * bq = (const block_iq4_nl *) vbq;
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
+    const int32_t  * q8 = (const int32_t  *)bq8_1->qs + iqs;
+
+    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
+
+    int v1, v2;
+    int sumi1 = 0, sumi2 = 0;
+    for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
+        const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
+        get_int_from_table_16(aux, values, v1, v2);
+        sumi1 = __dp4a(v1, q8[l+0], sumi1);
+        sumi2 = __dp4a(v2, q8[l+4], sumi2);
+    }
+
+#else
+    const uint8_t * q4 = bq->qs + 4*iqs;
+    const int8_t  * q8 = bq8_1->qs + 4*iqs;
+
+    int sumi1 = 0, sumi2 = 0;
+    for (int l = 0; l < 4*VDR_Q4_0_Q8_1_MMVQ; ++l) {
+        sumi1 += q8[l+ 0] * kvalues_iq4nl[q4[l] & 0xf];
+        sumi2 += q8[l+16] * kvalues_iq4nl[q4[l] >>  4];
+    }
+#endif
+    const float d = (float)bq->d * __low2float(bq8_1->ds);
+    return d * (sumi1 + sumi2);
+}
+
 template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
               allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
 static __device__ __forceinline__ void mul_mat_q(
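get_int_from_table_16 expands the eight 4-bit indices held in one 32-bit word into eight signed table values, packed as two 32-bit integers (low nibbles into val1, high nibbles into val2), so each __dp4a call accumulates four int8 x int8 products against the q8_1 activations in a single instruction. For readers without the intrinsic at hand, a scalar model of __dp4a on a little-endian host, roughly what the #else branch computes one element at a time:

// dp4a_ref(a, b, c): treat a and b as four packed signed 8-bit lanes and
// return c plus their lane-wise dot product (illustrative sketch only).
static inline int dp4a_ref(int a, int b, int c) {
    const int8_t * va = (const int8_t *)&a;
    const int8_t * vb = (const int8_t *)&b;
    return c + va[0]*vb[0] + va[1]*vb[1] + va[2]*vb[2] + va[3]*vb[3];
}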
@@ -6777,6 +6856,12 @@ static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, c
     dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
 }
 
+template <typename dst_t>
+static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = (k + QK_K - 1) / QK_K;
+    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
+}
+
 template <typename src_t, typename dst_t>
 static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
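Since dequantize_block_iq4_nl handles QK_K = 256 values per CUDA block, the launcher rounds k up to a multiple of QK_K and starts one 32-thread block per slice. A hedged example of how a caller might use it to expand a quantized row of n values into fp32 on a given stream (d_src and d_dst are hypothetical device buffers sized by the caller):

// Sketch: d_src holds n/QK4_NL block_iq4_nl blocks, d_dst has room for n floats.
static void dequantize_iq4_nl_row_example(const void * d_src, float * d_dst, int n, cudaStream_t stream) {
    dequantize_row_iq4_nl_cuda(d_src, d_dst, n, stream); // ceil(n/QK_K) blocks x 32 threads
}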
@@ -6818,6 +6903,8 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
             return dequantize_row_iq3_xxs_cuda;
         case GGML_TYPE_IQ1_S:
             return dequantize_row_iq1_s_cuda;
+        case GGML_TYPE_IQ4_NL:
+            return dequantize_row_iq4_nl_cuda;
         case GGML_TYPE_F32:
             return convert_unary_cuda<float>;
         default:
@@ -6855,6 +6942,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
             return dequantize_row_iq3_xxs_cuda;
         case GGML_TYPE_IQ1_S:
             return dequantize_row_iq1_s_cuda;
+        case GGML_TYPE_IQ4_NL:
+            return dequantize_row_iq4_nl_cuda;
         case GGML_TYPE_F16:
             return convert_unary_cuda<half>;
         default:
@@ -8599,6 +8688,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ4_NL:
             return max_compute_capability >= CC_RDNA2 ? 128 : 64;
         default:
             GGML_ASSERT(false);
@@ -8623,6 +8713,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ4_NL:
             return max_compute_capability >= CC_VOLTA ? 128 : 64;
         case GGML_TYPE_Q6_K:
             return 64;
@@ -8724,6 +8815,10 @@ static void ggml_cuda_op_mul_mat_vec_q(
             mul_mat_vec_q_cuda<QK_K, QI1_S, block_iq1_s, 1, vec_dot_iq1_s_q8_1>
                 (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
             break;
+        case GGML_TYPE_IQ4_NL:
+            mul_mat_vec_q_cuda<QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ, vec_dot_iq4_nl_q8_1>
+                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
         default:
             GGML_ASSERT(false);
             break;
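The template arguments spell out the iq4_nl geometry for the generic mul_mat_vec_q path: QK4_NL weights per block, QI4_NL 32-bit integers of packed quants per block, the block type, the vec-dot ratio (how many contiguous packed integers each thread processes per call; VDR_Q4_0_Q8_1_MMVQ is reused since iq4_nl shares Q4_0's 32-weight, nibble-packed layout), and the dot-product kernel added above. A quick compile-time sanity check of that arithmetic, as a sketch using only the definitions from this patch:

static_assert(QK4_NL == 32, "32 weights per iq4_nl block");
static_assert(QI4_NL == 4,  "QK4_NL/(4*QR4_NL) = 4 packed 32-bit ints of quants per block");
static_assert(sizeof(((block_iq4_nl *)0)->qs) == 4*QI4_NL, "qs holds exactly QI4_NL ints worth of nibbles");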
@@ -11446,7 +11541,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                 return false;
             }
             ggml_type a_type = a->type;
-            if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ1_S) {
+            if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
+                a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL) {
                 if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
                     return false;
                 }