Skip to content

Commit

Permalink
remove old dot kernels and template
Browse files Browse the repository at this point in the history
  • Loading branch information
LostRuins committed Jun 20, 2023
1 parent 93247a1 commit a6e8b02
Showing 1 changed file with 10 additions and 225 deletions.
235 changes: 10 additions & 225 deletions ggml-opencl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -331,34 +331,7 @@ __kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __globa
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
}


void vec_dot_q2_K(__global const struct block_q2_K* x, const int ib, const int iqs, const __global float *yy, float *result) {

int n = iqs / 128;
int r = iqs - 128 * n;
int l = r / 8;

__global const float *y = yy + 128 * n + l;
__global const uint8_t *q = x[ib].qs + 32 * n + l;
__global const uint8_t *s = x[ib].scales + 8 * n;

const float dall = vload_half(0, &x[ib].d);
const float dmin = vload_half(0, &x[ib].dmin);

float sum = y[ 0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4))
+ y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4))
+ y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4))
+ y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4))
+ y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4))
+ y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4))
+ y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4))
+ y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4));

*result = sum;
}


__kernel void dequantize_mul_mat_vec_q2_K_fast(__global struct block_q2_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
__kernel void dequantize_mul_mat_vec_q2_K(__global struct block_q2_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {

const int row = get_group_id(0);

Expand Down Expand Up @@ -431,49 +404,7 @@ __kernel void dequantize_mul_mat_vec_q2_K_fast(__global struct block_q2_K * xx,
}
}

void vec_dot_q3_K(__global const struct block_q3_K* x, const int ib, const int iqs, const __global float *yy, float *result) {

const uint32_t kmask1 = 0x03030303;
const uint32_t kmask2 = 0x0f0f0f0f;

uint32_t aux[3];
uint32_t utmp[4];

int n = iqs/128;
int r = iqs - 128*n;
int l = r/8;

__global const float * y = yy + 128*n + l;
__global const uint8_t * q = x[ib].qs + 32*n + l;
__global const uint8_t * hm = x[ib].hmask + l;
const int8_t * s = (const int8_t *)utmp + 8*n;

aux[0] = x[ib].scales[0] | x[ib].scales[1] << 8 | x[ib].scales[2] << 16 | x[ib].scales[3] << 24;
aux[1] = x[ib].scales[4] | x[ib].scales[5] << 8 | x[ib].scales[6] << 16 | x[ib].scales[7] << 24;
aux[2] = x[ib].scales[8] | x[ib].scales[9] << 8 | x[ib].scales[10] << 16 | x[ib].scales[11] << 24;

utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);

const float dall = vload_half(0, &x[ib].d);
const uint8_t m = 1 << (4*n);

float sum = y[ 0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4))
+ y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4))
+ y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4))
+ y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4))
+ y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4))
+ y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4))
+ y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4))
+ y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4));

*result = sum * dall;

}

__kernel void dequantize_mul_mat_vec_q3_K_fast(__global struct block_q3_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
__kernel void dequantize_mul_mat_vec_q3_K(__global struct block_q3_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
const uint16_t kmask1 = 0x0303;
const uint16_t kmask2 = 0x0f0f;

Expand Down Expand Up @@ -547,36 +478,7 @@ __kernel void dequantize_mul_mat_vec_q3_K_fast(__global struct block_q3_K * xx,
}
}

void vec_dot_q4_K(__global const struct block_q4_K* x, const int ib, const int iqs, const __global float *yy, float *result) {

const int j = iqs / 64; // j is in 0...3
const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
const int is = 2*j; // is is in 0...6 in steps of 2

__global const float * y = yy + 64*j + ir;
__global const uint8_t * q = x[ib].qs + 32*j + ir;

const float dall = vload_half(0, &x[ib].d);
const float dmin = vload_half(0, &x[ib].dmin);

uint8_t sc, m;
get_scale_min_k4(is + 0, x[ib].scales, &sc, &m);
const float d1 = dall * sc;
const float m1 = dmin * m;
get_scale_min_k4(is + 1, x[ib].scales, &sc, &m);
const float d2 = dall * sc;
const float m2 = dmin * m;

float sum = 0;
for (int k = 0; k < 4; ++k) {
sum += y[k + 0] * (d1 * (q[k] & 0xF) - m1);
sum += y[k + 32] * (d2 * (q[k] >> 4) - m2);
}

*result = sum;
}

__kernel void dequantize_mul_mat_vec_q4_K_fast(__global struct block_q4_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
__kernel void dequantize_mul_mat_vec_q4_K(__global struct block_q4_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {

//to rename it later, just to test now
const uint16_t kmask1 = 0x3f3f;
Expand Down Expand Up @@ -650,41 +552,7 @@ __kernel void dequantize_mul_mat_vec_q4_K_fast(__global struct block_q4_K * xx,
}
}

void vec_dot_q5_K(__global const struct block_q5_K* x, const int ib, const int iqs, const __global float *yy, float *result) {

const int j = iqs / 64;
const int ir = (iqs - 64*j)/2;
const int is = 2*j;

__global const float * y = yy + 64*j + ir;
__global const uint8_t * ql = x[ib].qs + 32*j + ir;
__global const uint8_t * qh = x[ib].qh + ir;

const float dall = vload_half(0, &x[ib].d);
const float dmin = vload_half(0, &x[ib].dmin);

uint8_t sc, m;
get_scale_min_k4(is + 0, x[ib].scales, &sc, &m);
const float d1 = dall * sc;
const float m1 = dmin * m;
get_scale_min_k4(is + 1, x[ib].scales, &sc, &m);
const float d2 = dall * sc;
const float m2 = dmin * m;

uint8_t hm = 1 << is;
float sum = 0;
for (int k = 0; k < 4; ++k) {
sum += y[k + 0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1);
}
hm <<= 1;
for (int k = 0; k < 4; ++k) {
sum += y[k + 32] * (d2 * ((ql[k] >> 4) + (qh[k] & hm ? 16 : 0)) - m2);
}
*result = sum;

}

__kernel void dequantize_mul_mat_vec_q5_K_fast(__global struct block_q5_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
__kernel void dequantize_mul_mat_vec_q5_K(__global struct block_q5_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {

const uint16_t kmask1 = 0x3f3f;
const uint16_t kmask2 = 0x0f0f;
Expand Down Expand Up @@ -766,33 +634,7 @@ __kernel void dequantize_mul_mat_vec_q5_K_fast(__global struct block_q5_K * xx,
}
}

void vec_dot_q6_K(__global const struct block_q6_K* x, const int ib, const int iqs, const __global float *yy, float *result) {


const int ip = iqs / 128; // 0 or 1
const int il = (iqs - 128*ip)/8; // 0...15
const int is = 8*ip;

__global const float * y = yy + 128*ip + il;

const float d = vload_half(0, &x[ib].d);

__global const uint8_t * ql = x[ib].ql + 64*ip + il;
__global const uint8_t * qh = x[ib].qh + 32*ip + il;
__global const int8_t * sc = x[ib].scales + is;

*result = y[ 0] * d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh[ 0] >> 0) & 3) << 4)) - 32)
+ y[ 32] * d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh[ 0] >> 2) & 3) << 4)) - 32)
+ y[ 64] * d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh[ 0] >> 4) & 3) << 4)) - 32)
+ y[ 96] * d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh[ 0] >> 6) & 3) << 4)) - 32)
+ y[ 16] * d * sc[1] * ((int8_t)((ql[16] & 0xF) | (((qh[16] >> 0) & 3) << 4)) - 32)
+ y[ 48] * d * sc[3] * ((int8_t)((ql[48] & 0xF) | (((qh[16] >> 2) & 3) << 4)) - 32)
+ y[ 80] * d * sc[5] * ((int8_t)((ql[16] >> 4) | (((qh[16] >> 4) & 3) << 4)) - 32)
+ y[112] * d * sc[7] * ((int8_t)((ql[48] >> 4) | (((qh[16] >> 6) & 3) << 4)) - 32);

}

__kernel void dequantize_mul_mat_vec_q6_K_fast(__global struct block_q6_K * xx, __local float* tmp, __global const float * yy, __global float * dst, const int ncols) {
__kernel void dequantize_mul_mat_vec_q6_K(__global struct block_q6_K * xx, __local float* tmp, __global const float * yy, __global float * dst, const int ncols) {

const int row = get_group_id(0);

Expand Down Expand Up @@ -937,44 +779,6 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
}
);

std::string dequant_mul_mat_vec_k_template = MULTILINE_QUOTE(
__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
const int block_size = get_local_size(0);
const int row = get_group_id(0);
const int tid = get_local_id(0);

const int iter_stride = QK_K;
const int vals_per_iter = iter_stride / block_size;
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;

tmp[tid] = 0;

for (int i = 0; i < ncols; i += iter_stride) {
const int col = i + vals_per_iter*tid;
const int ib = ib0 + col/QK_K; // x block index
const int iqs = col%QK_K; // x quant index
const int iybs = col - col%QK_K; // y block start index

// dequantize
float v;
DOT_KERNEL(x, ib, iqs, y + iybs, &v);
tmp[tid] += v;
}

// sum up partial sums and write back result
barrier(CLK_LOCAL_MEM_FENCE);
for (int s=block_size/2; s>0; s>>=1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (tid == 0) {
dst[row] = tmp[0];
}
}
);

std::string mul_template = MULTILINE_QUOTE(
__kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) {
Expand Down Expand Up @@ -1037,18 +841,6 @@ std::array<std::string, 2> mul_str_values = {
"mul_f32", "float"
};

std::array<std::string, 3> dmmv_k_str_keys = {
"KERNEL_NAME", "X_TYPE", "DOT_KERNEL"
};

std::array<std::string, 15> dmmv_k_str_values = {
"dequantize_mul_mat_vec_q2_K", "struct block_q2_K", "vec_dot_q2_K",
"dequantize_mul_mat_vec_q3_K", "struct block_q3_K", "vec_dot_q3_K",
"dequantize_mul_mat_vec_q4_K", "struct block_q4_K", "vec_dot_q4_K",
"dequantize_mul_mat_vec_q5_K", "struct block_q5_K", "vec_dot_q5_K",
"dequantize_mul_mat_vec_q6_K", "struct block_q6_K", "vec_dot_q6_K",
};

std::string& replace(std::string& s, const std::string& from, const std::string& to) {
size_t pos = 0;
while ((pos = s.find(from, pos)) != std::string::npos) {
Expand Down Expand Up @@ -1078,13 +870,6 @@ std::string generate_kernels() {
}
src << mul_kernel << '\n';
}
for (size_t i = 0; i < dmmv_k_str_values.size(); i += dmmv_k_str_keys.size()) {
std::string dmmv_k_kernel = dequant_mul_mat_vec_k_template;
for (size_t j = 0; j < dmmv_k_str_keys.size(); j++) {
replace(dmmv_k_kernel, dmmv_k_str_keys[j], dmmv_k_str_values[i + j]);
}
src << dmmv_k_kernel << '\n';
}

return src.str();
}
Expand Down Expand Up @@ -1343,11 +1128,11 @@ void ggml_cl_init(void) {
CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err));
CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err));
CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err));
CL_CHECK((dequantize_mul_mat_vec_q2_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q2_K_fast", &err), err));
CL_CHECK((dequantize_mul_mat_vec_q3_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q3_K_fast", &err), err));
CL_CHECK((dequantize_mul_mat_vec_q4_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_K_fast", &err), err));
CL_CHECK((dequantize_mul_mat_vec_q5_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_K_fast", &err), err));
CL_CHECK((dequantize_mul_mat_vec_q6_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q6_K_fast", &err), err));
CL_CHECK((dequantize_mul_mat_vec_q2_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q2_K", &err), err));
CL_CHECK((dequantize_mul_mat_vec_q3_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q3_K", &err), err));
CL_CHECK((dequantize_mul_mat_vec_q4_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_K", &err), err));
CL_CHECK((dequantize_mul_mat_vec_q5_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_K", &err), err));
CL_CHECK((dequantize_mul_mat_vec_q6_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q6_K", &err), err));

// mul kernel
CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
Expand Down

0 comments on commit a6e8b02

Please sign in to comment.