#include <array>
#include <atomic>
#include <sstream>
+#include <vector>

#define CL_TARGET_OPENCL_VERSION 110
#include <clblast.h>
@@ -197,6 +198,18 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
}
);

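+// OpenCL source template for the element-wise multiply kernel: each work-item computes
+// dst[i] = x[i] * y[i % ky], so a y vector of ky elements is broadcast across the rows of x.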
+std::string mul_template = MULTILINE_QUOTE(
+__kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) {
+    const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);
+
+    if (i >= get_global_size(0)) {
+        return;
+    }
+
+    dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i%ky];
+}
+);
+
#define CL_CHECK(err)                                               \
    do {                                                            \
        cl_int err_ = (err);                                        \
@@ -239,6 +252,13 @@ std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
    "convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16"
};

+std::array<std::string, 2> mul_str_keys = {
+    "KERNEL_NAME", "TYPE"
+};
+std::array<std::string, 2> mul_str_values = {
+    "mul_f32", "float"
+};
+
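+// One kernel is generated per group of mul_str_keys.size() values: KERNEL_NAME -> "mul_f32",
+// TYPE -> "float" (see the substitution loop added to generate_kernels() below).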
std::string& replace(std::string& s, const std::string& from, const std::string& to) {
    size_t pos = 0;
    while ((pos = s.find(from, pos)) != std::string::npos) {
@@ -261,6 +281,13 @@ std::string generate_kernels() {
        src << dequant_kernel << '\n';
        src << dmmv_kernel << '\n';
    }
+    for (size_t i = 0; i < mul_str_values.size(); i += mul_str_keys.size()) {
+        std::string mul_kernel = mul_template;
+        for (size_t j = 0; j < mul_str_keys.size(); j++) {
+            replace(mul_kernel, mul_str_keys[j], mul_str_values[i + j]);
+        }
+        src << mul_kernel << '\n';
+    }
    return src.str();
}

@@ -272,6 +299,7 @@ static cl_program program;
static cl_kernel convert_row_f16_cl;
static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl;
static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl;
+static cl_kernel mul_f32_cl;
static bool fp16_support;

static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char * program_buffer) {
@@ -508,6 +536,9 @@ void ggml_cl_init(void) {
    CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err));
    CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err));
    CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err));
+
+    // mul kernel
+    CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
}

static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
@@ -644,6 +675,98 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
    return err;
}

+static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src1->backend == GGML_BACKEND_CL);
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+    const int64_t ne0  = ne00 * ne01 * ne02 * ne03;
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+    const int64_t nb10 = src1->nb[0];
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
+    size_t x_size;
+    size_t d_size;
+
+    cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size, CL_MEM_READ_ONLY); // src0
+    cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
+    cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size, CL_MEM_WRITE_ONLY); // dst
+
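+    // Process src0 one (i03, i02) plane at a time: upload the plane, multiply it element-wise
+    // on the device with the matching (broadcast) data from src1, then read the result back
+    // into dst on the host.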
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            const int i0 = i03*ne02 + i02;
+
+            cl_event ev;
+
+            // copy src0 to device
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, i0, src0, i03, i02, &ev));
+
+            if (nb10 == sizeof(float)) {
+                // Contiguous, avoid overhead from queueing many kernel runs
+                const int64_t i13 = i03%ne13;
+                const int64_t i12 = i02%ne12;
+                const int i1 = i13*ne12*ne11 + i12*ne11;
+
+                cl_int x_offset = 0;
+                cl_int y_offset = i1*ne10;
+                cl_int d_offset = 0;
+
+                size_t global = ne00 * ne01;
+                cl_int ky = ne10;
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+            } else {
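+                // src1 is not contiguous in its first dimension: queue one kernel launch per
+                // row of src0, computing per-row offsets into x, y and dst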
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    const int64_t i13 = i03%ne13;
+                    const int64_t i12 = i02%ne12;
+                    const int64_t i11 = i01%ne11;
+                    const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
+
+                    cl_int x_offset = i01*ne00;
+                    cl_int y_offset = i1*ne10;
+                    cl_int d_offset = i01*ne00;
+
+                    // compute
+                    size_t global = ne00;
+                    cl_int ky = ne10;
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+                    CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+                }
+            }
+
+            CL_CHECK(clReleaseEvent(ev));
+            CL_CHECK(clFinish(queue));
+
+            // copy dst to host
+            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL));
+        }
+    }
+    ggml_cl_pool_free(d_X, x_size);
+    ggml_cl_pool_free(d_D, d_size);
+}
+
+void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cl_mul_f32(src0, src1, dst);
+}
+
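+// ggml_cl_mul() is the host entry point the CPU backend is expected to call when src1 already
+// lives in device memory. A minimal sketch of such a call site (the surrounding function and
+// guard are assumptions for illustration, not part of this change):
+//
+//     #if defined(GGML_USE_CLBLAST)
+//         if (src1->backend == GGML_BACKEND_CL) {
+//             ggml_cl_mul(src0, src1, dst);   // runs the mul_f32 kernel, result lands in dst on the host
+//             return;
+//         }
+//     #endif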
static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
@@ -860,44 +983,48 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
    cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type);
    GGML_ASSERT(to_fp32_cl != nullptr);

+    size_t ev_idx = 0;
+    std::vector<cl_event> events;
+
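+    // The uploads and kernels below are ordered through OpenCL event wait lists collected in
+    // `events`, replacing the blocking clFinish() calls that previously separated the steps;
+    // the events are released and the vector cleared once per (i03, i02) iteration.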
    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            cl_event ev_sgemm;
-
            // copy src0 to device if necessary
            if (src0->backend == GGML_BACKEND_CPU) {
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, NULL));
+                events.emplace_back();
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
            } else if (src0->backend == GGML_BACKEND_CL) {
                d_Q = (cl_mem) src0->data;
            } else {
                GGML_ASSERT(false);
            }
            if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
                // copy src1 to device
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+                events.emplace_back();
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));

                // compute
                const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
                const size_t local = CL_DMMV_BLOCK_SIZE;
                const cl_int ncols = ne00;
+                events.emplace_back();
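+                // the event just added is the one the kernel below will signal; the earlier
+                // entries (the pending host-to-device copies) form its wait list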
                CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
                CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
                CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
                CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
                CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
-                CL_CHECK(clFinish(queue));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, 0, NULL, &ev_sgemm));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
            } else { // general dequantization kernel + CLBlast matrix matrix multiplication
                // convert src0 to fp32 on device
                const size_t global = x_ne;
                CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
                CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
-                CL_CHECK(clFinish(queue));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, 0, NULL, NULL));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));

                // copy src1 to device
                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));

+                events.emplace_back();
+
                // wait for conversion
                CL_CHECK(clFinish(queue));

@@ -910,7 +1037,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                                                       d_Y, 0, ne10,
                                                       beta,
                                                       d_D, 0, ne01,
-                                                       &queue, &ev_sgemm);
+                                                       &queue, events.data() + ev_idx++);

            if (status != clblast::StatusCode::kSuccess) {
                GGML_ASSERT(false);
@@ -919,8 +1046,13 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *

            // copy dst to host
            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
-            clReleaseEvent(ev_sgemm);
+            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
+            for (auto *event : events) {
+                clReleaseEvent(event);
+            }
+
+            ev_idx = 0;
+            events.clear();
        }
    }
@@ -1026,3 +1158,33 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
    tensor->data = dst;
    tensor->backend = GGML_BACKEND_CL;
}
+
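+// Loads ggml_nbytes(tensor) bytes starting at `offset` in `fname` into a newly created device
+// buffer and repoints tensor->data at the cl_mem handle; the temporary host copy is freed.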
+void ggml_cl_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
+    cl_int err;
+    FILE * fp = fopen(fname, "rb");
+
+    const size_t size = ggml_nbytes(tensor);
+
+    cl_mem dst;
+    CL_CHECK((dst = clCreateBuffer(context, CL_MEM_READ_ONLY, size, nullptr, &err), err));
+    void * buf_host = malloc(size);
+
+#ifdef _WIN32
+    int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
+#else
+    int ret = fseek(fp, (long) offset, SEEK_SET);
+#endif
+    GGML_ASSERT(ret == 0); // seeking to the tensor data must succeed
+
+    size_t ret2 = fread(buf_host, size, 1, fp);
+    if (ret2 != 1) {
+        fprintf(stderr, "unexpectedly reached end of file");
+        exit(1);
+    }
+
+    clEnqueueWriteBuffer(queue, dst, CL_TRUE, 0, size, buf_host, 0, nullptr, nullptr);
+
+    tensor->data = dst;
+    free(buf_host);
+    fclose(fp);
+}