@@ -439,108 +439,6 @@ struct ggml_backend_opencl_context {
439439 cl_kernel kernel_timestep_embedding;
440440 cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
441441
442- std::vector<ProfilingInfo> profiling_info;
443-
444- void write_profiling_info () {
445- FILE * fperf = fopen (" cl_profiling.csv" , " w" );
446- if (!fperf) {
447- GGML_LOG_ERROR (" Failed to open cl_profiling.csv\n " );
448- return ;
449- }
450-
451- // Populate profiling info
452- for (ProfilingInfo & info : profiling_info) {
453- cl_ulong cmd_queued;
454- cl_ulong cmd_submit;
455- cl_ulong cmd_start;
456- cl_ulong cmd_end;
457- cl_ulong cmd_complete;
458-
459- CL_CHECK (clWaitForEvents (1 , &info.evt ));
460- CL_CHECK (clGetEventProfilingInfo (
461- info.evt , CL_PROFILING_COMMAND_QUEUED, sizeof (cl_ulong), &cmd_queued, NULL ));
462- CL_CHECK (clGetEventProfilingInfo (
463- info.evt , CL_PROFILING_COMMAND_SUBMIT, sizeof (cl_ulong), &cmd_submit, NULL ));
464- CL_CHECK (clGetEventProfilingInfo (
465- info.evt , CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &cmd_start, NULL ));
466- CL_CHECK (clGetEventProfilingInfo (
467- info.evt , CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &cmd_end, NULL ));
468- CL_CHECK (clGetEventProfilingInfo (
469- info.evt , CL_PROFILING_COMMAND_COMPLETE, sizeof (cl_ulong), &cmd_complete, NULL ));
470- CL_CHECK (clReleaseEvent (info.evt ));
471-
472- char kernel_name[512 ];
473- CL_CHECK (clGetKernelInfo (info.kernel , CL_KERNEL_FUNCTION_NAME,
474- sizeof (kernel_name), kernel_name, NULL ));
475- info.kernel_name = kernel_name;
476-
477- info.cmd_queued = cmd_queued;
478- info.cmd_submit = cmd_submit;
479- info.cmd_start = cmd_start;
480- info.cmd_end = cmd_end;
481-
482- info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
483- info.cmd_submit_duration_ns = cmd_start - cmd_submit;
484- info.cmd_duration_ns = cmd_end - cmd_start;
485- info.cmd_complete_duration_ns = cmd_complete - cmd_end;
486- info.cmd_total_duration_ns = cmd_complete - cmd_queued;
487- }
488-
489- // Dump a csv
490- float total_kernel_time = 0 ;
491- fprintf (fperf, " op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n " );
492- for (const ProfilingInfo & info : profiling_info) {
493- total_kernel_time += info.cmd_duration_ns /1 .e6f ;
494- fprintf (fperf, " %s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n " ,
495- info.op_name .c_str (), info.kernel_name .c_str (),
496- info.cmd_queued_duration_ns /1 .e6f ,
497- info.cmd_submit_duration_ns /1 .e6f ,
498- info.cmd_duration_ns /1 .e6f ,
499- info.cmd_complete_duration_ns /1 .e6f ,
500- info.cmd_total_duration_ns /1 .e6f ,
501- info.global_size [0 ], info.global_size [1 ], info.global_size [2 ],
502- info.local_size [0 ], info.local_size [1 ], info.local_size [2 ],
503- info.output_size [0 ], info.output_size [1 ], info.output_size [2 ], info.output_size [3 ]);
504- }
505- fclose (fperf);
506-
507- GGML_LOG_INFO (" ggml_opencl: total kernel time: %f\n " , total_kernel_time);
508-
509- // Dump a simple chrome trace
510- FILE* ftrace = fopen (" cl_trace.json" , " w" );
511- if (!ftrace) {
512- GGML_LOG_ERROR (" Failed to open cl_trace.json\n " );
513- return ;
514- }
515-
516- fprintf (ftrace, " [\n " );
517- for (const ProfilingInfo & info : profiling_info) {
518- fprintf (ftrace, " {\" name\" : \" %s\" , \" cat\" : \" OpenCL\" , \" ph\" : \" B\" , \" ts\" : %lu, \" pid\" : \"\" , \" tid\" : \" Host\" },\n " ,
519- info.kernel_name .c_str (), info.cmd_queued /1000 );
520- fprintf (ftrace, " {\" name\" : \" %s\" , \" cat\" : \" OpenCL\" , \" ph\" : \" E\" , \" ts\" : %lu, \" pid\" : \"\" , \" tid\" : \" Host\" },\n " ,
521- info.kernel_name .c_str (), info.cmd_submit /1000 );
522-
523- fprintf (ftrace, " {\" name\" : \" %s\" , \" cat\" : \" OpenCL\" , \" ph\" : \" B\" , \" ts\" : %lu, \" pid\" : \"\" , \" tid\" : \" Device\" },\n " ,
524- info.kernel_name .c_str (), info.cmd_start /1000 );
525- fprintf (ftrace, " {\" name\" : \" %s\" , \" cat\" : \" OpenCL\" , \" ph\" : \" E\" , \" ts\" : %lu, \" pid\" : \"\" , \" tid\" : \" Device\" },\n " ,
526- info.kernel_name .c_str (), info.cmd_end /1000 );
527- }
528- fclose (ftrace);
529- }
530-
531- void enqueue_ndrange_kernel (cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) {
532- #ifdef GGML_OPENCL_PROFILING
533- cl_event evt;
534- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, work_dim, NULL , global_work_size, local_work_size, 0 , NULL , &evt));
535-
536- profiling_info.emplace_back ();
537- populateProfilingInfo (profiling_info.back (), evt, kernel, work_dim, global_work_size, local_work_size, tensor);
538- #else
539- GGML_UNUSED (tensor);
540- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, work_dim, NULL , global_work_size, local_work_size, 0 , NULL , NULL ));
541- #endif
542- }
543-
544442#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
545443 // Transpose kernels
546444 cl_program program_transpose;
@@ -5384,7 +5282,15 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
53845282 size_t global_work_size[] = {(size_t )ne01*nth0, (size_t )ny*nth1, (size_t )ne12*ne13};
53855283 size_t local_work_size[] = {(size_t )nth0, (size_t )nth1, 1 };
53865284
5387- backend_ctx->enqueue_ndrange_kernel (kernel, 3 , global_work_size, local_work_size, dst);
5285+ #ifdef GGML_OPENCL_PROFILING
5286+ cl_event evt;
5287+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size, 0 , NULL , &evt));
5288+
5289+ g_profiling_info.emplace_back ();
5290+ populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size, dst);
5291+ #else
5292+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size, 0 , NULL , NULL ));
5293+ #endif
53885294 }
53895295}
53905296
@@ -5401,6 +5307,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
54015307 GGML_ASSERT (src2->extra );
54025308
54035309 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context ;
5310+ cl_command_queue queue = backend_ctx->queue ;
54045311
54055312 ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra ;
54065313 ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra ;
@@ -5506,7 +5413,15 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
55065413 size_t global_work_size[] = {(size_t )(ne01+ndst*nsg-1 )/(ndst*nsg)*sgs, (size_t )(_ne1+nrows-1 )/nrows*nsg, (size_t )ne123};
55075414 size_t local_work_size[] = {(size_t )sgs, (size_t )nsg, 1 };
55085415
5509- backend_ctx->enqueue_ndrange_kernel (kernel, 3 , global_work_size, local_work_size, dst);
5416+ #ifdef GGML_OPENCL_PROFILING
5417+ cl_event evt;
5418+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size, 0 , NULL , &evt));
5419+
5420+ g_profiling_info.emplace_back ();
5421+ populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size, dst);
5422+ #else
5423+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size, 0 , NULL , NULL ));
5424+ #endif
55105425}
55115426
55125427static void ggml_cl_scale (ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
0 commit comments