@@ -356,33 +356,6 @@ static void clamp(const T * x, T * dst, const float min, const float max, const
     }
 }

-template <typename T>
-static void gated_op_fused_geglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        const int64_t j0 = (i / n) * o0 + (i % n);
-        const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
-        dst[i] = op_gelu(x[j0]) * g[j1];
-    }
-}
-
-template <typename T>
-static void gated_op_fused_reglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        const int64_t j0 = (i / n) * o0 + (i % n);
-        const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
-        dst[i] = op_relu(x[j0]) * g[j1];
-    }
-}
-
-template <typename T>
-static void gated_op_fused_swiglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        const int64_t j0 = (i / n) * o0 + (i % n);
-        const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
-        dst[i] = op_silu(x[j0]) * g[j1];
-    }
-}
-
 namespace ggml_sycl_detail {
 static void acc_f32_sycl(const float *x, const float *y, float *dst,
                          const int n_elements, const int ne10, const int ne11,
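For reference, the gated kernels removed above all share one indexing pattern: output element i maps to row i / n and column i % n, with separate row strides o0 and o1 for the activation and gate buffers (collapsed to a single index when the strides match). A minimal host-side C++ sketch of the same computation follows; op_gelu_ref and geglu_ref are illustrative names, and the tanh-based GELU approximation is an assumption, not the backend's exact op_gelu.

#include <cmath>
#include <cstdint>

// tanh-based GELU approximation (assumed; the backend's op_gelu may differ)
static float op_gelu_ref(float x) {
    return 0.5f * x * (1.0f + std::tanh(0.797884561f * (x + 0.044715f * x * x * x)));
}

// k: total output elements, n: row width, o0/o1: row strides (in elements)
// of the activation and gate buffers respectively.
static void geglu_ref(const float * x, const float * g, float * dst,
                      uint64_t k, uint64_t n, uint64_t o0, uint64_t o1) {
    for (uint64_t i = 0; i < k; ++i) {
        const int64_t j0 = (i / n) * o0 + (i % n);                  // activation index
        const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);  // gate index
        dst[i] = op_gelu_ref(x[j0]) * g[j1];
    }
}

The reglu and swiglu variants differ only in the activation applied to x[j0].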
@@ -457,85 +430,6 @@ static inline void dispatch_ggml_sycl_op_unary(ggml_backend_sycl_context & ctx,
     }
 }

-template <typename KernelInvoker, typename... Args>
-static inline void dispatch_ggml_sycl_op_fused_glu(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
-#if defined(GGML_SYCL_F16)
-    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-#else
-    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-#endif
-    GGML_ASSERT(dst->src[0]->type == dst->type);
-    dpct::queue_ptr main_stream = ctx.stream();
-    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
-    const ggml_tensor * src0 = dst->src[0];
-    const ggml_tensor * src1 = dst->src[1];
-    const int64_t nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
-    GGML_ASSERT(dst->ne[0] == nc);
-    GGML_ASSERT(ggml_is_contiguous_1(dst->src[0]));
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    const int32_t swapped = ((const int32_t *) dst->op_params)[1];
-    void * src0_d = src0->data;
-    void * src1_d = src1 ? src1->data : src0->data;
-    const int64_t src0_o = src0->nb[1];
-    const int64_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
-    void * dst_d = dst->data;
-    if (src1) {
-        GGML_ASSERT(ggml_is_contiguous_1(src1));
-        GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));
-        GGML_ASSERT(src1->ne[0] == nc);
-        GGML_ASSERT(src0->type == src1->type);
-    }
-    switch (dst->type) {
-#if defined(GGML_SYCL_F16)
-        case GGML_TYPE_F16:
-            {
-                sycl::half * src0_p = (sycl::half *) src0_d;
-                sycl::half * src1_p = (sycl::half *) src1_d;
-
-                if (!src1) {
-                    src0_p += swapped ? nc : 0;
-                    src1_p += swapped ? 0 : nc;
-                }
-                kernel_invoker(src0_p,
-                               src1_p,
-                               (sycl::half *) dst_d,
-                               ggml_nelements(dst),
-                               nc,
-                               src0_o / sizeof(sycl::half),
-                               src1_o / sizeof(sycl::half),
-                               main_stream,
-                               std::forward<Args>(args)...);
-                break;
-            }
-#endif
-        case GGML_TYPE_F32:
-            {
-                float * src0_p = (float *) src0_d;
-                float * src1_p = (float *) src1_d;
-
-                if (!src1) {
-                    src0_p += swapped ? nc : 0;
-                    src1_p += swapped ? 0 : nc;
-                }
-
-                kernel_invoker(src0_p,
-                               src1_p,
-                               (float *) dst_d,
-                               ggml_nelements(dst),
-                               nc,
-                               src0_o / sizeof(float),
-                               src1_o / sizeof(float),
-                               main_stream,
-                               std::forward<Args>(args)...);
-                break;
-            }
-        default:
-            GGML_ABORT("GGML tensor type not supported!\n");
-    }
-}
-
 template <typename KernelInvoker, typename... Args>
 static inline void dispatch_ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
 #if defined(GGML_SYCL_F16)
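The removed dispatcher above also encoded the fused single-input layout: when src1 is absent, the activation and gate halves share one row of width 2*nc, and op_params[1] ("swapped") selects which half gates the other. A small sketch of that pointer split, using a hypothetical helper name:

#include <cstdint>

// Hypothetical helper mirroring the removed dispatcher's single-input case:
// base points at a fused row of width 2*nc; "swapped" picks which half is the
// gate. The dispatcher later passed row strides to the kernel in elements,
// i.e. nb[1] / sizeof(element).
static void split_fused_glu_row(float * base, int64_t nc, bool swapped,
                                float ** x_half, float ** gate_half) {
    *x_half    = base + (swapped ? nc : 0);
    *gate_half = base + (swapped ? 0 : nc);
}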
@@ -945,40 +839,6 @@ static inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor
     ggml_sycl_detail::acc_f32_sycl(src0_dd, src1_dd, dst_dd, (int)ggml_nelements(dst), (int)dst->src[1]->ne[0], (int)dst->src[1]->ne[1], (int)dst->src[1]->ne[2], nb1, nb2, offset, main_stream);
 }

-static inline void ggml_sycl_op_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
-        [](const auto * x_ptr, const auto * g_ptr, auto * dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
-            const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
-            sycl_parallel_for(main_stream,
-                sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
-                    gated_op_fused_geglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
-                });
-        });
-}
-
-static inline void ggml_sycl_op_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
-        [](const auto * x_ptr, const auto * g_ptr, auto * dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
-            const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_RELU_BLOCK_SIZE); // Using RELU block size for reglu
-            sycl_parallel_for(main_stream,
-                sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
-                    gated_op_fused_reglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
-                });
-        });
-}
-
-static inline void ggml_sycl_op_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
-        [](const auto * x_ptr, const auto * g_ptr, auto * dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
-            const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_SILU_BLOCK_SIZE); // Using SILU block size for swiglu
-            sycl_parallel_for(main_stream,
-                sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
-                    gated_op_fused_swiglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
-                });
-        });
-}
-
-
 void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_sqrt(ctx, dst);
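Each removed wrapper above launched its kernel over ceil(k / block_size) blocks, so the nd_range covers every output element while the kernel's global-id loop discards indices past k. A sketch of that grid arithmetic; ceil_div_ref is an illustrative name and 256 an illustrative block width, not the backend's constants:

#include <cstdint>

// ceil(a / b) in integer arithmetic, as used to size the launch grid
static uint32_t ceil_div_ref(uint32_t a, uint32_t b) {
    return (a + b - 1) / b;
}

// Example: k = 1000 elements with a 256-wide block gives
// ceil_div_ref(1000, 256) == 4 blocks, i.e. 1024 work items; the trailing
// 24 are skipped by the bounds check inside the kernel's global-id loop.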
@@ -1104,17 +964,3 @@ void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_op_elu(ctx, dst);
 }

-void ggml_sycl_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
-    ggml_sycl_op_geglu(ctx, dst);
-}
-
-void ggml_sycl_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
-    ggml_sycl_op_reglu(ctx, dst);
-}
-
-void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
-    ggml_sycl_op_swiglu(ctx, dst);
-}