@@ -412,19 +412,21 @@ struct get_reduction_aux_2nd_kernel_name_t {
412
412
// /
413
413
/// Briefly: user's lambda, tree-reduction, CUSTOM types/ops.
414
414
template <typename KernelName, typename KernelType, int Dims, class Reduction >
415
- void reduCGFunc (handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
416
- Reduction &Redu) {
415
+ void reduCGFunc (handler &CGH, KernelType KernelFunc,
416
+ const nd_range<Dims> &Range, Reduction &Redu) {
417
417
418
418
size_t NWorkItems = Range.get_global_range ().size ();
419
419
size_t WGSize = Range.get_local_range ().size ();
420
420
size_t NWorkGroups = Range.get_group_range ().size ();
421
421
422
- bool IsUnderLoaded = (NWorkGroups * WGSize - NWorkItems) != 0 ;
423
- bool IsEfficientCase = !IsUnderLoaded && ((WGSize & (WGSize - 1 )) == 0 );
422
+ // The last work-group may not be fully loaded with work, or the work-group
423
+ // size may not be a power of two. Those two cases are considered inefficient
424
+ // as they require additional code and checks in the kernel.
425
+ bool HasNonUniformWG = (NWorkGroups * WGSize - NWorkItems) != 0 ;
426
+ bool IsEfficientCase = !HasNonUniformWG && ((WGSize & (WGSize - 1 )) == 0 );
424
427
425
428
bool IsUpdateOfUserAcc =
426
- Reduction::accessor_mode == access::mode::read_write &&
427
- NWorkGroups == 1 ;
429
+ Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1 ;
428
430
429
431
// Use local memory to reduce elements in work-groups into 0-th element.
430
432
// If WGSize is not a power of two, then WGSize+1 elements are allocated.
@@ -436,8 +438,7 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range
436
438
auto Out = Redu.getWriteAccForPartialReds (NWorkGroups, 0 , CGH);
437
439
auto ReduIdentity = Redu.getIdentity ();
438
440
if (IsEfficientCase) {
439
- // Efficient case: work-groups are fully loaded and work-group size
440
- // is power of two.
441
+ // Efficient case: work-groups are uniform and WGSize is a power of two.
441
442
CGH.parallel_for <KernelName>(Range, [=](nd_item<Dims> NDIt) {
442
443
// Call user's functions. Reducer.MValue gets initialized there.
443
444
typename Reduction::reducer_type Reducer (ReduIdentity);
@@ -464,13 +465,14 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range
464
465
: LocalReds[0 ];
465
466
});
466
467
} else {
467
- // Inefficient case: work-groups are not fully loaded
468
- // or WGSize is not power of two .
468
+ // Inefficient case: work-groups are non-uniform or WGSize is not a power
469
+ // of two, which requires more conditional, read, and write operations.
469
470
// These two inefficient cases are handled by one kernel, which
470
471
// can be split later into two separate kernels, if there are users who
471
472
// really need more efficient code for them.
472
- using AuxName = typename get_reduction_main_2nd_kernel_name_t <
473
- KernelName, KernelType>::name;
473
+ using AuxName =
474
+ typename get_reduction_main_2nd_kernel_name_t <KernelName,
475
+ KernelType>::name;
474
476
CGH.parallel_for <AuxName>(Range, [=](nd_item<Dims> NDIt) {
475
477
// Call user's functions. Reducer.MValue gets initialized there.
476
478
typename Reduction::reducer_type Reducer (ReduIdentity);
@@ -500,7 +502,7 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range
500
502
501
503
// Compute the partial sum/reduction for the work-group.
502
504
if (LID == 0 ) {
503
- auto GrID = NDIt.get_group_linear_id ();
505
+ size_t GrID = NDIt.get_group_linear_id ();
504
506
auto V = BOp (LocalReds[0 ], LocalReds[WGSize]);
505
507
Out.get_pointer ().get ()[GrID] =
506
508
IsUpdateOfUserAcc ? BOp (*(Out.get_pointer ()), V) : V;
@@ -518,19 +520,18 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range
518
520
/// Briefly: aux kernel, tree-reduction, CUSTOM types/ops.
519
521
template <typename KernelName, typename KernelType, int Dims, class Reduction >
520
522
void reduAuxCGFunc (handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
521
- size_t KernelRun, Reduction &Redu) {
523
+ size_t KernelRun, Reduction &Redu) {
522
524
size_t WGSize = Range.get_local_range ().size ();
523
525
size_t NWorkGroups = Range.get_group_range ().size ();
524
526
525
527
// The last work-group may not be fully loaded with work, or the work-group
526
- // size may be not power of those . Those two cases considered inefficient
528
+ // size may not be a power of two. Those two cases are considered inefficient
527
529
// as they require additional code and checks in the kernel.
528
- bool IsUnderLoaded = NWorkGroups * WGSize != NWorkItems;
529
- bool IsEfficientCase = !IsUnderLoaded && (WGSize & (WGSize - 1 )) == 0 ;
530
+ bool HasNonUniformWG = NWorkGroups * WGSize != NWorkItems;
531
+ bool IsEfficientCase = !HasNonUniformWG && (WGSize & (WGSize - 1 )) == 0 ;
530
532
531
533
bool IsUpdateOfUserAcc =
532
- Reduction::accessor_mode == access::mode::read_write &&
533
- NWorkGroups == 1 ;
534
+ Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1 ;
534
535
535
536
// Use local memory to reduce elements in work-groups into 0-th element.
536
537
// If WGSize is not a power of two, then WGSize+1 elements are allocated.
@@ -549,8 +550,9 @@ void reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
549
550
if (IsEfficientCase) {
550
551
// Efficient case: work-groups are fully loaded and work-group size
551
552
// is a power of two.
552
- using AuxName = typename get_reduction_aux_1st_kernel_name_t <
553
- KernelName, KernelType>::name;
553
+ using AuxName =
554
+ typename get_reduction_aux_1st_kernel_name_t <KernelName,
555
+ KernelType>::name;
554
556
CGH.parallel_for <AuxName>(Range, [=](nd_item<Dims> NDIt) {
555
557
// Copy the element to local memory to prepare it for tree-reduction.
556
558
size_t LID = NDIt.get_local_linear_id ();
@@ -579,8 +581,9 @@ void reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
579
581
// These two inefficient cases are handled by one kernel, which
580
582
// can be split later into two separate kernels, if there are users
581
583
// who really need more efficient code for them.
582
- using AuxName = typename get_reduction_aux_2nd_kernel_name_t <
583
- KernelName, KernelType>::name;
584
+ using AuxName =
585
+ typename get_reduction_aux_2nd_kernel_name_t <KernelName,
586
+ KernelType>::name;
584
587
auto ReduIdentity = Redu.getIdentity ();
585
588
CGH.parallel_for <AuxName>(Range, [=](nd_item<Dims> NDIt) {
586
589
size_t WGSize = NDIt.get_local_range ().size ();
@@ -607,7 +610,7 @@ void reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
607
610
608
611
// Compute the partial sum/reduction for the work-group.
609
612
if (LID == 0 ) {
610
- auto GrID = NDIt.get_group_linear_id ();
613
+ size_t GrID = NDIt.get_group_linear_id ();
611
614
auto V = BOp (LocalReds[0 ], LocalReds[WGSize]);
612
615
Out.get_pointer ().get ()[GrID] =
613
616
IsUpdateOfUserAcc ? BOp (*(Out.get_pointer ()), V) : V;
0 commit comments