Skip to content

Commit 39c8bd9

Browse files
Remove logic to select max_work_group_size
The logic was misguided, and based on the idea that if using max-work-group-size can lead to launching just a single work-group, then we can reduce everything within the work-group and not use atomics altogether. This lead to problems on CPU, where max-work-group-size is 8192, and max-work-group size was selected, but the total number of work-groups launched was high due to large iteration space size, and this resulted in severe underutilization of the device (low ocupancy).
1 parent 189e8aa commit 39c8bd9

File tree

1 file changed

+0
-42
lines changed

1 file changed

+0
-42
lines changed

dpctl/tensor/libtensor/include/kernels/reductions.hpp

Lines changed: 0 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -342,20 +342,6 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl(
342342
(reduction_nelems + reductions_per_wi * wg - 1) /
343343
(reductions_per_wi * wg);
344344

345-
if (reduction_groups > 1) {
346-
const size_t &max_wg =
347-
d.get_info<sycl::info::device::max_work_group_size>();
348-
349-
if (reduction_nelems < preferrered_reductions_per_wi * max_wg) {
350-
wg = max_wg;
351-
reductions_per_wi =
352-
std::max<size_t>(1, (reduction_nelems + wg - 1) / wg);
353-
reduction_groups =
354-
(reduction_nelems + reductions_per_wi * wg - 1) /
355-
(reductions_per_wi * wg);
356-
}
357-
}
358-
359345
auto globalRange =
360346
sycl::range<1>{iter_nelems * reduction_groups * wg};
361347
auto localRange = sycl::range<1>{wg};
@@ -479,20 +465,6 @@ sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl(
479465
(reduction_nelems + reductions_per_wi * wg - 1) /
480466
(reductions_per_wi * wg);
481467

482-
if (reduction_groups > 1) {
483-
const size_t &max_wg =
484-
d.get_info<sycl::info::device::max_work_group_size>();
485-
486-
if (reduction_nelems < preferrered_reductions_per_wi * max_wg) {
487-
wg = max_wg;
488-
reductions_per_wi =
489-
std::max<size_t>(1, (reduction_nelems + wg - 1) / wg);
490-
reduction_groups =
491-
(reduction_nelems + reductions_per_wi * wg - 1) /
492-
(reductions_per_wi * wg);
493-
}
494-
}
495-
496468
auto globalRange =
497469
sycl::range<1>{iter_nelems * reduction_groups * wg};
498470
auto localRange = sycl::range<1>{wg};
@@ -574,20 +546,6 @@ sycl::event sum_reduction_axis0_over_group_with_atomics_contig_impl(
574546
(reduction_nelems + reductions_per_wi * wg - 1) /
575547
(reductions_per_wi * wg);
576548

577-
if (reduction_groups > 1) {
578-
const size_t &max_wg =
579-
d.get_info<sycl::info::device::max_work_group_size>();
580-
581-
if (reduction_nelems < preferrered_reductions_per_wi * max_wg) {
582-
wg = max_wg;
583-
reductions_per_wi =
584-
std::max<size_t>(1, (reduction_nelems + wg - 1) / wg);
585-
reduction_groups =
586-
(reduction_nelems + reductions_per_wi * wg - 1) /
587-
(reductions_per_wi * wg);
588-
}
589-
}
590-
591549
auto globalRange =
592550
sycl::range<1>{iter_nelems * reduction_groups * wg};
593551
auto localRange = sycl::range<1>{wg};

0 commit comments

Comments
 (0)