Remove logic to select max_work_group_size

oleksandr-pavlyk · oleksandr-pavlyk · commit 39c8bd933483 · 2023-08-26T01:53:21.000-05:00
The logic was misguided. It was based on the idea that if
using max-work-group-size would lead to launching just a
single work-group, then we could reduce everything within
the work-group and avoid atomics altogether.

This led to problems on CPU, where max-work-group-size is 8192,
and max-work-group-size was selected, but the total number of
work-groups launched was high due to large iteration space size,
and this resulted in severe underutilization of the device (low
occupancy).
diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp
@@ -342,20 +342,6 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl(
                 (reduction_nelems + reductions_per_wi * wg - 1) /
                 (reductions_per_wi * wg);
 
-            if (reduction_groups > 1) {
-                const size_t &max_wg =
-                    d.get_info<sycl::info::device::max_work_group_size>();
-
-                if (reduction_nelems < preferrered_reductions_per_wi * max_wg) {
-                    wg = max_wg;
-                    reductions_per_wi =
-                        std::max<size_t>(1, (reduction_nelems + wg - 1) / wg);
-                    reduction_groups =
-                        (reduction_nelems + reductions_per_wi * wg - 1) /
-                        (reductions_per_wi * wg);
-                }
-            }
-
             auto globalRange =
                 sycl::range<1>{iter_nelems * reduction_groups * wg};
             auto localRange = sycl::range<1>{wg};
@@ -479,20 +465,6 @@ sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl(
                 (reduction_nelems + reductions_per_wi * wg - 1) /
                 (reductions_per_wi * wg);
 
-            if (reduction_groups > 1) {
-                const size_t &max_wg =
-                    d.get_info<sycl::info::device::max_work_group_size>();
-
-                if (reduction_nelems < preferrered_reductions_per_wi * max_wg) {
-                    wg = max_wg;
-                    reductions_per_wi =
-                        std::max<size_t>(1, (reduction_nelems + wg - 1) / wg);
-                    reduction_groups =
-                        (reduction_nelems + reductions_per_wi * wg - 1) /
-                        (reductions_per_wi * wg);
-                }
-            }
-
             auto globalRange =
                 sycl::range<1>{iter_nelems * reduction_groups * wg};
             auto localRange = sycl::range<1>{wg};
@@ -574,20 +546,6 @@ sycl::event sum_reduction_axis0_over_group_with_atomics_contig_impl(
                 (reduction_nelems + reductions_per_wi * wg - 1) /
                 (reductions_per_wi * wg);
 
-            if (reduction_groups > 1) {
-                const size_t &max_wg =
-                    d.get_info<sycl::info::device::max_work_group_size>();
-
-                if (reduction_nelems < preferrered_reductions_per_wi * max_wg) {
-                    wg = max_wg;
-                    reductions_per_wi =
-                        std::max<size_t>(1, (reduction_nelems + wg - 1) / wg);
-                    reduction_groups =
-                        (reduction_nelems + reductions_per_wi * wg - 1) /
-                        (reductions_per_wi * wg);
-                }
-            }
-
             auto globalRange =
                 sycl::range<1>{iter_nelems * reduction_groups * wg};
             auto localRange = sycl::range<1>{wg};