[SYCL] Additional mostly NFC changes for reduction patch(1585)

v-klochkov · v-klochkov · commit f241d2bccf6e · 2020-04-28T17:41:19.000-07:00
Removed handler::dissociateWithHandler().
Removed handler::addEventToQueue() and made queue_impl::addEvent() private again.
Minor changes in comments.
Replaced 'auto' with 'size_t' in a couple of places.

Signed-off-by: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
@@ -213,13 +213,6 @@ class __SYCL_EXPORT handler {
   /// usage in finalize() method.
   void saveCodeLoc(detail::code_location CodeLoc) { MCodeLoc = CodeLoc; }
 
-  /// Stores the given \param Event to the \param Queue.
-  /// Even though MQueue is a field of handler, the method addEvent() of
-  /// queue_impl class cannot be called inside this handler.hpp file
-  /// as queue_impl is incomplete class for handler.
-  static void addEventToQueue(shared_ptr_class<detail::queue_impl> Queue,
-                              cl::sycl::event Event);
-
   /// Constructs CG object of specific type, passes it to Scheduler and
   /// returns sycl::event object representing the command group.
   /// It's expected that the method is the latest method executed before
@@ -270,30 +263,6 @@ class __SYCL_EXPORT handler {
                                      /*index*/ 0);
   }
 
-  template <typename DataT, int Dims, access::mode AccessMode,
-            access::target AccessTarget>
-  void dissociateWithHandler(accessor<DataT, Dims, AccessMode, AccessTarget,
-                                      access::placeholder::false_t>
-                                 Acc) {
-    detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Acc;
-    detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
-    detail::Requirement *Req = AccImpl.get();
-
-    // Remove accessor from the list of requirements, accessors storage,
-    // and from the list of associated accessors.
-    auto ReqIt = std::find(MRequirements.begin(), MRequirements.end(), Req);
-    auto AccIt = std::find(MAccStorage.begin(), MAccStorage.end(), AccImpl);
-    auto It =
-        std::find_if(MAssociatedAccesors.begin(), MAssociatedAccesors.end(),
-                     [Req](const detail::ArgDesc &D) { return D.MPtr == Req; });
-    assert((ReqIt != MRequirements.end() && AccIt != MAccStorage.end() &&
-            It != MAssociatedAccesors.end()) &&
-           "Cannot dissociate accessor.");
-    MRequirements.erase(ReqIt);
-    MAccStorage.erase(AccIt);
-    MAssociatedAccesors.erase(It);
-  }
-
   // Recursively calls itself until arguments pack is fully processed.
   // The version for regular(standard layout) argument.
   template <typename T, typename... Ts>
@@ -832,30 +801,23 @@ class __SYCL_EXPORT handler {
     //    necessary to reduce all partial sums into one final sum.
 
     // 1. Call the kernel that includes user's lambda function.
-    // If this kernel is going to be now last one, i.e. it does not write
-    // to user's accessor, then detach user's accessor from this kernel
-    // to make the dependencies between accessors and kernels more clean and
-    // correct.
-    if (NWorkGroups > 1)
-      dissociateWithHandler(Redu.MAcc);
-
     intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
     auto QueueCopy = MQueue;
     MLastEvent = this->finalize();
 
     // 2. Run the additional aux kernel as many times as needed to reduce
     // all partial sums into one scalar.
+
+    // TODO: user's nd_range and the work-group size specified there must
+    // be honored only for the main kernel that calls user's lambda functions.
+    // There is no need to use the same work-group size in these additional
+    // kernels. Thus, the better strategy here is to make the work-group size
+    // as big as possible to converge/reduce the partial sums into the last
+    // sum faster.
     size_t WGSize = Range.get_local_range().size();
     size_t NWorkItems = NWorkGroups;
     size_t KernelRun = 1;
     while (NWorkItems > 1) {
-      // Before creating another kernel, add the event from the previous kernel
-      // to queue.
-      addEventToQueue(QueueCopy, MLastEvent);
-
-      // TODO: here the work-group size is not limited by user's needs,
-      // the better strategy here is to make the work-group-size as big
-      // as possible.
       WGSize = std::min(WGSize, NWorkItems);
       NWorkGroups = NWorkItems / WGSize;
       // The last group may be not fully loaded. Still register it as a group.
diff --git a/sycl/include/CL/sycl/intel/reduction.hpp b/sycl/include/CL/sycl/intel/reduction.hpp
@@ -412,19 +412,21 @@ struct get_reduction_aux_2nd_kernel_name_t {
 ///
 /// Briefly: user's lambda, tree-reduction, CUSTOM types/ops.
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
-                Reduction &Redu) {
+void reduCGFunc(handler &CGH, KernelType KernelFunc,
+                const nd_range<Dims> &Range, Reduction &Redu) {
 
   size_t NWorkItems = Range.get_global_range().size();
   size_t WGSize = Range.get_local_range().size();
   size_t NWorkGroups = Range.get_group_range().size();
 
-  bool IsUnderLoaded = (NWorkGroups * WGSize - NWorkItems) != 0;
-  bool IsEfficientCase = !IsUnderLoaded && ((WGSize & (WGSize - 1)) == 0);
+  // The last work-group may not be fully loaded with work, or the work-group
+  // size may not be a power of two. Those two cases are considered inefficient
+  // as they require additional code and checks in the kernel.
+  bool HasNonUniformWG = (NWorkGroups * WGSize - NWorkItems) != 0;
+  bool IsEfficientCase = !HasNonUniformWG && ((WGSize & (WGSize - 1)) == 0);
 
   bool IsUpdateOfUserAcc =
-      Reduction::accessor_mode == access::mode::read_write &&
-      NWorkGroups == 1;
+      Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
 
   // Use local memory to reduce elements in work-groups into 0-th element.
   // If WGSize is not power of two, then WGSize+1 elements are allocated.
@@ -436,8 +438,7 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range
   auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, 0, CGH);
   auto ReduIdentity = Redu.getIdentity();
   if (IsEfficientCase) {
-    // Efficient case: work-groups are fully loaded and work-group size
-    // is power of two.
+    // Efficient case: work-groups are uniform and WGSize is a power of two.
     CGH.parallel_for<KernelName>(Range, [=](nd_item<Dims> NDIt) {
       // Call user's functions. Reducer.MValue gets initialized there.
       typename Reduction::reducer_type Reducer(ReduIdentity);
@@ -464,13 +465,14 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range
                               : LocalReds[0];
     });
   } else {
-    // Inefficient case: work-groups are not fully loaded
-    // or WGSize is not power of two.
+    // Inefficient case: work-groups are non-uniform or WGSize is not power
+    // of two, which requires more conditional, read and write operations.
     // These two inefficient cases are handled by one kernel, which
     // can be split later into two separate kernels, if there are users who
     // really need more efficient code for them.
-    using AuxName = typename get_reduction_main_2nd_kernel_name_t<
-        KernelName, KernelType>::name;
+    using AuxName =
+        typename get_reduction_main_2nd_kernel_name_t<KernelName,
+                                                      KernelType>::name;
     CGH.parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
       // Call user's functions. Reducer.MValue gets initialized there.
       typename Reduction::reducer_type Reducer(ReduIdentity);
@@ -500,7 +502,7 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range
 
       // Compute the partial sum/reduction for the work-group.
       if (LID == 0) {
-        auto GrID = NDIt.get_group_linear_id();
+        size_t GrID = NDIt.get_group_linear_id();
         auto V = BOp(LocalReds[0], LocalReds[WGSize]);
         Out.get_pointer().get()[GrID] =
             IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
@@ -518,19 +520,18 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range
 /// Briefly: aux kernel, tree-reduction, CUSTOM types/ops.
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
 void reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
-                    size_t KernelRun, Reduction &Redu) {
+                   size_t KernelRun, Reduction &Redu) {
   size_t WGSize = Range.get_local_range().size();
   size_t NWorkGroups = Range.get_group_range().size();
 
   // The last work-group may be not fully loaded with work, or the work group
-  // size may be not power of those. Those two cases considered inefficient
+  // size may not be a power of two. Those two cases are considered inefficient
   // as they require additional code and checks in the kernel.
-  bool IsUnderLoaded = NWorkGroups * WGSize != NWorkItems;
-  bool IsEfficientCase = !IsUnderLoaded && (WGSize & (WGSize - 1)) == 0;
+  bool HasNonUniformWG = NWorkGroups * WGSize != NWorkItems;
+  bool IsEfficientCase = !HasNonUniformWG && (WGSize & (WGSize - 1)) == 0;
 
   bool IsUpdateOfUserAcc =
-      Reduction::accessor_mode == access::mode::read_write &&
-      NWorkGroups == 1;
+      Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
 
   // Use local memory to reduce elements in work-groups into 0-th element.
   // If WGSize is not power of two, then WGSize+1 elements are allocated.
@@ -549,8 +550,9 @@ void reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
   if (IsEfficientCase) {
     // Efficient case: work-groups are fully loaded and work-group size
     // is power of two.
-    using AuxName = typename get_reduction_aux_1st_kernel_name_t<
-        KernelName, KernelType>::name;
+    using AuxName =
+        typename get_reduction_aux_1st_kernel_name_t<KernelName,
+                                                     KernelType>::name;
     CGH.parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
       // Copy the element to local memory to prepare it for tree-reduction.
       size_t LID = NDIt.get_local_linear_id();
@@ -579,8 +581,9 @@ void reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
     // These two inefficient cases are handled by one kernel, which
     // can be split later into two separate kernels, if there are users
     // who really need more efficient code for them.
-    using AuxName = typename get_reduction_aux_2nd_kernel_name_t<
-        KernelName, KernelType>::name;
+    using AuxName =
+        typename get_reduction_aux_2nd_kernel_name_t<KernelName,
+                                                     KernelType>::name;
     auto ReduIdentity = Redu.getIdentity();
     CGH.parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
       size_t WGSize = NDIt.get_local_range().size();
@@ -607,7 +610,7 @@ void reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
 
       // Compute the partial sum/reduction for the work-group.
       if (LID == 0) {
-        auto GrID = NDIt.get_group_linear_id();
+        size_t GrID = NDIt.get_group_linear_id();
         auto V = BOp(LocalReds[0], LocalReds[WGSize]);
         Out.get_pointer().get()[GrID] =
             IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp
@@ -351,11 +351,6 @@ class queue_impl {
   /// \return a native handle.
   pi_native_handle getNative() const;
 
-  /// Stores an event that should be associated with the queue
-  ///
-  /// \param Event is the event to be stored
-  void addEvent(event Event);
-
 private:
   /// Performs command group submission to the queue.
   ///
@@ -388,6 +383,11 @@ class queue_impl {
   /// \param Event is the event to be stored
   void addUSMEvent(event Event);
 
+  /// Stores an event that should be associated with the queue
+  ///
+  /// \param Event is the event to be stored
+  void addEvent(event Event);
+
   /// Protects all the fields that can be changed by class' methods.
   mutex_class MMutex;
 
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
@@ -19,11 +19,6 @@
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
 
-void handler::addEventToQueue(shared_ptr_class<detail::queue_impl> Queue,
-                              cl::sycl::event Event) {
-  Queue->addEvent(std::move(Event));
-}
-
 event handler::finalize() {
   // This block of code is needed only for reduction implementation.
   // It is harmless (does nothing) for everything else.
diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump
@@ -3231,7 +3231,6 @@ _ZN2cl4sycl7handler10processArgEPvRKNS0_6detail19kernel_param_kind_tEimRmb
 _ZN2cl4sycl7handler13getKernelNameB5cxx11Ev
 _ZN2cl4sycl7handler18extractArgsAndReqsEv
 _ZN2cl4sycl7handler28extractArgsAndReqsFromLambdaEPcmPKNS0_6detail19kernel_param_desc_tE
-_ZN2cl4sycl7handler15addEventToQueueESt10shared_ptrINS0_6detail10queue_implEENS0_5eventE
 _ZN2cl4sycl7handler8finalizeEv
 _ZN2cl4sycl7program17build_with_sourceENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_
 _ZN2cl4sycl7program19compile_with_sourceENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_