Commit 544fb7c

[SYCL][CUDA] atomic_ref.fetch_add used for fp64 reduction if device.has(atomic64) (#3950)
The atomic64 device aspect has been added; currently only the CUDA backend supports it. SYCL 2020 introduces the atomic64 aspect, which is required for using atomic_ref.fetch_add with an fp64 operand. These changes allow devices with the atomic64 aspect to use a specialized reduction for the add operator that applies atomics at the work-group level via atomic_ref. If the atomic64 aspect is not available, the existing default implementation, which does not use atomic operations, is used. Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
1 parent d1d1eb2 commit 544fb7c
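For context, the user-facing pattern this change accelerates looks roughly like the following. This is a sketch, not part of the commit: the kernel name, the sizes, and the ONEAPI reduction extension interface of this era are illustrative assumptions.

#include <CL/sycl.hpp>
using namespace cl::sycl;

int main() {
  queue Q;
  double Sum = 0.0;
  {
    buffer<double, 1> SumBuf(&Sum, range<1>(1));
    Q.submit([&](handler &CGH) {
      auto SumAcc = SumBuf.get_access<access::mode::read_write>(CGH);
      // An fp64 reduction using the add operator: with this commit it takes
      // the atomic_ref fast path on devices that report aspect::atomic64.
      CGH.parallel_for<class Fp64Sum>(
          nd_range<1>(range<1>(1024), range<1>(64)),
          ONEAPI::reduction(SumAcc, ONEAPI::plus<double>()),
          [=](nd_item<1> It, auto &Red) { Red += 1.0; });
    });
  } // Buffer destruction writes the result back to Sum.
}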

File tree

12 files changed: 193 additions & 5 deletions

sycl/include/CL/sycl/ONEAPI/reduction.hpp

Lines changed: 82 additions & 1 deletion
@@ -53,6 +53,28 @@ using IsReduOptForFastAtomicFetch =
                   sycl::detail::IsBitAND<T, BinaryOperation>::value)>;
 #endif
 
+// This type trait is used to detect if the atomic operation BinaryOperation
+// used with operands of the type T is available for use in reduction, in
+// addition to the cases covered by "IsReduOptForFastAtomicFetch", if the
+// device has the atomic64 aspect. This type trait should only be used if the
+// device has the atomic64 aspect. Note that this type trait is currently a
+// subset of IsReduOptForFastReduce. The macro SYCL_REDUCTION_DETERMINISTIC
+// prohibits using the reduce_over_group() algorithm, in order to produce
+// stable results across devices of the same type.
+// TODO: 32-bit floating-point atomics are eventually expected to be supported
+// by the has_fast_atomics specialization. Once the reducer class is updated to
+// replace the deprecated atomic class with atomic_ref, the (sizeof(T) == 4)
+// case should be removed here and handled in IsReduOptForFastAtomicFetch.
+template <typename T, class BinaryOperation>
+using IsReduOptForAtomic64Add =
+#ifdef SYCL_REDUCTION_DETERMINISTIC
+    bool_constant<false>;
+#else
+    bool_constant<sycl::detail::IsPlus<T, BinaryOperation>::value &&
+                  sycl::detail::is_sgenfloat<T>::value &&
+                  (sizeof(T) == 4 || sizeof(T) == 8)>;
+#endif
+
 // This type trait is used to detect if the group algorithm reduce() used with
 // operands of the type T and the operation BinaryOperation is available
 // for using in reduction.
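As a sketch of what the new trait admits (not part of the commit; assumes the surrounding detail headers are in scope and SYCL_REDUCTION_DETERMINISTIC is undefined):

// Illustrative expectations for IsReduOptForAtomic64Add.
static_assert(
    IsReduOptForAtomic64Add<double, sycl::ONEAPI::plus<double>>::value,
    "fp64 plus reductions qualify for the atomic64 fast path");
static_assert(
    !IsReduOptForAtomic64Add<int, sycl::ONEAPI::plus<int>>::value,
    "integer types are handled by IsReduOptForFastAtomicFetch instead");
static_assert(
    !IsReduOptForAtomic64Add<double, sycl::ONEAPI::minimum<double>>::value,
    "only the add operator qualifies");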
@@ -288,6 +310,18 @@ class reducer<T, BinaryOperation,
         .fetch_max(MValue);
   }
 
+  /// Atomic ADD operation: for floating point using atomic_ref
+  template <typename _T = T, class _BinaryOperation = BinaryOperation>
+  enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
+              IsReduOptForAtomic64Add<T, _BinaryOperation>::value>
+  atomic_combine(_T *ReduVarPtr) const {
+    atomic_ref<T, sycl::ONEAPI::memory_order::relaxed,
+               sycl::ONEAPI::memory_scope::device,
+               access::address_space::global_space>(
+        *global_ptr<T>(ReduVarPtr)) += MValue;
+  }
+
   T MValue;
 };
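Viewed in isolation, the leader work-item's update above amounts to the following free-standing operation; this is a sketch with hypothetical names, using the same atomic_ref template arguments as the committed code:

// Relaxed, device-scoped atomic add on the global fp64 reduction variable,
// executed once per work-group by work-item 0.
void atomic_add_fp64(double *ReduVarPtr, double PartialSum) {
  sycl::ONEAPI::atomic_ref<double, sycl::ONEAPI::memory_order::relaxed,
                           sycl::ONEAPI::memory_scope::device,
                           sycl::access::address_space::global_space>
      Ref(*ReduVarPtr);
  Ref.fetch_add(PartialSum); // the "+=" above maps to fetch_add
}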

@@ -330,6 +364,8 @@ class reduction_impl : private reduction_impl_base {
   using local_accessor_type =
       accessor<T, buffer_dim, access::mode::read_write, access::target::local>;
 
+  static constexpr bool has_atomic_add_float64 =
+      IsReduOptForAtomic64Add<T, BinaryOperation>::value;
   static constexpr bool has_fast_atomics =
       IsReduOptForFastAtomicFetch<T, BinaryOperation>::value;
   static constexpr bool has_fast_reduce =
@@ -636,7 +672,8 @@ class reduction_impl : private reduction_impl_base {
   /// require initialization with identity value, then return user's read-write
   /// accessor. Otherwise, create 1-element global buffer initialized with
   /// identity value and return an accessor to that buffer.
-  template <bool HasFastAtomics = has_fast_atomics>
+  template <bool HasFastAtomics =
+                (has_fast_atomics || has_atomic_add_float64)>
   std::enable_if_t<HasFastAtomics, rw_accessor_type>
   getReadWriteAccessorToInitializedMem(handler &CGH) {
     if (!is_usm && !initializeToIdentity())
@@ -1467,6 +1504,50 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc,
   }
 }
 
+// Specialization for devices with the atomic64 aspect, which guarantees 64-bit
+// (and, temporarily, 32-bit) floating-point support for atomic add.
+// TODO: 32-bit floating-point atomics are eventually expected to be supported
+// by the has_fast_atomics specialization. Corresponding changes to
+// IsReduOptForAtomic64Add, as prescribed in its documentation, should then
+// also be made.
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+std::enable_if_t<Reduction::has_atomic_add_float64>
+reduCGFuncImplAtomic64(handler &CGH, KernelType KernelFunc,
+                       const nd_range<Dims> &Range, Reduction &,
+                       typename Reduction::rw_accessor_type Out) {
+  using Name = typename get_reduction_main_kernel_name_t<
+      KernelName, KernelType, Reduction::is_usm,
+      Reduction::has_atomic_add_float64,
+      typename Reduction::rw_accessor_type>::name;
+  CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
+    // Call user's function. Reducer.MValue gets initialized there.
+    typename Reduction::reducer_type Reducer;
+    KernelFunc(NDIt, Reducer);
+
+    typename Reduction::binary_operation BOp;
+    Reducer.MValue = reduce_over_group(NDIt.get_group(), Reducer.MValue, BOp);
+    if (NDIt.get_local_linear_id() == 0) {
+      Reducer.atomic_combine(Reduction::getOutPointer(Out));
+    }
+  });
+}
+
+// Entry point for the atomic64 reduction path: obtains a read-write accessor
+// to initialized memory for the reduction variable, then launches the kernel
+// defined above.
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+enable_if_t<Reduction::has_atomic_add_float64>
+reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
+                   const nd_range<Dims> &Range, Reduction &Redu) {
+  auto Out = Redu.getReadWriteAccessorToInitializedMem(CGH);
+  reduCGFuncImplAtomic64<KernelName, KernelType, Dims, Reduction>(
+      CGH, KernelFunc, Range, Redu, Out);
+}
+
 inline void associateReduAccsWithHandlerHelper(handler &) {}
 
 template <typename ReductionT>

sycl/include/CL/sycl/aspects.hpp

Lines changed: 1 addition & 0 deletions

@@ -41,6 +41,7 @@ enum class aspect {
   ext_intel_mem_channel = 25,
   usm_atomic_host_allocations = 26,
   usm_atomic_shared_allocations = 27,
+  atomic64 = 28
 };
 
 } // namespace sycl
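Application code can gate on the new aspect directly; a minimal sketch (the selector choice is an assumption):

sycl::device D{sycl::default_selector{}};
if (D.has(sycl::aspect::atomic64)) {
  // fp64 atomic_ref::fetch_add is usable in kernels on this device.
}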

sycl/include/CL/sycl/detail/pi.h

Lines changed: 2 additions & 1 deletion

@@ -285,7 +285,8 @@ typedef enum {
   PI_DEVICE_INFO_GPU_SLICES = 0x10023,
   PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE = 0x10024,
   PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = 0x10025,
-  PI_DEVICE_INFO_MAX_MEM_BANDWIDTH = 0x10026
+  PI_DEVICE_INFO_MAX_MEM_BANDWIDTH = 0x10026,
+  PI_DEVICE_INFO_ATOMIC_64 = 0x10110
 } _pi_device_info;
 
 typedef enum {

sycl/include/CL/sycl/handler.hpp

Lines changed: 59 additions & 1 deletion

@@ -210,6 +210,11 @@ class reduction_impl;
 using cl::sycl::detail::enable_if_t;
 using cl::sycl::detail::queue_impl;
 
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+enable_if_t<Reduction::has_atomic_add_float64>
+reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
+                   const nd_range<Dims> &Range, Reduction &Redu);
+
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
 enable_if_t<Reduction::has_fast_atomics>
 reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
@@ -1383,6 +1388,49 @@ class __SYCL_EXPORT handler {
     }
   }
 
+  /// Implements parallel_for() accepting nd_range \p Range and one reduction
+  /// object. This version is a specialization for the add operator.
+  /// It performs a runtime check for the device aspect "atomic64"; if found,
+  /// fast sycl::atomic_ref operations are used to update the reduction at the
+  /// end of each work-group's work. Otherwise the default implementation is
+  /// used.
+  //
+  // If the reduction variable must be initialized with the identity value
+  // before the kernel runs, then an additional working accessor is created,
+  // initialized with the identity value and used in the kernel. That working
+  // accessor is then copied to the user's accessor or USM pointer after the
+  // kernel runs.
+  // For USM pointers without the initialize_to_identity property, the same
+  // working-accessor scheme is used, as re-using the user's USM pointer in the
+  // kernel would require creating another variant of the user's kernel, which
+  // does not seem efficient.
+  template <typename KernelName = detail::auto_name, typename KernelType,
+            int Dims, typename Reduction>
+  detail::enable_if_t<Reduction::has_atomic_add_float64>
+  parallel_for(nd_range<Dims> Range, Reduction Redu,
+               _KERNELFUNCPARAM(KernelFunc)) {
+    shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
+    device D = detail::getDeviceFromHandler(*this);
+
+    if (D.has(aspect::atomic64)) {
+      ONEAPI::detail::reduCGFuncAtomic64<KernelName>(*this, KernelFunc, Range,
+                                                     Redu);
+      if (Reduction::is_usm || Redu.initializeToIdentity()) {
+        this->finalize();
+        handler CopyHandler(QueueCopy, MIsHost);
+        CopyHandler.saveCodeLoc(MCodeLoc);
+        ONEAPI::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
+                                                                 Redu);
+        MLastEvent = CopyHandler.finalize();
+      }
+    } else {
+      parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
+    }
+  }
+
 /// Defines and invokes a SYCL kernel function for the specified nd_range.
 /// Performs reduction operation specified in \p Redu.
 ///
///
@@ -1399,9 +1447,19 @@ class __SYCL_EXPORT handler {
13991447
/// optimized implementations waiting for their turn of code-review.
14001448
template <typename KernelName = detail::auto_name, typename KernelType,
14011449
int Dims, typename Reduction>
1402-
detail::enable_if_t<!Reduction::has_fast_atomics>
1450+
detail::enable_if_t<!Reduction::has_fast_atomics &&
1451+
!Reduction::has_atomic_add_float64>
14031452
parallel_for(nd_range<Dims> Range, Reduction Redu,
14041453
_KERNELFUNCPARAM(KernelFunc)) {
1454+
1455+
parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
1456+
}
1457+
1458+
template <typename KernelName, typename KernelType, int Dims,
1459+
typename Reduction>
1460+
detail::enable_if_t<!Reduction::has_fast_atomics>
1461+
parallel_for_Impl(nd_range<Dims> Range, Reduction Redu,
1462+
KernelType KernelFunc) {
14051463
// This parallel_for() is lowered to the following sequence:
14061464
// 1) Call a kernel that a) call user's lambda function and b) performs
14071465
// one iteration of reduction, storing the partial reductions/sums

sycl/include/CL/sycl/info/device_traits.def

Lines changed: 1 addition & 0 deletions

@@ -22,6 +22,7 @@ __SYCL_PARAM_TRAITS_SPEC(device, max_clock_frequency, pi_uint32)
 __SYCL_PARAM_TRAITS_SPEC(device, address_bits, pi_uint32)
 __SYCL_PARAM_TRAITS_SPEC(device, max_mem_alloc_size, pi_uint64)
 __SYCL_PARAM_TRAITS_SPEC(device, image_support, bool)
+__SYCL_PARAM_TRAITS_SPEC(device, atomic64, bool)
 __SYCL_PARAM_TRAITS_SPEC(device, max_read_image_args, pi_uint32)
 __SYCL_PARAM_TRAITS_SPEC(device, max_write_image_args, pi_uint32)
 __SYCL_PARAM_TRAITS_SPEC(device, image2d_max_width, size_t)

sycl/include/CL/sycl/info/info_desc.hpp

Lines changed: 2 additions & 1 deletion

@@ -144,7 +144,8 @@ enum class device : cl_device_info {
   ext_intel_gpu_eu_count_per_subslice =
       PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE,
   ext_intel_max_mem_bandwidth = PI_DEVICE_INFO_MAX_MEM_BANDWIDTH,
-  ext_intel_mem_channel = PI_MEM_PROPERTIES_CHANNEL
+  ext_intel_mem_channel = PI_MEM_PROPERTIES_CHANNEL,
+  atomic64 = PI_DEVICE_INFO_ATOMIC_64
 };
 
 enum class device_type : pi_uint64 {
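With the matching entry in device_traits.def above, the descriptor can also be queried directly; a one-line sketch (D is a sycl::device):

bool HasAtomic64 = D.get_info<sycl::info::device::atomic64>();
// Per the get_device_info specialization added in device_info.hpp below,
// this returns false on backends that fail the PI_DEVICE_INFO_ATOMIC_64 query.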

sycl/plugins/cuda/pi_cuda.cpp

Lines changed: 18 additions & 1 deletion

@@ -985,6 +985,19 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
     bool ifp = (major >= 7);
     return getInfo(param_value_size, param_value, param_value_size_ret, ifp);
   }
+
+  case PI_DEVICE_INFO_ATOMIC_64: {
+    int major = 0;
+    cl::sycl::detail::pi::assertion(
+        cuDeviceGetAttribute(&major,
+                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
+                             device->get()) == CUDA_SUCCESS);
+
+    bool atomic64 = (major >= 6) ? true : false;
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   atomic64);
+  }
+
   case PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: {
     // NVIDIA devices only support one sub-group size (the warp size)
     int warpSize = 0;
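The gate mirrors CUDA's rule that fp64 atomicAdd requires compute capability 6.0 (sm_60) or newer. For reference, the same check in a standalone driver-API program might look like this sketch (hypothetical helper name; error handling reduced to an assertion):

#include <cassert>
#include <cuda.h>

bool deviceHasAtomic64(CUdevice Dev) {
  int Major = 0;
  // CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR yields the SM major version.
  CUresult Err = cuDeviceGetAttribute(
      &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Dev);
  assert(Err == CUDA_SUCCESS);
  (void)Err; // silence unused-variable warnings in release builds
  return Major >= 6; // fp64 atomicAdd needs sm_60+
}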
@@ -1362,7 +1375,11 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
     return getInfo(param_value_size, param_value, param_value_size_ret, "");
   }
   case PI_DEVICE_INFO_EXTENSIONS: {
-    return getInfo(param_value_size, param_value, param_value_size_ret, "");
+    std::string SupportedExtensions = "cl_khr_fp64 ";
+
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   SupportedExtensions.c_str());
   }
   case PI_DEVICE_INFO_PRINTF_BUFFER_SIZE: {
     // The minimum value for the FULL profile is 1 MB.

sycl/plugins/opencl/pi_opencl.cpp

Lines changed: 1 addition & 0 deletions

@@ -178,6 +178,7 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName,
   case PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE:
   case PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE:
   case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH:
+  case PI_DEVICE_INFO_ATOMIC_64:
     return PI_INVALID_VALUE;
 
   default:

sycl/source/detail/device_impl.cpp

Lines changed: 2 additions & 0 deletions

@@ -238,6 +238,8 @@ bool device_impl::has(aspect Aspect) const {
     return has_extension("cl_khr_int64_base_atomics");
   case aspect::int64_extended_atomics:
     return has_extension("cl_khr_int64_extended_atomics");
+  case aspect::atomic64:
+    return get_info<info::device::atomic64>();
   case aspect::image:
     return get_info<info::device::image_support>();
   case aspect::online_compiler:

sycl/source/detail/device_info.hpp

Lines changed: 21 additions & 0 deletions

@@ -232,6 +232,23 @@ template <> struct get_device_info<bool, info::device::queue_profiling> {
   }
 };
 
+// Specialization for atomic64 that is necessary because
+// PI_DEVICE_INFO_ATOMIC_64 is currently only implemented for the cuda backend.
+template <> struct get_device_info<bool, info::device::atomic64> {
+  static bool get(RT::PiDevice dev, const plugin &Plugin) {
+    bool result = false;
+
+    RT::PiResult Err = Plugin.call_nocheck<PiApiKind::piDeviceGetInfo>(
+        dev, pi::cast<RT::PiDeviceInfo>(info::device::atomic64),
+        sizeof(result), &result, nullptr);
+    if (Err != PI_SUCCESS) {
+      return false;
+    }
+    return result;
+  }
+};
+
 // Specialization for exec_capabilities, OpenCL returns a bitfield
 template <>
 struct get_device_info<std::vector<info::execution_capability>,

@@ -610,6 +627,10 @@ template <> inline bool get_device_info_host<info::device::image_support>() {
   return true;
 }
 
+template <> inline bool get_device_info_host<info::device::atomic64>() {
+  return false;
+}
+
 template <>
 inline cl_uint get_device_info_host<info::device::max_read_image_args>() {
   // current value is the required minimum

sycl/test/abi/sycl_symbols_linux.dump

Lines changed: 1 addition & 0 deletions

(The new symbol is the get_info<info::device::atomic64>() instantiation; 65808 is the decimal form of PI_DEVICE_INFO_ATOMIC_64, 0x10110.)

@@ -4127,6 +4127,7 @@
 _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65571EEENS3_12param_traitsIS4_XT_EE11return_typeEv
 _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65572EEENS3_12param_traitsIS4_XT_EE11return_typeEv
 _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65573EEENS3_12param_traitsIS4_XT_EE11return_typeEv
 _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65574EEENS3_12param_traitsIS4_XT_EE11return_typeEv
+_ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65808EEENS3_12param_traitsIS4_XT_EE11return_typeEv
 _ZNK2cl4sycl6device9getNativeEv
 _ZNK2cl4sycl6kernel11get_contextEv
 _ZNK2cl4sycl6kernel11get_programEv

sycl/test/on-device/basic_tests/aspects.cpp

Lines changed: 3 additions & 0 deletions

@@ -57,6 +57,9 @@ int main() {
   if (plt.has(aspect::int64_extended_atomics)) {
     std::cout << "  extended atomic operations" << std::endl;
   }
+  if (plt.has(aspect::atomic64)) {
+    std::cout << "  atomic64" << std::endl;
+  }
   if (plt.has(aspect::image)) {
     std::cout << "  images" << std::endl;
   }
