Skip to content

[SYCL][Reduction] Remove atomic64 check for float reductions #6434

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 20 additions & 23 deletions sycl/include/sycl/ext/oneapi/reduction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ using IsReduOptForFastAtomicFetch =
#ifdef SYCL_REDUCTION_DETERMINISTIC
bool_constant<false>;
#else
bool_constant<sycl::detail::is_sgeninteger<T>::value &&
bool_constant<((sycl::detail::is_sgenfloat<T>::value && sizeof(T) == 4) ||
sycl::detail::is_sgeninteger<T>::value) &&
sycl::detail::IsValidAtomicType<T>::value &&
(sycl::detail::IsPlus<T, BinaryOperation>::value ||
sycl::detail::IsMinimum<T, BinaryOperation>::value ||
Expand All @@ -74,18 +75,15 @@ using IsReduOptForFastAtomicFetch =
// IsReduOptForFastReduce. The macro SYCL_REDUCTION_DETERMINISTIC prohibits
// using the reduce_over_group() algorithm to produce stable results across same
// type devices.
// TODO 32 bit floating point atomics are eventually expected to be supported by
// the has_fast_atomics specialization. Once the reducer class is updated to
// replace the deprecated atomic class with atomic_ref, the (sizeof(T) == 4)
// case should be removed here and replaced in IsReduOptForFastAtomicFetch.
template <typename T, class BinaryOperation>
using IsReduOptForAtomic64Add =
using IsReduOptForAtomic64Op =
#ifdef SYCL_REDUCTION_DETERMINISTIC
bool_constant<false>;
#else
bool_constant<sycl::detail::IsPlus<T, BinaryOperation>::value &&
sycl::detail::is_sgenfloat<T>::value &&
(sizeof(T) == 4 || sizeof(T) == 8)>;
bool_constant<(sycl::detail::IsPlus<T, BinaryOperation>::value ||
sycl::detail::IsMinimum<T, BinaryOperation>::value ||
sycl::detail::IsMaximum<T, BinaryOperation>::value) &&
sycl::detail::is_sgenfloat<T>::value && sizeof(T) == 8>;
#endif

// This type trait is used to detect if the group algorithm reduce() used with
Expand Down Expand Up @@ -248,7 +246,7 @@ template <class Reducer> class combiner {
typename _T = T, class _BinaryOperation = BinaryOperation>
enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
(IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value ||
IsReduOptForAtomic64Add<T, _BinaryOperation>::value) &&
IsReduOptForAtomic64Op<T, _BinaryOperation>::value) &&
sycl::detail::IsPlus<T, _BinaryOperation>::value>
atomic_combine(_T *ReduVarPtr) const {
atomic_combine_impl<Space>(
Expand Down Expand Up @@ -294,7 +292,8 @@ template <class Reducer> class combiner {
template <access::address_space Space = access::address_space::global_space,
typename _T = T, class _BinaryOperation = BinaryOperation>
enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value &&
(IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value ||
IsReduOptForAtomic64Op<T, _BinaryOperation>::value) &&
sycl::detail::IsMinimum<T, _BinaryOperation>::value>
atomic_combine(_T *ReduVarPtr) const {
atomic_combine_impl<Space>(
Expand All @@ -305,7 +304,8 @@ template <class Reducer> class combiner {
template <access::address_space Space = access::address_space::global_space,
typename _T = T, class _BinaryOperation = BinaryOperation>
enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value &&
(IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value ||
IsReduOptForAtomic64Op<T, _BinaryOperation>::value) &&
sycl::detail::IsMaximum<T, _BinaryOperation>::value>
atomic_combine(_T *ReduVarPtr) const {
atomic_combine_impl<Space>(
Expand Down Expand Up @@ -565,8 +565,8 @@ class reduction_impl_algo : public reduction_impl_common<T, BinaryOperation> {
using rw_accessor_type = accessor<T, accessor_dim, access::mode::read_write,
access::target::device, is_placeholder,
ext::oneapi::accessor_property_list<>>;
static constexpr bool has_atomic_add_float64 =
IsReduOptForAtomic64Add<T, BinaryOperation>::value;
static constexpr bool has_float64_atomics =
IsReduOptForAtomic64Op<T, BinaryOperation>::value;
static constexpr bool has_fast_atomics =
IsReduOptForFastAtomicFetch<T, BinaryOperation>::value;
static constexpr bool has_fast_reduce =
Expand Down Expand Up @@ -645,7 +645,7 @@ class reduction_impl_algo : public reduction_impl_common<T, BinaryOperation> {
/// require initialization with identity value, then return user's read-write
/// accessor. Otherwise, create global buffer with 'num_elements' initialized
/// with identity value and return an accessor to that buffer.
template <bool HasFastAtomics = (has_fast_atomics || has_atomic_add_float64)>
template <bool HasFastAtomics = (has_fast_atomics || has_float64_atomics)>
std::enable_if_t<HasFastAtomics, rw_accessor_type>
getReadWriteAccessorToInitializedMem(handler &CGH) {
if constexpr (is_rw_acc) {
Expand Down Expand Up @@ -2040,18 +2040,15 @@ template <class KernelName> struct NDRangeAtomic64;
} // namespace main_krn
} // namespace reduction

// Specialization for devices with the atomic64 aspect, which guarantees 64 (and
// temporarily 32) bit floating point support for atomic add.
// TODO 32 bit floating point atomics are eventually expected to be supported by
// the has_fast_atomics specialization. Corresponding changes to
// IsReduOptForAtomic64Add, as prescribed in its documentation, should then also
// be made.
// Specialization for devices with the atomic64 aspect, which guarantees 64-bit
// floating point support for atomic reduction operations.
template <typename KernelName, typename KernelType, int Dims, class Reduction>
void reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
const nd_range<Dims> &Range, Reduction &Redu) {
auto Out = Redu.getReadWriteAccessorToInitializedMem(CGH);
static_assert(Reduction::has_atomic_add_float64,
"Only suitable for reductions that have FP64 atomic add.");
static_assert(
Reduction::has_float64_atomics,
"Only suitable for reductions that have FP64 atomic operations.");
constexpr size_t NElements = Reduction::num_elements;
using Name =
__sycl_reduction_kernel<reduction::main_krn::NDRangeAtomic64, KernelName>;
Expand Down
4 changes: 2 additions & 2 deletions sycl/include/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1669,13 +1669,13 @@ class __SYCL_EXPORT handler {
void parallel_for(nd_range<Dims> Range, Reduction Redu,
_KERNELFUNCPARAM(KernelFunc)) {
if constexpr (!Reduction::has_fast_atomics &&
!Reduction::has_atomic_add_float64) {
!Reduction::has_float64_atomics) {
// The most basic implementation.
parallel_for_impl<KernelName>(Range, Redu, KernelFunc);
return;
} else { // Can't "early" return for "if constexpr".
std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
if constexpr (Reduction::has_atomic_add_float64) {
if constexpr (Reduction::has_float64_atomics) {
/// This version is a specialization for the add
/// operator. It performs runtime checks for device aspect "atomic64";
/// if found, fast sycl::atomic_ref operations are used to update the
Expand Down