Commit 544fb7c

[SYCL][CUDA] atomic_ref.fetch_add used for fp64 reduction if device.has(atomic64) (#3950)
The atomic64 device aspect has been added; currently only the CUDA backend supports it. SYCL 2020 introduces the atomic64 aspect, which is required for using atomic_ref.fetch_add with an fp64 operand. These changes allow devices with the atomic64 aspect to use a specialized reduction for the add operator that applies atomics at the work-group level via atomic_ref. If the atomic64 aspect is not available, the existing default implementation, which does not use atomic operations, is used. Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
1 parent d1d1eb2 commit 544fb7c
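For context, the user-facing pattern this change accelerates looks roughly like the following. This is a sketch, not part of the commit: the kernel name, the sizes, and the ONEAPI reduction extension interface of this era are illustrative assumptions.

#include <CL/sycl.hpp>
using namespace cl::sycl;

int main() {
  queue Q;
  double Sum = 0.0;
  {
    buffer<double, 1> SumBuf(&Sum, range<1>(1));
    Q.submit([&](handler &CGH) {
      auto SumAcc = SumBuf.get_access<access::mode::read_write>(CGH);
      // An fp64 reduction using the add operator: with this commit it takes
      // the atomic_ref fast path on devices that report aspect::atomic64.
      CGH.parallel_for<class Fp64Sum>(
          nd_range<1>(range<1>(1024), range<1>(64)),
          ONEAPI::reduction(SumAcc, ONEAPI::plus<double>()),
          [=](nd_item<1> It, auto &Red) { Red += 1.0; });
    });
  } // Buffer destruction writes the result back to Sum.
}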

File tree

12 files changed: 193 additions & 5 deletions

sycl/include/CL/sycl/ONEAPI/reduction.hpp

Lines changed: 82 additions & 1 deletion
@@ -53,6 +53,28 @@ using IsReduOptForFastAtomicFetch =
                   sycl::detail::IsBitAND<T, BinaryOperation>::value)>;
 #endif
 
+// This type trait is used to detect if the atomic operation BinaryOperation
+// used with operands of the type T is available for use in reduction, in
+// addition to the cases covered by "IsReduOptForFastAtomicFetch", if the
+// device has the atomic64 aspect. This type trait should only be used if the
+// device has the atomic64 aspect. Note that this type trait is currently a
+// subset of IsReduOptForFastReduce. The macro SYCL_REDUCTION_DETERMINISTIC
+// prohibits using the reduce_over_group() algorithm, in order to produce
+// stable results across devices of the same type.
+// TODO: 32-bit floating-point atomics are eventually expected to be supported
+// by the has_fast_atomics specialization. Once the reducer class is updated to
+// replace the deprecated atomic class with atomic_ref, the (sizeof(T) == 4)
+// case should be removed here and handled in IsReduOptForFastAtomicFetch.
+template <typename T, class BinaryOperation>
+using IsReduOptForAtomic64Add =
+#ifdef SYCL_REDUCTION_DETERMINISTIC
+    bool_constant<false>;
+#else
+    bool_constant<sycl::detail::IsPlus<T, BinaryOperation>::value &&
+                  sycl::detail::is_sgenfloat<T>::value &&
+                  (sizeof(T) == 4 || sizeof(T) == 8)>;
+#endif
+
 // This type trait is used to detect if the group algorithm reduce() used with
 // operands of the type T and the operation BinaryOperation is available
 // for using in reduction.
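As a sketch of what the new trait admits (not part of the commit; assumes the surrounding detail headers are in scope and SYCL_REDUCTION_DETERMINISTIC is undefined):

// Illustrative expectations for IsReduOptForAtomic64Add.
static_assert(
    IsReduOptForAtomic64Add<double, sycl::ONEAPI::plus<double>>::value,
    "fp64 plus reductions qualify for the atomic64 fast path");
static_assert(
    !IsReduOptForAtomic64Add<int, sycl::ONEAPI::plus<int>>::value,
    "integer types are handled by IsReduOptForFastAtomicFetch instead");
static_assert(
    !IsReduOptForAtomic64Add<double, sycl::ONEAPI::minimum<double>>::value,
    "only the add operator qualifies");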
@@ -288,6 +310,18 @@ class reducer<T, BinaryOperation,
         .fetch_max(MValue);
   }
 
+  /// Atomic ADD operation: for floating point using atomic_ref
+  template <typename _T = T, class _BinaryOperation = BinaryOperation>
+  enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
+              IsReduOptForAtomic64Add<T, _BinaryOperation>::value>
+  atomic_combine(_T *ReduVarPtr) const {
+    atomic_ref<T, sycl::ONEAPI::memory_order::relaxed,
+               sycl::ONEAPI::memory_scope::device,
+               access::address_space::global_space>(
+        *global_ptr<T>(ReduVarPtr)) += MValue;
+  }
+
   T MValue;
 };
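Viewed in isolation, the leader work-item's update above amounts to the following free-standing operation; this is a sketch with hypothetical names, using the same atomic_ref template arguments as the committed code:

// Relaxed, device-scoped atomic add on the global fp64 reduction variable,
// executed once per work-group by work-item 0.
void atomic_add_fp64(double *ReduVarPtr, double PartialSum) {
  sycl::ONEAPI::atomic_ref<double, sycl::ONEAPI::memory_order::relaxed,
                           sycl::ONEAPI::memory_scope::device,
                           sycl::access::address_space::global_space>
      Ref(*ReduVarPtr);
  Ref.fetch_add(PartialSum); // the "+=" above maps to fetch_add
}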

@@ -330,6 +364,8 @@ class reduction_impl : private reduction_impl_base {
   using local_accessor_type =
       accessor<T, buffer_dim, access::mode::read_write, access::target::local>;
 
+  static constexpr bool has_atomic_add_float64 =
+      IsReduOptForAtomic64Add<T, BinaryOperation>::value;
   static constexpr bool has_fast_atomics =
       IsReduOptForFastAtomicFetch<T, BinaryOperation>::value;
   static constexpr bool has_fast_reduce =
@@ -636,7 +672,8 @@ class reduction_impl : private reduction_impl_base {
   /// require initialization with identity value, then return user's read-write
   /// accessor. Otherwise, create 1-element global buffer initialized with
   /// identity value and return an accessor to that buffer.
-  template <bool HasFastAtomics = has_fast_atomics>
+  template <bool HasFastAtomics =
+                (has_fast_atomics || has_atomic_add_float64)>
   std::enable_if_t<HasFastAtomics, rw_accessor_type>
   getReadWriteAccessorToInitializedMem(handler &CGH) {
     if (!is_usm && !initializeToIdentity())
@@ -1467,6 +1504,50 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc,
   }
 }
 
+// Specialization for devices with the atomic64 aspect, which guarantees 64-bit
+// (and, temporarily, 32-bit) floating-point support for atomic add.
+// TODO: 32-bit floating-point atomics are eventually expected to be supported
+// by the has_fast_atomics specialization. Corresponding changes to
+// IsReduOptForAtomic64Add, as prescribed in its documentation, should then
+// also be made.
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+std::enable_if_t<Reduction::has_atomic_add_float64>
+reduCGFuncImplAtomic64(handler &CGH, KernelType KernelFunc,
+                       const nd_range<Dims> &Range, Reduction &,
+                       typename Reduction::rw_accessor_type Out) {
+  using Name = typename get_reduction_main_kernel_name_t<
+      KernelName, KernelType, Reduction::is_usm,
+      Reduction::has_atomic_add_float64,
+      typename Reduction::rw_accessor_type>::name;
+  CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
+    // Call user's function. Reducer.MValue gets initialized there.
+    typename Reduction::reducer_type Reducer;
+    KernelFunc(NDIt, Reducer);
+
+    typename Reduction::binary_operation BOp;
+    Reducer.MValue = reduce_over_group(NDIt.get_group(), Reducer.MValue, BOp);
+    if (NDIt.get_local_linear_id() == 0) {
+      Reducer.atomic_combine(Reduction::getOutPointer(Out));
+    }
+  });
+}
+
+// Entry point for the atomic64 reduction path: obtains a read-write accessor
+// to initialized memory for the reduction variable, then launches the kernel
+// defined above.
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+enable_if_t<Reduction::has_atomic_add_float64>
+reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
+                   const nd_range<Dims> &Range, Reduction &Redu) {
+  auto Out = Redu.getReadWriteAccessorToInitializedMem(CGH);
+  reduCGFuncImplAtomic64<KernelName, KernelType, Dims, Reduction>(
+      CGH, KernelFunc, Range, Redu, Out);
+}
+
 inline void associateReduAccsWithHandlerHelper(handler &) {}
 
 template <typename ReductionT>

sycl/include/CL/sycl/aspects.hpp

Lines changed: 1 addition & 0 deletions

@@ -41,6 +41,7 @@ enum class aspect {
   ext_intel_mem_channel = 25,
   usm_atomic_host_allocations = 26,
   usm_atomic_shared_allocations = 27,
+  atomic64 = 28
 };
 
 } // namespace sycl
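Application code can gate on the new aspect directly; a minimal sketch (the selector choice is an assumption):

sycl::device D{sycl::default_selector{}};
if (D.has(sycl::aspect::atomic64)) {
  // fp64 atomic_ref::fetch_add is usable in kernels on this device.
}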

sycl/include/CL/sycl/detail/pi.h

Lines changed: 2 additions & 1 deletion

@@ -285,7 +285,8 @@ typedef enum {
   PI_DEVICE_INFO_GPU_SLICES = 0x10023,
   PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE = 0x10024,
   PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = 0x10025,
-  PI_DEVICE_INFO_MAX_MEM_BANDWIDTH = 0x10026
+  PI_DEVICE_INFO_MAX_MEM_BANDWIDTH = 0x10026,
+  PI_DEVICE_INFO_ATOMIC_64 = 0x10110
 } _pi_device_info;
 
 typedef enum {

sycl/include/CL/sycl/handler.hpp

Lines changed: 59 additions & 1 deletion

@@ -210,6 +210,11 @@ class reduction_impl;
 using cl::sycl::detail::enable_if_t;
 using cl::sycl::detail::queue_impl;
 
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+enable_if_t<Reduction::has_atomic_add_float64>
+reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
+                   const nd_range<Dims> &Range, Reduction &Redu);
+
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
 enable_if_t<Reduction::has_fast_atomics>
 reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
@@ -1383,6 +1388,49 @@ class __SYCL_EXPORT handler {
     }
   }
 
+  /// Implements parallel_for() accepting nd_range \p Range and one reduction
+  /// object. This version is a specialization for the add operator.
+  /// It performs a runtime check for the device aspect "atomic64"; if found,
+  /// fast sycl::atomic_ref operations are used to update the reduction at the
+  /// end of each work-group's work. Otherwise the default implementation is
+  /// used.
+  //
+  // If the reduction variable must be initialized with the identity value
+  // before the kernel runs, then an additional working accessor is created,
+  // initialized with the identity value and used in the kernel. That working
+  // accessor is then copied to the user's accessor or USM pointer after the
+  // kernel runs.
+  // For USM pointers without the initialize_to_identity property, the same
+  // working-accessor scheme is used, as re-using the user's USM pointer in the
+  // kernel would require creating another variant of the user's kernel, which
+  // does not seem efficient.
+  template <typename KernelName = detail::auto_name, typename KernelType,
+            int Dims, typename Reduction>
+  detail::enable_if_t<Reduction::has_atomic_add_float64>
+  parallel_for(nd_range<Dims> Range, Reduction Redu,
+               _KERNELFUNCPARAM(KernelFunc)) {
+    shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
+    device D = detail::getDeviceFromHandler(*this);
+
+    if (D.has(aspect::atomic64)) {
+      ONEAPI::detail::reduCGFuncAtomic64<KernelName>(*this, KernelFunc, Range,
+                                                     Redu);
+      if (Reduction::is_usm || Redu.initializeToIdentity()) {
+        this->finalize();
+        handler CopyHandler(QueueCopy, MIsHost);
+        CopyHandler.saveCodeLoc(MCodeLoc);
+        ONEAPI::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
+                                                                 Redu);
+        MLastEvent = CopyHandler.finalize();
+      }
+    } else {
+      parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
+    }
+  }
+
 /// Defines and invokes a SYCL kernel function for the specified nd_range.
 /// Performs reduction operation specified in \p Redu.
 ///
///
@@ -1399,9 +1447,19 @@ class __SYCL_EXPORT handler {
13991447
/// optimized implementations waiting for their turn of code-review.
14001448
template <typename KernelName = detail::auto_name, typename KernelType,
14011449
int Dims, typename Reduction>
1402-
detail::enable_if_t<!Reduction::has_fast_atomics>
1450+
detail::enable_if_t<!Reduction::has_fast_atomics &&
1451+
!Reduction::has_atomic_add_float64>
14031452
parallel_for(nd_range<Dims> Range, Reduction Redu,
14041453
_KERNELFUNCPARAM(KernelFunc)) {
1454+
1455+
parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
1456+
}
1457+
1458+
template <typename KernelName, typename KernelType, int Dims,
1459+
typename Reduction>
1460+
detail::enable_if_t<!Reduction::has_fast_atomics>
1461+
parallel_for_Impl(nd_range<Dims> Range, Reduction Redu,
1462+
KernelType KernelFunc) {
14051463
// This parallel_for() is lowered to the following sequence:
14061464
// 1) Call a kernel that a) call user's lambda function and b) performs
14071465
// one iteration of reduction, storing the partial reductions/sums

sycl/include/CL/sycl/info/device_traits.def

Lines changed: 1 addition & 0 deletions

@@ -22,6 +22,7 @@ __SYCL_PARAM_TRAITS_SPEC(device, max_clock_frequency, pi_uint32)
 __SYCL_PARAM_TRAITS_SPEC(device, address_bits, pi_uint32)
 __SYCL_PARAM_TRAITS_SPEC(device, max_mem_alloc_size, pi_uint64)
 __SYCL_PARAM_TRAITS_SPEC(device, image_support, bool)
+__SYCL_PARAM_TRAITS_SPEC(device, atomic64, bool)
 __SYCL_PARAM_TRAITS_SPEC(device, max_read_image_args, pi_uint32)
 __SYCL_PARAM_TRAITS_SPEC(device, max_write_image_args, pi_uint32)
 __SYCL_PARAM_TRAITS_SPEC(device, image2d_max_width, size_t)

sycl/include/CL/sycl/info/info_desc.hpp

Lines changed: 2 additions & 1 deletion

@@ -144,7 +144,8 @@ enum class device : cl_device_info {
   ext_intel_gpu_eu_count_per_subslice =
       PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE,
   ext_intel_max_mem_bandwidth = PI_DEVICE_INFO_MAX_MEM_BANDWIDTH,
-  ext_intel_mem_channel = PI_MEM_PROPERTIES_CHANNEL
+  ext_intel_mem_channel = PI_MEM_PROPERTIES_CHANNEL,
+  atomic64 = PI_DEVICE_INFO_ATOMIC_64
 };
 
 enum class device_type : pi_uint64 {
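With the matching entry in device_traits.def above, the descriptor can also be queried directly; a one-line sketch (D is a sycl::device):

bool HasAtomic64 = D.get_info<sycl::info::device::atomic64>();
// Per the get_device_info specialization added in device_info.hpp below,
// this returns false on backends that fail the PI_DEVICE_INFO_ATOMIC_64 query.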

sycl/plugins/cuda/pi_cuda.cpp

Lines changed: 18 additions & 1 deletion

@@ -985,6 +985,19 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
     bool ifp = (major >= 7);
     return getInfo(param_value_size, param_value, param_value_size_ret, ifp);
   }
+
+  case PI_DEVICE_INFO_ATOMIC_64: {
+    int major = 0;
+    cl::sycl::detail::pi::assertion(
+        cuDeviceGetAttribute(&major,
+                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
+                             device->get()) == CUDA_SUCCESS);
+
+    bool atomic64 = (major >= 6) ? true : false;
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   atomic64);
+  }
+
   case PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: {
     // NVIDIA devices only support one sub-group size (the warp size)
     int warpSize = 0;
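The gate mirrors CUDA's rule that fp64 atomicAdd requires compute capability 6.0 (sm_60) or newer. For reference, the same check in a standalone driver-API program might look like this sketch (hypothetical helper name; error handling reduced to an assertion):

#include <cassert>
#include <cuda.h>

bool deviceHasAtomic64(CUdevice Dev) {
  int Major = 0;
  // CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR yields the SM major version.
  CUresult Err = cuDeviceGetAttribute(
      &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Dev);
  assert(Err == CUDA_SUCCESS);
  (void)Err; // silence unused-variable warnings in release builds
  return Major >= 6; // fp64 atomicAdd needs sm_60+
}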
@@ -1362,7 +1375,11 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
     return getInfo(param_value_size, param_value, param_value_size_ret, "");
   }
   case PI_DEVICE_INFO_EXTENSIONS: {
-    return getInfo(param_value_size, param_value, param_value_size_ret, "");
+    std::string SupportedExtensions = "cl_khr_fp64 ";
+
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   SupportedExtensions.c_str());
   }
   case PI_DEVICE_INFO_PRINTF_BUFFER_SIZE: {
     // The minimum value for the FULL profile is 1 MB.

sycl/plugins/opencl/pi_opencl.cpp

Lines changed: 1 addition & 0 deletions

@@ -178,6 +178,7 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName,
   case PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE:
   case PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE:
   case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH:
+  case PI_DEVICE_INFO_ATOMIC_64:
     return PI_INVALID_VALUE;
 
   default:

sycl/source/detail/device_impl.cpp

Lines changed: 2 additions & 0 deletions

@@ -238,6 +238,8 @@ bool device_impl::has(aspect Aspect) const {
     return has_extension("cl_khr_int64_base_atomics");
   case aspect::int64_extended_atomics:
     return has_extension("cl_khr_int64_extended_atomics");
+  case aspect::atomic64:
+    return get_info<info::device::atomic64>();
   case aspect::image:
     return get_info<info::device::image_support>();
   case aspect::online_compiler:

sycl/source/detail/device_info.hpp

Lines changed: 21 additions & 0 deletions

@@ -232,6 +232,23 @@ template <> struct get_device_info<bool, info::device::queue_profiling> {
   }
 };
 
+// Specialization for atomic64 that is necessary because
+// PI_DEVICE_INFO_ATOMIC_64 is currently only implemented for the cuda backend.
+template <> struct get_device_info<bool, info::device::atomic64> {
+  static bool get(RT::PiDevice dev, const plugin &Plugin) {
+    bool result = false;
+
+    RT::PiResult Err = Plugin.call_nocheck<PiApiKind::piDeviceGetInfo>(
+        dev, pi::cast<RT::PiDeviceInfo>(info::device::atomic64),
+        sizeof(result), &result, nullptr);
+    if (Err != PI_SUCCESS) {
+      return false;
+    }
+    return result;
+  }
+};
+
 // Specialization for exec_capabilities, OpenCL returns a bitfield
 template <>
 struct get_device_info<std::vector<info::execution_capability>,

@@ -610,6 +627,10 @@ template <> inline bool get_device_info_host<info::device::image_support>() {
   return true;
 }
 
+template <> inline bool get_device_info_host<info::device::atomic64>() {
+  return false;
+}
+
 template <>
 inline cl_uint get_device_info_host<info::device::max_read_image_args>() {
   // current value is the required minimum

sycl/test/abi/sycl_symbols_linux.dump

Lines changed: 1 addition & 0 deletions

(The new symbol is the get_info<info::device::atomic64>() instantiation; 65808 is the decimal form of PI_DEVICE_INFO_ATOMIC_64, 0x10110.)

@@ -4127,6 +4127,7 @@
 _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65571EEENS3_12param_traitsIS4_XT_EE11return_typeEv
 _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65572EEENS3_12param_traitsIS4_XT_EE11return_typeEv
 _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65573EEENS3_12param_traitsIS4_XT_EE11return_typeEv
 _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65574EEENS3_12param_traitsIS4_XT_EE11return_typeEv
+_ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65808EEENS3_12param_traitsIS4_XT_EE11return_typeEv
 _ZNK2cl4sycl6device9getNativeEv
 _ZNK2cl4sycl6kernel11get_contextEv
 _ZNK2cl4sycl6kernel11get_programEv

sycl/test/on-device/basic_tests/aspects.cpp

Lines changed: 3 additions & 0 deletions

@@ -57,6 +57,9 @@ int main() {
   if (plt.has(aspect::int64_extended_atomics)) {
     std::cout << "  extended atomic operations" << std::endl;
   }
+  if (plt.has(aspect::atomic64)) {
+    std::cout << "  atomic64" << std::endl;
+  }
   if (plt.has(aspect::image)) {
     std::cout << "  images" << std::endl;
   }
