[SYCL] Implement USM vars and placeholder accessors passed to reduction #1657

Merged: 6 commits, May 18, 2020
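For context, here is a hypothetical usage sketch (not taken from this PR) of the two reduction flavours the patch teaches handler.hpp to accept: a result written to a USM variable and a result written through a placeholder accessor bound with handler::require(). The helpers intel::reduction and intel::plus follow the oneAPI reduction extension as it looked around this time; the queue setup, sizes and kernel names are invented.

```cpp
#include <CL/sycl.hpp>
using namespace cl::sycl;

int main() {
  queue Q;
  constexpr size_t N = 1024;

  // 1. Result written to a USM variable.
  int *Sum = malloc_shared<int>(1, Q);
  *Sum = 0;
  Q.submit([&](handler &CGH) {
    CGH.parallel_for<class usm_sum>(
        nd_range<1>{range<1>{N}, range<1>{64}},
        intel::reduction(Sum, intel::plus<int>()),
        [=](nd_item<1> It, auto &Red) { Red += (int)It.get_global_id(0); });
  });

  // 2. Result written through a placeholder accessor that is bound to the
  //    command group with handler::require().
  int HostSum = 0;
  buffer<int, 1> SumBuf{&HostSum, range<1>{1}};
  accessor<int, 1, access::mode::read_write, access::target::global_buffer,
           access::placeholder::true_t>
      PAcc{SumBuf};
  Q.submit([&](handler &CGH) {
    CGH.require(PAcc);
    CGH.parallel_for<class acc_sum>(
        nd_range<1>{range<1>{N}, range<1>{64}},
        intel::reduction(PAcc, intel::plus<int>()),
        [=](nd_item<1> It, auto &Red) { Red += 1; });
  });

  Q.wait();
  free(Sum, Q);
}
```

In both cases the parallel_for overloads changed below receive the reduction object and arrange the extra association and copy-back work themselves.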
80 changes: 46 additions & 34 deletions sycl/include/CL/sycl/handler.hpp
@@ -111,8 +111,8 @@ __SYCL_EXPORT device getDeviceFromHandler(handler &);

namespace intel {
namespace detail {
-template <typename T, class BinaryOperation, int Dims, access::mode AccMode,
-access::placeholder IsPlaceholder>
+template <typename T, class BinaryOperation, int Dims, bool IsUSM,
+access::mode AccMode, access::placeholder IsPlaceholder>
class reduction_impl;

using cl::sycl::detail::enable_if_t;
@@ -140,12 +140,12 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
-size_t KernelRun, Reduction &Redu);
+Reduction &Redu);

template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
-size_t KernelRun, Reduction &Redu);
+Reduction &Redu);
} // namespace detail
} // namespace intel

@@ -266,11 +266,9 @@ class __SYCL_EXPORT handler {

bool is_host() { return MIsHost; }

-template <typename DataT, int Dims, access::mode AccessMode,
-access::target AccessTarget>
-void associateWithHandler(accessor<DataT, Dims, AccessMode, AccessTarget,
-access::placeholder::false_t>
-Acc) {
+template <typename T, int Dims, access::mode AccMode,
+access::target AccTarget, access::placeholder IsPH>
+void associateWithHandler(accessor<T, Dims, AccMode, AccTarget, IsPH> Acc) {
detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Acc;
detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
detail::Requirement *Req = AccImpl.get();
@@ -281,7 +279,7 @@
// Add an accessor to the handler list of associated accessors.
// For associated accessors index does not means nothing.
MAssociatedAccesors.emplace_back(detail::kernel_param_kind_t::kind_accessor,
-Req, static_cast<int>(AccessTarget),
+Req, static_cast<int>(AccTarget),
/*index*/ 0);
}

@@ -692,18 +690,7 @@ class __SYCL_EXPORT handler {
void
require(accessor<DataT, Dims, AccMode, AccTarget, access::placeholder::true_t>
Acc) {
-detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Acc;
-detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
-detail::Requirement *Req = AccImpl.get();
-// Add accessor to the list of requirements.
-MRequirements.push_back(Req);
-// Store copy of the accessor.
-MAccStorage.push_back(std::move(AccImpl));
-// Add an accessor to the handler list of associated accessors.
-// For associated accessors index does not means nothing.
-MAssociatedAccesors.emplace_back(detail::kernel_param_kind_t::kind_accessor,
-Req, static_cast<int>(AccTarget),
-/*index*/ 0);
+associateWithHandler(Acc);
}

/// Registers event dependencies on this command group.
@@ -867,8 +854,22 @@ class __SYCL_EXPORT handler {
detail::enable_if_t<Reduction::accessor_mode == access::mode::read_write &&
Reduction::has_fast_atomics>
parallel_for(nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
-intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
-Redu.MAcc);
+if (Reduction::is_usm)
+Redu.associateWithHandler(*this);
+shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
+auto Acc = Redu.getUserAccessor();
+intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu, Acc);
+
+// Submit non-blocking copy from reduction accessor to user's reduction
+// variable.
+if (Reduction::is_usm) {
+this->finalize();
+handler CopyHandler(QueueCopy, MIsHost);
+CopyHandler.saveCodeLoc(MCodeLoc);
+Redu.associateWithHandler(CopyHandler);
+CopyHandler.copy(Acc, Redu.getUSMPointer());
+MLastEvent = CopyHandler.finalize();
+}
}

/// Implements parallel_for() accepting nd_range and 1 reduction variable
@@ -886,7 +887,7 @@
detail::enable_if_t<Reduction::accessor_mode == access::mode::discard_write &&
Reduction::has_fast_atomics>
parallel_for(nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
-auto QueueCopy = MQueue;
+shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
auto RWAcc = Redu.getReadWriteScalarAcc(*this);
intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
RWAcc);
@@ -896,7 +897,8 @@
handler CopyHandler(QueueCopy, MIsHost);
CopyHandler.saveCodeLoc(MCodeLoc);
CopyHandler.associateWithHandler(RWAcc);
-CopyHandler.copy(RWAcc, Redu.MAcc);
+Redu.associateWithHandler(CopyHandler);
+CopyHandler.copy(RWAcc, Redu.getUserAccessor());
MLastEvent = CopyHandler.finalize();
}

@@ -935,8 +937,10 @@ class __SYCL_EXPORT handler {
// necessary to reduce all partial sums into one final sum.

// 1. Call the kernel that includes user's lambda function.
+if (Reduction::is_usm && NWorkGroups == 1)
+Redu.associateWithHandler(*this);
intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
-auto QueueCopy = MQueue;
+shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
this->finalize();

// 2. Run the additional aux kernel as many times as needed to reduce
@@ -950,7 +954,6 @@
// sum faster.
size_t WGSize = Range.get_local_range().size();
size_t NWorkItems = NWorkGroups;
-size_t KernelRun = 1;
while (NWorkItems > 1) {
WGSize = std::min(WGSize, NWorkItems);
NWorkGroups = NWorkItems / WGSize;
@@ -965,14 +968,23 @@
// The last kernel DOES write to reduction's accessor.
// Associate it with handler manually.
if (NWorkGroups == 1)
-AuxHandler.associateWithHandler(Redu.MAcc);
-intel::detail::reduAuxCGFunc<KernelName, KernelType>(
-AuxHandler, Range, NWorkItems, KernelRun, Redu);
+Redu.associateWithHandler(AuxHandler);
+intel::detail::reduAuxCGFunc<KernelName, KernelType>(AuxHandler, Range,
+NWorkItems, Redu);
MLastEvent = AuxHandler.finalize();

NWorkItems = NWorkGroups;
-++KernelRun;
} // end while (NWorkItems > 1)

+// Submit non-blocking copy from reduction accessor to user's reduction
+// variable.
+if (Reduction::is_usm) {
+handler CopyHandler(QueueCopy, MIsHost);
+CopyHandler.saveCodeLoc(MCodeLoc);
+Redu.associateWithHandler(CopyHandler);
+CopyHandler.copy(Redu.getUserAccessor(), Redu.getUSMPointer());
+MLastEvent = CopyHandler.finalize();
+}
}

/// Hierarchical kernel invocation method of a kernel defined as a lambda
@@ -1614,8 +1626,8 @@ class __SYCL_EXPORT handler {
friend class detail::stream_impl;
// Make reduction_impl friend to store buffers and arrays created for it
// in handler from reduction_impl methods.
-template <typename T, class BinaryOperation, int Dims, access::mode AccMode,
-access::placeholder IsPlaceholder>
+template <typename T, class BinaryOperation, int Dims, bool IsUSM,
+access::mode AccMode, access::placeholder IsPlaceholder>
friend class intel::detail::reduction_impl;
};
} // namespace sycl
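The two Reduction::is_usm branches in the diff follow the same submit-then-copy pattern: the reduction kernels write their final value into an internal accessor, and a separate command group then copies that value to the user's USM pointer with handler::copy(). Below is a rough standalone illustration of that idea at the application level, assuming only the standard SYCL buffer/accessor and malloc_shared APIs; the naive single_task stands in for the real reduction kernels, and only the structure matches the patch.

```cpp
#include <CL/sycl.hpp>
#include <iostream>
#include <numeric>
#include <vector>
using namespace cl::sycl;

int main() {
  queue Q;
  std::vector<int> Data(1024);
  std::iota(Data.begin(), Data.end(), 0);

  int *USMResult = malloc_shared<int>(1, Q);
  {
    buffer<int, 1> In{Data.data(), range<1>{Data.size()}};
    buffer<int, 1> Result{range<1>{1}}; // internal storage for the result

    // 1. Kernel leaves the reduced value in Result[0].
    Q.submit([&](handler &CGH) {
      auto InAcc = In.get_access<access::mode::read>(CGH);
      auto Out = Result.get_access<access::mode::discard_write>(CGH);
      CGH.single_task<class naive_sum>([=] {
        int S = 0;
        for (size_t I = 0; I < InAcc.get_count(); ++I)
          S += InAcc[I];
        Out[0] = S;
      });
    });

    // 2. Separate command group copies accessor -> USM pointer, mirroring the
    //    CopyHandler.copy(Acc, Redu.getUSMPointer()) step in the patch; the
    //    chain stays asynchronous until the explicit wait below.
    Q.submit([&](handler &CGH) {
      auto Res = Result.get_access<access::mode::read>(CGH);
      CGH.copy(Res, USMResult);
    });

    Q.wait();
  }
  std::cout << "sum = " << *USMResult << "\n";
  free(USMResult, Q);
}
```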
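For the no-fast-atomics path, the while loop above keeps launching the auxiliary kernel until a single partial sum remains. The following host-side sketch shows how the pass sizes shrink; it mirrors only the lines visible in the hunk (the hidden lines handle work-group sizing corner cases), and the numbers are illustrative, chosen to divide evenly.

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  std::size_t WGSize = 256;      // work-group size used by the aux kernels
  std::size_t NWorkItems = 8192; // partial sums left by the first kernel

  int Pass = 0;
  while (NWorkItems > 1) {
    WGSize = std::min(WGSize, NWorkItems);
    std::size_t NWorkGroups = NWorkItems / WGSize;
    std::printf("pass %d: %zu partial sums -> %zu partial sums\n", ++Pass,
                NWorkItems, NWorkGroups);
    NWorkItems = NWorkGroups; // next pass reduces the new partial sums
  }
}
```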