Skip to content

Commit f241d2b

Browse files
committed
[SYCL] Additional mostly NFC changes for reduction patch(1585)
Removed handler::dissociateWithHandler() Removed handler::addEventToQueue() and made queue_impl::addEvent() private again; Minor changes in comments. Replaced 'auto' with 'size_t' in couple places. Signed-off-by: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
1 parent 47db644 commit f241d2b

File tree

5 files changed

+39
-80
lines changed

5 files changed

+39
-80
lines changed

sycl/include/CL/sycl/handler.hpp

Lines changed: 7 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -213,13 +213,6 @@ class __SYCL_EXPORT handler {
213213
/// usage in finalize() method.
214214
void saveCodeLoc(detail::code_location CodeLoc) { MCodeLoc = CodeLoc; }
215215

216-
/// Stores the given \param Event to the \param Queue.
217-
/// Even though MQueue is a field of handler, the method addEvent() of
218-
/// queue_impl class cannot be called inside this handler.hpp file
219-
/// as queue_impl is incomplete class for handler.
220-
static void addEventToQueue(shared_ptr_class<detail::queue_impl> Queue,
221-
cl::sycl::event Event);
222-
223216
/// Constructs CG object of specific type, passes it to Scheduler and
224217
/// returns sycl::event object representing the command group.
225218
/// It's expected that the method is the latest method executed before
@@ -270,30 +263,6 @@ class __SYCL_EXPORT handler {
270263
/*index*/ 0);
271264
}
272265

273-
template <typename DataT, int Dims, access::mode AccessMode,
274-
access::target AccessTarget>
275-
void dissociateWithHandler(accessor<DataT, Dims, AccessMode, AccessTarget,
276-
access::placeholder::false_t>
277-
Acc) {
278-
detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Acc;
279-
detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
280-
detail::Requirement *Req = AccImpl.get();
281-
282-
// Remove accessor from the list of requirements, accessors storage,
283-
// and from the list of associated accessors.
284-
auto ReqIt = std::find(MRequirements.begin(), MRequirements.end(), Req);
285-
auto AccIt = std::find(MAccStorage.begin(), MAccStorage.end(), AccImpl);
286-
auto It =
287-
std::find_if(MAssociatedAccesors.begin(), MAssociatedAccesors.end(),
288-
[Req](const detail::ArgDesc &D) { return D.MPtr == Req; });
289-
assert((ReqIt != MRequirements.end() && AccIt != MAccStorage.end() &&
290-
It != MAssociatedAccesors.end()) &&
291-
"Cannot dissociate accessor.");
292-
MRequirements.erase(ReqIt);
293-
MAccStorage.erase(AccIt);
294-
MAssociatedAccesors.erase(It);
295-
}
296-
297266
// Recursively calls itself until arguments pack is fully processed.
298267
// The version for regular(standard layout) argument.
299268
template <typename T, typename... Ts>
@@ -832,30 +801,23 @@ class __SYCL_EXPORT handler {
832801
// necessary to reduce all partial sums into one final sum.
833802

834803
// 1. Call the kernel that includes user's lambda function.
835-
// If this kernel is going to be now last one, i.e. it does not write
836-
// to user's accessor, then detach user's accessor from this kernel
837-
// to make the dependencies between accessors and kernels more clean and
838-
// correct.
839-
if (NWorkGroups > 1)
840-
dissociateWithHandler(Redu.MAcc);
841-
842804
intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
843805
auto QueueCopy = MQueue;
844806
MLastEvent = this->finalize();
845807

846808
// 2. Run the additional aux kernel as many times as needed to reduce
847809
// all partial sums into one scalar.
810+
811+
// TODO: user's nd_range and the work-group size specified there must
812+
// be honored only for the main kernel that calls user's lambda functions.
813+
// There is no need in using the same work-group size in these additional
814+
// kernels. Thus, the better strategy here is to make the work-group size
815+
// as big as possible to converge/reduce the partial sums into the last
816+
// sum faster.
848817
size_t WGSize = Range.get_local_range().size();
849818
size_t NWorkItems = NWorkGroups;
850819
size_t KernelRun = 1;
851820
while (NWorkItems > 1) {
852-
// Before creating another kernel, add the event from the previous kernel
853-
// to queue.
854-
addEventToQueue(QueueCopy, MLastEvent);
855-
856-
// TODO: here the work-group size is not limited by user's needs,
857-
// the better strategy here is to make the work-group-size as big
858-
// as possible.
859821
WGSize = std::min(WGSize, NWorkItems);
860822
NWorkGroups = NWorkItems / WGSize;
861823
// The last group may be not fully loaded. Still register it as a group.

sycl/include/CL/sycl/intel/reduction.hpp

Lines changed: 27 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -412,19 +412,21 @@ struct get_reduction_aux_2nd_kernel_name_t {
412412
///
413413
/// Briefly: user's lambda, tree-reduction, CUSTOM types/ops.
414414
template <typename KernelName, typename KernelType, int Dims, class Reduction>
415-
void reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
416-
Reduction &Redu) {
415+
void reduCGFunc(handler &CGH, KernelType KernelFunc,
416+
const nd_range<Dims> &Range, Reduction &Redu) {
417417

418418
size_t NWorkItems = Range.get_global_range().size();
419419
size_t WGSize = Range.get_local_range().size();
420420
size_t NWorkGroups = Range.get_group_range().size();
421421

422-
bool IsUnderLoaded = (NWorkGroups * WGSize - NWorkItems) != 0;
423-
bool IsEfficientCase = !IsUnderLoaded && ((WGSize & (WGSize - 1)) == 0);
422+
// The last work-group may be not fully loaded with work, or the work group
423+
// size may be not power of two. Those two cases considered inefficient
424+
// as they require additional code and checks in the kernel.
425+
bool HasNonUniformWG = (NWorkGroups * WGSize - NWorkItems) != 0;
426+
bool IsEfficientCase = !HasNonUniformWG && ((WGSize & (WGSize - 1)) == 0);
424427

425428
bool IsUpdateOfUserAcc =
426-
Reduction::accessor_mode == access::mode::read_write &&
427-
NWorkGroups == 1;
429+
Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
428430

429431
// Use local memory to reduce elements in work-groups into 0-th element.
430432
// If WGSize is not power of two, then WGSize+1 elements are allocated.
@@ -436,8 +438,7 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range
436438
auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, 0, CGH);
437439
auto ReduIdentity = Redu.getIdentity();
438440
if (IsEfficientCase) {
439-
// Efficient case: work-groups are fully loaded and work-group size
440-
// is power of two.
441+
// Efficient case: work-groups are uniform and WGSize is is power of two.
441442
CGH.parallel_for<KernelName>(Range, [=](nd_item<Dims> NDIt) {
442443
// Call user's functions. Reducer.MValue gets initialized there.
443444
typename Reduction::reducer_type Reducer(ReduIdentity);
@@ -464,13 +465,14 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range
464465
: LocalReds[0];
465466
});
466467
} else {
467-
// Inefficient case: work-groups are not fully loaded
468-
// or WGSize is not power of two.
468+
// Inefficient case: work-groups are non uniform or WGSize is not power
469+
// of two, which requires more conditional, read and write operations.
469470
// These two inefficient cases are handled by one kernel, which
470471
// can be split later into two separate kernels, if there are users who
471472
// really need more efficient code for them.
472-
using AuxName = typename get_reduction_main_2nd_kernel_name_t<
473-
KernelName, KernelType>::name;
473+
using AuxName =
474+
typename get_reduction_main_2nd_kernel_name_t<KernelName,
475+
KernelType>::name;
474476
CGH.parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
475477
// Call user's functions. Reducer.MValue gets initialized there.
476478
typename Reduction::reducer_type Reducer(ReduIdentity);
@@ -500,7 +502,7 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range
500502

501503
// Compute the partial sum/reduction for the work-group.
502504
if (LID == 0) {
503-
auto GrID = NDIt.get_group_linear_id();
505+
size_t GrID = NDIt.get_group_linear_id();
504506
auto V = BOp(LocalReds[0], LocalReds[WGSize]);
505507
Out.get_pointer().get()[GrID] =
506508
IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
@@ -518,19 +520,18 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range
518520
/// Briefly: aux kernel, tree-reduction, CUSTOM types/ops.
519521
template <typename KernelName, typename KernelType, int Dims, class Reduction>
520522
void reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
521-
size_t KernelRun, Reduction &Redu) {
523+
size_t KernelRun, Reduction &Redu) {
522524
size_t WGSize = Range.get_local_range().size();
523525
size_t NWorkGroups = Range.get_group_range().size();
524526

525527
// The last work-group may be not fully loaded with work, or the work group
526-
// size may be not power of those. Those two cases considered inefficient
528+
// size may be not power of two. Those two cases considered inefficient
527529
// as they require additional code and checks in the kernel.
528-
bool IsUnderLoaded = NWorkGroups * WGSize != NWorkItems;
529-
bool IsEfficientCase = !IsUnderLoaded && (WGSize & (WGSize - 1)) == 0;
530+
bool HasNonUniformWG = NWorkGroups * WGSize != NWorkItems;
531+
bool IsEfficientCase = !HasNonUniformWG && (WGSize & (WGSize - 1)) == 0;
530532

531533
bool IsUpdateOfUserAcc =
532-
Reduction::accessor_mode == access::mode::read_write &&
533-
NWorkGroups == 1;
534+
Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
534535

535536
// Use local memory to reduce elements in work-groups into 0-th element.
536537
// If WGSize is not power of two, then WGSize+1 elements are allocated.
@@ -549,8 +550,9 @@ void reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
549550
if (IsEfficientCase) {
550551
// Efficient case: work-groups are fully loaded and work-group size
551552
// is power of two.
552-
using AuxName = typename get_reduction_aux_1st_kernel_name_t<
553-
KernelName, KernelType>::name;
553+
using AuxName =
554+
typename get_reduction_aux_1st_kernel_name_t<KernelName,
555+
KernelType>::name;
554556
CGH.parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
555557
// Copy the element to local memory to prepare it for tree-reduction.
556558
size_t LID = NDIt.get_local_linear_id();
@@ -579,8 +581,9 @@ void reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
579581
// These two inefficient cases are handled by one kernel, which
580582
// can be split later into two separate kernels, if there are users
581583
// who really need more efficient code for them.
582-
using AuxName = typename get_reduction_aux_2nd_kernel_name_t<
583-
KernelName, KernelType>::name;
584+
using AuxName =
585+
typename get_reduction_aux_2nd_kernel_name_t<KernelName,
586+
KernelType>::name;
584587
auto ReduIdentity = Redu.getIdentity();
585588
CGH.parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
586589
size_t WGSize = NDIt.get_local_range().size();
@@ -607,7 +610,7 @@ void reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
607610

608611
// Compute the partial sum/reduction for the work-group.
609612
if (LID == 0) {
610-
auto GrID = NDIt.get_group_linear_id();
613+
size_t GrID = NDIt.get_group_linear_id();
611614
auto V = BOp(LocalReds[0], LocalReds[WGSize]);
612615
Out.get_pointer().get()[GrID] =
613616
IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;

sycl/source/detail/queue_impl.hpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -351,11 +351,6 @@ class queue_impl {
351351
/// \return a native handle.
352352
pi_native_handle getNative() const;
353353

354-
/// Stores an event that should be associated with the queue
355-
///
356-
/// \param Event is the event to be stored
357-
void addEvent(event Event);
358-
359354
private:
360355
/// Performs command group submission to the queue.
361356
///
@@ -388,6 +383,11 @@ class queue_impl {
388383
/// \param Event is the event to be stored
389384
void addUSMEvent(event Event);
390385

386+
/// Stores an event that should be associated with the queue
387+
///
388+
/// \param Event is the event to be stored
389+
void addEvent(event Event);
390+
391391
/// Protects all the fields that can be changed by class' methods.
392392
mutex_class MMutex;
393393

sycl/source/handler.cpp

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,6 @@
1919
__SYCL_INLINE_NAMESPACE(cl) {
2020
namespace sycl {
2121

22-
void handler::addEventToQueue(shared_ptr_class<detail::queue_impl> Queue,
23-
cl::sycl::event Event) {
24-
Queue->addEvent(std::move(Event));
25-
}
26-
2722
event handler::finalize() {
2823
// This block of code is needed only for reduction implementation.
2924
// It is harmless (does nothing) for everything else.

sycl/test/abi/sycl_symbols_linux.dump

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3231,7 +3231,6 @@ _ZN2cl4sycl7handler10processArgEPvRKNS0_6detail19kernel_param_kind_tEimRmb
32313231
_ZN2cl4sycl7handler13getKernelNameB5cxx11Ev
32323232
_ZN2cl4sycl7handler18extractArgsAndReqsEv
32333233
_ZN2cl4sycl7handler28extractArgsAndReqsFromLambdaEPcmPKNS0_6detail19kernel_param_desc_tE
3234-
_ZN2cl4sycl7handler15addEventToQueueESt10shared_ptrINS0_6detail10queue_implEENS0_5eventE
32353234
_ZN2cl4sycl7handler8finalizeEv
32363235
_ZN2cl4sycl7program17build_with_sourceENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_
32373236
_ZN2cl4sycl7program19compile_with_sourceENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_

0 commit comments

Comments
 (0)