Skip to content

[SYCL] Fix reductions in preview after https://github.com/intel/llvm/pull/18767 #18898

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 11 additions & 73 deletions sycl/include/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,11 +187,12 @@ class type_erased_cgfo_ty {

public:
template <class T>
type_erased_cgfo_ty(T &f)
// NOTE: Even if `T` is a pointer to a function, `&f` is a pointer to a
type_erased_cgfo_ty(T &&f)
// NOTE: Even if `f` is a pointer to a function, `&f` is a pointer to a
// pointer to a function and as such can be casted to `void *` (pointer to
// a function cannot be casted).
: object(static_cast<const void *>(&f)), invoker_f(&invoker<T>::call) {}
: object(static_cast<const void *>(&f)),
invoker_f(&invoker<std::remove_reference_t<T>>::call) {}
~type_erased_cgfo_ty() = default;

type_erased_cgfo_ty(const type_erased_cgfo_ty &) = delete;
Expand Down Expand Up @@ -3878,14 +3879,6 @@ class HandlerAccess {
Handler.parallel_for_impl(Range, Props, Kernel);
}

template <typename T, typename> struct dependent {
using type = T;
};
template <typename T>
using dependent_queue_t = typename dependent<queue, T>::type;
template <typename T>
using dependent_handler_t = typename dependent<handler, T>::type;

// pre/postProcess are used only for reductions right now, but the
// abstractions they provide aren't reduction-specific. The main problem they
// solve is
Expand All @@ -3901,71 +3894,16 @@ class HandlerAccess {
// inside control group function object (lambda above) so we resort to a
// somewhat hacky way of creating multiple `handler`s and manual finalization
// of them (instead of the one in `queue::submit`).
//
// Overloads with `queue &q` are provided in case the caller has it created
// already to avoid unnecessary reference count increments associated with
// `handler::getQueue()`.
template <class FunctorTy>
static void preProcess(handler &CGH, dependent_queue_t<FunctorTy> &q,
FunctorTy Func) {
bool EventNeeded = !q.is_in_order();
handler AuxHandler(getSyclObjImpl(q), EventNeeded);
AuxHandler.copyCodeLoc(CGH);
std::forward<FunctorTy>(Func)(AuxHandler);
auto E = AuxHandler.finalize();
assert(!CGH.MIsFinalized &&
"Can't do pre-processing if the command has been enqueued already!");
if (EventNeeded)
CGH.depends_on(E);
}
__SYCL_EXPORT static void preProcess(handler &CGH, type_erased_cgfo_ty F);
__SYCL_EXPORT static void postProcess(handler &CGH, type_erased_cgfo_ty F);

template <class FunctorTy>
static void preProcess(dependent_handler_t<FunctorTy> &CGH,
FunctorTy &&Func) {
preProcess(CGH, CGH.getQueue(), std::forward<FunctorTy>(Func));
static void preProcess(handler &CGH, FunctorTy &Func) {
preProcess(CGH, type_erased_cgfo_ty{Func});
}
template <class FunctorTy>
static void postProcess(dependent_handler_t<FunctorTy> &CGH,
FunctorTy &&Func) {
// The "hacky" `handler`s manipulation mentioned above and implemented here
// is far from perfect. A better approach would be
//
// bool OrigNeedsEvent = CGH.needsEvent()
// assert(CGH.not_finalized/enqueued());
// if (!InOrderQueue)
// CGH.setNeedsEvent()
//
// handler PostProcessHandler(Queue, OrigNeedsEvent)
// auto E = CGH.finalize(); // enqueue original or current last
// // post-process
// if (!InOrder)
// PostProcessHandler.depends_on(E)
//
// swap_impls(CGH, PostProcessHandler)
// return; // queue::submit finalizes PostProcessHandler and returns its
// // event if necessary.
//
// Still hackier than "real" `queue::submit` but at least somewhat sane.
// That, however hasn't been tried yet and we have an even hackier approach
// copied from what's been done in an old reductions implementation before
// eventless submission work has started. Not sure how feasible the approach
// above is at this moment.

// This `finalize` is wrong (at least logically) if
// `assert(!CGH.eventNeeded())`
auto E = CGH.finalize();
dependent_queue_t<FunctorTy> Queue = CGH.getQueue();
bool InOrder = Queue.is_in_order();
// Cannot use `CGH.eventNeeded()` alone as there might be subsequent
// `postProcess` calls and we cannot address them properly similarly to the
// `finalize` issue described above. `swap_impls` suggested above might be
// able to handle this scenario naturally.
handler AuxHandler(getSyclObjImpl(Queue), CGH.eventNeeded() || !InOrder);
if (!InOrder)
AuxHandler.depends_on(E);
AuxHandler.copyCodeLoc(CGH);
std::forward<FunctorTy>(Func)(AuxHandler);
CGH.MLastEvent = AuxHandler.finalize();
return;
static void postProcess(handler &CGH, FunctorTy &Func) {
postProcess(CGH, type_erased_cgfo_ty{Func});
}
};
} // namespace detail
Expand Down
2 changes: 1 addition & 1 deletion sycl/include/sycl/reduction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1066,7 +1066,7 @@ class reduction_impl_algo {
std::shared_ptr<int> Counter(malloc_device<int>(1, q), Deleter);
CGH.addReduction(Counter);

HandlerAccess::preProcess(CGH, q,
HandlerAccess::preProcess(CGH,
[Counter = Counter.get()](handler &AuxHandler) {
AuxHandler.memset(Counter, 0, sizeof(int));
});
Expand Down
68 changes: 68 additions & 0 deletions sycl/source/handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2403,5 +2403,73 @@ void handler::copyCodeLoc(const handler &other) {
queue handler::getQueue() {
return createSyclObjFromImpl<queue>(impl->get_queue());
}
namespace detail {
__SYCL_EXPORT void HandlerAccess::preProcess(handler &CGH,
type_erased_cgfo_ty F) {
queue_impl &Q = CGH.impl->get_queue();
bool EventNeeded = !Q.isInOrder();
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
handler_impl HandlerImpl{Q, nullptr, EventNeeded};
handler AuxHandler{HandlerImpl};
#else
handler AuxHandler{Q.shared_from_this(), EventNeeded};
#endif
AuxHandler.copyCodeLoc(CGH);
F(AuxHandler);
auto E = AuxHandler.finalize();
assert(!CGH.MIsFinalized &&
"Can't do pre-processing if the command has been enqueued already!");
if (EventNeeded)
CGH.depends_on(E);
}
__SYCL_EXPORT void HandlerAccess::postProcess(handler &CGH,
type_erased_cgfo_ty F) {
// The "hacky" `handler`s manipulation mentioned near the declaration in
// `handler.hpp` and implemented here is far from perfect. A better approach
// would be
//
// bool OrigNeedsEvent = CGH.needsEvent()
// assert(CGH.not_finalized/enqueued());
// if (!InOrderQueue)
// CGH.setNeedsEvent()
//
// handler PostProcessHandler(Queue, OrigNeedsEvent)
// auto E = CGH.finalize(); // enqueue original or current last
// // post-process
// if (!InOrder)
// PostProcessHandler.depends_on(E)
//
// swap_impls(CGH, PostProcessHandler)
// return; // queue::submit finalizes PostProcessHandler and returns its
// // event if necessary.
//
// Still hackier than "real" `queue::submit` but at least somewhat sane.
// That, however hasn't been tried yet and we have an even hackier approach
// copied from what's been done in an old reductions implementation before
// eventless submission work has started. Not sure how feasible the approach
// above is at this moment.

// This `finalize` is wrong (at least logically) if
// `assert(!CGH.eventNeeded())`
auto E = CGH.finalize();
queue_impl &Q = CGH.impl->get_queue();
bool InOrder = Q.isInOrder();
// Cannot use `CGH.eventNeeded()` alone as there might be subsequent
// `postProcess` calls and we cannot address them properly similarly to the
// `finalize` issue described above. `swap_impls` suggested above might be
// able to handle this scenario naturally.
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
handler_impl HandlerImpl{Q, nullptr, CGH.eventNeeded() || !InOrder};
handler AuxHandler{HandlerImpl};
#else
handler AuxHandler{Q.shared_from_this(), CGH.eventNeeded() || !InOrder};
#endif
if (!InOrder)
AuxHandler.depends_on(E);
AuxHandler.copyCodeLoc(CGH);
F(AuxHandler);
CGH.MLastEvent = AuxHandler.finalize();
}
} // namespace detail
} // namespace _V1
} // namespace sycl
2 changes: 2 additions & 0 deletions sycl/test/abi/sycl_symbols_linux.dump
Original file line number Diff line number Diff line change
Expand Up @@ -3260,6 +3260,8 @@ _ZN4sycl3_V16detail12buffer_plainC2EmmRKNS0_13property_listESt10unique_ptrINS1_1
_ZN4sycl3_V16detail12compile_implERKNS0_13kernel_bundleILNS0_12bundle_stateE0EEERKSt6vectorINS0_6deviceESaIS8_EERKNS0_13property_listE
_ZN4sycl3_V16detail12isOutOfRangeENS0_3vecIiLi4EEENS0_15addressing_modeENS0_5rangeILi3EEE
_ZN4sycl3_V16detail12make_contextEmRKSt8functionIFvNS0_14exception_listEEENS0_7backendEbRKSt6vectorINS0_6deviceESaISA_EE
_ZN4sycl3_V16detail13HandlerAccess10preProcessERNS0_7handlerENS1_19type_erased_cgfo_tyE
_ZN4sycl3_V16detail13HandlerAccess11postProcessERNS0_7handlerENS1_19type_erased_cgfo_tyE
_ZN4sycl3_V16detail13host_pipe_map3addEPKvPKc
_ZN4sycl3_V16detail13lgamma_r_implENS1_9half_impl4halfEPi
_ZN4sycl3_V16detail13lgamma_r_implEdPi
Expand Down
2 changes: 2 additions & 0 deletions sycl/test/abi/sycl_symbols_windows.dump
Original file line number Diff line number Diff line change
Expand Up @@ -4332,6 +4332,8 @@
?pitched_alloc_device@experimental@oneapi@ext@_V1@sycl@@YAPEAXPEA_KAEBUimage_descriptor@12345@AEBVqueue@45@@Z
?pitched_alloc_device@experimental@oneapi@ext@_V1@sycl@@YAPEAXPEA_K_K1IAEBVdevice@45@AEBVcontext@45@@Z
?pitched_alloc_device@experimental@oneapi@ext@_V1@sycl@@YAPEAXPEA_K_K1IAEBVqueue@45@@Z
?postProcess@HandlerAccess@detail@_V1@sycl@@SAXAEAVhandler@34@Vtype_erased_cgfo_ty@234@@Z
?preProcess@HandlerAccess@detail@_V1@sycl@@SAXAEAVhandler@34@Vtype_erased_cgfo_ty@234@@Z
?prefetch@handler@_V1@sycl@@QEAAXPEBX_K@Z
?prefetch@queue@_V1@sycl@@QEAA?AVevent@23@PEBX_KAEBUcode_location@detail@23@@Z
?prefetch@queue@_V1@sycl@@QEAA?AVevent@23@PEBX_KAEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@AEBUcode_location@detail@23@@Z
Expand Down