Skip to content

Commit 161bef1

Browse files
committed
[WIP] No handler submit
1 parent f2fa176 commit 161bef1

File tree

5 files changed

+267
-10
lines changed

5 files changed

+267
-10
lines changed

sycl/include/sycl/khr/free_function_commands.hpp

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
namespace sycl {
66
inline namespace _V1 {
77

8+
#define __DPCPP_ENABLE_UNFINISHED_KHR_EXTENSIONS
89
#ifdef __DPCPP_ENABLE_UNFINISHED_KHR_EXTENSIONS
910
namespace khr {
1011

@@ -153,27 +154,21 @@ void launch_grouped(const queue &q, range<1> r, range<1> size,
153154
const KernelType &k,
154155
const sycl::detail::code_location &codeLoc =
155156
sycl::detail::code_location::current()) {
156-
submit(
157-
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
158-
codeLoc);
157+
q.parallel_for_no_handler(nd_range<1>(r, size), k);
159158
}
160159
template <typename KernelType>
161160
void launch_grouped(const queue &q, range<2> r, range<2> size,
162161
const KernelType &k,
163162
const sycl::detail::code_location &codeLoc =
164163
sycl::detail::code_location::current()) {
165-
submit(
166-
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
167-
codeLoc);
164+
q.parallel_for_no_handler(nd_range<2>(r, size), k);
168165
}
169166
template <typename KernelType>
170167
void launch_grouped(const queue &q, range<3> r, range<3> size,
171168
const KernelType &k,
172169
const sycl::detail::code_location &codeLoc =
173170
sycl::detail::code_location::current()) {
174-
submit(
175-
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
176-
codeLoc);
171+
q.parallel_for_no_handler(nd_range<3>(r, size), k);
177172
}
178173

179174
template <typename... Args>
@@ -283,7 +278,7 @@ template <typename KernelType>
283278
void launch_task(const sycl::queue &q, const KernelType &k,
284279
const sycl::detail::code_location &codeLoc =
285280
sycl::detail::code_location::current()) {
286-
submit(q, [&](handler &h) { launch_task<KernelType>(h, k); }, codeLoc);
281+
q.single_task_no_handler(k);
287282
}
288283

289284
template <typename... Args>

sycl/include/sycl/queue.hpp

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2784,6 +2784,112 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
27842784
CodeLoc);
27852785
}
27862786

2787+
// no_handler
2788+
2789+
private:
2790+
// NOTE: the name of this function - "kernel_single_task" - is used by the
2791+
// Front End to determine kernel invocation kind.
2792+
template <typename KernelName, typename KernelType, typename... Props>
2793+
#ifdef __SYCL_DEVICE_ONLY__
2794+
[[__sycl_detail__::add_ir_attributes_function(
2795+
"sycl-single-task",
2796+
ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
2797+
nullptr,
2798+
ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
2799+
#endif
2800+
2801+
__SYCL_KERNEL_ATTR__ static void
2802+
kernel_single_task(const KernelType &KernelFunc) {
2803+
#ifdef __SYCL_DEVICE_ONLY__
2804+
KernelFunc();
2805+
#else
2806+
(void)KernelFunc;
2807+
#endif
2808+
}
2809+
2810+
// NOTE: the name of these functions - "kernel_parallel_for" - are used by the
2811+
// Front End to determine kernel invocation kind.
2812+
template <typename KernelName, typename ElementType, typename KernelType,
2813+
typename... Props>
2814+
#ifdef __SYCL_DEVICE_ONLY__
2815+
[[__sycl_detail__::add_ir_attributes_function(
2816+
ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::name...,
2817+
ext::oneapi::experimental::detail::PropertyMetaInfo<Props>::value...)]]
2818+
#endif
2819+
__SYCL_KERNEL_ATTR__ static void
2820+
kernel_parallel_for(const KernelType &KernelFunc) {
2821+
#ifdef __SYCL_DEVICE_ONLY__
2822+
KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()));
2823+
#else
2824+
(void)KernelFunc;
2825+
#endif
2826+
}
2827+
2828+
template <int Dims> static sycl::range<3> padRange(sycl::range<Dims> Range) {
2829+
if constexpr (Dims == 3) {
2830+
return Range;
2831+
} else {
2832+
sycl::range<3> Res{0, 0, 0};
2833+
for (int I = 0; I < Dims; ++I)
2834+
Res[I] = Range[I];
2835+
return Res;
2836+
}
2837+
}
2838+
2839+
template <int Dims> static sycl::id<3> padId(sycl::id<Dims> Id) {
2840+
if constexpr (Dims == 3) {
2841+
return Id;
2842+
} else {
2843+
sycl::id<3> Res{0, 0, 0};
2844+
for (int I = 0; I < Dims; ++I)
2845+
Res[I] = Id[I];
2846+
return Res;
2847+
}
2848+
}
2849+
2850+
template <typename KernelName, typename KernelType, int Dims>
2851+
void submit_no_handler(nd_range<Dims> Range, const KernelType &KernelFunc) const {
2852+
2853+
using NameT =
2854+
typename detail::get_kernel_name_t<KernelName, KernelType>::name;
2855+
2856+
const char *KernelN = detail::getKernelName<NameT>();
2857+
KernelType Kernel = KernelFunc;
2858+
void *KernelFuncPtr = reinterpret_cast<void *>(&Kernel);
2859+
int KernelNumParams = detail::getKernelNumParams<NameT>();
2860+
detail::kernel_param_desc_t (*KernelParamDescGetter)(int) = &(detail::getKernelParamDesc<NameT>);
2861+
bool IsKernelESIMD = detail::isKernelESIMD<NameT>();
2862+
bool HasSpecialCapt = detail::hasSpecialCaptures<NameT>();
2863+
detail::KernelNameBasedCacheT *KernelNameBasedCachePtr = detail::getKernelNameBasedCache<NameT>();
2864+
2865+
assert(HasSpecialCapt == false);
2866+
assert(IsKernelESIMD == false);
2867+
2868+
submit_no_handler_impl(Range, KernelN, KernelFuncPtr, KernelNumParams, KernelParamDescGetter,
2869+
KernelNameBasedCachePtr);
2870+
}
2871+
2872+
public:
2873+
/// single_task version not using handler
2874+
template <typename KernelName = detail::auto_name, typename KernelType>
2875+
void single_task_no_handler(const KernelType &KernelFunc) const {
2876+
2877+
kernel_single_task<KernelName, KernelType,
2878+
ext::oneapi::experimental::empty_properties_t>(KernelFunc);
2879+
submit_no_handler<KernelName, KernelType, 1>(nd_range<1>{}, KernelFunc);
2880+
}
2881+
2882+
template <typename KernelName = detail::auto_name, int Dims,
2883+
typename KernelType>
2884+
void parallel_for_no_handler(nd_range<Dims> Range, const KernelType &KernelFunc) const {
2885+
2886+
kernel_parallel_for<KernelName, sycl::nd_item<Dims>, KernelType,
2887+
ext::oneapi::experimental::empty_properties_t>(KernelFunc);
2888+
submit_no_handler<KernelName, KernelType, Dims>(Range, KernelFunc);
2889+
}
2890+
2891+
2892+
27872893
/// parallel_for version with a kernel represented as a lambda + range that
27882894
/// specifies global size only.
27892895
///
@@ -3686,6 +3792,13 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
36863792
const detail::code_location &CodeLoc,
36873793
bool IsTopCodeLoc) const;
36883794

3795+
// no_handler
3796+
3797+
template<int Dims>
3798+
void submit_no_handler_impl(nd_range<Dims> Range, const char *KernelName, void *KernelFunc,
3799+
int KernelNumParams, detail::kernel_param_desc_t (*KernelParamDescGetter)(int),
3800+
detail::KernelNameBasedCacheT *KernelNameBasedCachePtr) const;
3801+
36893802
/// Submits a command group function object to the queue, in order to be
36903803
/// scheduled for execution on the device.
36913804
///

sycl/source/detail/queue_impl.cpp

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -421,6 +421,116 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
421421
}
422422
#endif
423423

424+
// no_handler
425+
426+
void queue_impl::extractArgsAndReqsFromLambda(
427+
char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int),
428+
size_t NumKernelParams, std::vector<ArgDesc> &Args) {
429+
size_t IndexShift = 0;
430+
431+
Args.reserve(NumKernelParams);
432+
433+
for (size_t I = 0; I < NumKernelParams; ++I) {
434+
detail::kernel_param_desc_t ParamDesc = ParamDescGetter(I);
435+
void *Ptr = LambdaPtr + ParamDesc.offset;
436+
const detail::kernel_param_kind_t &Kind = ParamDesc.kind;
437+
const int &Size = ParamDesc.info;
438+
439+
Args.emplace_back(Kind, Ptr, Size, I + IndexShift);
440+
}
441+
}
442+
443+
void queue_impl::submit_no_handler(
444+
const std::shared_ptr<queue_impl> &Self,
445+
detail::NDRDescT NDRDesc, const char *KernelName,
446+
void *KernelFunc, int KernelNumParams,
447+
detail::kernel_param_desc_t (*KernelParamDescGetter)(int),
448+
detail::KernelNameBasedCacheT *KernelNameBasedCachePtr) {
449+
450+
std::vector<ur_event_handle_t> RawEvents;
451+
std::vector<detail::ArgDesc> Args;
452+
453+
// TODO exclude graphs
454+
455+
// TODO external event
456+
457+
bool KernelFastPath = true;
458+
459+
{
460+
std::unique_lock<std::mutex> Lock(MMutex);
461+
EventImplPtr &LastEvent = MDefaultGraphDeps.LastEventPtr;
462+
463+
if (isInOrder() && LastEvent && !Scheduler::CheckEventReadiness(MContext, LastEvent)) {
464+
KernelFastPath = false;
465+
ur_event_handle_t LastEventHandle = LastEvent->getHandle();
466+
if (LastEventHandle) {
467+
RawEvents.push_back(LastEventHandle);
468+
}
469+
}
470+
}
471+
472+
if (KernelFastPath) {
473+
enqueueImpKernel(
474+
Self,
475+
NDRDesc, // MNDRDesc
476+
Args,
477+
nullptr, // KernelBundleImpPtr
478+
nullptr, // MKernel
479+
KernelName,
480+
KernelNameBasedCachePtr, // MKernelNameBasedCachePtr
481+
RawEvents,
482+
nullptr, // out event
483+
nullptr, // getMemAllocationFunc
484+
UR_KERNEL_CACHE_CONFIG_DEFAULT, // MKernelCacheConfig
485+
false, // MKernelIsCooperative
486+
false, // MKernelUsesClusterLaunch
487+
0, // MKernelWorkGroupMemorySize
488+
nullptr, // BinImage
489+
KernelFunc, // MKernelFuncPtr
490+
KernelNumParams, // MKernelNumArgs
491+
KernelParamDescGetter, // MKernelParamDescGetter
492+
false); // MKernelHasSpecialCaptures
493+
} else {
494+
std::unique_ptr<detail::CG> CommandGroup;
495+
detail::CG::StorageInitHelper CGData;
496+
std::vector<detail::ArgDesc> Args;
497+
std::vector<std::shared_ptr<detail::stream_impl>> StreamStorage;
498+
std::vector<std::shared_ptr<const void>> AuxiliaryResources;
499+
detail::code_location CodeLoc = {};
500+
501+
extractArgsAndReqsFromLambda((char *)KernelFunc, KernelParamDescGetter,
502+
KernelNumParams, Args);
503+
504+
CommandGroup.reset(new detail::CGExecKernel(
505+
std::move(NDRDesc),
506+
nullptr, // MHostKernel
507+
nullptr, // MKernel
508+
nullptr, // MKernelBundle
509+
std::move(CGData), // CGData
510+
Args, // MArgs
511+
KernelName, // MKernelName
512+
KernelNameBasedCachePtr, // MKernelNameBasedCachePtr
513+
std::move(StreamStorage), // MStreamStorage
514+
std::move(AuxiliaryResources), // MAuxiliaryResources
515+
detail::CGType::Kernel,
516+
UR_KERNEL_CACHE_CONFIG_DEFAULT, // MKernelCacheConfig
517+
false, // MKernelIsCooperative
518+
false, // MKernelUsesClusterLaunch
519+
0, // MKernelWorkGroupMemorySize
520+
CodeLoc)); // MCodeLoc
521+
522+
detail::EventImplPtr EventImpl = detail::Scheduler::getInstance().addCG(
523+
std::move(CommandGroup),
524+
Self, // MQueue
525+
false); // MEventNeeded
526+
527+
if (isInOrder()) {
528+
std::unique_lock<std::mutex> Lock(MMutex);
529+
MDefaultGraphDeps.LastEventPtr = EventImpl;
530+
}
531+
}
532+
}
533+
424534
template <typename HandlerFuncT>
425535
event queue_impl::submitWithHandler(const std::vector<event> &DepEvents,
426536
bool CallerNeedsEvent,

sycl/source/detail/queue_impl.hpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,18 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
378378
/*CallerNeedsEvent=*/false, Loc, IsTopCodeLoc, SubmitInfo);
379379
}
380380

381+
// no_handler
382+
private:
383+
void extractArgsAndReqsFromLambda(
384+
char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int),
385+
size_t NumKernelParams, std::vector<ArgDesc> &Args);
386+
387+
public:
388+
void submit_no_handler(const std::shared_ptr<queue_impl> &Self,
389+
detail::NDRDescT NDRDesc, const char *KernelName, void *KernelFunc, int KernelNumParams,
390+
detail::kernel_param_desc_t (*KernelParamDescGetter)(int),
391+
detail::KernelNameBasedCacheT *KernelNameBasedCachePtr);
392+
381393
/// Performs a blocking wait for the completion of all enqueued tasks in the
382394
/// queue.
383395
///

sycl/source/queue.cpp

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,21 @@ void queue::submit_without_event_impl(
319319
impl->submit_without_event(CGH, SubmitInfo, CodeLoc, IsTopCodeLoc);
320320
}
321321

322+
// no_handler
323+
324+
template<int Dims>
325+
void queue::submit_no_handler_impl(nd_range<Dims> Range, const char *KernelName, void *KernelFunc,
326+
int KernelNumParams, detail::kernel_param_desc_t (*KernelParamDescGetter)(int),
327+
detail::KernelNameBasedCacheT *KernelNameBasedCachePtr) const {
328+
329+
detail::NDRDescT NDRDesc{padRange(Range.get_global_range()),
330+
padRange(Range.get_local_range()),
331+
padId(Range.get_offset()), Dims};
332+
333+
impl->submit_no_handler(impl, NDRDesc, KernelName, KernelFunc, KernelNumParams,
334+
KernelParamDescGetter, KernelNameBasedCachePtr);
335+
}
336+
322337
void queue::wait_proxy(const detail::code_location &CodeLoc) {
323338
impl->wait(CodeLoc);
324339
}
@@ -474,6 +489,18 @@ void queue::ext_oneapi_set_external_event(const event &external_event) {
474489

475490
const property_list &queue::getPropList() const { return impl->getPropList(); }
476491

492+
template void queue::submit_no_handler_impl<1>(nd_range<1> Range, const char *KernelName, void *KernelFunc,
493+
int KernelNumParams, detail::kernel_param_desc_t (*KernelParamDescGetter)(int),
494+
detail::KernelNameBasedCacheT *KernelNameBasedCachePtr) const;
495+
496+
template void queue::submit_no_handler_impl<2>(nd_range<2> Range, const char *KernelName, void *KernelFunc,
497+
int KernelNumParams, detail::kernel_param_desc_t (*KernelParamDescGetter)(int),
498+
detail::KernelNameBasedCacheT *KernelNameBasedCachePtr) const;
499+
500+
template void queue::submit_no_handler_impl<3>(nd_range<3> Range, const char *KernelName, void *KernelFunc,
501+
int KernelNumParams, detail::kernel_param_desc_t (*KernelParamDescGetter)(int),
502+
detail::KernelNameBasedCacheT *KernelNameBasedCachePtr) const;
503+
477504
} // namespace _V1
478505
} // namespace sycl
479506

0 commit comments

Comments
 (0)