Skip to content

[SYCL] Add support for native root group implementation #11043

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 29 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
b6caf20
Add calls to new PI APIs for query and launch
0x12CC Aug 22, 2023
cc308d9
Merge branch 'sycl' into native_root_group
0x12CC Aug 30, 2023
64c4ad3
Update CGExecKernel construction in unit tests
0x12CC Aug 30, 2023
8807a89
Update barrier and kernel launch
0x12CC Sep 1, 2023
4b822ce
Add extension include to sycl header
0x12CC Sep 12, 2023
ba64c38
Merge branch 'native_root_group' of https://github.com/0x12CC/llvm in…
0x12CC Sep 12, 2023
7aefca9
Add `piextEnqueueCooperativeKernelLaunch` to CL plugin
0x12CC Sep 12, 2023
acc527d
Add new APIs to symbol dumps
0x12CC Sep 12, 2023
5260f80
Fix `is_fixed_topology` test case
0x12CC Sep 12, 2023
0811c57
Update barrier implementation
0x12CC Sep 12, 2023
58b8161
Merge branch 'sycl' into native_root_group
0x12CC Sep 12, 2023
8cc65c3
Merge branch 'sycl' into native_root_group
0x12CC Sep 12, 2023
4aa376a
Add new PI APIs to CUDA and HIP plugins
0x12CC Sep 12, 2023
40e7a1f
Update windows symbols dump
0x12CC Sep 12, 2023
7596203
Fix CUDA and HIP PI functions
0x12CC Sep 12, 2023
1637e89
Update `group_scope<root_group>` to `Device`
0x12CC Sep 12, 2023
0e6b172
Update symbol dumps
0x12CC Sep 12, 2023
1c6c9cf
Add TODO for UR changes
0x12CC Sep 13, 2023
47ce8f7
Disable root group test on L0
0x12CC Sep 13, 2023
881263d
Fix assertions in root group test
0x12CC Sep 13, 2023
092227b
Merge branch 'sycl' into native_root_group
0x12CC Sep 18, 2023
161b7b2
Merge branch 'sycl' into native_root_group
0x12CC Sep 20, 2023
317b969
Refactor feature test macro and checks
0x12CC Sep 20, 2023
3ae2361
Add comments for piextEnqueue*KernelLaunch parameters
0x12CC Sep 20, 2023
d08e935
Add TODO for `urEnqueueCooperativeKernelLaunch`
0x12CC Sep 20, 2023
af778bc
Merge branch 'sycl' into native_root_group
0x12CC Sep 20, 2023
33917f8
Remove L0 changes from PI plugin
0x12CC Sep 28, 2023
2cd01b0
Merge branch 'sycl' into native_root_group
0x12CC Sep 28, 2023
da862ac
Update sycl/include/sycl/detail/pi.h
0x12CC Nov 3, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions sycl/include/sycl/detail/cg.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ class CGExecKernel : public CG {
std::vector<std::shared_ptr<detail::stream_impl>> MStreams;
std::vector<std::shared_ptr<const void>> MAuxiliaryResources;
sycl::detail::pi::PiKernelCacheConfig MKernelCacheConfig;
bool MKernelIsCooperative = false;

CGExecKernel(NDRDescT NDRDesc, std::shared_ptr<HostKernelBase> HKernel,
std::shared_ptr<detail::kernel_impl> SyclKernel,
Expand All @@ -182,14 +183,15 @@ class CGExecKernel : public CG {
std::vector<std::shared_ptr<const void>> AuxiliaryResources,
CGTYPE Type,
sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig,
detail::code_location loc = {})
bool KernelIsCooperative, detail::code_location loc = {})
: CG(Type, std::move(CGData), std::move(loc)),
MNDRDesc(std::move(NDRDesc)), MHostKernel(std::move(HKernel)),
MSyclKernel(std::move(SyclKernel)),
MKernelBundle(std::move(KernelBundle)), MArgs(std::move(Args)),
MKernelName(std::move(KernelName)), MStreams(std::move(Streams)),
MAuxiliaryResources(std::move(AuxiliaryResources)),
MKernelCacheConfig(std::move(KernelCacheConfig)) {
MKernelCacheConfig(std::move(KernelCacheConfig)),
MKernelIsCooperative(KernelIsCooperative) {
assert(getType() == Kernel && "Wrong type of exec kernel CG.");
}

Expand Down
2 changes: 2 additions & 0 deletions sycl/include/sycl/detail/pi.def
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ _PI_API(piextKernelSetArgPointer)
_PI_API(piKernelSetExecInfo)
_PI_API(piextKernelCreateWithNativeHandle)
_PI_API(piextKernelGetNativeHandle)
_PI_API(piextKernelSuggestMaxCooperativeGroupCount)
// Event
_PI_API(piEventCreate)
_PI_API(piEventGetInfo)
Expand All @@ -105,6 +106,7 @@ _PI_API(piSamplerRetain)
_PI_API(piSamplerRelease)
// Queue commands
_PI_API(piEnqueueKernelLaunch)
_PI_API(piextEnqueueCooperativeKernelLaunch)
_PI_API(piEnqueueEventsWait)
_PI_API(piEnqueueEventsWaitWithBarrier)
_PI_API(piEnqueueMemBufferRead)
Expand Down
14 changes: 14 additions & 0 deletions sycl/include/sycl/detail/pi.h
Original file line number Diff line number Diff line change
Expand Up @@ -1632,6 +1632,14 @@ __SYCL_EXPORT pi_result piextKernelCreateWithNativeHandle(
__SYCL_EXPORT pi_result
piextKernelGetNativeHandle(pi_kernel kernel, pi_native_handle *nativeHandle);

/// Gets the max work group count for a cooperative kernel.
///
/// \param kernel is the PI kernel being queried.
/// \param group_count_ret is a pointer to where the query result will be
/// stored.
__SYCL_EXPORT pi_result piextKernelSuggestMaxCooperativeGroupCount(
pi_kernel kernel, pi_uint32 *group_count_ret);

//
// Events
//
Expand Down Expand Up @@ -1714,6 +1722,12 @@ __SYCL_EXPORT pi_result piEnqueueKernelLaunch(
const size_t *local_work_size, pi_uint32 num_events_in_wait_list,
const pi_event *event_wait_list, pi_event *event);

__SYCL_EXPORT pi_result piextEnqueueCooperativeKernelLaunch(
pi_queue queue, pi_kernel kernel, pi_uint32 work_dim,
const size_t *global_work_offset, const size_t *global_work_size,
const size_t *local_work_size, pi_uint32 num_events_in_wait_list,
const pi_event *event_wait_list, pi_event *event);

__SYCL_EXPORT pi_result piEnqueueEventsWait(pi_queue command_queue,
pi_uint32 num_events_in_wait_list,
const pi_event *event_wait_list,
Expand Down
6 changes: 6 additions & 0 deletions sycl/include/sycl/detail/spirv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ struct sub_group;
namespace experimental {
template <typename ParentGroup> class ballot_group;
template <size_t PartitionSize, typename ParentGroup> class fixed_size_group;
template <int Dimensions> class root_group;
template <typename ParentGroup> class tangle_group;
class opportunistic_group;
} // namespace experimental
Expand Down Expand Up @@ -51,6 +52,11 @@ namespace spirv {

template <typename Group> struct group_scope {};

template <int Dimensions>
struct group_scope<sycl::ext::oneapi::experimental::root_group<Dimensions>> {
static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Device;
};

template <int Dimensions> struct group_scope<group<Dimensions>> {
static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Workgroup;
};
Expand Down
6 changes: 3 additions & 3 deletions sycl/include/sycl/detail/type_traits.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,9 @@ template <class T>
inline constexpr bool is_fixed_topology_group_v =
is_fixed_topology_group<T>::value;

#ifdef SYCL_EXT_ONEAPI_ROOT_GROUP
template <> struct is_fixed_topology_group<root_group> : std::true_type {};
#endif
template <int Dimensions> class root_group;
template <int Dimensions>
struct is_fixed_topology_group<root_group<Dimensions>> : std::true_type {};

template <int Dimensions>
struct is_fixed_topology_group<sycl::group<Dimensions>> : std::true_type {};
Expand Down
33 changes: 10 additions & 23 deletions sycl/include/sycl/ext/oneapi/experimental/root_group.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@

#include <sycl/builtins.hpp>
#include <sycl/ext/oneapi/properties/properties.hpp>
#include <sycl/group.hpp>
#include <sycl/memory_enums.hpp>
#include <sycl/queue.hpp>

#define SYCL_EXT_ONEAPI_ROOT_GROUP 1
#include <sycl/nd_item.hpp>
#include <sycl/sub_group.hpp>

namespace sycl {
inline namespace _V1 {
Expand Down Expand Up @@ -106,31 +106,18 @@ template <int Dimensions> root_group<Dimensions> get_root_group() {

} // namespace ext::oneapi::experimental

template <>
typename ext::oneapi::experimental::info::kernel_queue_specific::
max_num_work_group_sync::return_type
kernel::ext_oneapi_get_info<
ext::oneapi::experimental::info::kernel_queue_specific::
max_num_work_group_sync>(const queue &q) const {
// TODO: query the backend to return a value >= 1.
return 1;
}

template <int dimensions>
void group_barrier(ext::oneapi::experimental::root_group<dimensions> G,
memory_scope FenceScope = decltype(G)::fence_scope) {
(void)G;
(void)FenceScope;
#ifdef __SYCL_DEVICE_ONLY__
// TODO: Change __spv::Scope::Workgroup to __spv::Scope::Device once backends
// support device scope. __spv::Scope::Workgroup is only valid when
// max_num_work_group_sync is 1, so that all work items in a root group will
// also be in the same work group.
__spirv_ControlBarrier(__spv::Scope::Workgroup, __spv::Scope::Workgroup,
__spv::MemorySemanticsMask::SubgroupMemory |
__spv::MemorySemanticsMask::WorkgroupMemory |
__spv::MemorySemanticsMask::CrossWorkgroupMemory);
// Root group barrier first synchronizes using a work group barrier. This
// allows backends to ignore the second ControlBarrier (with Device scope) if
// their maximum number of work groups is 1.
group_barrier(get_child_group(G));
detail::spirv::ControlBarrier(G, FenceScope, memory_order::seq_cst);
#else
(void)G;
(void)FenceScope;
throw sycl::runtime_error("Barriers are not supported on host device",
PI_ERROR_INVALID_DEVICE);
#endif
Expand Down
7 changes: 7 additions & 0 deletions sycl/include/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include <sycl/ext/oneapi/device_global/device_global.hpp>
#include <sycl/ext/oneapi/device_global/properties.hpp>
#include <sycl/ext/oneapi/experimental/graph.hpp>
#include <sycl/ext/oneapi/experimental/root_group.hpp>
#include <sycl/ext/oneapi/kernel_properties/properties.hpp>
#include <sycl/ext/oneapi/properties/properties.hpp>
#include <sycl/group.hpp>
Expand Down Expand Up @@ -921,6 +922,10 @@ class __SYCL_EXPORT handler {
} else {
std::ignore = Props;
}

constexpr bool UsesRootSync = PropertiesT::template has_property<
sycl::ext::oneapi::experimental::use_root_sync_key>();
setKernelIsCooperative(UsesRootSync);
}

/// Checks whether it is possible to copy the source shape to the destination
Expand Down Expand Up @@ -3604,6 +3609,8 @@ class __SYCL_EXPORT handler {

// Set value of the gpu cache configuration for the kernel.
void setKernelCacheConfig(sycl::detail::pi::PiKernelCacheConfig);
// Set value of the kernel is cooperative flag
void setKernelIsCooperative(bool);

template <
ext::oneapi::experimental::detail::UnsupportedGraphFeatures FeatureT>
Expand Down
1 change: 1 addition & 0 deletions sycl/include/sycl/sycl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@
#include <sycl/ext/oneapi/experimental/cuda/barrier.hpp>
#include <sycl/ext/oneapi/experimental/fixed_size_group.hpp>
#include <sycl/ext/oneapi/experimental/opportunistic_group.hpp>
#include <sycl/ext/oneapi/experimental/root_group.hpp>
#include <sycl/ext/oneapi/experimental/tangle_group.hpp>
#include <sycl/ext/oneapi/filter_selector.hpp>
#include <sycl/ext/oneapi/functional.hpp>
Expand Down
17 changes: 17 additions & 0 deletions sycl/plugins/cuda/pi_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -538,6 +538,16 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
NumEventsInWaitList, EventWaitList, OutEvent);
}

pi_result piextEnqueueCooperativeKernelLaunch(
pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
const size_t *LocalWorkSize, pi_uint32 NumEventsInWaitList,
const pi_event *EventWaitList, pi_event *OutEvent) {
return pi2ur::piEnqueueKernelLaunch(
Queue, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, LocalWorkSize,
NumEventsInWaitList, EventWaitList, OutEvent);
}

pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle,
pi_context Context,
pi_program Program,
Expand All @@ -553,6 +563,13 @@ pi_result piextKernelGetNativeHandle(pi_kernel Kernel,
return pi2ur::piextKernelGetNativeHandle(Kernel, NativeHandle);
}

pi_result piextKernelSuggestMaxCooperativeGroupCount(pi_kernel Kernel,
pi_uint32 *GroupCountRet) {
(void)Kernel;
*GroupCountRet = 1;
return PI_SUCCESS;
}

pi_result piEventCreate(pi_context Context, pi_event *RetEvent) {
return pi2ur::piEventCreate(Context, RetEvent);
}
Expand Down
17 changes: 17 additions & 0 deletions sycl/plugins/hip/pi_hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -546,6 +546,16 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
NumEventsInWaitList, EventWaitList, OutEvent);
}

pi_result piextEnqueueCooperativeKernelLaunch(
pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
const size_t *LocalWorkSize, pi_uint32 NumEventsInWaitList,
const pi_event *EventWaitList, pi_event *OutEvent) {
return pi2ur::piEnqueueKernelLaunch(
Queue, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, LocalWorkSize,
NumEventsInWaitList, EventWaitList, OutEvent);
}

pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle,
pi_context Context,
pi_program Program,
Expand All @@ -561,6 +571,13 @@ pi_result piextKernelGetNativeHandle(pi_kernel Kernel,
return pi2ur::piextKernelGetNativeHandle(Kernel, NativeHandle);
}

pi_result piextKernelSuggestMaxCooperativeGroupCount(pi_kernel Kernel,
pi_uint32 *GroupCountRet) {
(void)Kernel;
*GroupCountRet = 1;
return PI_SUCCESS;
}

pi_result piEventCreate(pi_context Context, pi_event *RetEvent) {
return pi2ur::piEventCreate(Context, RetEvent);
}
Expand Down
17 changes: 17 additions & 0 deletions sycl/plugins/level_zero/pi_level_zero.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -558,6 +558,16 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
NumEventsInWaitList, EventWaitList, OutEvent);
}

pi_result piextEnqueueCooperativeKernelLaunch(
pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
const size_t *LocalWorkSize, pi_uint32 NumEventsInWaitList,
const pi_event *EventWaitList, pi_event *OutEvent) {
return pi2ur::piEnqueueKernelLaunch(
Queue, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, LocalWorkSize,
NumEventsInWaitList, EventWaitList, OutEvent);
}

pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle,
pi_context Context,
pi_program Program,
Expand All @@ -573,6 +583,13 @@ pi_result piextKernelGetNativeHandle(pi_kernel Kernel,
return pi2ur::piextKernelGetNativeHandle(Kernel, NativeHandle);
}

pi_result piextKernelSuggestMaxCooperativeGroupCount(pi_kernel Kernel,
pi_uint32 *GroupCountRet) {
(void)Kernel;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this needs to be also a redirection to pi2ur, even if all that we are doing is returning 1.

*GroupCountRet = 1;
return PI_SUCCESS;
}

//
// Events
//
Expand Down
11 changes: 11 additions & 0 deletions sycl/plugins/opencl/pi_opencl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2599,6 +2599,14 @@ pi_result piEventGetInfo(pi_event event, pi_event_info param_name,
return static_cast<pi_result>(result);
}

pi_result
piextKernelSuggestMaxCooperativeGroupCount(pi_kernel kernel,
pi_uint32 *group_count_ret) {
(void)kernel;
*group_count_ret = 1;
return PI_SUCCESS;
}

const char SupportedVersion[] = _PI_OPENCL_PLUGIN_VERSION_STRING;

pi_result piPluginInit(pi_plugin *PluginInit) {
Expand Down Expand Up @@ -2683,6 +2691,8 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
_PI_CL(piextKernelSetArgPointer, piextKernelSetArgPointer)
_PI_CL(piextKernelCreateWithNativeHandle, piextKernelCreateWithNativeHandle)
_PI_CL(piextKernelGetNativeHandle, piextKernelGetNativeHandle)
_PI_CL(piextKernelSuggestMaxCooperativeGroupCount,
piextKernelSuggestMaxCooperativeGroupCount)
// Event
_PI_CL(piEventCreate, piEventCreate)
_PI_CL(piEventGetInfo, piEventGetInfo)
Expand All @@ -2701,6 +2711,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
_PI_CL(piSamplerRelease, clReleaseSampler)
// Queue commands
_PI_CL(piEnqueueKernelLaunch, clEnqueueNDRangeKernel)
_PI_CL(piextEnqueueCooperativeKernelLaunch, clEnqueueNDRangeKernel)
_PI_CL(piEnqueueEventsWait, clEnqueueMarkerWithWaitList)
_PI_CL(piEnqueueEventsWaitWithBarrier, clEnqueueBarrierWithWaitList)
_PI_CL(piEnqueueMemBufferRead, clEnqueueReadBuffer)
Expand Down
2 changes: 1 addition & 1 deletion sycl/source/detail/graph_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -651,7 +651,7 @@ exec_graph_impl::enqueue(const std::shared_ptr<sycl::detail::queue_impl> &Queue,
// TODO: Pass accessor mem allocations
nullptr,
// TODO: Extract from handler
PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT);
PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT, CG->MKernelIsCooperative);
if (Res != pi_result::PI_SUCCESS) {
throw sycl::exception(
sycl::make_error_code(sycl::errc::kernel),
Expand Down
2 changes: 2 additions & 0 deletions sycl/source/detail/handler_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ class handler_impl {
sycl::detail::pi::PiKernelCacheConfig MKernelCacheConfig =
PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT;

bool MKernelIsCooperative = false;

// Extra information for bindless image copy
sycl::detail::pi::PiMemImageDesc MImageDesc;
sycl::detail::pi::PiMemImageFormat MImageFormat;
Expand Down
6 changes: 5 additions & 1 deletion sycl/source/detail/jit_compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,10 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
auto &CG = KernelCmd->getCG();
assert(CG.getType() == CG::Kernel);
auto *KernelCG = static_cast<CGExecKernel *>(&CG);
if (KernelCG->MKernelIsCooperative) {
printPerformanceWarning("Cannot fuse cooperative kernel");
return nullptr;
}

auto KernelName = KernelCG->MKernelName;
if (KernelName.empty()) {
Expand Down Expand Up @@ -878,7 +882,7 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
FusedCG.reset(new detail::CGExecKernel(
NDRDesc, nullptr, nullptr, std::move(KernelBundleImplPtr),
std::move(CGData), std::move(FusedArgs), FusedKernelInfo.Name, {}, {},
CG::CGTYPE::Kernel, KernelCacheConfig));
CG::CGTYPE::Kernel, KernelCacheConfig, false));
return FusedCG;
}

Expand Down
18 changes: 18 additions & 0 deletions sycl/source/detail/kernel_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <sycl/detail/pi.h>
#include <sycl/detail/pi.hpp>
#include <sycl/device.hpp>
#include <sycl/ext/oneapi/experimental/root_group.hpp>
#include <sycl/info/info_desc.hpp>

#include <cassert>
Expand Down Expand Up @@ -141,6 +142,9 @@ class kernel_impl {
typename Param::return_type get_info(const device &Device,
const range<3> &WGSize) const;

template <typename Param>
typename Param::return_type ext_oneapi_get_info(const queue &q) const;

/// Get a reference to a raw kernel object.
///
/// \return a reference to a valid PiKernel instance with raw kernel object.
Expand Down Expand Up @@ -255,6 +259,20 @@ kernel_impl::get_info(const device &Device,
getPlugin());
}

template <>
inline typename ext::oneapi::experimental::info::kernel_queue_specific::
max_num_work_group_sync::return_type
kernel_impl::ext_oneapi_get_info<
ext::oneapi::experimental::info::kernel_queue_specific::
max_num_work_group_sync>(const queue &Queue) const {
const auto &Plugin = getPlugin();
const auto &Handle = getHandleRef();
pi_uint32 GroupCount = 0;
Plugin->call<PiApiKind::piextKernelSuggestMaxCooperativeGroupCount>(
Handle, &GroupCount);
return GroupCount;
}

} // namespace detail
} // namespace _V1
} // namespace sycl
Loading