Skip to content

[Offload] Add olLaunchKernelSuggestedGroupSize #142130

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions offload/liboffload/API/Kernel.td
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,38 @@ def : Function {
Return<"OL_ERRC_INVALID_DEVICE", ["If Queue is non-null but does not belong to Device"]>,
];
}


// Launch-size description for olLaunchKernelSuggestedGroupSize.
// The members give a dimension count, a total work item count for each of the
// X/Y/Z axes, and the dynamic shared memory size in bytes. There are no
// members for a work group size or a group count.
def : Struct {
let name = "ol_kernel_launch_size_suggested_args_t";
let desc = "Size-related arguments for a kernel launch.";
let members = [
StructMember<"size_t", "Dimensions", "Number of work dimensions">,
StructMember<"size_t", "NumItemsX", "Number of work items on the X dimension">,
StructMember<"size_t", "NumItemsY", "Number of work items on the Y dimension">,
StructMember<"size_t", "NumItemsZ", "Number of work items on the Z dimension">,
StructMember<"size_t", "DynSharedMemory", "Size of dynamic shared memory in bytes.">
];
}

// API definition for a kernel launch where the caller passes work item counts
// (via ol_kernel_launch_size_suggested_args_t) and the implementation picks the
// work group size.
// Queue, ArgumentsData and EventOut are declared optional; Device, Kernel,
// ArgumentsSize and LaunchSizeArgs are plain PARAM_IN.
// The returns list below holds only the function-specific error conditions.
// NOTE(review): null-handle / null-pointer checks for the non-optional
// parameters are presumably added by the generator from the parameter kinds --
// confirm against the generated entry points.
def : Function {
let name = "olLaunchKernelSuggestedGroupSize";
let desc = "Enqueue a kernel launch with the specified work items and parameters.";
let details = [
"Behaves the same as olLaunchKernel, but the implementation automatically determines optimal work group sizes"
];
let params = [
Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN_OPTIONAL>,
Param<"ol_device_handle_t", "Device", "handle of the device to execute on", PARAM_IN>,
Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
Param<"const void*", "ArgumentsData", "pointer to the kernel argument struct", PARAM_IN_OPTIONAL>,
Param<"size_t", "ArgumentsSize", "size of the kernel argument struct", PARAM_IN>,
Param<"const ol_kernel_launch_size_suggested_args_t*", "LaunchSizeArgs", "pointer to the struct containing launch size parameters", PARAM_IN>,
Param<"ol_event_handle_t*", "EventOut", "optional recorded event for the enqueued operation", PARAM_OUT_OPTIONAL>
];
let returns = [
Return<"OL_ERRC_INVALID_ARGUMENT", ["`Queue == NULL && EventOut != NULL`"]>,
Return<"OL_ERRC_INVALID_ARGUMENT", ["`ArgumentsSize > 0 && ArgumentsData == NULL`"]>,
Return<"OL_ERRC_INVALID_DEVICE", ["If Queue is non-null but does not belong to Device"]>,
];
}
71 changes: 71 additions & 0 deletions offload/liboffload/include/generated/OffloadAPI.h
Original file line number Diff line number Diff line change
Expand Up @@ -723,6 +723,54 @@ OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernel(
// [out][optional] optional recorded event for the enqueued operation
ol_event_handle_t *EventOut);

///////////////////////////////////////////////////////////////////////////////
/// @brief Size-related arguments for a kernel launch.
/// @details Passed to olLaunchKernelSuggestedGroupSize. Describes the launch
/// purely in terms of work items per dimension; the struct has no members for
/// a work group size or a number of groups.
typedef struct ol_kernel_launch_size_suggested_args_t {
size_t Dimensions; /// Number of work dimensions
size_t NumItemsX; /// Number of work items on the X dimension
size_t NumItemsY; /// Number of work items on the Y dimension
size_t NumItemsZ; /// Number of work items on the Z dimension
size_t DynSharedMemory; /// Size of dynamic shared memory in bytes.
} ol_kernel_launch_size_suggested_args_t;

///////////////////////////////////////////////////////////////////////////////
/// @brief Enqueue a kernel launch with the specified work items and parameters.
///
/// @details
/// - Behaves the same as olLaunchKernel, but the implementation
/// automatically determines optimal work group sizes
/// - The launch extent is given as work item counts per dimension through
/// ::ol_kernel_launch_size_suggested_args_t
///
/// @returns
/// - ::OL_RESULT_SUCCESS
/// - ::OL_ERRC_UNINITIALIZED
/// - ::OL_ERRC_DEVICE_LOST
/// - ::OL_ERRC_INVALID_ARGUMENT
/// + `Queue == NULL && EventOut != NULL`
/// - ::OL_ERRC_INVALID_ARGUMENT
/// + `ArgumentsSize > 0 && ArgumentsData == NULL`
/// - ::OL_ERRC_INVALID_DEVICE
/// + If Queue is non-null but does not belong to Device
/// - ::OL_ERRC_INVALID_NULL_HANDLE
/// + `NULL == Device`
/// + `NULL == Kernel`
/// - ::OL_ERRC_INVALID_NULL_POINTER
/// + `NULL == LaunchSizeArgs`
OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernelSuggestedGroupSize(
// [in][optional] handle of the queue
ol_queue_handle_t Queue,
// [in] handle of the device to execute on
ol_device_handle_t Device,
// [in] handle of the kernel
ol_kernel_handle_t Kernel,
// [in][optional] pointer to the kernel argument struct
const void *ArgumentsData,
// [in] size of the kernel argument struct
size_t ArgumentsSize,
// [in] pointer to the struct containing launch size parameters
const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
// [out][optional] optional recorded event for the enqueued operation
ol_event_handle_t *EventOut);

///////////////////////////////////////////////////////////////////////////////
/// @brief Function parameters for olGetPlatformInfo
/// @details Each entry is a pointer to the parameter passed to the function;
Expand Down Expand Up @@ -874,6 +922,19 @@ typedef struct ol_launch_kernel_params_t {
ol_event_handle_t **pEventOut;
} ol_launch_kernel_params_t;

///////////////////////////////////////////////////////////////////////////////
/// @brief Function parameters for olLaunchKernelSuggestedGroupSize
/// @details Each entry is a pointer to the parameter passed to the function;
/// the members appear in the same order as the function's parameters, each
/// named with a 'p' prefix.
typedef struct ol_launch_kernel_suggested_group_size_params_t {
ol_queue_handle_t *pQueue;
ol_device_handle_t *pDevice;
ol_kernel_handle_t *pKernel;
const void **pArgumentsData;
size_t *pArgumentsSize;
const ol_kernel_launch_size_suggested_args_t **pLaunchSizeArgs;
ol_event_handle_t **pEventOut;
} ol_launch_kernel_suggested_group_size_params_t;

///////////////////////////////////////////////////////////////////////////////
/// @brief Variant of olInit that also sets source code location information
/// @details See also ::olInit
Expand Down Expand Up @@ -1016,6 +1077,16 @@ OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernelWithCodeLoc(
const ol_kernel_launch_size_args_t *LaunchSizeArgs,
ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation);

///////////////////////////////////////////////////////////////////////////////
/// @brief Variant of olLaunchKernelSuggestedGroupSize that also sets source
/// code location information
/// @details See also ::olLaunchKernelSuggestedGroupSize
/// All parameters except CodeLocation have the same types and order as in
/// ::olLaunchKernelSuggestedGroupSize; CodeLocation is the additional source
/// location argument.
OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernelSuggestedGroupSizeWithCodeLoc(
ol_queue_handle_t Queue, ol_device_handle_t Device,
ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation);

#if defined(__cplusplus)
} // extern "C"
#endif
79 changes: 79 additions & 0 deletions offload/liboffload/include/generated/OffloadEntryPoints.inc
Original file line number Diff line number Diff line change
Expand Up @@ -901,3 +901,82 @@ ol_result_t olLaunchKernelWithCodeLoc(
currentCodeLocation() = nullptr;
return Result;
}

///////////////////////////////////////////////////////////////////////////////
// Validation wrapper for olLaunchKernelSuggestedGroupSize.
//
// When validation is enabled in the offload config, the arguments are checked
// in a fixed order and the first violation is reported as an offload error.
// Otherwise (or once all checks pass) every argument is forwarded unchanged to
// the _impl function and its Error is returned.
llvm::Error olLaunchKernelSuggestedGroupSize_val(
    ol_queue_handle_t Queue, ol_device_handle_t Device,
    ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
    const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
    ol_event_handle_t *EventOut) {
  if (offloadConfig().ValidationEnabled) {
    // An output event can only be requested together with a queue.
    if (!Queue && EventOut)
      return createOffloadError(
          error::ErrorCode::INVALID_ARGUMENT,
          "validation failure: Queue == NULL && EventOut != NULL");

    // A non-empty argument struct needs a buffer to read it from.
    if (ArgumentsSize > 0 && !ArgumentsData)
      return createOffloadError(
          error::ErrorCode::INVALID_ARGUMENT,
          "validation failure: ArgumentsSize > 0 && ArgumentsData == NULL");

    // Mandatory handles.
    if (!Device)
      return createOffloadError(error::ErrorCode::INVALID_NULL_HANDLE,
                                "validation failure: NULL == Device");
    if (!Kernel)
      return createOffloadError(error::ErrorCode::INVALID_NULL_HANDLE,
                                "validation failure: NULL == Kernel");

    // Mandatory pointer.
    if (!LaunchSizeArgs)
      return createOffloadError(error::ErrorCode::INVALID_NULL_POINTER,
                                "validation failure: NULL == LaunchSizeArgs");
  }

  return llvm::offload::olLaunchKernelSuggestedGroupSize_impl(
      Queue, Device, Kernel, ArgumentsData, ArgumentsSize, LaunchSizeArgs,
      EventOut);
}
// Public entry point for olLaunchKernelSuggestedGroupSize.
//
// Runs the validation wrapper (which forwards to the implementation) and
// converts the resulting llvm::Error into an ol_result_t. When tracing is
// enabled, the call name is written to llvm::errs() before the call, and the
// arguments, result and any error details after it.
OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernelSuggestedGroupSize(
    ol_queue_handle_t Queue, ol_device_handle_t Device,
    ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
    const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
    ol_event_handle_t *EventOut) {
  if (offloadConfig().TracingEnabled)
    llvm::errs() << "---> olLaunchKernelSuggestedGroupSize";

  ol_result_t Result =
      llvmErrorToOffloadError(olLaunchKernelSuggestedGroupSize_val(
          Queue, Device, Kernel, ArgumentsData, ArgumentsSize, LaunchSizeArgs,
          EventOut));

  // The tracing flag is read again after the call, as it was before it.
  if (!offloadConfig().TracingEnabled)
    return Result;

  // Bundle pointers to the arguments so the params printer can log them.
  ol_launch_kernel_suggested_group_size_params_t Params = {
      &Queue,         &Device,         &Kernel,  &ArgumentsData,
      &ArgumentsSize, &LaunchSizeArgs, &EventOut};
  llvm::errs() << "(" << &Params << ")"
               << "-> " << Result << "\n";
  if (Result) {
    if (Result->Details)
      llvm::errs() << "     *Error Details* " << Result->Details << " \n";
  }
  return Result;
}
// Code-location variant of olLaunchKernelSuggestedGroupSize: stores
// CodeLocation as the current code location, performs the regular call, then
// resets the current code location to nullptr before returning the result.
ol_result_t olLaunchKernelSuggestedGroupSizeWithCodeLoc(
    ol_queue_handle_t Queue, ol_device_handle_t Device,
    ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
    const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
    ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation) {
  // Make the caller's location visible while the call runs.
  currentCodeLocation() = CodeLocation;

  ol_result_t Status = ::olLaunchKernelSuggestedGroupSize(
      Queue, Device, Kernel, ArgumentsData, ArgumentsSize, LaunchSizeArgs,
      EventOut);

  // Clear it again so it does not carry over to a later call.
  currentCodeLocation() = nullptr;
  return Status;
}
2 changes: 2 additions & 0 deletions offload/liboffload/include/generated/OffloadFuncs.inc
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ OFFLOAD_FUNC(olCreateProgram)
OFFLOAD_FUNC(olDestroyProgram)
OFFLOAD_FUNC(olGetKernel)
OFFLOAD_FUNC(olLaunchKernel)
OFFLOAD_FUNC(olLaunchKernelSuggestedGroupSize)
OFFLOAD_FUNC(olInitWithCodeLoc)
OFFLOAD_FUNC(olShutDownWithCodeLoc)
OFFLOAD_FUNC(olGetPlatformInfoWithCodeLoc)
Expand All @@ -48,5 +49,6 @@ OFFLOAD_FUNC(olCreateProgramWithCodeLoc)
OFFLOAD_FUNC(olDestroyProgramWithCodeLoc)
OFFLOAD_FUNC(olGetKernelWithCodeLoc)
OFFLOAD_FUNC(olLaunchKernelWithCodeLoc)
OFFLOAD_FUNC(olLaunchKernelSuggestedGroupSizeWithCodeLoc)

#undef OFFLOAD_FUNC
6 changes: 6 additions & 0 deletions offload/liboffload/include/generated/OffloadImplFuncDecls.inc
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,9 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
size_t ArgumentsSize,
const ol_kernel_launch_size_args_t *LaunchSizeArgs,
ol_event_handle_t *EventOut);

// Implementation hook for olLaunchKernelSuggestedGroupSize. Takes the same
// parameter list as the public entry point and reports failure through the
// returned Error.
Error olLaunchKernelSuggestedGroupSize_impl(
ol_queue_handle_t Queue, ol_device_handle_t Device,
ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
ol_event_handle_t *EventOut);
51 changes: 51 additions & 0 deletions offload/liboffload/include/generated/OffloadPrint.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,31 @@ operator<<(llvm::raw_ostream &os,
os << "}";
return os;
}
///////////////////////////////////////////////////////////////////////////////
/// @brief Print operator for the ol_kernel_launch_size_suggested_args_t type
/// @details Writes the struct as a C-style designated initializer, listing
/// every member in declaration order.
/// @returns llvm::raw_ostream &

inline llvm::raw_ostream &
operator<<(llvm::raw_ostream &os,
           const struct ol_kernel_launch_size_suggested_args_t params) {
  os << "(struct ol_kernel_launch_size_suggested_args_t){"
     << ".Dimensions = " << params.Dimensions << ", "
     << ".NumItemsX = " << params.NumItemsX << ", "
     << ".NumItemsY = " << params.NumItemsY << ", "
     << ".NumItemsZ = " << params.NumItemsZ << ", "
     << ".DynSharedMemory = " << params.DynSharedMemory << "}";
  return os;
}

inline llvm::raw_ostream &
operator<<(llvm::raw_ostream &os,
Expand Down Expand Up @@ -619,6 +644,32 @@ operator<<(llvm::raw_ostream &os,
return os;
}

// Print operator for the olLaunchKernelSuggestedGroupSize parameter bundle.
// Each member of the bundle is dereferenced once to obtain the argument value;
// handle and pointer arguments go through printPtr, ArgumentsSize is streamed
// directly. Fields are separated by ", ".
inline llvm::raw_ostream &operator<<(
    llvm::raw_ostream &os,
    const struct ol_launch_kernel_suggested_group_size_params_t *params) {
  os << ".Queue = ";
  printPtr(os, *params->pQueue);

  os << ", " << ".Device = ";
  printPtr(os, *params->pDevice);

  os << ", " << ".Kernel = ";
  printPtr(os, *params->pKernel);

  os << ", " << ".ArgumentsData = ";
  printPtr(os, *params->pArgumentsData);

  os << ", " << ".ArgumentsSize = " << *params->pArgumentsSize;

  os << ", " << ".LaunchSizeArgs = ";
  printPtr(os, *params->pLaunchSizeArgs);

  os << ", " << ".EventOut = ";
  printPtr(os, *params->pEventOut);
  return os;
}

///////////////////////////////////////////////////////////////////////////////
// @brief Print pointer value
template <typename T>
Expand Down
63 changes: 49 additions & 14 deletions offload/liboffload/src/OffloadImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -484,11 +484,10 @@ Error olGetKernel_impl(ol_program_handle_t Program, const char *KernelName,
return Error::success();
}

Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
ol_kernel_handle_t Kernel, const void *ArgumentsData,
size_t ArgumentsSize,
const ol_kernel_launch_size_args_t *LaunchSizeArgs,
ol_event_handle_t *EventOut) {
namespace {
Error do_launch(ol_queue_handle_t Queue, ol_device_handle_t Device,
ol_kernel_handle_t Kernel, KernelArgsTy &Args,
ol_event_handle_t *EventOut) {
auto *DeviceImpl = Device->Device;
if (Queue && Device != Queue->Device) {
return createOffloadError(
Expand All @@ -498,6 +497,26 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,

auto *QueueImpl = Queue ? Queue->AsyncInfo : nullptr;
AsyncInfoWrapperTy AsyncInfoWrapper(*DeviceImpl, QueueImpl);
auto *KernelImpl = reinterpret_cast<GenericKernelTy *>(Kernel);
auto Err = KernelImpl->launch(*DeviceImpl, Args.ArgPtrs, nullptr, Args,
AsyncInfoWrapper);

AsyncInfoWrapper.finalize(Err);
if (Err)
return Err;

if (EventOut)
*EventOut = makeEvent(Queue);

return Error::success();
}
} // namespace

Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
ol_kernel_handle_t Kernel, const void *ArgumentsData,
size_t ArgumentsSize,
const ol_kernel_launch_size_args_t *LaunchSizeArgs,
ol_event_handle_t *EventOut) {
KernelArgsTy LaunchArgs{};
LaunchArgs.NumTeams[0] = LaunchSizeArgs->NumGroupsX;
LaunchArgs.NumTeams[1] = LaunchSizeArgs->NumGroupsY;
Expand All @@ -514,18 +533,34 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
// Don't do anything with pointer indirection; use arg data as-is
LaunchArgs.Flags.IsCUDA = true;

auto *KernelImpl = reinterpret_cast<GenericKernelTy *>(Kernel);
auto Err = KernelImpl->launch(*DeviceImpl, LaunchArgs.ArgPtrs, nullptr,
LaunchArgs, AsyncInfoWrapper);
return do_launch(Queue, Device, Kernel, LaunchArgs, EventOut);
}

AsyncInfoWrapper.finalize(Err);
if (Err)
return Err;
/// Launch \p Kernel on \p Device with an implementation-chosen work group size.
///
/// The caller supplies the number of work items per axis in \p LaunchSizeArgs;
/// this function picks a group size per axis, derives the number of groups
/// from it, packs the argument buffer and hands everything to do_launch.
///
/// \param Queue          Optional queue; forwarded to do_launch.
/// \param Device         Device to launch on; forwarded to do_launch.
/// \param Kernel         Kernel to launch; forwarded to do_launch.
/// \param ArgumentsData  Kernel argument buffer, passed through as-is.
/// \param ArgumentsSize  Size of \p ArgumentsData in bytes.
/// \param LaunchSizeArgs Work item counts and dynamic shared memory size.
///                       Dereferenced unconditionally, so it must not be null.
/// \param EventOut       Optional output event; forwarded to do_launch.
/// \returns The Error produced by do_launch.
Error olLaunchKernelSuggestedGroupSize_impl(
    ol_queue_handle_t Queue, ol_device_handle_t Device,
    ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
    const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
    ol_event_handle_t *EventOut) {
  // TODO: Use backend specific magic to determine the best work group size
  const size_t PreferredSize[3] = {1, 1, 1};

  // NOTE(review): LaunchSizeArgs->Dimensions is not consulted; all three axes
  // are always used. Confirm whether axes beyond Dimensions should default to
  // a single work item.
  const size_t NumItems[3] = {LaunchSizeArgs->NumItemsX,
                              LaunchSizeArgs->NumItemsY,
                              LaunchSizeArgs->NumItemsZ};

  KernelArgsTy LaunchArgs{};
  for (size_t Axis = 0; Axis < 3; ++Axis) {
    // Round the group count up. A truncating division would silently drop the
    // trailing work items (or launch zero groups) as soon as the preferred
    // size stops dividing the item count. With the current size of 1 this is
    // identical to a plain division.
    // NOTE(review): once a non-unit size is chosen, the last group may contain
    // work items past NumItems -- confirm how kernels are expected to guard
    // against that.
    const size_t FullGroups = NumItems[Axis] / PreferredSize[Axis];
    const size_t PartialGroup =
        (NumItems[Axis] % PreferredSize[Axis] != 0) ? 1 : 0;
    LaunchArgs.NumTeams[Axis] = FullGroups + PartialGroup;
    LaunchArgs.ThreadLimit[Axis] = PreferredSize[Axis];
  }
  LaunchArgs.DynCGroupMem = LaunchSizeArgs->DynSharedMemory;

  // The argument buffer is handed over as a (pointer, size) pair. Params must
  // stay alive until do_launch returns because LaunchArgs points at it.
  KernelLaunchParamsTy Params;
  Params.Data = const_cast<void *>(ArgumentsData);
  Params.Size = ArgumentsSize;
  LaunchArgs.ArgPtrs = reinterpret_cast<void **>(&Params);
  // Don't do anything with pointer indirection; use arg data as-is
  LaunchArgs.Flags.IsCUDA = true;

  // do_launch performs the launch and fills EventOut on success; no event is
  // created here.
  return do_launch(Queue, Device, Kernel, LaunchArgs, EventOut);
}

} // namespace offload
Expand Down
1 change: 1 addition & 0 deletions offload/unittests/OffloadAPI/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ add_offload_unittest("offload.unittests"
${CMAKE_CURRENT_SOURCE_DIR}/program/olDestroyProgram.cpp
${CMAKE_CURRENT_SOURCE_DIR}/kernel/olGetKernel.cpp
${CMAKE_CURRENT_SOURCE_DIR}/kernel/olLaunchKernel.cpp
${CMAKE_CURRENT_SOURCE_DIR}/kernel/olLaunchKernelSuggestedGroupSize.cpp
${CMAKE_CURRENT_SOURCE_DIR}/event/olDestroyEvent.cpp
${CMAKE_CURRENT_SOURCE_DIR}/event/olWaitEvent.cpp
)
Expand Down
Loading
Loading