Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 30 additions & 108 deletions sycl/source/detail/scheduler/commands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2532,93 +2532,6 @@ static ur_result_t SetKernelParamsAndLaunch(
return Error;
}

// Sets arguments for a given kernel and device based on the argument type.
// This is a legacy path which the graphs extension still uses.
static void SetArgBasedOnType(
adapter_impl &Adapter, ur_kernel_handle_t Kernel,
device_image_impl *DeviceImageImpl,
const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
context_impl &ContextImpl, detail::ArgDesc &Arg, size_t NextTrueIndex) {
switch (Arg.MType) {
case kernel_param_kind_t::kind_dynamic_work_group_memory:
break;
case kernel_param_kind_t::kind_work_group_memory:
break;
case kernel_param_kind_t::kind_stream:
break;
case kernel_param_kind_t::kind_dynamic_accessor:
case kernel_param_kind_t::kind_accessor: {
Requirement *Req = (Requirement *)(Arg.MPtr);

// getMemAllocationFunc is nullptr when there are no requirements. However,
// we may pass default constructed accessors to a command, which don't add
// requirements. In such case, getMemAllocationFunc is nullptr, but it's a
// valid case, so we need to properly handle it.
ur_mem_handle_t MemArg =
getMemAllocationFunc
? reinterpret_cast<ur_mem_handle_t>(getMemAllocationFunc(Req))
: nullptr;
ur_kernel_arg_mem_obj_properties_t MemObjData{};
MemObjData.stype = UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES;
MemObjData.memoryAccess = AccessModeToUr(Req->MAccessMode);
Adapter.call<UrApiKind::urKernelSetArgMemObj>(Kernel, NextTrueIndex,
&MemObjData, MemArg);
break;
}
case kernel_param_kind_t::kind_std_layout: {
if (Arg.MPtr) {
Adapter.call<UrApiKind::urKernelSetArgValue>(
Kernel, NextTrueIndex, Arg.MSize, nullptr, Arg.MPtr);
} else {
Adapter.call<UrApiKind::urKernelSetArgLocal>(Kernel, NextTrueIndex,
Arg.MSize, nullptr);
}

break;
}
case kernel_param_kind_t::kind_struct_with_special_type: {
Adapter.call<UrApiKind::urKernelSetArgValue>(Kernel, NextTrueIndex,
Arg.MSize, nullptr, Arg.MPtr);
break;
}
case kernel_param_kind_t::kind_sampler: {
sampler *SamplerPtr = (sampler *)Arg.MPtr;
ur_sampler_handle_t Sampler =
(ur_sampler_handle_t)detail::getSyclObjImpl(*SamplerPtr)
->getOrCreateSampler(ContextImpl);
Adapter.call<UrApiKind::urKernelSetArgSampler>(Kernel, NextTrueIndex,
nullptr, Sampler);
break;
}
case kernel_param_kind_t::kind_pointer: {
// We need to de-rerence this to get the actual USM allocation - that's the
// pointer UR is expecting.
const void *Ptr = *static_cast<const void *const *>(Arg.MPtr);
Adapter.call<UrApiKind::urKernelSetArgPointer>(Kernel, NextTrueIndex,
nullptr, Ptr);
break;
}
case kernel_param_kind_t::kind_specialization_constants_buffer: {
assert(DeviceImageImpl != nullptr);
ur_mem_handle_t SpecConstsBuffer =
DeviceImageImpl->get_spec_const_buffer_ref();

ur_kernel_arg_mem_obj_properties_t MemObjProps{};
MemObjProps.pNext = nullptr;
MemObjProps.stype = UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES;
MemObjProps.memoryAccess = UR_MEM_FLAG_READ_ONLY;
Adapter.call<UrApiKind::urKernelSetArgMemObj>(
Kernel, NextTrueIndex, &MemObjProps, SpecConstsBuffer);
break;
}
case kernel_param_kind_t::kind_invalid:
throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
"Invalid kernel param kind " +
codeToString(UR_RESULT_ERROR_INVALID_VALUE));
break;
}
}

static std::tuple<ur_kernel_handle_t, device_image_impl *,
const KernelArgMask *>
getCGKernelInfo(const CGExecKernel &CommandGroup, context_impl &ContextImpl,
Expand Down Expand Up @@ -2689,17 +2602,22 @@ ur_result_t enqueueImpCommandBufferKernel(
AltUrKernels.push_back(AltUrKernel);
}

adapter_impl &Adapter = ContextImpl.getAdapter();
auto SetFunc = [&Adapter, &UrKernel, &ContextImpl, &getMemAllocationFunc,
DeviceImageImpl](sycl::detail::ArgDesc &Arg,
size_t NextTrueIndex) {
sycl::detail::SetArgBasedOnType(Adapter, UrKernel, DeviceImageImpl,
getMemAllocationFunc, ContextImpl, Arg,
NextTrueIndex);
};
// Copy args for modification
auto Args = CommandGroup.MArgs;
sycl::detail::applyFuncOnFilteredArgs(EliminatedArgMask, Args, SetFunc);

// just a performance optimization - avoid heap allocations
static thread_local std::vector<ur_exp_kernel_arg_properties_t> UrCmdArgs;
UrCmdArgs.clear();
UrCmdArgs.reserve(Args.size());

adapter_impl &Adapter = ContextImpl.getAdapter();
auto GetFunc = [&DeviceImageImpl, &getMemAllocationFunc,
&ContextImpl](detail::ArgDesc &Arg, size_t NextTrueIndex) {
GetUrArgsBasedOnType(DeviceImageImpl, getMemAllocationFunc, ContextImpl,
Arg, NextTrueIndex, UrCmdArgs);
};

sycl::detail::applyFuncOnFilteredArgs(EliminatedArgMask, Args, GetFunc);

const std::optional<int> &ImplicitLocalArg =
CommandGroup.MDeviceKernelInfo.getImplicitLocalArgPos();
Expand All @@ -2709,9 +2627,12 @@ ur_result_t enqueueImpCommandBufferKernel(
// CUDA-style local memory setting. Note that we may have -1 as a position,
// this indicates the buffer is actually unused and was elided.
if (ImplicitLocalArg.has_value() && ImplicitLocalArg.value() != -1) {
Adapter.call<UrApiKind::urKernelSetArgLocal>(
UrKernel, ImplicitLocalArg.value(),
CommandGroup.MKernelWorkGroupMemorySize, nullptr);
UrCmdArgs.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES,
nullptr,
UR_EXP_KERNEL_ARG_TYPE_LOCAL,
static_cast<uint32_t>(ImplicitLocalArg.value()),
CommandGroup.MKernelWorkGroupMemorySize,
{nullptr}});
}

// Remember this information before the range dimensions are reversed
Expand Down Expand Up @@ -2766,15 +2687,16 @@ ur_result_t enqueueImpCommandBufferKernel(
ur_exp_command_buffer_info_t::UR_EXP_COMMAND_BUFFER_INFO_DESCRIPTOR,
sizeof(ur_exp_command_buffer_desc_t), &CommandBufferDesc, nullptr);

ur_result_t Res =
Adapter.call_nocheck<UrApiKind::urCommandBufferAppendKernelLaunchExp>(
CommandBuffer, UrKernel, NDRDesc.Dims,
HasOffset ? &NDRDesc.GlobalOffset[0] : nullptr,
&NDRDesc.GlobalSize[0], LocalSize, AltUrKernels.size(),
AltUrKernels.size() ? AltUrKernels.data() : nullptr,
SyncPoints.size(), SyncPoints.size() ? SyncPoints.data() : nullptr, 0,
nullptr, OutSyncPoint, nullptr,
CommandBufferDesc.isUpdatable ? OutCommand : nullptr);
// urCommandBufferAppendKernelLaunchExp ->
// urCommandBufferAppendKernelLaunchWithArgsExp
ur_result_t Res = Adapter.call_nocheck<
UrApiKind::urCommandBufferAppendKernelLaunchWithArgsExp>(
CommandBuffer, UrKernel, NDRDesc.Dims,
HasOffset ? &NDRDesc.GlobalOffset[0] : nullptr, &NDRDesc.GlobalSize[0],
LocalSize, UrCmdArgs.size(), UrCmdArgs.data(), AltUrKernels.size(),
AltUrKernels.size() ? AltUrKernels.data() : nullptr, SyncPoints.size(),
SyncPoints.size() ? SyncPoints.data() : nullptr, 0, nullptr, OutSyncPoint,
nullptr, CommandBufferDesc.isUpdatable ? OutCommand : nullptr);

if (Res != UR_RESULT_SUCCESS) {
detail::enqueue_kernel_launch::handleErrorOrWarning(Res, DeviceImpl,
Expand Down
2 changes: 1 addition & 1 deletion sycl/test-e2e/Graph/Explicit/kernel_bundle.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
//
// CHECK:<--- urKernelRetain(.hKernel = [[KERNEL_HANDLE]]) -> UR_RESULT_SUCCESS
//
// CHECK:<--- urCommandBufferAppendKernelLaunchExp(
// CHECK:<--- urCommandBufferAppendKernelLaunchWithArgsExp(
// CHECK-SAME: .hKernel = [[KERNEL_HANDLE]]
//
// CHECK:<--- urKernelRelease(.hKernel = [[KERNEL_HANDLE]]) -> UR_RESULT_SUCCESS
Expand Down
2 changes: 1 addition & 1 deletion sycl/test-e2e/Graph/RecordReplay/kernel_bundle.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
// CHECK-SAME: .hKernel = [[KERNEL_HANDLE]]
// CHECK-SAME: -> UR_RESULT_SUCCESS;
//
// CHECK:<--- urCommandBufferAppendKernelLaunchExp(
// CHECK:<--- urCommandBufferAppendKernelLaunchWithArgsExp(
// CHECK-SAME: .hKernel = [[KERNEL_HANDLE]]
//
// CHECK:<--- urKernelRelease(
Expand Down
15 changes: 1 addition & 14 deletions sycl/test-e2e/Graph/Update/dyn_cgf_different_arg_nums.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,20 +49,7 @@ int main() {
});
};

// CHECK: <--- urKernelSetArgPointer(
// CHECK-SAME: .hKernel = [[KERNEL_HANDLE1:[0-9a-fA-Fx]+]]
// CHECL-SAME: .argIndex = 0

// CHECK: <--- urKernelSetArgValue
// CHECK-SAME: .hKernel = [[KERNEL_HANDLE1]]
// CHECK-SAME: .argIndex = 1

// CHECK: <--- urKernelSetArgValue
// CHECK-SAME: .hKernel = [[KERNEL_HANDLE1]]
// CHECK-SAME: .argIndex = 2

// CHECK: <--- urCommandBufferAppendKernelLaunchExp
// CHECK-SAME: .hKernel = [[KERNEL_HANDLE1]]
// CHECK: <--- urCommandBufferAppendKernelLaunchWithArgsExp
// CHECK-SAME: .numKernelAlternatives = 3
// CHECK-SAME: .phKernelAlternatives = {{[0-9a-fA-Fx]* ?}}{[[KERNEL_HANDLE2:[0-9a-fA-Fx]+]], [[KERNEL_HANDLE3:[0-9a-fA-Fx]+]], [[KERNEL_HANDLE4:[0-9a-fA-Fx]+]]}
auto DynamicCG =
Expand Down
14 changes: 7 additions & 7 deletions sycl/unittests/Extensions/CommandGraph/Update.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -426,14 +426,14 @@ TEST_F(WholeGraphUpdateTest, EmptyNode) {

// Vars and callbacks for tracking how many times mocked functions are called
static int GetInfoCount = 0;
static int AppendKernelLaunchCount = 0;
static int AppendKernelLaunchWithArgsCount = 0;
static ur_result_t redefinedCommandBufferGetInfoExpAfter(void *pParams) {
GetInfoCount++;
return UR_RESULT_SUCCESS;
}
static ur_result_t
redefinedCommandBufferAppendKernelLaunchExpAfter(void *pParams) {
AppendKernelLaunchCount++;
redefinedCommandBufferAppendKernelLaunchWithArgsExpAfter(void *pParams) {
AppendKernelLaunchWithArgsCount++;
return UR_RESULT_SUCCESS;
}

Expand All @@ -445,16 +445,16 @@ TEST_F(CommandGraphTest, CheckFinalizeBehavior) {
mock::getCallbacks().set_after_callback(
"urCommandBufferGetInfoExp", &redefinedCommandBufferGetInfoExpAfter);
mock::getCallbacks().set_after_callback(
"urCommandBufferAppendKernelLaunchExp",
&redefinedCommandBufferAppendKernelLaunchExpAfter);
"urCommandBufferAppendKernelLaunchWithArgsExp",
&redefinedCommandBufferAppendKernelLaunchWithArgsExpAfter);

ASSERT_NO_THROW(Graph.finalize(experimental::property::graph::updatable{}));
// GetInfo and AppendKernelLaunch should be called once each time a node is
// added to a command buffer during finalization
ASSERT_EQ(GetInfoCount, 1);
ASSERT_EQ(AppendKernelLaunchCount, 1);
ASSERT_EQ(AppendKernelLaunchWithArgsCount, 1);

ASSERT_NO_THROW(Graph.finalize());
ASSERT_EQ(GetInfoCount, 2);
ASSERT_EQ(AppendKernelLaunchCount, 2);
ASSERT_EQ(AppendKernelLaunchWithArgsCount, 2);
}
Loading
Loading