Skip to content

[SYCL] Fix race that occurs when submitting to single queue in parallel #1872

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Jul 2, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion sycl/source/detail/kernel_program_cache.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,15 @@ class KernelProgramCache {
using ContextPtr = context_impl *;

using PiKernelT = std::remove_pointer<RT::PiKernel>::type;

struct BuildResultKernel : public BuildResult<PiKernelT> {
std::mutex MKernelMutex;

BuildResultKernel(PiKernelT *P, int S) : BuildResult(P, S) {}
};

using PiKernelPtrT = std::atomic<PiKernelT *>;
using KernelWithBuildStateT = BuildResult<PiKernelT>;
using KernelWithBuildStateT = BuildResultKernel;
using KernelByNameT = std::map<string_class, KernelWithBuildStateT>;
using KernelCacheT = std::map<RT::PiProgram, KernelByNameT>;

Expand Down
5 changes: 3 additions & 2 deletions sycl/source/detail/program_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -402,8 +402,9 @@ RT::PiKernel program_impl::get_pi_kernel(const string_class &KernelName) const {
RT::PiKernel Kernel;

if (is_cacheable()) {
Kernel = ProgramManager::getInstance().getOrCreateKernel(
MProgramModuleHandle, get_context(), KernelName, this);
std::tie(Kernel, std::ignore) =
ProgramManager::getInstance().getOrCreateKernel(
MProgramModuleHandle, get_context(), KernelName, this);
getPlugin().call<PiApiKind::piKernelRetain>(Kernel);
} else {
const detail::plugin &Plugin = getPlugin();
Expand Down
6 changes: 3 additions & 3 deletions sycl/source/detail/program_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,9 @@ class program_impl {
/// Tells whether a specialization constant has been set for this program.
bool hasSetSpecConstants() const { return !SpecConstRegistry.empty(); }

/// \return true if caching is allowed for this program.
bool is_cacheable() const { return MProgramAndKernelCachingAllowed; }

/// Returns the native plugin handle.
pi_native_handle getNative() const;

Expand Down Expand Up @@ -371,9 +374,6 @@ class program_impl {
/// \return a vector of devices managed by the plugin.
vector_class<RT::PiDevice> get_pi_devices() const;

/// \return true if caching is allowed for this program.
bool is_cacheable() const { return MProgramAndKernelCachingAllowed; }

/// \param Options is a string containing OpenCL C build options.
/// \return true if caching is allowed for this program and build options.
static bool is_cacheable_with_options(const string_class &Options) {
Expand Down
26 changes: 15 additions & 11 deletions sycl/source/detail/program_manager/program_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,9 @@ RetT *waitUntilBuilt(KernelProgramCache &Cache,
/// cache. Accepts nothing. Return pointer to built entity.
template <typename RetT, typename ExceptionT, typename KeyT, typename AcquireFT,
typename GetCacheFT, typename BuildFT>
RetT *getOrBuild(KernelProgramCache &KPCache, KeyT &&CacheKey,
AcquireFT &&Acquire, GetCacheFT &&GetCache, BuildFT &&Build) {
KernelProgramCache::BuildResult<RetT> *
getOrBuild(KernelProgramCache &KPCache, KeyT &&CacheKey, AcquireFT &&Acquire,
GetCacheFT &&GetCache, BuildFT &&Build) {
bool InsertionTookPlace;
KernelProgramCache::BuildResult<RetT> *BuildResult;

Expand All @@ -183,7 +184,7 @@ RetT *getOrBuild(KernelProgramCache &KPCache, KeyT &&CacheKey,
RetT *Result = waitUntilBuilt<ExceptionT>(KPCache, BuildResult);

if (Result)
return Result;
return BuildResult;

// Previous build is failed. There was no SYCL exception though.
// We might try to build once more.
Expand Down Expand Up @@ -213,7 +214,7 @@ RetT *getOrBuild(KernelProgramCache &KPCache, KeyT &&CacheKey,

KPCache.notifyAllBuild();

return Desired;
return BuildResult;
} catch (const exception &Ex) {
BuildResult->Error.Msg = Ex.what();
BuildResult->Error.Code = Ex.get_cl_code();
Expand Down Expand Up @@ -395,14 +396,15 @@ RT::PiProgram ProgramManager::getBuiltPIProgram(OSModuleHandle M,
if (Prg)
Prg->stableSerializeSpecConstRegistry(SpecConsts);

return getOrBuild<PiProgramT, compile_program_error>(
auto BuildResult = getOrBuild<PiProgramT, compile_program_error>(
Cache, KeyT(std::move(SpecConsts), KSId), AcquireF, GetF, BuildF);
return BuildResult->Ptr.load();
}

RT::PiKernel ProgramManager::getOrCreateKernel(OSModuleHandle M,
const context &Context,
const string_class &KernelName,
const program_impl *Prg) {
std::pair<RT::PiKernel, std::mutex *>
ProgramManager::getOrCreateKernel(OSModuleHandle M, const context &Context,
const string_class &KernelName,
const program_impl *Prg) {
if (DbgProgMgr > 0) {
std::cerr << ">>> ProgramManager::getOrCreateKernel(" << M << ", "
<< getRawSyclObjImpl(Context) << ", " << KernelName << ")\n";
Expand Down Expand Up @@ -436,8 +438,10 @@ RT::PiKernel ProgramManager::getOrCreateKernel(OSModuleHandle M,
return Result;
};

return getOrBuild<PiKernelT, invalid_object_error>(Cache, KernelName,
AcquireF, GetF, BuildF);
auto BuildResult = static_cast<KernelProgramCache::BuildResultKernel *>(
getOrBuild<PiKernelT, invalid_object_error>(Cache, KernelName, AcquireF,
GetF, BuildF));
return std::make_pair(BuildResult->Ptr.load(), &(BuildResult->MKernelMutex));
}

RT::PiProgram
Expand Down
6 changes: 3 additions & 3 deletions sycl/source/detail/program_manager/program_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,9 @@ class ProgramManager {
const string_class &KernelName,
const program_impl *Prg = nullptr,
bool JITCompilationIsRequired = false);
RT::PiKernel getOrCreateKernel(OSModuleHandle M, const context &Context,
const string_class &KernelName,
const program_impl *Prg);
std::pair<RT::PiKernel, std::mutex *>
getOrCreateKernel(OSModuleHandle M, const context &Context,
const string_class &KernelName, const program_impl *Prg);
RT::PiProgram getPiProgramFromPiKernel(RT::PiKernel Kernel,
const ContextImplPtr Context);

Expand Down
140 changes: 85 additions & 55 deletions sycl/source/detail/scheduler/commands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1629,6 +1629,65 @@ static void ReverseRangeDimensionsForKernel(NDRDescT &NDR) {
}
}

pi_result ExecCGCommand::SetKernelParamsAndLaunch(
CGExecKernel *ExecKernel, RT::PiKernel Kernel, NDRDescT &NDRDesc,
std::vector<RT::PiEvent> &RawEvents, RT::PiEvent &Event) {
const detail::plugin &Plugin = MQueue->getPlugin();
for (ArgDesc &Arg : ExecKernel->MArgs) {
switch (Arg.MType) {
case kernel_param_kind_t::kind_accessor: {
Requirement *Req = (Requirement *)(Arg.MPtr);
AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
RT::PiMem MemArg = (RT::PiMem)AllocaCmd->getMemAllocation();
if (Plugin.getBackend() == backend::opencl) {
Plugin.call<PiApiKind::piKernelSetArg>(Kernel, Arg.MIndex,
sizeof(RT::PiMem), &MemArg);
} else {
Plugin.call<PiApiKind::piextKernelSetArgMemObj>(Kernel, Arg.MIndex,
&MemArg);
}
break;
}
case kernel_param_kind_t::kind_std_layout: {
Plugin.call<PiApiKind::piKernelSetArg>(Kernel, Arg.MIndex, Arg.MSize,
Arg.MPtr);
break;
}
case kernel_param_kind_t::kind_sampler: {
sampler *SamplerPtr = (sampler *)Arg.MPtr;
RT::PiSampler Sampler = detail::getSyclObjImpl(*SamplerPtr)
->getOrCreateSampler(MQueue->get_context());
Plugin.call<PiApiKind::piKernelSetArg>(Kernel, Arg.MIndex,
sizeof(cl_sampler), &Sampler);
break;
}
case kernel_param_kind_t::kind_pointer: {
Plugin.call<PiApiKind::piextKernelSetArgPointer>(Kernel, Arg.MIndex,
Arg.MSize, Arg.MPtr);
break;
}
}
}

adjustNDRangePerKernel(NDRDesc, Kernel,
*(detail::getSyclObjImpl(MQueue->get_device())));

// Some PI Plugins (like OpenCL) require this call to enable USM
// For others, PI will turn this into a NOP.
Plugin.call<PiApiKind::piKernelSetExecInfo>(Kernel, PI_USM_INDIRECT_ACCESS,
sizeof(pi_bool), &PI_TRUE);

// Remember this information before the range dimensions are reversed
const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0);

ReverseRangeDimensionsForKernel(NDRDesc);
pi_result Error = Plugin.call_nocheck<PiApiKind::piEnqueueKernelLaunch>(
MQueue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
&NDRDesc.GlobalSize[0], HasLocalSize ? &NDRDesc.LocalSize[0] : nullptr,
RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0], &Event);
return Error;
}

// The function initialize accessors and calls lambda.
// The function is used as argument to piEnqueueNativeKernel which requires
// that the passed function takes one void* argument.
Expand Down Expand Up @@ -1809,71 +1868,42 @@ cl_int ExecCGCommand::enqueueImp() {

// Run OpenCL kernel
sycl::context Context = MQueue->get_context();
const detail::plugin &Plugin = MQueue->getPlugin();
RT::PiKernel Kernel = nullptr;
std::mutex *KernelMutex = nullptr;

if (nullptr != ExecKernel->MSyclKernel) {
assert(ExecKernel->MSyclKernel->get_info<info::kernel::context>() ==
Context);
Kernel = ExecKernel->MSyclKernel->getHandleRef();
} else
Kernel = detail::ProgramManager::getInstance().getOrCreateKernel(
ExecKernel->MOSModuleHandle, Context, ExecKernel->MKernelName,
nullptr);

for (ArgDesc &Arg : ExecKernel->MArgs) {
switch (Arg.MType) {
case kernel_param_kind_t::kind_accessor: {
Requirement *Req = (Requirement *)(Arg.MPtr);
AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
RT::PiMem MemArg = (RT::PiMem)AllocaCmd->getMemAllocation();
if (Plugin.getBackend() == backend::opencl) {
Plugin.call<PiApiKind::piKernelSetArg>(Kernel, Arg.MIndex,
sizeof(RT::PiMem), &MemArg);
} else {
Plugin.call<PiApiKind::piextKernelSetArgMemObj>(Kernel, Arg.MIndex,
&MemArg);
}
break;
}
case kernel_param_kind_t::kind_std_layout: {
Plugin.call<PiApiKind::piKernelSetArg>(Kernel, Arg.MIndex, Arg.MSize,
Arg.MPtr);
break;
}
case kernel_param_kind_t::kind_sampler: {
sampler *SamplerPtr = (sampler *)Arg.MPtr;
RT::PiSampler Sampler =
detail::getSyclObjImpl(*SamplerPtr)->getOrCreateSampler(Context);
Plugin.call<PiApiKind::piKernelSetArg>(Kernel, Arg.MIndex,
sizeof(cl_sampler), &Sampler);
break;
}
case kernel_param_kind_t::kind_pointer: {
Plugin.call<PiApiKind::piextKernelSetArgPointer>(Kernel, Arg.MIndex,
Arg.MSize, Arg.MPtr);
break;
}
auto SyclProg = detail::getSyclObjImpl(
ExecKernel->MSyclKernel->get_info<info::kernel::program>());
if (SyclProg->is_cacheable()) {
RT::PiKernel FoundKernel = nullptr;
std::tie(FoundKernel, KernelMutex) =
detail::ProgramManager::getInstance().getOrCreateKernel(
ExecKernel->MOSModuleHandle,
ExecKernel->MSyclKernel->get_info<info::kernel::context>(),
ExecKernel->MKernelName, SyclProg.get());
assert(FoundKernel == Kernel);
}
} else {
std::tie(Kernel, KernelMutex) =
detail::ProgramManager::getInstance().getOrCreateKernel(
ExecKernel->MOSModuleHandle, Context, ExecKernel->MKernelName,
nullptr);
}

adjustNDRangePerKernel(NDRDesc, Kernel,
*(detail::getSyclObjImpl(MQueue->get_device())));

// Some PI Plugins (like OpenCL) require this call to enable USM
// For others, PI will turn this into a NOP.
Plugin.call<PiApiKind::piKernelSetExecInfo>(Kernel, PI_USM_INDIRECT_ACCESS,
sizeof(pi_bool), &PI_TRUE);

// Remember this information before the range dimensions are reversed
const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0);

ReverseRangeDimensionsForKernel(NDRDesc);

pi_result Error = Plugin.call_nocheck<PiApiKind::piEnqueueKernelLaunch>(
MQueue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
&NDRDesc.GlobalSize[0], HasLocalSize ? &NDRDesc.LocalSize[0] : nullptr,
RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0], &Event);
pi_result Error = PI_SUCCESS;
if (KernelMutex != nullptr) {
// For cacheable kernels, we use per-kernel mutex
std::lock_guard<std::mutex> Lock(*KernelMutex);
Error = SetKernelParamsAndLaunch(ExecKernel, Kernel, NDRDesc, RawEvents,
Event);
} else {
Error = SetKernelParamsAndLaunch(ExecKernel, Kernel, NDRDesc, RawEvents,
Event);
}

if (PI_SUCCESS != Error) {
// If we have got non-success error code, let's analyze it to emit nice
Expand Down
5 changes: 5 additions & 0 deletions sycl/source/detail/scheduler/commands.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,11 @@ class ExecCGCommand : public Command {

AllocaCommandBase *getAllocaForReq(Requirement *Req);

pi_result SetKernelParamsAndLaunch(CGExecKernel *ExecKernel,
RT::PiKernel Kernel, NDRDescT &NDRDesc,
std::vector<RT::PiEvent> &RawEvents,
RT::PiEvent &Event);

std::unique_ptr<detail::CG> MCommandGroup;

friend class Command;
Expand Down