Skip to content

Commit 95d3ec6

Browse files
authored
[SYCL] Fix race that occurs when submitting to single queue in parallel (#1872)
This PR aims to fix the race when setting kernel parameters in parallel. That occurs when submitting to single queue by multiple threads. - In cases when we use cacheable kernel we must use mutex, as we share the same instance by threads. - In cases when we don't use cacheable kernel each thread has its own instance of the kernel and so race-condition doesn't occur. Regression: queue submit. single queue Signed-off-by: Alexander Flegontov <alexander.flegontov@intel.com>
1 parent df638c2 commit 95d3ec6

File tree

7 files changed

+122
-75
lines changed

7 files changed

+122
-75
lines changed

sycl/source/detail/kernel_program_cache.hpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,15 @@ class KernelProgramCache {
5858
using ContextPtr = context_impl *;
5959

6060
using PiKernelT = std::remove_pointer<RT::PiKernel>::type;
61+
62+
struct BuildResultKernel : public BuildResult<PiKernelT> {
63+
std::mutex MKernelMutex;
64+
65+
BuildResultKernel(PiKernelT *P, int S) : BuildResult(P, S) {}
66+
};
67+
6168
using PiKernelPtrT = std::atomic<PiKernelT *>;
62-
using KernelWithBuildStateT = BuildResult<PiKernelT>;
69+
using KernelWithBuildStateT = BuildResultKernel;
6370
using KernelByNameT = std::map<string_class, KernelWithBuildStateT>;
6471
using KernelCacheT = std::map<RT::PiProgram, KernelByNameT>;
6572

sycl/source/detail/program_impl.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -402,8 +402,9 @@ RT::PiKernel program_impl::get_pi_kernel(const string_class &KernelName) const {
402402
RT::PiKernel Kernel;
403403

404404
if (is_cacheable()) {
405-
Kernel = ProgramManager::getInstance().getOrCreateKernel(
406-
MProgramModuleHandle, get_context(), KernelName, this);
405+
std::tie(Kernel, std::ignore) =
406+
ProgramManager::getInstance().getOrCreateKernel(
407+
MProgramModuleHandle, get_context(), KernelName, this);
407408
getPlugin().call<PiApiKind::piKernelRetain>(Kernel);
408409
} else {
409410
const detail::plugin &Plugin = getPlugin();

sycl/source/detail/program_impl.hpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,9 @@ class program_impl {
318318
/// Tells whether a specialization constant has been set for this program.
319319
bool hasSetSpecConstants() const { return !SpecConstRegistry.empty(); }
320320

321+
/// \return true if caching is allowed for this program.
322+
bool is_cacheable() const { return MProgramAndKernelCachingAllowed; }
323+
321324
/// Returns the native plugin handle.
322325
pi_native_handle getNative() const;
323326

@@ -371,9 +374,6 @@ class program_impl {
371374
/// \return a vector of devices managed by the plugin.
372375
vector_class<RT::PiDevice> get_pi_devices() const;
373376

374-
/// \return true if caching is allowed for this program.
375-
bool is_cacheable() const { return MProgramAndKernelCachingAllowed; }
376-
377377
/// \param Options is a string containing OpenCL C build options.
378378
/// \return true if caching is allowed for this program and build options.
379379
static bool is_cacheable_with_options(const string_class &Options) {

sycl/source/detail/program_manager/program_manager.cpp

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -160,8 +160,9 @@ RetT *waitUntilBuilt(KernelProgramCache &Cache,
160160
/// cache. Accepts nothing. Return pointer to built entity.
161161
template <typename RetT, typename ExceptionT, typename KeyT, typename AcquireFT,
162162
typename GetCacheFT, typename BuildFT>
163-
RetT *getOrBuild(KernelProgramCache &KPCache, KeyT &&CacheKey,
164-
AcquireFT &&Acquire, GetCacheFT &&GetCache, BuildFT &&Build) {
163+
KernelProgramCache::BuildResult<RetT> *
164+
getOrBuild(KernelProgramCache &KPCache, KeyT &&CacheKey, AcquireFT &&Acquire,
165+
GetCacheFT &&GetCache, BuildFT &&Build) {
165166
bool InsertionTookPlace;
166167
KernelProgramCache::BuildResult<RetT> *BuildResult;
167168

@@ -183,7 +184,7 @@ RetT *getOrBuild(KernelProgramCache &KPCache, KeyT &&CacheKey,
183184
RetT *Result = waitUntilBuilt<ExceptionT>(KPCache, BuildResult);
184185

185186
if (Result)
186-
return Result;
187+
return BuildResult;
187188

188189
// Previous build is failed. There was no SYCL exception though.
189190
// We might try to build once more.
@@ -213,7 +214,7 @@ RetT *getOrBuild(KernelProgramCache &KPCache, KeyT &&CacheKey,
213214

214215
KPCache.notifyAllBuild();
215216

216-
return Desired;
217+
return BuildResult;
217218
} catch (const exception &Ex) {
218219
BuildResult->Error.Msg = Ex.what();
219220
BuildResult->Error.Code = Ex.get_cl_code();
@@ -395,14 +396,15 @@ RT::PiProgram ProgramManager::getBuiltPIProgram(OSModuleHandle M,
395396
if (Prg)
396397
Prg->stableSerializeSpecConstRegistry(SpecConsts);
397398

398-
return getOrBuild<PiProgramT, compile_program_error>(
399+
auto BuildResult = getOrBuild<PiProgramT, compile_program_error>(
399400
Cache, KeyT(std::move(SpecConsts), KSId), AcquireF, GetF, BuildF);
401+
return BuildResult->Ptr.load();
400402
}
401403

402-
RT::PiKernel ProgramManager::getOrCreateKernel(OSModuleHandle M,
403-
const context &Context,
404-
const string_class &KernelName,
405-
const program_impl *Prg) {
404+
std::pair<RT::PiKernel, std::mutex *>
405+
ProgramManager::getOrCreateKernel(OSModuleHandle M, const context &Context,
406+
const string_class &KernelName,
407+
const program_impl *Prg) {
406408
if (DbgProgMgr > 0) {
407409
std::cerr << ">>> ProgramManager::getOrCreateKernel(" << M << ", "
408410
<< getRawSyclObjImpl(Context) << ", " << KernelName << ")\n";
@@ -436,8 +438,10 @@ RT::PiKernel ProgramManager::getOrCreateKernel(OSModuleHandle M,
436438
return Result;
437439
};
438440

439-
return getOrBuild<PiKernelT, invalid_object_error>(Cache, KernelName,
440-
AcquireF, GetF, BuildF);
441+
auto BuildResult = static_cast<KernelProgramCache::BuildResultKernel *>(
442+
getOrBuild<PiKernelT, invalid_object_error>(Cache, KernelName, AcquireF,
443+
GetF, BuildF));
444+
return std::make_pair(BuildResult->Ptr.load(), &(BuildResult->MKernelMutex));
441445
}
442446

443447
RT::PiProgram

sycl/source/detail/program_manager/program_manager.hpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -81,9 +81,9 @@ class ProgramManager {
8181
const string_class &KernelName,
8282
const program_impl *Prg = nullptr,
8383
bool JITCompilationIsRequired = false);
84-
RT::PiKernel getOrCreateKernel(OSModuleHandle M, const context &Context,
85-
const string_class &KernelName,
86-
const program_impl *Prg);
84+
std::pair<RT::PiKernel, std::mutex *>
85+
getOrCreateKernel(OSModuleHandle M, const context &Context,
86+
const string_class &KernelName, const program_impl *Prg);
8787
RT::PiProgram getPiProgramFromPiKernel(RT::PiKernel Kernel,
8888
const ContextImplPtr Context);
8989

sycl/source/detail/scheduler/commands.cpp

Lines changed: 85 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1629,6 +1629,65 @@ static void ReverseRangeDimensionsForKernel(NDRDescT &NDR) {
16291629
}
16301630
}
16311631

1632+
pi_result ExecCGCommand::SetKernelParamsAndLaunch(
1633+
CGExecKernel *ExecKernel, RT::PiKernel Kernel, NDRDescT &NDRDesc,
1634+
std::vector<RT::PiEvent> &RawEvents, RT::PiEvent &Event) {
1635+
const detail::plugin &Plugin = MQueue->getPlugin();
1636+
for (ArgDesc &Arg : ExecKernel->MArgs) {
1637+
switch (Arg.MType) {
1638+
case kernel_param_kind_t::kind_accessor: {
1639+
Requirement *Req = (Requirement *)(Arg.MPtr);
1640+
AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
1641+
RT::PiMem MemArg = (RT::PiMem)AllocaCmd->getMemAllocation();
1642+
if (Plugin.getBackend() == backend::opencl) {
1643+
Plugin.call<PiApiKind::piKernelSetArg>(Kernel, Arg.MIndex,
1644+
sizeof(RT::PiMem), &MemArg);
1645+
} else {
1646+
Plugin.call<PiApiKind::piextKernelSetArgMemObj>(Kernel, Arg.MIndex,
1647+
&MemArg);
1648+
}
1649+
break;
1650+
}
1651+
case kernel_param_kind_t::kind_std_layout: {
1652+
Plugin.call<PiApiKind::piKernelSetArg>(Kernel, Arg.MIndex, Arg.MSize,
1653+
Arg.MPtr);
1654+
break;
1655+
}
1656+
case kernel_param_kind_t::kind_sampler: {
1657+
sampler *SamplerPtr = (sampler *)Arg.MPtr;
1658+
RT::PiSampler Sampler = detail::getSyclObjImpl(*SamplerPtr)
1659+
->getOrCreateSampler(MQueue->get_context());
1660+
Plugin.call<PiApiKind::piKernelSetArg>(Kernel, Arg.MIndex,
1661+
sizeof(cl_sampler), &Sampler);
1662+
break;
1663+
}
1664+
case kernel_param_kind_t::kind_pointer: {
1665+
Plugin.call<PiApiKind::piextKernelSetArgPointer>(Kernel, Arg.MIndex,
1666+
Arg.MSize, Arg.MPtr);
1667+
break;
1668+
}
1669+
}
1670+
}
1671+
1672+
adjustNDRangePerKernel(NDRDesc, Kernel,
1673+
*(detail::getSyclObjImpl(MQueue->get_device())));
1674+
1675+
// Some PI Plugins (like OpenCL) require this call to enable USM
1676+
// For others, PI will turn this into a NOP.
1677+
Plugin.call<PiApiKind::piKernelSetExecInfo>(Kernel, PI_USM_INDIRECT_ACCESS,
1678+
sizeof(pi_bool), &PI_TRUE);
1679+
1680+
// Remember this information before the range dimensions are reversed
1681+
const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0);
1682+
1683+
ReverseRangeDimensionsForKernel(NDRDesc);
1684+
pi_result Error = Plugin.call_nocheck<PiApiKind::piEnqueueKernelLaunch>(
1685+
MQueue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
1686+
&NDRDesc.GlobalSize[0], HasLocalSize ? &NDRDesc.LocalSize[0] : nullptr,
1687+
RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0], &Event);
1688+
return Error;
1689+
}
1690+
16321691
// The function initialize accessors and calls lambda.
16331692
// The function is used as argument to piEnqueueNativeKernel which requires
16341693
// that the passed function takes one void* argument.
@@ -1809,71 +1868,42 @@ cl_int ExecCGCommand::enqueueImp() {
18091868

18101869
// Run OpenCL kernel
18111870
sycl::context Context = MQueue->get_context();
1812-
const detail::plugin &Plugin = MQueue->getPlugin();
18131871
RT::PiKernel Kernel = nullptr;
1872+
std::mutex *KernelMutex = nullptr;
18141873

18151874
if (nullptr != ExecKernel->MSyclKernel) {
18161875
assert(ExecKernel->MSyclKernel->get_info<info::kernel::context>() ==
18171876
Context);
18181877
Kernel = ExecKernel->MSyclKernel->getHandleRef();
1819-
} else
1820-
Kernel = detail::ProgramManager::getInstance().getOrCreateKernel(
1821-
ExecKernel->MOSModuleHandle, Context, ExecKernel->MKernelName,
1822-
nullptr);
18231878

1824-
for (ArgDesc &Arg : ExecKernel->MArgs) {
1825-
switch (Arg.MType) {
1826-
case kernel_param_kind_t::kind_accessor: {
1827-
Requirement *Req = (Requirement *)(Arg.MPtr);
1828-
AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
1829-
RT::PiMem MemArg = (RT::PiMem)AllocaCmd->getMemAllocation();
1830-
if (Plugin.getBackend() == backend::opencl) {
1831-
Plugin.call<PiApiKind::piKernelSetArg>(Kernel, Arg.MIndex,
1832-
sizeof(RT::PiMem), &MemArg);
1833-
} else {
1834-
Plugin.call<PiApiKind::piextKernelSetArgMemObj>(Kernel, Arg.MIndex,
1835-
&MemArg);
1836-
}
1837-
break;
1838-
}
1839-
case kernel_param_kind_t::kind_std_layout: {
1840-
Plugin.call<PiApiKind::piKernelSetArg>(Kernel, Arg.MIndex, Arg.MSize,
1841-
Arg.MPtr);
1842-
break;
1843-
}
1844-
case kernel_param_kind_t::kind_sampler: {
1845-
sampler *SamplerPtr = (sampler *)Arg.MPtr;
1846-
RT::PiSampler Sampler =
1847-
detail::getSyclObjImpl(*SamplerPtr)->getOrCreateSampler(Context);
1848-
Plugin.call<PiApiKind::piKernelSetArg>(Kernel, Arg.MIndex,
1849-
sizeof(cl_sampler), &Sampler);
1850-
break;
1851-
}
1852-
case kernel_param_kind_t::kind_pointer: {
1853-
Plugin.call<PiApiKind::piextKernelSetArgPointer>(Kernel, Arg.MIndex,
1854-
Arg.MSize, Arg.MPtr);
1855-
break;
1856-
}
1879+
auto SyclProg = detail::getSyclObjImpl(
1880+
ExecKernel->MSyclKernel->get_info<info::kernel::program>());
1881+
if (SyclProg->is_cacheable()) {
1882+
RT::PiKernel FoundKernel = nullptr;
1883+
std::tie(FoundKernel, KernelMutex) =
1884+
detail::ProgramManager::getInstance().getOrCreateKernel(
1885+
ExecKernel->MOSModuleHandle,
1886+
ExecKernel->MSyclKernel->get_info<info::kernel::context>(),
1887+
ExecKernel->MKernelName, SyclProg.get());
1888+
assert(FoundKernel == Kernel);
18571889
}
1890+
} else {
1891+
std::tie(Kernel, KernelMutex) =
1892+
detail::ProgramManager::getInstance().getOrCreateKernel(
1893+
ExecKernel->MOSModuleHandle, Context, ExecKernel->MKernelName,
1894+
nullptr);
18581895
}
18591896

1860-
adjustNDRangePerKernel(NDRDesc, Kernel,
1861-
*(detail::getSyclObjImpl(MQueue->get_device())));
1862-
1863-
// Some PI Plugins (like OpenCL) require this call to enable USM
1864-
// For others, PI will turn this into a NOP.
1865-
Plugin.call<PiApiKind::piKernelSetExecInfo>(Kernel, PI_USM_INDIRECT_ACCESS,
1866-
sizeof(pi_bool), &PI_TRUE);
1867-
1868-
// Remember this information before the range dimensions are reversed
1869-
const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0);
1870-
1871-
ReverseRangeDimensionsForKernel(NDRDesc);
1872-
1873-
pi_result Error = Plugin.call_nocheck<PiApiKind::piEnqueueKernelLaunch>(
1874-
MQueue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
1875-
&NDRDesc.GlobalSize[0], HasLocalSize ? &NDRDesc.LocalSize[0] : nullptr,
1876-
RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0], &Event);
1897+
pi_result Error = PI_SUCCESS;
1898+
if (KernelMutex != nullptr) {
1899+
// For cacheable kernels, we use per-kernel mutex
1900+
std::lock_guard<std::mutex> Lock(*KernelMutex);
1901+
Error = SetKernelParamsAndLaunch(ExecKernel, Kernel, NDRDesc, RawEvents,
1902+
Event);
1903+
} else {
1904+
Error = SetKernelParamsAndLaunch(ExecKernel, Kernel, NDRDesc, RawEvents,
1905+
Event);
1906+
}
18771907

18781908
if (PI_SUCCESS != Error) {
18791909
// If we have got non-success error code, let's analyze it to emit nice

sycl/source/detail/scheduler/commands.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -482,6 +482,11 @@ class ExecCGCommand : public Command {
482482

483483
AllocaCommandBase *getAllocaForReq(Requirement *Req);
484484

485+
pi_result SetKernelParamsAndLaunch(CGExecKernel *ExecKernel,
486+
RT::PiKernel Kernel, NDRDescT &NDRDesc,
487+
std::vector<RT::PiEvent> &RawEvents,
488+
RT::PiEvent &Event);
489+
485490
std::unique_ptr<detail::CG> MCommandGroup;
486491

487492
friend class Command;

0 commit comments

Comments
 (0)