@@ -1629,6 +1629,65 @@ static void ReverseRangeDimensionsForKernel(NDRDescT &NDR) {
1629
1629
}
1630
1630
}
1631
1631
1632
+ pi_result ExecCGCommand::SetKernelParamsAndLaunch (
1633
+ CGExecKernel *ExecKernel, RT::PiKernel Kernel, NDRDescT &NDRDesc,
1634
+ std::vector<RT::PiEvent> &RawEvents, RT::PiEvent &Event) {
1635
+ const detail::plugin &Plugin = MQueue->getPlugin ();
1636
+ for (ArgDesc &Arg : ExecKernel->MArgs ) {
1637
+ switch (Arg.MType ) {
1638
+ case kernel_param_kind_t ::kind_accessor: {
1639
+ Requirement *Req = (Requirement *)(Arg.MPtr );
1640
+ AllocaCommandBase *AllocaCmd = getAllocaForReq (Req);
1641
+ RT::PiMem MemArg = (RT::PiMem)AllocaCmd->getMemAllocation ();
1642
+ if (Plugin.getBackend () == backend::opencl) {
1643
+ Plugin.call <PiApiKind::piKernelSetArg>(Kernel, Arg.MIndex ,
1644
+ sizeof (RT::PiMem), &MemArg);
1645
+ } else {
1646
+ Plugin.call <PiApiKind::piextKernelSetArgMemObj>(Kernel, Arg.MIndex ,
1647
+ &MemArg);
1648
+ }
1649
+ break ;
1650
+ }
1651
+ case kernel_param_kind_t ::kind_std_layout: {
1652
+ Plugin.call <PiApiKind::piKernelSetArg>(Kernel, Arg.MIndex , Arg.MSize ,
1653
+ Arg.MPtr );
1654
+ break ;
1655
+ }
1656
+ case kernel_param_kind_t ::kind_sampler: {
1657
+ sampler *SamplerPtr = (sampler *)Arg.MPtr ;
1658
+ RT::PiSampler Sampler = detail::getSyclObjImpl (*SamplerPtr)
1659
+ ->getOrCreateSampler (MQueue->get_context ());
1660
+ Plugin.call <PiApiKind::piKernelSetArg>(Kernel, Arg.MIndex ,
1661
+ sizeof (cl_sampler), &Sampler);
1662
+ break ;
1663
+ }
1664
+ case kernel_param_kind_t ::kind_pointer: {
1665
+ Plugin.call <PiApiKind::piextKernelSetArgPointer>(Kernel, Arg.MIndex ,
1666
+ Arg.MSize , Arg.MPtr );
1667
+ break ;
1668
+ }
1669
+ }
1670
+ }
1671
+
1672
+ adjustNDRangePerKernel (NDRDesc, Kernel,
1673
+ *(detail::getSyclObjImpl (MQueue->get_device ())));
1674
+
1675
+ // Some PI Plugins (like OpenCL) require this call to enable USM
1676
+ // For others, PI will turn this into a NOP.
1677
+ Plugin.call <PiApiKind::piKernelSetExecInfo>(Kernel, PI_USM_INDIRECT_ACCESS,
1678
+ sizeof (pi_bool), &PI_TRUE);
1679
+
1680
+ // Remember this information before the range dimensions are reversed
1681
+ const bool HasLocalSize = (NDRDesc.LocalSize [0 ] != 0 );
1682
+
1683
+ ReverseRangeDimensionsForKernel (NDRDesc);
1684
+ pi_result Error = Plugin.call_nocheck <PiApiKind::piEnqueueKernelLaunch>(
1685
+ MQueue->getHandleRef (), Kernel, NDRDesc.Dims , &NDRDesc.GlobalOffset [0 ],
1686
+ &NDRDesc.GlobalSize [0 ], HasLocalSize ? &NDRDesc.LocalSize [0 ] : nullptr ,
1687
+ RawEvents.size (), RawEvents.empty () ? nullptr : &RawEvents[0 ], &Event);
1688
+ return Error;
1689
+ }
1690
+
1632
1691
// The function initialize accessors and calls lambda.
1633
1692
// The function is used as argument to piEnqueueNativeKernel which requires
1634
1693
// that the passed function takes one void* argument.
@@ -1809,71 +1868,42 @@ cl_int ExecCGCommand::enqueueImp() {
1809
1868
1810
1869
// Run OpenCL kernel
1811
1870
sycl::context Context = MQueue->get_context ();
1812
- const detail::plugin &Plugin = MQueue->getPlugin ();
1813
1871
RT::PiKernel Kernel = nullptr ;
1872
+ std::mutex *KernelMutex = nullptr ;
1814
1873
1815
1874
if (nullptr != ExecKernel->MSyclKernel ) {
1816
1875
assert (ExecKernel->MSyclKernel ->get_info <info::kernel::context>() ==
1817
1876
Context);
1818
1877
Kernel = ExecKernel->MSyclKernel ->getHandleRef ();
1819
- } else
1820
- Kernel = detail::ProgramManager::getInstance ().getOrCreateKernel (
1821
- ExecKernel->MOSModuleHandle , Context, ExecKernel->MKernelName ,
1822
- nullptr );
1823
1878
1824
- for (ArgDesc &Arg : ExecKernel->MArgs ) {
1825
- switch (Arg.MType ) {
1826
- case kernel_param_kind_t ::kind_accessor: {
1827
- Requirement *Req = (Requirement *)(Arg.MPtr );
1828
- AllocaCommandBase *AllocaCmd = getAllocaForReq (Req);
1829
- RT::PiMem MemArg = (RT::PiMem)AllocaCmd->getMemAllocation ();
1830
- if (Plugin.getBackend () == backend::opencl) {
1831
- Plugin.call <PiApiKind::piKernelSetArg>(Kernel, Arg.MIndex ,
1832
- sizeof (RT::PiMem), &MemArg);
1833
- } else {
1834
- Plugin.call <PiApiKind::piextKernelSetArgMemObj>(Kernel, Arg.MIndex ,
1835
- &MemArg);
1836
- }
1837
- break ;
1838
- }
1839
- case kernel_param_kind_t ::kind_std_layout: {
1840
- Plugin.call <PiApiKind::piKernelSetArg>(Kernel, Arg.MIndex , Arg.MSize ,
1841
- Arg.MPtr );
1842
- break ;
1843
- }
1844
- case kernel_param_kind_t ::kind_sampler: {
1845
- sampler *SamplerPtr = (sampler *)Arg.MPtr ;
1846
- RT::PiSampler Sampler =
1847
- detail::getSyclObjImpl (*SamplerPtr)->getOrCreateSampler (Context);
1848
- Plugin.call <PiApiKind::piKernelSetArg>(Kernel, Arg.MIndex ,
1849
- sizeof (cl_sampler), &Sampler);
1850
- break ;
1851
- }
1852
- case kernel_param_kind_t ::kind_pointer: {
1853
- Plugin.call <PiApiKind::piextKernelSetArgPointer>(Kernel, Arg.MIndex ,
1854
- Arg.MSize , Arg.MPtr );
1855
- break ;
1856
- }
1879
+ auto SyclProg = detail::getSyclObjImpl (
1880
+ ExecKernel->MSyclKernel ->get_info <info::kernel::program>());
1881
+ if (SyclProg->is_cacheable ()) {
1882
+ RT::PiKernel FoundKernel = nullptr ;
1883
+ std::tie (FoundKernel, KernelMutex) =
1884
+ detail::ProgramManager::getInstance ().getOrCreateKernel (
1885
+ ExecKernel->MOSModuleHandle ,
1886
+ ExecKernel->MSyclKernel ->get_info <info::kernel::context>(),
1887
+ ExecKernel->MKernelName , SyclProg.get ());
1888
+ assert (FoundKernel == Kernel);
1857
1889
}
1890
+ } else {
1891
+ std::tie (Kernel, KernelMutex) =
1892
+ detail::ProgramManager::getInstance ().getOrCreateKernel (
1893
+ ExecKernel->MOSModuleHandle , Context, ExecKernel->MKernelName ,
1894
+ nullptr );
1858
1895
}
1859
1896
1860
- adjustNDRangePerKernel (NDRDesc, Kernel,
1861
- *(detail::getSyclObjImpl (MQueue->get_device ())));
1862
-
1863
- // Some PI Plugins (like OpenCL) require this call to enable USM
1864
- // For others, PI will turn this into a NOP.
1865
- Plugin.call <PiApiKind::piKernelSetExecInfo>(Kernel, PI_USM_INDIRECT_ACCESS,
1866
- sizeof (pi_bool), &PI_TRUE);
1867
-
1868
- // Remember this information before the range dimensions are reversed
1869
- const bool HasLocalSize = (NDRDesc.LocalSize [0 ] != 0 );
1870
-
1871
- ReverseRangeDimensionsForKernel (NDRDesc);
1872
-
1873
- pi_result Error = Plugin.call_nocheck <PiApiKind::piEnqueueKernelLaunch>(
1874
- MQueue->getHandleRef (), Kernel, NDRDesc.Dims , &NDRDesc.GlobalOffset [0 ],
1875
- &NDRDesc.GlobalSize [0 ], HasLocalSize ? &NDRDesc.LocalSize [0 ] : nullptr ,
1876
- RawEvents.size (), RawEvents.empty () ? nullptr : &RawEvents[0 ], &Event);
1897
+ pi_result Error = PI_SUCCESS;
1898
+ if (KernelMutex != nullptr ) {
1899
+ // For cacheable kernels, we use per-kernel mutex
1900
+ std::lock_guard<std::mutex> Lock (*KernelMutex);
1901
+ Error = SetKernelParamsAndLaunch (ExecKernel, Kernel, NDRDesc, RawEvents,
1902
+ Event);
1903
+ } else {
1904
+ Error = SetKernelParamsAndLaunch (ExecKernel, Kernel, NDRDesc, RawEvents,
1905
+ Event);
1906
+ }
1877
1907
1878
1908
if (PI_SUCCESS != Error) {
1879
1909
// If we have got non-success error code, let's analyze it to emit nice
0 commit comments