diff --git a/Src/AmrCore/AMReX_TagBox.cpp b/Src/AmrCore/AMReX_TagBox.cpp index f387a5f1c8e..c01073c43c9 100644 --- a/Src/AmrCore/AMReX_TagBox.cpp +++ b/Src/AmrCore/AMReX_TagBox.cpp @@ -439,9 +439,9 @@ TagBoxArray::local_collate_gpu (Vector& v) const Gpu::Device::gpuStream(), [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept { - int bid = h.item.get_group_linear_id(); - int tid = h.item.get_local_id(0); - int icell = h.item.get_global_id(0); + int bid = h.item->get_group_linear_id(); + int tid = h.item->get_local_id(0); + int icell = h.item->get_global_id(0); int t = 0; if (icell < ncells && tags[icell] != TagBox::CLEAR) { @@ -517,15 +517,15 @@ TagBoxArray::local_collate_gpu (Vector& v) const amrex::launch(nblocks[li], block_size, sizeof(unsigned int), Gpu::Device::gpuStream(), [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept { - int bid = h.item.get_group(0); - int tid = h.item.get_local_id(0); - int icell = h.item.get_global_id(0); + int bid = h.item->get_group(0); + int tid = h.item->get_local_id(0); + int icell = h.item->get_global_id(0); unsigned int* shared_counter = (unsigned int*)h.local; if (tid == 0) { *shared_counter = 0; } - h.item.barrier(sycl::access::fence_space::local_space); + h.item->barrier(sycl::access::fence_space::local_space); if (icell < ncells && tags[icell] != TagBox::CLEAR) { unsigned int itag = Gpu::Atomic::Inc diff --git a/Src/Base/AMReX.cpp b/Src/Base/AMReX.cpp index f3feae49837..36a73e7fc4f 100644 --- a/Src/Base/AMReX.cpp +++ b/Src/Base/AMReX.cpp @@ -586,6 +586,10 @@ amrex::Finalize () void amrex::Finalize (amrex::AMReX* pamrex) { +#ifdef AMREX_USE_GPU + Gpu::synchronize(); +#endif + AMReX::erase(pamrex); BL_TINY_PROFILE_FINALIZE(); diff --git a/Src/Base/AMReX_Box.H b/Src/Base/AMReX_Box.H index 9e9ec893689..f0ff7bed3b0 100644 --- a/Src/Base/AMReX_Box.H +++ b/Src/Base/AMReX_Box.H @@ -221,6 +221,20 @@ public: && p.z >= smallend[2] && p.z <= bigend[2]); } + //! Returns true if argument is contained within Box. + AMREX_GPU_HOST_DEVICE +#if (AMREX_SPACEDIM == 1) + bool contains (int i, int, int) const noexcept { +#elif (AMREX_SPACEDIM == 2) + bool contains (int i, int j, int) const noexcept { +#else + bool contains (int i, int j, int k) const noexcept { +#endif + return AMREX_D_TERM(i >= smallend[0] && i <= bigend[0], + && j >= smallend[1] && j <= bigend[1], + && k >= smallend[2] && k <= bigend[2]); + } + /** \brief Returns true if argument is contained within Box. * It is an error if the Boxes have different types. 
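A minimal usage sketch (not part of the patch) for the new integer-index `Box::contains` overload added above: it lets device code test membership without constructing an `IntVect`. The `flag_valid_cells` name, the MultiFab, and the 0/1 flag values are assumptions chosen only for illustration.

```cpp
#include <AMReX_MultiFab.H>
#include <AMReX_GpuLaunch.H>

void flag_valid_cells (amrex::MultiFab& mf)
{
    using namespace amrex;
    for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
        Box const& vbx = mfi.validbox();
        Box const& gbx = mfi.fabbox();            // valid region plus ghost cells
        Array4<Real> const& a = mf.array(mfi);
        ParallelFor(gbx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
        {
            // The new overload avoids building an IntVect inside device code.
            a(i,j,k) = vbx.contains(i,j,k) ? Real(1.0) : Real(0.0);
        });
    }
}
```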
*/ diff --git a/Src/Base/AMReX_Gpu.H b/Src/Base/AMReX_Gpu.H index c4db3d08a15..32a0b5a8461 100644 --- a/Src/Base/AMReX_Gpu.H +++ b/Src/Base/AMReX_Gpu.H @@ -22,6 +22,7 @@ namespace amrex { namespace Cuda {} } #include #include #include +#include #include #include #include diff --git a/Src/Base/AMReX_GpuAsyncArray.H b/Src/Base/AMReX_GpuAsyncArray.H index 262ae1a4ae5..46d8cd6a0e9 100644 --- a/Src/Base/AMReX_GpuAsyncArray.H +++ b/Src/Base/AMReX_GpuAsyncArray.H @@ -38,7 +38,7 @@ public: #ifdef AMREX_USE_GPU if (Gpu::inLaunchRegion()) { - d_data = static_cast(The_Device_Arena()->alloc(n*sizeof(T))); + d_data = static_cast(The_Arena()->alloc(n*sizeof(T))); Gpu::htod_memcpy_async(d_data, h_data, n*sizeof(T)); } #endif @@ -51,7 +51,7 @@ public: #ifdef AMREX_USE_GPU if (Gpu::inLaunchRegion()) { - d_data = static_cast(The_Device_Arena()->alloc(n*sizeof(T))); + d_data = static_cast(The_Arena()->alloc(n*sizeof(T))); } else #endif @@ -62,6 +62,11 @@ public: ~AsyncArray () { clear(); } + AsyncArray (AsyncArray const&) = delete; + AsyncArray (AsyncArray &&) = delete; + void operator= (AsyncArray const&) = delete; + void operator= (AsyncArray &&) = delete; + T const* data () const noexcept { return (d_data != nullptr) ? d_data : h_data; } T* data () noexcept { return (d_data != nullptr) ? d_data : h_data; } void clear () @@ -88,7 +93,7 @@ public: #else // xxxxx DPCPP todo Gpu::streamSynchronize(); - The_Device_Arena()->free(d_data); + The_Arena()->free(d_data); std::free(h_data); #endif } diff --git a/Src/Base/AMReX_GpuAsyncArray.cpp b/Src/Base/AMReX_GpuAsyncArray.cpp index 35355218b91..520de2f30b0 100644 --- a/Src/Base/AMReX_GpuAsyncArray.cpp +++ b/Src/Base/AMReX_GpuAsyncArray.cpp @@ -17,7 +17,7 @@ extern "C" { void* hp = pp[1]; std::free(hp); std::free(p); - amrex::The_Device_Arena()->free(dp); + amrex::The_Arena()->free(dp); } } #endif diff --git a/Src/Base/AMReX_GpuAtomic.H b/Src/Base/AMReX_GpuAtomic.H index 32305d58b6f..71d88994825 100644 --- a/Src/Base/AMReX_GpuAtomic.H +++ b/Src/Base/AMReX_GpuAtomic.H @@ -452,5 +452,50 @@ namespace HostDevice { namespace Atomic { }} +#ifdef AMREX_USE_GPU +// functors +namespace Gpu { + template + struct AtomicAdd + { + AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept { + Gpu::Atomic::AddNoRet(dest, source); + } + }; + + template + struct AtomicMin + { + AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept { + Gpu::Atomic::Min(dest, source); + } + }; + + template + struct AtomicMax + { + AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept { + Gpu::Atomic::Max(dest, source); + } + }; + + template + struct AtomicLogicalAnd + { + AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept { + Gpu::Atomic::LogicalAnd(dest, source); + } + }; + + template + struct AtomicLogicalOr + { + AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept { + Gpu::Atomic::LogicalOr(dest, source); + } + }; +} +#endif + } #endif diff --git a/Src/Base/AMReX_GpuBuffer.H b/Src/Base/AMReX_GpuBuffer.H new file mode 100644 index 00000000000..23b02c255ee --- /dev/null +++ b/Src/Base/AMReX_GpuBuffer.H @@ -0,0 +1,113 @@ +#ifndef AMREX_GPU_DEVICE_BUFFER_H_ +#define AMREX_GPU_DEVICE_BUFFER_H_ +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace amrex { +namespace Gpu { + +template ::type = 0> +class Buffer +{ +public: + + Buffer (std::initializer_list init) + : m_size(init.size()) + { + if (m_size == 0) return; +#ifdef AMREX_USE_GPU + h_data = 
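As context for the `AsyncArray` changes above (device memory now taken from `The_Arena`, copy and move deleted), a minimal usage sketch. The `scale_by_weights` name, the weight values, and the assumption that `d_out` is a device-accessible pointer are made up for the example; this is not code from the patch.

```cpp
#include <AMReX_GpuAsyncArray.H>
#include <AMReX_GpuLaunch.H>
#include <AMReX_Vector.H>

void scale_by_weights (amrex::Real* d_out, int n)
{
    using namespace amrex;
    Vector<Real> h_weights(n, 2.0);                      // host-side input
    Gpu::AsyncArray<Real> weights(h_weights.data(), n);  // async copy into The_Arena memory
    Real const* w = weights.data();                      // device pointer inside a launch region
    ParallelFor(n, [=] AMREX_GPU_DEVICE (int i) noexcept
    {
        d_out[i] *= w[i];
    });
    // Copy and move are deleted, so the AsyncArray object itself must stay in
    // scope here; its destructor releases the buffers after the stream work.
}
```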
static_cast(The_Pinned_Arena()->alloc(m_size*sizeof(T))); +#else + h_data = static_cast(std::malloc(m_size*sizeof(T))); +#endif + std::memcpy(h_data, init.begin(), m_size*sizeof(T)); +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) + { + d_data = static_cast(The_Arena()->alloc(m_size*sizeof(T))); + Gpu::htod_memcpy_async(d_data, h_data, m_size*sizeof(T)); +#ifdef AMREX_USE_DPCPP + if (Gpu::onNullStream()) Gpu::synchronize(); +#endif + } +#endif + } + + Buffer (T const* h_p, const std::size_t n) + : m_size(n) + { + if (m_size == 0) return; +#ifdef AMREX_USE_GPU + h_data = static_cast(The_Pinned_Arena()->alloc(m_size*sizeof(T))); +#else + h_data = static_cast(std::malloc(m_size*sizeof(T))); +#endif + std::memcpy(h_data, h_p, m_size*sizeof(T)); +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) + { + d_data = static_cast(The_Arena()->alloc(m_size*sizeof(T))); + Gpu::htod_memcpy_async(d_data, h_data, m_size*sizeof(T)); +#ifdef AMREX_USE_DPCPP + if (Gpu::onNullStream()) Gpu::synchronize(); +#endif + } +#endif + } + + ~Buffer () { clear(); } + + Buffer (Buffer const&) = delete; + Buffer (Buffer &&) = delete; + void operator= (Buffer const&) = delete; + void operator= (Buffer &&) = delete; + + T const* data () const noexcept { return (d_data != nullptr) ? d_data : h_data; } + T* data () noexcept { return (d_data != nullptr) ? d_data : h_data; } + + T const* hostData () const noexcept { return h_data; } + T* hostDatat () noexcept { return h_data; } + + std::size_t size () const noexcept { return m_size; } + + void clear () + { +#ifdef AMREX_USE_GPU + if (d_data) The_Arena()->free(d_data); + if (h_data) The_Pinned_Arena()->free(h_data); +#else + std::free(h_data); +#endif + d_data = nullptr; + h_data = nullptr; + } + + T* copyToHost () + { +#ifdef AMREX_USE_GPU + if (d_data) + { + Gpu::dtoh_memcpy_async(h_data, d_data, m_size*sizeof(T)); + Gpu::streamSynchronize(); + } +#endif + return h_data; + } + +private: + std::size_t m_size; + T* d_data = nullptr; + T* h_data = nullptr; +}; + +} +} + +#endif diff --git a/Src/Base/AMReX_GpuFuse.H b/Src/Base/AMReX_GpuFuse.H index effad729364..80007f849ae 100644 --- a/Src/Base/AMReX_GpuFuse.H +++ b/Src/Base/AMReX_GpuFuse.H @@ -77,15 +77,69 @@ PutLambda (Lambda const& f, char* buf) return Arena::align(sizeof(Lambda)); } +template +AMREX_GPU_DEVICE +auto GetLambda1DLauncherFnPtr (Lambda const& f, Lambda1DLauncher& launcher) + -> decltype(f(0)) +{ + amrex::ignore_unused(f); + launcher = &LaunchLambda1D; +} + +template +AMREX_GPU_DEVICE +auto GetLambda1DLauncherFnPtr (Lambda const& f, Lambda1DLauncher& launcher) + -> decltype(f(0,Gpu::Handler{})) +{ + amrex::ignore_unused(f); + launcher = nullptr; +} + +template +AMREX_GPU_DEVICE +auto GetLambda3DLauncherFnPtr (Lambda const& f, Lambda3DLauncher& launcher) + -> decltype(f(0,0,0)) +{ + amrex::ignore_unused(f); + launcher = &LaunchLambda3D; +} + +template +AMREX_GPU_DEVICE +auto GetLambda3DLauncherFnPtr (Lambda const& f, Lambda3DLauncher& launcher) + -> decltype(f(0,0,0,Gpu::Handler{})) +{ + amrex::ignore_unused(f); + launcher = nullptr; +} + +template +AMREX_GPU_DEVICE +auto GetLambda4DLauncherFnPtr (Lambda const& f, Lambda4DLauncher& launcher) + -> decltype(f(0,0,0,0)) +{ + amrex::ignore_unused(f); + launcher = &LaunchLambda4D; +} + +template +AMREX_GPU_DEVICE +auto GetLambda4DLauncherFnPtr (Lambda const& f, Lambda4DLauncher& launcher) + -> decltype(f(0,0,0,0,Gpu::Handler{})) +{ + amrex::ignore_unused(f); + launcher = nullptr; +} + template void -PutLambda1DLauncher (FuseHelper* helper) +PutLambda1DLauncher 
(FuseHelper* helper, Lambda const& f) { static Lambda1DLauncher fp = nullptr; if (fp == nullptr) { launch_global<<<1,1>>>([=] AMREX_GPU_DEVICE () { - helper->m_fp.L1D = &LaunchLambda1D; + GetLambda1DLauncherFnPtr(f, helper->m_fp.L1D); }); Gpu::synchronize(); fp = helper->m_fp.L1D; @@ -96,13 +150,13 @@ PutLambda1DLauncher (FuseHelper* helper) template void -PutLambda3DLauncher (FuseHelper* helper) +PutLambda3DLauncher (FuseHelper* helper, Lambda const& f) { static Lambda3DLauncher fp = nullptr; if (fp == nullptr) { launch_global<<<1,1>>>([=] AMREX_GPU_DEVICE () { - helper->m_fp.L3D = &LaunchLambda3D; + GetLambda3DLauncherFnPtr(f, helper->m_fp.L3D); }); Gpu::synchronize(); fp = helper->m_fp.L3D; @@ -113,13 +167,13 @@ PutLambda3DLauncher (FuseHelper* helper) template void -PutLambda4DLauncher (FuseHelper* helper) +PutLambda4DLauncher (FuseHelper* helper, Lambda const& f) { static Lambda4DLauncher fp = nullptr; if (fp == nullptr) { launch_global<<<1,1>>>([=] AMREX_GPU_DEVICE () { - helper->m_fp.L4D = &LaunchLambda4D; + GetLambda4DLauncherFnPtr(f, helper->m_fp.L4D); }); Gpu::synchronize(); fp = helper->m_fp.L4D; @@ -152,7 +206,7 @@ public: if (bx.isEmpty()) return; using Lambda = typename std::decay::type; Register_doit(bx, 0, f); - PutLambda3DLauncher(m_helper_buf+(m_nlambdas-1)); + PutLambda3DLauncher(m_helper_buf+(m_nlambdas-1), f); } template @@ -163,7 +217,7 @@ public: if (bx.isEmpty()) return; using Lambda = typename std::decay::type; Register_doit(bx, ncomp, f); - PutLambda4DLauncher(m_helper_buf+(m_nlambdas-1)); + PutLambda4DLauncher(m_helper_buf+(m_nlambdas-1), f); } template ::value> > @@ -176,7 +230,7 @@ public: if (N <= 0) return; using Lambda = typename std::decay::type; Register_doit(Box(), N, f); - PutLambda1DLauncher(m_helper_buf+(m_nlambdas-1)); + PutLambda1DLauncher(m_helper_buf+(m_nlambdas-1), f); } void Launch (); diff --git a/Src/Base/AMReX_GpuKernelInfo.H b/Src/Base/AMReX_GpuKernelInfo.H index 72076bbb64f..dc7e452ee11 100644 --- a/Src/Base/AMReX_GpuKernelInfo.H +++ b/Src/Base/AMReX_GpuKernelInfo.H @@ -9,9 +9,12 @@ class KernelInfo { public: KernelInfo& setFusible (bool flag) { fusible = flag; return *this; } + KernelInfo& setReduction (bool flag) { has_reduction = flag; return *this; } bool isFusible () const { return fusible; } + bool hasReduction () const { return has_reduction; } private: bool fusible = false; + bool has_reduction = false; }; }} diff --git a/Src/Base/AMReX_GpuLaunchFunctsC.H b/Src/Base/AMReX_GpuLaunchFunctsC.H index 6eee4c94800..6799eb9d133 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsC.H +++ b/Src/Base/AMReX_GpuLaunchFunctsC.H @@ -4,6 +4,56 @@ namespace amrex { +namespace detail { + template + AMREX_GPU_DEVICE + auto call_f (F const& f, N i) + noexcept -> decltype(f(0)) + { + f(i); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, N i) + noexcept -> decltype(f(0,Gpu::Handler{})) + { + f(i,Gpu::Handler{}); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, int i, int j, int k) + noexcept -> decltype(f(0,0,0)) + { + f(i,j,k); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, int i, int j, int k) + noexcept -> decltype(f(0,0,0,Gpu::Handler{})) + { + f(i,j,k,Gpu::Handler{}); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, int i, int j, int k, T n) + noexcept -> decltype(f(0,0,0,0)) + { + f(i,j,k,n); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, int i, int j, int k, T n) + noexcept -> decltype(f(0,0,0,0,Gpu::Handler{})) + { + f(i,j,k,n,Gpu::Handler{}); + } +} + template void launch (T 
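The `GetLambda*LauncherFnPtr` helpers above and the `detail::call_f` overloads that follow both rely on expression SFINAE over the lambda's call signature: the overload whose trailing return type is well-formed is the one that participates. A reduced, standalone illustration of that dispatch (the `call` function and the toy `Handler` are assumptions, not AMReX code):

```cpp
#include <cstdio>

struct Handler {};   // stand-in for amrex::Gpu::Handler

// Selected when f is callable as f(int): the plain form.
template <typename F>
auto call (F const& f, int i) -> decltype(f(0))
{
    return f(i);
}

// Selected when f expects the extra Handler argument.
template <typename F>
auto call (F const& f, int i) -> decltype(f(0, Handler{}))
{
    return f(i, Handler{});
}

int main ()
{
    call([] (int i) { std::printf("plain %d\n", i); }, 1);
    call([] (int i, Handler const&) { std::printf("with handler %d\n", i); }, 2);
    return 0;
}
```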
const& n, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept { @@ -14,7 +64,7 @@ template item) AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) { - f(Gpu::Handler{item,shared_data.get_pointer()}); + f(Gpu::Handler{&item,shared_data.get_pointer()}); }); }); } catch (sycl::exception const& ex) { @@ -94,8 +94,58 @@ void launch (T const& n, L&& f) noexcept } } +namespace detail { + template + AMREX_GPU_DEVICE + auto call_f (F const& f, N i, Gpu::Handler const&) + noexcept -> decltype(f(0)) + { + f(i); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, N i, Gpu::Handler const& handler) + noexcept -> decltype(f(0,Gpu::Handler{})) + { + f(i,handler); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, int i, int j, int k, Gpu::Handler const&) + noexcept -> decltype(f(0,0,0)) + { + f(i,j,k); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, int i, int j, int k, Gpu::Handler const& handler) + noexcept -> decltype(f(0,0,0,Gpu::Handler{})) + { + f(i,j,k,handler); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, int i, int j, int k, T ncomp, Gpu::Handler const&) + noexcept -> decltype(f(0,0,0,0)) + { + for (T n = 0; n < ncomp; ++n) f(i,j,k,n); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, int i, int j, int k, T ncomp, Gpu::Handler const& handler) + noexcept -> decltype(f(0,0,0,0,Gpu::Handler{})) + { + for (T n = 0; n < ncomp; ++n) f(i,j,k,n,handler); + } +} + template ::value> > -void ParallelFor (Gpu::KernelInfo const& /*info*/, T n, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept { if (amrex::isEmpty(n)) return; const auto ec = Gpu::ExecutionConfig(n); @@ -105,25 +155,46 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, T n, L&& f) noexcept int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { - q.submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total), - sycl::range<1>(nthreads_per_block)), - [=] (sycl::nd_item<1> item) - AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) - { - for (T i = item.get_global_id(0), stride = item.get_global_range(0); - i < n; i += stride) { - f(i); - } + if (info.hasReduction()) { + q.submit([&] (sycl::handler& h) { + sycl::accessor + shared_data(sycl::range<1>(Gpu::Device::warp_size), h); + h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total), + sycl::range<1>(nthreads_per_block)), + [=] (sycl::nd_item<1> item) + AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) + { + for (T i = item.get_global_id(0), stride = item.get_global_range(0); + i < n; i += stride) { + int n_active_threads = amrex::min(n-i+(T)item.get_local_id(0), + (T)item.get_local_range(0)); + detail::call_f(f, i, Gpu::Handler{&item, shared_data.get_pointer(), + n_active_threads}); + } + }); }); - }); + } else { + q.submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total), + sycl::range<1>(nthreads_per_block)), + [=] (sycl::nd_item<1> item) + AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) + { + for (T i = item.get_global_id(0), stride = item.get_global_range(0); + i < n; i += stride) { + detail::call_f(f, i, Gpu::Handler{&item}); + } + }); + }); + } } catch (sycl::exception const& ex) { amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!"); } } template -void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept { if (amrex::isEmpty(box)) 
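In the 1D reduction branch above, the `n_active_threads` handed to `Gpu::Handler` works out to the number of still-unprocessed iterations seen by the block, clamped to the block size. A small host-side check of that arithmetic; the helper function and the sample numbers are illustrative only.

```cpp
#include <algorithm>
#include <cassert>

int n_active_threads (int n, int i, int tid, int block_size)
{
    // i is the thread's current global loop index, tid its index within the block.
    return std::min(n - i + tid, block_size);
}

int main ()
{
    // 1000 work items, 256-thread blocks; the last block starts at global index 768,
    // so its thread 0 computes 1000 - 768 + 0 = 232 active threads.
    assert(n_active_threads(1000, 768, 0, 256) == 232);
    // Thread 37 of the block starting at 512 (i = 549) still sees a fully covered block.
    assert(n_active_threads(1000, 549, 37, 256) == 256);
    return 0;
}
```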
return; int ncells = box.numPts(); @@ -136,31 +207,58 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box, L&& f) noexce int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { - q.submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total), - sycl::range<1>(nthreads_per_block)), - [=] (sycl::nd_item<1> item) - AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) - { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); - icell < ncells; icell += stride) { - int k = icell / (len.x*len.y); - int j = (icell - k*(len.x*len.y)) / len.x; - int i = (icell - k*(len.x*len.y)) - j*len.x; - i += lo.x; - j += lo.y; - k += lo.z; - f(i,j,k); - } + if (info.hasReduction()) { + q.submit([&] (sycl::handler& h) { + sycl::accessor + shared_data(sycl::range<1>(Gpu::Device::warp_size), h); + h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total), + sycl::range<1>(nthreads_per_block)), + [=] (sycl::nd_item<1> item) + AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) + { + for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + icell < ncells; icell += stride) { + int k = icell / (len.x*len.y); + int j = (icell - k*(len.x*len.y)) / len.x; + int i = (icell - k*(len.x*len.y)) - j*len.x; + i += lo.x; + j += lo.y; + k += lo.z; + int n_active_threads = amrex::min(ncells-i+(int)item.get_local_id(0), + (int)item.get_local_range(0)); + detail::call_f(f, i, j, k, Gpu::Handler{&item, shared_data.get_pointer(), + n_active_threads}); + } + }); }); - }); + } else { + q.submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total), + sycl::range<1>(nthreads_per_block)), + [=] (sycl::nd_item<1> item) + AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) + { + for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + icell < ncells; icell += stride) { + int k = icell / (len.x*len.y); + int j = (icell - k*(len.x*len.y)) / len.x; + int i = (icell - k*(len.x*len.y)) - j*len.x; + i += lo.x; + j += lo.y; + k += lo.z; + detail::call_f(f,i,j,k,Gpu::Handler{&item}); + } + }); + }); + } } catch (sycl::exception const& ex) { amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!"); } } template ::value> > -void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box, T ncomp, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept { if (amrex::isEmpty(box)) return; int ncells = box.numPts(); @@ -173,26 +271,52 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box, T ncomp, L&& int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { - q.submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total), - sycl::range<1>(nthreads_per_block)), - [=] (sycl::nd_item<1> item) - AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) - { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); - icell < ncells; icell += stride) { - int k = icell / (len.x*len.y); - int j = (icell - k*(len.x*len.y)) / len.x; - int i = (icell - k*(len.x*len.y)) - j*len.x; - i += lo.x; - j += lo.y; - k += lo.z; - for (T n = 0; n < ncomp; ++n) { - f(i,j,k,n); + if (info.hasReduction()) { + q.submit([&] (sycl::handler& h) { + sycl::accessor + shared_data(sycl::range<1>(Gpu::Device::warp_size), h); + h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total), + 
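For reference, the `icell` to `(i,j,k)` decode repeated in these kernel bodies can be checked on the host. The `decode` helper, the `I3` struct, and the sample box are assumptions introduced only for this check.

```cpp
#include <cassert>

struct I3 { int x, y, z; };

I3 decode (int icell, I3 lo, I3 len)
{
    int k = icell / (len.x * len.y);
    int j = (icell - k*(len.x*len.y)) / len.x;
    int i = (icell - k*(len.x*len.y)) - j*len.x;
    return {i + lo.x, j + lo.y, k + lo.z};
}

int main ()
{
    I3 lo{4, 8, 16}, len{3, 5, 7};            // a 3x5x7 box anchored at (4,8,16)
    I3 c = decode(0, lo, len);                // first cell is the lower corner
    assert(c.x == 4 && c.y == 8 && c.z == 16);
    c = decode(3*5*7 - 1, lo, len);           // last cell is the upper corner
    assert(c.x == 6 && c.y == 12 && c.z == 22);
    return 0;
}
```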
sycl::range<1>(nthreads_per_block)), + [=] (sycl::nd_item<1> item) + AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) + { + for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + icell < ncells; icell += stride) { + int k = icell / (len.x*len.y); + int j = (icell - k*(len.x*len.y)) / len.x; + int i = (icell - k*(len.x*len.y)) - j*len.x; + i += lo.x; + j += lo.y; + k += lo.z; + int n_active_threads = amrex::min(ncells-i+(int)item.get_local_id(0), + (int)item.get_local_range(0)); + detail::call_f(f, i, j, k, ncomp, + Gpu::Handler{&item, shared_data.get_pointer(), + n_active_threads}); } - } + }); }); - }); + } else { + q.submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total), + sycl::range<1>(nthreads_per_block)), + [=] (sycl::nd_item<1> item) + AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) + { + for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + icell < ncells; icell += stride) { + int k = icell / (len.x*len.y); + int j = (icell - k*(len.x*len.y)) / len.x; + int i = (icell - k*(len.x*len.y)) - j*len.x; + i += lo.x; + j += lo.y; + k += lo.z; + detail::call_f(f,i,j,k,ncomp,Gpu::Handler{&item}); + } + }); + }); + } } catch (sycl::exception const& ex) { amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!"); } @@ -685,7 +809,7 @@ void VecReduce (N n, T const& init_val, L1&& f1, L2&& f2) noexcept i < n; i += stride) { f1(i,&r); } - f2(r,Gpu::Handler{item,shared_data.get_pointer()}); + f2(r,Gpu::Handler{&item,shared_data.get_pointer()}); }); }); } catch (sycl::exception const& ex) { @@ -733,24 +857,75 @@ void launch (T const& n, L&& f) noexcept AMREX_GPU_ERROR_CHECK(); } +namespace detail { + template + AMREX_GPU_DEVICE + auto call_f (F const& f, N i, N /*nleft*/) + noexcept -> decltype(f(0)) + { + f(i); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, N i, N nleft) + noexcept -> decltype(f(0,Gpu::Handler{})) + { + f(i,Gpu::Handler(amrex::min(nleft,(N)blockDim.x))); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, int i, int j, int k, int /*nleft*/) + noexcept -> decltype(f(0,0,0)) + { + f(i,j,k); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, int i, int j, int k, int nleft) + noexcept -> decltype(f(0,0,0,Gpu::Handler{})) + { + f(i,j,k,Gpu::Handler(amrex::min(nleft,(int)blockDim.x))); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, int i, int j, int k, T ncomp, int /*nleft*/) + noexcept -> decltype(f(0,0,0,0)) + { + for (T n = 0; n < ncomp; ++n) f(i,j,k,n); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, int i, int j, int k, T ncomp, int nleft) + noexcept -> decltype(f(0,0,0,0,Gpu::Handler{})) + { + for (T n = 0; n < ncomp; ++n) f(i,j,k,n,Gpu::Handler(amrex::min(nleft,(int)blockDim.x))); + } +} + template ::value> > amrex::EnableIf_t::value> ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept { if (amrex::isEmpty(n)) return; #ifdef AMREX_USE_CUDA - if (Gpu::inFuseRegion() && info.isFusible() && n <= Gpu::getFuseSizeThreshold()) { + if (!info.hasReduction() && Gpu::inFuseRegion() && info.isFusible() && n <= Gpu::getFuseSizeThreshold()) { Gpu::Register(n, f); } else #endif { amrex::ignore_unused(info); const auto ec = Gpu::ExecutionConfig(n); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { for (T i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; i < n; i += stride) { - f(i); + detail::call_f(f, i, (n-i+(T)threadIdx.x)); } }); 
AMREX_GPU_ERROR_CHECK(); @@ -764,7 +939,7 @@ ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept if (amrex::isEmpty(box)) return; int ncells = box.numPts(); #ifdef AMREX_USE_CUDA - if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) { + if (!info.hasReduction() && Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) { Gpu::Register(box, f); } else #endif @@ -773,17 +948,19 @@ ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept const auto lo = amrex::lbound(box); const auto len = amrex::length(box); const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) { + icell < ncells; icell += stride) + { int k = icell / (len.x*len.y); int j = (icell - k*(len.x*len.y)) / len.x; int i = (icell - k*(len.x*len.y)) - j*len.x; i += lo.x; j += lo.y; k += lo.z; - f(i,j,k); + detail::call_f(f, i, j, k, (ncells-icell+(int)threadIdx.x)); } }); AMREX_GPU_ERROR_CHECK(); @@ -797,7 +974,7 @@ ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexce if (amrex::isEmpty(box)) return; int ncells = box.numPts(); #ifdef AMREX_USE_CUDA - if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) { + if (!info.hasReduction() && Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) { Gpu::Register(box, ncomp, f); } else #endif @@ -806,6 +983,7 @@ ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexce const auto lo = amrex::lbound(box); const auto len = amrex::length(box); const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; @@ -816,9 +994,7 @@ ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexce i += lo.x; j += lo.y; k += lo.z; - for (T n = 0; n < ncomp; ++n) { - f(i,j,k,n); - } + detail::call_f(f, i, j, k, ncomp, (ncells-icell+(int)threadIdx.x)); } }); AMREX_GPU_ERROR_CHECK(); diff --git a/Src/Base/AMReX_GpuReduce.H b/Src/Base/AMReX_GpuReduce.H index 80a8052df8a..9a80512cafa 100644 --- a/Src/Base/AMReX_GpuReduce.H +++ b/Src/Base/AMReX_GpuReduce.H @@ -9,6 +9,29 @@ #include #include +// +// Public interface +// +namespace amrex { namespace Gpu { + template + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + void deviceReduceSum (T * dest, T source, Gpu::Handler const& h) noexcept; + + template + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + void deviceReduceMin (T * dest, T source, Gpu::Handler const& h) noexcept; + + template + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + void deviceReduceMax (T * dest, T source, Gpu::Handler const& h) noexcept; + + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& h) noexcept; + + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& h) noexcept; +}} + // // Reduce functions based on _shfl_down_sync // @@ -36,8 +59,8 @@ AMREX_GPU_DEVICE AMREX_FORCE_INLINE T blockReduce (T x, WARPREDUCE && warp_reduce, T x0, Gpu::Handler const& h) { T* shared = (T*)h.local; - int tid = h.item.get_local_id(0); - sycl::ONEAPI::sub_group const& sg = h.item.get_sub_group(); + int tid = 
h.item->get_local_id(0); + sycl::ONEAPI::sub_group const& sg = h.item->get_sub_group(); int lane = sg.get_local_id()[0]; int wid = sg.get_group_id()[0]; int numwarps = sg.get_group_range()[0]; @@ -46,56 +69,134 @@ T blockReduce (T x, WARPREDUCE && warp_reduce, T x0, Gpu::Handler const& h) // if this reduction call is occurring multiple times in a kernel, // and since we don't know how many times the user is calling it, // we do it always to be safe. - h.item.barrier(sycl::access::fence_space::local_space); + h.item->barrier(sycl::access::fence_space::local_space); if (lane == 0) shared[wid] = x; - h.item.barrier(sycl::access::fence_space::local_space); + h.item->barrier(sycl::access::fence_space::local_space); bool b = (tid == 0) || (tid < numwarps); x = b ? shared[lane] : x0; if (wid == 0) x = warp_reduce(x, sg); return x; } +template +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op, + T x0, Gpu::Handler const& handler) +{ + sycl::ONEAPI::sub_group const& sg = handler.item->get_sub_group(); + int wid = sg.get_group_id()[0]; + if ((wid+1)*warpSize <= handler.numActiveThreads) { + x = warp_reduce(x, sg); // full warp + if (sg.get_local_id()[0] == 0) atomic_op(dest, x); + } else { + atomic_op(dest, x); + } +} + template AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void deviceReduceSum (T * dest, T source, Gpu::Handler const& h) noexcept +void deviceReduceSum_full (T * dest, T source, Gpu::Handler const& h) noexcept { source = Gpu::blockReduce (source, Gpu::warpReduce >(), (T)0, h); - if (h.item.get_local_id(0) == 0) Gpu::Atomic::AddNoRet(dest, source); + if (h.item->get_local_id(0) == 0) Gpu::Atomic::AddNoRet(dest, source); } template AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void deviceReduceMin (T * dest, T source, Gpu::Handler const& h) noexcept +void deviceReduceMin_full (T * dest, T source, Gpu::Handler const& h) noexcept { source = Gpu::blockReduce (source, Gpu::warpReduce >(), source, h); - if (h.item.get_local_id(0) == 0) Gpu::Atomic::Min(dest, source); + if (h.item->get_local_id(0) == 0) Gpu::Atomic::Min(dest, source); } template AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void deviceReduceMax (T * dest, T source, Gpu::Handler const& h) noexcept +void deviceReduceMax_full (T * dest, T source, Gpu::Handler const& h) noexcept { source = Gpu::blockReduce (source, Gpu::warpReduce >(), source, h); - if (h.item.get_local_id(0) == 0) Gpu::Atomic::Max(dest, source); + if (h.item->get_local_id(0) == 0) Gpu::Atomic::Max(dest, source); } AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& h) noexcept +void deviceReduceLogicalAnd_full (int * dest, int source, Gpu::Handler const& h) noexcept { source = Gpu::blockReduce (source, Gpu::warpReduce >(), 1, h); - if (h.item.get_local_id(0) == 0) Gpu::Atomic::LogicalAnd(dest, source); + if (h.item->get_local_id(0) == 0) Gpu::Atomic::LogicalAnd(dest, source); } AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& h) noexcept +void deviceReduceLogicalOr_full (int * dest, int source, Gpu::Handler const& h) noexcept { source = Gpu::blockReduce (source, Gpu::warpReduce >(), 0, h); - if (h.item.get_local_id(0) == 0) Gpu::Atomic::LogicalOr(dest, source); + if (h.item->get_local_id(0) == 0) Gpu::Atomic::LogicalOr(dest, source); +} + +template +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void deviceReduceSum (T * dest, T source, Gpu::Handler const& h) noexcept +{ + if (h.numActiveThreads >= 
int(h.item->get_local_range(0)) || h.numActiveThreads <= 0) { + deviceReduceSum_full(dest, source, h); + } else { + Gpu::blockReduce_partial + (dest, source, Gpu::warpReduce >(), + Gpu::AtomicAdd(), (T)0, h); + } +} + +template +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void deviceReduceMin (T * dest, T source, Gpu::Handler const& h) noexcept +{ + if (h.numActiveThreads >= int(h.item->get_local_range(0)) || h.numActiveThreads <= 0) { + deviceReduceMin_full(dest, source, h); + } else { + Gpu::blockReduce_partial + (dest, source, Gpu::warpReduce >(), + Gpu::AtomicMin(), source, h); + } +} + +template +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void deviceReduceMax (T * dest, T source, Gpu::Handler const& h) noexcept +{ + if (h.numActiveThreads >= int(h.item->get_local_range(0)) || h.numActiveThreads <= 0) { + deviceReduceMax_full(dest, source, h); + } else { + Gpu::blockReduce_partial + (dest, source, Gpu::warpReduce >(), + Gpu::AtomicMax(), source, h); + } +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& h) noexcept +{ + if (h.numActiveThreads >= int(h.item->get_local_range(0)) || h.numActiveThreads <= 0) { + deviceReduceLogicalAnd_full(dest, source, h); + } else { + Gpu::blockReduce_partial + (dest, source, Gpu::warpReduce >(), + Gpu::AtomicLogicalAnd(), 1, h); + } +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& h) noexcept +{ + if (h.numActiveThreads >= int(h.item->get_local_range(0)) || h.numActiveThreads <= 0) { + deviceReduceLogicalOr_full(dest, source, h); + } else { + Gpu::blockReduce_partial + (dest, source, Gpu::warpReduce >(), + Gpu::AtomicLogicalOr(), 0, h); + } } #elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) @@ -136,9 +237,23 @@ T blockReduce (T x, WARPREDUCE && warp_reduce, T x0) return x; } +template +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op, + T x0, Gpu::Handler const& handler) +{ + int warp = (int)threadIdx.x / warpSize; + if ((warp+1)*warpSize <= handler.numActiveThreads) { + x = warp_reduce(x); // full warp + if (threadIdx.x % warpSize == 0) atomic_op(dest, x); + } else { + atomic_op(dest,x); + } +} + template AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void deviceReduceSum (T * dest, T source) noexcept +void deviceReduceSum_full (T * dest, T source) noexcept { source = Gpu::blockReduce (source, Gpu::warpReduce >(), (T)0); @@ -147,7 +262,7 @@ void deviceReduceSum (T * dest, T source) noexcept template AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void deviceReduceMin (T * dest, T source) noexcept +void deviceReduceMin_full (T * dest, T source) noexcept { source = Gpu::blockReduce (source, Gpu::warpReduce >(), source); @@ -156,7 +271,7 @@ void deviceReduceMin (T * dest, T source) noexcept template AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void deviceReduceMax (T * dest, T source) noexcept +void deviceReduceMax_full (T * dest, T source) noexcept { source = Gpu::blockReduce (source, Gpu::warpReduce >(), source); @@ -164,7 +279,7 @@ void deviceReduceMax (T * dest, T source) noexcept } AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void deviceReduceLogicalAnd (int * dest, int source) noexcept +void deviceReduceLogicalAnd_full (int * dest, int source) noexcept { source = Gpu::blockReduce (source, Gpu::warpReduce >(), 1); @@ -172,18 +287,81 @@ void deviceReduceLogicalAnd (int * dest, int source) noexcept } AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void deviceReduceLogicalOr (int * dest, int source) 
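A CUDA-style device sketch of the per-warp decision that `blockReduce_partial` makes above: warps fully covered by `numActiveThreads` reduce with shuffles and issue one atomic per warp, while the trailing, partially active warp falls back to one atomic per thread, because shuffles would read lanes that never entered the loop body. The kernel, the data, and the `block_reduce_partial_sum` name are assumptions for illustration; `atomicAdd` on `double` assumes a compute capability 6.0+ device.

```cpp
#include <cstdio>
#include <cuda_runtime.h>

__device__ void block_reduce_partial_sum (double* dest, double x, int num_active)
{
    int warp = threadIdx.x / warpSize;
    if ((warp + 1) * warpSize <= num_active) {
        // Every lane of this warp is active: a full shuffle reduction is safe,
        // and only lane 0 touches global memory.
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            x += __shfl_down_sync(0xffffffffu, x, offset);
        }
        if (threadIdx.x % warpSize == 0) { atomicAdd(dest, x); }
    } else {
        // Partially active warp: each active thread updates the result directly.
        atomicAdd(dest, x);
    }
}

__global__ void sum_kernel (double const* v, double* dest, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        // Active threads in this block: remaining work, capped at the block size.
        int num_active = min(n - (int)(blockIdx.x * blockDim.x), (int)blockDim.x);
        block_reduce_partial_sum(dest, v[i], num_active);
    }
}

int main ()
{
    int const n = 1000;                       // deliberately not a multiple of 256
    double *v, *dest;
    cudaMallocManaged(&v, n*sizeof(double));
    cudaMallocManaged(&dest, sizeof(double));
    for (int i = 0; i < n; ++i) { v[i] = 1.0; }
    *dest = 0.0;
    sum_kernel<<<(n+255)/256, 256>>>(v, dest, n);
    cudaDeviceSynchronize();
    std::printf("sum = %g (expect %d)\n", *dest, n);
    cudaFree(v); cudaFree(dest);
    return 0;
}
```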
noexcept +void deviceReduceLogicalOr_full (int * dest, int source) noexcept { source = Gpu::blockReduce (source, Gpu::warpReduce >(), 0); if (threadIdx.x == 0) Gpu::Atomic::LogicalOr(dest, source); } +template +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void deviceReduceSum (T * dest, T source, Gpu::Handler const& handler) noexcept +{ + if (handler.numActiveThreads >= (int)blockDim.x || handler.numActiveThreads <= 0) { + deviceReduceSum_full(dest, source); + } else { + Gpu::blockReduce_partial + (dest, source, Gpu::warpReduce >(), + Gpu::AtomicAdd(), (T)0, handler); + } +} + +template +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void deviceReduceMin (T * dest, T source, Gpu::Handler const& handler) noexcept +{ + if (handler.numActiveThreads >= (int)blockDim.x || handler.numActiveThreads <= 0) { + deviceReduceMin_full(dest, source); + } else { + Gpu::blockReduce_partial + (dest, source, Gpu::warpReduce >(), + Gpu::AtomicMin(), source, handler); + } +} + +template +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void deviceReduceMax (T * dest, T source, Gpu::Handler const& handler) noexcept +{ + if (handler.numActiveThreads >= (int)blockDim.x || handler.numActiveThreads <= 0) { + deviceReduceMax_full(dest, source); + } else { + Gpu::blockReduce_partial + (dest, source, Gpu::warpReduce >(), + Gpu::AtomicMax(), source, handler); + } +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& handler) noexcept +{ + if (handler.numActiveThreads >= (int)blockDim.x || handler.numActiveThreads <= 0) { + deviceReduceLogicalAnd_full(dest, source); + } else { + Gpu::blockReduce_partial + (dest, source, Gpu::warpReduce >(), + Gpu::AtomicLogicalAnd(), 1, handler); + } +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& handler) noexcept +{ + if (handler.numActiveThreads >= (int)blockDim.x || handler.numActiveThreads <= 0) { + deviceReduceLogicalOr_full(dest, source); + } else { + Gpu::blockReduce_partial + (dest, source, Gpu::warpReduce >(), + Gpu::AtomicLogicalOr(), 0, handler); + } +} + #else template AMREX_FORCE_INLINE -void deviceReduceSum (T * dest, T source) noexcept +void deviceReduceSum_full (T * dest, T source) noexcept { #ifdef _OPENMP #pragma omp atomic @@ -193,7 +371,14 @@ void deviceReduceSum (T * dest, T source) noexcept template AMREX_FORCE_INLINE -void deviceReduceMin (T * dest, T source) noexcept +void deviceReduceSum (T * dest, T source, Gpu::Handler const&) noexcept +{ + deviceReduceSum_full(dest, source); +} + +template +AMREX_FORCE_INLINE +void deviceReduceMin_full (T * dest, T source) noexcept { #ifdef _OPENMP #pragma omp critical (gpureduce_reducemin) @@ -203,7 +388,14 @@ void deviceReduceMin (T * dest, T source) noexcept template AMREX_FORCE_INLINE -void deviceReduceMax (T * dest, T source) noexcept +void deviceReduceMin (T * dest, T source, Gpu::Handler const&) noexcept +{ + deviceReduceMin_full(dest, source); +} + +template +AMREX_FORCE_INLINE +void deviceReduceMax_full (T * dest, T source) noexcept { #ifdef _OPENMP #pragma omp critical (gpureduce_reducemax) @@ -211,8 +403,15 @@ void deviceReduceMax (T * dest, T source) noexcept *dest = std::max(*dest, source); } +template +AMREX_FORCE_INLINE +void deviceReduceMax (T * dest, T source, Gpu::Handler const&) noexcept +{ + deviceReduceMax_full(dest, source); +} + AMREX_FORCE_INLINE -void deviceReduceLogicalAnd (int * dest, int source) noexcept +void deviceReduceLogicalAnd_full (int * dest, int source) noexcept { #ifdef _OPENMP #pragma omp 
critical (gpureduce_reduceand) @@ -221,7 +420,13 @@ void deviceReduceLogicalAnd (int * dest, int source) noexcept } AMREX_FORCE_INLINE -void deviceReduceLogicalOr (int * dest, int source) noexcept +void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const&) noexcept +{ + deviceReduceLogicalAnd_full(dest, source); +} + +AMREX_FORCE_INLINE +void deviceReduceLogicalOr_full (int * dest, int source) noexcept { #ifdef _OPENMP #pragma omp critical (gpureduce_reduceor) @@ -229,6 +434,12 @@ void deviceReduceLogicalOr (int * dest, int source) noexcept *dest = (*dest) || source; } +AMREX_FORCE_INLINE +void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const&) noexcept +{ + deviceReduceLogicalOr_full(dest, source); +} + #endif }} diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index 42b6fef73a1..fa660545738 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -26,18 +26,41 @@ struct gpuStream_t { bool operator== (gpuStream_t const& rhs) noexcept { return queue == rhs.queue; } }; -namespace Gpu { -struct Handler { - Handler (sycl::nd_item<1> const& a_item, void* a_local = nullptr) - : item(a_item), local(a_local) {} - sycl::nd_item<1> const& item; - void* local; // DPC++ local memory -}; +#endif + } #endif -} +namespace amrex { namespace Gpu { + +#if defined(AMREX_USE_DPCPP) + +struct Handler +{ + Handler (sycl::nd_item<1> const* a_item = nullptr, void* a_local = nullptr, + int a_n_active_threds = -1) + : item(a_item), local(a_local), numActiveThreads(a_n_active_threds) {} + sycl::nd_item<1> const* item; + void* local; // DPC++ local memory + int numActiveThreads; +}; + +#elif defined(AMREX_USE_GPU) + +struct Handler +{ + AMREX_GPU_HOST_DEVICE constexpr Handler (int n_active_threads = -1) + : numActiveThreads(n_active_threads) {} + int numActiveThreads; +}; + +#else + +struct Handler {}; #endif + +}} + #endif diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H index b63b1220dcb..c660adb91e2 100644 --- a/Src/Base/AMReX_Reduce.H +++ b/Src/Base/AMReX_Reduce.H @@ -77,12 +77,12 @@ struct ReduceOpSum template AMREX_GPU_DEVICE AMREX_FORCE_INLINE void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept { - Gpu::deviceReduceSum(&d,s,h); + Gpu::deviceReduceSum_full(&d,s,h); } #else template AMREX_GPU_DEVICE AMREX_FORCE_INLINE - void parallel_update (T& d, T const& s) const noexcept { Gpu::deviceReduceSum(&d,s); } + void parallel_update (T& d, T const& s) const noexcept { Gpu::deviceReduceSum_full(&d,s); } #endif template @@ -99,12 +99,12 @@ struct ReduceOpMin template AMREX_GPU_DEVICE AMREX_FORCE_INLINE void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept { - Gpu::deviceReduceMin(&d,s,h); + Gpu::deviceReduceMin_full(&d,s,h); } #else template AMREX_GPU_DEVICE AMREX_FORCE_INLINE - void parallel_update (T& d, T const& s) const noexcept { Gpu::deviceReduceMin(&d,s); } + void parallel_update (T& d, T const& s) const noexcept { Gpu::deviceReduceMin_full(&d,s); } #endif template @@ -121,12 +121,12 @@ struct ReduceOpMax template AMREX_GPU_DEVICE AMREX_FORCE_INLINE void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept { - Gpu::deviceReduceMax(&d,s,h); + Gpu::deviceReduceMax_full(&d,s,h); } #else template AMREX_GPU_DEVICE AMREX_FORCE_INLINE - void parallel_update (T& d, T const& s) const noexcept { Gpu::deviceReduceMax(&d,s); } + void parallel_update (T& d, T const& s) const noexcept { Gpu::deviceReduceMax_full(&d,s); } #endif template @@ -142,11 +142,11 @@ struct 
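A tiny restatement (illustrative only, not AMReX code) of the dispatch condition used by the new `deviceReduce*` overloads: a default-constructed `Handler` carries `numActiveThreads == -1`, so existing call sites fall through to the full-block reduction, and the `ReduceOp*` functors below can keep calling the `_full` variants; only a positive count smaller than the block size selects the partial path.

```cpp
#include <cassert>

bool use_full_block (int num_active_threads, int block_size)
{
    return num_active_threads >= block_size || num_active_threads <= 0;
}

int main ()
{
    assert( use_full_block(-1, 256));   // default Handler: full-block reduction
    assert( use_full_block(256, 256));  // fully populated block
    assert(!use_full_block(232, 256));  // trailing, partially filled block
    return 0;
}
```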
ReduceOpLogicalAnd #ifdef AMREX_USE_DPCPP AMREX_GPU_DEVICE AMREX_FORCE_INLINE void parallel_update (int& d, int s, Gpu::Handler const& h) const noexcept { - Gpu::deviceReduceLogicalAnd(&d,s,h); + Gpu::deviceReduceLogicalAnd_full(&d,s,h); } #else AMREX_GPU_DEVICE AMREX_FORCE_INLINE - void parallel_update (int& d, int s) const noexcept { Gpu::deviceReduceLogicalAnd(&d,s); } + void parallel_update (int& d, int s) const noexcept { Gpu::deviceReduceLogicalAnd_full(&d,s); } #endif AMREX_GPU_DEVICE AMREX_FORCE_INLINE @@ -160,11 +160,11 @@ struct ReduceOpLogicalOr #ifdef AMREX_USE_DPCPP AMREX_GPU_DEVICE AMREX_FORCE_INLINE void parallel_update (int& d, int s, Gpu::Handler const& h) const noexcept { - Gpu::deviceReduceLogicalOr(&d,s,h); + Gpu::deviceReduceLogicalOr_full(&d,s,h); } #else AMREX_GPU_DEVICE AMREX_FORCE_INLINE - void parallel_update (int& d, int s) const noexcept { Gpu::deviceReduceLogicalOr(&d,s); } + void parallel_update (int& d, int s) const noexcept { Gpu::deviceReduceLogicalOr_full(&d,s); } #endif AMREX_GPU_DEVICE AMREX_FORCE_INLINE @@ -295,7 +295,7 @@ public: [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept { ReduceTuple r = *(dp+1); - for (int icell = gh.item.get_global_id(0), stride = gh.item.get_global_range(0); + for (int icell = gh.item->get_global_id(0), stride = gh.item->get_global_range(0); icell < ncells; icell += stride) { int k = icell / (len.x*len.y); int j = (icell - k*(len.x*len.y)) / len.x; @@ -363,7 +363,7 @@ public: [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept { ReduceTuple r = *(dp+1); - for (int icell = gh.item.get_global_id(0), stride = gh.item.get_global_range(0); + for (int icell = gh.item->get_global_id(0), stride = gh.item->get_global_range(0); icell < ncells; icell += stride) { int k = icell / (len.x*len.y); int j = (icell - k*(len.x*len.y)) / len.x; @@ -415,7 +415,7 @@ public: [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept { ReduceTuple r = *(dp+1); - for (N i = gh.item.get_global_id(0), stride = gh.item.get_global_range(0); + for (N i = gh.item->get_global_id(0), stride = gh.item->get_global_range(0); i < n; i += stride) { auto pr = f(i); Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr); @@ -453,12 +453,12 @@ T Sum (N n, U const* v, T init_val, BOP bop) #ifdef AMREX_USE_DPCPP [=] AMREX_GPU_DEVICE (T const& r, Gpu::Handler const& h) noexcept { - Gpu::deviceReduceSum(dp, r, h); + Gpu::deviceReduceSum_full(dp, r, h); }); #else [=] AMREX_GPU_DEVICE (T const& r) noexcept { - Gpu::deviceReduceSum(dp, r); + Gpu::deviceReduceSum_full(dp, r); }); #endif return ds.dataValue(); @@ -495,12 +495,12 @@ T Min (N n, U const* v, T init_val, BOP bop) #ifdef AMREX_USE_DPCPP [=] AMREX_GPU_DEVICE (T const& r, Gpu::Handler const& h) noexcept { - Gpu::deviceReduceMin(dp, r, h); + Gpu::deviceReduceMin_full(dp, r, h); }); #else [=] AMREX_GPU_DEVICE (T const& r) noexcept { - Gpu::deviceReduceMin(dp, r); + Gpu::deviceReduceMin_full(dp, r); }); #endif return ds.dataValue(); @@ -526,12 +526,12 @@ T Max (N n, U const* v, T init_val, BOP bop) #ifdef AMREX_USE_DPCPP [=] AMREX_GPU_DEVICE (T const& r, Gpu::Handler const& h) noexcept { - Gpu::deviceReduceMax(dp, r, h); + Gpu::deviceReduceMax_full(dp, r, h); }); #else [=] AMREX_GPU_DEVICE (T const& r) noexcept { - Gpu::deviceReduceMax(dp, r); + Gpu::deviceReduceMax_full(dp, r); }); #endif return ds.dataValue(); @@ -560,14 +560,14 @@ std::pair MinMax (N n, U const* v, MINOP minop, MAXOP maxop) #ifdef AMREX_USE_DPCPP [=] AMREX_GPU_DEVICE (Real2 const& r, Gpu::Handler const& h) noexcept { - 
Gpu::deviceReduceMin(dp , r[0], h); - Gpu::deviceReduceMax(dp+1, r[1], h); + Gpu::deviceReduceMin_full(dp , r[0], h); + Gpu::deviceReduceMax_full(dp+1, r[1], h); }); #else [=] AMREX_GPU_DEVICE (Real2 const& r) noexcept { - Gpu::deviceReduceMin(dp , r[0]); - Gpu::deviceReduceMax(dp+1, r[1]); + Gpu::deviceReduceMin_full(dp , r[0]); + Gpu::deviceReduceMax_full(dp+1, r[1]); }); #endif Gpu::dtoh_memcpy(hv.data(), dp, 2*sizeof(T)); @@ -793,7 +793,7 @@ T Sum (N n, U const* v, T init_val, BOP bop) }, [=] (T r) noexcept { - Gpu::deviceReduceSum(dp, r); + Gpu::deviceReduceSum_full(dp, r); }); return sum; } @@ -830,7 +830,7 @@ T Min (N n, U const* v, T init_val, BOP bop) }, [=] (T r) noexcept { - Gpu::deviceReduceMin(dp, r); + Gpu::deviceReduceMin_full(dp, r); }); return mn; } @@ -853,7 +853,7 @@ T Max (N n, U const* v, T init_val, BOP bop) }, [=] (T r) noexcept { - Gpu::deviceReduceMax(dp, r); + Gpu::deviceReduceMax_full(dp, r); }); return mx; } @@ -878,8 +878,8 @@ std::pair MinMax (N n, U const* v, MINOP minop, MAXOP maxop) }, [=] (Real2 const& r) noexcept { - Gpu::deviceReduceMin(dp , r[0]); - Gpu::deviceReduceMax(dp+1, r[1]); + Gpu::deviceReduceMin_full(dp , r[0]); + Gpu::deviceReduceMax_full(dp+1, r[1]); }); return std::make_pair(hv[0],hv[1]); } diff --git a/Src/Base/AMReX_Scan.H b/Src/Base/AMReX_Scan.H index 7d03ce0faf5..e0a4f1e9ab1 100644 --- a/Src/Base/AMReX_Scan.H +++ b/Src/Base/AMReX_Scan.H @@ -199,14 +199,14 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, Type type) amrex::launch(nblocks, nthreads, sm, stream, [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept { - sycl::ONEAPI::sub_group const& sg = gh.item.get_sub_group(); + sycl::ONEAPI::sub_group const& sg = gh.item->get_sub_group(); int lane = sg.get_local_id()[0]; int warp = sg.get_group_id()[0]; int nwarps = sg.get_group_range()[0]; - int threadIdxx = gh.item.get_local_id(0); - int blockDimx = gh.item.get_local_range(0); - int gridDimx = gh.item.get_group_range(0); + int threadIdxx = gh.item->get_local_id(0); + int blockDimx = gh.item->get_local_range(0); + int gridDimx = gh.item->get_group_range(0); T* shared = (T*)(gh.local); T* shared2 = shared + Gpu::Device::warp_size; @@ -222,7 +222,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, Type type) (virtual_block_id_p, gridDimx); virtual_block_id_shared = bid; } - gh.item.barrier(sycl::access::fence_space::local_space); + gh.item->barrier(sycl::access::fence_space::local_space); virtual_block_id = virtual_block_id_shared; } @@ -265,7 +265,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, Type type) shared[warp] = x; } - gh.item.barrier(sycl::access::fence_space::local_space); + gh.item->barrier(sycl::access::fence_space::local_space); // The first warp will do scan on the warp sums for the // whole block. @@ -279,7 +279,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, Type type) if (lane < nwarps) shared2[lane] = y; } - gh.item.barrier(sycl::access::fence_space::local_space); + gh.item->barrier(sycl::access::fence_space::local_space); // shared[0:nwarps) holds the inclusive sum of warp sums. 
@@ -317,7 +317,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, Type type) int iblock = iblock0-lane; detail::STVA stva{'p', 0}; if (iblock >= 0) { - stva = pbs[iblock].wait(gh.item); + stva = pbs[iblock].wait(*gh.item); } T x = stva.value; @@ -359,7 +359,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, Type type) } } - gh.item.barrier(sycl::access::fence_space::local_space); + gh.item->barrier(sycl::access::fence_space::local_space); T exclusive_prefix = shared[0]; diff --git a/Src/Base/CMakeLists.txt b/Src/Base/CMakeLists.txt index bc5b2443755..7edf2ed77df 100644 --- a/Src/Base/CMakeLists.txt +++ b/Src/Base/CMakeLists.txt @@ -194,6 +194,7 @@ target_sources( amrex AMReX_GpuError.H AMReX_GpuDevice.H AMReX_GpuDevice.cpp + AMReX_GpuBuffer.H AMReX_GpuAtomic.H AMReX_GpuUtility.H AMReX_GpuUtility.cpp diff --git a/Src/Base/Make.package b/Src/Base/Make.package index e635b0928ee..a1a4dff0e62 100644 --- a/Src/Base/Make.package +++ b/Src/Base/Make.package @@ -83,6 +83,7 @@ C$(AMREX_BASE)_headers += AMReX_GpuMemory.H AMReX_GpuRange.H C$(AMREX_BASE)_headers += AMReX_GpuAtomic.H C$(AMREX_BASE)_sources += AMReX_GpuDevice.cpp AMReX_GpuUtility.cpp C$(AMREX_BASE)_headers += AMReX_GpuDevice.H AMReX_GpuUtility.H +C$(AMREX_BASE)_headers += AMReX_GpuBuffer.H C$(AMREX_BASE)_headers += AMReX_GpuAsyncArray.H C$(AMREX_BASE)_sources += AMReX_GpuAsyncArray.cpp diff --git a/Src/EB/AMReX_EB2_GeometryShop.H b/Src/EB/AMReX_EB2_GeometryShop.H index d1ac0401fbe..0df73f1dfda 100644 --- a/Src/EB/AMReX_EB2_GeometryShop.H +++ b/Src/EB/AMReX_EB2_GeometryShop.H @@ -28,6 +28,7 @@ Real IF_f (F const& f, GpuArray const& p) noexcept { #if AMREX_DEVICE_COMPILE + amrex::ignore_unused(f,p); amrex::Error("EB2::GeometryShop: how did this happen?"); return 0.0; #else diff --git a/Tutorials/GPU/ParallelReduce/GNUmakefile b/Tutorials/GPU/ParallelReduce/GNUmakefile index 2754352287a..19a7b9ded31 100644 --- a/Tutorials/GPU/ParallelReduce/GNUmakefile +++ b/Tutorials/GPU/ParallelReduce/GNUmakefile @@ -13,6 +13,7 @@ USE_OMP = FALSE USE_HIP = FALSE TINY_PROFILE = TRUE +BL_NO_FORT = TRUE include $(AMREX_HOME)/Tools/GNUMake/Make.defs diff --git a/Tutorials/GPU/ParallelReduce/main.cpp b/Tutorials/GPU/ParallelReduce/main.cpp index ee1e82ad4a0..31fae4efa03 100644 --- a/Tutorials/GPU/ParallelReduce/main.cpp +++ b/Tutorials/GPU/ParallelReduce/main.cpp @@ -24,10 +24,12 @@ void main_main () { int ncell = 512; int max_grid_size = 128; + int nghost = 0; { ParmParse pp; pp.query("ncell", ncell); pp.query("max_grid_size", max_grid_size); + pp.query("nghost", nghost); } BoxArray ba; @@ -37,11 +39,11 @@ void main_main () ba.maxSize(max_grid_size); } - MultiFab mf(ba,DistributionMapping{ba},1,0); - iMultiFab imf(ba,mf.DistributionMap(),1,0); + MultiFab mf(ba,DistributionMapping{ba},1,nghost); + iMultiFab imf(ba,mf.DistributionMap(),1,nghost); for (MFIter mfi(mf); mfi.isValid(); ++mfi) { - Box const& bx = mfi.validbox(); + Box const& bx = mfi.fabbox(); auto const& fab = mf.array(mfi); auto const& ifab = imf.array(mfi); @@ -53,6 +55,96 @@ void main_main () }); } + int N = 1000000; + Gpu::DeviceVector vec(N); + Real* pvec = vec.dataPtr(); + amrex::ParallelForRNG( N, + [=] AMREX_GPU_DEVICE (int i, RandomEngine const& engine) noexcept + { + pvec[i] = amrex::Random(engine) - 0.5; + }); + + { + BL_PROFILE("ParallelForReduction-box-3"); + + Gpu::Buffer da({0.0, std::numeric_limits::max(), + std::numeric_limits::lowest()}); + Real* dp = da.data(); + for (MFIter mfi(mf); mfi.isValid(); ++mfi) + { + Box const& bx = mfi.fabbox(); + Array4 const& fab = mf.const_array(mfi); 
+ amrex::ParallelFor(Gpu::KernelInfo().setReduction(true), bx, + [=] AMREX_GPU_DEVICE (int i, int j, int k, Gpu::Handler const& handler) noexcept + { + Gpu::deviceReduceSum(dp , fab(i,j,k), handler); + Gpu::deviceReduceMin(dp+1, fab(i,j,k), handler); + Gpu::deviceReduceMax(dp+2, fab(i,j,k), handler); + }); + } + Real* hp = da.copyToHost(); + ParallelDescriptor::ReduceRealSum(hp[0]); + ParallelDescriptor::ReduceRealMin(hp[1]); + ParallelDescriptor::ReduceRealMax(hp[2]); + amrex::Print().SetPrecision(17) << "sum: " << hp[0] << "\n" + << "min: " << hp[1] << "\n" + << "max: " << hp[2] << "\n"; + } + + { + BL_PROFILE("ParallelForReduction-box-sum"); + + Gpu::Buffer da({0.0}); + Real* dp = da.data(); + for (MFIter mfi(mf); mfi.isValid(); ++mfi) + { + Box const& bx = mfi.fabbox(); + Array4 const& fab = mf.const_array(mfi); + amrex::ParallelFor(Gpu::KernelInfo().setReduction(true), bx, + [=] AMREX_GPU_DEVICE (int i, int j, int k, Gpu::Handler const& handler) noexcept + { + Gpu::deviceReduceSum(dp, fab(i,j,k), handler); + }); + } + Real* hp = da.copyToHost(); + ParallelDescriptor::ReduceRealSum(hp[0]); + amrex::Print().SetPrecision(17) << "sum: " << hp[0] << "\n"; + } + + { + BL_PROFILE("ParallelForReduction-box-isum"); + + Gpu::Buffer da({0}); + Long* dp = da.data(); + for (MFIter mfi(imf); mfi.isValid(); ++mfi) + { + Box const& bx = mfi.fabbox(); + Array4 const& ifab = imf.const_array(mfi); + amrex::ParallelFor(Gpu::KernelInfo().setReduction(true), bx, + [=] AMREX_GPU_DEVICE (int i, int j, int k, Gpu::Handler const& handler) noexcept + { + Gpu::deviceReduceSum(dp, ifab(i,j,k), handler); + }); + } + Long* hp = da.copyToHost(); + ParallelDescriptor::ReduceLongSum(hp[0]); + amrex::Print().SetPrecision(17) << "isum: " << hp[0] << "\n"; + } + + { + BL_PROFILE("ParallelForReduction-vec-1"); + Gpu::Buffer da({0.0}); + Real* dp = da.data(); + amrex::ParallelFor(Gpu::KernelInfo().setReduction(true), N, + [=] AMREX_GPU_DEVICE (int i, Gpu::Handler const& handler) noexcept + { + Gpu::deviceReduceSum(dp, amrex::Math::abs(pvec[i]), handler); + }); + Real* hp = da.copyToHost(); + ParallelDescriptor::ReduceRealSum(hp[0]); + amrex::Print().SetPrecision(17) << "1-norm: " << hp[0] << "\n"; + } + { BL_PROFILE("MultiFab::sum"); amrex::Print().SetPrecision(17) << "sum: " << mf.sum() << "\n"; @@ -60,12 +152,12 @@ void main_main () { BL_PROFILE("MultiFab::min"); - amrex::Print().SetPrecision(17) << "min: " << mf.min(0) << "\n"; + amrex::Print().SetPrecision(17) << "min: " << mf.min(0, nghost) << "\n"; } { BL_PROFILE("MultiFab::max"); - amrex::Print().SetPrecision(17) << "max: " << mf.max(0) << "\n"; + amrex::Print().SetPrecision(17) << "max: " << mf.max(0, nghost) << "\n"; } { @@ -83,7 +175,7 @@ void main_main () for (MFIter mfi(mf); mfi.isValid(); ++mfi) { - const Box& bx = mfi.validbox(); + const Box& bx = mfi.fabbox(); auto const& fab = mf.array(mfi); auto const& ifab = imf.array(mfi); reduce_op.eval(bx, reduce_data, @@ -117,17 +209,17 @@ void main_main () for (MFIter mfi(mf); mfi.isValid(); ++mfi) { - const Box& bx = mfi.validbox(); + const Box& bx = mfi.fabbox(); auto const& fab = mf.array(mfi); auto const& ifab = imf.array(mfi); reduce_op.eval(bx, reduce_data, - [=] AMREX_GPU_DEVICE (Box const& bx) -> ReduceTuple + [=] AMREX_GPU_DEVICE (Box const& b) -> ReduceTuple { Real rsum = 0.; Real rmin = 1.e30; // If not because of cuda 9.2, Real rmax = -1.e30; // we should use numeric_limits. 
Long lsum = 0; - amrex::Loop(bx, + amrex::Loop(b, [=,&rsum,&rmin,&rmax,&lsum] (int i, int j, int k) { Real x = fab(i,j,k); Long ix = static_cast(ifab(i,j,k)); @@ -161,7 +253,7 @@ void main_main () for (MFIter mfi(mf); mfi.isValid(); ++mfi) { - const Box& bx = mfi.validbox(); + const Box& bx = mfi.fabbox(); auto const& fab = mf.array(mfi); reduce_op.eval(bx, reduce_data, [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple @@ -185,7 +277,7 @@ void main_main () for (MFIter mfi(mf); mfi.isValid(); ++mfi) { - const Box& bx = mfi.validbox(); + const Box& bx = mfi.fabbox(); auto const& fab = mf.array(mfi); reduce_op.eval(bx, reduce_data, [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple @@ -209,7 +301,7 @@ void main_main () for (MFIter mfi(mf); mfi.isValid(); ++mfi) { - const Box& bx = mfi.validbox(); + const Box& bx = mfi.fabbox(); auto const& fab = mf.array(mfi); reduce_op.eval(bx, reduce_data, [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple @@ -233,7 +325,7 @@ void main_main () for (MFIter mfi(imf); mfi.isValid(); ++mfi) { - const Box& bx = mfi.validbox(); + const Box& bx = mfi.fabbox(); auto const& ifab = imf.array(mfi); reduce_op.eval(bx, reduce_data, [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple @@ -248,15 +340,6 @@ void main_main () amrex::Print() << "isum: " << hv << "\n"; } - int N = 1000000; - Gpu::DeviceVector vec(N); - Real* pvec = vec.dataPtr(); - amrex::ParallelForRNG( N, - [=] AMREX_GPU_DEVICE (int i, RandomEngine const& engine) noexcept - { - pvec[i] = amrex::Random(engine) - 0.5; - }); - { BL_PROFILE("VecReduce");
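The tutorial's `VecReduce` block is cut off above. Independent of its exact contents, a minimal sketch of the `Gpu::VecReduce` pattern whose implementation appears earlier in this diff: the first lambda accumulates a per-thread value, the second folds that value into the destination once per block, mirroring how `Reduce::Sum` uses it in this patch. The `one_norm` name, the input vector, and the header choice are assumptions for illustration.

```cpp
#include <AMReX_Gpu.H>
#include <AMReX_Math.H>

amrex::Real one_norm (amrex::Gpu::DeviceVector<amrex::Real> const& vec)
{
    using namespace amrex;
    Real const* p = vec.dataPtr();
    Gpu::DeviceScalar<Real> ds(0.0);
    Real* dp = ds.dataPtr();
    Gpu::VecReduce(vec.size(), Real(0.0),
        [=] AMREX_GPU_DEVICE (int i, Real* r) noexcept
        {
            *r += amrex::Math::abs(p[i]);          // per-thread partial sum
        },
#ifdef AMREX_USE_DPCPP
        [=] AMREX_GPU_DEVICE (Real r, Gpu::Handler const& h) noexcept
        {
            Gpu::deviceReduceSum_full(dp, r, h);   // combine across the block
        });
#else
        [=] AMREX_GPU_DEVICE (Real r) noexcept
        {
            Gpu::deviceReduceSum_full(dp, r);
        });
#endif
    return ds.dataValue();
}
```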