Refactor grid-stride loop #4190

Status: Open. Wants to merge 8 commits into base: development. Showing changes from 4 commits.
40 changes: 40 additions & 0 deletions Src/Base/AMReX_GpuLaunch.H
@@ -21,6 +21,7 @@
#include <AMReX_RandomEngine.H>
#include <AMReX_Algorithm.H>
#include <AMReX_Math.H>
#include <AMReX_Vector.H>
#include <cstddef>
#include <limits>
#include <algorithm>
@@ -176,6 +177,45 @@ namespace Gpu {
{
return makeExecutionConfig<MT>(box.numPts());
}

struct ExecConfig
{
Long ntotalthreads;
int nblocks;
};

template <int MT>
Vector<ExecConfig> makeNExecutionConfigs (Long N) noexcept
{
// Max # of blocks in a kernel launch
int numblocks_max = std::numeric_limits<int>::max();
// Max # of threads in a kernel launch
Long nmax = Long(MT) * numblocks_max;
// # of launches needed for N elements without using grid-stride
// loops inside GPU kernels.
auto nlaunches = int((N+nmax-1)/nmax);
Vector<ExecConfig> r(nlaunches);
for (int i = 0; i < nlaunches; ++i) {
int nblocks;
if (N > nmax) {
nblocks = numblocks_max;
N -= nmax;
} else {
nblocks = int((N+MT-1)/MT);
}
// Total # of threads in this launch
r[i].ntotalthreads = Long(nblocks) * MT;
// # of blocks in this launch
r[i].nblocks = nblocks;
}
return r;
}

template <int MT, int dim>
Vector<ExecConfig> makeNExecutionConfigs (BoxND<dim> const& box) noexcept
{
return makeNExecutionConfigs<MT>(box.numPts());
}
#endif

}
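
A note on the splitting arithmetic above: `makeNExecutionConfigs` caps each launch at `MT * INT_MAX` threads and emits one `ExecConfig` per chunk. The following is a standalone, host-side sketch of the same arithmetic, with a hypothetical `MT = 256` and element count; it is not part of the PR.

```cpp
// Standalone mirror of the chunking arithmetic in makeNExecutionConfigs.
// Hypothetical values: MT = 256 threads per block, N = 6e11 elements.
#include <cstdint>
#include <iostream>
#include <limits>
#include <utility>
#include <vector>

int main () {
    constexpr int MT = 256;
    const std::int64_t numblocks_max = std::numeric_limits<int>::max();
    const std::int64_t nmax = std::int64_t(MT) * numblocks_max; // ~5.5e11 threads per launch
    std::int64_t N = 600'000'000'000;                           // hypothetical element count
    const int nlaunches = int((N + nmax - 1) / nmax);           // 2 launches for this N
    std::vector<std::pair<std::int64_t,int>> cfg(nlaunches);    // {total threads, blocks} per launch
    for (int i = 0; i < nlaunches; ++i) {
        int nblocks;
        if (N > nmax) {
            nblocks = int(numblocks_max);     // full-size launch
            N -= nmax;
        } else {
            nblocks = int((N + MT - 1) / MT); // last, partial launch
        }
        cfg[i] = {std::int64_t(nblocks) * MT, nblocks};
    }
    for (auto const& c : cfg) {
        std::cout << c.second << " blocks, " << c.first << " threads\n";
    }
}
```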
126 changes: 87 additions & 39 deletions Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -747,35 +747,76 @@ void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noexcept
launch(nblocks, nthreads_per_block, 0, stream, std::forward<L>(f));
}

template<int MT, typename T, typename L>
template<int MT, typename T, typename L, std::enable_if_t<std::is_integral_v<T>,int> FOO = 0>
void launch (T const& n, L const& f) noexcept
{
static_assert(sizeof(T) >= 2);
if (amrex::isEmpty(n)) { return; }
const auto ec = Gpu::makeExecutionConfig<MT>(n);
AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
for (auto const i : Gpu::Range(n)) {
f(i);
const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
T ndone = 0;
for (auto const& ec : nec) {
T nleft = n - ndone;
AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
// This will not overflow, even though nblocks*MT might.
auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
if (tid < nleft) {
f(tid+ndone);
}
});
if (Long(nleft) > ec.ntotalthreads) {
ndone += T(ec.ntotalthreads);
}
});
}
AMREX_GPU_ERROR_CHECK();
}
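
The change above replaces the in-kernel grid-stride loop with one bounded launch per chunk plus a tail check. Below is a plain CUDA sketch of the same pattern without the AMReX macros; names such as `chunked_kernel` and `launch_in_chunks` are illustrative only, and `MT = 256` is an assumption.

```cuda
#include <cstdint>

// One bounded 1-D launch per chunk; a tail check replaces the grid-stride loop.
template <typename F>
__global__ void chunked_kernel (std::int64_t nleft, std::int64_t ndone, F f)
{
    auto tid = std::int64_t(blockDim.x) * blockIdx.x + threadIdx.x;
    if (tid < nleft) {
        f(tid + ndone);   // global index = this launch's offset + local thread id
    }
}

template <typename F>
void launch_in_chunks (std::int64_t n, F f)
{
    constexpr int MT = 256;                                   // threads per block (assumed)
    const std::int64_t nmax = std::int64_t(MT) * 2147483647;  // max threads per launch
    std::int64_t ndone = 0;
    while (ndone < n) {
        const std::int64_t nleft  = n - ndone;
        const std::int64_t nchunk = nleft < nmax ? nleft : nmax;
        const int nblocks = int((nchunk + MT - 1) / MT);
        chunked_kernel<<<nblocks, MT>>>(nleft, ndone, f);
        ndone += std::int64_t(nblocks) * MT;
    }
}
```

A caller would pass a device-callable functor or extended lambda; compiling a device lambda as `F` requires nvcc's `--extended-lambda`.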

template<int MT, int dim, typename L>
void launch (BoxND<dim> const& box, L const& f) noexcept
{
if (box.isEmpty()) { return; }
const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
const BoxIndexerND<dim> indexer(box);
const auto type = box.ixType();
std::uint64_t ndone = 0;
for (auto const& ec : nec) {
AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + ndone;
if (icell < indexer.numPts()) {
auto iv = indexer.intVect(icell);
f(BoxND<dim>(iv,iv,type));
}
});
ndone += ec.ntotalthreads;
}
AMREX_GPU_ERROR_CHECK();
}
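
For readers unfamiliar with `BoxIndexerND`, the mapping used above (linear cell number to `IntVect`) is conceptually the usual index decomposition over the box extents. The following simplified 3D sketch shows only that mapping; it is not the actual `BoxIndexerND` implementation.

```cpp
#include <cstdint>

// Conceptual 3D version of the icell -> (i,j,k) mapping (simplified sketch;
// not the actual BoxIndexerND implementation).
struct SimpleIndexer3D
{
    int lo[3];              // smallEnd of the box
    std::uint64_t len[3];   // box length in each dimension

    std::uint64_t numPts () const { return len[0] * len[1] * len[2]; }

    void intVect (std::uint64_t icell, int iv[3]) const {
        iv[0] = lo[0] + int(icell % len[0]);
        iv[1] = lo[1] + int((icell / len[0]) % len[1]);
        iv[2] = lo[2] + int(icell / (len[0] * len[1]));
    }
};
```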

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept
{
static_assert(sizeof(T) >= 2);
if (amrex::isEmpty(n)) { return; }
const auto ec = Gpu::makeExecutionConfig<MT>(n);
AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
for (Long i = Long(blockDim.x)*blockIdx.x+threadIdx.x, stride = Long(blockDim.x)*gridDim.x;
i < Long(n); i += stride) {
detail::call_f_scalar_handler(f, T(i),
Gpu::Handler(amrex::min((std::uint64_t(n)-i+(std::uint64_t)threadIdx.x),
(std::uint64_t)blockDim.x)));
const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
T ndone = 0;
for (auto const& ec : nec) {
T nleft = n - ndone;
AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
// This will not overflow, even though nblocks*MT might.
auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
if (tid < nleft) {
detail::call_f_scalar_handler(f, tid+ndone,
Gpu::Handler(amrex::min((std::uint64_t(nleft-tid)+(std::uint64_t)threadIdx.x),
(std::uint64_t)blockDim.x)));
}
});
if (Long(nleft) > ec.ntotalthreads) {
ndone += ec.ntotalthreads;
}
});
}
AMREX_GPU_ERROR_CHECK();
}
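
A note on the `Gpu::Handler` argument above, as I read it: `nleft - tid + threadIdx.x` equals `nleft - (tid - threadIdx.x)`, i.e. the number of valid work items counted from the first thread of the current block, so the `min` with `blockDim.x` yields how many threads of this block hold valid work. A small host-side check with hypothetical numbers:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>

int main () {
    const std::uint64_t blockDimx = 256, nleft = 1000;
    // The last, partially filled block starts at global id 768 and holds
    // 1000 - 768 = 232 valid work items. Only threads with tid < nleft
    // reach the Handler in the kernel, mirroring the guard in the PR.
    for (std::uint64_t t = 0; t < blockDimx && 768 + t < nleft; ++t) {
        const std::uint64_t tid = 768 + t;
        assert(std::min(nleft - tid + t, blockDimx) == 232);
    }
    // A full block (first thread at tid 256) saturates at blockDim.x.
    assert(std::min(nleft - 256 + 0, blockDimx) == 256);
    return 0;
}
```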

@@ -785,18 +826,21 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, L const& f) noexcept
{
if (amrex::isEmpty(box)) { return; }
const BoxIndexerND<dim> indexer(box);
const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
icell < indexer.numPts(); icell += stride)
{
auto iv = indexer.intVect(icell);
detail::call_f_intvect_handler(f, iv,
Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
(std::uint64_t)blockDim.x)));
}
});
const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
std::uint64_t ndone = 0;
for (auto const& ec : nec) {
AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + ndone;
if (icell < indexer.numPts()) {
auto iv = indexer.intVect(icell);
detail::call_f_intvect_handler(f, iv,
Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
(std::uint64_t)blockDim.x)));
}
});
ndone += ec.ntotalthreads;
}
AMREX_GPU_ERROR_CHECK();
}

@@ -806,17 +850,21 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, T ncomp, L const& f)
{
if (amrex::isEmpty(box)) { return; }
const BoxIndexerND<dim> indexer(box);
const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
icell < indexer.numPts(); icell += stride) {
auto iv = indexer.intVect(icell);
detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
(std::uint64_t)blockDim.x)));
}
});
const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
std::uint64_t ndone = 0;
for (auto const& ec : nec) {
AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + ndone;
if (icell < indexer.numPts()) {
auto iv = indexer.intVect(icell);
detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
(std::uint64_t)blockDim.x)));
}
});
ndone += ec.ntotalthreads;
}
AMREX_GPU_ERROR_CHECK();
}
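
For context, the caller-facing `ParallelFor` API is unchanged by this refactor; only the mapping onto kernel launches differs. A minimal usage sketch, assuming a valid `MultiFab mf` and an otherwise standard AMReX setup:

```cpp
// Minimal usage sketch; assumes a valid MultiFab `mf` and standard AMReX setup.
for (amrex::MFIter mfi(mf); mfi.isValid(); ++mfi) {
    const amrex::Box& bx = mfi.validbox();
    auto const& a = mf.array(mfi);
    amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
    {
        a(i,j,k) *= 2.0;   // any per-cell work
    });
}
```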

8 changes: 8 additions & 0 deletions Src/EB/AMReX_algoim.cpp
@@ -66,8 +66,16 @@ compute_integrals (MultiFab& intgmf, IntVect nghost)

if (Gpu::inLaunchRegion())
{
#if defined(AMREX_USE_CUDA)
// It appears that there is an nvcc bug. We have to use the
// 4D ParallelFor here, even though ncomp is 1.
int ncomp = fg.nComp();
amrex::ParallelFor(bx, ncomp,
[=] AMREX_GPU_DEVICE (int i, int j, int k, int) noexcept
#else
amrex::ParallelFor(bx,
[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
#endif
{
const auto ebflag = fg(i,j,k);
if (ebflag.isRegular()) {
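
For readers unfamiliar with the two overloads involved in the workaround above: the 4D form simply adds a trailing component index to the lambda, which compute_integrals leaves unused. Side by side, with placeholder bodies (illustrative only):

```cpp
// 3D form: one lambda argument per spatial index.
amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
{
    // per-cell work
});

// 4D form used as the nvcc workaround: trailing component index, unused here.
int ncomp = 1;
amrex::ParallelFor(bx, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int) noexcept
{
    // per-cell work; component index ignored
});
```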