Kernel fusing in FabArrayUtility (AMReX-Codes#2593)
WeiqunZhang authored Jan 20, 2022
1 parent b7050b4 commit 3318e36
Showing 2 changed files with 62 additions and 61 deletions.
14 changes: 13 additions & 1 deletion Src/Base/AMReX_Box.H
@@ -1767,6 +1767,18 @@ Box makeSlab (Box const& b, int direction, int slab_index) noexcept
     return r;
 }
 
+AMREX_GPU_HOST_DEVICE
+AMREX_FORCE_INLINE
+Box makeSingleCellBox (int i, int j, int k, IndexType typ = IndexType::TheCellType())
+{
+#if (AMREX_SPACEDIM == 1)
+    amrex::ignore_unused(j,k);
+#elif (AMREX_SPACEDIM == 2)
+    amrex::ignore_unused(k);
+#endif
+    return Box(IntVect(AMREX_D_DECL(i,j,k)),IntVect(AMREX_D_DECL(i,j,k)),typ);
+}
+
 }
 
-#endif /*BL_BOX_H*/
+#endif /*AMREX_BOX_H*/
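Note: the new makeSingleCellBox(i,j,k,typ) helper returns the degenerate Box [(i,j,k),(i,j,k)] with the requested index type (cell-centered by default). It exists so that the Box-based callbacks used by ReduceMF in the next file can be invoked one cell at a time from inside a fused kernel. A quick illustration, not part of the commit; IndexType::TheNodeType() is assumed from the standard AMReX API:

    #include <AMReX_Box.H>
    using namespace amrex;

    // Cell-centered single-cell box covering (3,4,5): lo == hi == (3,4,5).
    Box c = makeSingleCellBox(3, 4, 5);
    // Same cell, but carrying a nodal index type instead of the default.
    Box n = makeSingleCellBox(3, 4, 5, IndexType::TheNodeType());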
109 changes: 49 additions & 60 deletions Src/Base/AMReX_FabArrayUtility.H
@@ -5,7 +5,7 @@
 #include <AMReX_FabArray.H>
 #include <AMReX_LayoutData.H>
 #include <AMReX_Print.H>
-#include <AMReX_Reduce.H>
+#include <AMReX_ParReduce.H>
 #include <limits>
 
 namespace amrex {
@@ -52,23 +52,13 @@ ReduceMF (FabArray<FAB> const& fa, IntVect const& nghost, F&& f)
     using T = std::conditional_t<std::is_same<OP,ReduceOpLogicalAnd>::value ||
                                  std::is_same<OP,ReduceOpLogicalOr>::value,
                                  int, typename FAB::value_type>;
-    ReduceOps<OP> reduce_op;
-    ReduceData<T> reduce_data(reduce_op);
-    using ReduceTuple = typename decltype(reduce_data)::Type;
-
-    for (MFIter mfi(fa); mfi.isValid(); ++mfi)
-    {
-        const Box& bx = amrex::grow(mfi.validbox(),nghost);
-        const auto& arr = fa.const_array(mfi);
-        reduce_op.eval(bx, reduce_data,
-        [=] AMREX_GPU_DEVICE (Box const& b) -> ReduceTuple
-        {
-            return { static_cast<T>(f(b, arr)) };
-        });
-    }
-
-    ReduceTuple hv = reduce_data.value(reduce_op);
-    return amrex::get<0>(hv);
+    auto typ = fa.ixType();
+    auto const& ma = fa.const_arrays();
+    return ParReduce(TypeList<OP>{}, TypeList<T>{}, fa, nghost,
+    [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept -> GpuTuple<T>
+    {
+        return { static_cast<T>(f(amrex::makeSingleCellBox(i,j,k,typ), ma[box_no])) };
+    });
 }
 
 template <class OP, class FAB1, class FAB2, class F>
@@ -81,24 +71,15 @@ ReduceMF (FabArray<FAB1> const& fa1, FabArray<FAB2> const& fa2, IntVect const& n
     using T = std::conditional_t<std::is_same<OP,ReduceOpLogicalAnd>::value ||
                                  std::is_same<OP,ReduceOpLogicalOr>::value,
                                  int, typename FAB1::value_type>;
-    ReduceOps<OP> reduce_op;
-    ReduceData<T> reduce_data(reduce_op);
-    using ReduceTuple = typename decltype(reduce_data)::Type;
-
-    for (MFIter mfi(fa1); mfi.isValid(); ++mfi)
-    {
-        const Box& bx = amrex::grow(mfi.validbox(),nghost);
-        const auto& arr1 = fa1.const_array(mfi);
-        const auto& arr2 = fa2.const_array(mfi);
-        reduce_op.eval(bx, reduce_data,
-        [=] AMREX_GPU_DEVICE (Box const& b) -> ReduceTuple
-        {
-            return { static_cast<T>(f(b, arr1, arr2)) };
-        });
-    }
-
-    ReduceTuple hv = reduce_data.value(reduce_op);
-    return amrex::get<0>(hv);
+    auto typ = fa1.ixType();
+    auto const& ma1 = fa1.const_arrays();
+    auto const& ma2 = fa2.const_arrays();
+    return ParReduce(TypeList<OP>{}, TypeList<T>{}, fa1, nghost,
+    [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept -> GpuTuple<T>
+    {
+        return { static_cast<T>(f(amrex::makeSingleCellBox(i,j,k,typ),
+                                  ma1[box_no], ma2[box_no])) };
+    });
 }
 
 template <class OP, class FAB1, class FAB2, class FAB3, class F>
@@ -112,25 +93,16 @@ ReduceMF (FabArray<FAB1> const& fa1, FabArray<FAB2> const& fa2,
     using T = std::conditional_t<std::is_same<OP,ReduceOpLogicalAnd>::value ||
                                  std::is_same<OP,ReduceOpLogicalOr>::value,
                                  int, typename FAB1::value_type>;
-    ReduceOps<OP> reduce_op;
-    ReduceData<T> reduce_data(reduce_op);
-    using ReduceTuple = typename decltype(reduce_data)::Type;
-
-    for (MFIter mfi(fa1); mfi.isValid(); ++mfi)
-    {
-        const Box& bx = amrex::grow(mfi.validbox(),nghost);
-        const auto& arr1 = fa1.const_array(mfi);
-        const auto& arr2 = fa2.const_array(mfi);
-        const auto& arr3 = fa3.const_array(mfi);
-        reduce_op.eval(bx, reduce_data,
-        [=] AMREX_GPU_DEVICE (Box const& b) -> ReduceTuple
-        {
-            return { static_cast<T>(f(b, arr1, arr2, arr3)) };
-        });
-    }
-
-    ReduceTuple hv = reduce_data.value(reduce_op);
-    return amrex::get<0>(hv);
+    auto typ = fa1.ixType();
+    auto const& ma1 = fa1.const_arrays();
+    auto const& ma2 = fa2.const_arrays();
+    auto const& ma3 = fa3.const_arrays();
+    return ParReduce(TypeList<OP>{}, TypeList<T>{}, fa1, nghost,
+    [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept -> GpuTuple<T>
+    {
+        return { static_cast<T>(f(amrex::makeSingleCellBox(i,j,k,typ),
+                                  ma1[box_no], ma2[box_no], ma3[box_no])) };
+    });
 }
 
 template <class FAB, class F>
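The three ReduceMF overloads above now share one fused pattern: rather than launching a separate reduction per box from an MFIter loop, a single ParReduce sweeps every cell of every local box through const_arrays(), and the caller's Box-based functor still works because it is handed a single-cell Box from makeSingleCellBox. A minimal stand-alone sketch of that pattern follows; the helper name sumValid, the MultiFab argument, and the summed expression are illustrative only, while ParReduce, TypeList, ReduceOpSum, and const_arrays() are the facilities used in the hunks above:

    #include <AMReX_MultiFab.H>
    #include <AMReX_ParReduce.H>
    using namespace amrex;

    // Local (per-rank) sum of component 0 over the valid cells of mf,
    // computed in one fused ParReduce instead of one launch per box.
    Real sumValid (MultiFab const& mf)
    {
        auto const& ma = mf.const_arrays();
        return ParReduce(TypeList<ReduceOpSum>{}, TypeList<Real>{}, mf, IntVect(0),
            [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept -> GpuTuple<Real>
            {
                return { ma[box_no](i,j,k) };
            });
    }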
@@ -1449,14 +1421,13 @@ indexFromValue (FabArray<FAB> const& mf, int comp, IntVect const& nghost,
     int* p = aa.data();
     // This is a device ptr to 1+AMREX_SPACEDIM int zeros.
     // The first is used as an atomic bool and the others for intvect.
-    for (MFIter mfi(mf,MFItInfo().SetDeviceSync(false)); mfi.isValid(); ++mfi) {
-        const Box& bx = amrex::grow(mfi.validbox(), nghost);
-        auto const& arr = mf.const_array(mfi);
-        amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
+    if (mf.isFusingCandidate()) {
+        auto const& ma = mf.const_arrays();
+        ParallelFor(mf, nghost, [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept
         {
             int* flag = p;
             if (*flag == 0) {
-                if (arr(i,j,k,comp) == value) {
+                if (ma[box_no](i,j,k,comp) == value) {
                     if (Gpu::Atomic::Exch(flag,1) == 0) {
                         AMREX_D_TERM(p[1] = i;,
                                      p[2] = j;,
@@ -1465,6 +1436,24 @@ indexFromValue (FabArray<FAB> const& mf, int comp, IntVect const& nghost,
                 }
             }
         });
+    } else {
+        for (MFIter mfi(mf,MFItInfo().SetDeviceSync(false)); mfi.isValid(); ++mfi) {
+            const Box& bx = amrex::grow(mfi.validbox(), nghost);
+            auto const& arr = mf.const_array(mfi);
+            amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
+            {
+                int* flag = p;
+                if (*flag == 0) {
+                    if (arr(i,j,k,comp) == value) {
+                        if (Gpu::Atomic::Exch(flag,1) == 0) {
+                            AMREX_D_TERM(p[1] = i;,
+                                         p[2] = j;,
+                                         p[3] = k;);
+                        }
+                    }
+                }
+            });
+        }
     }
     int const* tmp = aa.copyToHost();
     AMREX_D_TERM(loc[0] = tmp[1];,
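indexFromValue now guards the fused path with isFusingCandidate(): when the FabArray looks likely to benefit from fusing (roughly, many boxes whose individual kernel launches would be wasteful), one ParallelFor over (box_no,i,j,k) covers all boxes at once; otherwise the original per-box MFIter loop is kept. A hedged sketch of the same guard applied to a simple fill kernel; the MultiFab mf and the value written are illustrative only, and arrays() and Gpu::streamSynchronize() are assumed from the standard AMReX API:

    if (mf.isFusingCandidate()) {
        // One kernel launch covering every local box via a MultiArray4 view.
        auto const& ma = mf.arrays();
        ParallelFor(mf, IntVect(0),
            [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept
            {
                ma[box_no](i,j,k) = Real(1.0);
            });
        Gpu::streamSynchronize(); // the fused path has no per-box sync of its own
    } else {
        // One launch per box, as in the pre-fusing code path.
        for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
            const Box& bx = mfi.validbox();
            auto const& a = mf.array(mfi);
            amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
            {
                a(i,j,k) = Real(1.0);
            });
        }
    }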
