Remove VecReduce and FabReduce (AMReX-Codes#2091)
They were used for the old atomics-based reduction approach.

Only PeleC uses VecReduce. The use of VecReduce there can be replaced with Reduce::Sum (a sketch follows the file summary below).
WeiqunZhang authored Jun 7, 2021
1 parent 79678dd commit 53614b4
Showing 2 changed files with 0 additions and 242 deletions.
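
The commit message points to Reduce::Sum as the replacement for VecReduce in PeleC. Below is a minimal sketch of that migration, assuming a PeleC-style sum over a device array; the names v, n, and sum_device_array are illustrative, and the exact Reduce::Sum overload (explicit result type versus an init-value argument) may vary across AMReX versions.

#include <AMReX_Gpu.H>
#include <AMReX_Reduce.H>

using namespace amrex;

// Illustrative replacement for the removed two-lambda VecReduce pattern:
// a single device lambda returns each element's contribution and
// Reduce::Sum performs the reduction (no atomics, no Gpu::Handler).
Real sum_device_array (Real const* v, int n)
{
    return Reduce::Sum<Real>(n,
        [=] AMREX_GPU_DEVICE (int i) -> Real { return v[i]; });
}
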
45 changes: 0 additions & 45 deletions Src/Base/AMReX_GpuLaunchFunctsC.H
@@ -316,51 +316,6 @@ void ParallelFor (Gpu::KernelInfo const&,
box3, ncomp3, std::forward<L3>(f3));
}

template <typename N, typename T, typename L1, typename L2,
typename M=std::enable_if_t<std::is_integral<N>::value> >
void FabReduce (Box const& box, N ncomp, T const& init_val,
L1&& f1, L2&& f2, std::size_t /*shared_mem_bytes*/=0) noexcept
{
auto r = init_val;
const auto lo = amrex::lbound(box);
const auto hi = amrex::ubound(box);
for (N n = 0; n < ncomp; ++n) {
for (int k = lo.z; k <= hi.z; ++k) {
for (int j = lo.y; j <= hi.y; ++j) {
for (int i = lo.x; i <= hi.x; ++i) {
f1(i,j,k,n,&r);
}}}
}
f2(r);
}

template <typename T, typename L1, typename L2>
void FabReduce (Box const& box, T const& init_val,
L1&& f1, L2&& f2, std::size_t /*shared_mem_bytes*/=0) noexcept
{
auto r = init_val;
const auto lo = amrex::lbound(box);
const auto hi = amrex::ubound(box);
for (int k = lo.z; k <= hi.z; ++k) {
for (int j = lo.y; j <= hi.y; ++j) {
for (int i = lo.x; i <= hi.x; ++i) {
f1(i,j,k,&r);
}}}
f2(r);
}

template <typename N, typename T, typename L1, typename L2,
typename M=std::enable_if_t<std::is_integral<N>::value> >
void VecReduce (N n, T const& init_val,
L1&& f1, L2&& f2, std::size_t /*shared_mem_bytes*/=0) noexcept
{
auto r = init_val;
for (N i = 0; i < n; ++i) {
f1(i,&r);
}
f2(r);
}

template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
void HostDeviceParallelFor (T n, L&& f) noexcept
{
197 changes: 0 additions & 197 deletions Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -700,123 +700,6 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
}
}

template <typename T, typename L1, typename L2>
void FabReduce (Box const& box, T const& init_val, L1&& f1, L2&& f2) noexcept
{
if (amrex::isEmpty(box)) return;
int ncells = box.numPts();
const auto lo = amrex::lbound(box);
const auto len = amrex::length(box);
auto ec = Gpu::ExecutionConfig(ncells);
ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
// If we are on default queue, block all other streams
if (Gpu::onNullStream()) Gpu::nonNullStreamSynchronize();
int nthreads_per_block = ec.numThreads.x;
int nthreads_total = nthreads_per_block * ec.numBlocks.x;
auto& q = Gpu::Device::streamQueue();
try {
q.submit([&] (sycl::handler& h) {
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
sycl::range<1>(nthreads_per_block)),
[=] (sycl::nd_item<1> item)
AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size)
{
auto r = init_val;
for (int icell = item.get_global_id(0), stride = item.get_global_range(0);
icell < ncells; icell += stride) {
int k = icell / (len.x*len.y);
int j = (icell - k*(len.x*len.y)) / len.x;
int i = (icell - k*(len.x*len.y)) - j*len.x;
i += lo.x;
j += lo.y;
k += lo.z;
f1(i,j,k,&r);
}
f2(r);
});
});
} catch (sycl::exception const& ex) {
amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
}
}

template <typename N, typename T, typename L1, typename L2,
typename M=std::enable_if_t<std::is_integral<N>::value> >
void FabReduce (Box const& box, N ncomp, T const& init_val, L1&& f1, L2&& f2) noexcept
{
if (amrex::isEmpty(box)) return;
int ncells = box.numPts();
const auto lo = amrex::lbound(box);
const auto len = amrex::length(box);
auto ec = Gpu::ExecutionConfig(ncells);
ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
// If we are on default queue, block all other streams
if (Gpu::onNullStream()) Gpu::nonNullStreamSynchronize();
int nthreads_per_block = ec.numThreads.x;
int nthreads_total = nthreads_per_block * ec.numBlocks.x;
auto& q = Gpu::Device::streamQueue();
try {
q.submit([&] (sycl::handler& h) {
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
sycl::range<1>(nthreads_per_block)),
[=] (sycl::nd_item<1> item)
AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size)
{
auto r = init_val;
for (int icell = item.get_global_id(0), stride = item.get_global_range(0);
icell < ncells; icell += stride) {
int k = icell / (len.x*len.y);
int j = (icell - k*(len.x*len.y)) / len.x;
int i = (icell - k*(len.x*len.y)) - j*len.x;
i += lo.x;
j += lo.y;
k += lo.z;
for (N n = 0; n < ncomp; ++n) {
f1(i,j,k,n,&r);
}
}
f2(r);
});
});
} catch (sycl::exception const& ex) {
amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
}
}

template <typename N, typename T, typename L1, typename L2,
typename M=std::enable_if_t<std::is_integral<N>::value> >
void VecReduce (N n, T const& init_val, L1&& f1, L2&& f2) noexcept
{
if (amrex::isEmpty(n)) return;
auto ec = Gpu::ExecutionConfig(n);
ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
// If we are on default queue, block all other streams
if (Gpu::onNullStream()) Gpu::nonNullStreamSynchronize();
int nthreads_per_block = ec.numThreads.x;
int nthreads_total = nthreads_per_block * ec.numBlocks.x;
auto& q = Gpu::Device::streamQueue();
try {
q.submit([&] (sycl::handler& h) {
sycl::accessor<T, 1, sycl::access::mode::read_write, sycl::access::target::local>
shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
sycl::range<1>(nthreads_per_block)),
[=] (sycl::nd_item<1> item)
AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size)
{
auto r = init_val;
for (N i = item.get_global_id(0), stride = item.get_global_range(0);
i < n; i += stride) {
f1(i,&r);
}
f2(r,Gpu::Handler{&item,shared_data.get_pointer()});
});
});
} catch (sycl::exception const& ex) {
amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
}
}

#else
// CUDA or HIP

@@ -1323,86 +1206,6 @@ ParallelFor (Gpu::KernelInfo const& info,
}
}

template <typename T, typename L1, typename L2>
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
FabReduce (Box const& box, T const& init_val, L1&& f1, L2&& f2) noexcept
{
if (amrex::isEmpty(box)) return;
int ncells = box.numPts();
const auto lo = amrex::lbound(box);
const auto len = amrex::length(box);
auto ec = Gpu::ExecutionConfig(ncells);
ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
auto r = init_val;
for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
icell < ncells; icell += stride) {
int k = icell / (len.x*len.y);
int j = (icell - k*(len.x*len.y)) / len.x;
int i = (icell - k*(len.x*len.y)) - j*len.x;
i += lo.x;
j += lo.y;
k += lo.z;
f1(i,j,k,&r);
}
f2(r);
});
AMREX_GPU_ERROR_CHECK();
}

template <typename N, typename T, typename L1, typename L2,
typename M=std::enable_if_t<std::is_integral<N>::value> >
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
FabReduce (Box const& box, N ncomp, T const& init_val, L1&& f1, L2&& f2) noexcept
{
if (amrex::isEmpty(box)) return;
int ncells = box.numPts();
const auto lo = amrex::lbound(box);
const auto len = amrex::length(box);
auto ec = Gpu::ExecutionConfig(ncells);
ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
auto r = init_val;
for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
icell < ncells; icell += stride) {
int k = icell / (len.x*len.y);
int j = (icell - k*(len.x*len.y)) / len.x;
int i = (icell - k*(len.x*len.y)) - j*len.x;
i += lo.x;
j += lo.y;
k += lo.z;
for (N n = 0; n < ncomp; ++n) {
f1(i,j,k,n,&r);
}
}
f2(r);
});
AMREX_GPU_ERROR_CHECK();
}

template <typename N, typename T, typename L1, typename L2,
typename M=std::enable_if_t<std::is_integral<N>::value> >
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
VecReduce (N n, T const& init_val, L1&& f1, L2&& f2) noexcept
{
if (amrex::isEmpty(n)) return;
auto ec = Gpu::ExecutionConfig(n);
ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
AMREX_ASSERT(ec.numThreads.x == AMREX_GPU_MAX_THREADS);
AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
auto r = init_val;
for (N i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
i < n; i += stride) {
f1(i,&r);
}
f2(r);
});
AMREX_GPU_ERROR_CHECK();
}

#endif

template <typename L>
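
For the box-based FabReduce pattern removed above, AMReX's general ReduceOps/ReduceData framework covers the same use case without the old atomics path. A minimal sketch under that assumption; box, fab, and sum_over_box are illustrative, and newer AMReX versions may prefer reduce_data.value(reduce_op) to finalize the result.

#include <AMReX_Array4.H>
#include <AMReX_Box.H>
#include <AMReX_Reduce.H>

using namespace amrex;

// Illustrative replacement for a FabReduce-style sum over a Box.
Real sum_over_box (Box const& box, Array4<Real const> const& fab)
{
    ReduceOps<ReduceOpSum> reduce_op;
    ReduceData<Real> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;

    // One device lambda per cell returns the cell's contribution;
    // the framework handles the grid-stride loop and the final combine.
    reduce_op.eval(box, reduce_data,
        [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple
        {
            return { fab(i,j,k) };
        });

    ReduceTuple hv = reduce_data.value();
    return amrex::get<0>(hv);
}
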
