Remove VecReduce and FabReduce (AMReX-Codes#2091)
They were used for the old atomics-based reduction approach.

Only PeleC uses VecReduce. The use of VecReduce there can be replaced with Reduce::Sum (a sketch follows the file summary below).
WeiqunZhang authored Jun 7, 2021
1 parent 79678dd commit 53614b4
Showing 2 changed files with 0 additions and 242 deletions.
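
The commit message points to Reduce::Sum as the replacement for VecReduce in PeleC. Below is a minimal sketch of that migration, assuming a PeleC-style sum over a device array; the names v, n, and sum_device_array are illustrative, and the exact Reduce::Sum overload (explicit result type versus an init-value argument) may vary across AMReX versions.

#include <AMReX_Gpu.H>
#include <AMReX_Reduce.H>

using namespace amrex;

// Illustrative replacement for the removed two-lambda VecReduce pattern:
// a single device lambda returns each element's contribution and
// Reduce::Sum performs the reduction (no atomics, no Gpu::Handler).
Real sum_device_array (Real const* v, int n)
{
    return Reduce::Sum<Real>(n,
        [=] AMREX_GPU_DEVICE (int i) -> Real { return v[i]; });
}
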
45 changes: 0 additions & 45 deletions Src/Base/AMReX_GpuLaunchFunctsC.H
@@ -316,51 +316,6 @@ void ParallelFor (Gpu::KernelInfo const&,
box3, ncomp3, std::forward<L3>(f3));
}

template <typename N, typename T, typename L1, typename L2,
typename M=std::enable_if_t<std::is_integral<N>::value> >
void FabReduce (Box const& box, N ncomp, T const& init_val,
L1&& f1, L2&& f2, std::size_t /*shared_mem_bytes*/=0) noexcept
{
auto r = init_val;
const auto lo = amrex::lbound(box);
const auto hi = amrex::ubound(box);
for (N n = 0; n < ncomp; ++n) {
for (int k = lo.z; k <= hi.z; ++k) {
for (int j = lo.y; j <= hi.y; ++j) {
for (int i = lo.x; i <= hi.x; ++i) {
f1(i,j,k,n,&r);
}}}
}
f2(r);
}

template <typename T, typename L1, typename L2>
void FabReduce (Box const& box, T const& init_val,
L1&& f1, L2&& f2, std::size_t /*shared_mem_bytes*/=0) noexcept
{
auto r = init_val;
const auto lo = amrex::lbound(box);
const auto hi = amrex::ubound(box);
for (int k = lo.z; k <= hi.z; ++k) {
for (int j = lo.y; j <= hi.y; ++j) {
for (int i = lo.x; i <= hi.x; ++i) {
f1(i,j,k,&r);
}}}
f2(r);
}

template <typename N, typename T, typename L1, typename L2,
typename M=std::enable_if_t<std::is_integral<N>::value> >
void VecReduce (N n, T const& init_val,
L1&& f1, L2&& f2, std::size_t /*shared_mem_bytes*/=0) noexcept
{
auto r = init_val;
for (N i = 0; i < n; ++i) {
f1(i,&r);
}
f2(r);
}

template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
void HostDeviceParallelFor (T n, L&& f) noexcept
{
197 changes: 0 additions & 197 deletions Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -700,123 +700,6 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
}
}

template <typename T, typename L1, typename L2>
void FabReduce (Box const& box, T const& init_val, L1&& f1, L2&& f2) noexcept
{
if (amrex::isEmpty(box)) return;
int ncells = box.numPts();
const auto lo = amrex::lbound(box);
const auto len = amrex::length(box);
auto ec = Gpu::ExecutionConfig(ncells);
ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
// If we are on default queue, block all other streams
if (Gpu::onNullStream()) Gpu::nonNullStreamSynchronize();
int nthreads_per_block = ec.numThreads.x;
int nthreads_total = nthreads_per_block * ec.numBlocks.x;
auto& q = Gpu::Device::streamQueue();
try {
q.submit([&] (sycl::handler& h) {
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
sycl::range<1>(nthreads_per_block)),
[=] (sycl::nd_item<1> item)
AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size)
{
auto r = init_val;
for (int icell = item.get_global_id(0), stride = item.get_global_range(0);
icell < ncells; icell += stride) {
int k = icell / (len.x*len.y);
int j = (icell - k*(len.x*len.y)) / len.x;
int i = (icell - k*(len.x*len.y)) - j*len.x;
i += lo.x;
j += lo.y;
k += lo.z;
f1(i,j,k,&r);
}
f2(r);
});
});
} catch (sycl::exception const& ex) {
amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
}
}

template <typename N, typename T, typename L1, typename L2,
typename M=std::enable_if_t<std::is_integral<N>::value> >
void FabReduce (Box const& box, N ncomp, T const& init_val, L1&& f1, L2&& f2) noexcept
{
if (amrex::isEmpty(box)) return;
int ncells = box.numPts();
const auto lo = amrex::lbound(box);
const auto len = amrex::length(box);
auto ec = Gpu::ExecutionConfig(ncells);
ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
// If we are on default queue, block all other streams
if (Gpu::onNullStream()) Gpu::nonNullStreamSynchronize();
int nthreads_per_block = ec.numThreads.x;
int nthreads_total = nthreads_per_block * ec.numBlocks.x;
auto& q = Gpu::Device::streamQueue();
try {
q.submit([&] (sycl::handler& h) {
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
sycl::range<1>(nthreads_per_block)),
[=] (sycl::nd_item<1> item)
AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size)
{
auto r = init_val;
for (int icell = item.get_global_id(0), stride = item.get_global_range(0);
icell < ncells; icell += stride) {
int k = icell / (len.x*len.y);
int j = (icell - k*(len.x*len.y)) / len.x;
int i = (icell - k*(len.x*len.y)) - j*len.x;
i += lo.x;
j += lo.y;
k += lo.z;
for (N n = 0; n < ncomp; ++n) {
f1(i,j,k,n,&r);
}
}
f2(r);
});
});
} catch (sycl::exception const& ex) {
amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
}
}

template <typename N, typename T, typename L1, typename L2,
typename M=std::enable_if_t<std::is_integral<N>::value> >
void VecReduce (N n, T const& init_val, L1&& f1, L2&& f2) noexcept
{
if (amrex::isEmpty(n)) return;
auto ec = Gpu::ExecutionConfig(n);
ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
// If we are on default queue, block all other streams
if (Gpu::onNullStream()) Gpu::nonNullStreamSynchronize();
int nthreads_per_block = ec.numThreads.x;
int nthreads_total = nthreads_per_block * ec.numBlocks.x;
auto& q = Gpu::Device::streamQueue();
try {
q.submit([&] (sycl::handler& h) {
sycl::accessor<T, 1, sycl::access::mode::read_write, sycl::access::target::local>
shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
sycl::range<1>(nthreads_per_block)),
[=] (sycl::nd_item<1> item)
AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size)
{
auto r = init_val;
for (N i = item.get_global_id(0), stride = item.get_global_range(0);
i < n; i += stride) {
f1(i,&r);
}
f2(r,Gpu::Handler{&item,shared_data.get_pointer()});
});
});
} catch (sycl::exception const& ex) {
amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
}
}

#else
// CUDA or HIP

@@ -1323,86 +1206,6 @@ ParallelFor (Gpu::KernelInfo const& info,
}
}

template <typename T, typename L1, typename L2>
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
FabReduce (Box const& box, T const& init_val, L1&& f1, L2&& f2) noexcept
{
if (amrex::isEmpty(box)) return;
int ncells = box.numPts();
const auto lo = amrex::lbound(box);
const auto len = amrex::length(box);
auto ec = Gpu::ExecutionConfig(ncells);
ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
auto r = init_val;
for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
icell < ncells; icell += stride) {
int k = icell / (len.x*len.y);
int j = (icell - k*(len.x*len.y)) / len.x;
int i = (icell - k*(len.x*len.y)) - j*len.x;
i += lo.x;
j += lo.y;
k += lo.z;
f1(i,j,k,&r);
}
f2(r);
});
AMREX_GPU_ERROR_CHECK();
}

template <typename N, typename T, typename L1, typename L2,
typename M=std::enable_if_t<std::is_integral<N>::value> >
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
FabReduce (Box const& box, N ncomp, T const& init_val, L1&& f1, L2&& f2) noexcept
{
if (amrex::isEmpty(box)) return;
int ncells = box.numPts();
const auto lo = amrex::lbound(box);
const auto len = amrex::length(box);
auto ec = Gpu::ExecutionConfig(ncells);
ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
auto r = init_val;
for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
icell < ncells; icell += stride) {
int k = icell / (len.x*len.y);
int j = (icell - k*(len.x*len.y)) / len.x;
int i = (icell - k*(len.x*len.y)) - j*len.x;
i += lo.x;
j += lo.y;
k += lo.z;
for (N n = 0; n < ncomp; ++n) {
f1(i,j,k,n,&r);
}
}
f2(r);
});
AMREX_GPU_ERROR_CHECK();
}

template <typename N, typename T, typename L1, typename L2,
typename M=std::enable_if_t<std::is_integral<N>::value> >
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
VecReduce (N n, T const& init_val, L1&& f1, L2&& f2) noexcept
{
if (amrex::isEmpty(n)) return;
auto ec = Gpu::ExecutionConfig(n);
ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
AMREX_ASSERT(ec.numThreads.x == AMREX_GPU_MAX_THREADS);
AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
auto r = init_val;
for (N i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
i < n; i += stride) {
f1(i,&r);
}
f2(r);
});
AMREX_GPU_ERROR_CHECK();
}

#endif

template <typename L>
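
For the box-based FabReduce pattern removed above, AMReX's general ReduceOps/ReduceData framework covers the same use case without the old atomics path. A minimal sketch under that assumption; box, fab, and sum_over_box are illustrative, and newer AMReX versions may prefer reduce_data.value(reduce_op) to finalize the result.

#include <AMReX_Array4.H>
#include <AMReX_Box.H>
#include <AMReX_Reduce.H>

using namespace amrex;

// Illustrative replacement for a FabReduce-style sum over a Box.
Real sum_over_box (Box const& box, Array4<Real const> const& fab)
{
    ReduceOps<ReduceOpSum> reduce_op;
    ReduceData<Real> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;

    // One device lambda per cell returns the cell's contribution;
    // the framework handles the grid-stride loop and the final combine.
    reduce_op.eval(box, reduce_data,
        [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple
        {
            return { fab(i,j,k) };
        });

    ReduceTuple hv = reduce_data.value();
    return amrex::get<0>(hv);
}
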
