-
Notifications
You must be signed in to change notification settings - Fork 368
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add the capability for `ParallelFor` to safely perform reductions using `deviceReduceSum`, `Min`, etc. The user passes `Gpu::KernelInfo{}.setReduction(true)` to notify `ParallelFor` that this is a parallel reduction, and gives `ParallelFor` a callable that takes a `Gpu::Handler`. A `Gpu::Handler` is needed to call `deviceReduceSum`. Also add a `Gpu::Buffer` class, whose data pointer can be used as a device destination for `deviceReduceSum`. It also has a `copyToHost` method to copy the device result back to the host. See `Tutorials/GPU/ParallelReduce` for examples of how to use `ParallelFor` for reduction. Also note that the reduction function is safe to call from OpenMP CPU threads. Thus the same code can run with OpenMP when it is not built for GPU. Co-authored-by: Andrew Myers <atmyers2@gmail.com>
- Loading branch information
1 parent
c8cdfa6
commit 10ed0e0
Showing
21 changed files
with
963 additions
and
177 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
#ifndef AMREX_GPU_DEVICE_BUFFER_H_ | ||
#define AMREX_GPU_DEVICE_BUFFER_H_ | ||
#include <AMReX_Config.H> | ||
|
||
#include <AMReX_Arena.H> | ||
#include <AMReX_TypeTraits.H> | ||
#include <AMReX_GpuDevice.H> | ||
#include <cstring> | ||
#include <cstdlib> | ||
#include <initializer_list> | ||
#include <memory> | ||
|
||
namespace amrex { | ||
namespace Gpu { | ||
|
||
/**
 * \brief Fixed-size buffer of trivially copyable elements that keeps a host
 *        copy and, in GPU builds, optionally a device copy of the same data.
 *
 * The host copy is allocated from The_Pinned_Arena() when AMREX_USE_GPU is
 * defined and with std::malloc otherwise.  A device copy is allocated from
 * The_Arena() only when AMREX_USE_GPU is defined and Gpu::inLaunchRegion()
 * is true at construction time; it is filled from the host copy with
 * Gpu::htod_memcpy_async.
 *
 * data() returns the device pointer when a device copy exists and the host
 * pointer otherwise.  copyToHost() brings the device contents back into the
 * host copy.
 *
 * The class owns its allocations and is neither copyable nor movable.
 *
 * \tparam T element type; must satisfy AMREX_IS_TRIVIALLY_COPYABLE(T)
 *           because the contents are transferred with raw memory copies.
 */
template <typename T, typename std::enable_if<AMREX_IS_TRIVIALLY_COPYABLE(T),int>::type = 0>
class Buffer
{
public:

    /**
     * \brief Construct from a list of values.
     *
     * Delegates to the pointer/size constructor so that both constructors
     * share a single allocation and upload path.  An empty list produces an
     * empty buffer with null data pointers.
     *
     * \param init values copied into the buffer
     */
    Buffer (std::initializer_list<T> init)
        : Buffer(init.begin(), init.size())
    {}

    /**
     * \brief Construct by copying n elements from host memory.
     *
     * \param h_p pointer to at least n readable elements in host memory;
     *            not dereferenced when n is 0
     * \param n   number of elements to copy
     */
    Buffer (T const* h_p, const std::size_t n)
        : m_size(n)
    {
        // Nothing to allocate: both pointers keep their nullptr defaults.
        if (m_size == 0) return;

        const std::size_t nbytes = m_size*sizeof(T);
#ifdef AMREX_USE_GPU
        h_data = static_cast<T*>(The_Pinned_Arena()->alloc(nbytes));
#else
        h_data = static_cast<T*>(std::malloc(nbytes));
#endif
        std::memcpy(h_data, h_p, nbytes);
#ifdef AMREX_USE_GPU
        if (Gpu::inLaunchRegion())
        {
            d_data = static_cast<T*>(The_Arena()->alloc(nbytes));
            // The upload is asynchronous and reads from h_data, which this
            // object keeps alive until clear() is called.
            Gpu::htod_memcpy_async(d_data, h_data, nbytes);
#ifdef AMREX_USE_DPCPP
            // NOTE(review): presumably the null stream gives no ordering
            // guarantee with later work under DPC++ -- confirm.
            if (Gpu::onNullStream()) Gpu::synchronize();
#endif
        }
#endif
    }

    //! Release the host and device allocations.
    ~Buffer () { clear(); }

    Buffer (Buffer const&) = delete;
    Buffer (Buffer &&) = delete;
    void operator= (Buffer const&) = delete;
    void operator= (Buffer &&) = delete;

    //! Device pointer if a device copy exists, otherwise the host pointer.
    T const* data () const noexcept { return (d_data != nullptr) ? d_data : h_data; }
    //! Device pointer if a device copy exists, otherwise the host pointer.
    T* data () noexcept { return (d_data != nullptr) ? d_data : h_data; }

    //! Pointer to the host copy (nullptr for an empty or cleared buffer).
    T const* hostData () const noexcept { return h_data; }
    //! Pointer to the host copy (nullptr for an empty or cleared buffer).
    T* hostData () noexcept { return h_data; }
    //! Misspelled name of the non-const hostData(); kept so that existing
    //! callers continue to compile.  New code should call hostData().
    T* hostDatat () noexcept { return hostData(); }

    //! Number of elements given at construction.
    std::size_t size () const noexcept { return m_size; }

    /**
     * \brief Free the host and device allocations and null both pointers.
     *
     * Calling it again afterwards is harmless.  The value reported by
     * size() is not modified.
     */
    void clear ()
    {
#ifdef AMREX_USE_GPU
        if (d_data) The_Arena()->free(d_data);
        if (h_data) The_Pinned_Arena()->free(h_data);
#else
        std::free(h_data);
#endif
        d_data = nullptr;
        h_data = nullptr;
    }

    /**
     * \brief Copy the device contents into the host copy.
     *
     * When a device copy exists, the data is transferred with
     * Gpu::dtoh_memcpy_async followed by Gpu::streamSynchronize(), so the
     * transfer has been waited on before this function returns.  Without a
     * device copy this only returns the host pointer.
     *
     * \return pointer to the host copy
     */
    T* copyToHost ()
    {
#ifdef AMREX_USE_GPU
        if (d_data)
        {
            Gpu::dtoh_memcpy_async(h_data, d_data, m_size*sizeof(T));
            Gpu::streamSynchronize();
        }
#endif
        return h_data;
    }

private:
    std::size_t m_size;      //!< number of elements
    T* d_data = nullptr;     //!< device copy, or nullptr if none was made
    T* h_data = nullptr;     //!< host copy, or nullptr if empty or cleared
};
|
||
} | ||
} | ||
|
||
#endif |
Oops, something went wrong.