Skip to content

Commit

Permalink
Merge pull request #37 from eth-cscs/release/0.9.13
Browse files Browse the repository at this point in the history
Release/0.9.13
  • Loading branch information
AdhocMan authored Aug 18, 2020
2 parents c35e0a5 + 70c491a commit b00fbbc
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 2 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.11 FATAL_ERROR) # 3.11 to avoid issues with OpenMP + CUDA
project(SpFFT LANGUAGES CXX VERSION 0.9.12)
project(SpFFT LANGUAGES CXX VERSION 0.9.13)
set(SPFFT_SO_VERSION 0)
set(SPFFT_VERSION ${PROJECT_VERSION})

Expand Down
2 changes: 1 addition & 1 deletion cmake/modules/FindROCFFT.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ endif()

find_library(
ROCFFT_LIBRARIES
NAMES "rocfft"
NAMES "rocfft" "rocfft-d"
HINTS ${_ROCFFT_PATHS}
PATH_SUFFIXES "rocfft/lib" "rocfft"
)
Expand Down
30 changes: 30 additions & 0 deletions src/fft/transform_real_2d_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,23 @@ class TransformReal2DGPU : public TransformGPU {
inline auto device_id() const noexcept -> int { return stream_.device_id(); }

auto forward() -> void override {
#ifdef SPFFT_ROCM
// workaround for bug with rocFFT for case 1x1xZ
if (spaceDomain_.dim_mid() == 1 && spaceDomain_.dim_inner() == 1) {
// make sure imaginary part is 0
gpu::check_status(gpu::memset_async(
static_cast<void*>(freqDomain_.data()), 0,
freqDomain_.size() * sizeof(typename decltype(freqDomain_)::ValueType), stream_.get()));
// copy real valued data into complex buffer -> from stride 1 to stride 2
gpu::check_status(gpu::memcpy_2d_async(static_cast<void*>(freqDomain_.data()), 2 * sizeof(T),
static_cast<const void*>(spaceDomain_.data()),
sizeof(T), sizeof(T), freqDomain_.dim_outer(),
gpu::flag::MemcpyDeviceToDevice, stream_.get()));
// no transform needed
return;
}
#endif

if(symm_) {
// Make sure buffer is zero before transform, such that the symmtry operation can identify
// elements, which have not been written to by the FFT
Expand All @@ -213,6 +230,19 @@ class TransformReal2DGPU : public TransformGPU {
}

auto backward() -> void override {
#ifdef SPFFT_ROCM
// workaround for bug with rocFFT for case 1x1xZ
if (spaceDomain_.dim_mid() == 1 && spaceDomain_.dim_inner() == 1) {
// copy complex data into real valued buffer -> from stride 2 to stride 1
gpu::check_status(gpu::memcpy_2d_async(static_cast<void*>(spaceDomain_.data()), sizeof(T),
static_cast<const void*>(freqDomain_.data()),
2 * sizeof(T), sizeof(T), freqDomain_.dim_outer(),
gpu::flag::MemcpyDeviceToDevice, stream_.get()));
// no transform needed
return;
}
#endif

gpu::fft::check_result(gpu::fft::set_work_area(planBackward_, workBuffer_->data()));
gpu::fft::check_result(
gpu::fft::execute(planBackward_, freqDomain_.data(), spaceDomain_.data()));
Expand Down
6 changes: 6 additions & 0 deletions src/gpu_util/gpu_pointer_translation.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,13 @@ auto translate_gpu_pointer(const T* inputPointer) -> std::pair<const T*, const T
const T* devicePtr = nullptr;
return {inputPointer, devicePtr};
} else {
#ifdef SPFFT_ROCM
// Bug with HIP (ROCm 3.5) causes attr.devicePointer to be set to the start of allocated memory
// -> pointers with offset are not correctly returned
return {static_cast<const T*>(nullptr), inputPointer};
#else
return {static_cast<const T*>(attr.hostPointer), static_cast<const T*>(attr.devicePointer)};
#endif
}
}

Expand Down
11 changes: 11 additions & 0 deletions src/gpu_util/gpu_runtime_api.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ constexpr auto MemoryTypeManaged = GPU_PREFIX(MemoryTypeManaged);

constexpr auto MemcpyHostToDevice = GPU_PREFIX(MemcpyHostToDevice);
constexpr auto MemcpyDeviceToHost = GPU_PREFIX(MemcpyDeviceToHost);
constexpr auto MemcpyDeviceToDevice = GPU_PREFIX(MemcpyDeviceToDevice);

constexpr auto EventDefault = GPU_PREFIX(EventDefault);
constexpr auto EventBlockingSync = GPU_PREFIX(EventBlockingSync);
Expand Down Expand Up @@ -179,11 +180,21 @@ inline auto memcpy(ARGS&&... args) -> StatusType {
return GPU_PREFIX(Memcpy)(std::forward<ARGS>(args)...);
}

template <typename... ARGS>
inline auto memcpy_2d(ARGS&&... args) -> StatusType {
return GPU_PREFIX(Memcpy2D)(std::forward<ARGS>(args)...);
}

template <typename... ARGS>
inline auto memcpy_async(ARGS&&... args) -> StatusType {
return GPU_PREFIX(MemcpyAsync)(std::forward<ARGS>(args)...);
}

template <typename... ARGS>
inline auto memcpy_2d_async(ARGS&&... args) -> StatusType {
return GPU_PREFIX(Memcpy2DAsync)(std::forward<ARGS>(args)...);
}

template <typename... ARGS>
inline auto get_device(ARGS&&... args) -> StatusType {
return GPU_PREFIX(GetDevice)(std::forward<ARGS>(args)...);
Expand Down

0 comments on commit b00fbbc

Please sign in to comment.