Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Release/v1.0.0 #39

Merged
merged 18 commits into from
Jan 14, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ jobs:
cd ${GITHUB_WORKSPACE}
mkdir -p build
cd build
/root/cmake-3.11.4-Linux-x86_64/bin/cmake .. -DSPFFT_BUILD_TESTS=ON -DSPFFT_GPU_BACKEND=ROCM
/root/cmake-3.11.4-Linux-x86_64/bin/cmake .. -DSPFFT_BUILD_TESTS=ON -DSPFFT_GPU_BACKEND=ROCM -DCMAKE_PREFIX_PATH=/opt/rocm
make -j2


107 changes: 62 additions & 45 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
cmake_minimum_required(VERSION 3.11 FATAL_ERROR) # 3.11 to avoid issues with OpenMP + CUDA
project(SpFFT LANGUAGES CXX VERSION 0.9.13)
set(SPFFT_SO_VERSION 0)
project(SpFFT LANGUAGES CXX VERSION 1.0.0)
set(SPFFT_SO_VERSION 1)
set(SPFFT_VERSION ${PROJECT_VERSION})

# allow {module}_ROOT variables to be set
if(POLICY CMP0074)
cmake_policy(SET CMP0074 NEW)
endif()

# use INTERFACE_LINK_LIBRARIES property if available
if(POLICY CMP0022)
cmake_policy(SET CMP0022 NEW)
# Initialize CMAKE_CUDA_ARCHITECTURES through nvcc if possible
if(POLICY CMP0104)
cmake_policy(SET CMP0104 NEW)
endif()

# set default build type to RELEASE
Expand Down Expand Up @@ -42,6 +42,11 @@ set_property(CACHE SPFFT_GPU_BACKEND PROPERTY STRINGS
"OFF" "CUDA" "ROCM"
)

set(SPFFT_FFTW_LIB "AUTO" CACHE STRING "Library providing a FFTW interface")
set_property(CACHE SPFFT_FFTW_LIB PROPERTY STRINGS
"AUTO" "FFTW" "MKL"
)

# Get GNU standard install prefixes
include(GNUInstallDirs)

Expand All @@ -63,7 +68,6 @@ set(SPFFT_DEFINITIONS)
set(SPFFT_EXTERNAL_COMPILE_OPTIONS)
set(SPFFT_LIBS)
set(SPFFT_EXTERNAL_LIBS)
set(SPFFT_INTERFACE_LIBS)
set(SPFFT_INTERFACE_INCLUDE_DIRS)
set(SPFFT_INCLUDE_DIRS)
set(SPFFT_EXTERNAL_INCLUDE_DIRS)
Expand Down Expand Up @@ -102,82 +106,95 @@ endif()
# CUDA
if(SPFFT_CUDA)
enable_language(CUDA)
find_library(CUDA_CUDART_LIBRARY cudart PATHS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
find_library(CUDA_CUFFT_LIBRARY cufft PATHS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
list(APPEND SPFFT_EXTERNAL_LIBS ${CUDA_CUDART_LIBRARY} ${CUDA_CUFFT_LIBRARY})
list(APPEND SPFFT_EXTERNAL_INCLUDE_DIRS ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})

if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.17.0")
find_package(CUDAToolkit REQUIRED)
else()
find_library(CUDA_CUDART_LIBRARY cudart PATHS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
find_library(CUDA_CUFFT_LIBRARY cufft PATHS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
if(NOT TARGET CUDA::cudart)
add_library(CUDA::cudart INTERFACE IMPORTED)
endif()
set_property(TARGET CUDA::cudart PROPERTY INTERFACE_LINK_LIBRARIES ${CUDA_CUDART_LIBRARY})
set_property(TARGET CUDA::cudart PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
if(NOT TARGET CUDA::cufft)
add_library(CUDA::cufft INTERFACE IMPORTED)
endif()
set_property(TARGET CUDA::cufft PROPERTY INTERFACE_LINK_LIBRARIES ${CUDA_CUFFT_LIBRARY})
set_property(TARGET CUDA::cufft PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
endif()

list(APPEND SPFFT_EXTERNAL_LIBS CUDA::cudart CUDA::cufft)
endif()

# ROCM
if(SPFFT_ROCM)
find_package(hip CONFIG REQUIRED)
find_package(rocfft CONFIG REQUIRED)
list(APPEND SPFFT_EXTERNAL_LIBS hip::host roc::rocfft)

# FindHIP module provides compilation command for GPU code
find_package(HIP MODULE REQUIRED)
if(NOT HIP_HCC_FLAGS)
message(STATUS "Using default AMD gpu targets: gfx803, gfx900, gfx906. Set HIP_HCC_FLAGS to override.")
set(HIP_HCC_FLAGS ${HIP_HCC_FLAGS} --amdgpu-target=gfx803 --amdgpu-target=gfx900 --amdgpu-target=gfx906)
endif()
find_package(HIP REQUIRED)
find_package(HIPLIBS REQUIRED)
find_package(ROCFFT REQUIRED)
list(APPEND SPFFT_EXTERNAL_LIBS HIPLIBS::hiplibs ROCFFT::rocfft)
list(APPEND SPFFT_EXTERNAL_COMPILE_OPTIONS -D__HIP_PLATFORM_HCC__) # required for parsing HIP headers with another compiler
endif()


if(SPFFT_MPI)
find_package(MPI COMPONENTS CXX REQUIRED)
list(APPEND SPFFT_EXTERNAL_LIBS MPI::MPI_CXX)
# always add MPI to interface libraries, because mpi.h is included in public header files
if(SPFFT_STATIC)
list(APPEND SPFFT_INTERFACE_LIBS ${MPI_CXX_LIBRARIES})
endif()
list(APPEND SPFFT_INTERFACE_INCLUDE_DIRS ${MPI_CXX_INCLUDE_DIRS})
endif()

if(SPFFT_OMP)
find_package(OpenMP COMPONENTS CXX REQUIRED)
list(APPEND SPFFT_EXTERNAL_LIBS OpenMP::OpenMP_CXX)
if(SPFFT_STATIC)
list(APPEND SPFFT_INTERFACE_LIBS ${OpenMP_CXX_LIBRARIES})
endif()
endif()

if(SPFFT_GPU_DIRECT)
message(STATUS "GPU Direct support enabled: Additional environment variables might have to be set before execution. (e.g \"export MPICH_RDMA_ENABLED_CUDA=1\")")
endif()



# Use MKL if available, otherwise require FFTW3
if(UNIX AND NOT APPLE)
# prefer static MKL in Linux. Together with "-Wl,--exclude-libs,ALL",
# symbols are not visible for linking afterwards and no conflicts with other MKL versions of other libraries should exist.
set(_TMP_SAVE ${CMAKE_FIND_LIBRARY_SUFFIXES})
set(CMAKE_FIND_LIBRARY_SUFFIXES .a .so)
endif()
find_package(MKLSequential)
if(UNIX AND NOT APPLE)
set(CMAKE_FIND_LIBRARY_SUFFIXES ${_TMP_SAVE})
unset(_TMP_SAVE)
# FFTW library must be found if not set to AUTO
set(_SPFFT_FIND_FFTW_LIB_OPTION)
if(NOT ${SPFFT_FFTW_LIB} STREQUAL "AUTO")
set(_SPFFT_FIND_FFTW_LIB_OPTION REQUIRED)
endif()

if(MKLSequential_FOUND)
list(APPEND SPFFT_EXTERNAL_LIBS MKL::Sequential)
if(SPFFT_STATIC)
list(APPEND SPFFT_INTERFACE_LIBS ${MKLSequential_LIBRARIES})
set(SPFFT_MKL OFF)
# Look for MKL first
if(${SPFFT_FFTW_LIB} STREQUAL "AUTO" OR ${SPFFT_FFTW_LIB} STREQUAL "MKL")
# Use MKL if available, otherwise require FFTW3
if(UNIX AND NOT APPLE)
# prefer static MKL in Linux. Together with "-Wl,--exclude-libs,ALL",
# symbols are not visible for linking afterwards and no conflicts with other MKL versions of other libraries should exist.
set(_TMP_SAVE ${CMAKE_FIND_LIBRARY_SUFFIXES})
set(CMAKE_FIND_LIBRARY_SUFFIXES .a .so)
endif()
list(APPEND SPFFT_EXTERNAL_PKG_PACKAGES mkl-dynamic-lp64-seq)
else()
find_package(MKLSequential ${_SPFFT_FIND_FFTW_LIB_OPTION})
if(UNIX AND NOT APPLE)
set(CMAKE_FIND_LIBRARY_SUFFIXES ${_TMP_SAVE})
unset(_TMP_SAVE)
endif()
if(TARGET MKL::Sequential)
list(APPEND SPFFT_EXTERNAL_LIBS MKL::Sequential)
list(APPEND SPFFT_EXTERNAL_PKG_PACKAGES mkl-dynamic-lp64-seq)
set(SPFFT_MKL ON)
endif()
endif()

# Look for FFTW library if required
if(NOT TARGET MKL::Sequential)
find_package(FFTW REQUIRED)
list(APPEND SPFFT_EXTERNAL_LIBS FFTW::FFTW)
if(SPFFT_STATIC)
list(APPEND SPFFT_INTERFACE_LIBS ${FFTW_LIBRARIES})
endif()
if(SPFFT_SINGLE_PRECISION AND NOT FFTW_FLOAT_FOUND)
message(FATAL_ERROR "FFTW library with single precision support NOT FOUND. Disable SPFFT_SINGLE_PRECISION or provide path to library.")
endif()
list(APPEND SPFFT_EXTERNAL_PKG_PACKAGES fftw3)
endif()


if(SPFFT_BUILD_TESTS)
# enable timing with testing
set(SPFFT_TIMING ON)
Expand Down
81 changes: 51 additions & 30 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,12 @@ To allow for pre-allocation and reuse of memory, the design is based on two clas
- **Grid**: Provides memory for transforms up to a given size.
- **Transform**: Created with information on sparse input data and is associated with a *Grid*. Maximum size is limited by *Grid* dimensions. Internal reference counting to *Grid* objects guarantees a valid state until *Transform* object destruction.

The user provides memory for storing sparse frequency domain data, while a *Transform* provides memory for space domain data. This implies, that executing a *Transform* will override the space domain data of all other *Transforms* associated with the same *Grid*.
A transform can be computed in-place and out-of-place. Additionally, an internally allocated work buffer can optionally be used for input / output of space domain data.

### New Features in v1.0
- Support for externally allocated memory for space domain data including in-place and out-of-place transforms
- Optional asynchronous computation when using GPUs
- Simplified / direct transform handle creation if no resource reuse through grid handles is required

## Documentation
Documentation can be found [here](https://spfft.readthedocs.io/en/latest/).
Expand All @@ -41,7 +46,7 @@ Documentation can be found [here](https://spfft.readthedocs.io/en/latest/).
- For multi-threading: OpenMP support by the compiler
- For compilation with GPU support:
- CUDA 9.0 and later for Nvidia hardware
- ROCm 2.6 and later for AMD hardware
- ROCm 3.5 and later for AMD hardware

## Installation
The build system follows the standard CMake workflow. Example:
Expand All @@ -53,19 +58,18 @@ make -j8 install
```

### CMake options
| Option | Default | Description |
|------------------------|---------|--------------------------------------------------|
| SPFFT_MPI | ON | Enable MPI support |
| SPFFT_OMP | ON | Enable multi-threading with OpenMP |
| SPFFT_GPU_BACKEND | OFF | Select GPU backend. Can be OFF, CUDA or ROCM |
| SPFFT_GPU_DIRECT | OFF | Use GPU aware MPI with GPUDirect |
| SPFFT_SINGLE_PRECISION | OFF | Enable single precision support |
| SPFFT_STATIC | OFF | Build as static library |
| SPFFT_BUILD_TESTS | OFF | Build test executables for development purposes |
| SPFFT_INSTALL | ON | Add library to install target |
| SPFFT_FORTRAN | OFF | Build Fortran interface module |


| Option | Default | Description |
|------------------------|---------|--------------------------------------------------------------|
| SPFFT_MPI | ON | Enable MPI support |
| SPFFT_OMP | ON | Enable multi-threading with OpenMP |
| SPFFT_GPU_BACKEND | OFF | Select GPU backend. Can be OFF, CUDA or ROCM |
| SPFFT_GPU_DIRECT | OFF | Use GPU aware MPI with GPUDirect |
| SPFFT_SINGLE_PRECISION | OFF | Enable single precision support |
| SPFFT_STATIC | OFF | Build as static library |
| SPFFT_FFTW_LIB | AUTO | Library providing a FFTW interface. Can be AUTO, MKL or FFTW |
| SPFFT_BUILD_TESTS | OFF | Build test executables for development purposes |
| SPFFT_INSTALL | ON | Add library to install target |
| SPFFT_FORTRAN | OFF | Build Fortran interface module |


## Examples
Expand All @@ -88,21 +92,21 @@ int main(int argc, char** argv) {
// Use default OpenMP value
const int numThreads = -1;

// use all elements in this example.
// Use all elements in this example.
const int numFrequencyElements = dimX * dimY * dimZ;

// Slice length in space domain. Equivalent to dimZ for non-distributed case.
const int localZLength = dimZ;

// interleaved complex numbers
// Interleaved complex numbers
std::vector<double> frequencyElements;
frequencyElements.reserve(2 * numFrequencyElements);

// indices of frequency elements
// Indices of frequency elements
std::vector<int> indices;
indices.reserve(dimX * dimY * dimZ * 3);

// initialize frequency domain values and indices
// Initialize frequency domain values and indices
double initValue = 0.0;
for (int xIndex = 0; xIndex < dimX; ++xIndex) {
for (int yIndex = 0; yIndex < dimY; ++yIndex) {
Expand All @@ -126,31 +130,48 @@ int main(int argc, char** argv) {
std::cout << frequencyElements[2 * i] << ", " << frequencyElements[2 * i + 1] << std::endl;
}

// create local Grid. For distributed computations, a MPI Communicator has to be provided
// Create local Grid. For distributed computations, a MPI Communicator has to be provided
spfft::Grid grid(dimX, dimY, dimZ, dimX * dimY, SPFFT_PU_HOST, numThreads);

// create transform
// Create transform.
// Note: A transform handle can be created without a grid if no resource sharing is desired.
spfft::Transform transform =
grid.create_transform(SPFFT_PU_HOST, SPFFT_TRANS_C2C, dimX, dimY, dimZ, localZLength,
numFrequencyElements, SPFFT_INDEX_TRIPLETS, indices.data());

// Get pointer to space domain data. Alignment fulfills requirements for std::complex.
// Can also be read as std::complex elements (guaranteed by the standard to be binary compatible
// since C++11).
double* spaceDomain = transform.space_domain_data(SPFFT_PU_HOST);

// transform backward
///////////////////////////////////////////////////
// Option A: Reuse internal buffer for space domain
///////////////////////////////////////////////////

// Transform backward
transform.backward(frequencyElements.data(), SPFFT_PU_HOST);

// Get pointer to buffer with space domain data. Is guaranteed to be castable to a valid
// std::complex pointer. Using the internal working buffer as input / output can help reduce
// memory usage.
double* spaceDomainPtr = transform.space_domain_data(SPFFT_PU_HOST);

std::cout << std::endl << "After backward transform:" << std::endl;
for (int i = 0; i < transform.local_slice_size(); ++i) {
std::cout << spaceDomain[2 * i] << ", " << spaceDomain[2 * i + 1] << std::endl;
std::cout << spaceDomainPtr[2 * i] << ", " << spaceDomainPtr[2 * i + 1] << std::endl;
}

// transform forward
transform.forward(SPFFT_PU_HOST, frequencyElements.data(), SPFFT_NO_SCALING);
/////////////////////////////////////////////////
// Option B: Use external buffer for space domain
/////////////////////////////////////////////////

std::vector<double> spaceDomainVec(2 * transform.local_slice_size());

// Transform backward
transform.backward(frequencyElements.data(), spaceDomainVec.data());

// Transform forward
transform.forward(spaceDomainVec.data(), frequencyElements.data(), SPFFT_NO_SCALING);

// Note: In-place transforms are also supported by passing the same pointer for input and output.

std::cout << std::endl << "After forward transform (without scaling):" << std::endl;
std::cout << std::endl << "After forward transform (without normalization):" << std::endl;
for (int i = 0; i < numFrequencyElements; ++i) {
std::cout << frequencyElements[2 * i] << ", " << frequencyElements[2 * i + 1] << std::endl;
}
Expand Down
23 changes: 23 additions & 0 deletions cmake/SpFFTSharedConfig.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,34 @@ set(SPFFT_MPI @SPFFT_MPI@)
set(SPFFT_STATIC @SPFFT_STATIC@)
set(SPFFT_GPU_DIRECT @SPFFT_GPU_DIRECT@)
set(SPFFT_SINGLE_PRECISION @SPFFT_SINGLE_PRECISION@)
set(SPFFT_FFTW_LIB @SPFFT_FFTW_LIB@)
set(SPFFT_GPU_BACKEND @SPFFT_GPU_BACKEND@)
set(SPFFT_CUDA @SPFFT_CUDA@)
set(SPFFT_ROCM @SPFFT_ROCM@)
set(SPFFT_MKL @SPFFT_MKL@)

include(CMakeFindDependencyMacro)

# add version of package
include("${CMAKE_CURRENT_LIST_DIR}/SpFFTSharedConfigVersion.cmake")

# add library target
include("${CMAKE_CURRENT_LIST_DIR}/SpFFTSharedTargets.cmake")

# MPI is the only public dependency of SpFFT, because the MPI header is
# included from the public header file.
if(SPFFT_MPI)
  # Only look for MPI for languages that are enabled, i.e. whose matching
  # header is possibly used.
  get_property(_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES)

  # CXX is handled before C, one language at a time: find the MPI component
  # and attach its imported target to the SpFFT interface.
  foreach(_spfft_mpi_lang IN ITEMS CXX C)
    if("${_spfft_mpi_lang}" IN_LIST _LANGUAGES)
      find_dependency(MPI COMPONENTS ${_spfft_mpi_lang})
      target_link_libraries(SpFFT::spfft INTERFACE MPI::MPI_${_spfft_mpi_lang})
    endif()
  endforeach()

  # NOTE: Fortran module does not depend on MPI
endif()
Loading