Skip to content

Commit

Permalink
Merge branch 'mr_shifted_periodicity' into 'master'
Browse files Browse the repository at this point in the history
Shifted Periodicity

See merge request walberla/walberla!680
  • Loading branch information
Girish Kumatagi committed Sep 9, 2024
2 parents 0d2ceae + eb3f1c0 commit 9da0b10
Show file tree
Hide file tree
Showing 15 changed files with 1,571 additions and 27 deletions.
2 changes: 1 addition & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ add_subdirectory( blockforest )
add_subdirectory( boundary )
add_subdirectory( communication )
add_subdirectory( core )
add_subdirectory(gpu)
add_subdirectory( gpu )
add_subdirectory( domain_decomposition )
add_subdirectory( executiontree )
if ( WALBERLA_BUILD_WITH_FFT AND FFTW3_FOUND )
Expand Down
1 change: 1 addition & 0 deletions src/boundary/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,5 @@ target_sources( boundary
BoundaryHandlingCollection.h
Boundary.cpp
BoundaryUID.h
ShiftedPeriodicity.h
)
611 changes: 611 additions & 0 deletions src/boundary/ShiftedPeriodicity.h

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions src/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ target_sources( gpu
ParallelStreams.h
GPURAII.h
DeviceSelectMPI.cpp
ShiftedPeriodicity.cu
ShiftedPeriodicity.h
)

# sources only for CUDA
Expand Down
30 changes: 15 additions & 15 deletions src/gpu/FieldAccessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,29 +56,29 @@ namespace gpu
fOffset_(fOffset), indexingScheme_(indexingScheme )
{}

__device__ void set( uint3 blockIdx, uint3 threadIdx )
__device__ void set( uint3 _blockIdx, uint3 _threadIdx )
{
switch ( indexingScheme_)
{
case FZYX: ptr_ += blockIdx.z * fOffset_ + blockIdx.y * zOffset_ + blockIdx.x * yOffset_ + threadIdx.x * xOffset_; break;
case FZY : ptr_ += blockIdx.y * fOffset_ + blockIdx.x * zOffset_ + threadIdx.x * yOffset_; break;
case FZ : ptr_ += blockIdx.x * fOffset_ + threadIdx.x * zOffset_; break;
case F : ptr_ += threadIdx.x * fOffset_; break;

case ZYXF: ptr_ += blockIdx.z * zOffset_ + blockIdx.y * yOffset_ + blockIdx.x * xOffset_ + threadIdx.x * fOffset_; break;
case ZYX : ptr_ += blockIdx.y * zOffset_ + blockIdx.x * yOffset_ + threadIdx.x * xOffset_; break;
case ZY : ptr_ += blockIdx.x * zOffset_ + threadIdx.x * yOffset_; break;
case Z : ptr_ += threadIdx.x * zOffset_; break;
case FZYX: ptr_ += _blockIdx.z * fOffset_ + _blockIdx.y * zOffset_ + _blockIdx.x * yOffset_ + _threadIdx.x * xOffset_; break;
case FZY : ptr_ += _blockIdx.y * fOffset_ + _blockIdx.x * zOffset_ + _threadIdx.x * yOffset_; break;
case FZ : ptr_ += _blockIdx.x * fOffset_ + _threadIdx.x * zOffset_; break;
case F : ptr_ += _threadIdx.x * fOffset_; break;

case ZYXF: ptr_ += _blockIdx.z * zOffset_ + _blockIdx.y * yOffset_ + _blockIdx.x * xOffset_ + _threadIdx.x * fOffset_; break;
case ZYX : ptr_ += _blockIdx.y * zOffset_ + _blockIdx.x * yOffset_ + _threadIdx.x * xOffset_; break;
case ZY : ptr_ += _blockIdx.x * zOffset_ + _threadIdx.x * yOffset_; break;
case Z : ptr_ += _threadIdx.x * zOffset_; break;
}
}


__device__ uint_t getLinearIndex( uint3 blockIdx, uint3 threadIdx, uint3 gridDim, uint3 blockDim )
__device__ uint_t getLinearIndex( uint3 _blockIdx, uint3 _threadIdx, uint3 _gridDim, uint3 _blockDim )
{
return threadIdx.x +
blockIdx.x * blockDim.x +
blockIdx.y * blockDim.x * gridDim.x +
blockIdx.z * blockDim.x * gridDim.x * gridDim.y ;
return _threadIdx.x +
_blockIdx.x * _blockDim.x +
_blockIdx.y * _blockDim.x * _gridDim.x +
_blockIdx.z * _blockDim.x * _gridDim.x * _gridDim.y ;
}

// This is always true for this specific field indexing class.
Expand Down
14 changes: 7 additions & 7 deletions src/gpu/FieldAccessor3D.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,17 +38,17 @@ namespace gpu
uint_t yOffset,
uint_t zOffset,
uint_t fOffset,
const dim3 & idxDim,
const dim3 & blockDim )
const dim3 & _idxDim,
const dim3 & _blockDim )
: ptr_( ptr ), xOffset_( xOffset ), yOffset_( yOffset ), zOffset_( zOffset ), fOffset_( fOffset ),
idxDim_( idxDim ), blockDim_( blockDim )
idxDim_( _idxDim ), blockDim_( _blockDim )
{}

__device__ __forceinline__ void set( const uint3& blockIdx, const uint3& threadIdx )
__device__ __forceinline__ void set( const uint3& _blockIdx, const uint3& _threadIdx )
{
uint_t x = blockIdx.x * blockDim_.x + threadIdx.x;
uint_t y = blockIdx.y * blockDim_.y + threadIdx.y;
uint_t z = blockIdx.z * blockDim_.z + threadIdx.z;
uint_t x = _blockIdx.x * blockDim_.x + _threadIdx.x;
uint_t y = _blockIdx.y * blockDim_.y + _threadIdx.y;
uint_t z = _blockIdx.z * blockDim_.z + _threadIdx.z;

if ( x < idxDim_.x && y < idxDim_.y && z < idxDim_.z )
{
Expand Down
8 changes: 4 additions & 4 deletions src/gpu/FieldAccessorXYZ.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,11 @@ namespace gpu
: ptr_(ptr), xOffset_(xOffset), yOffset_(yOffset), zOffset_(zOffset), fOffset_(fOffset)
{}

__device__ void set( uint3 blockIdx, uint3 threadIdx )
__device__ void set( uint3 _blockIdx, uint3 _threadIdx )
{
ptr_ += threadIdx.x * xOffset_ +
blockIdx.x * yOffset_ +
blockIdx.y * zOffset_ ;
ptr_ += _threadIdx.x * xOffset_ +
_blockIdx.x * yOffset_ +
_blockIdx.y * zOffset_ ;
}

__device__ T & get() { return * (T*)(ptr_); }
Expand Down
53 changes: 53 additions & 0 deletions src/gpu/ShiftedPeriodicity.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file ShiftedPeriodicity.cu
//! \ingroup gpu
//! \author Helen Schottenhamml <helen.schottenhamml@fau.de>
//
//======================================================================================================================

#include "gpu/ShiftedPeriodicity.h"

namespace walberla {
namespace gpu
{

namespace internal {
#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
__global__ void packBufferGPU( gpu::FieldAccessor<real_t> fa, real_t * const buffer ) {
fa.set(blockIdx, threadIdx);
if(fa.isValidPosition()) {
buffer[fa.getLinearIndex(blockIdx, threadIdx, gridDim, blockDim)] = fa.get();
}
}
__global__ void unpackBufferGPU( gpu::FieldAccessor<real_t> fa, const real_t * const buffer ) {
fa.set(blockIdx, threadIdx);
if(fa.isValidPosition()) {
fa.get() = buffer[fa.getLinearIndex(blockIdx, threadIdx, gridDim, blockDim)];
}
}
#else
__global__ void packBufferGPU( gpu::FieldAccessor<real_t>, real_t * const ) {
WALBERLA_ABORT("gpu/ShiftedPeriodicity only supported when built with GPU support. Please use boundary/ShiftedPeriodicity on CPUs")
}
__global__ void unpackBufferGPU( gpu::FieldAccessor<real_t>, const real_t * const ) {
WALBERLA_ABORT("gpu/ShiftedPeriodicity only supported when built with GPU support. Please use boundary/ShiftedPeriodicity on CPUs")
}
#endif
}

} // namespace gpu
} // namespace walberla
144 changes: 144 additions & 0 deletions src/gpu/ShiftedPeriodicity.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file ShiftedPeriodicity.h
//! \ingroup gpu
//! \author Helen Schottenhamml <helen.schottenhamml@fau.de>
//
//======================================================================================================================

#include "blockforest/StructuredBlockForest.h"

#include "boundary/ShiftedPeriodicity.h"

#include "core/DataTypes.h"
#include "core/cell/CellInterval.h"
#include "core/debug/Debug.h"
#include "core/math/Vector3.h"

#include "domain_decomposition/BlockDataID.h"
#include "domain_decomposition/IBlock.h"

#include "gpu/DeviceWrapper.h"
#include "gpu/FieldAccessor.h"
#include "gpu/FieldIndexing.h"
#include "gpu/GPUField.h"
#include "gpu/GPUWrapper.h"
#include "gpu/Kernel.h"

#include <cstdlib>
#include <device_launch_parameters.h>
#include <memory>
#include <vector>

#include "ErrorChecking.h"

namespace walberla {
namespace gpu
{

namespace internal {
// GPU kernels - can be extended for other data types
__global__ void packBufferGPU( gpu::FieldAccessor<real_t> fa, real_t * const buffer );
__global__ void unpackBufferGPU( gpu::FieldAccessor<real_t> fa, const real_t * const buffer );
}

//*******************************************************************************************************************
/*!
* A periodicity boundary condition that adds a user-defined spatial shift to the field when applied.
* This shift can prevent the locking of large-scale turbulent features in the flow direction, see e.g.,
* Munters et al. (https://doi.org/10.1063/1.4941912).
*
* Periodicity defined in the blockforest must be turned off in the normal-direction.
*
* This class handles the GPU-specific packing and unpacking of the communication buffers.
*
* @tparam GhostLayerField_T Type of the ghost-layer field that is shifted periodically
*/
//*******************************************************************************************************************
template< typename GPUField_T >
class ShiftedPeriodicityGPU : public boundary::ShiftedPeriodicityBase<ShiftedPeriodicityGPU<GPUField_T>, GPUField_T> {

using Base = boundary::ShiftedPeriodicityBase<ShiftedPeriodicityGPU<GPUField_T>, GPUField_T>;
friend Base;

public:

using ValueType = typename GPUField_T::value_type;
using ShiftType = typename Base::ShiftType;
using FieldIdx_T = gpu::FieldIndexing<ValueType>;

ShiftedPeriodicityGPU(const std::weak_ptr< StructuredBlockForest >& blockForest,
const BlockDataID& fieldID, const uint_t fieldGhostLayers,
const uint_t normalDir, const uint_t shiftDir, const ShiftType shiftValue)
: Base(blockForest, fieldID, fieldGhostLayers, normalDir, shiftDir, shiftValue)
{}


private:

void packBuffer(IBlock* const block, const CellInterval& ci, std::vector< ValueType >& h_buffer) {

// get field
auto d_field = block->getData< GPUField_T >(this->fieldID_);
WALBERLA_ASSERT_NOT_NULLPTR(d_field)

const uint_t nValues = ci.numCells() * uint_c(this->fSize_);

// create GPU buffer
ValueType * d_buffer{};
WALBERLA_GPU_CHECK(gpuMalloc(&d_buffer, nValues * sizeof(ValueType)))

// fill buffer on GPU
auto packKernel = gpu::make_kernel( &internal::packBufferGPU );
packKernel.addFieldIndexingParam( FieldIdx_T::interval( *d_field, ci, 0, this->fSize_ ) );
packKernel.addParam<real_t*>(d_buffer);
packKernel();

// copy from device to host buffer
WALBERLA_GPU_CHECK(gpuMemcpy(h_buffer.data(), d_buffer, nValues * sizeof(ValueType), gpuMemcpyDeviceToHost))

WALBERLA_GPU_CHECK(gpuFree(d_buffer))

}

void unpackBuffer(IBlock* const block, const CellInterval& ci, const std::vector< ValueType >& h_buffer) {

// get field
auto d_field = block->getData< GPUField_T >(this->fieldID_);
WALBERLA_ASSERT_NOT_NULLPTR(d_field)

const uint_t nValues = ci.numCells() * uint_c(this->fSize_);

// create GPU buffer
ValueType * d_buffer{};
WALBERLA_GPU_CHECK(gpuMalloc(&d_buffer, nValues * sizeof(ValueType)))

// copy from host to device buffer
WALBERLA_GPU_CHECK(gpuMemcpy(d_buffer, h_buffer.data(), nValues * sizeof(ValueType), gpuMemcpyHostToDevice))

// unpack buffer on GPU
auto unpackKernel = gpu::make_kernel( &internal::unpackBufferGPU );
unpackKernel.addFieldIndexingParam( FieldIdx_T::interval( *d_field, ci, 0, this->fSize_ ) );
unpackKernel.addParam<const real_t*>(d_buffer);
unpackKernel();

WALBERLA_GPU_CHECK(gpuFree(d_buffer))
}

}; // class ShiftedPeriodicity

} // namespace gpu
} // namespace walberla
11 changes: 11 additions & 0 deletions tests/boundary/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,14 @@

waLBerla_compile_test( FILES BoundaryHandling.cpp )
waLBerla_execute_test( NAME BoundaryHandling )

if (WALBERLA_BUILD_WITH_PYTHON)

waLBerla_link_files_to_builddir( *.py )

waLBerla_compile_test( FILES TestShiftedPeriodicity.cpp DEPENDS blockforest field python_coupling )
waLBerla_execute_test( NAME TestShiftedPeriodicity1 COMMAND $<TARGET_FILE:TestShiftedPeriodicity> ${CMAKE_CURRENT_SOURCE_DIR}/TestShiftedPeriodicitySetup.py )
waLBerla_execute_test( NAME TestShiftedPeriodicity2 COMMAND $<TARGET_FILE:TestShiftedPeriodicity> ${CMAKE_CURRENT_SOURCE_DIR}/TestShiftedPeriodicitySetup.py PROCESSES 2 )
waLBerla_execute_test( NAME TestShiftedPeriodicity4 COMMAND $<TARGET_FILE:TestShiftedPeriodicity> ${CMAKE_CURRENT_SOURCE_DIR}/TestShiftedPeriodicitySetup.py PROCESSES 4 )

endif()
Loading

0 comments on commit 9da0b10

Please sign in to comment.