Skip to content

Commit

Permalink
updates
Browse files Browse the repository at this point in the history
  • Loading branch information
toxa81 committed Jan 16, 2019
1 parent 4705af4 commit 22054cd
Show file tree
Hide file tree
Showing 4 changed files with 208 additions and 207 deletions.
175 changes: 82 additions & 93 deletions SDDK/GPU/fft_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,84 +28,73 @@
#include <stdio.h>


/// Repack z-sticks between the local "long sticks" layout and the all-to-all exchange layout.
/** Template parameter \c direction selects the copy direction at compile time (the two old
 *  separate kernels were merged into this single templated kernel):
 *    - direction ==  1 : a2a_buffer -> z_sticks_local (unpack after the all-to-all)
 *    - direction == -1 : z_sticks_local -> a2a_buffer (pack before the all-to-all)
 *
 *  Expected launch layout:
 *    - x-dimension of the grid/block covers the z-index inside one rank's slab
 *    - blockIdx.y : local z-column index (0 .. num_zcol_loc - 1)
 *    - blockIdx.z : rank whose portion of the buffer is repacked
 *
 *  \param [in]    size_z          full size of the FFT box along z
 *  \param [in]    num_zcol_loc    number of z-columns local to this process
 *  \param [in]    local_z_offsets per-rank offsets of the local z-slabs (device array)
 *  \param [in]    local_z_sizes   per-rank sizes of the local z-slabs (device array)
 *  \param [inout] z_sticks_local  buffer holding full-length local z-sticks
 *  \param [inout] a2a_buffer      buffer in the all-to-all (rank-blocked) layout
 */
template <int direction>
__global__ void repack_z_buffer_gpu_kernel(int size_z,
                                           int num_zcol_loc,
                                           int const* local_z_offsets,
                                           int const* local_z_sizes,
                                           cuDoubleComplex* z_sticks_local,
                                           cuDoubleComplex* a2a_buffer)
{
    /* z-index inside the slab owned by `rank` */
    int iz = blockDim.x * blockIdx.x + threadIdx.x;
    /* local z-column handled by this block */
    int izcol = blockIdx.y;
    /* rank whose slab is being repacked */
    int rank = blockIdx.z;

    int local_zsize = local_z_sizes[rank];
    /* guard: slabs are smaller than the padded launch extent (zcol_max_size) */
    if (iz < local_zsize) {
        int offs = local_z_offsets[rank];
        /* the branches are resolved at compile time: direction is a template parameter */
        if (direction == 1) {
            z_sticks_local[offs + iz + izcol * size_z] = a2a_buffer[offs * num_zcol_loc + izcol * local_zsize + iz];
        }
        if (direction == -1) {
            a2a_buffer[offs * num_zcol_loc + izcol * local_zsize + iz] = z_sticks_local[offs + iz + izcol * size_z];
        }
    }
}


/// Host wrapper: repack the distributed z-stick buffers on the GPU.
/** Dispatches to the templated kernel with the direction resolved at compile time.
 *
 *  \param [in]    direction       1: a2a_buffer -> z_sticks_local; any other value: reverse copy
 *  \param [in]    num_ranks       number of ranks participating in the all-to-all
 *  \param [in]    size_z          full size of the FFT box along z
 *  \param [in]    num_zcol_loc    number of local z-columns
 *  \param [in]    zcol_max_size   maximum z-slab size over all ranks (x-extent of the launch)
 *  \param [in]    local_z_offsets per-rank z-slab offsets (device array)
 *  \param [in]    local_z_sizes   per-rank z-slab sizes (device array)
 *  \param [inout] z_sticks_local  buffer of full-length local z-sticks (device pointer)
 *  \param [inout] a2a_buffer      buffer in the all-to-all layout (device pointer)
 *
 *  NOTE(review): the kernel is launched on the default stream (stream 0), unlike the other
 *  wrappers in this file which take a stream_id — confirm this synchronization is intended.
 */
extern "C" void repack_z_buffer_gpu(int direction,
                                    int num_ranks,
                                    int size_z,
                                    int num_zcol_loc,
                                    int zcol_max_size,
                                    int const* local_z_offsets,
                                    int const* local_z_sizes,
                                    cuDoubleComplex* z_sticks_local,
                                    cuDoubleComplex* a2a_buffer)
{
    dim3 grid_t(64);
    /* x: ceil-div over the largest slab; y: one block-row per local z-column; z: one per rank */
    dim3 grid_b(num_blocks(zcol_max_size, grid_t.x), num_zcol_loc, num_ranks);

    if (direction == 1) {
        repack_z_buffer_gpu_kernel<1> <<<grid_b, grid_t, 0, 0>>>
        (
            size_z,
            num_zcol_loc,
            local_z_offsets,
            local_z_sizes,
            z_sticks_local,
            a2a_buffer
        );
    } else {
        repack_z_buffer_gpu_kernel<-1> <<<grid_b, grid_t, 0, 0>>>
        (
            size_z,
            num_zcol_loc,
            local_z_offsets,
            local_z_sizes,
            z_sticks_local,
            a2a_buffer
        );
    }
}



__global__ void cufft_batch_load_gpu_kernel(int fft_size,
int num_pw_components,
int const* map,
cuDoubleComplex const* data,
cuDoubleComplex* fft_buffer)
__global__ void batch_load_gpu_kernel(int fft_size,
int num_pw_components,
int const* map,
cuDoubleComplex const* data,
cuDoubleComplex* fft_buffer)
{
int i = blockIdx.y;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
Expand All @@ -115,13 +104,13 @@ __global__ void cufft_batch_load_gpu_kernel(int fft_size,
}
}

extern "C" void cufft_batch_load_gpu(int fft_size,
int num_pw_components,
int num_fft,
int const* map,
cuDoubleComplex const* data,
cuDoubleComplex* fft_buffer,
int stream_id__)
extern "C" void batch_load_gpu(int fft_size,
int num_pw_components,
int num_fft,
int const* map,
cuDoubleComplex const* data,
cuDoubleComplex* fft_buffer,
int stream_id__)
{
dim3 grid_t(64);
dim3 grid_b(num_blocks(num_pw_components, grid_t.x), num_fft);
Expand All @@ -130,7 +119,7 @@ extern "C" void cufft_batch_load_gpu(int fft_size,

acc::zero(fft_buffer, fft_size * num_fft);

cufft_batch_load_gpu_kernel <<<grid_b, grid_t, 0, stream>>>
batch_load_gpu_kernel <<<grid_b, grid_t, 0, stream>>>
(
fft_size,
num_pw_components,
Expand All @@ -140,13 +129,13 @@ extern "C" void cufft_batch_load_gpu(int fft_size,
);
}

__global__ void cufft_batch_unload_gpu_kernel(int fft_size,
int num_pw_components,
int const* map,
cuDoubleComplex const* fft_buffer,
cuDoubleComplex* data,
double alpha,
double beta)
__global__ void batch_unload_gpu_kernel(int fft_size,
int num_pw_components,
int const* map,
cuDoubleComplex const* fft_buffer,
cuDoubleComplex* data,
double alpha,
double beta)
{
int i = blockIdx.y;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
Expand All @@ -165,15 +154,15 @@ __global__ void cufft_batch_unload_gpu_kernel(int fft_size,
/// Unload data from FFT buffer.
/** The following operation is executed:
* data[ig] = alpha * data[ig] + beta * fft_buffer[map[ig]] */
extern "C" void cufft_batch_unload_gpu(int fft_size,
int num_pw_components,
int num_fft,
int const* map,
cuDoubleComplex const* fft_buffer,
cuDoubleComplex* data,
double alpha,
double beta,
int stream_id__)
extern "C" void batch_unload_gpu(int fft_size,
int num_pw_components,
int num_fft,
int const* map,
cuDoubleComplex const* fft_buffer,
cuDoubleComplex* data,
double alpha,
double beta,
int stream_id__)
{
dim3 grid_t(64);
dim3 grid_b(num_blocks(num_pw_components, grid_t.x), num_fft);
Expand All @@ -183,8 +172,8 @@ extern "C" void cufft_batch_unload_gpu(int fft_size,
if (alpha == 0) {
acc::zero(data, num_pw_components);
}
cufft_batch_unload_gpu_kernel <<<grid_b, grid_t, 0, stream>>>

batch_unload_gpu_kernel <<<grid_b, grid_t, 0, stream>>>
(
fft_size,
num_pw_components,
Expand All @@ -196,10 +185,10 @@ extern "C" void cufft_batch_unload_gpu(int fft_size,
);
}

__global__ void cufft_load_x0y0_col_gpu_kernel(int z_col_size,
int const* map,
cuDoubleComplex const* data,
cuDoubleComplex* fft_buffer)
__global__ void load_x0y0_col_gpu_kernel(int z_col_size,
int const* map,
cuDoubleComplex const* data,
cuDoubleComplex* fft_buffer)

{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
Expand All @@ -209,18 +198,18 @@ __global__ void cufft_load_x0y0_col_gpu_kernel(int z_col_size
}
}

extern "C" void cufft_load_x0y0_col_gpu(int z_col_size,
int const* map,
cuDoubleComplex const* data,
cuDoubleComplex* fft_buffer,
int stream_id__)
extern "C" void load_x0y0_col_gpu(int z_col_size,
int const* map,
cuDoubleComplex const* data,
cuDoubleComplex* fft_buffer,
int stream_id__)
{
dim3 grid_t(64);
dim3 grid_b(num_blocks(z_col_size, grid_t.x));

cudaStream_t stream = acc::stream(stream_id(stream_id__));

cufft_load_x0y0_col_gpu_kernel <<<grid_b, grid_t, 0, stream>>>
load_x0y0_col_gpu_kernel <<<grid_b, grid_t, 0, stream>>>
(
z_col_size,
map,
Expand Down
83 changes: 42 additions & 41 deletions SDDK/GPU/fft_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,47 +22,48 @@
* \brief Contains definition of CUDA kernels necessary for a FFT driver.
*/

#include "acc.hpp"
//#include "acc.hpp"
#include <complex>

extern "C" {

void cufft_repack_z_buffer(int direction,
int num_ranks,
int dimz,
int num_zcol_loc,
int zcol_max_size,
int const* local_z_offsets,
int const* local_z_sizes,
cuDoubleComplex* serial_buffer,
cuDoubleComplex* parallel_buffer);
void repack_z_buffer_gpu(int direction,
int num_ranks,
int size_z,
int num_zcol_loc,
int zcol_max_size,
int const* local_z_offsets,
int const* local_z_sizes,
std::complex<double>* z_long_sticks_local,
std::complex<double>* z_short_sticks_full);


void cufft_batch_load_gpu(int fft_size,
int num_pw_components,
int num_fft,
int const* map,
cuDoubleComplex const* data,
cuDoubleComplex* fft_buffer,
int stream_id);
void batch_load_gpu(int fft_size,
int num_pw_components,
int num_fft,
int const* map,
std::complex<double> const* data,
std::complex<double>* fft_buffer,
int stream_id);

void cufft_load_x0y0_col_gpu(int z_col_size,
int const* map,
cuDoubleComplex const* data,
cuDoubleComplex* fft_buffer,
int stream_id);
void load_x0y0_col_gpu(int z_col_size,
int const* map,
std::complex<double> const* data,
std::complex<double>* fft_buffer,
int stream_id);

void cufft_batch_unload_gpu(int fft_size,
int num_pw_components,
int num_fft,
int const* map,
cuDoubleComplex const* fft_buffer,
cuDoubleComplex* data,
double alpha,
double beta,
int stream_id);
void batch_unload_gpu(int fft_size,
int num_pw_components,
int num_fft,
int const* map,
std::complex<double> const* fft_buffer,
std::complex<double>* data,
double alpha,
double beta,
int stream_id);

void unpack_z_cols_gpu(cuDoubleComplex* z_cols_packed__,
cuDoubleComplex* fft_buf__,
void unpack_z_cols_gpu(std::complex<double>* z_cols_packed__,
std::complex<double>* fft_buf__,
int size_x__,
int size_y__,
int size_z__,
Expand All @@ -71,28 +72,28 @@ void unpack_z_cols_gpu(cuDoubleComplex* z_cols_packed__,
bool use_reduction__,
int stream_id__);

void pack_z_cols_gpu(cuDoubleComplex* z_cols_packed__,
cuDoubleComplex* fft_buf__,
void pack_z_cols_gpu(std::complex<double>* z_cols_packed__,
std::complex<double>* fft_buf__,
int size_x__,
int size_y__,
int size_z__,
int num_z_cols__,
int const* z_col_pos__,
int stream_id__);

void unpack_z_cols_2_gpu(cuDoubleComplex* z_cols_packed1__,
cuDoubleComplex* z_cols_packed2__,
cuDoubleComplex* fft_buf__,
void unpack_z_cols_2_gpu(std::complex<double>* z_cols_packed1__,
std::complex<double>* z_cols_packed2__,
std::complex<double>* fft_buf__,
int size_x__,
int size_y__,
int size_z__,
int num_z_cols__,
int const* z_col_pos__,
int stream_id__);

void pack_z_cols_2_gpu(cuDoubleComplex* z_cols_packed1__,
cuDoubleComplex* z_cols_packed2__,
cuDoubleComplex* fft_buf__,
void pack_z_cols_2_gpu(std::complex<double>* z_cols_packed1__,
std::complex<double>* z_cols_packed2__,
std::complex<double>* fft_buf__,
int size_x__,
int size_y__,
int size_z__,
Expand Down
Loading

0 comments on commit 22054cd

Please sign in to comment.