From a3417a32d43898c959a5df7b444cf0f2ddb36213 Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Mon, 21 Jan 2013 15:08:02 +0000 Subject: [PATCH] Bump CUDA version with NVIDIA/Bristol optimisations --- Makefile | 29 ++-- PdV_kernel_cuda.cu | 188 ++++---------------- clover.in | 7 +- cuda_common.cu | 190 ++++++++++++--------- mpi_transfers_cuda.cu | 254 ++++++++------------------- pack_buffer_kernels.cu | 260 ++++++++++++++-------------- update_halo_kernel_cuda.cu | 340 ++++++++++++------------------------- 7 files changed, 476 insertions(+), 792 deletions(-) diff --git a/Makefile b/Makefile index a8a464b..4d59920 100644 --- a/Makefile +++ b/Makefile @@ -13,9 +13,9 @@ OMP=$(OMP_$(COMPILER)) FLAGS_INTEL = -O3 -ipo FLAGS_SUN = -O2 -FLAGS_GNU = -O2 -g -FLAGS_CRAY = -em -ra -FLAGS_PGI = -O2 -Mpreprocess -g +FLAGS_GNU = -O2 +FLAGS_CRAY = -O2 -em -ra -f free -F +FLAGS_PGI = -O2 -Mpreprocess FLAGS_PATHSCALE = -O2 FLAGS_XLF = -O2 FLAGS_ = -O2 @@ -28,6 +28,14 @@ CFLAGS_PATHSCALE = -O2 CFLAGS_XLF = -O2 CFLAGS_ = -O2 +# flags for nvcc +# set NV_ARCH to select the correct one +CODE_GEN_FERMI=-gencode arch=compute_20,code=sm_21 +CODE_GEN_KEPLER=-gencode arch=compute_30,code=sm_30 + +# requires CUDA_HOME to be set - not the same on all machines +NV_FLAGS=-O2 -w -c -I $(CUDA_HOME)/include $(CODE_GEN_$(NV_ARCH)) -DNO_ERR_CHK + ifdef DEBUG FLAGS_INTEL = -O0 -g -debug all -check all -traceback -check noarg_temp_created FLAGS_SUN = -O0 -xopenmp=noopt -g @@ -39,6 +47,7 @@ ifdef DEBUG FLAGS_ = -O0 -g CFLAGS_INTEL = -O0 -g -c -debug all -traceback -restrict CFLAGS_CRAY = -O0 -g -em -eD + NV_FLAGS += -g -G endif ifdef IEEE @@ -56,7 +65,7 @@ CPPLIBS_PGI=-pgcpplibs CPPLIBS_GNU=-lstdc++ CPPLIBS=$(CPPLIBS_$(COMPILER)) -FLAGS=$(FLAGS_$(COMPILER)) $(OMP) $(I3E) $(OPTIONS) +FLAGS=$(FLAGS_$(COMPILER)) $(OMP) $(I3E) $(OPTIONS) $(RESIDENT_FLAG) CFLAGS=$(CFLAGS_$(COMPILER)) $(OMP) $(I3E) $(C_OPTIONS) -c MPI_COMPILER=mpif90 C_MPI_COMPILER=mpicc @@ -81,7 +90,7 @@ CUDA_FILES=\ update_halo_kernel_cuda.o all: clover_leaf - rm -f *.o *.mod *genmod* + rm -f *.o *.mod *genmod* *.lst clover_leaf: cuda_clover c_lover *.f90 $(MPI_COMPILER) $(FLAGS) \ @@ -138,7 +147,6 @@ clover_leaf: cuda_clover c_lover *.f90 ideal_gas_kernel_c.o \ advec_cell_kernel_c.o \ viscosity_kernel_c.o \ - timer_c.o \ $(CUDA_FILES) \ -L $(CUDA_HOME)/lib64 -lcudart $(CPPLIBS) \ -o clover_leaf @@ -153,16 +161,13 @@ c_lover: ideal_gas_kernel_c.c \ viscosity_kernel_c.c \ advec_cell_kernel_c.c \ - advec_mom_kernel_c.c \ - timer_c.c + advec_mom_kernel_c.c -#for kepler use: -gencode arch=compute_30,code=sm_30 cuda_clover: $(CUDA_FILES) - @echo "NB - This creates code for Fermi architecture which supports double precision natively - removing the gencode specification statement will create code that operates on floating point numbers instead, at the cost of a possible loss of precision" %.o: %.cu - nvcc $(CFLAGS_GNU) -c -DCUDA_RESIDENT -gencode arch=compute_20,code=sm_21 -I $(CUDA_HOME)/include $< #-DTIME_KERNELS + nvcc $(NV_FLAGS) $< clean: - rm -f *.o *.mod *genmod* + rm -f *.o *.mod *genmod* *.lst diff --git a/PdV_kernel_cuda.cu b/PdV_kernel_cuda.cu index 6cff99a..5ed23f5 100644 --- a/PdV_kernel_cuda.cu +++ b/PdV_kernel_cuda.cu @@ -4,12 +4,10 @@ #include "ftocmacros.h" #include -#include "chunk_cuda.cu" -extern CloverleafCudaChunk chunk; - #include "omp.h" -extern CudaDevPtrStorage pointer_storage; +#include "chunk_cuda.cu" +extern CloverleafCudaChunk chunk; __global__ void device_PdV_cuda_kernel_predict (int x_min, int x_max, int y_min, int y_max, @@ -38,25 +36,25 @@ const double * __restrict const yvel1) double recip_volume, energy_change, min_cell_volume, right_flux, left_flux, top_flux, bottom_flux, total_flux; - if(row > 1 && column > 1 - && row < y_max+2 && column < x_max+2) + if (row >= (y_min + 1) && row <= (y_max + 1) + && column >= (x_min + 1) && column <= (x_max + 1)) { left_flux = (xarea[THARR2D(0, 0, 1)] - * (xvel0[THARR2D(0, 0, 1)] + xvel0[THARR2D(0, 0, 1)] - + xvel0[THARR2D(0, 1, 1)] + xvel0[THARR2D(0, 1, 1)])) + * (xvel0[THARR2D(0, 0, 1)] + xvel0[THARR2D(0, 1, 1)] + + xvel0[THARR2D(0, 0, 1)] + xvel0[THARR2D(0, 1, 1)])) * 0.25 * dt * 0.5; right_flux = (xarea[THARR2D(1, 0, 1)] - * (xvel0[THARR2D(1, 0, 1)] + xvel0[THARR2D(1, 0, 1)] - + xvel0[THARR2D(1, 1, 1)] + xvel0[THARR2D(1, 1, 1)])) + * (xvel0[THARR2D(1, 0, 1)] + xvel0[THARR2D(1, 1, 1)] + + xvel0[THARR2D(1, 0, 1)] + xvel0[THARR2D(1, 1, 1)])) * 0.25 * dt * 0.5; bottom_flux = (yarea[THARR2D(0, 0, 0)] - * (yvel0[THARR2D(0, 0, 1)] + yvel0[THARR2D(0, 0, 1)] - + yvel0[THARR2D(1, 0, 1)] + yvel0[THARR2D(1, 0, 1)])) + * (yvel0[THARR2D(0, 0, 1)] + yvel0[THARR2D(1, 0, 1)] + + yvel0[THARR2D(0, 0, 1)] + yvel0[THARR2D(1, 0, 1)])) * 0.25 * dt * 0.5; top_flux = (yarea[THARR2D(0, 1, 0)] - * (yvel0[THARR2D(0, 1, 1)] + yvel0[THARR2D(0, 1, 1)] - + yvel0[THARR2D(1, 1, 1)] + yvel0[THARR2D(1, 1, 1)])) + * (yvel0[THARR2D(0, 1, 1)] + yvel0[THARR2D(1, 1, 1)] + + yvel0[THARR2D(0, 1, 1)] + yvel0[THARR2D(1, 1, 1)])) * 0.25 * dt * 0.5; total_flux = right_flux - left_flux + top_flux - bottom_flux; @@ -89,9 +87,11 @@ const double * __restrict const yvel1) density1[THARR2D(0, 0, 0)] = density0[THARR2D(0, 0, 0)] * volume_change; } - //reduction to get error conditon, if any + Reduce< BLOCK_SZ/2 >::run(err_cond_kernel, error_condition, max_func); + + /* __syncthreads(); - for(size_t offset = blockDim.x / 2; offset > 0; offset /= 2) + for(int offset = blockDim.x / 2; offset > 0; offset /= 2) { if(threadIdx.x < offset) { @@ -101,6 +101,7 @@ const double * __restrict const yvel1) __syncthreads(); } error_condition[blockIdx.x] = err_cond_kernel[0];; + */ } __global__ void device_PdV_cuda_kernel_not_predict @@ -130,25 +131,25 @@ const double * __restrict const yvel1) double recip_volume, energy_change, min_cell_volume, right_flux, left_flux, top_flux, bottom_flux, total_flux; - if(row > 1 && column > 1 - && row < y_max+2 && column < x_max+2) + if (row >= (y_min + 1) && row <= (y_max + 1) + && column >= (x_min + 1) && column <= (x_max + 1)) { left_flux = (xarea[THARR2D(0, 0, 1)] - * (xvel0[THARR2D(0, 0, 1)] + xvel1[THARR2D(0, 0, 1)] - + xvel0[THARR2D(0, 1, 1)] + xvel1[THARR2D(0, 1, 1)])) + * (xvel0[THARR2D(0, 0, 1)] + xvel0[THARR2D(0, 1, 1)] + + xvel1[THARR2D(0, 0, 1)] + xvel1[THARR2D(0, 1, 1)])) * 0.25 * dt; right_flux = (xarea[THARR2D(1, 0, 1)] - * (xvel0[THARR2D(1, 0, 1)] + xvel1[THARR2D(1, 0, 1)] - + xvel0[THARR2D(1, 1, 1)] + xvel1[THARR2D(1, 1, 1)])) + * (xvel0[THARR2D(1, 0, 1)] + xvel0[THARR2D(1, 1, 1)] + + xvel1[THARR2D(1, 0, 1)] + xvel1[THARR2D(1, 1, 1)])) * 0.25 * dt; bottom_flux = (yarea[THARR2D(0, 0, 0)] - * (yvel0[THARR2D(0, 0, 1)] + yvel1[THARR2D(0, 0, 1)] - + yvel0[THARR2D(1, 0, 1)] + yvel1[THARR2D(1, 0, 1)])) + * (yvel0[THARR2D(0, 0, 1)] + yvel0[THARR2D(1, 0, 1)] + + yvel1[THARR2D(0, 0, 1)] + yvel1[THARR2D(1, 0, 1)])) * 0.25 * dt; top_flux = (yarea[THARR2D(0, 1, 0)] - * (yvel0[THARR2D(0, 1, 1)] + yvel1[THARR2D(0, 1, 1)] - + yvel0[THARR2D(1, 1, 1)] + yvel1[THARR2D(1, 1, 1)])) + * (yvel0[THARR2D(0, 1, 1)] + yvel0[THARR2D(1, 1, 1)] + + yvel1[THARR2D(0, 1, 1)] + yvel1[THARR2D(1, 1, 1)])) * 0.25 * dt; total_flux = right_flux - left_flux + top_flux - bottom_flux; @@ -181,8 +182,11 @@ const double * __restrict const yvel1) } + Reduce< BLOCK_SZ/2 >::run(err_cond_kernel, error_condition, max_func); + + /* __syncthreads(); - for(size_t offset = blockDim.x / 2; offset > 0; offset /= 2) + for(int offset = blockDim.x / 2; offset > 0; offset /= 2) { if(threadIdx.x < offset) { @@ -192,110 +196,7 @@ const double * __restrict const yvel1) __syncthreads(); } error_condition[blockIdx.x] = err_cond_kernel[0];; -} - -void PdV_cuda -(int error_condition,int predict,int x_min,int x_max,int y_min,int y_max, -double dt, -double *xarea, -double *yarea, -double *volume, -double *density0, -double *density1, -double *energy0, -double *energy1, -double *pressure, -double *viscosity, -double *xvel0, -double *yvel0, -double *xvel1, -double *yvel1) -{ - pointer_storage.setSize(x_max, y_max); - - double* xarea_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, xarea, BUFSZ2D(1, 0)); - double* yarea_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, yarea, BUFSZ2D(0, 1)); - - double* density0_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, density0, BUFSZ2D(0, 0)); - double* energy0_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, energy0, BUFSZ2D(0, 0)); - - double* volume_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, volume, BUFSZ2D(0, 0)); - double* pressure_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, pressure, BUFSZ2D(0, 0)); - double* viscosity_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, viscosity, BUFSZ2D(0, 0)); - - double* xvel0_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, xvel0, BUFSZ2D(1, 1)); - double* xvel1_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, xvel1, BUFSZ2D(1, 1)); - double* yvel0_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, yvel0, BUFSZ2D(1, 1)); - double* yvel1_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, yvel1, BUFSZ2D(1, 1)); - - double* energy1_d = pointer_storage.getDevStorage(__LINE__, __FILE__); - double* density1_d = pointer_storage.getDevStorage(__LINE__, __FILE__); - - size_t num_blocks = (((x_max+4)*(y_max+4))/BLOCK_SZ); - //error condition - thrust::device_ptr reduce_ptr_1 = - thrust::device_malloc(num_blocks*sizeof(int)); - int* err_condition_arr_d = thrust::raw_pointer_cast(reduce_ptr_1); - -#ifdef TIME_KERNELS -_CUDA_BEGIN_PROFILE_name(device); -#endif - - if(predict) - { - device_PdV_cuda_kernel_predict<<< ((x_max+4)*(y_max+4))/BLOCK_SZ, BLOCK_SZ >>> - (x_min, x_max, y_min, y_max, dt, err_condition_arr_d, - xarea_d, yarea_d, volume_d, density0_d, density1_d, - energy0_d, energy1_d, pressure_d, viscosity_d, - xvel0_d, yvel0_d, xvel1_d, yvel1_d); - } - else - { - device_PdV_cuda_kernel_not_predict<<< ((x_max+4)*(y_max+4))/BLOCK_SZ, BLOCK_SZ >>> - (x_min, x_max, y_min, y_max, dt, err_condition_arr_d, - xarea_d, yarea_d, volume_d, density0_d, density1_d, - energy0_d, energy1_d, pressure_d, viscosity_d, - xvel0_d, yvel0_d, xvel1_d, yvel1_d); - } - -#ifdef TIME_KERNELS -_CUDA_END_PROFILE_name(device); -#endif - -errChk(__LINE__, __FILE__); - - pointer_storage.freeDevStorageAndCopy(energy1_d, energy1, BUFSZ2D(0, 0)); - pointer_storage.freeDevStorageAndCopy(density1_d, density1, BUFSZ2D(0, 0)); - - pointer_storage.freeDevStorage(xarea_d); - pointer_storage.freeDevStorage(yarea_d); - pointer_storage.freeDevStorage(volume_d); - pointer_storage.freeDevStorage(pressure_d); - pointer_storage.freeDevStorage(viscosity_d); - pointer_storage.freeDevStorage(xvel0_d); - pointer_storage.freeDevStorage(yvel0_d); - pointer_storage.freeDevStorage(xvel1_d); - pointer_storage.freeDevStorage(yvel1_d); - pointer_storage.freeDevStorage(energy0_d); - pointer_storage.freeDevStorage(density0_d); - - /* - int err_cond = thrust::reduce(reduce_ptr_1, - reduce_ptr_1 + num_blocks, - 0, thrust::maximum()); - // */ - int err_cond = *thrust::max_element(reduce_ptr_1, reduce_ptr_1 + num_blocks); - thrust::device_free(reduce_ptr_1); - - if(err_cond == 1) - { - std::cerr << "Negative volume in PdV kernel" << std::endl; - } - else if(err_cond == 2) - { - std::cerr << "Negative cell volume in PdV kernel" << std::endl; - } - + */ } extern "C" void pdv_kernel_cuda_ @@ -311,31 +212,16 @@ double *viscosity, double *xvel0, double *xvel1, double *yvel0, -double *yvel1) +double *yvel1, +double *unused_array) { -#ifdef TIME_KERNELS -_CUDA_BEGIN_PROFILE_name(host); -#endif - #ifndef CUDA_RESIDENT - PdV_cuda(*errorcondition, *prdct, *xmin, *xmax, *ymin, *ymax,*dtbyt, - xarea, yarea, volume, density0, density1, energy0, energy1, - pressure, viscosity, xvel0, yvel0, xvel1, yvel1); - #else chunk.PdV_kernel(errorcondition, *prdct, *dtbyt); - #endif -#ifdef TIME_KERNELS -_CUDA_END_PROFILE_name(host); -#endif } - void CloverleafCudaChunk::PdV_kernel (int* error_condition, int predict, double dt) { - -#ifdef TIME_KERNELS -_CUDA_BEGIN_PROFILE_name(device); -#endif + _CUDA_BEGIN_PROFILE_name(device); if(predict) { @@ -356,9 +242,7 @@ _CUDA_BEGIN_PROFILE_name(device); errChk(__LINE__, __FILE__); } -#ifdef TIME_KERNELS -_CUDA_END_PROFILE_name(device); -#endif + _CUDA_END_PROFILE_name(device); int err_cond = *thrust::max_element(reduce_pdv, reduce_pdv + num_blocks); diff --git a/clover.in b/clover.in index f285740..1c8f821 100644 --- a/clover.in +++ b/clover.in @@ -3,8 +3,8 @@ state 1 density=0.2 energy=1.0 state 2 density=1.0 energy=2.5 geometry=rectangle xmin=0.0 xmax=5.0 ymin=0.0 ymax=2.0 - x_cells=960 - y_cells=960 + x_cells=3840 + y_cells=3840 xmin=0.0 ymin=0.0 @@ -14,9 +14,8 @@ initial_timestep=0.04 timestep_rise=1.5 max_timestep=0.04 - end_time=0.5 + end_time=95.5 end_step=87 - use_cuda_kernels *endclover diff --git a/cuda_common.cu b/cuda_common.cu index 01a68fd..10e528f 100644 --- a/cuda_common.cu +++ b/cuda_common.cu @@ -3,7 +3,7 @@ #define __CUDA_COMMON_INC // size of workgroup/block -#define BLOCK_SZ 512 +#define BLOCK_SZ 256 // number of bytes to allocate for x size array #define BUFSZX(x_extra) \ @@ -21,7 +21,9 @@ * ((y_max) + 4 + y_extra) \ * sizeof(double) ) -// access a volue in a 2d array given the x and y offset from current thread index, adding or subtracting a bit more if it is one of the arrays with bigger rows +// access a volue in a 2d array given the x and y offset from current thread +// index, adding or subtracting a bit more if it is one of the arrays with +// bigger rows #define THARR2D(x_offset, y_offset, big_row)\ ( glob_id \ + (x_offset) \ @@ -35,26 +37,6 @@ const int row = glob_id / (x_max + 4); \ const int column = glob_id % (x_max + 4); -// some arrays are larger in the x direction -#define __large_kernel_indexes \ - const int glob_id = threadIdx.x \ - + blockIdx.x * blockDim.x; \ - const int row = glob_id / (x_max + 4 + x_extra); \ - const int column = glob_id % (x_max + 4 + x_extra); - -// beginning of profiling bit -#define _CUDA_BEGIN_PROFILE_name(x) \ - double x##t_0, x##t_1; \ - x##t_0 = omp_get_wtime(); - -// end of profiling bit -#define _CUDA_END_PROFILE_name(x) \ - cudaDeviceSynchronize(); \ - x##t_1 = omp_get_wtime(); \ - std::cout << "[PROFILING] " << x##t_1 - x##t_0 \ - << " to calculate block \"" << #x << \ - "\" in " << __FILE__ < #include #include "omp.h" +#include "ftocmacros.h" // for reduction in calc_dt and PdV #include "thrust/copy.h" @@ -89,7 +75,7 @@ /*******************/ -// lots of time is spent error checking - define this to stop checking for errors. I don't have to tell you that this can be a silly idea +// lots of time is spent error checking - define this to stop checking for errors. #ifndef NO_ERR_CHK #ifdef _GNUC_ @@ -106,99 +92,135 @@ inline void errChk std::cout << "error on line " << line_num << " of "; std::cout << file << std::endl; std::cout << "return code " << l_e; - switch(l_e) - { - case 8: std::cout << " (invalid device function - recompile correctly for architecture)"; break; - case 11: std::cout << " (invalid value - some number was passed wrong)"; break; - default: std::cout << " ()"; break; - } std::cout << std::endl; - exit(-1); + exit(l_e); } } #else -#define errChk(l, f) ;//nop +// do nothing instead +#define errChk(l, f) ; #endif //NO_ERR_CHK -/*******************/ +// whether to time kernel run times +#ifdef TIME_KERNELS -class CudaDevPtrStorage -{ -private: - //work arrays used for storing data used by any kernel - std::vector< double* > work_arrays; +// beginning of profiling bit +#define _CUDA_BEGIN_PROFILE_name(x) \ + double x##t_0, x##t_1; \ + x##t_0 = omp_get_wtime(); - int x_max, y_max; -public: +// end of profiling bit +#define _CUDA_END_PROFILE_name(x) \ + cudaDeviceSynchronize(); \ + x##t_1 = omp_get_wtime(); \ + std::cout << "[PROFILING] " << x##t_1 - x##t_0 \ + << " to calculate block \"" << #x << \ + "\" in " << __FILE__ < +class Reduce +{ +public: + __device__ inline static void run + (double* array, double* out, double(*func)(double, double)) { - double * new_storage; - if(work_arrays.size() < 1) + // only need to synch if not working within a warp + if (offset > 16) { - cudaMalloc((void**) &new_storage, ((x_max+5)*(y_max+5)*sizeof(double))); - errChk(line, file); + __syncthreads(); } - else + + // only continue if it's in the lower half + if (threadIdx.x < offset) { - new_storage = work_arrays.back(); - work_arrays.pop_back(); + array[threadIdx.x] = func(array[threadIdx.x], array[threadIdx.x + offset]); + Reduce< offset/2 >::run(array, out, func); } - return new_storage; } - // same as above, but also copies data from an existing host pointer to the device - double* getDevStorageAndCopy - (int line, std::string const& file, const double* existing, size_t size) + __device__ inline static void run + (int* array, int* out, int(*func)(int, int)) { - double * new_storage; - new_storage = getDevStorage(line, file); - cudaMemcpy(new_storage, existing, size, cudaMemcpyHostToDevice); - errChk(line, file); - return new_storage; + // only need to synch if not working within a warp + if (offset > 16) + { + __syncthreads(); + } + + // only continue if it's in the lower half + if (threadIdx.x < offset) + { + array[threadIdx.x] = func(array[threadIdx.x], array[threadIdx.x + offset]); + Reduce< offset/2 >::run(array, out, func); + } } +}; - // frees up some device storage to be used again elsewhere - void freeDevStorage - (double* dev_ptr) +template < > +class Reduce < 0 > +{ +public: + __device__ inline static void run + (double* array, double* out, double(*func)(double, double)) { - work_arrays.push_back(dev_ptr); + out[blockIdx.x] = array[0]; } - // same as above, but copies data back from the device as well - void freeDevStorageAndCopy - (double* dev_ptr, double* existing, size_t size) + __device__ inline static void run + (int* array, int* out, int(*func)(int, int)) { - cudaMemcpy(existing, dev_ptr, size, cudaMemcpyDeviceToHost); - errChk(__LINE__, __FILE__); - freeDevStorage(dev_ptr); + out[blockIdx.x] = array[0]; } }; #endif + diff --git a/mpi_transfers_cuda.cu b/mpi_transfers_cuda.cu index 6642e93..ffe426c 100644 --- a/mpi_transfers_cuda.cu +++ b/mpi_transfers_cuda.cu @@ -2,6 +2,7 @@ #include "chunk_cuda.cu" #include "cuda_common.cu" +#include #include "pack_buffer_kernels.cu" @@ -19,14 +20,11 @@ extern CloverleafCudaChunk chunk; extern "C" void cudapackbuffers_ (const int* which_array, const int* which_side, -const int* x_inc, -const int* y_inc, double* buffer, const int* buffer_size, const int* depth) { chunk.packBuffer((*which_array) - 1, *which_side, - *x_inc, *y_inc, buffer, *buffer_size, *depth); } @@ -34,69 +32,45 @@ const int* depth) extern "C" void cudaunpackbuffers_ (const int* which_array, const int* which_side, -const int* x_inc, -const int* y_inc, double* buffer, const int* buffer_size, const int* depth) { chunk.unpackBuffer((*which_array) - 1, *which_side, - *x_inc, *y_inc, buffer, *buffer_size, *depth); } void CloverleafCudaChunk::packBuffer (const int which_array, const int which_side, -const int x_inc, -const int y_inc, double* buffer, const int buffer_size, const int depth) { - #define PACK_CUDA_BUFFERS(dev_ptr, x_extra, y_extra) \ + #define CALL_PACK(dev_ptr, type, face, dir)\ + {\ + const int launch_sz = (ceil((dir##_max+4+type.dir##_e)/static_cast(BLOCK_SZ))) * depth; \ + device_pack##face##Buffer<<< launch_sz, BLOCK_SZ >>> \ + (x_min, x_max, y_min, y_max, type, \ + dev_ptr, dev_##face##_send_buffer, depth); \ + errChk(__LINE__, __FILE__); \ + cudaMemcpy(buffer, dev_##face##_send_buffer, buffer_size*sizeof(double), cudaMemcpyDeviceToHost); \ + errChk(__LINE__, __FILE__); \ + cudaDeviceSynchronize();\ + break; \ + } + + #define PACK_CUDA_BUFFERS(dev_ptr, type) \ switch(which_side) \ { \ case LEFT_FACE: \ - device_packLeftBuffer<<< num_blocks, BLOCK_SZ >>> \ - (x_min, x_max, y_min, y_max,\ - x_extra, y_extra, \ - x_inc, y_inc, \ - dev_ptr, dev_left_buffer, buffer_size, depth); \ - errChk(__LINE__, __FILE__); \ - cudaMemcpy(buffer, dev_left_buffer, buffer_size*sizeof(double), cudaMemcpyDeviceToHost); \ - errChk(__LINE__, __FILE__); \ - break;\ + CALL_PACK(dev_ptr, type, left, y);\ case RIGHT_FACE:\ - device_packRightBuffer<<< num_blocks, BLOCK_SZ >>> \ - (x_min, x_max, y_min, y_max,\ - x_extra, y_extra, \ - x_inc, y_inc, \ - dev_ptr, dev_right_buffer, buffer_size, depth); \ - errChk(__LINE__, __FILE__); \ - cudaMemcpy(buffer, dev_right_buffer, buffer_size*sizeof(double), cudaMemcpyDeviceToHost); \ - errChk(__LINE__, __FILE__); \ - break;\ - case TOP_FACE:\ - device_packTopBuffer<<< num_blocks, BLOCK_SZ >>> \ - (x_min, x_max, y_min, y_max,\ - x_extra, y_extra, \ - x_inc, y_inc, \ - dev_ptr, dev_top_buffer, buffer_size, depth); \ - errChk(__LINE__, __FILE__); \ - cudaMemcpy(buffer, dev_top_buffer, buffer_size*sizeof(double), cudaMemcpyDeviceToHost); \ - errChk(__LINE__, __FILE__); \ - break;\ + CALL_PACK(dev_ptr, type, right, y);\ case BOTTOM_FACE:\ - device_packBottomBuffer<<< num_blocks, BLOCK_SZ >>> \ - (x_min, x_max, y_min, y_max,\ - x_extra, y_extra, \ - x_inc, y_inc, \ - dev_ptr, dev_bottom_buffer, buffer_size, depth); \ - errChk(__LINE__, __FILE__); \ - cudaMemcpy(buffer, dev_bottom_buffer, buffer_size*sizeof(double), cudaMemcpyDeviceToHost); \ - errChk(__LINE__, __FILE__); \ - break;\ + CALL_PACK(dev_ptr, type, bottom, x);\ + case TOP_FACE:\ + CALL_PACK(dev_ptr, type, top, x);\ default: \ std::cout << "Invalid side passed to buffer packing in task " << task << std::endl; \ exit(1); \ @@ -104,77 +78,57 @@ const int depth) switch(which_array) { - case FIELD_density0: PACK_CUDA_BUFFERS(density0, 0, 0); break; - case FIELD_density1: PACK_CUDA_BUFFERS(density1, 0, 0); break; - case FIELD_energy0: PACK_CUDA_BUFFERS(energy0, 0, 0); break; - case FIELD_energy1: PACK_CUDA_BUFFERS(energy1, 0, 0); break; - case FIELD_pressure: PACK_CUDA_BUFFERS(pressure, 0, 0); break; - case FIELD_viscosity: PACK_CUDA_BUFFERS(viscosity, 0, 0); break; - case FIELD_soundspeed: PACK_CUDA_BUFFERS(soundspeed, 0, 0); break; - case FIELD_xvel0: PACK_CUDA_BUFFERS(xvel0, 1, 1); break; - case FIELD_xvel1: PACK_CUDA_BUFFERS(xvel1, 1, 1); break; - case FIELD_yvel0: PACK_CUDA_BUFFERS(yvel0, 1, 1); break; - case FIELD_yvel1: PACK_CUDA_BUFFERS(yvel1, 1, 1); break; - case FIELD_vol_flux_x: PACK_CUDA_BUFFERS(vol_flux_x, 1, 0); break; - case FIELD_vol_flux_y: PACK_CUDA_BUFFERS(vol_flux_y, 0, 1); break; - case FIELD_mass_flux_x: PACK_CUDA_BUFFERS(mass_flux_x, 1, 0); break; - case FIELD_mass_flux_y: PACK_CUDA_BUFFERS(mass_flux_y, 0, 1); break; + case FIELD_density0: PACK_CUDA_BUFFERS(density0, CELL); break; + case FIELD_density1: PACK_CUDA_BUFFERS(density1, CELL); break; + case FIELD_energy0: PACK_CUDA_BUFFERS(energy0, CELL); break; + case FIELD_energy1: PACK_CUDA_BUFFERS(energy1, CELL); break; + case FIELD_pressure: PACK_CUDA_BUFFERS(pressure, CELL); break; + case FIELD_viscosity: PACK_CUDA_BUFFERS(viscosity, CELL); break; + case FIELD_soundspeed: PACK_CUDA_BUFFERS(soundspeed, CELL); break; + case FIELD_xvel0: PACK_CUDA_BUFFERS(xvel0, VERTEX_X); break; + case FIELD_xvel1: PACK_CUDA_BUFFERS(xvel1, VERTEX_X); break; + case FIELD_yvel0: PACK_CUDA_BUFFERS(yvel0, VERTEX_Y); break; + case FIELD_yvel1: PACK_CUDA_BUFFERS(yvel1, VERTEX_Y); break; + case FIELD_vol_flux_x: PACK_CUDA_BUFFERS(vol_flux_x, X_FACE); break; + case FIELD_vol_flux_y: PACK_CUDA_BUFFERS(vol_flux_y, Y_FACE); break; + case FIELD_mass_flux_x: PACK_CUDA_BUFFERS(mass_flux_x, X_FACE); break; + case FIELD_mass_flux_y: PACK_CUDA_BUFFERS(mass_flux_y, Y_FACE); break; default: std::cerr << "Invalid which_array identifier passed to CUDA for MPI transfer" << std::endl; exit(1); } + } void CloverleafCudaChunk::unpackBuffer (const int which_array, const int which_side, -const int x_inc, -const int y_inc, double* buffer, const int buffer_size, const int depth) { - #define UNPACK_CUDA_BUFFERS(dev_ptr, x_extra, y_extra) \ + #define CALL_UNPACK(dev_ptr, type, face, dir)\ + { \ + cudaMemcpy(dev_##face##_recv_buffer, buffer, buffer_size*sizeof(double), cudaMemcpyHostToDevice); \ + errChk(__LINE__, __FILE__); \ + cudaDeviceSynchronize();\ + const int launch_sz = (ceil((dir##_max+4+type.dir##_e)/static_cast(BLOCK_SZ))) * depth; \ + device_unpack##face##Buffer<<< launch_sz, BLOCK_SZ >>> \ + (x_min, x_max, y_min, y_max, type, \ + dev_ptr, dev_##face##_recv_buffer, depth); \ + errChk(__LINE__, __FILE__); \ + break; \ + } + + #define UNPACK_CUDA_BUFFERS(dev_ptr, type) \ switch(which_side) \ { \ case LEFT_FACE: \ - device_unpackLeftBuffer<<< num_blocks, BLOCK_SZ >>> \ - (x_min, x_max, y_min, y_max,\ - x_extra, y_extra, \ - x_inc, y_inc, \ - dev_ptr, dev_left_buffer, buffer_size, depth); \ - errChk(__LINE__, __FILE__); \ - cudaMemcpy(dev_left_buffer, buffer, buffer_size*sizeof(double), cudaMemcpyHostToDevice); \ - errChk(__LINE__, __FILE__); \ - break;\ + CALL_UNPACK(dev_ptr, type, left, y);\ case RIGHT_FACE:\ - device_unpackRightBuffer<<< num_blocks, BLOCK_SZ >>> \ - (x_min, x_max, y_min, y_max,\ - x_extra, y_extra, \ - x_inc, y_inc, \ - dev_ptr, dev_right_buffer, buffer_size, depth); \ - errChk(__LINE__, __FILE__); \ - cudaMemcpy(dev_right_buffer, buffer, buffer_size*sizeof(double), cudaMemcpyHostToDevice); \ - errChk(__LINE__, __FILE__); \ - break;\ - case TOP_FACE:\ - device_unpackTopBuffer<<< num_blocks, BLOCK_SZ >>> \ - (x_min, x_max, y_min, y_max,\ - x_extra, y_extra, \ - x_inc, y_inc, \ - dev_ptr, dev_top_buffer, buffer_size, depth); \ - errChk(__LINE__, __FILE__); \ - cudaMemcpy(dev_top_buffer, buffer, buffer_size*sizeof(double), cudaMemcpyHostToDevice); \ - errChk(__LINE__, __FILE__); \ - break;\ + CALL_UNPACK(dev_ptr, type, right, y);\ case BOTTOM_FACE:\ - device_unpackBottomBuffer<<< num_blocks, BLOCK_SZ >>> \ - (x_min, x_max, y_min, y_max,\ - x_extra, y_extra, \ - x_inc, y_inc, \ - dev_ptr, dev_bottom_buffer, buffer_size, depth); \ - errChk(__LINE__, __FILE__); \ - cudaMemcpy(dev_bottom_buffer, buffer, buffer_size*sizeof(double), cudaMemcpyHostToDevice); \ - errChk(__LINE__, __FILE__); \ - break;\ + CALL_UNPACK(dev_ptr, type, bottom, x);\ + case TOP_FACE:\ + CALL_UNPACK(dev_ptr, type, top, x);\ default: \ std::cout << "Invalid side passed to buffer unpacking in task " << task << std::endl; \ exit(1); \ @@ -182,91 +136,23 @@ const int depth) switch(which_array) { - case FIELD_density0: UNPACK_CUDA_BUFFERS(density0, 0, 0); break; - case FIELD_density1: UNPACK_CUDA_BUFFERS(density1, 0, 0); break; - case FIELD_energy0: UNPACK_CUDA_BUFFERS(energy0, 0, 0); break; - case FIELD_energy1: UNPACK_CUDA_BUFFERS(energy1, 0, 0); break; - case FIELD_pressure: UNPACK_CUDA_BUFFERS(pressure, 0, 0); break; - case FIELD_viscosity: UNPACK_CUDA_BUFFERS(viscosity, 0, 0); break; - case FIELD_soundspeed: UNPACK_CUDA_BUFFERS(soundspeed, 0, 0); break; - case FIELD_xvel0: UNPACK_CUDA_BUFFERS(xvel0, 1, 1); break; - case FIELD_xvel1: UNPACK_CUDA_BUFFERS(xvel1, 1, 1); break; - case FIELD_yvel0: UNPACK_CUDA_BUFFERS(yvel0, 1, 1); break; - case FIELD_yvel1: UNPACK_CUDA_BUFFERS(yvel1, 1, 1); break; - case FIELD_vol_flux_x: UNPACK_CUDA_BUFFERS(vol_flux_x, 1, 0); break; - case FIELD_vol_flux_y: UNPACK_CUDA_BUFFERS(vol_flux_y, 0, 1); break; - case FIELD_mass_flux_x: UNPACK_CUDA_BUFFERS(mass_flux_x, 1, 0); break; - case FIELD_mass_flux_y: UNPACK_CUDA_BUFFERS(mass_flux_y, 0, 1); break; - default: std::cerr << "Invalid which_array identifier passed to CUDA for MPI transfer" << std::endl; exit(1); - } -} - -/****************************************************/ - -// copy data out from device back to host -extern "C" void clover_cuda_copy_out_ -(const int* which_array, double* copy_into) -{ - chunk.copyToHost((*which_array) - 1, copy_into); -} - -// copy data back to device from host -extern "C" void clover_cuda_copy_back_ -(const int* which_array, double* copy_from) -{ - chunk.copyToDevice((*which_array) - 1, copy_from); -} -void CloverleafCudaChunk::copyToHost(const int which_array, double* copy_into) -{ - #define COPY_OUT(dev_ptr, sz) \ - std::cout << "copying to host: " << #dev_ptr << std::endl; \ - cudaMemcpy(copy_into, dev_ptr, sz, cudaMemcpyDeviceToHost); \ - errChk(__LINE__, __FILE__); - switch(which_array) - { - case FIELD_density0: COPY_OUT(density0, BUFSZ2D(0, 0)); break; - case FIELD_density1: COPY_OUT(density1, BUFSZ2D(0, 0)); break; - case FIELD_energy0: COPY_OUT(energy0, BUFSZ2D(0, 0)); break; - case FIELD_energy1: COPY_OUT(energy1, BUFSZ2D(0, 0)); break; - case FIELD_pressure: COPY_OUT(pressure, BUFSZ2D(0, 0)); break; - case FIELD_viscosity: COPY_OUT(viscosity, BUFSZ2D(0, 0)); break; - case FIELD_soundspeed: COPY_OUT(soundspeed, BUFSZ2D(0, 0)); break; - case FIELD_xvel0: COPY_OUT(xvel0, BUFSZ2D(1, 1)); break; - case FIELD_xvel1: COPY_OUT(xvel1, BUFSZ2D(1, 1)); break; - case FIELD_yvel0: COPY_OUT(yvel0, BUFSZ2D(1, 1)); break; - case FIELD_yvel1: COPY_OUT(yvel1, BUFSZ2D(1, 1)); break; - case FIELD_vol_flux_x: COPY_OUT(vol_flux_x, BUFSZ2D(1, 0)); break; - case FIELD_vol_flux_y: COPY_OUT(vol_flux_y, BUFSZ2D(0, 1)); break; - case FIELD_mass_flux_x: COPY_OUT(mass_flux_x, BUFSZ2D(1, 0)); break; - case FIELD_mass_flux_y: COPY_OUT(mass_flux_y, BUFSZ2D(0, 1)); break; + case FIELD_density0: UNPACK_CUDA_BUFFERS(density0, CELL); break; + case FIELD_density1: UNPACK_CUDA_BUFFERS(density1, CELL); break; + case FIELD_energy0: UNPACK_CUDA_BUFFERS(energy0, CELL); break; + case FIELD_energy1: UNPACK_CUDA_BUFFERS(energy1, CELL); break; + case FIELD_pressure: UNPACK_CUDA_BUFFERS(pressure, CELL); break; + case FIELD_viscosity: UNPACK_CUDA_BUFFERS(viscosity, CELL); break; + case FIELD_soundspeed: UNPACK_CUDA_BUFFERS(soundspeed, CELL); break; + case FIELD_xvel0: UNPACK_CUDA_BUFFERS(xvel0, VERTEX_X); break; + case FIELD_xvel1: UNPACK_CUDA_BUFFERS(xvel1, VERTEX_X); break; + case FIELD_yvel0: UNPACK_CUDA_BUFFERS(yvel0, VERTEX_Y); break; + case FIELD_yvel1: UNPACK_CUDA_BUFFERS(yvel1, VERTEX_Y); break; + case FIELD_vol_flux_x: UNPACK_CUDA_BUFFERS(vol_flux_x, X_FACE); break; + case FIELD_vol_flux_y: UNPACK_CUDA_BUFFERS(vol_flux_y, Y_FACE); break; + case FIELD_mass_flux_x: UNPACK_CUDA_BUFFERS(mass_flux_x, X_FACE); break; + case FIELD_mass_flux_y: UNPACK_CUDA_BUFFERS(mass_flux_y, Y_FACE); break; default: std::cerr << "Invalid which_array identifier passed to CUDA for MPI transfer" << std::endl; exit(1); } -} -void CloverleafCudaChunk::copyToDevice(const int which_array, double* copy_from) -{ - #define COPY_BACK(dev_ptr, sz) \ - std::cout << "copying to device: " << #dev_ptr << std::endl; \ - cudaMemcpy(dev_ptr, copy_from, sz, cudaMemcpyHostToDevice); \ - errChk(__LINE__, __FILE__); - switch(which_array) - { - case FIELD_density0: COPY_BACK(density0, BUFSZ2D(0, 0)); break; - case FIELD_density1: COPY_BACK(density1, BUFSZ2D(0, 0)); break; - case FIELD_energy0: COPY_BACK(energy0, BUFSZ2D(0, 0)); break; - case FIELD_energy1: COPY_BACK(energy1, BUFSZ2D(0, 0)); break; - case FIELD_pressure: COPY_BACK(pressure, BUFSZ2D(0, 0)); break; - case FIELD_viscosity: COPY_BACK(viscosity, BUFSZ2D(0, 0)); break; - case FIELD_soundspeed: COPY_BACK(soundspeed, BUFSZ2D(0, 0)); break; - case FIELD_xvel0: COPY_BACK(xvel0, BUFSZ2D(1, 1)); break; - case FIELD_xvel1: COPY_BACK(xvel1, BUFSZ2D(1, 1)); break; - case FIELD_yvel0: COPY_BACK(yvel0, BUFSZ2D(1, 1)); break; - case FIELD_yvel1: COPY_BACK(yvel1, BUFSZ2D(1, 1)); break; - case FIELD_vol_flux_x: COPY_BACK(vol_flux_x, BUFSZ2D(1, 0)); break; - case FIELD_vol_flux_y: COPY_BACK(vol_flux_y, BUFSZ2D(0, 1)); break; - case FIELD_mass_flux_x: COPY_BACK(mass_flux_x, BUFSZ2D(1, 0)); break; - case FIELD_mass_flux_y: COPY_BACK(mass_flux_y, BUFSZ2D(0, 1)); break; - default: std::cerr << "Invalid which_array identifier passed to CUDA for MPI transfer" << std::endl; exit(1); - } } diff --git a/pack_buffer_kernels.cu b/pack_buffer_kernels.cu index e57c437..df093f5 100644 --- a/pack_buffer_kernels.cu +++ b/pack_buffer_kernels.cu @@ -2,8 +2,11 @@ /********************/ +// j is column +// k is row + // could put this check in, prob doesnt need it -// if(row > 1 - depth && row < y_max + 2 + depth + y_inc) +// if (row > 1 - depth && row < y_max + 2 + depth + y_inc) // left/right buffer // index=j+(k+depth-1)*depth @@ -31,197 +34,202 @@ /********************/ -// j is column -// k is row - -// pack buffers - -// >= is more like fortran do loops, should help with conversion -#define BETWEEN_COLUMNS(upper, lower) (column >= (lower) && column <= (upper)) -#define BETWEEN_ROWS(upper, lower) (row >= (lower) && row <= (upper)) +// for top/bottom +#define HORZ_IDX(add) (column + depth + ((add + 1) - 1) * (x_max + x_inc + (2 * depth))) +// for left/right +#define VERT_IDX(add) ((add + 1) + (row + depth - 1) * depth) -__global__ void device_packLeftBuffer +__global__ void device_packleftBuffer (int x_min,int x_max,int y_min,int y_max, -int x_extra, int y_extra, -int x_inc, int y_inc, -double* array, -double* left_buffer, -int buffer_size, -int depth) +cell_info_t type, +const double* array, + double* left_buffer, +const int depth) { - __large_kernel_indexes; + int x_extra = type.x_e; + int y_extra = type.y_e; + int x_inc = (type.grid_type == VERTEX_DATA || type.grid_type == X_FACE_DATA) ? 1 : 0; + int y_inc = (type.grid_type == VERTEX_DATA || type.grid_type == Y_FACE_DATA) ? 1 : 0; - // x_min + x_inc - 1 + j - // x_min = 1 in fortran -> 2 in c - // j = 1 or 2 - if(column == 2 + x_inc - 1 + 1) - { - left_buffer[row] = array[THARR2D(0, 0, x_extra)]; - } - else if(depth > 1 && column == 2 + x_inc - 1 + 2) + //__kernel_indexes; + const int glob_id = threadIdx.x + blockIdx.x * blockDim.x; + const int row = glob_id / depth; + const int column = glob_id % depth; + + if (row >= (y_min + 1) - depth && row <= (y_max + 1) + y_inc + depth) { - left_buffer[row + y_max + 4 + y_extra] = array[THARR2D(0, 0, x_extra)]; + const int row_begin = row * (x_max + 4 + x_extra); + + left_buffer[VERT_IDX(column)] = array[row_begin + (x_min + 1) + x_inc - 1 + 1 + column]; } } -__global__ void device_unpackLeftBuffer +__global__ void device_unpackleftBuffer (int x_min,int x_max,int y_min,int y_max, -int x_extra, int y_extra, -int x_inc, int y_inc, -double* array, -double* left_buffer, -int buffer_size, -int depth) +cell_info_t type, + double* array, +const double* left_buffer, +const int depth) { - __large_kernel_indexes; + int x_extra = type.x_e; + int y_extra = type.y_e; + int x_inc = (type.grid_type == VERTEX_DATA || type.grid_type == X_FACE_DATA) ? 1 : 0; + int y_inc = (type.grid_type == VERTEX_DATA || type.grid_type == Y_FACE_DATA) ? 1 : 0; - // x_min - j - if(column == 2 - 1) - { - array[THARR2D(0, 0, x_extra)] = left_buffer[row]; - } - else if(depth > 1 && column == 2 - 2) + //__kernel_indexes; + const int glob_id = threadIdx.x + blockIdx.x * blockDim.x; + const int row = glob_id / depth; + const int column = glob_id % depth; + + if (row >= (y_min + 1) - depth && row <= (y_max + 1) + y_inc + depth) { - array[THARR2D(0, 0, x_extra)] = left_buffer[row + y_max + 4 + y_extra]; + const int row_begin = row * (x_max + 4 + x_extra); + + array[row_begin + (x_min + 1) - (1 + column)] = left_buffer[VERT_IDX(column)]; } } /************************************************************/ -__global__ void device_packRightBuffer +__global__ void device_packrightBuffer (int x_min,int x_max,int y_min,int y_max, -int x_extra, int y_extra, -int x_inc, int y_inc, -double* array, -double* right_buffer, -int buffer_size, -int depth) +cell_info_t type, +const double* array, + double* right_buffer, +const int depth) { - __large_kernel_indexes; + int x_extra = type.x_e; + int y_extra = type.y_e; + int x_inc = (type.grid_type == VERTEX_DATA || type.grid_type == X_FACE_DATA) ? 1 : 0; + int y_inc = (type.grid_type == VERTEX_DATA || type.grid_type == Y_FACE_DATA) ? 1 : 0; - // x_max + 1 - j - if(column == x_max + 1 + 1 - 1) - { - right_buffer[row] = array[THARR2D(0, 0, x_extra)]; - } - else if(depth > 1 && column == x_max + 1 + 1 - 2) + //__kernel_indexes; + const int glob_id = threadIdx.x + blockIdx.x * blockDim.x; + const int row = glob_id / depth; + const int column = glob_id % depth; + + if (row >= (y_min + 1) - depth && row <= (y_max + 1) + y_inc + depth) { - //array[THARR2D(0, 0, x_extra)] = right_buffer[row + y_max + 4]; - right_buffer[row + y_max + 4 + y_extra] = array[THARR2D(0, 0, x_extra)]; + const int row_begin = row * (x_max + 4 + x_extra); + + right_buffer[VERT_IDX(column)] = array[row_begin + (x_max + 1) + 1 - (1 + column)]; } } -__global__ void device_unpackRightBuffer +__global__ void device_unpackrightBuffer (int x_min,int x_max,int y_min,int y_max, -int x_extra, int y_extra, -int x_inc, int y_inc, -double* array, -double* right_buffer, -int buffer_size, -int depth) +cell_info_t type, + double* array, +const double* right_buffer, +const int depth) { - __large_kernel_indexes; + int x_extra = type.x_e; + int y_extra = type.y_e; + int x_inc = (type.grid_type == VERTEX_DATA || type.grid_type == X_FACE_DATA) ? 1 : 0; + int y_inc = (type.grid_type == VERTEX_DATA || type.grid_type == Y_FACE_DATA) ? 1 : 0; - // x_max + x_inc + j - if(column == x_max + 1 + x_inc + 1) - { - array[THARR2D(0, 0, x_extra)] = right_buffer[row]; - } - else if(depth > 1 && column == x_max + 1 + x_inc + 1) + //__kernel_indexes; + const int glob_id = threadIdx.x + blockIdx.x * blockDim.x; + const int row = glob_id / depth; + const int column = glob_id % depth; + + if (row >= (y_min + 1) - depth && row <= (y_max + 1) + y_inc + depth) { - array[THARR2D(0, 0, x_extra)] = right_buffer[row + y_max + 4 + y_extra]; + const int row_begin = row * (x_max + 4 + x_extra); + + array[row_begin + (x_max + 1) + x_inc + 1 + column] = right_buffer[VERT_IDX(column)]; } } /************************************************************/ -__global__ void device_packBottomBuffer +__global__ void device_packbottomBuffer (int x_min,int x_max,int y_min,int y_max, -int x_extra, int y_extra, -int x_inc, int y_inc, +cell_info_t type, double* array, double* bottom_buffer, -int buffer_size, -int depth) +const int depth) { - __large_kernel_indexes; + int x_extra = type.x_e; + int y_extra = type.y_e; + int x_inc = (type.grid_type == VERTEX_DATA || type.grid_type == X_FACE_DATA) ? 1 : 0; + int y_inc = (type.grid_type == VERTEX_DATA || type.grid_type == Y_FACE_DATA) ? 1 : 0; + __kernel_indexes; - // y_min + y_inc - 1 + k - if(row == 2 + y_inc - 1 + 1) - { - bottom_buffer[row] = array[THARR2D(0, 0, x_extra)]; - } - else if(depth > 1 && row == 2 + y_inc - 1 + 2) + if (column >= (x_min + 1) - depth && column <= (x_max + 1) + y_inc + depth) { - bottom_buffer[row + x_max + 4 + x_extra] = array[THARR2D(0, 0, x_extra)]; + if (row < depth) + { + bottom_buffer[HORZ_IDX(row)] = array[THARR2D(0, (y_min + 1) + y_inc - 1 + 1, x_extra)]; + } } } -__global__ void device_unpackBottomBuffer +__global__ void device_unpackbottomBuffer (int x_min,int x_max,int y_min,int y_max, -int x_extra, int y_extra, -int x_inc, int y_inc, +cell_info_t type, double* array, double* bottom_buffer, -int buffer_size, -int depth) +const int depth) { - __large_kernel_indexes; + int x_extra = type.x_e; + int y_extra = type.y_e; + int x_inc = (type.grid_type == VERTEX_DATA || type.grid_type == X_FACE_DATA) ? 1 : 0; + int y_inc = (type.grid_type == VERTEX_DATA || type.grid_type == Y_FACE_DATA) ? 1 : 0; + __kernel_indexes; - // y_min - k - if(row == 2 - 1) - { - array[THARR2D(0, 0, x_extra)] = bottom_buffer[row]; - } - else if(depth > 1 && row == 2 - 2) + if (column >= (x_min + 1) - depth && column <= (x_max + 1) + y_inc + depth) { - array[THARR2D(0, 0, x_extra)] = bottom_buffer[row + x_max + 4 + x_extra]; + if (row < depth) + { + array[THARR2D(0, (y_min + 1) - (1 + 2*row), x_extra)] = bottom_buffer[HORZ_IDX(row)]; + } } } /************************************************************/ -__global__ void device_packTopBuffer +__global__ void device_packtopBuffer (int x_min,int x_max,int y_min,int y_max, -int x_extra, int y_extra, -int x_inc, int y_inc, +cell_info_t type, double* array, double* top_buffer, -int buffer_size, -int depth) +const int depth) { - __large_kernel_indexes; + int x_extra = type.x_e; + int y_extra = type.y_e; + int x_inc = (type.grid_type == VERTEX_DATA || type.grid_type == X_FACE_DATA) ? 1 : 0; + int y_inc = (type.grid_type == VERTEX_DATA || type.grid_type == Y_FACE_DATA) ? 1 : 0; + __kernel_indexes; - // y_min + y_inc - 1 + k - if(row == 2 + y_inc - 1 + 1) - { - top_buffer[row] = array[THARR2D(0, 0, x_extra)]; - } - else if(depth > 1 && row == 2 + y_inc - 1 + 2) + if (column >= (x_min + 1) - depth && column <= (x_max + 1) + y_inc + depth) { - top_buffer[row + x_max + 4 + x_extra] = array[THARR2D(0, 0, x_extra)]; + if (row < depth) + { + top_buffer[HORZ_IDX(row)] = array[THARR2D(0, (y_max + 1) + 1 - (1 + 2*row), x_extra)]; + } } } -__global__ void device_unpackTopBuffer +__global__ void device_unpacktopBuffer (int x_min,int x_max,int y_min,int y_max, -int x_extra, int y_extra, -int x_inc, int y_inc, +cell_info_t type, double* array, double* top_buffer, -int buffer_size, -int depth) +const int depth) { - __large_kernel_indexes; - - // y_max + y_inc + k - if(row == y_max + 1 + y_inc + 1) - { - array[THARR2D(0, 0, x_extra)] = top_buffer[row]; - } - else if(depth > 1 && row == y_max + 1 + y_inc + 2) - { - array[THARR2D(0, 0, x_extra)] = top_buffer[row + x_max + 4 + x_extra]; + int x_extra = type.x_e; + int y_extra = type.y_e; + int x_inc = (type.grid_type == VERTEX_DATA || type.grid_type == X_FACE_DATA) ? 1 : 0; + int y_inc = (type.grid_type == VERTEX_DATA || type.grid_type == Y_FACE_DATA) ? 1 : 0; + __kernel_indexes; + + if (column >= (x_min + 1) - depth && column <= (x_max + 1) + y_inc + depth) + { + if (row < depth) + { + array[THARR2D(0, (y_max + 1) + y_inc + 1, x_extra)] = top_buffer[HORZ_IDX(row)]; + } } } diff --git a/update_halo_kernel_cuda.cu b/update_halo_kernel_cuda.cu index 1b0313f..985af9c 100755 --- a/update_halo_kernel_cuda.cu +++ b/update_halo_kernel_cuda.cu @@ -1,10 +1,8 @@ -// TODO some copies are negative, or copy form slightly different locations - look through and fix -// change to use depth - #include "ftocmacros.h" #include "cuda_common.cu" #include "chunk_cuda.cu" +#include #define CHUNK_left 0 #define CHUNK_right 1 @@ -13,245 +11,145 @@ #define EXTERNAL_FACE (-1) -#define NUM_FIELDS 15 - -extern CudaDevPtrStorage pointer_storage; - extern CloverleafCudaChunk chunk; -/* -* copies from the outer bit from [x_min, x_max] to the outer cells -*/ - __global__ void device_update_halo_kernel_bottom_cuda (int x_min,int x_max,int y_min,int y_max, -int x_extra, int y_extra, -int x_invert, int y_invert, +cell_info_t type, double* cur_array, int depth) { - __large_kernel_indexes; - // if its on the third row - /* - if(row == 2 - && column > 0 && column < x_max+3+x_extra) + int x_extra = type.x_e; + int y_extra = type.y_e; + int y_invert = type.y_i; + __kernel_indexes; + + // offset by 1 if it is anything but a CELL grid + int b_offset = (type.grid_type != CELL_DATA) ? 1 : 0; + + if(column >= 2 - depth && column <= (x_max + 1) + x_extra + depth) { - for(int ii = 0; ii < depth; ii++) + if (row < depth) { - //FIXME 1+k for all the ones that have x extra or y extra - cur_array[THARR2D(0, -(1 + ii), x_extra)] = (y_invert?-1:1) * cur_array[THARR2D(0, 0, x_extra)]; + const int offset = 2 + b_offset; + + /* + * 1 - 2 * row means that row 0 services row 1, and vice versa + * this means that it can be dispatched with 'depth' rows only + */ + cur_array[THARR2D(0, 1 - (2 * row), x_extra)] = + y_invert * cur_array[THARR2D(0, offset, x_extra)]; } } - */ - - if(row == y_max) - { - cur_array[THARR2D(0, 0, x_extra)] = (y_invert?-1:1) * cur_array[THARR2D(0, -1, x_extra)]; - } - else if(row == y_max + 1) - { - cur_array[THARR2D(0, 0, x_extra)] = (y_invert?-1:1) * cur_array[THARR2D(0, -3, x_extra)]; - } - else if(row == y_max + 2) - { - cur_array[THARR2D(0, 0, x_extra)] = (y_invert?-1:1) * cur_array[THARR2D(0, -5, x_extra)]; - } } __global__ void device_update_halo_kernel_top_cuda (int x_min,int x_max,int y_min,int y_max, -int x_extra, int y_extra, -int x_invert, int y_invert, +cell_info_t type, double* cur_array, int depth) { - __large_kernel_indexes; - /* - // if on the third row from the bottom - if(row == y_max+1+y_extra - && column > 0 && column < x_max+3+x_extra) + int x_extra = type.x_e; + int y_extra = type.y_e; + int y_invert = type.y_i; + int x_face = type.x_f; + __kernel_indexes; + + // if x face data, offset source/dest by - 1 + int x_f_offset = (x_face) ? 1 : 0; + + if(column >= 2 - depth && column <= (x_max + 1) + x_extra + depth) { - for(int ii = 0; ii < depth; ii++) + if (row < depth) { - cur_array[THARR2D(0, 1 + ii, x_extra)] = (y_invert?-1:1) * cur_array[THARR2D(0, 0, x_extra)]; - } - } - */ + const int offset = (- row) * 2 - 1 - x_f_offset; - if(row == 3) - { - cur_array[THARR2D(0, 0, x_extra)] = (y_invert?-1:1) * cur_array[THARR2D(0, 1, x_extra)]; - } - else if(row == 2) - { - cur_array[THARR2D(0, 0, x_extra)] = (y_invert?-1:1) * cur_array[THARR2D(0, 3, x_extra)]; - } - else if(row == 1) - { - cur_array[THARR2D(0, 0, x_extra)] = (y_invert?-1:1) * cur_array[THARR2D(0, 5, x_extra)]; + cur_array[THARR2D(0, y_extra + y_max + 2, x_extra)] = + y_invert * cur_array[THARR2D(0, y_max + 2 + offset, x_extra)]; + } } } __global__ void device_update_halo_kernel_left_cuda (int x_min,int x_max,int y_min,int y_max, -int x_extra, int y_extra, -int x_invert, int y_invert, +cell_info_t type, double* cur_array, int depth) { - __large_kernel_indexes; - /* - // if on the third column from the left - if(row > 0 && row < y_max+3+y_extra - && column == 2) - { - for(int ii = 0; ii < depth; ii++) - { - cur_array[THARR2D(-(1 + ii), 0, x_extra)] = (x_invert?-1:1) * cur_array[THARR2D(0, 0, x_extra)]; - } - } - */ + int x_extra = type.x_e; + int y_extra = type.y_e; + int x_invert = type.x_i; - if(column == 3) - { - cur_array[THARR2D(0, 0, x_extra)] = (x_invert?-1:1) * cur_array[THARR2D(1, 0, x_extra)]; - } - else if(column == 2) - { - cur_array[THARR2D(0, 0, x_extra)] = (x_invert?-1:1) * cur_array[THARR2D(3, 0, x_extra)]; - } - else if(column == 1) + // offset by 1 if it is anything but a CELL grid + int l_offset = (type.grid_type != CELL_DATA) ? 1 : 0; + + // special indexes for specific depth + const int glob_id = threadIdx.x + blockIdx.x * blockDim.x; + const int row = glob_id / depth; + const int column = glob_id % depth; + + if(row >= 2 - depth && row <= (y_max + 1) + y_extra + depth) { - cur_array[THARR2D(0, 0, x_extra)] = (x_invert?-1:1) * cur_array[THARR2D(5, 0, x_extra)]; + // first in row + const int offset = row * (x_max + 4 + x_extra); + + cur_array[offset + (1 - column)] = x_invert * cur_array[offset + 2 + column + l_offset]; } } __global__ void device_update_halo_kernel_right_cuda (int x_min,int x_max,int y_min,int y_max, -int x_extra, int y_extra, -int x_invert, int y_invert, +cell_info_t type, double* cur_array, int depth) { - __large_kernel_indexes; - /* - // if on the third column from the right - if(row > 0 && row < y_max+3+y_extra - && column == x_max+1+x_extra) - { - for(int ii = 0; ii < depth; ii++) - { - cur_array[THARR2D(1 + ii, 0, x_extra)] = (x_invert?-1:1) * cur_array[THARR2D(0, 0, x_extra)]; - } - } - */ + int x_extra = type.x_e; + int y_extra = type.y_e; + int x_invert = type.x_i; + int y_face = type.y_f; - if(column == x_max) - { - cur_array[THARR2D(0, 0, x_extra)] = (x_invert?-1:1) * cur_array[THARR2D(-1, 0, x_extra)]; - } - else if(column == x_max + 1) - { - cur_array[THARR2D(0, 0, x_extra)] = (x_invert?-1:1) * cur_array[THARR2D(-3, 0, x_extra)]; - } - else if(column == x_max + 2) + // offset source by -1 if its a y face + int y_f_offset = (y_face) ? 1 : 0; + + const int glob_id = threadIdx.x + blockIdx.x * blockDim.x; + const int row = glob_id / depth; + const int column = glob_id % depth; + + if(row >= 2 - depth && row <= (y_max + 1) + y_extra + depth) { - cur_array[THARR2D(0, 0, x_extra)] = (x_invert?-1:1) * cur_array[THARR2D(-5, 0, x_extra)]; + const int offset = row * (x_max + 4 + x_extra); + + cur_array[offset + x_max + 2 + x_extra + column] = x_invert * cur_array[offset + x_max + 1 - (column + y_f_offset)]; } } void update_array (int x_min,int x_max,int y_min,int y_max, -int x_extra, int y_extra, -int x_invert, int y_invert, +cell_info_t const& type, const int* chunk_neighbours, -double* cur_array, +double* cur_array_d, int depth) { - #define CHECK_LAUNCH(dir) \ - if(chunk_neighbours[CHUNK_ ## dir] == EXTERNAL_FACE)\ + #define CHECK_LAUNCH(face, dir) \ + if(chunk_neighbours[CHUNK_ ## face] == EXTERNAL_FACE)\ {\ - device_update_halo_kernel_ ## dir ## _cuda<<< ((x_max+5)*(y_max+5))/BLOCK_SZ, BLOCK_SZ >>>\ - (x_min,x_max,y_min,y_max, x_extra, y_extra, x_invert, y_invert, cur_array, depth);\ - errChk(__LINE__, __FILE__);\ + const int launch_sz = (ceil((dir##_max+4+type.dir##_e)/static_cast(BLOCK_SZ))) * depth; \ + device_update_halo_kernel_##face##_cuda \ + <<< launch_sz, BLOCK_SZ >>> \ + (x_min, x_max, y_min, y_max, type, cur_array_d, depth); \ + errChk(__LINE__, __FILE__); \ } - CHECK_LAUNCH(bottom); - CHECK_LAUNCH(top); - CHECK_LAUNCH(right); - CHECK_LAUNCH(left); - -} - -void update_halo_cuda -(int x_min,int x_max,int y_min,int y_max, - -const int* chunk_neighbours, - -double* density0, -double* energy0, -double* pressure, -double* viscosity, -double* soundspeed, -double* density1, -double* energy1, -double* xvel0, -double* yvel0, -double* xvel1, -double* yvel1, -double* vol_flux_x, -double* vol_flux_y, -double* mass_flux_x, -double* mass_flux_y, - -const int* fields, -int depth) -{ - -#ifdef TIME_KERNELS -_CUDA_BEGIN_PROFILE_name(host); -#endif - - pointer_storage.setSize(x_max, y_max); - double* cur_array_d ; - - #define HALO_UPDATE(arr, x_e, y_e, x_i, y_i) \ - {if(fields[FIELD_ ## arr] == 1)\ - {\ - cur_array_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, arr, BUFSZ2D(x_e, y_e));\ - update_array(x_min, x_max, y_min, y_max, \ - x_e, y_e, x_i, y_i, \ - chunk_neighbours, cur_array_d, depth);\ - pointer_storage.freeDevStorageAndCopy(cur_array_d, arr, BUFSZ2D(x_e, y_e));\ - }} - - HALO_UPDATE(density0, 0, 0, 0, 0); - HALO_UPDATE(energy0, 0, 0, 0, 0); - HALO_UPDATE(pressure, 0, 0, 0, 0); - HALO_UPDATE(viscosity, 0, 0, 0, 0); - HALO_UPDATE(soundspeed, 0, 0, 0, 0); - HALO_UPDATE(density1, 0, 0, 0, 0); - HALO_UPDATE(energy1, 0, 0, 0, 0); - - HALO_UPDATE(xvel0, 1, 1, 1, 0); - HALO_UPDATE(yvel0, 1, 1, 0, 1); - HALO_UPDATE(xvel1, 1, 1, 1, 0); - HALO_UPDATE(yvel1, 1, 1, 0, 1); - - HALO_UPDATE(vol_flux_x, 1, 0, 1, 0); - HALO_UPDATE(vol_flux_y, 0, 1, 0, 1); - HALO_UPDATE(mass_flux_x, 1, 0, 1, 0); - HALO_UPDATE(mass_flux_y, 0, 1, 0, 1); - -#ifdef TIME_KERNELS -_CUDA_END_PROFILE_name(host); -#endif + CHECK_LAUNCH(bottom, x); + CHECK_LAUNCH(top, x); + CHECK_LAUNCH(left, y); + CHECK_LAUNCH(right, y); } extern "C" void update_halo_kernel_cuda_ (int *x_min,int *x_max,int *y_min,int *y_max, -int* left,int* bottom,int* right,int* top, -int* left_boundary,int* bottom_boundary,int* right_boundary,int* top_boundary, +int* left, int* bottom, int* right, int* top, +int* left_boundary, int* bottom_boundary, int* right_boundary, int* top_boundary, const int* chunk_neighbours, @@ -272,62 +170,44 @@ double* mass_flux_x, double* mass_flux_y, const int* fields, -int* depth) +const int* depth) { -#ifdef TIME_KERNELS -_CUDA_BEGIN_PROFILE_name(host); -#endif - #ifndef CUDA_RESIDENT - update_halo_cuda(*x_min, *x_max, *y_min, *y_max, - chunk_neighbours, - density0, energy0, pressure, viscosity, soundspeed, density1, energy1, - xvel0, yvel0, xvel1, yvel1, - vol_flux_x, vol_flux_y, mass_flux_x, mass_flux_y, - fields, *depth); - #else chunk.update_halo_kernel(fields, *depth, chunk_neighbours); - #endif -#ifdef TIME_KERNELS -_CUDA_END_PROFILE_name(host); -#endif } void CloverleafCudaChunk::update_halo_kernel -(const int* fields, int depth, +(const int* fields, +const int depth, const int* chunk_neighbours) { -#ifdef TIME_KERNELS -_CUDA_BEGIN_PROFILE_name(device); -#endif + _CUDA_BEGIN_PROFILE_name(device); - #define HALO_UPDATE_RESIDENT(arr, x_e, y_e, x_i, y_i) \ + #define HALO_UPDATE_RESIDENT(arr, type) \ {if(fields[FIELD_ ## arr] == 1)\ {\ update_array(x_min, x_max, y_min, y_max, \ - x_e, y_e, x_i, y_i, \ - chunk_neighbours, arr, depth);\ + type, chunk_neighbours, arr, depth);\ }} - HALO_UPDATE_RESIDENT(density0, 0, 0, 0, 0); - HALO_UPDATE_RESIDENT(energy0, 0, 0, 0, 0); - HALO_UPDATE_RESIDENT(pressure, 0, 0, 0, 0); - HALO_UPDATE_RESIDENT(viscosity, 0, 0, 0, 0); - HALO_UPDATE_RESIDENT(soundspeed, 0, 0, 0, 0); - HALO_UPDATE_RESIDENT(density1, 0, 0, 0, 0); - HALO_UPDATE_RESIDENT(energy1, 0, 0, 0, 0); - - HALO_UPDATE_RESIDENT(xvel0, 1, 1, 1, 0); - HALO_UPDATE_RESIDENT(yvel0, 1, 1, 0, 1); - HALO_UPDATE_RESIDENT(xvel1, 1, 1, 1, 0); - HALO_UPDATE_RESIDENT(yvel1, 1, 1, 0, 1); - - HALO_UPDATE_RESIDENT(vol_flux_x, 1, 0, 1, 0); - HALO_UPDATE_RESIDENT(vol_flux_y, 0, 1, 0, 1); - HALO_UPDATE_RESIDENT(mass_flux_x, 1, 0, 1, 0); - HALO_UPDATE_RESIDENT(mass_flux_y, 0, 1, 0, 1); - -#ifdef TIME_KERNELS -_CUDA_END_PROFILE_name(device); -#endif + HALO_UPDATE_RESIDENT(density0, CELL); + HALO_UPDATE_RESIDENT(density1, CELL); + HALO_UPDATE_RESIDENT(energy0, CELL); + HALO_UPDATE_RESIDENT(energy1, CELL); + HALO_UPDATE_RESIDENT(pressure, CELL); + HALO_UPDATE_RESIDENT(viscosity, CELL); + + HALO_UPDATE_RESIDENT(xvel0, VERTEX_X); + HALO_UPDATE_RESIDENT(xvel1, VERTEX_X); + + HALO_UPDATE_RESIDENT(yvel0, VERTEX_Y); + HALO_UPDATE_RESIDENT(yvel1, VERTEX_Y); + + HALO_UPDATE_RESIDENT(vol_flux_x, X_FACE); + HALO_UPDATE_RESIDENT(mass_flux_x, X_FACE); + + HALO_UPDATE_RESIDENT(vol_flux_y, Y_FACE); + HALO_UPDATE_RESIDENT(mass_flux_y, Y_FACE); + + _CUDA_END_PROFILE_name(device); }