Skip to content
This repository has been archived by the owner on Jun 20, 2024. It is now read-only.

Commit

Permalink
Bump CUDA version with NVIDIA/Bristol optimisations
Browse files Browse the repository at this point in the history
  • Loading branch information
davidbeckingsale committed Jan 21, 2013
1 parent a461542 commit a3417a3
Show file tree
Hide file tree
Showing 7 changed files with 476 additions and 792 deletions.
29 changes: 17 additions & 12 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ OMP=$(OMP_$(COMPILER))

FLAGS_INTEL = -O3 -ipo
FLAGS_SUN = -O2
FLAGS_GNU = -O2 -g
FLAGS_CRAY = -em -ra
FLAGS_PGI = -O2 -Mpreprocess -g
FLAGS_GNU = -O2
FLAGS_CRAY = -O2 -em -ra -f free -F
FLAGS_PGI = -O2 -Mpreprocess
FLAGS_PATHSCALE = -O2
FLAGS_XLF = -O2
FLAGS_ = -O2
Expand All @@ -28,6 +28,14 @@ CFLAGS_PATHSCALE = -O2
CFLAGS_XLF = -O2
CFLAGS_ = -O2

# flags for nvcc
# set NV_ARCH to select the correct one
CODE_GEN_FERMI=-gencode arch=compute_20,code=sm_21
CODE_GEN_KEPLER=-gencode arch=compute_30,code=sm_30

# requires CUDA_HOME to be set - not the same on all machines
NV_FLAGS=-O2 -w -c -I $(CUDA_HOME)/include $(CODE_GEN_$(NV_ARCH)) -DNO_ERR_CHK

ifdef DEBUG
FLAGS_INTEL = -O0 -g -debug all -check all -traceback -check noarg_temp_created
FLAGS_SUN = -O0 -xopenmp=noopt -g
Expand All @@ -39,6 +47,7 @@ ifdef DEBUG
FLAGS_ = -O0 -g
CFLAGS_INTEL = -O0 -g -c -debug all -traceback -restrict
CFLAGS_CRAY = -O0 -g -em -eD
NV_FLAGS += -g -G
endif

ifdef IEEE
Expand All @@ -56,7 +65,7 @@ CPPLIBS_PGI=-pgcpplibs
CPPLIBS_GNU=-lstdc++
CPPLIBS=$(CPPLIBS_$(COMPILER))

FLAGS=$(FLAGS_$(COMPILER)) $(OMP) $(I3E) $(OPTIONS)
FLAGS=$(FLAGS_$(COMPILER)) $(OMP) $(I3E) $(OPTIONS) $(RESIDENT_FLAG)
CFLAGS=$(CFLAGS_$(COMPILER)) $(OMP) $(I3E) $(C_OPTIONS) -c
MPI_COMPILER=mpif90
C_MPI_COMPILER=mpicc
Expand All @@ -81,7 +90,7 @@ CUDA_FILES=\
update_halo_kernel_cuda.o

all: clover_leaf
rm -f *.o *.mod *genmod*
rm -f *.o *.mod *genmod* *.lst

clover_leaf: cuda_clover c_lover *.f90
$(MPI_COMPILER) $(FLAGS) \
Expand Down Expand Up @@ -138,7 +147,6 @@ clover_leaf: cuda_clover c_lover *.f90
ideal_gas_kernel_c.o \
advec_cell_kernel_c.o \
viscosity_kernel_c.o \
timer_c.o \
$(CUDA_FILES) \
-L $(CUDA_HOME)/lib64 -lcudart $(CPPLIBS) \
-o clover_leaf
Expand All @@ -153,16 +161,13 @@ c_lover:
ideal_gas_kernel_c.c \
viscosity_kernel_c.c \
advec_cell_kernel_c.c \
advec_mom_kernel_c.c \
timer_c.c
advec_mom_kernel_c.c

#for kepler use: -gencode arch=compute_30,code=sm_30
cuda_clover: $(CUDA_FILES)
@echo "NB - This creates code for Fermi architecture which supports double precision natively - removing the gencode specification statement will create code that operates on floating point numbers instead, at the cost of a possible loss of precision"

%.o: %.cu
nvcc $(CFLAGS_GNU) -c -DCUDA_RESIDENT -gencode arch=compute_20,code=sm_21 -I $(CUDA_HOME)/include $< #-DTIME_KERNELS
nvcc $(NV_FLAGS) $<

clean:
rm -f *.o *.mod *genmod*
rm -f *.o *.mod *genmod* *.lst

188 changes: 36 additions & 152 deletions PdV_kernel_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,10 @@
#include "ftocmacros.h"
#include <algorithm>

#include "chunk_cuda.cu"
extern CloverleafCudaChunk chunk;

#include "omp.h"

extern CudaDevPtrStorage pointer_storage;
#include "chunk_cuda.cu"
extern CloverleafCudaChunk chunk;

__global__ void device_PdV_cuda_kernel_predict
(int x_min, int x_max, int y_min, int y_max,
Expand Down Expand Up @@ -38,25 +36,25 @@ const double * __restrict const yvel1)
double recip_volume, energy_change, min_cell_volume,
right_flux, left_flux, top_flux, bottom_flux, total_flux;

if(row > 1 && column > 1
&& row < y_max+2 && column < x_max+2)
if (row >= (y_min + 1) && row <= (y_max + 1)
&& column >= (x_min + 1) && column <= (x_max + 1))
{
left_flux = (xarea[THARR2D(0, 0, 1)]
* (xvel0[THARR2D(0, 0, 1)] + xvel0[THARR2D(0, 0, 1)]
+ xvel0[THARR2D(0, 1, 1)] + xvel0[THARR2D(0, 1, 1)]))
* (xvel0[THARR2D(0, 0, 1)] + xvel0[THARR2D(0, 1, 1)]
+ xvel0[THARR2D(0, 0, 1)] + xvel0[THARR2D(0, 1, 1)]))
* 0.25 * dt * 0.5;
right_flux = (xarea[THARR2D(1, 0, 1)]
* (xvel0[THARR2D(1, 0, 1)] + xvel0[THARR2D(1, 0, 1)]
+ xvel0[THARR2D(1, 1, 1)] + xvel0[THARR2D(1, 1, 1)]))
* (xvel0[THARR2D(1, 0, 1)] + xvel0[THARR2D(1, 1, 1)]
+ xvel0[THARR2D(1, 0, 1)] + xvel0[THARR2D(1, 1, 1)]))
* 0.25 * dt * 0.5;

bottom_flux = (yarea[THARR2D(0, 0, 0)]
* (yvel0[THARR2D(0, 0, 1)] + yvel0[THARR2D(0, 0, 1)]
+ yvel0[THARR2D(1, 0, 1)] + yvel0[THARR2D(1, 0, 1)]))
* (yvel0[THARR2D(0, 0, 1)] + yvel0[THARR2D(1, 0, 1)]
+ yvel0[THARR2D(0, 0, 1)] + yvel0[THARR2D(1, 0, 1)]))
* 0.25 * dt * 0.5;
top_flux = (yarea[THARR2D(0, 1, 0)]
* (yvel0[THARR2D(0, 1, 1)] + yvel0[THARR2D(0, 1, 1)]
+ yvel0[THARR2D(1, 1, 1)] + yvel0[THARR2D(1, 1, 1)]))
* (yvel0[THARR2D(0, 1, 1)] + yvel0[THARR2D(1, 1, 1)]
+ yvel0[THARR2D(0, 1, 1)] + yvel0[THARR2D(1, 1, 1)]))
* 0.25 * dt * 0.5;

total_flux = right_flux - left_flux + top_flux - bottom_flux;
Expand Down Expand Up @@ -89,9 +87,11 @@ const double * __restrict const yvel1)
density1[THARR2D(0, 0, 0)] = density0[THARR2D(0, 0, 0)] * volume_change;
}

//reduction to get error conditon, if any
Reduce< BLOCK_SZ/2 >::run(err_cond_kernel, error_condition, max_func);

/*
__syncthreads();
for(size_t offset = blockDim.x / 2; offset > 0; offset /= 2)
for(int offset = blockDim.x / 2; offset > 0; offset /= 2)
{
if(threadIdx.x < offset)
{
Expand All @@ -101,6 +101,7 @@ const double * __restrict const yvel1)
__syncthreads();
}
error_condition[blockIdx.x] = err_cond_kernel[0];;
*/
}

__global__ void device_PdV_cuda_kernel_not_predict
Expand Down Expand Up @@ -130,25 +131,25 @@ const double * __restrict const yvel1)
double recip_volume, energy_change, min_cell_volume,
right_flux, left_flux, top_flux, bottom_flux, total_flux;

if(row > 1 && column > 1
&& row < y_max+2 && column < x_max+2)
if (row >= (y_min + 1) && row <= (y_max + 1)
&& column >= (x_min + 1) && column <= (x_max + 1))
{
left_flux = (xarea[THARR2D(0, 0, 1)]
* (xvel0[THARR2D(0, 0, 1)] + xvel1[THARR2D(0, 0, 1)]
+ xvel0[THARR2D(0, 1, 1)] + xvel1[THARR2D(0, 1, 1)]))
* (xvel0[THARR2D(0, 0, 1)] + xvel0[THARR2D(0, 1, 1)]
+ xvel1[THARR2D(0, 0, 1)] + xvel1[THARR2D(0, 1, 1)]))
* 0.25 * dt;
right_flux = (xarea[THARR2D(1, 0, 1)]
* (xvel0[THARR2D(1, 0, 1)] + xvel1[THARR2D(1, 0, 1)]
+ xvel0[THARR2D(1, 1, 1)] + xvel1[THARR2D(1, 1, 1)]))
* (xvel0[THARR2D(1, 0, 1)] + xvel0[THARR2D(1, 1, 1)]
+ xvel1[THARR2D(1, 0, 1)] + xvel1[THARR2D(1, 1, 1)]))
* 0.25 * dt;

bottom_flux = (yarea[THARR2D(0, 0, 0)]
* (yvel0[THARR2D(0, 0, 1)] + yvel1[THARR2D(0, 0, 1)]
+ yvel0[THARR2D(1, 0, 1)] + yvel1[THARR2D(1, 0, 1)]))
* (yvel0[THARR2D(0, 0, 1)] + yvel0[THARR2D(1, 0, 1)]
+ yvel1[THARR2D(0, 0, 1)] + yvel1[THARR2D(1, 0, 1)]))
* 0.25 * dt;
top_flux = (yarea[THARR2D(0, 1, 0)]
* (yvel0[THARR2D(0, 1, 1)] + yvel1[THARR2D(0, 1, 1)]
+ yvel0[THARR2D(1, 1, 1)] + yvel1[THARR2D(1, 1, 1)]))
* (yvel0[THARR2D(0, 1, 1)] + yvel0[THARR2D(1, 1, 1)]
+ yvel1[THARR2D(0, 1, 1)] + yvel1[THARR2D(1, 1, 1)]))
* 0.25 * dt;

total_flux = right_flux - left_flux + top_flux - bottom_flux;
Expand Down Expand Up @@ -181,8 +182,11 @@ const double * __restrict const yvel1)

}

Reduce< BLOCK_SZ/2 >::run(err_cond_kernel, error_condition, max_func);

/*
__syncthreads();
for(size_t offset = blockDim.x / 2; offset > 0; offset /= 2)
for(int offset = blockDim.x / 2; offset > 0; offset /= 2)
{
if(threadIdx.x < offset)
{
Expand All @@ -192,110 +196,7 @@ const double * __restrict const yvel1)
__syncthreads();
}
error_condition[blockIdx.x] = err_cond_kernel[0];;
}

void PdV_cuda
(int error_condition,int predict,int x_min,int x_max,int y_min,int y_max,
double dt,
double *xarea,
double *yarea,
double *volume,
double *density0,
double *density1,
double *energy0,
double *energy1,
double *pressure,
double *viscosity,
double *xvel0,
double *yvel0,
double *xvel1,
double *yvel1)
{
pointer_storage.setSize(x_max, y_max);

double* xarea_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, xarea, BUFSZ2D(1, 0));
double* yarea_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, yarea, BUFSZ2D(0, 1));

double* density0_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, density0, BUFSZ2D(0, 0));
double* energy0_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, energy0, BUFSZ2D(0, 0));

double* volume_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, volume, BUFSZ2D(0, 0));
double* pressure_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, pressure, BUFSZ2D(0, 0));
double* viscosity_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, viscosity, BUFSZ2D(0, 0));

double* xvel0_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, xvel0, BUFSZ2D(1, 1));
double* xvel1_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, xvel1, BUFSZ2D(1, 1));
double* yvel0_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, yvel0, BUFSZ2D(1, 1));
double* yvel1_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, yvel1, BUFSZ2D(1, 1));

double* energy1_d = pointer_storage.getDevStorage(__LINE__, __FILE__);
double* density1_d = pointer_storage.getDevStorage(__LINE__, __FILE__);

size_t num_blocks = (((x_max+4)*(y_max+4))/BLOCK_SZ);
//error condition
thrust::device_ptr<int> reduce_ptr_1 =
thrust::device_malloc<int>(num_blocks*sizeof(int));
int* err_condition_arr_d = thrust::raw_pointer_cast(reduce_ptr_1);

#ifdef TIME_KERNELS
_CUDA_BEGIN_PROFILE_name(device);
#endif

if(predict)
{
device_PdV_cuda_kernel_predict<<< ((x_max+4)*(y_max+4))/BLOCK_SZ, BLOCK_SZ >>>
(x_min, x_max, y_min, y_max, dt, err_condition_arr_d,
xarea_d, yarea_d, volume_d, density0_d, density1_d,
energy0_d, energy1_d, pressure_d, viscosity_d,
xvel0_d, yvel0_d, xvel1_d, yvel1_d);
}
else
{
device_PdV_cuda_kernel_not_predict<<< ((x_max+4)*(y_max+4))/BLOCK_SZ, BLOCK_SZ >>>
(x_min, x_max, y_min, y_max, dt, err_condition_arr_d,
xarea_d, yarea_d, volume_d, density0_d, density1_d,
energy0_d, energy1_d, pressure_d, viscosity_d,
xvel0_d, yvel0_d, xvel1_d, yvel1_d);
}

#ifdef TIME_KERNELS
_CUDA_END_PROFILE_name(device);
#endif

errChk(__LINE__, __FILE__);

pointer_storage.freeDevStorageAndCopy(energy1_d, energy1, BUFSZ2D(0, 0));
pointer_storage.freeDevStorageAndCopy(density1_d, density1, BUFSZ2D(0, 0));

pointer_storage.freeDevStorage(xarea_d);
pointer_storage.freeDevStorage(yarea_d);
pointer_storage.freeDevStorage(volume_d);
pointer_storage.freeDevStorage(pressure_d);
pointer_storage.freeDevStorage(viscosity_d);
pointer_storage.freeDevStorage(xvel0_d);
pointer_storage.freeDevStorage(yvel0_d);
pointer_storage.freeDevStorage(xvel1_d);
pointer_storage.freeDevStorage(yvel1_d);
pointer_storage.freeDevStorage(energy0_d);
pointer_storage.freeDevStorage(density0_d);

/*
int err_cond = thrust::reduce(reduce_ptr_1,
reduce_ptr_1 + num_blocks,
0, thrust::maximum<int>());
// */
int err_cond = *thrust::max_element(reduce_ptr_1, reduce_ptr_1 + num_blocks);
thrust::device_free(reduce_ptr_1);

if(err_cond == 1)
{
std::cerr << "Negative volume in PdV kernel" << std::endl;
}
else if(err_cond == 2)
{
std::cerr << "Negative cell volume in PdV kernel" << std::endl;
}

*/
}

extern "C" void pdv_kernel_cuda_
Expand All @@ -311,31 +212,16 @@ double *viscosity,
double *xvel0,
double *xvel1,
double *yvel0,
double *yvel1)
double *yvel1,
double *unused_array)
{
#ifdef TIME_KERNELS
_CUDA_BEGIN_PROFILE_name(host);
#endif
#ifndef CUDA_RESIDENT
PdV_cuda(*errorcondition, *prdct, *xmin, *xmax, *ymin, *ymax,*dtbyt,
xarea, yarea, volume, density0, density1, energy0, energy1,
pressure, viscosity, xvel0, yvel0, xvel1, yvel1);
#else
chunk.PdV_kernel(errorcondition, *prdct, *dtbyt);
#endif
#ifdef TIME_KERNELS
_CUDA_END_PROFILE_name(host);
#endif
}


void CloverleafCudaChunk::PdV_kernel
(int* error_condition, int predict, double dt)
{

#ifdef TIME_KERNELS
_CUDA_BEGIN_PROFILE_name(device);
#endif
_CUDA_BEGIN_PROFILE_name(device);

if(predict)
{
Expand All @@ -356,9 +242,7 @@ _CUDA_BEGIN_PROFILE_name(device);
errChk(__LINE__, __FILE__);
}

#ifdef TIME_KERNELS
_CUDA_END_PROFILE_name(device);
#endif
_CUDA_END_PROFILE_name(device);

int err_cond = *thrust::max_element(reduce_pdv,
reduce_pdv + num_blocks);
Expand Down
7 changes: 3 additions & 4 deletions clover.in
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
state 1 density=0.2 energy=1.0
state 2 density=1.0 energy=2.5 geometry=rectangle xmin=0.0 xmax=5.0 ymin=0.0 ymax=2.0

x_cells=960
y_cells=960
x_cells=3840
y_cells=3840

xmin=0.0
ymin=0.0
Expand All @@ -14,9 +14,8 @@
initial_timestep=0.04
timestep_rise=1.5
max_timestep=0.04
end_time=0.5
end_time=95.5
end_step=87

use_cuda_kernels

*endclover
Loading

0 comments on commit a3417a3

Please sign in to comment.