Bump CUDA version with NVIDIA/Bristol optimisations

UoB-HPC · Jan 21, 2013 · a3417a3 · a3417a3
1 parent a461542
commit a3417a3
Show file tree

Hide file tree

Showing 7 changed files with 476 additions and 792 deletions.
diff --git a/Makefile b/Makefile
@@ -13,9 +13,9 @@ OMP=$(OMP_$(COMPILER))
 
 FLAGS_INTEL     = -O3 -ipo
 FLAGS_SUN       = -O2
-FLAGS_GNU       = -O2 -g
-FLAGS_CRAY      = -em -ra
-FLAGS_PGI       = -O2 -Mpreprocess -g
+FLAGS_GNU       = -O2
+FLAGS_CRAY      = -O2 -em -ra -f free -F
+FLAGS_PGI       = -O2 -Mpreprocess
 FLAGS_PATHSCALE = -O2
 FLAGS_XLF       = -O2
 FLAGS_          = -O2
@@ -28,6 +28,14 @@ CFLAGS_PATHSCALE = -O2
 CFLAGS_XLF       = -O2
 CFLAGS_          = -O2
 
+# flags for nvcc
+# set NV_ARCH to select the correct one
+CODE_GEN_FERMI=-gencode arch=compute_20,code=sm_21
+CODE_GEN_KEPLER=-gencode arch=compute_30,code=sm_30
+
+# requires CUDA_HOME to be set - not the same on all machines
+NV_FLAGS=-O2 -w -c -I $(CUDA_HOME)/include $(CODE_GEN_$(NV_ARCH)) -DNO_ERR_CHK
+
 ifdef DEBUG
   FLAGS_INTEL     = -O0 -g -debug all -check all -traceback -check noarg_temp_created
   FLAGS_SUN       = -O0 -xopenmp=noopt -g
@@ -39,6 +47,7 @@ ifdef DEBUG
   FLAGS_          = -O0 -g
   CFLAGS_INTEL    = -O0 -g -c -debug all -traceback -restrict
   CFLAGS_CRAY     = -O0 -g -em -eD
+  NV_FLAGS += -g -G
 endif
 
 ifdef IEEE
@@ -56,7 +65,7 @@ CPPLIBS_PGI=-pgcpplibs
 CPPLIBS_GNU=-lstdc++
 CPPLIBS=$(CPPLIBS_$(COMPILER))
 
-FLAGS=$(FLAGS_$(COMPILER)) $(OMP) $(I3E) $(OPTIONS)
+FLAGS=$(FLAGS_$(COMPILER)) $(OMP) $(I3E) $(OPTIONS) $(RESIDENT_FLAG)
 CFLAGS=$(CFLAGS_$(COMPILER)) $(OMP) $(I3E) $(C_OPTIONS) -c
 MPI_COMPILER=mpif90
 C_MPI_COMPILER=mpicc
@@ -81,7 +90,7 @@ CUDA_FILES=\
 	update_halo_kernel_cuda.o
 
 all: clover_leaf
-	rm -f *.o *.mod *genmod*
+	rm -f *.o *.mod *genmod* *.lst
 
 clover_leaf: cuda_clover c_lover *.f90
 	$(MPI_COMPILER) $(FLAGS)	\
@@ -138,7 +147,6 @@ clover_leaf: cuda_clover c_lover *.f90
 	ideal_gas_kernel_c.o            \
 	advec_cell_kernel_c.o           \
 	viscosity_kernel_c.o            \
-	timer_c.o                       \
 	$(CUDA_FILES)	\
 	-L $(CUDA_HOME)/lib64 -lcudart $(CPPLIBS) 	\
 	-o clover_leaf
@@ -153,16 +161,13 @@ c_lover:
 	ideal_gas_kernel_c.c            \
 	viscosity_kernel_c.c            \
 	advec_cell_kernel_c.c			\
-	advec_mom_kernel_c.c            \
-	timer_c.c
+	advec_mom_kernel_c.c            
 
-#for kepler use: -gencode arch=compute_30,code=sm_30
 cuda_clover: $(CUDA_FILES)
-	@echo "NB - This creates code for Fermi architecture which supports double precision natively - removing the gencode specification statement will create code that operates on floating point numbers instead, at the cost of a possible loss of precision"
 
 %.o: %.cu 
-	nvcc $(CFLAGS_GNU) -c -DCUDA_RESIDENT -gencode arch=compute_20,code=sm_21 -I $(CUDA_HOME)/include $< #-DTIME_KERNELS 
+	nvcc $(NV_FLAGS) $<
 
 clean:
-	rm -f *.o *.mod *genmod*
+	rm -f *.o *.mod *genmod* *.lst
 
diff --git a/PdV_kernel_cuda.cu b/PdV_kernel_cuda.cu
@@ -4,12 +4,10 @@
 #include "ftocmacros.h"
 #include <algorithm>
 
-#include "chunk_cuda.cu"
-extern CloverleafCudaChunk chunk;
-
 #include "omp.h"
 
-extern CudaDevPtrStorage pointer_storage;
+#include "chunk_cuda.cu"
+extern CloverleafCudaChunk chunk;
 
 __global__ void device_PdV_cuda_kernel_predict
 (int x_min, int x_max, int y_min, int y_max, 
@@ -38,25 +36,25 @@ const double * __restrict const yvel1)
     double recip_volume, energy_change, min_cell_volume,
         right_flux, left_flux, top_flux, bottom_flux, total_flux;
 
-    if(row > 1 && column > 1
-    && row < y_max+2 && column < x_max+2)
+    if (row >= (y_min + 1) && row <= (y_max + 1)
+    && column >= (x_min + 1) && column <= (x_max + 1))
     {
         left_flux   = (xarea[THARR2D(0, 0, 1)]
-            * (xvel0[THARR2D(0, 0, 1)] + xvel0[THARR2D(0, 0, 1)] 
-            + xvel0[THARR2D(0, 1, 1)] + xvel0[THARR2D(0, 1, 1)]))
+            * (xvel0[THARR2D(0, 0, 1)] + xvel0[THARR2D(0, 1, 1)] 
+            + xvel0[THARR2D(0, 0, 1)] + xvel0[THARR2D(0, 1, 1)]))
             * 0.25 * dt * 0.5;
         right_flux  = (xarea[THARR2D(1, 0, 1)]
-            * (xvel0[THARR2D(1, 0, 1)] + xvel0[THARR2D(1, 0, 1)] 
-            + xvel0[THARR2D(1, 1, 1)] + xvel0[THARR2D(1, 1, 1)]))
+            * (xvel0[THARR2D(1, 0, 1)] + xvel0[THARR2D(1, 1, 1)] 
+            + xvel0[THARR2D(1, 0, 1)] + xvel0[THARR2D(1, 1, 1)]))
             * 0.25 * dt * 0.5;
 
         bottom_flux = (yarea[THARR2D(0, 0, 0)]
-            * (yvel0[THARR2D(0, 0, 1)] + yvel0[THARR2D(0, 0, 1)] 
-            + yvel0[THARR2D(1, 0, 1)] + yvel0[THARR2D(1, 0, 1)]))
+            * (yvel0[THARR2D(0, 0, 1)] + yvel0[THARR2D(1, 0, 1)] 
+            + yvel0[THARR2D(0, 0, 1)] + yvel0[THARR2D(1, 0, 1)]))
             * 0.25 * dt * 0.5;
         top_flux    = (yarea[THARR2D(0, 1, 0)]
-            * (yvel0[THARR2D(0, 1, 1)] + yvel0[THARR2D(0, 1, 1)] 
-            + yvel0[THARR2D(1, 1, 1)] + yvel0[THARR2D(1, 1, 1)]))
+            * (yvel0[THARR2D(0, 1, 1)] + yvel0[THARR2D(1, 1, 1)] 
+            + yvel0[THARR2D(0, 1, 1)] + yvel0[THARR2D(1, 1, 1)]))
             * 0.25 * dt * 0.5;
 
         total_flux = right_flux - left_flux + top_flux - bottom_flux;
@@ -89,9 +87,11 @@ const double * __restrict const yvel1)
         density1[THARR2D(0, 0, 0)] = density0[THARR2D(0, 0, 0)] * volume_change;
     }
 
-    //reduction to get error conditon, if any
+    Reduce< BLOCK_SZ/2 >::run(err_cond_kernel, error_condition, max_func);
+
+    /*
     __syncthreads();
-    for(size_t offset = blockDim.x / 2; offset > 0; offset /= 2)
+    for(int offset = blockDim.x / 2; offset > 0; offset /= 2)
     {
         if(threadIdx.x < offset)
         {
@@ -101,6 +101,7 @@ const double * __restrict const yvel1)
         __syncthreads();
     }
     error_condition[blockIdx.x] = err_cond_kernel[0];;
+    */
 }
 
 __global__ void device_PdV_cuda_kernel_not_predict
@@ -130,25 +131,25 @@ const double * __restrict const yvel1)
     double recip_volume, energy_change, min_cell_volume,
         right_flux, left_flux, top_flux, bottom_flux, total_flux;
 
-    if(row > 1 && column > 1
-    && row < y_max+2 && column < x_max+2)
+    if (row >= (y_min + 1) && row <= (y_max + 1)
+    && column >= (x_min + 1) && column <= (x_max + 1))
     {
         left_flux   = (xarea[THARR2D(0, 0, 1)]
-            * (xvel0[THARR2D(0, 0, 1)] + xvel1[THARR2D(0, 0, 1)] 
-            + xvel0[THARR2D(0, 1, 1)] + xvel1[THARR2D(0, 1, 1)]))
+            * (xvel0[THARR2D(0, 0, 1)] + xvel0[THARR2D(0, 1, 1)] 
+            + xvel1[THARR2D(0, 0, 1)] + xvel1[THARR2D(0, 1, 1)]))
             * 0.25 * dt;
         right_flux  = (xarea[THARR2D(1, 0, 1)]
-            * (xvel0[THARR2D(1, 0, 1)] + xvel1[THARR2D(1, 0, 1)] 
-            + xvel0[THARR2D(1, 1, 1)] + xvel1[THARR2D(1, 1, 1)]))
+            * (xvel0[THARR2D(1, 0, 1)] + xvel0[THARR2D(1, 1, 1)] 
+            + xvel1[THARR2D(1, 0, 1)] + xvel1[THARR2D(1, 1, 1)]))
             * 0.25 * dt;
 
         bottom_flux = (yarea[THARR2D(0, 0, 0)]
-            * (yvel0[THARR2D(0, 0, 1)] + yvel1[THARR2D(0, 0, 1)] 
-            + yvel0[THARR2D(1, 0, 1)] + yvel1[THARR2D(1, 0, 1)]))
+            * (yvel0[THARR2D(0, 0, 1)] + yvel0[THARR2D(1, 0, 1)] 
+            + yvel1[THARR2D(0, 0, 1)] + yvel1[THARR2D(1, 0, 1)]))
             * 0.25 * dt;
         top_flux    = (yarea[THARR2D(0, 1, 0)]
-            * (yvel0[THARR2D(0, 1, 1)] + yvel1[THARR2D(0, 1, 1)] 
-            + yvel0[THARR2D(1, 1, 1)] + yvel1[THARR2D(1, 1, 1)]))
+            * (yvel0[THARR2D(0, 1, 1)] + yvel0[THARR2D(1, 1, 1)] 
+            + yvel1[THARR2D(0, 1, 1)] + yvel1[THARR2D(1, 1, 1)]))
             * 0.25 * dt;
 
         total_flux = right_flux - left_flux + top_flux - bottom_flux;
@@ -181,8 +182,11 @@ const double * __restrict const yvel1)
 
     }
 
+    Reduce< BLOCK_SZ/2 >::run(err_cond_kernel, error_condition, max_func);
+
+    /*
     __syncthreads();
-    for(size_t offset = blockDim.x / 2; offset > 0; offset /= 2)
+    for(int offset = blockDim.x / 2; offset > 0; offset /= 2)
     {
         if(threadIdx.x < offset)
         {
@@ -192,110 +196,7 @@ const double * __restrict const yvel1)
         __syncthreads();
     }
     error_condition[blockIdx.x] = err_cond_kernel[0];;
-}
-
-void PdV_cuda
-(int error_condition,int predict,int x_min,int x_max,int y_min,int y_max,
-double dt,
-double *xarea,
-double *yarea,
-double *volume,
-double *density0,
-double *density1,
-double *energy0,
-double *energy1,
-double *pressure,
-double *viscosity,
-double *xvel0,
-double *yvel0,
-double *xvel1,
-double *yvel1)
-{
-    pointer_storage.setSize(x_max, y_max);
-
-    double* xarea_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, xarea, BUFSZ2D(1, 0));
-    double* yarea_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, yarea, BUFSZ2D(0, 1));
-
-    double* density0_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, density0, BUFSZ2D(0, 0));
-    double* energy0_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, energy0, BUFSZ2D(0, 0));
-
-    double* volume_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, volume, BUFSZ2D(0, 0));
-    double* pressure_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, pressure, BUFSZ2D(0, 0));
-    double* viscosity_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, viscosity, BUFSZ2D(0, 0));
-
-    double* xvel0_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, xvel0, BUFSZ2D(1, 1));
-    double* xvel1_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, xvel1, BUFSZ2D(1, 1));
-    double* yvel0_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, yvel0, BUFSZ2D(1, 1));
-    double* yvel1_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, yvel1, BUFSZ2D(1, 1));
-
-    double* energy1_d = pointer_storage.getDevStorage(__LINE__, __FILE__);
-    double* density1_d = pointer_storage.getDevStorage(__LINE__, __FILE__);
-
-    size_t num_blocks = (((x_max+4)*(y_max+4))/BLOCK_SZ);
-    //error condition
-    thrust::device_ptr<int> reduce_ptr_1 =
-        thrust::device_malloc<int>(num_blocks*sizeof(int));
-    int* err_condition_arr_d = thrust::raw_pointer_cast(reduce_ptr_1);
-
-#ifdef TIME_KERNELS
-_CUDA_BEGIN_PROFILE_name(device);
-#endif
-
-    if(predict)
-    {
-        device_PdV_cuda_kernel_predict<<< ((x_max+4)*(y_max+4))/BLOCK_SZ, BLOCK_SZ >>>
-        (x_min, x_max, y_min, y_max, dt, err_condition_arr_d,
-            xarea_d, yarea_d, volume_d, density0_d, density1_d,
-            energy0_d, energy1_d, pressure_d, viscosity_d,
-            xvel0_d, yvel0_d, xvel1_d, yvel1_d);
-    }
-    else
-    {
-        device_PdV_cuda_kernel_not_predict<<< ((x_max+4)*(y_max+4))/BLOCK_SZ, BLOCK_SZ >>>
-        (x_min, x_max, y_min, y_max, dt, err_condition_arr_d,
-            xarea_d, yarea_d, volume_d, density0_d, density1_d,
-            energy0_d, energy1_d, pressure_d, viscosity_d,
-            xvel0_d, yvel0_d, xvel1_d, yvel1_d);
-    }
-
-#ifdef TIME_KERNELS
-_CUDA_END_PROFILE_name(device);
-#endif
-
-errChk(__LINE__, __FILE__);
-
-    pointer_storage.freeDevStorageAndCopy(energy1_d, energy1, BUFSZ2D(0, 0));
-    pointer_storage.freeDevStorageAndCopy(density1_d, density1, BUFSZ2D(0, 0));
-
-    pointer_storage.freeDevStorage(xarea_d);
-    pointer_storage.freeDevStorage(yarea_d);
-    pointer_storage.freeDevStorage(volume_d);
-    pointer_storage.freeDevStorage(pressure_d);
-    pointer_storage.freeDevStorage(viscosity_d);
-    pointer_storage.freeDevStorage(xvel0_d);
-    pointer_storage.freeDevStorage(yvel0_d);
-    pointer_storage.freeDevStorage(xvel1_d);
-    pointer_storage.freeDevStorage(yvel1_d);
-    pointer_storage.freeDevStorage(energy0_d);
-    pointer_storage.freeDevStorage(density0_d);
-
-    /*
-    int err_cond = thrust::reduce(reduce_ptr_1,
-        reduce_ptr_1 + num_blocks,
-        0, thrust::maximum<int>());
-    // */
-    int err_cond = *thrust::max_element(reduce_ptr_1, reduce_ptr_1 + num_blocks);
-    thrust::device_free(reduce_ptr_1);
-
-    if(err_cond == 1)
-    {
-        std::cerr << "Negative volume in PdV kernel" << std::endl;
-    }
-    else if(err_cond == 2)
-    {
-        std::cerr << "Negative cell volume in PdV kernel" << std::endl;
-    }
-
+    */
 }
 
 extern "C" void pdv_kernel_cuda_
@@ -311,31 +212,16 @@ double *viscosity,
 double *xvel0,
 double *xvel1,
 double *yvel0,
-double *yvel1)
+double *yvel1,
+double *unused_array)
 {
-#ifdef TIME_KERNELS
-_CUDA_BEGIN_PROFILE_name(host);
-#endif
-    #ifndef CUDA_RESIDENT
-    PdV_cuda(*errorcondition, *prdct,  *xmin, *xmax, *ymin, *ymax,*dtbyt,
-        xarea, yarea, volume, density0, density1, energy0, energy1,
-        pressure, viscosity, xvel0, yvel0, xvel1, yvel1);
-    #else
     chunk.PdV_kernel(errorcondition, *prdct, *dtbyt);
-    #endif
-#ifdef TIME_KERNELS
-_CUDA_END_PROFILE_name(host);
-#endif
 }
 
-
 void CloverleafCudaChunk::PdV_kernel
 (int* error_condition, int predict, double dt)
 {
-
-#ifdef TIME_KERNELS
-_CUDA_BEGIN_PROFILE_name(device);
-#endif
+    _CUDA_BEGIN_PROFILE_name(device);
 
     if(predict)
     {
@@ -356,9 +242,7 @@ _CUDA_BEGIN_PROFILE_name(device);
         errChk(__LINE__, __FILE__);
     }
 
-#ifdef TIME_KERNELS
-_CUDA_END_PROFILE_name(device);
-#endif
+    _CUDA_END_PROFILE_name(device);
 
     int err_cond = *thrust::max_element(reduce_pdv,
         reduce_pdv + num_blocks);

diff --git a/clover.in b/clover.in
@@ -3,8 +3,8 @@
  state 1 density=0.2 energy=1.0
  state 2 density=1.0 energy=2.5 geometry=rectangle xmin=0.0 xmax=5.0 ymin=0.0 ymax=2.0
 
- x_cells=960
- y_cells=960
+ x_cells=3840
+ y_cells=3840
 
  xmin=0.0
  ymin=0.0
@@ -14,9 +14,8 @@
  initial_timestep=0.04
  timestep_rise=1.5
  max_timestep=0.04
- end_time=0.5
+ end_time=95.5
  end_step=87
-
  use_cuda_kernels
 
 *endclover