From a3417a32d43898c959a5df7b444cf0f2ddb36213 Mon Sep 17 00:00:00 2001
From: David Beckingsale <davidbeckingsale@gmail.com>
Date: Mon, 21 Jan 2013 15:08:02 +0000
Subject: [PATCH] Bump CUDA version with NVIDIA/Bristol optimisations

---
 Makefile                   |  29 ++--
 PdV_kernel_cuda.cu         | 188 ++++----------------
 clover.in                  |   7 +-
 cuda_common.cu             | 190 ++++++++++++---------
 mpi_transfers_cuda.cu      | 254 ++++++++-------------------
 pack_buffer_kernels.cu     | 260 ++++++++++++++--------------
 update_halo_kernel_cuda.cu | 340 ++++++++++++-------------------------
 7 files changed, 476 insertions(+), 792 deletions(-)
diff --git a/Makefile b/Makefile
index a8a464b..4d59920 100644
--- a/Makefile
+++ b/Makefile
@@ -13,9 +13,9 @@ OMP=$(OMP_$(COMPILER))
 
 FLAGS_INTEL     = -O3 -ipo
 FLAGS_SUN       = -O2
-FLAGS_GNU       = -O2 -g
-FLAGS_CRAY      = -em -ra
-FLAGS_PGI       = -O2 -Mpreprocess -g
+FLAGS_GNU       = -O2
+FLAGS_CRAY      = -O2 -em -ra -f free -F
+FLAGS_PGI       = -O2 -Mpreprocess
 FLAGS_PATHSCALE = -O2
 FLAGS_XLF       = -O2
 FLAGS_          = -O2
@@ -28,6 +28,14 @@ CFLAGS_PATHSCALE = -O2
 CFLAGS_XLF       = -O2
 CFLAGS_          = -O2
 
+# flags for nvcc
+# set NV_ARCH to select the correct one
+CODE_GEN_FERMI=-gencode arch=compute_20,code=sm_21
+CODE_GEN_KEPLER=-gencode arch=compute_30,code=sm_30
+
+# requires CUDA_HOME to be set - not the same on all machines
+NV_FLAGS=-O2 -w -c -I $(CUDA_HOME)/include $(CODE_GEN_$(NV_ARCH)) -DNO_ERR_CHK
+
 ifdef DEBUG
   FLAGS_INTEL     = -O0 -g -debug all -check all -traceback -check noarg_temp_created
   FLAGS_SUN       = -O0 -xopenmp=noopt -g
@@ -39,6 +47,7 @@ ifdef DEBUG
   FLAGS_          = -O0 -g
   CFLAGS_INTEL    = -O0 -g -c -debug all -traceback -restrict
   CFLAGS_CRAY     = -O0 -g -em -eD
+  NV_FLAGS += -g -G
 endif
 
 ifdef IEEE
@@ -56,7 +65,7 @@ CPPLIBS_PGI=-pgcpplibs
 CPPLIBS_GNU=-lstdc++
 CPPLIBS=$(CPPLIBS_$(COMPILER))
 
-FLAGS=$(FLAGS_$(COMPILER)) $(OMP) $(I3E) $(OPTIONS)
+FLAGS=$(FLAGS_$(COMPILER)) $(OMP) $(I3E) $(OPTIONS) $(RESIDENT_FLAG)
 CFLAGS=$(CFLAGS_$(COMPILER)) $(OMP) $(I3E) $(C_OPTIONS) -c
 MPI_COMPILER=mpif90
 C_MPI_COMPILER=mpicc
@@ -81,7 +90,7 @@ CUDA_FILES=\
 	update_halo_kernel_cuda.o
 
 all: clover_leaf
-	rm -f *.o *.mod *genmod*
+	rm -f *.o *.mod *genmod* *.lst
 
 clover_leaf: cuda_clover c_lover *.f90
 	$(MPI_COMPILER) $(FLAGS)	\
@@ -138,7 +147,6 @@ clover_leaf: cuda_clover c_lover *.f90
 	ideal_gas_kernel_c.o            \
 	advec_cell_kernel_c.o           \
 	viscosity_kernel_c.o            \
-	timer_c.o                       \
 	$(CUDA_FILES)	\
 	-L $(CUDA_HOME)/lib64 -lcudart $(CPPLIBS) 	\
 	-o clover_leaf
@@ -153,16 +161,13 @@ c_lover:
 	ideal_gas_kernel_c.c            \
 	viscosity_kernel_c.c            \
 	advec_cell_kernel_c.c			\
-	advec_mom_kernel_c.c            \
-	timer_c.c
+	advec_mom_kernel_c.c            
 
-#for kepler use: -gencode arch=compute_30,code=sm_30
 cuda_clover: $(CUDA_FILES)
-	@echo "NB - This creates code for Fermi architecture which supports double precision natively - removing the gencode specification statement will create code that operates on floating point numbers instead, at the cost of a possible loss of precision"
 
 %.o: %.cu 
-	nvcc $(CFLAGS_GNU) -c -DCUDA_RESIDENT -gencode arch=compute_20,code=sm_21 -I $(CUDA_HOME)/include $< #-DTIME_KERNELS 
+	nvcc $(NV_FLAGS) $<
 
 clean:
-	rm -f *.o *.mod *genmod*
+	rm -f *.o *.mod *genmod* *.lst
 
diff --git a/PdV_kernel_cuda.cu b/PdV_kernel_cuda.cu
index 6cff99a..5ed23f5 100644
--- a/PdV_kernel_cuda.cu
+++ b/PdV_kernel_cuda.cu
@@ -4,12 +4,10 @@
 #include "ftocmacros.h"
 #include <algorithm>
 
-#include "chunk_cuda.cu"
-extern CloverleafCudaChunk chunk;
-
 #include "omp.h"
 
-extern CudaDevPtrStorage pointer_storage;
+#include "chunk_cuda.cu"
+extern CloverleafCudaChunk chunk;
 
 __global__ void device_PdV_cuda_kernel_predict
 (int x_min, int x_max, int y_min, int y_max, 
@@ -38,25 +36,25 @@ const double * __restrict const yvel1)
     double recip_volume, energy_change, min_cell_volume,
         right_flux, left_flux, top_flux, bottom_flux, total_flux;
     
-    if(row > 1 && column > 1
-    && row < y_max+2 && column < x_max+2)
+    if (row >= (y_min + 1) && row <= (y_max + 1)
+    && column >= (x_min + 1) && column <= (x_max + 1))
     {
         left_flux   = (xarea[THARR2D(0, 0, 1)]
-            * (xvel0[THARR2D(0, 0, 1)] + xvel0[THARR2D(0, 0, 1)] 
-            + xvel0[THARR2D(0, 1, 1)] + xvel0[THARR2D(0, 1, 1)]))
+            * (xvel0[THARR2D(0, 0, 1)] + xvel0[THARR2D(0, 1, 1)] 
+            + xvel0[THARR2D(0, 0, 1)] + xvel0[THARR2D(0, 1, 1)]))
             * 0.25 * dt * 0.5;
         right_flux  = (xarea[THARR2D(1, 0, 1)]
-            * (xvel0[THARR2D(1, 0, 1)] + xvel0[THARR2D(1, 0, 1)] 
-            + xvel0[THARR2D(1, 1, 1)] + xvel0[THARR2D(1, 1, 1)]))
+            * (xvel0[THARR2D(1, 0, 1)] + xvel0[THARR2D(1, 1, 1)] 
+            + xvel0[THARR2D(1, 0, 1)] + xvel0[THARR2D(1, 1, 1)]))
             * 0.25 * dt * 0.5;
 
         bottom_flux = (yarea[THARR2D(0, 0, 0)]
-            * (yvel0[THARR2D(0, 0, 1)] + yvel0[THARR2D(0, 0, 1)] 
-            + yvel0[THARR2D(1, 0, 1)] + yvel0[THARR2D(1, 0, 1)]))
+            * (yvel0[THARR2D(0, 0, 1)] + yvel0[THARR2D(1, 0, 1)] 
+            + yvel0[THARR2D(0, 0, 1)] + yvel0[THARR2D(1, 0, 1)]))
             * 0.25 * dt * 0.5;
         top_flux    = (yarea[THARR2D(0, 1, 0)]
-            * (yvel0[THARR2D(0, 1, 1)] + yvel0[THARR2D(0, 1, 1)] 
-            + yvel0[THARR2D(1, 1, 1)] + yvel0[THARR2D(1, 1, 1)]))
+            * (yvel0[THARR2D(0, 1, 1)] + yvel0[THARR2D(1, 1, 1)] 
+            + yvel0[THARR2D(0, 1, 1)] + yvel0[THARR2D(1, 1, 1)]))
             * 0.25 * dt * 0.5;
 
         total_flux = right_flux - left_flux + top_flux - bottom_flux;
@@ -89,9 +87,11 @@ const double * __restrict const yvel1)
         density1[THARR2D(0, 0, 0)] = density0[THARR2D(0, 0, 0)] * volume_change;
     }
 
-    //reduction to get error conditon, if any
+    Reduce< BLOCK_SZ/2 >::run(err_cond_kernel, error_condition, max_func);
+    
+    /*
     __syncthreads();
-    for(size_t offset = blockDim.x / 2; offset > 0; offset /= 2)
+    for(int offset = blockDim.x / 2; offset > 0; offset /= 2)
     {
         if(threadIdx.x < offset)
         {
@@ -101,6 +101,7 @@ const double * __restrict const yvel1)
         __syncthreads();
     }
     error_condition[blockIdx.x] = err_cond_kernel[0];;
+    */
 }
 
 __global__ void device_PdV_cuda_kernel_not_predict
@@ -130,25 +131,25 @@ const double * __restrict const yvel1)
     double recip_volume, energy_change, min_cell_volume,
         right_flux, left_flux, top_flux, bottom_flux, total_flux;
     
-    if(row > 1 && column > 1
-    && row < y_max+2 && column < x_max+2)
+    if (row >= (y_min + 1) && row <= (y_max + 1)
+    && column >= (x_min + 1) && column <= (x_max + 1))
     {
         left_flux   = (xarea[THARR2D(0, 0, 1)]
-            * (xvel0[THARR2D(0, 0, 1)] + xvel1[THARR2D(0, 0, 1)] 
-            + xvel0[THARR2D(0, 1, 1)] + xvel1[THARR2D(0, 1, 1)]))
+            * (xvel0[THARR2D(0, 0, 1)] + xvel0[THARR2D(0, 1, 1)] 
+            + xvel1[THARR2D(0, 0, 1)] + xvel1[THARR2D(0, 1, 1)]))
             * 0.25 * dt;
         right_flux  = (xarea[THARR2D(1, 0, 1)]
-            * (xvel0[THARR2D(1, 0, 1)] + xvel1[THARR2D(1, 0, 1)] 
-            + xvel0[THARR2D(1, 1, 1)] + xvel1[THARR2D(1, 1, 1)]))
+            * (xvel0[THARR2D(1, 0, 1)] + xvel0[THARR2D(1, 1, 1)] 
+            + xvel1[THARR2D(1, 0, 1)] + xvel1[THARR2D(1, 1, 1)]))
             * 0.25 * dt;
 
         bottom_flux = (yarea[THARR2D(0, 0, 0)]
-            * (yvel0[THARR2D(0, 0, 1)] + yvel1[THARR2D(0, 0, 1)] 
-            + yvel0[THARR2D(1, 0, 1)] + yvel1[THARR2D(1, 0, 1)]))
+            * (yvel0[THARR2D(0, 0, 1)] + yvel0[THARR2D(1, 0, 1)] 
+            + yvel1[THARR2D(0, 0, 1)] + yvel1[THARR2D(1, 0, 1)]))
             * 0.25 * dt;
         top_flux    = (yarea[THARR2D(0, 1, 0)]
-            * (yvel0[THARR2D(0, 1, 1)] + yvel1[THARR2D(0, 1, 1)] 
-            + yvel0[THARR2D(1, 1, 1)] + yvel1[THARR2D(1, 1, 1)]))
+            * (yvel0[THARR2D(0, 1, 1)] + yvel0[THARR2D(1, 1, 1)] 
+            + yvel1[THARR2D(0, 1, 1)] + yvel1[THARR2D(1, 1, 1)]))
             * 0.25 * dt;
 
         total_flux = right_flux - left_flux + top_flux - bottom_flux;
@@ -181,8 +182,11 @@ const double * __restrict const yvel1)
 
     }
 
+    Reduce< BLOCK_SZ/2 >::run(err_cond_kernel, error_condition, max_func);
+    
+    /*
     __syncthreads();
-    for(size_t offset = blockDim.x / 2; offset > 0; offset /= 2)
+    for(int offset = blockDim.x / 2; offset > 0; offset /= 2)
     {
         if(threadIdx.x < offset)
         {
@@ -192,110 +196,7 @@ const double * __restrict const yvel1)
         __syncthreads();
     }
     error_condition[blockIdx.x] = err_cond_kernel[0];;
-}
-
-void PdV_cuda
-(int error_condition,int predict,int x_min,int x_max,int y_min,int y_max,
-double dt,
-double *xarea,
-double *yarea,
-double *volume,
-double *density0,
-double *density1,
-double *energy0,
-double *energy1,
-double *pressure,
-double *viscosity,
-double *xvel0,
-double *yvel0,
-double *xvel1,
-double *yvel1)
-{
-    pointer_storage.setSize(x_max, y_max);
-
-    double* xarea_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, xarea, BUFSZ2D(1, 0));
-    double* yarea_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, yarea, BUFSZ2D(0, 1));
-
-    double* density0_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, density0, BUFSZ2D(0, 0));
-    double* energy0_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, energy0, BUFSZ2D(0, 0));
-
-    double* volume_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, volume, BUFSZ2D(0, 0));
-    double* pressure_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, pressure, BUFSZ2D(0, 0));
-    double* viscosity_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, viscosity, BUFSZ2D(0, 0));
-
-    double* xvel0_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, xvel0, BUFSZ2D(1, 1));
-    double* xvel1_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, xvel1, BUFSZ2D(1, 1));
-    double* yvel0_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, yvel0, BUFSZ2D(1, 1));
-    double* yvel1_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, yvel1, BUFSZ2D(1, 1));
-
-    double* energy1_d = pointer_storage.getDevStorage(__LINE__, __FILE__);
-    double* density1_d = pointer_storage.getDevStorage(__LINE__, __FILE__);
-
-    size_t num_blocks = (((x_max+4)*(y_max+4))/BLOCK_SZ);
-    //error condition
-    thrust::device_ptr<int> reduce_ptr_1 =
-        thrust::device_malloc<int>(num_blocks*sizeof(int));
-    int* err_condition_arr_d = thrust::raw_pointer_cast(reduce_ptr_1);
-
-#ifdef TIME_KERNELS
-_CUDA_BEGIN_PROFILE_name(device);
-#endif
-
-    if(predict)
-    {
-        device_PdV_cuda_kernel_predict<<< ((x_max+4)*(y_max+4))/BLOCK_SZ, BLOCK_SZ >>>
-        (x_min, x_max, y_min, y_max, dt, err_condition_arr_d,
-            xarea_d, yarea_d, volume_d, density0_d, density1_d,
-            energy0_d, energy1_d, pressure_d, viscosity_d,
-            xvel0_d, yvel0_d, xvel1_d, yvel1_d);
-    }
-    else
-    {
-        device_PdV_cuda_kernel_not_predict<<< ((x_max+4)*(y_max+4))/BLOCK_SZ, BLOCK_SZ >>>
-        (x_min, x_max, y_min, y_max, dt, err_condition_arr_d,
-            xarea_d, yarea_d, volume_d, density0_d, density1_d,
-            energy0_d, energy1_d, pressure_d, viscosity_d,
-            xvel0_d, yvel0_d, xvel1_d, yvel1_d);
-    }
-
-#ifdef TIME_KERNELS
-_CUDA_END_PROFILE_name(device);
-#endif
-
-errChk(__LINE__, __FILE__);
-
-    pointer_storage.freeDevStorageAndCopy(energy1_d, energy1, BUFSZ2D(0, 0));
-    pointer_storage.freeDevStorageAndCopy(density1_d, density1, BUFSZ2D(0, 0));
-
-    pointer_storage.freeDevStorage(xarea_d);
-    pointer_storage.freeDevStorage(yarea_d);
-    pointer_storage.freeDevStorage(volume_d);
-    pointer_storage.freeDevStorage(pressure_d);
-    pointer_storage.freeDevStorage(viscosity_d);
-    pointer_storage.freeDevStorage(xvel0_d);
-    pointer_storage.freeDevStorage(yvel0_d);
-    pointer_storage.freeDevStorage(xvel1_d);
-    pointer_storage.freeDevStorage(yvel1_d);
-    pointer_storage.freeDevStorage(energy0_d);
-    pointer_storage.freeDevStorage(density0_d);
-
-    /*
-    int err_cond = thrust::reduce(reduce_ptr_1,
-        reduce_ptr_1 + num_blocks,
-        0, thrust::maximum<int>());
-    // */
-    int err_cond = *thrust::max_element(reduce_ptr_1, reduce_ptr_1 + num_blocks);
-    thrust::device_free(reduce_ptr_1);
-
-    if(err_cond == 1)
-    {
-        std::cerr << "Negative volume in PdV kernel" << std::endl;
-    }
-    else if(err_cond == 2)
-    {
-        std::cerr << "Negative cell volume in PdV kernel" << std::endl;
-    }
-
+    */
 }
 
 extern "C" void pdv_kernel_cuda_
@@ -311,31 +212,16 @@ double *viscosity,
 double *xvel0,
 double *xvel1,
 double *yvel0,
-double *yvel1)
+double *yvel1,
+double *unused_array)
 {
-#ifdef TIME_KERNELS
-_CUDA_BEGIN_PROFILE_name(host);
-#endif
-    #ifndef CUDA_RESIDENT
-    PdV_cuda(*errorcondition, *prdct,  *xmin, *xmax, *ymin, *ymax,*dtbyt,
-        xarea, yarea, volume, density0, density1, energy0, energy1,
-        pressure, viscosity, xvel0, yvel0, xvel1, yvel1);
-    #else
     chunk.PdV_kernel(errorcondition, *prdct, *dtbyt);
-    #endif
-#ifdef TIME_KERNELS
-_CUDA_END_PROFILE_name(host);
-#endif
 }
 
-
 void CloverleafCudaChunk::PdV_kernel
 (int* error_condition, int predict, double dt)
 {
-
-#ifdef TIME_KERNELS
-_CUDA_BEGIN_PROFILE_name(device);
-#endif
+    _CUDA_BEGIN_PROFILE_name(device);
 
     if(predict)
     {
@@ -356,9 +242,7 @@ _CUDA_BEGIN_PROFILE_name(device);
         errChk(__LINE__, __FILE__);
     }
 
-#ifdef TIME_KERNELS
-_CUDA_END_PROFILE_name(device);
-#endif
+    _CUDA_END_PROFILE_name(device);
 
     int err_cond = *thrust::max_element(reduce_pdv,
         reduce_pdv + num_blocks);
diff --git a/clover.in b/clover.in
index f285740..1c8f821 100644
--- a/clover.in
+++ b/clover.in
@@ -3,8 +3,8 @@
  state 1 density=0.2 energy=1.0
  state 2 density=1.0 energy=2.5 geometry=rectangle xmin=0.0 xmax=5.0 ymin=0.0 ymax=2.0
 
- x_cells=960
- y_cells=960
+ x_cells=3840
+ y_cells=3840
 
  xmin=0.0
  ymin=0.0
@@ -14,9 +14,8 @@
  initial_timestep=0.04
  timestep_rise=1.5
  max_timestep=0.04
- end_time=0.5
+ end_time=95.5
  end_step=87
-
  use_cuda_kernels
 
 *endclover
diff --git a/cuda_common.cu b/cuda_common.cu
index 01a68fd..10e528f 100644
--- a/cuda_common.cu
+++ b/cuda_common.cu
@@ -3,7 +3,7 @@
 #define __CUDA_COMMON_INC
 
 // size of workgroup/block
-#define BLOCK_SZ 512
+#define BLOCK_SZ 256
 
 // number of bytes to allocate for x size array
 #define BUFSZX(x_extra)   \
@@ -21,7 +21,9 @@
     * ((y_max) + 4 + y_extra)       \
     * sizeof(double) )
 
-// access a volue in a 2d array given the x and y offset from current thread index, adding or subtracting a bit more if it is one of the arrays with bigger rows
+// access a volue in a 2d array given the x and y offset from current thread
+// index, adding or subtracting a bit more if it is one of the arrays with
+// bigger rows
 #define THARR2D(x_offset, y_offset, big_row)\
     ( glob_id                               \
     + (x_offset)                            \
@@ -35,26 +37,6 @@
     const int row = glob_id / (x_max + 4);  \
     const int column = glob_id % (x_max + 4);
 
-// some arrays are larger in the x direction 
-#define __large_kernel_indexes \
-    const int glob_id = threadIdx.x         \
-        + blockIdx.x * blockDim.x;          \
-    const int row = glob_id / (x_max + 4 + x_extra);  \
-    const int column = glob_id % (x_max + 4 + x_extra);
-
-// beginning of profiling bit
-#define _CUDA_BEGIN_PROFILE_name(x) \
-    double x##t_0, x##t_1;          \
-    x##t_0 = omp_get_wtime();
-
-// end of profiling bit
-#define _CUDA_END_PROFILE_name(x)                   \
-    cudaDeviceSynchronize();                        \
-    x##t_1 = omp_get_wtime();                       \
-    std::cout << "[PROFILING] " << x##t_1 - x##t_0  \
-    << " to calculate block \"" << #x <<            \
-    "\" in " << __FILE__  <<std::endl;
-
 // used in update_halo and for copying back to host for mpi transfers
 #define FIELD_density0      0
 #define FIELD_density1      1
@@ -71,6 +53,9 @@
 #define FIELD_vol_flux_y    12
 #define FIELD_mass_flux_x   13
 #define FIELD_mass_flux_y   14
+#define NUM_FIELDS          15
+
+enum {CELL_DATA, VERTEX_DATA, X_FACE_DATA, Y_FACE_DATA};
 
 /*******************/
 
@@ -78,6 +63,7 @@
 #include <vector>
 #include <algorithm>
 #include "omp.h"
+#include "ftocmacros.h"
 
 // for reduction in calc_dt and PdV
 #include "thrust/copy.h"
@@ -89,7 +75,7 @@
 
 /*******************/
 
-// lots of time is spent error checking - define this to stop checking for errors. I don't have to tell you that this can be a silly idea
+// lots of time is spent error checking - define this to stop checking for errors.
 #ifndef NO_ERR_CHK
 
 #ifdef _GNUC_
@@ -106,99 +92,135 @@ inline void errChk
         std::cout << "error on line " << line_num << " of ";
         std::cout << file << std::endl;
         std::cout << "return code " << l_e; 
-        switch(l_e)
-        {
-            case 8: std::cout << " (invalid device function - recompile correctly for architecture)"; break;
-            case 11: std::cout << " (invalid value - some number was passed wrong)"; break;
-            default: std::cout << " ()"; break;
-        }
         std::cout << std::endl;
-        exit(-1);
+        exit(l_e);
     }
 }
 
 #else
 
-#define errChk(l, f) ;//nop
+// do nothing instead
+#define errChk(l, f) ;
 
 #endif //NO_ERR_CHK
 
-/*******************/
+// whether to time kernel run times
+#ifdef TIME_KERNELS
 
-class CudaDevPtrStorage
-{
-private:
-    //work arrays used for storing data used by any kernel
-    std::vector< double* > work_arrays;
+// beginning of profiling bit
+#define _CUDA_BEGIN_PROFILE_name(x) \
+    double x##t_0, x##t_1;          \
+    x##t_0 = omp_get_wtime();
 
-    int x_max, y_max;
-public:
+// end of profiling bit
+#define _CUDA_END_PROFILE_name(x)                   \
+    cudaDeviceSynchronize();                        \
+    x##t_1 = omp_get_wtime();                       \
+    std::cout << "[PROFILING] " << x##t_1 - x##t_0  \
+    << " to calculate block \"" << #x <<            \
+    "\" in " << __FILE__  <<std::endl;
 
-    CudaDevPtrStorage
-    ()
-    :x_max(0), y_max(0)
-    {
-    }
+#else
 
-    //*
-    void setSize
-    (int new_x_max, int new_y_max)
+#define _CUDA_BEGIN_PROFILE_name(x) ;
+#define _CUDA_END_PROFILE_name(x) ;
+
+#endif // TIME_KERNELS
+
+typedef struct cell_info {
+    const int x_e;
+    const int y_e;
+    const int x_i;
+    const int y_i;
+    const int x_f;
+    const int y_f;
+    const int grid_type;
+
+    cell_info
+    (int x_extra, int y_extra,
+    int x_invert, int y_invert,
+    int x_face, int y_face,
+    int in_type)
+    :x_e(x_extra), y_e(y_extra),
+    x_i(x_invert), y_i(y_invert),
+    x_f(x_face), y_f(y_face),
+    grid_type(in_type)
     {
-        x_max = new_x_max;
-        y_max = new_y_max;
+        ;
     }
-    // */
 
-    ~CudaDevPtrStorage
-    (void)
-    {
-        //std::for_each(work_arrays.begin(), work_arrays.end(), cudaFree);
-    }
+} cell_info_t;
+
+// types of array data
+const static cell_info_t CELL(    0, 0,  1,  1, 0, 0, CELL_DATA);
+const static cell_info_t VERTEX_X(1, 1, -1,  1, 0, 0, VERTEX_DATA);
+const static cell_info_t VERTEX_Y(1, 1,  1, -1, 0, 0, VERTEX_DATA);
+const static cell_info_t X_FACE(  1, 0, -1,  1, 1, 0, X_FACE_DATA);
+const static cell_info_t Y_FACE(  0, 1,  1, -1, 0, 1, Y_FACE_DATA);
+
+__device__ inline static int sum_func (int x, int y) { return x + y; }
+__device__ inline static int min_func (int x, int y) { return MIN(x, y); }
+__device__ inline static int max_func (int x, int y) { return MAX(x, y); }
 
-    // gets a pointer to some device storage, if it's already allocated then use it, or allocate some more
-    double* getDevStorage
-    (int line, std::string const& file)
+__device__ inline static double sum_func (double x, double y) { return x + y; }
+__device__ inline static double min_func (double x, double y) { return MIN(x, y); }
+__device__ inline static double max_func (double x, double y) { return MAX(x, y); }
+
+template < int offset >
+class Reduce
+{
+public:
+    __device__ inline static void run
+    (double* array, double* out, double(*func)(double, double))
     {
-        double * new_storage;
-        if(work_arrays.size() < 1)
+        // only need to synch if not working within a warp
+        if (offset > 16)
         {
-            cudaMalloc((void**) &new_storage, ((x_max+5)*(y_max+5)*sizeof(double)));
-            errChk(line, file);
+            __syncthreads();
         }
-        else
+
+        // only continue if it's in the lower half
+        if (threadIdx.x < offset)
         {
-            new_storage = work_arrays.back();
-            work_arrays.pop_back();
+            array[threadIdx.x] = func(array[threadIdx.x], array[threadIdx.x + offset]);
+            Reduce< offset/2 >::run(array, out, func);
         }
-        return new_storage;
     }
 
-    // same as above, but also copies data from an existing host pointer to the device
-    double* getDevStorageAndCopy
-    (int line, std::string const& file, const double* existing, size_t size)
+    __device__ inline static void run
+    (int* array, int* out, int(*func)(int, int))
     {
-        double * new_storage;
-        new_storage = getDevStorage(line, file);
-        cudaMemcpy(new_storage, existing, size, cudaMemcpyHostToDevice);
-        errChk(line, file);
-        return new_storage;
+        // only need to synch if not working within a warp
+        if (offset > 16)
+        {
+            __syncthreads();
+        }
+
+        // only continue if it's in the lower half
+        if (threadIdx.x < offset)
+        {
+            array[threadIdx.x] = func(array[threadIdx.x], array[threadIdx.x + offset]);
+            Reduce< offset/2 >::run(array, out, func);
+        }
     }
+};
 
-    // frees up some device storage to be used again elsewhere
-    void freeDevStorage
-    (double* dev_ptr)
+template < >
+class Reduce < 0 >
+{
+public:
+    __device__ inline static void run
+    (double* array, double* out, double(*func)(double, double))
     {
-        work_arrays.push_back(dev_ptr);
+        out[blockIdx.x] = array[0];
     }
 
-    // same as above, but copies data back from the device as well
-    void freeDevStorageAndCopy
-    (double* dev_ptr, double* existing, size_t size)
+    __device__ inline static void run
+    (int* array, int* out, int(*func)(int, int))
     {
-        cudaMemcpy(existing, dev_ptr, size, cudaMemcpyDeviceToHost);
-        errChk(__LINE__, __FILE__);
-        freeDevStorage(dev_ptr);
+        out[blockIdx.x] = array[0];
     }
 };
 
 #endif
+
diff --git a/mpi_transfers_cuda.cu b/mpi_transfers_cuda.cu
index 6642e93..ffe426c 100644
--- a/mpi_transfers_cuda.cu
+++ b/mpi_transfers_cuda.cu
@@ -2,6 +2,7 @@
 
 #include "chunk_cuda.cu"
 #include "cuda_common.cu"
+#include <numeric>
 
 #include "pack_buffer_kernels.cu"
 
@@ -19,14 +20,11 @@ extern CloverleafCudaChunk chunk;
 extern "C" void cudapackbuffers_
 (const int* which_array,
 const int* which_side,
-const int* x_inc,
-const int* y_inc,
 double* buffer,
 const int* buffer_size,
 const int* depth)
 {
     chunk.packBuffer((*which_array) - 1, *which_side,
-        *x_inc, *y_inc,
         buffer, *buffer_size, *depth);
 }
 
@@ -34,69 +32,45 @@ const int* depth)
 extern "C" void cudaunpackbuffers_
 (const int* which_array,
 const int* which_side,
-const int* x_inc,
-const int* y_inc,
 double* buffer,
 const int* buffer_size,
 const int* depth)
 {
     chunk.unpackBuffer((*which_array) - 1, *which_side,
-        *x_inc, *y_inc,
         buffer, *buffer_size, *depth);
 }
 
 void CloverleafCudaChunk::packBuffer
 (const int which_array,
 const int which_side,
-const int x_inc,
-const int y_inc,
 double* buffer,
 const int buffer_size,
 const int depth)
 {
-    #define PACK_CUDA_BUFFERS(dev_ptr, x_extra, y_extra) \
+    #define CALL_PACK(dev_ptr, type, face, dir)\
+	{\
+        const int launch_sz = (ceil((dir##_max+4+type.dir##_e)/static_cast<float>(BLOCK_SZ))) * depth; \
+        device_pack##face##Buffer<<< launch_sz, BLOCK_SZ >>> \
+        (x_min, x_max, y_min, y_max, type, \
+        dev_ptr, dev_##face##_send_buffer, depth); \
+        errChk(__LINE__, __FILE__); \
+        cudaMemcpy(buffer, dev_##face##_send_buffer, buffer_size*sizeof(double), cudaMemcpyDeviceToHost); \
+        errChk(__LINE__, __FILE__); \
+        cudaDeviceSynchronize();\
+        break; \
+	}
+
+    #define PACK_CUDA_BUFFERS(dev_ptr, type) \
         switch(which_side) \
         { \
             case LEFT_FACE: \
-                device_packLeftBuffer<<< num_blocks, BLOCK_SZ >>> \
-                (x_min, x_max, y_min, y_max,\
-                x_extra, y_extra, \
-                x_inc, y_inc, \
-                dev_ptr, dev_left_buffer, buffer_size, depth); \
-                errChk(__LINE__, __FILE__); \
-                cudaMemcpy(buffer, dev_left_buffer, buffer_size*sizeof(double), cudaMemcpyDeviceToHost); \
-                errChk(__LINE__, __FILE__); \
-                break;\
+                CALL_PACK(dev_ptr, type, left, y);\
             case RIGHT_FACE:\
-                device_packRightBuffer<<< num_blocks, BLOCK_SZ >>> \
-                (x_min, x_max, y_min, y_max,\
-                x_extra, y_extra, \
-                x_inc, y_inc, \
-                dev_ptr, dev_right_buffer, buffer_size, depth); \
-                errChk(__LINE__, __FILE__); \
-                cudaMemcpy(buffer, dev_right_buffer, buffer_size*sizeof(double), cudaMemcpyDeviceToHost); \
-                errChk(__LINE__, __FILE__); \
-                break;\
-            case TOP_FACE:\
-                device_packTopBuffer<<< num_blocks, BLOCK_SZ >>> \
-                (x_min, x_max, y_min, y_max,\
-                x_extra, y_extra, \
-                x_inc, y_inc, \
-                dev_ptr, dev_top_buffer, buffer_size, depth); \
-                errChk(__LINE__, __FILE__); \
-                cudaMemcpy(buffer, dev_top_buffer, buffer_size*sizeof(double), cudaMemcpyDeviceToHost); \
-                errChk(__LINE__, __FILE__); \
-                break;\
+                CALL_PACK(dev_ptr, type, right, y);\
             case BOTTOM_FACE:\
-                device_packBottomBuffer<<< num_blocks, BLOCK_SZ >>> \
-                (x_min, x_max, y_min, y_max,\
-                x_extra, y_extra, \
-                x_inc, y_inc, \
-                dev_ptr, dev_bottom_buffer, buffer_size, depth); \
-                errChk(__LINE__, __FILE__); \
-                cudaMemcpy(buffer, dev_bottom_buffer, buffer_size*sizeof(double), cudaMemcpyDeviceToHost); \
-                errChk(__LINE__, __FILE__); \
-                break;\
+                CALL_PACK(dev_ptr, type, bottom, x);\
+            case TOP_FACE:\
+                CALL_PACK(dev_ptr, type, top, x);\
             default: \
                 std::cout << "Invalid side passed to buffer packing in task " << task << std::endl; \
                 exit(1); \
@@ -104,77 +78,57 @@ const int depth)
 
     switch(which_array)
     {
-        case FIELD_density0: PACK_CUDA_BUFFERS(density0, 0, 0); break;
-        case FIELD_density1: PACK_CUDA_BUFFERS(density1, 0, 0); break;
-        case FIELD_energy0: PACK_CUDA_BUFFERS(energy0, 0, 0); break;
-        case FIELD_energy1: PACK_CUDA_BUFFERS(energy1, 0, 0); break;
-        case FIELD_pressure: PACK_CUDA_BUFFERS(pressure, 0, 0); break;
-        case FIELD_viscosity: PACK_CUDA_BUFFERS(viscosity, 0, 0); break;
-        case FIELD_soundspeed: PACK_CUDA_BUFFERS(soundspeed, 0, 0); break;
-        case FIELD_xvel0: PACK_CUDA_BUFFERS(xvel0, 1, 1); break;
-        case FIELD_xvel1: PACK_CUDA_BUFFERS(xvel1, 1, 1); break;
-        case FIELD_yvel0: PACK_CUDA_BUFFERS(yvel0, 1, 1); break;
-        case FIELD_yvel1: PACK_CUDA_BUFFERS(yvel1, 1, 1); break;
-        case FIELD_vol_flux_x: PACK_CUDA_BUFFERS(vol_flux_x, 1, 0); break;
-        case FIELD_vol_flux_y: PACK_CUDA_BUFFERS(vol_flux_y, 0, 1); break;
-        case FIELD_mass_flux_x: PACK_CUDA_BUFFERS(mass_flux_x, 1, 0); break;
-        case FIELD_mass_flux_y: PACK_CUDA_BUFFERS(mass_flux_y, 0, 1); break;
+        case FIELD_density0: PACK_CUDA_BUFFERS(density0, CELL); break;
+        case FIELD_density1: PACK_CUDA_BUFFERS(density1, CELL); break;
+        case FIELD_energy0: PACK_CUDA_BUFFERS(energy0, CELL); break;
+        case FIELD_energy1: PACK_CUDA_BUFFERS(energy1, CELL); break;
+        case FIELD_pressure: PACK_CUDA_BUFFERS(pressure, CELL); break;
+        case FIELD_viscosity: PACK_CUDA_BUFFERS(viscosity, CELL); break;
+        case FIELD_soundspeed: PACK_CUDA_BUFFERS(soundspeed, CELL); break;
+        case FIELD_xvel0: PACK_CUDA_BUFFERS(xvel0, VERTEX_X); break;
+        case FIELD_xvel1: PACK_CUDA_BUFFERS(xvel1, VERTEX_X); break;
+        case FIELD_yvel0: PACK_CUDA_BUFFERS(yvel0, VERTEX_Y); break;
+        case FIELD_yvel1: PACK_CUDA_BUFFERS(yvel1, VERTEX_Y); break;
+        case FIELD_vol_flux_x: PACK_CUDA_BUFFERS(vol_flux_x, X_FACE); break;
+        case FIELD_vol_flux_y: PACK_CUDA_BUFFERS(vol_flux_y, Y_FACE); break;
+        case FIELD_mass_flux_x: PACK_CUDA_BUFFERS(mass_flux_x, X_FACE); break;
+        case FIELD_mass_flux_y: PACK_CUDA_BUFFERS(mass_flux_y, Y_FACE); break;
         default: std::cerr << "Invalid which_array identifier passed to CUDA for MPI transfer" << std::endl; exit(1);
     }
+
 }
 
 void CloverleafCudaChunk::unpackBuffer
 (const int which_array,
 const int which_side,
-const int x_inc,
-const int y_inc,
 double* buffer,
 const int buffer_size,
 const int depth)
 {
-    #define UNPACK_CUDA_BUFFERS(dev_ptr, x_extra, y_extra) \
+    #define CALL_UNPACK(dev_ptr, type, face, dir)\
+	{ \
+        cudaMemcpy(dev_##face##_recv_buffer, buffer, buffer_size*sizeof(double), cudaMemcpyHostToDevice); \
+        errChk(__LINE__, __FILE__); \
+        cudaDeviceSynchronize();\
+        const int launch_sz = (ceil((dir##_max+4+type.dir##_e)/static_cast<float>(BLOCK_SZ))) * depth; \
+        device_unpack##face##Buffer<<< launch_sz, BLOCK_SZ >>> \
+        (x_min, x_max, y_min, y_max, type, \
+        dev_ptr, dev_##face##_recv_buffer, depth); \
+        errChk(__LINE__, __FILE__); \
+        break; \
+	}
+
+    #define UNPACK_CUDA_BUFFERS(dev_ptr, type) \
         switch(which_side) \
         { \
             case LEFT_FACE: \
-                device_unpackLeftBuffer<<< num_blocks, BLOCK_SZ >>> \
-                (x_min, x_max, y_min, y_max,\
-                x_extra, y_extra, \
-                x_inc, y_inc, \
-                dev_ptr, dev_left_buffer, buffer_size, depth); \
-                errChk(__LINE__, __FILE__); \
-                cudaMemcpy(dev_left_buffer, buffer, buffer_size*sizeof(double), cudaMemcpyHostToDevice); \
-                errChk(__LINE__, __FILE__); \
-                break;\
+                CALL_UNPACK(dev_ptr, type, left, y);\
             case RIGHT_FACE:\
-                device_unpackRightBuffer<<< num_blocks, BLOCK_SZ >>> \
-                (x_min, x_max, y_min, y_max,\
-                x_extra, y_extra, \
-                x_inc, y_inc, \
-                dev_ptr, dev_right_buffer, buffer_size, depth); \
-                errChk(__LINE__, __FILE__); \
-                cudaMemcpy(dev_right_buffer, buffer, buffer_size*sizeof(double), cudaMemcpyHostToDevice); \
-                errChk(__LINE__, __FILE__); \
-                break;\
-            case TOP_FACE:\
-                device_unpackTopBuffer<<< num_blocks, BLOCK_SZ >>> \
-                (x_min, x_max, y_min, y_max,\
-                x_extra, y_extra, \
-                x_inc, y_inc, \
-                dev_ptr, dev_top_buffer, buffer_size, depth); \
-                errChk(__LINE__, __FILE__); \
-                cudaMemcpy(dev_top_buffer, buffer, buffer_size*sizeof(double), cudaMemcpyHostToDevice); \
-                errChk(__LINE__, __FILE__); \
-                break;\
+                CALL_UNPACK(dev_ptr, type, right, y);\
             case BOTTOM_FACE:\
-                device_unpackBottomBuffer<<< num_blocks, BLOCK_SZ >>> \
-                (x_min, x_max, y_min, y_max,\
-                x_extra, y_extra, \
-                x_inc, y_inc, \
-                dev_ptr, dev_bottom_buffer, buffer_size, depth); \
-                errChk(__LINE__, __FILE__); \
-                cudaMemcpy(dev_bottom_buffer, buffer, buffer_size*sizeof(double), cudaMemcpyHostToDevice); \
-                errChk(__LINE__, __FILE__); \
-                break;\
+                CALL_UNPACK(dev_ptr, type, bottom, x);\
+            case TOP_FACE:\
+                CALL_UNPACK(dev_ptr, type, top, x);\
             default: \
                 std::cout << "Invalid side passed to buffer unpacking in task " << task << std::endl; \
                 exit(1); \
@@ -182,91 +136,23 @@ const int depth)
 
     switch(which_array)
     {
-        case FIELD_density0: UNPACK_CUDA_BUFFERS(density0, 0, 0); break;
-        case FIELD_density1: UNPACK_CUDA_BUFFERS(density1, 0, 0); break;
-        case FIELD_energy0: UNPACK_CUDA_BUFFERS(energy0, 0, 0); break;
-        case FIELD_energy1: UNPACK_CUDA_BUFFERS(energy1, 0, 0); break;
-        case FIELD_pressure: UNPACK_CUDA_BUFFERS(pressure, 0, 0); break;
-        case FIELD_viscosity: UNPACK_CUDA_BUFFERS(viscosity, 0, 0); break;
-        case FIELD_soundspeed: UNPACK_CUDA_BUFFERS(soundspeed, 0, 0); break;
-        case FIELD_xvel0: UNPACK_CUDA_BUFFERS(xvel0, 1, 1); break;
-        case FIELD_xvel1: UNPACK_CUDA_BUFFERS(xvel1, 1, 1); break;
-        case FIELD_yvel0: UNPACK_CUDA_BUFFERS(yvel0, 1, 1); break;
-        case FIELD_yvel1: UNPACK_CUDA_BUFFERS(yvel1, 1, 1); break;
-        case FIELD_vol_flux_x: UNPACK_CUDA_BUFFERS(vol_flux_x, 1, 0); break;
-        case FIELD_vol_flux_y: UNPACK_CUDA_BUFFERS(vol_flux_y, 0, 1); break;
-        case FIELD_mass_flux_x: UNPACK_CUDA_BUFFERS(mass_flux_x, 1, 0); break;
-        case FIELD_mass_flux_y: UNPACK_CUDA_BUFFERS(mass_flux_y, 0, 1); break;
-        default: std::cerr << "Invalid which_array identifier passed to CUDA for MPI transfer" << std::endl; exit(1);
-    }
-}
-
-/****************************************************/
-
-// copy data out from device back to host
-extern "C" void clover_cuda_copy_out_
-(const int* which_array, double* copy_into)
-{
-    chunk.copyToHost((*which_array) - 1, copy_into);
-}
-
-// copy data back to device from host
-extern "C" void clover_cuda_copy_back_
-(const int* which_array, double* copy_from)
-{
-    chunk.copyToDevice((*which_array) - 1, copy_from);
-}
-void CloverleafCudaChunk::copyToHost(const int which_array, double* copy_into)
-{
-    #define COPY_OUT(dev_ptr, sz) \
-        std::cout << "copying to host: " << #dev_ptr << std::endl; \
-        cudaMemcpy(copy_into, dev_ptr, sz, cudaMemcpyDeviceToHost); \
-        errChk(__LINE__, __FILE__);
-    switch(which_array)
-    {
-        case FIELD_density0: COPY_OUT(density0, BUFSZ2D(0, 0)); break;
-        case FIELD_density1: COPY_OUT(density1, BUFSZ2D(0, 0)); break;
-        case FIELD_energy0: COPY_OUT(energy0, BUFSZ2D(0, 0)); break;
-        case FIELD_energy1: COPY_OUT(energy1, BUFSZ2D(0, 0)); break;
-        case FIELD_pressure: COPY_OUT(pressure, BUFSZ2D(0, 0)); break;
-        case FIELD_viscosity: COPY_OUT(viscosity, BUFSZ2D(0, 0)); break;
-        case FIELD_soundspeed: COPY_OUT(soundspeed, BUFSZ2D(0, 0)); break;
-        case FIELD_xvel0: COPY_OUT(xvel0, BUFSZ2D(1, 1)); break;
-        case FIELD_xvel1: COPY_OUT(xvel1, BUFSZ2D(1, 1)); break;
-        case FIELD_yvel0: COPY_OUT(yvel0, BUFSZ2D(1, 1)); break;
-        case FIELD_yvel1: COPY_OUT(yvel1, BUFSZ2D(1, 1)); break;
-        case FIELD_vol_flux_x: COPY_OUT(vol_flux_x, BUFSZ2D(1, 0)); break;
-        case FIELD_vol_flux_y: COPY_OUT(vol_flux_y, BUFSZ2D(0, 1)); break;
-        case FIELD_mass_flux_x: COPY_OUT(mass_flux_x, BUFSZ2D(1, 0)); break;
-        case FIELD_mass_flux_y: COPY_OUT(mass_flux_y, BUFSZ2D(0, 1)); break;
+        case FIELD_density0: UNPACK_CUDA_BUFFERS(density0, CELL); break;
+        case FIELD_density1: UNPACK_CUDA_BUFFERS(density1, CELL); break;
+        case FIELD_energy0: UNPACK_CUDA_BUFFERS(energy0, CELL); break;
+        case FIELD_energy1: UNPACK_CUDA_BUFFERS(energy1, CELL); break;
+        case FIELD_pressure: UNPACK_CUDA_BUFFERS(pressure, CELL); break;
+        case FIELD_viscosity: UNPACK_CUDA_BUFFERS(viscosity, CELL); break;
+        case FIELD_soundspeed: UNPACK_CUDA_BUFFERS(soundspeed, CELL); break;
+        case FIELD_xvel0: UNPACK_CUDA_BUFFERS(xvel0, VERTEX_X); break;
+        case FIELD_xvel1: UNPACK_CUDA_BUFFERS(xvel1, VERTEX_X); break;
+        case FIELD_yvel0: UNPACK_CUDA_BUFFERS(yvel0, VERTEX_Y); break;
+        case FIELD_yvel1: UNPACK_CUDA_BUFFERS(yvel1, VERTEX_Y); break;
+        case FIELD_vol_flux_x: UNPACK_CUDA_BUFFERS(vol_flux_x, X_FACE); break;
+        case FIELD_vol_flux_y: UNPACK_CUDA_BUFFERS(vol_flux_y, Y_FACE); break;
+        case FIELD_mass_flux_x: UNPACK_CUDA_BUFFERS(mass_flux_x, X_FACE); break;
+        case FIELD_mass_flux_y: UNPACK_CUDA_BUFFERS(mass_flux_y, Y_FACE); break;
         default: std::cerr << "Invalid which_array identifier passed to CUDA for MPI transfer" << std::endl; exit(1);
     }
-}
 
-void CloverleafCudaChunk::copyToDevice(const int which_array, double* copy_from)
-{
-    #define COPY_BACK(dev_ptr, sz) \
-        std::cout << "copying to device: " << #dev_ptr << std::endl; \
-        cudaMemcpy(dev_ptr, copy_from, sz, cudaMemcpyHostToDevice); \
-        errChk(__LINE__, __FILE__);
-    switch(which_array)
-    {
-        case FIELD_density0: COPY_BACK(density0, BUFSZ2D(0, 0)); break;
-        case FIELD_density1: COPY_BACK(density1, BUFSZ2D(0, 0)); break;
-        case FIELD_energy0: COPY_BACK(energy0, BUFSZ2D(0, 0)); break;
-        case FIELD_energy1: COPY_BACK(energy1, BUFSZ2D(0, 0)); break;
-        case FIELD_pressure: COPY_BACK(pressure, BUFSZ2D(0, 0)); break;
-        case FIELD_viscosity: COPY_BACK(viscosity, BUFSZ2D(0, 0)); break;
-        case FIELD_soundspeed: COPY_BACK(soundspeed, BUFSZ2D(0, 0)); break;
-        case FIELD_xvel0: COPY_BACK(xvel0, BUFSZ2D(1, 1)); break;
-        case FIELD_xvel1: COPY_BACK(xvel1, BUFSZ2D(1, 1)); break;
-        case FIELD_yvel0: COPY_BACK(yvel0, BUFSZ2D(1, 1)); break;
-        case FIELD_yvel1: COPY_BACK(yvel1, BUFSZ2D(1, 1)); break;
-        case FIELD_vol_flux_x: COPY_BACK(vol_flux_x, BUFSZ2D(1, 0)); break;
-        case FIELD_vol_flux_y: COPY_BACK(vol_flux_y, BUFSZ2D(0, 1)); break;
-        case FIELD_mass_flux_x: COPY_BACK(mass_flux_x, BUFSZ2D(1, 0)); break;
-        case FIELD_mass_flux_y: COPY_BACK(mass_flux_y, BUFSZ2D(0, 1)); break;
-        default: std::cerr << "Invalid which_array identifier passed to CUDA for MPI transfer" << std::endl; exit(1);
-    }
 }
 
diff --git a/pack_buffer_kernels.cu b/pack_buffer_kernels.cu
index e57c437..df093f5 100644
--- a/pack_buffer_kernels.cu
+++ b/pack_buffer_kernels.cu
@@ -2,8 +2,11 @@
 
 /********************/
 
+// j is column
+// k is row
+
 // could put this check in, prob doesnt need it
-// if(row > 1 - depth && row < y_max + 2 + depth + y_inc)
+// if (row > 1 - depth && row < y_max + 2 + depth + y_inc)
 
 // left/right buffer
 // index=j+(k+depth-1)*depth
@@ -31,197 +34,202 @@
 
 /********************/
 
-// j is column
-// k is row
-
-// pack buffers
-
-// >= is more like fortran do loops, should help with conversion
-#define BETWEEN_COLUMNS(upper, lower) (column >= (lower) && column <= (upper))
-#define BETWEEN_ROWS(upper, lower) (row >= (lower) && row <= (upper))
+// for top/bottom
+#define HORZ_IDX(add) (column + depth + ((add + 1) - 1) * (x_max + x_inc + (2 * depth)))
+// for left/right
+#define VERT_IDX(add) ((add + 1) + (row + depth - 1) * depth)
 
-__global__ void device_packLeftBuffer
+__global__ void device_packleftBuffer
 (int x_min,int x_max,int y_min,int y_max,
-int x_extra, int y_extra,
-int x_inc, int y_inc,
-double* array,
-double* left_buffer,
-int buffer_size,
-int depth)
+cell_info_t type,
+const double* array,
+      double* left_buffer,
+const int depth)
 {
-    __large_kernel_indexes;
+    int x_extra = type.x_e;
+    int y_extra = type.y_e;
+    int x_inc = (type.grid_type == VERTEX_DATA || type.grid_type == X_FACE_DATA) ? 1 : 0;
+    int y_inc = (type.grid_type == VERTEX_DATA || type.grid_type == Y_FACE_DATA) ? 1 : 0;
 
-    // x_min + x_inc - 1 + j
-    // x_min = 1 in fortran -> 2 in c
-    // j = 1 or 2
-    if(column == 2 + x_inc - 1 + 1)
-    {
-        left_buffer[row] = array[THARR2D(0, 0, x_extra)];
-    }
-    else if(depth > 1 && column == 2 + x_inc - 1 + 2)
+    //__kernel_indexes;
+    const int glob_id = threadIdx.x + blockIdx.x * blockDim.x;
+    const int row = glob_id / depth;
+    const int column = glob_id % depth;
+
+    if (row >= (y_min + 1) - depth && row <= (y_max + 1) + y_inc + depth)
     {
-        left_buffer[row + y_max + 4 + y_extra] = array[THARR2D(0, 0, x_extra)];
+        const int row_begin = row * (x_max + 4 + x_extra);
+
+        left_buffer[VERT_IDX(column)] = array[row_begin + (x_min + 1) + x_inc - 1 + 1 + column];
     }
 }
 
-__global__ void device_unpackLeftBuffer
+__global__ void device_unpackleftBuffer
 (int x_min,int x_max,int y_min,int y_max,
-int x_extra, int y_extra,
-int x_inc, int y_inc,
-double* array,
-double* left_buffer,
-int buffer_size,
-int depth)
+cell_info_t type,
+      double* array,
+const double* left_buffer,
+const int depth)
 {
-    __large_kernel_indexes;
+    int x_extra = type.x_e;
+    int y_extra = type.y_e;
+    int x_inc = (type.grid_type == VERTEX_DATA || type.grid_type == X_FACE_DATA) ? 1 : 0;
+    int y_inc = (type.grid_type == VERTEX_DATA || type.grid_type == Y_FACE_DATA) ? 1 : 0;
 
-    // x_min - j
-    if(column == 2 - 1)
-    {
-        array[THARR2D(0, 0, x_extra)] = left_buffer[row];
-    }
-    else if(depth > 1 && column == 2 - 2)
+    //__kernel_indexes;
+    const int glob_id = threadIdx.x + blockIdx.x * blockDim.x;
+    const int row = glob_id / depth;
+    const int column = glob_id % depth;
+
+    if (row >= (y_min + 1) - depth && row <= (y_max + 1) + y_inc + depth)
     {
-        array[THARR2D(0, 0, x_extra)] = left_buffer[row + y_max + 4 + y_extra];
+        const int row_begin = row * (x_max + 4 + x_extra);
+
+        array[row_begin + (x_min + 1) - (1 + column)] = left_buffer[VERT_IDX(column)];
     }
 }
 
 /************************************************************/
 
-__global__ void device_packRightBuffer
+__global__ void device_packrightBuffer
 (int x_min,int x_max,int y_min,int y_max,
-int x_extra, int y_extra,
-int x_inc, int y_inc,
-double* array,
-double* right_buffer,
-int buffer_size,
-int depth)
+cell_info_t type,
+const double* array,
+      double* right_buffer,
+const int depth)
 {
-    __large_kernel_indexes;
+    int x_extra = type.x_e;
+    int y_extra = type.y_e;
+    int x_inc = (type.grid_type == VERTEX_DATA || type.grid_type == X_FACE_DATA) ? 1 : 0;
+    int y_inc = (type.grid_type == VERTEX_DATA || type.grid_type == Y_FACE_DATA) ? 1 : 0;
 
-    // x_max + 1 - j
-    if(column == x_max + 1 + 1 - 1)
-    {
-        right_buffer[row] = array[THARR2D(0, 0, x_extra)];
-    }
-    else if(depth > 1 && column == x_max + 1 + 1 - 2)
+    //__kernel_indexes;
+    const int glob_id = threadIdx.x + blockIdx.x * blockDim.x;
+    const int row = glob_id / depth;
+    const int column = glob_id % depth;
+
+    if (row >= (y_min + 1) - depth && row <= (y_max + 1) + y_inc + depth)
     {
-        //array[THARR2D(0, 0, x_extra)] = right_buffer[row + y_max + 4];
-        right_buffer[row + y_max + 4 + y_extra] = array[THARR2D(0, 0, x_extra)];
+        const int row_begin = row * (x_max + 4 + x_extra);
+
+        right_buffer[VERT_IDX(column)] = array[row_begin + (x_max + 1) + 1 - (1 + column)];
     }
 }
 
-__global__ void device_unpackRightBuffer
+__global__ void device_unpackrightBuffer
 (int x_min,int x_max,int y_min,int y_max,
-int x_extra, int y_extra,
-int x_inc, int y_inc,
-double* array,
-double* right_buffer,
-int buffer_size,
-int depth)
+cell_info_t type,
+      double* array,
+const double* right_buffer,
+const int depth)
 {
-    __large_kernel_indexes;
+    int x_extra = type.x_e;
+    int y_extra = type.y_e;
+    int x_inc = (type.grid_type == VERTEX_DATA || type.grid_type == X_FACE_DATA) ? 1 : 0;
+    int y_inc = (type.grid_type == VERTEX_DATA || type.grid_type == Y_FACE_DATA) ? 1 : 0;
 
-    // x_max + x_inc + j
-    if(column == x_max + 1 + x_inc + 1)
-    {
-        array[THARR2D(0, 0, x_extra)] = right_buffer[row];
-    }
-    else if(depth > 1 && column == x_max + 1 + x_inc + 1)
+    //__kernel_indexes;
+    const int glob_id = threadIdx.x + blockIdx.x * blockDim.x;
+    const int row = glob_id / depth;
+    const int column = glob_id % depth;
+
+    if (row >= (y_min + 1) - depth && row <= (y_max + 1) + y_inc + depth)
     {
-        array[THARR2D(0, 0, x_extra)] = right_buffer[row + y_max + 4 + y_extra];
+        const int row_begin = row * (x_max + 4 + x_extra);
+
+        array[row_begin + (x_max + 1) + x_inc + 1 + column] = right_buffer[VERT_IDX(column)];
     }
 }
 
 /************************************************************/
 
-__global__ void device_packBottomBuffer
+__global__ void device_packbottomBuffer
 (int x_min,int x_max,int y_min,int y_max,
-int x_extra, int y_extra,
-int x_inc, int y_inc,
+cell_info_t type,
 double* array,
 double* bottom_buffer,
-int buffer_size,
-int depth)
+const int depth)
 {
-    __large_kernel_indexes;
+    int x_extra = type.x_e;
+    int y_extra = type.y_e;
+    int x_inc = (type.grid_type == VERTEX_DATA || type.grid_type == X_FACE_DATA) ? 1 : 0;
+    int y_inc = (type.grid_type == VERTEX_DATA || type.grid_type == Y_FACE_DATA) ? 1 : 0;
+    __kernel_indexes;
 
-    // y_min + y_inc - 1 + k
-    if(row == 2 + y_inc - 1 + 1)
-    {
-        bottom_buffer[row] = array[THARR2D(0, 0, x_extra)];
-    }
-    else if(depth > 1 && row == 2 + y_inc - 1 + 2)
+    if (column >= (x_min + 1) - depth && column <= (x_max + 1) + y_inc + depth)
     {
-        bottom_buffer[row + x_max + 4 + x_extra] = array[THARR2D(0, 0, x_extra)];
+        if (row < depth)
+        {
+            bottom_buffer[HORZ_IDX(row)] = array[THARR2D(0, (y_min + 1) + y_inc - 1 + 1, x_extra)];
+        }
     }
 }
 
-__global__ void device_unpackBottomBuffer
+__global__ void device_unpackbottomBuffer
 (int x_min,int x_max,int y_min,int y_max,
-int x_extra, int y_extra,
-int x_inc, int y_inc,
+cell_info_t type,
 double* array,
 double* bottom_buffer,
-int buffer_size,
-int depth)
+const int depth)
 {
-    __large_kernel_indexes;
+    int x_extra = type.x_e;
+    int y_extra = type.y_e;
+    int x_inc = (type.grid_type == VERTEX_DATA || type.grid_type == X_FACE_DATA) ? 1 : 0;
+    int y_inc = (type.grid_type == VERTEX_DATA || type.grid_type == Y_FACE_DATA) ? 1 : 0;
+    __kernel_indexes;
 
-    // y_min - k
-    if(row == 2 - 1)
-    {
-        array[THARR2D(0, 0, x_extra)] = bottom_buffer[row];
-    }
-    else if(depth > 1 && row == 2 - 2)
+    if (column >= (x_min + 1) - depth && column <= (x_max + 1) + y_inc + depth)
     {
-        array[THARR2D(0, 0, x_extra)] = bottom_buffer[row + x_max + 4 + x_extra];
+        if (row < depth)
+        {
+            array[THARR2D(0, (y_min + 1) - (1 + 2*row), x_extra)] = bottom_buffer[HORZ_IDX(row)];
+        }
     }
 }
 
 /************************************************************/
 
-__global__ void device_packTopBuffer
+__global__ void device_packtopBuffer
 (int x_min,int x_max,int y_min,int y_max,
-int x_extra, int y_extra,
-int x_inc, int y_inc,
+cell_info_t type,
 double* array,
 double* top_buffer,
-int buffer_size,
-int depth)
+const int depth)
 {
-    __large_kernel_indexes;
+    int x_extra = type.x_e;
+    int y_extra = type.y_e;
+    int x_inc = (type.grid_type == VERTEX_DATA || type.grid_type == X_FACE_DATA) ? 1 : 0;
+    int y_inc = (type.grid_type == VERTEX_DATA || type.grid_type == Y_FACE_DATA) ? 1 : 0;
+    __kernel_indexes;
 
-    // y_min + y_inc - 1 + k
-    if(row == 2 + y_inc - 1 + 1)
-    {
-        top_buffer[row] = array[THARR2D(0, 0, x_extra)];
-    }
-    else if(depth > 1 && row == 2 + y_inc - 1 + 2)
+    if (column >= (x_min + 1) - depth && column <= (x_max + 1) + y_inc + depth)
     {
-        top_buffer[row + x_max + 4 + x_extra] = array[THARR2D(0, 0, x_extra)];
+        if (row < depth)
+        {
+            top_buffer[HORZ_IDX(row)] = array[THARR2D(0, (y_max + 1) + 1 - (1 + 2*row), x_extra)];
+        }
     }
 }
 
-__global__ void device_unpackTopBuffer
+__global__ void device_unpacktopBuffer
 (int x_min,int x_max,int y_min,int y_max,
-int x_extra, int y_extra,
-int x_inc, int y_inc,
+cell_info_t type,
 double* array,
 double* top_buffer,
-int buffer_size,
-int depth)
+const int depth)
 {
-    __large_kernel_indexes;
-
-    // y_max + y_inc + k
-    if(row == y_max + 1 + y_inc + 1)
-    {
-        array[THARR2D(0, 0, x_extra)] = top_buffer[row];
-    }
-    else if(depth > 1 && row == y_max + 1 + y_inc + 2)
-    {
-        array[THARR2D(0, 0, x_extra)] = top_buffer[row + x_max + 4 + x_extra];
+    int x_extra = type.x_e;
+    int y_extra = type.y_e;
+    int x_inc = (type.grid_type == VERTEX_DATA || type.grid_type == X_FACE_DATA) ? 1 : 0;
+    int y_inc = (type.grid_type == VERTEX_DATA || type.grid_type == Y_FACE_DATA) ? 1 : 0;
+    __kernel_indexes;
+
+    if (column >= (x_min + 1) - depth && column <= (x_max + 1) + y_inc + depth)
+    {
+        if (row < depth)
+        {
+            array[THARR2D(0, (y_max + 1) + y_inc + 1, x_extra)] = top_buffer[HORZ_IDX(row)];
+        }
     }
 }
 
diff --git a/update_halo_kernel_cuda.cu b/update_halo_kernel_cuda.cu
index 1b0313f..985af9c 100755
--- a/update_halo_kernel_cuda.cu
+++ b/update_halo_kernel_cuda.cu
@@ -1,10 +1,8 @@
-// TODO some copies are negative, or copy form slightly different locations - look through and fix
-// change to use depth
-
 #include "ftocmacros.h"
 #include "cuda_common.cu"
 
 #include "chunk_cuda.cu"
+#include <cstdio>
 
 #define CHUNK_left          0
 #define CHUNK_right         1
@@ -13,245 +11,145 @@
 
 #define EXTERNAL_FACE       (-1)
 
-#define NUM_FIELDS          15
-
-extern CudaDevPtrStorage pointer_storage;
-
 extern CloverleafCudaChunk chunk;
 
-/*
-*   copies from the outer bit from [x_min, x_max] to the outer cells
-*/
-
 __global__ void device_update_halo_kernel_bottom_cuda
 (int x_min,int x_max,int y_min,int y_max,
-int x_extra, int y_extra,
-int x_invert, int y_invert,
+cell_info_t type,
 double* cur_array,
 int depth)
 {
-    __large_kernel_indexes;
-    // if its on the third row
-    /*
-    if(row == 2
-    && column > 0 && column < x_max+3+x_extra)
+    int x_extra = type.x_e;
+    int y_extra = type.y_e;
+    int y_invert = type.y_i;
+    __kernel_indexes;
+
+    // offset by 1 if it is anything but a CELL grid
+    int b_offset = (type.grid_type != CELL_DATA) ? 1 : 0;
+
+    if(column >= 2 - depth && column <= (x_max + 1) + x_extra + depth)
     {
-        for(int ii = 0; ii < depth; ii++)
+        if (row < depth)
         {
-            //FIXME 1+k for all the ones that have x extra or y extra
-            cur_array[THARR2D(0, -(1 + ii), x_extra)] = (y_invert?-1:1) * cur_array[THARR2D(0, 0, x_extra)];
+            const int offset = 2 + b_offset;
+
+            /*
+             * 1 - 2 * row means that row 0 services row 1, and vice versa
+             * this means that it can be dispatched with 'depth' rows only
+             */
+            cur_array[THARR2D(0, 1 - (2 * row), x_extra)] =
+                y_invert * cur_array[THARR2D(0, offset, x_extra)];
         }
     }
-    */
-
-    if(row == y_max)
-    {
-        cur_array[THARR2D(0, 0, x_extra)] = (y_invert?-1:1) * cur_array[THARR2D(0, -1, x_extra)];
-    }
-    else if(row == y_max + 1)
-    {
-        cur_array[THARR2D(0, 0, x_extra)] = (y_invert?-1:1) * cur_array[THARR2D(0, -3, x_extra)];
-    }
-    else if(row == y_max + 2)
-    {
-        cur_array[THARR2D(0, 0, x_extra)] = (y_invert?-1:1) * cur_array[THARR2D(0, -5, x_extra)];
-    }
 }
 
 __global__ void device_update_halo_kernel_top_cuda
 (int x_min,int x_max,int y_min,int y_max,
-int x_extra, int y_extra,
-int x_invert, int y_invert,
+cell_info_t type,
 double* cur_array,
 int depth)
 {
-    __large_kernel_indexes;
-    /*
-    // if on the third row from the bottom
-    if(row == y_max+1+y_extra
-    && column > 0 && column < x_max+3+x_extra)
+    int x_extra = type.x_e;
+    int y_extra = type.y_e;
+    int y_invert = type.y_i;
+    int x_face = type.x_f;
+    __kernel_indexes;
+
+    // if x face data, offset source/dest by - 1
+    int x_f_offset = (x_face) ? 1 : 0;
+
+    if(column >= 2 - depth && column <= (x_max + 1) + x_extra + depth)
     {
-        for(int ii = 0; ii < depth; ii++)
+        if (row < depth)
         {
-            cur_array[THARR2D(0, 1 + ii, x_extra)] = (y_invert?-1:1) * cur_array[THARR2D(0, 0, x_extra)];
-        }
-    }
-    */
+            const int offset = (- row) * 2 - 1 - x_f_offset;
 
-    if(row == 3)
-    {
-        cur_array[THARR2D(0, 0, x_extra)] = (y_invert?-1:1) * cur_array[THARR2D(0, 1, x_extra)];
-    }
-    else if(row == 2)
-    {
-        cur_array[THARR2D(0, 0, x_extra)] = (y_invert?-1:1) * cur_array[THARR2D(0, 3, x_extra)];
-    }
-    else if(row == 1)
-    {
-        cur_array[THARR2D(0, 0, x_extra)] = (y_invert?-1:1) * cur_array[THARR2D(0, 5, x_extra)];
+            cur_array[THARR2D(0, y_extra + y_max + 2, x_extra)] =
+                y_invert * cur_array[THARR2D(0, y_max + 2 + offset, x_extra)];
+        }
     }
 }
 
 __global__ void device_update_halo_kernel_left_cuda
 (int x_min,int x_max,int y_min,int y_max,
-int x_extra, int y_extra,
-int x_invert, int y_invert,
+cell_info_t type,
 double* cur_array,
 int depth)
 {
-    __large_kernel_indexes;
-    /*
-    // if on the third column from the left
-    if(row > 0 && row < y_max+3+y_extra
-    && column == 2)
-    {
-        for(int ii = 0; ii < depth; ii++)
-        {
-            cur_array[THARR2D(-(1 + ii), 0, x_extra)] = (x_invert?-1:1) * cur_array[THARR2D(0, 0, x_extra)];
-        }
-    }
-    */
+    int x_extra = type.x_e;
+    int y_extra = type.y_e;
+    int x_invert = type.x_i;
 
-    if(column == 3)
-    {
-        cur_array[THARR2D(0, 0, x_extra)] = (x_invert?-1:1) * cur_array[THARR2D(1, 0, x_extra)];
-    }
-    else if(column == 2)
-    {
-        cur_array[THARR2D(0, 0, x_extra)] = (x_invert?-1:1) * cur_array[THARR2D(3, 0, x_extra)];
-    }
-    else if(column == 1)
+    // offset by 1 if it is anything but a CELL grid
+    int l_offset = (type.grid_type != CELL_DATA) ? 1 : 0;
+
+    // special indexes for specific depth
+    const int glob_id = threadIdx.x + blockIdx.x * blockDim.x;
+    const int row = glob_id / depth;
+    const int column = glob_id % depth;
+
+    if(row >= 2 - depth && row <= (y_max + 1) + y_extra + depth)
     {
-        cur_array[THARR2D(0, 0, x_extra)] = (x_invert?-1:1) * cur_array[THARR2D(5, 0, x_extra)];
+        // first in row
+        const int offset = row * (x_max + 4 + x_extra);
+
+        cur_array[offset + (1 - column)] = x_invert * cur_array[offset + 2 + column + l_offset];
     }
 }
 
 __global__ void device_update_halo_kernel_right_cuda
 (int x_min,int x_max,int y_min,int y_max,
-int x_extra, int y_extra,
-int x_invert, int y_invert,
+cell_info_t type,
 double* cur_array,
 int depth)
 {
-    __large_kernel_indexes;
-    /*
-    // if on the third column from the right
-    if(row > 0 && row < y_max+3+y_extra
-    && column == x_max+1+x_extra)
-    {
-        for(int ii = 0; ii < depth; ii++)
-        {
-            cur_array[THARR2D(1 + ii, 0, x_extra)] = (x_invert?-1:1) * cur_array[THARR2D(0, 0, x_extra)];
-        }
-    }
-    */
+    int x_extra = type.x_e;
+    int y_extra = type.y_e;
+    int x_invert = type.x_i;
+    int y_face = type.y_f;
 
-    if(column == x_max)
-    {
-        cur_array[THARR2D(0, 0, x_extra)] = (x_invert?-1:1) * cur_array[THARR2D(-1, 0, x_extra)];
-    }
-    else if(column == x_max + 1)
-    {
-        cur_array[THARR2D(0, 0, x_extra)] = (x_invert?-1:1) * cur_array[THARR2D(-3, 0, x_extra)];
-    }
-    else if(column == x_max + 2)
+    // offset source by -1 if its a y face
+    int y_f_offset = (y_face) ? 1 : 0;
+
+    const int glob_id = threadIdx.x + blockIdx.x * blockDim.x;
+    const int row = glob_id / depth;
+    const int column = glob_id % depth;
+
+    if(row >= 2 - depth && row <= (y_max + 1) + y_extra + depth)
     {
-        cur_array[THARR2D(0, 0, x_extra)] = (x_invert?-1:1) * cur_array[THARR2D(-5, 0, x_extra)];
+        const int offset = row * (x_max + 4 + x_extra);
+
+        cur_array[offset + x_max + 2 + x_extra + column] = x_invert * cur_array[offset + x_max + 1 - (column + y_f_offset)];
     }
 }
 
 void update_array
 (int x_min,int x_max,int y_min,int y_max,
-int x_extra, int y_extra,
-int x_invert, int y_invert,
+cell_info_t const& type,
 const int* chunk_neighbours,
-double* cur_array,
+double* cur_array_d,
 int depth)
 {
-    #define CHECK_LAUNCH(dir) \
-    if(chunk_neighbours[CHUNK_ ## dir] == EXTERNAL_FACE)\
+    #define CHECK_LAUNCH(face, dir) \
+    if(chunk_neighbours[CHUNK_ ## face] == EXTERNAL_FACE)\
     {\
-        device_update_halo_kernel_ ## dir ## _cuda<<< ((x_max+5)*(y_max+5))/BLOCK_SZ, BLOCK_SZ >>>\
-        (x_min,x_max,y_min,y_max, x_extra, y_extra, x_invert, y_invert, cur_array, depth);\
-        errChk(__LINE__, __FILE__);\
+        const int launch_sz = (ceil((dir##_max+4+type.dir##_e)/static_cast<float>(BLOCK_SZ))) * depth; \
+        device_update_halo_kernel_##face##_cuda \
+        <<< launch_sz, BLOCK_SZ >>> \
+        (x_min, x_max, y_min, y_max, type, cur_array_d, depth); \
+        errChk(__LINE__, __FILE__); \
     }
 
-    CHECK_LAUNCH(bottom);
-    CHECK_LAUNCH(top);
-    CHECK_LAUNCH(right);
-    CHECK_LAUNCH(left);
-
-}
-
-void update_halo_cuda
-(int x_min,int x_max,int y_min,int y_max,
-
-const int* chunk_neighbours,
-
-double* density0,
-double* energy0,
-double* pressure,
-double* viscosity,
-double* soundspeed,
-double* density1,
-double* energy1,
-double* xvel0,
-double* yvel0,
-double* xvel1,
-double* yvel1,
-double* vol_flux_x,
-double* vol_flux_y,
-double* mass_flux_x,
-double* mass_flux_y,
-
-const int* fields,
-int depth)
-{
-
-#ifdef TIME_KERNELS
-_CUDA_BEGIN_PROFILE_name(host);
-#endif
-
-    pointer_storage.setSize(x_max, y_max);
-    double* cur_array_d ;
-
-    #define HALO_UPDATE(arr, x_e, y_e, x_i, y_i) \
-    {if(fields[FIELD_ ## arr] == 1)\
-    {\
-        cur_array_d = pointer_storage.getDevStorageAndCopy(__LINE__, __FILE__, arr, BUFSZ2D(x_e, y_e));\
-        update_array(x_min, x_max, y_min, y_max, \
-            x_e, y_e, x_i, y_i, \
-            chunk_neighbours, cur_array_d, depth);\
-        pointer_storage.freeDevStorageAndCopy(cur_array_d, arr, BUFSZ2D(x_e, y_e));\
-    }}
-
-    HALO_UPDATE(density0, 0, 0, 0, 0);
-    HALO_UPDATE(energy0, 0, 0, 0, 0);
-    HALO_UPDATE(pressure, 0, 0, 0, 0);
-    HALO_UPDATE(viscosity, 0, 0, 0, 0);
-    HALO_UPDATE(soundspeed, 0, 0, 0, 0);
-    HALO_UPDATE(density1, 0, 0, 0, 0);
-    HALO_UPDATE(energy1, 0, 0, 0, 0);
-
-    HALO_UPDATE(xvel0, 1, 1, 1, 0);
-    HALO_UPDATE(yvel0, 1, 1, 0, 1);
-    HALO_UPDATE(xvel1, 1, 1, 1, 0);
-    HALO_UPDATE(yvel1, 1, 1, 0, 1);
-
-    HALO_UPDATE(vol_flux_x, 1, 0, 1, 0);
-    HALO_UPDATE(vol_flux_y, 0, 1, 0, 1);
-    HALO_UPDATE(mass_flux_x, 1, 0, 1, 0);
-    HALO_UPDATE(mass_flux_y, 0, 1, 0, 1);
-
-#ifdef TIME_KERNELS
-_CUDA_END_PROFILE_name(host);
-#endif
+    CHECK_LAUNCH(bottom, x);
+    CHECK_LAUNCH(top, x);
+    CHECK_LAUNCH(left, y);
+    CHECK_LAUNCH(right, y);
 }
 
 extern "C" void update_halo_kernel_cuda_
 (int *x_min,int *x_max,int *y_min,int *y_max,
-int* left,int* bottom,int* right,int* top,
-int* left_boundary,int* bottom_boundary,int* right_boundary,int* top_boundary,
+int* left, int* bottom, int* right, int* top,
+int* left_boundary, int* bottom_boundary, int* right_boundary, int* top_boundary,
 
 const int* chunk_neighbours,
 
@@ -272,62 +170,44 @@ double* mass_flux_x,
 double* mass_flux_y,
 
 const int* fields,
-int* depth)
+const int* depth)
 {
-#ifdef TIME_KERNELS
-_CUDA_BEGIN_PROFILE_name(host);
-#endif
-    #ifndef CUDA_RESIDENT
-    update_halo_cuda(*x_min, *x_max, *y_min, *y_max,
-        chunk_neighbours,
-        density0, energy0, pressure, viscosity, soundspeed, density1, energy1,
-        xvel0, yvel0, xvel1, yvel1,
-        vol_flux_x, vol_flux_y, mass_flux_x, mass_flux_y,
-        fields, *depth);
-    #else
     chunk.update_halo_kernel(fields, *depth, chunk_neighbours);
-    #endif
-#ifdef TIME_KERNELS
-_CUDA_END_PROFILE_name(host);
-#endif
 }
 
 void CloverleafCudaChunk::update_halo_kernel
-(const int* fields, int depth,
+(const int* fields,
+const int depth,
 const int* chunk_neighbours)
 {
-#ifdef TIME_KERNELS
-_CUDA_BEGIN_PROFILE_name(device);
-#endif
+    _CUDA_BEGIN_PROFILE_name(device);
 
-    #define HALO_UPDATE_RESIDENT(arr, x_e, y_e, x_i, y_i) \
+    #define HALO_UPDATE_RESIDENT(arr, type) \
     {if(fields[FIELD_ ## arr] == 1)\
     {\
         update_array(x_min, x_max, y_min, y_max, \
-            x_e, y_e, x_i, y_i, \
-            chunk_neighbours, arr, depth);\
+            type, chunk_neighbours, arr, depth);\
     }}
 
-    HALO_UPDATE_RESIDENT(density0, 0, 0, 0, 0);
-    HALO_UPDATE_RESIDENT(energy0, 0, 0, 0, 0);
-    HALO_UPDATE_RESIDENT(pressure, 0, 0, 0, 0);
-    HALO_UPDATE_RESIDENT(viscosity, 0, 0, 0, 0);
-    HALO_UPDATE_RESIDENT(soundspeed, 0, 0, 0, 0);
-    HALO_UPDATE_RESIDENT(density1, 0, 0, 0, 0);
-    HALO_UPDATE_RESIDENT(energy1, 0, 0, 0, 0);
-
-    HALO_UPDATE_RESIDENT(xvel0, 1, 1, 1, 0);
-    HALO_UPDATE_RESIDENT(yvel0, 1, 1, 0, 1);
-    HALO_UPDATE_RESIDENT(xvel1, 1, 1, 1, 0);
-    HALO_UPDATE_RESIDENT(yvel1, 1, 1, 0, 1);
-
-    HALO_UPDATE_RESIDENT(vol_flux_x, 1, 0, 1, 0);
-    HALO_UPDATE_RESIDENT(vol_flux_y, 0, 1, 0, 1);
-    HALO_UPDATE_RESIDENT(mass_flux_x, 1, 0, 1, 0);
-    HALO_UPDATE_RESIDENT(mass_flux_y, 0, 1, 0, 1);
-
-#ifdef TIME_KERNELS
-_CUDA_END_PROFILE_name(device);
-#endif
+    HALO_UPDATE_RESIDENT(density0, CELL);
+    HALO_UPDATE_RESIDENT(density1, CELL);
+    HALO_UPDATE_RESIDENT(energy0, CELL);
+    HALO_UPDATE_RESIDENT(energy1, CELL);
+    HALO_UPDATE_RESIDENT(pressure, CELL);
+    HALO_UPDATE_RESIDENT(viscosity, CELL);
+
+    HALO_UPDATE_RESIDENT(xvel0, VERTEX_X);
+    HALO_UPDATE_RESIDENT(xvel1, VERTEX_X);
+
+    HALO_UPDATE_RESIDENT(yvel0, VERTEX_Y);
+    HALO_UPDATE_RESIDENT(yvel1, VERTEX_Y);
+
+    HALO_UPDATE_RESIDENT(vol_flux_x, X_FACE);
+    HALO_UPDATE_RESIDENT(mass_flux_x, X_FACE);
+
+    HALO_UPDATE_RESIDENT(vol_flux_y, Y_FACE);
+    HALO_UPDATE_RESIDENT(mass_flux_y, Y_FACE);
+
+    _CUDA_END_PROFILE_name(device);
 }