Merge branch 'dev' into output-always-param

cholla-hydro · Dec 14, 2023 · 0d218b3 · 0d218b3
2 parents 156c642 + b30468a
commit 0d218b3
Show file tree

Hide file tree

Showing 69 changed files with 679 additions and 637 deletions.
diff --git a/builds/make.inc.template b/builds/make.inc.template
@@ -4,7 +4,7 @@
 
 #To use GPUs, CUDA must be turned on here
 #Optional error checking can also be enabled
-DFLAGS += -DCUDA #-DCUDA_ERROR_CHECK
+DFLAGS += -DCUDA
 
 #To use MPI, DFLAGS must include -DMPI_CHOLLA
 DFLAGS += -DMPI_CHOLLA
@@ -65,7 +65,7 @@ DFLAGS += -DTEMPERATURE_FLOOR
 #DFLAGS += -DDYNAMIC_GPU_ALLOC
 
 # Set the cooling function
-#DFLAGS += -DCOOLING_GPU 
+#DFLAGS += -DCOOLING_GPU
 #DFLAGS += -DCLOUDY_COOL
 
 # Use Tiled Initial Conditions for Scaling Tests

diff --git a/builds/make.type.disk b/builds/make.type.disk
@@ -22,7 +22,7 @@ DFLAGS += -DGRAVITY_5_POINTS_GRADIENT
 #DFLAGS += -DSTATIC_GRAV
 
 DFLAGS    += -DCUDA
-DFLAGS    += -DMPI_CHOLLA 
+DFLAGS    += -DMPI_CHOLLA
 DFLAGS    += -DPRECISION=2
 DFLAGS    += -DPPMC
 DFLAGS    += -DHLLC
@@ -42,9 +42,7 @@ DFLAGS    += -DHYDRO_GPU
 OUTPUT    ?=  -DOUTPUT -DHDF5 -DSLICES -DPROJECTION
 DFLAGS    += $(OUTPUT)
 
-DFLAGS    += $(MPI_GPU)  
+DFLAGS    += $(MPI_GPU)
 
 DFLAGS += -DPARALLEL_OMP
 DFLAGS += -DN_OMP_THREADS=$(OMP_NUM_THREADS)
-
-#DFLAGS += -DCUDA_ERROR_CHECK
diff --git a/builds/make.type.mhd b/builds/make.type.mhd
@@ -43,11 +43,11 @@ DFLAGS    += $(OUTPUT)
 #This is set in the system make.host file
 DFLAGS    += $(MPI_GPU)
 
+# Disable CUDA error checking
+# DFLAGS += -DDISABLE_GPU_ERROR_CHECKING
+
 # NOTE: The following macros are to help facilitate debugging and should not be
 # used on scientific runs
 
-# Do CUDA error checking
-# DFLAGS += -DCUDA_ERROR_CHECK
-
 # Limit the number of steps to evolve.
 # DFLAGS += -DN_STEPS_LIMIT=1000
diff --git a/clang-tidy-runner.sh b/clang-tidy-runner.sh
diff --git a/src/analysis/feedback_analysis.cpp b/src/analysis/feedback_analysis.cpp
@@ -16,8 +16,8 @@ FeedbackAnalysis::FeedbackAnalysis(Grid3D& G)
   h_circ_vel_y = (Real*)malloc(G.H.n_cells * sizeof(Real));
 
 #ifdef PARTICLES_GPU
-  CHECK(cudaMalloc((void**)&d_circ_vel_x, G.H.n_cells * sizeof(Real)));
-  CHECK(cudaMalloc((void**)&d_circ_vel_y, G.H.n_cells * sizeof(Real)));
+  GPU_Error_Check(cudaMalloc((void**)&d_circ_vel_x, G.H.n_cells * sizeof(Real)));
+  GPU_Error_Check(cudaMalloc((void**)&d_circ_vel_y, G.H.n_cells * sizeof(Real)));
 #endif
 
   // setup the (constant) circular speed arrays
@@ -40,8 +40,8 @@ FeedbackAnalysis::FeedbackAnalysis(Grid3D& G)
   }
 
 #ifdef PARTICLES_GPU
-  CHECK(cudaMemcpy(d_circ_vel_x, h_circ_vel_x, G.H.n_cells * sizeof(Real), cudaMemcpyHostToDevice));
-  CHECK(cudaMemcpy(d_circ_vel_y, h_circ_vel_y, G.H.n_cells * sizeof(Real), cudaMemcpyHostToDevice));
+  GPU_Error_Check(cudaMemcpy(d_circ_vel_x, h_circ_vel_x, G.H.n_cells * sizeof(Real), cudaMemcpyHostToDevice));
+  GPU_Error_Check(cudaMemcpy(d_circ_vel_y, h_circ_vel_y, G.H.n_cells * sizeof(Real), cudaMemcpyHostToDevice));
 #endif
 }
 
@@ -50,8 +50,8 @@ FeedbackAnalysis::~FeedbackAnalysis()
   free(h_circ_vel_x);
   free(h_circ_vel_y);
 #ifdef PARTICLES_GPU
-  CHECK(cudaFree(d_circ_vel_x));
-  CHECK(cudaFree(d_circ_vel_y));
+  GPU_Error_Check(cudaFree(d_circ_vel_x));
+  GPU_Error_Check(cudaFree(d_circ_vel_y));
 #endif
 }
 

diff --git a/src/analysis/feedback_analysis_gpu.cu b/src/analysis/feedback_analysis_gpu.cu
@@ -147,8 +147,8 @@ void FeedbackAnalysis::Compute_Gas_Velocity_Dispersion_GPU(Grid3D &G)
   Real *d_partial_vel;
   Real *h_partial_mass = (Real *)malloc(ngrid * sizeof(Real));
   Real *h_partial_vel  = (Real *)malloc(ngrid * sizeof(Real));
-  CHECK(cudaMalloc((void **)&d_partial_mass, ngrid * sizeof(Real)));
-  CHECK(cudaMalloc((void **)&d_partial_vel, ngrid * sizeof(Real)));
+  GPU_Error_Check(cudaMalloc((void **)&d_partial_mass, ngrid * sizeof(Real)));
+  GPU_Error_Check(cudaMalloc((void **)&d_partial_vel, ngrid * sizeof(Real)));
 
   Real total_mass = 0;
   Real total_vel  = 0;
@@ -177,8 +177,8 @@ void FeedbackAnalysis::Compute_Gas_Velocity_Dispersion_GPU(Grid3D &G)
 
   // cudaDeviceSynchronize();
 
-  CHECK(cudaMemcpy(h_partial_mass, d_partial_mass, ngrid * sizeof(Real), cudaMemcpyDeviceToHost));
-  CHECK(cudaMemcpy(h_partial_vel, d_partial_vel, ngrid * sizeof(Real), cudaMemcpyDeviceToHost));
+  GPU_Error_Check(cudaMemcpy(h_partial_mass, d_partial_mass, ngrid * sizeof(Real), cudaMemcpyDeviceToHost));
+  GPU_Error_Check(cudaMemcpy(h_partial_vel, d_partial_vel, ngrid * sizeof(Real), cudaMemcpyDeviceToHost));
 
   #ifdef MPI_CHOLLA
   MPI_Allreduce(h_partial_mass, &total_mass, 1, MPI_CHREAL, MPI_SUM, world);
@@ -195,8 +195,8 @@ void FeedbackAnalysis::Compute_Gas_Velocity_Dispersion_GPU(Grid3D &G)
   chprintf("feedback: time %f, dt=%f, vrms = %f km/s\n", G.H.t, G.H.dt,
            sqrt(total_vel / total_mass) * VELOCITY_UNIT / 1e5);
 
-  CHECK(cudaFree(d_partial_vel));
-  CHECK(cudaFree(d_partial_mass));
+  GPU_Error_Check(cudaFree(d_partial_vel));
+  GPU_Error_Check(cudaFree(d_partial_mass));
 
   free(h_partial_mass);
   free(h_partial_vel);

diff --git a/src/chemistry_gpu/chemistry_functions_gpu.cu b/src/chemistry_gpu/chemistry_functions_gpu.cu
@@ -17,38 +17,33 @@
 
 void Chem_GPU::Allocate_Array_GPU_float(float **array_dev, int size)
 {
-  cudaMalloc((void **)array_dev, size * sizeof(float));
-  CudaCheckError();
+  GPU_Error_Check(cudaMalloc((void **)array_dev, size * sizeof(float)));
 }
 
 void Chem_GPU::Copy_Float_Array_to_Device(int size, float *array_h, float *array_d)
 {
-  CudaSafeCall(cudaMemcpy(array_d, array_h, size * sizeof(float), cudaMemcpyHostToDevice));
+  GPU_Error_Check(cudaMemcpy(array_d, array_h, size * sizeof(float), cudaMemcpyHostToDevice));
   cudaDeviceSynchronize();
 }
 
-void Chem_GPU::Free_Array_GPU_float(float *array_dev)
-{
-  cudaFree(array_dev);
-  CudaCheckError();
-}
+void Chem_GPU::Free_Array_GPU_float(float *array_dev) { GPU_Error_Check(cudaFree(array_dev)); }
 
 void Chem_GPU::Allocate_Array_GPU_Real(Real **array_dev, int size)
 {
-  cudaMalloc((void **)array_dev, size * sizeof(Real));
-  CudaCheckError();
+  GPU_Error_Check(cudaMalloc((void **)array_dev, size * sizeof(Real)));
+  GPU_Error_Check();
 }
 
 void Chem_GPU::Copy_Real_Array_to_Device(int size, Real *array_h, Real *array_d)
 {
-  CudaSafeCall(cudaMemcpy(array_d, array_h, size * sizeof(Real), cudaMemcpyHostToDevice));
+  GPU_Error_Check(cudaMemcpy(array_d, array_h, size * sizeof(Real), cudaMemcpyHostToDevice));
   cudaDeviceSynchronize();
 }
 
 void Chem_GPU::Free_Array_GPU_Real(Real *array_dev)
 {
-  cudaFree(array_dev);
-  CudaCheckError();
+  GPU_Error_Check(cudaFree(array_dev));
+  GPU_Error_Check();
 }
 
 class Thermal_State
@@ -622,7 +617,7 @@ void Do_Chemistry_Update(Real *dev_conserved, int nx, int ny, int nz, int n_ghos
   hipLaunchKernelGGL(Update_Chemistry_kernel, dim1dGrid, dim1dBlock, 0, 0, dev_conserved, nx, ny, nz, n_ghost, n_fields,
                      dt, Chem_H);
 
-  CudaCheckError();
+  GPU_Error_Check();
   cudaEventRecord(stop, 0);
   cudaEventSynchronize(stop);
   cudaEventElapsedTime(&time, start, stop);

diff --git a/src/cooling/cooling_cuda.cu b/src/cooling/cooling_cuda.cu
@@ -26,7 +26,7 @@ void Cooling_Update(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, in
   dim3 dim1dBlock(TPB, 1, 1);
   hipLaunchKernelGGL(cooling_kernel, dim1dGrid, dim1dBlock, 0, 0, dev_conserved, nx, ny, nz, n_ghost, n_fields, dt,
                      gama, coolTexObj, heatTexObj);
-  CudaCheckError();
+  GPU_Error_Check();
 }
 
 /*! \fn void cooling_kernel(Real *dev_conserved, int nx, int ny, int nz, int

diff --git a/src/cooling/load_cloudy_texture.cu b/src/cooling/load_cloudy_texture.cu
@@ -110,16 +110,16 @@ void Load_Cuda_Textures()
 
   // allocate host arrays to be copied to textures
   // these arrays are declared as external pointers in global.h
-  CudaSafeCall(cudaHostAlloc(&cooling_table, nx * ny * sizeof(float), cudaHostAllocDefault));
-  CudaSafeCall(cudaHostAlloc(&heating_table, nx * ny * sizeof(float), cudaHostAllocDefault));
+  GPU_Error_Check(cudaHostAlloc(&cooling_table, nx * ny * sizeof(float), cudaHostAllocDefault));
+  GPU_Error_Check(cudaHostAlloc(&heating_table, nx * ny * sizeof(float), cudaHostAllocDefault));
 
   // Read cooling tables into the host arrays
   Host_Read_Cooling_Tables(cooling_table, heating_table);
 
   // Allocate CUDA arrays in device memory
   cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
-  cudaMallocArray(&cuCoolArray, &channelDesc, nx, ny);
-  cudaMallocArray(&cuHeatArray, &channelDesc, nx, ny);
+  GPU_Error_Check(cudaMallocArray(&cuCoolArray, &channelDesc, nx, ny));
+  GPU_Error_Check(cudaMallocArray(&cuHeatArray, &channelDesc, nx, ny));
 
   // Copy the cooling and heating arrays from host to device
 
@@ -164,8 +164,8 @@ void Load_Cuda_Textures()
   cudaCreateTextureObject(&heatTexObj, &heatResDesc, &texDesc, NULL);
 
   // Free the memory associated with the cooling tables on the host
-  CudaSafeCall(cudaFreeHost(cooling_table));
-  CudaSafeCall(cudaFreeHost(heating_table));
+  GPU_Error_Check(cudaFreeHost(cooling_table));
+  GPU_Error_Check(cudaFreeHost(heating_table));
 
   // Run Test
   // Test_Cloudy_Textures();
@@ -261,7 +261,7 @@ void Test_Cloudy_Textures()
   dim3 dim1dGrid((num_n * num_T + TPB - 1) / TPB, 1, 1);
   dim3 dim1dBlock(TPB, 1, 1);
   hipLaunchKernelGGL(Test_Cloudy_Textures_Kernel, dim1dGrid, dim1dBlock, 0, 0, num_n, num_T, coolTexObj, heatTexObj);
-  CHECK(cudaDeviceSynchronize());
+  GPU_Error_Check(cudaDeviceSynchronize());
   printf("Exiting due to Test_Cloudy_Textures() being called \n");
   exit(0);
 }
@@ -272,12 +272,12 @@ void Test_Cloudy_Speed()
   int num_T = 1 + 80 * 81;
   dim3 dim1dGrid((num_n * num_T + TPB - 1) / TPB, 1, 1);
   dim3 dim1dBlock(TPB, 1, 1);
-  CHECK(cudaDeviceSynchronize());
+  GPU_Error_Check(cudaDeviceSynchronize());
   Real time_start = Get_Time();
   for (int i = 0; i < 100; i++) {
     hipLaunchKernelGGL(Test_Cloudy_Speed_Kernel, dim1dGrid, dim1dBlock, 0, 0, num_n, num_T, coolTexObj, heatTexObj);
   }
-  CHECK(cudaDeviceSynchronize());
+  GPU_Error_Check(cudaDeviceSynchronize());
   Real time_end = Get_Time();
   printf(" Cloudy Test Time %9.4f micro-s \n", (time_end - time_start));
   printf("Exiting due to Test_Cloudy_Speed() being called \n");

diff --git a/src/dust/dust_cuda.cu b/src/dust/dust_cuda.cu
@@ -32,7 +32,7 @@ void Dust_Update(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n
   dim3 dim1dGrid(ngrid, 1, 1);
   dim3 dim1dBlock(TPB, 1, 1);
   hipLaunchKernelGGL(Dust_Kernel, dim1dGrid, dim1dBlock, 0, 0, dev_conserved, nx, ny, nz, n_ghost, n_fields, dt, gamma);
-  CudaCheckError();
+  GPU_Error_Check();
 }
 
 __global__ void Dust_Kernel(Real *dev_conserved, int nx, int ny, int nz, int n_ghost, int n_fields, Real dt, Real gamma)

diff --git a/src/global/global.cpp b/src/global/global.cpp
@@ -57,6 +57,26 @@ int Sgn(Real x)
   }
 }
 
+// global mpi-related variables (they are declared here because they are initialized even when
+// the MPI_CHOLLA variable is not defined)
+
+int procID; /*process rank*/
+int nproc;  /*number of processes in global comm*/
+int root;   /*rank of root process*/
+
+/* Used when MPI_CHOLLA is not defined to initialize a subset of the global mpi-related variables
+ * that are still meaningful in non-mpi simulations.
+ */
+void Init_Global_Parallel_Vars_No_MPI()
+{
+#ifdef MPI_CHOLLA
+  CHOLLA_ERROR("This function should not be executed when compiled with MPI");
+#endif
+  procID = 0;
+  nproc  = 1;
+  root   = 0;
+}
+
 /*! \fn char Trim(char *s)
  *  \brief Gets rid of trailing and leading whitespace. */
 char *Trim(char *s)

diff --git a/src/global/global.h b/src/global/global.h
@@ -155,6 +155,20 @@ extern double Get_Time(void);
  *  \brief Mathematical sign function. Returns sign of x. */
 extern int Sgn(Real x);
 
+/* Global variables for mpi (but they are also initialized to sensible defaults when not using mpi)
+ *
+ * It may make sense to move these back into mpi_routines (but reorganizing the ifdef statements
+ * would take some work). It may make sense to also put these into their own namespace.
+ */
+extern int procID; /*process rank*/
+extern int nproc;  /*number of processes executing simulation*/
+extern int root;   /*rank of root process*/
+
+/* Used when MPI_CHOLLA is not defined to initialize a subset of the global mpi-related variables
+ * that are still meaningful in non-mpi simulations.
+ */
+void Init_Global_Parallel_Vars_No_MPI();
+
 struct Parameters {
   int nx;
   int ny;