Skip to content

Commit

Permalink
Merge pull request CompFUSE#328 from PDoakORNL/best_effort_for_frontier
Browse files Browse the repository at this point in the history
Fixes for Frontier
  • Loading branch information
PDoakORNL authored Aug 30, 2024
2 parents aae08d8 + c901074 commit f96c99b
Show file tree
Hide file tree
Showing 29 changed files with 623 additions and 219 deletions.
21 changes: 2 additions & 19 deletions build-aux/frontier_rocm6_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,26 +15,9 @@ cmake -DDCA_WITH_CUDA=off -DDCA_WITH_HIP=ON \
-DCMAKE_HIP_COMPILER=/opt/rocm-6.0.0/llvm/bin/clang++ \
-DCMAKE_INSTALL_PREFIX=$INST \
-DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
-DCMAKE_HIP_LINK_FLAGS=--hip-link \
-GNinja \
..

#cmake -DDCA_WITH_CUDA=off -DDCA_WITH_HIP=ON \
-DFFTW_ROOT=$FFTW_PATH \
-DDCA_FIX_BROKEN_MPICH=ON \
-DROCM_ROOT=${ROCM_PATH} \
-DMAGMA_ROOT=${MAGMA_ROOT} \
-DLAPACK_ROOT=${OPENBLAS_ROOT} \
-DBLAS_ROOT=${OPENBLAS_ROOT} \
-DDCA_WITH_TESTS_FAST=ON \
-DTEST_RUNNER="srun" \
-DGPU_TARGETS=gfx90a \
-DAMDGPU_TARGETS=gfx90a \
-DCMAKE_C_COMPILER=mpicc \
-DCMAKE_CXX_COMPILER=mpic++ \
-DCMAKE_HIP_COMPILER=/opt/rocm-6.0.0/llvm/bin/clang++ \
-DCMAKE_INSTALL_PREFIX=$INST \
-DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
-GNinja \
..
# cmake -DDCA_WITH_CUDA=off -DDCA_WITH_HIP=ON -DFFTW_ROOT=$FFTW_PATH -DDCA_FIX_BROKEN_MPICH=ON -DROCM_ROOT=${ROCM_PATH} -DMAGMA_ROOT=${MAGMA_ROOT} -DLAPACK_ROOT=${OPENBLAS_ROOT} -DBLAS_ROOT=${OPENBLAS_ROOT} -DDCA_WITH_TESTS_FAST=ON -DTEST_RUNNER="srun" -DGPU_TARGETS=gfx90a -DAMDGPU_TARGETS=gfx90a -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpic++ -DCMAKE_HIP_COMPILER=/opt/rocm-6.0.0/llvm/bin/clang++ -DCMAKE_INSTALL_PREFIX=$INST -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" -GNinja ..
..

14 changes: 8 additions & 6 deletions build-aux/frontier_rocm6_load_modules.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@

module reset
module load amd-mixed/6.0.0
spack load cmake%gcc@11.2.0
spack load ninja%gcc@11.2.0
spack load magma@master amdgpu_target=gfx90a
spack load hdf5@1.12.1 +cxx ~mpi api=v112 %rocmcc@6.0.0
spack load fftw ~mpi %rocmcc@6.0.0
spack load openblas@0.3.25 %gcc@11.2.0
module load ninja
module load cmake
#spack load cmake%gcc@11.2.0
#spack load ninja%gcc@11.2.0
#spack load magma@master amdgpu_target=gfx90a
#spack load hdf5@1.12.1 +cxx ~mpi api=v112 %rocmcc@6.0.0
#spack load fftw ~mpi %rocmcc@6.0.0
#spack load openblas@0.3.25 %gcc@11.2.0

export CC=mpicc
export CXX=mpicxx
Expand Down
6 changes: 6 additions & 0 deletions cmake/dca_config.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,12 @@ else()
set(TWO_PARTICLE_ALLOCATOR "dca::linalg::util::DeviceAllocator<T>")
endif()

# Developer-only switch: emits extra tracing of the delayed spin updates in the
# CT-AUX walker. Off by default and hidden from the normal cache view.
option(DCA_WITH_CTAUX_TRACING "special debug tracing of of delayed spin updates in ctaux" OFF)
mark_as_advanced(DCA_WITH_CTAUX_TRACING)
if(DCA_WITH_CTAUX_TRACING)
# Propagated to every translation unit; guarded code checks CTAUX_DEBUG_TRACING.
add_compile_definitions(CTAUX_DEBUG_TRACING)
endif()

configure_file("${PROJECT_SOURCE_DIR}/include/dca/config/mc_options.hpp.in"
"${CMAKE_BINARY_DIR}/include/dca/config/mc_options.hpp" @ONLY)

Expand Down
4 changes: 4 additions & 0 deletions cmake/dca_hip.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,10 @@ if (CMAKE_HIP_COMPILER)
set(DCA_HIP_PROPERTIES "CMAKE_HIP_ARCHITECTURES gfx908,gfx90a")
set(CMAKE_HIP_STANDARD 17)
list(APPEND HIP_HIPCC_FLAGS "-fPIC")
list(APPEND HIP_HIPCC_FLAGS "-mno-unsafe-fp-atomics")
list(APPEND HIP_HIPCC_FLAGS "-fgpu-default-stream=per-thread")
list(APPEND HIP_HIPCC_FLAGS_DEBUG "--save-temps -g")

# doesn't appear to work
set(CMAKE_HIP_SOURCE_FILE_EXTENSIONS cu)
message("Enabled HIP as a language")
Expand Down
33 changes: 25 additions & 8 deletions include/dca/linalg/matrix.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,14 +112,22 @@ class Matrix : public ALLOC {
// This method is available only if device_name == CPU.
template <DeviceType dn = device_name, typename = std::enable_if_t<dn == CPU>>
ScalarType& operator()(int i, int j) {
  // Element access dereferences the storage, so -- unlike ptr(), which may
  // legally form a one-past-the-end pointer -- the indices must be strictly
  // in bounds: 0 <= i < size().first, 0 <= j < size().second.
  // These are "manual" asserts (throw instead of assert) to ease debugging.
#ifndef NDEBUG
  if (!(i >= 0 && i < size_.first))
    throw std::runtime_error("assertion of i >= 0 && i < size_.first failed!");
  if (!(j >= 0 && j < size_.second))
    throw std::runtime_error("assertion of j >= 0 && j < size_.second failed!");
#endif
  // Column-major layout: element (i, j) lives at i + j * ld.
  return data_[i + j * leadingDimension()];
}
template <DeviceType dn = device_name, typename = std::enable_if_t<dn == CPU>>
const ScalarType& operator()(int i, int j) const {
  // Const element access; same strict-bound contract as the non-const
  // overload: 0 <= i < size().first, 0 <= j < size().second. A '<=' here
  // would allow a read one element past the end of a row/column.
#ifndef NDEBUG
  if (!(i >= 0 && i < size_.first))
    throw std::runtime_error("assertion of i >= 0 && i < size_.first failed!");
  if (!(j >= 0 && j < size_.second))
    throw std::runtime_error("assertion of j >= 0 && j < size_.second failed!");
#endif
  return data_[i + j * leadingDimension()];
}

Expand All @@ -142,13 +150,22 @@ class Matrix : public ALLOC {
// a pointer past the end of the range if i == size().first or j == size().second.
// Preconditions: 0 <= i <= size().first, 0 <= j <= size().second.
ValueType* ptr(int i, int j) {
  // Pointer arithmetic only (no dereference), so one-past-the-end indices
  // are allowed: 0 <= i <= size().first, 0 <= j <= size().second.
  // Thrown ("manual") asserts are used here because plain assert() is an
  // annoyance when debugging.
#ifndef NDEBUG
  const bool row_ok = i >= 0 && i <= size_.first;
  if (!row_ok)
    throw std::runtime_error("assertion of i >= 0 && i <= size_.first failed!");
  const bool col_ok = j >= 0 && j <= size_.second;
  if (!col_ok)
    throw std::runtime_error("assertion of j >= 0 && j <= size_.second failed!");
#endif
  return data_ + j * leadingDimension() + i;
}
const ValueType* ptr(int i, int j) const {
  // Const counterpart of ptr(): returns the address of element (i, j) in the
  // column-major storage; indices may be one past the end (no dereference).
#ifndef NDEBUG
  const bool row_ok = i >= 0 && i <= size_.first;
  if (!row_ok)
    throw std::runtime_error("assertion of i >= 0 && i <= size_.first failed!");
  const bool col_ok = j >= 0 && j <= size_.second;
  if (!col_ok)
    throw std::runtime_error("assertion of j >= 0 && j <= size_.second failed!");
#endif
  return data_ + j * leadingDimension() + i;
}

Expand Down
48 changes: 47 additions & 1 deletion include/dca/linalg/util/atomic_add_cuda.cu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,52 @@ __device__ void inline atomicAdd(double* address, const double val) {
atomicAddImpl(address, val);
}

#elif defined(DCA_HAVE_HIP)
// HIP seems to have some horrible problem with concurrent atomic operations.
// CAS-based double-precision atomic add (the classic pattern from the CUDA
// programming guide): reinterpret the 8-byte double as a 64-bit integer and
// retry compare-and-swap until no other thread modified *address in between.
// Returns the previous value stored at *address.
__device__ double inline atomicAddImpl(double* address, const double val) {
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);
return __longlong_as_double(old);
}

// CAS-based single-precision atomic add. The CAS must operate on a 32-bit
// integer: float is 4 bytes, so the previous 'unsigned long int' (8 bytes on
// Linux/Frontier) cast read and wrote 4 bytes past the float, corrupting the
// neighbouring object. Return type is float (the previous value at *address),
// mirroring the double overload.
__device__ float inline atomicAddImpl(float* address, const float val) {
  unsigned int* address_as_uint = (unsigned int*)address;
  unsigned int old = *address_as_uint, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_uint, assumed,
                    __float_as_int(val + __int_as_float(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
  } while (assumed != old);
  return __int_as_float(old);
}

// Public float overload: forwards to the CAS-based implementation and
// discards the returned previous value.
__device__ void inline atomicAdd(float* address, const float val) {
atomicAddImpl(address, val);
}

// Public double overload: forwards to the CAS-based implementation and
// discards the returned previous value.
__device__ void inline atomicAdd(double* address, const double val) {
atomicAddImpl(address, val);
}

// Complex<double> atomic add: a cuDoubleComplex is two adjacent doubles
// (x = real, y = imag), so add each component independently. The two adds are
// individually atomic but not atomic as a pair.
__device__ void inline atomicAdd(cuDoubleComplex* address, cuDoubleComplex val) {
double* a_d = reinterpret_cast<double*>(address);
atomicAddImpl(a_d, val.x);
atomicAddImpl(a_d + 1, val.y);
}

// Complex<float> atomic add: a magmaFloatComplex is two adjacent *floats*
// (x = real, y = imag). The previous reinterpret_cast<double*> read and wrote
// 16 bytes of an 8-byte object -- out-of-bounds access and garbage results.
// Add each float component via the single-precision CAS implementation.
// As with the double-complex overload, the pair update is not atomic as a whole.
__device__ void inline atomicAdd(magmaFloatComplex* const address, magmaFloatComplex val) {
  float* a_f = reinterpret_cast<float*>(address);
  atomicAddImpl(a_f, val.x);
  atomicAddImpl(a_f + 1, val.y);
}

#else
__device__ void inline atomicAdd(double* address, double val) {
::atomicAdd(address, val);
Expand All @@ -62,7 +108,7 @@ __device__ void inline atomicAdd(cuDoubleComplex* address, cuDoubleComplex val)
atomicAdd(a_d, val.x);
atomicAdd(a_d + 1, val.y);
}
#endif // __CUDA_ARCH__
#endif // atomic operation help

} // linalg
} // dca
Expand Down
1 change: 1 addition & 0 deletions include/dca/linalg/util/magma_queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ class MagmaQueue {
MagmaQueue& operator=(const MagmaQueue& rhs) = delete;

MagmaQueue(MagmaQueue&& rhs) noexcept : queue_(std::move(rhs.queue_)) {
std::swap(stream_, rhs.stream_);
std::swap(cublas_handle_, rhs.cublas_handle_);
std::swap(cusparse_handle_, rhs.cusparse_handle_);
std::swap(queue_, rhs.queue_);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -406,8 +406,8 @@ void CtauxAccumulator<device_t, Parameters, Data, DIST>::updateFrom(walker_type&
walker.get_error_distribution() = 0;
#endif // DCA_WITH_QMC_BIT

single_particle_accumulator_obj.syncStreams(*event);
two_particle_accumulator_.syncStreams(*event);
//single_particle_accumulator_obj.syncStreams(*event);
//two_particle_accumulator_.syncStreams(*event);
}

template <dca::linalg::DeviceType device_t, class Parameters, class Data, DistType DIST>
Expand Down
Loading

0 comments on commit f96c99b

Please sign in to comment.