Skip to content

Commit

Permalink
Merge pull request CompFUSE#328 from PDoakORNL/best_effort_for_frontier
Browse files Browse the repository at this point in the history
Fixes for Frontier
  • Loading branch information
PDoakORNL authored Aug 30, 2024
2 parents aae08d8 + c901074 commit f96c99b
Show file tree
Hide file tree
Showing 29 changed files with 623 additions and 219 deletions.
21 changes: 2 additions & 19 deletions build-aux/frontier_rocm6_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,26 +15,9 @@ cmake -DDCA_WITH_CUDA=off -DDCA_WITH_HIP=ON \
-DCMAKE_HIP_COMPILER=/opt/rocm-6.0.0/llvm/bin/clang++ \
-DCMAKE_INSTALL_PREFIX=$INST \
-DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
-DCMAKE_HIP_LINK_FLAGS=--hip-link \
-GNinja \
..

#cmake -DDCA_WITH_CUDA=off -DDCA_WITH_HIP=ON \
-DFFTW_ROOT=$FFTW_PATH \
-DDCA_FIX_BROKEN_MPICH=ON \
-DROCM_ROOT=${ROCM_PATH} \
-DMAGMA_ROOT=${MAGMA_ROOT} \
-DLAPACK_ROOT=${OPENBLAS_ROOT} \
-DBLAS_ROOT=${OPENBLAS_ROOT} \
-DDCA_WITH_TESTS_FAST=ON \
-DTEST_RUNNER="srun" \
-DGPU_TARGETS=gfx90a \
-DAMDGPU_TARGETS=gfx90a \
-DCMAKE_C_COMPILER=mpicc \
-DCMAKE_CXX_COMPILER=mpic++ \
-DCMAKE_HIP_COMPILER=/opt/rocm-6.0.0/llvm/bin/clang++ \
-DCMAKE_INSTALL_PREFIX=$INST \
-DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
-GNinja \
..
# cmake -DDCA_WITH_CUDA=off -DDCA_WITH_HIP=ON -DFFTW_ROOT=$FFTW_PATH -DDCA_FIX_BROKEN_MPICH=ON -DROCM_ROOT=${ROCM_PATH} -DMAGMA_ROOT=${MAGMA_ROOT} -DLAPACK_ROOT=${OPENBLAS_ROOT} -DBLAS_ROOT=${OPENBLAS_ROOT} -DDCA_WITH_TESTS_FAST=ON -DTEST_RUNNER="srun" -DGPU_TARGETS=gfx90a -DAMDGPU_TARGETS=gfx90a -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpic++ -DCMAKE_HIP_COMPILER=/opt/rocm-6.0.0/llvm/bin/clang++ -DCMAKE_INSTALL_PREFIX=$INST -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" -GNinja ..
..

14 changes: 8 additions & 6 deletions build-aux/frontier_rocm6_load_modules.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@

module reset
module load amd-mixed/6.0.0
spack load cmake%gcc@11.2.0
spack load ninja%gcc@11.2.0
spack load magma@master amdgpu_target=gfx90a
spack load hdf5@1.12.1 +cxx ~mpi api=v112 %rocmcc@6.0.0
spack load fftw ~mpi %rocmcc@6.0.0
spack load openblas@0.3.25 %gcc@11.2.0
module load ninja
module load cmake
#spack load cmake%gcc@11.2.0
#spack load ninja%gcc@11.2.0
#spack load magma@master amdgpu_target=gfx90a
#spack load hdf5@1.12.1 +cxx ~mpi api=v112 %rocmcc@6.0.0
#spack load fftw ~mpi %rocmcc@6.0.0
#spack load openblas@0.3.25 %gcc@11.2.0

export CC=mpicc
export CXX=mpicxx
Expand Down
6 changes: 6 additions & 0 deletions cmake/dca_config.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,12 @@ else()
set(TWO_PARTICLE_ALLOCATOR "dca::linalg::util::DeviceAllocator<T>")
endif()

# Developer-only switch: emits extra tracing of the delayed spin updates in the
# CT-AUX walker. Off by default and hidden from the normal cache view.
option(DCA_WITH_CTAUX_TRACING "special debug tracing of of delayed spin updates in ctaux" OFF)
mark_as_advanced(DCA_WITH_CTAUX_TRACING)
if(DCA_WITH_CTAUX_TRACING)
# Propagated to every translation unit; guarded code checks CTAUX_DEBUG_TRACING.
add_compile_definitions(CTAUX_DEBUG_TRACING)
endif()

configure_file("${PROJECT_SOURCE_DIR}/include/dca/config/mc_options.hpp.in"
"${CMAKE_BINARY_DIR}/include/dca/config/mc_options.hpp" @ONLY)

Expand Down
4 changes: 4 additions & 0 deletions cmake/dca_hip.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,10 @@ if (CMAKE_HIP_COMPILER)
set(DCA_HIP_PROPERTIES "CMAKE_HIP_ARCHITECTURES gfx908,gfx90a")
set(CMAKE_HIP_STANDARD 17)
list(APPEND HIP_HIPCC_FLAGS "-fPIC")
list(APPEND HIP_HIPCC_FLAGS "-mno-unsafe-fp-atomics")
list(APPEND HIP_HIPCC_FLAGS "-fgpu-default-stream=per-thread")
list(APPEND HIP_HIPCC_FLAGS_DEBUG "--save-temps -g")

# doesn't appear to work
set(CMAKE_HIP_SOURCE_FILE_EXTENSIONS cu)
message("Enabled HIP as a language")
Expand Down
33 changes: 25 additions & 8 deletions include/dca/linalg/matrix.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,14 +112,22 @@ class Matrix : public ALLOC {
// This method is available only if device_name == CPU.
template <DeviceType dn = device_name, typename = std::enable_if_t<dn == CPU>>
ScalarType& operator()(int i, int j) {
  // Element access dereferences the storage, so -- unlike ptr(), which may
  // legally form a one-past-the-end pointer -- the indices must be strictly
  // in bounds: 0 <= i < size().first, 0 <= j < size().second.
  // These are "manual" asserts (throw instead of assert) to ease debugging.
#ifndef NDEBUG
  if (!(i >= 0 && i < size_.first))
    throw std::runtime_error("assertion of i >= 0 && i < size_.first failed!");
  if (!(j >= 0 && j < size_.second))
    throw std::runtime_error("assertion of j >= 0 && j < size_.second failed!");
#endif
  // Column-major layout: element (i, j) lives at i + j * ld.
  return data_[i + j * leadingDimension()];
}
template <DeviceType dn = device_name, typename = std::enable_if_t<dn == CPU>>
const ScalarType& operator()(int i, int j) const {
  // Const element access; same strict-bound contract as the non-const
  // overload: 0 <= i < size().first, 0 <= j < size().second. A '<=' here
  // would allow a read one element past the end of a row/column.
#ifndef NDEBUG
  if (!(i >= 0 && i < size_.first))
    throw std::runtime_error("assertion of i >= 0 && i < size_.first failed!");
  if (!(j >= 0 && j < size_.second))
    throw std::runtime_error("assertion of j >= 0 && j < size_.second failed!");
#endif
  return data_[i + j * leadingDimension()];
}

Expand All @@ -142,13 +150,22 @@ class Matrix : public ALLOC {
// a pointer past the end of the range if i == size().first or j == size().second.
// Preconditions: 0 <= i <= size().first, 0 <= j <= size().second.
ValueType* ptr(int i, int j) {
  // Pointer arithmetic only (no dereference), so one-past-the-end indices
  // are allowed: 0 <= i <= size().first, 0 <= j <= size().second.
  // Thrown ("manual") asserts are used here because plain assert() is an
  // annoyance when debugging.
#ifndef NDEBUG
  const bool row_ok = i >= 0 && i <= size_.first;
  if (!row_ok)
    throw std::runtime_error("assertion of i >= 0 && i <= size_.first failed!");
  const bool col_ok = j >= 0 && j <= size_.second;
  if (!col_ok)
    throw std::runtime_error("assertion of j >= 0 && j <= size_.second failed!");
#endif
  return data_ + j * leadingDimension() + i;
}
const ValueType* ptr(int i, int j) const {
  // Const counterpart of ptr(): returns the address of element (i, j) in the
  // column-major storage; indices may be one past the end (no dereference).
#ifndef NDEBUG
  const bool row_ok = i >= 0 && i <= size_.first;
  if (!row_ok)
    throw std::runtime_error("assertion of i >= 0 && i <= size_.first failed!");
  const bool col_ok = j >= 0 && j <= size_.second;
  if (!col_ok)
    throw std::runtime_error("assertion of j >= 0 && j <= size_.second failed!");
#endif
  return data_ + j * leadingDimension() + i;
}

Expand Down
48 changes: 47 additions & 1 deletion include/dca/linalg/util/atomic_add_cuda.cu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,52 @@ __device__ void inline atomicAdd(double* address, const double val) {
atomicAddImpl(address, val);
}

#elif defined(DCA_HAVE_HIP)
// HIP seems to have some horrible problem with concurrent atomic operations.
// CAS-based double-precision atomic add (the classic pattern from the CUDA
// programming guide): reinterpret the 8-byte double as a 64-bit integer and
// retry compare-and-swap until no other thread modified *address in between.
// Returns the previous value stored at *address.
__device__ double inline atomicAddImpl(double* address, const double val) {
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);
return __longlong_as_double(old);
}

// CAS-based single-precision atomic add. The CAS must operate on a 32-bit
// integer: float is 4 bytes, so the previous 'unsigned long int' (8 bytes on
// Linux/Frontier) cast read and wrote 4 bytes past the float, corrupting the
// neighbouring object. Return type is float (the previous value at *address),
// mirroring the double overload.
__device__ float inline atomicAddImpl(float* address, const float val) {
  unsigned int* address_as_uint = (unsigned int*)address;
  unsigned int old = *address_as_uint, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_uint, assumed,
                    __float_as_int(val + __int_as_float(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
  } while (assumed != old);
  return __int_as_float(old);
}

// Public float overload: forwards to the CAS-based implementation and
// discards the returned previous value.
__device__ void inline atomicAdd(float* address, const float val) {
atomicAddImpl(address, val);
}

// Public double overload: forwards to the CAS-based implementation and
// discards the returned previous value.
__device__ void inline atomicAdd(double* address, const double val) {
atomicAddImpl(address, val);
}

// Complex<double> atomic add: a cuDoubleComplex is two adjacent doubles
// (x = real, y = imag), so add each component independently. The two adds are
// individually atomic but not atomic as a pair.
__device__ void inline atomicAdd(cuDoubleComplex* address, cuDoubleComplex val) {
double* a_d = reinterpret_cast<double*>(address);
atomicAddImpl(a_d, val.x);
atomicAddImpl(a_d + 1, val.y);
}

// Complex<float> atomic add: a magmaFloatComplex is two adjacent *floats*
// (x = real, y = imag). The previous reinterpret_cast<double*> read and wrote
// 16 bytes of an 8-byte object -- out-of-bounds access and garbage results.
// Add each float component via the single-precision CAS implementation.
// As with the double-complex overload, the pair update is not atomic as a whole.
__device__ void inline atomicAdd(magmaFloatComplex* const address, magmaFloatComplex val) {
  float* a_f = reinterpret_cast<float*>(address);
  atomicAddImpl(a_f, val.x);
  atomicAddImpl(a_f + 1, val.y);
}

#else
__device__ void inline atomicAdd(double* address, double val) {
::atomicAdd(address, val);
Expand All @@ -62,7 +108,7 @@ __device__ void inline atomicAdd(cuDoubleComplex* address, cuDoubleComplex val)
atomicAdd(a_d, val.x);
atomicAdd(a_d + 1, val.y);
}
#endif // __CUDA_ARCH__
#endif // atomic operation help

} // linalg
} // dca
Expand Down
1 change: 1 addition & 0 deletions include/dca/linalg/util/magma_queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ class MagmaQueue {
MagmaQueue& operator=(const MagmaQueue& rhs) = delete;

MagmaQueue(MagmaQueue&& rhs) noexcept : queue_(std::move(rhs.queue_)) {
std::swap(stream_, rhs.stream_);
std::swap(cublas_handle_, rhs.cublas_handle_);
std::swap(cusparse_handle_, rhs.cusparse_handle_);
std::swap(queue_, rhs.queue_);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -406,8 +406,8 @@ void CtauxAccumulator<device_t, Parameters, Data, DIST>::updateFrom(walker_type&
walker.get_error_distribution() = 0;
#endif // DCA_WITH_QMC_BIT

single_particle_accumulator_obj.syncStreams(*event);
two_particle_accumulator_.syncStreams(*event);
//single_particle_accumulator_obj.syncStreams(*event);
//two_particle_accumulator_.syncStreams(*event);
}

template <dca::linalg::DeviceType device_t, class Parameters, class Data, DistType DIST>
Expand Down
Loading

0 comments on commit f96c99b

Please sign in to comment.