diff --git a/build-aux/frontier_rocm6_build.sh b/build-aux/frontier_rocm6_build.sh index cf68705ed..8a8e1e698 100644 --- a/build-aux/frontier_rocm6_build.sh +++ b/build-aux/frontier_rocm6_build.sh @@ -15,26 +15,9 @@ cmake -DDCA_WITH_CUDA=off -DDCA_WITH_HIP=ON \ -DCMAKE_HIP_COMPILER=/opt/rocm-6.0.0/llvm/bin/clang++ \ -DCMAKE_INSTALL_PREFIX=$INST \ -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \ + -DCMAKE_HIP_LINK_FLAGS=--hip-link \ -GNinja \ .. -#cmake -DDCA_WITH_CUDA=off -DDCA_WITH_HIP=ON \ - -DFFTW_ROOT=$FFTW_PATH \ - -DDCA_FIX_BROKEN_MPICH=ON \ - -DROCM_ROOT=${ROCM_PATH} \ - -DMAGMA_ROOT=${MAGMA_ROOT} \ - -DLAPACK_ROOT=${OPENBLAS_ROOT} \ - -DBLAS_ROOT=${OPENBLAS_ROOT} \ - -DDCA_WITH_TESTS_FAST=ON \ - -DTEST_RUNNER="srun" \ - -DGPU_TARGETS=gfx90a \ - -DAMDGPU_TARGETS=gfx90a \ - -DCMAKE_C_COMPILER=mpicc \ - -DCMAKE_CXX_COMPILER=mpic++ \ - -DCMAKE_HIP_COMPILER=/opt/rocm-6.0.0/llvm/bin/clang++ \ - -DCMAKE_INSTALL_PREFIX=$INST \ - -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \ - -GNinja \ - .. # cmake -DDCA_WITH_CUDA=off -DDCA_WITH_HIP=ON -DFFTW_ROOT=$FFTW_PATH -DDCA_FIX_BROKEN_MPICH=ON -DROCM_ROOT=${ROCM_PATH} -DMAGMA_ROOT=${MAGMA_ROOT} -DLAPACK_ROOT=${OPENBLAS_ROOT} -DBLAS_ROOT=${OPENBLAS_ROOT} -DDCA_WITH_TESTS_FAST=ON -DTEST_RUNNER="srun" -DGPU_TARGETS=gfx90a -DAMDGPU_TARGETS=gfx90a -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpic++ -DCMAKE_HIP_COMPILER=/opt/rocm-6.0.0/llvm/bin/clang++ -DCMAKE_INSTALL_PREFIX=$INST -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" -GNinja .. - .. + diff --git a/build-aux/frontier_rocm6_load_modules.sh b/build-aux/frontier_rocm6_load_modules.sh index 44951217c..10f63107c 100644 --- a/build-aux/frontier_rocm6_load_modules.sh +++ b/build-aux/frontier_rocm6_load_modules.sh @@ -9,12 +9,14 @@ module reset module load amd-mixed/6.0.0 -spack load cmake%gcc@11.2.0 -spack load ninja%gcc@11.2.0 -spack load magma@master amdgpu_target=gfx90a -spack load hdf5@1.12.1 +cxx ~mpi api=v112 %rocmcc@6.0.0 -spack load fftw ~mpi %rocmcc@6.0.0 -spack load openblas@0.3.25 %gcc@11.2.0 +module load ninja +module load cmake +#spack load cmake%gcc@11.2.0 +#spack load ninja%gcc@11.2.0 +#spack load magma@master amdgpu_target=gfx90a +#spack load hdf5@1.12.1 +cxx ~mpi api=v112 %rocmcc@6.0.0 +#spack load fftw ~mpi %rocmcc@6.0.0 +#spack load openblas@0.3.25 %gcc@11.2.0 export CC=mpicc export CXX=mpicxx diff --git a/cmake/dca_config.cmake b/cmake/dca_config.cmake index 07f10d773..bcf8ff05f 100644 --- a/cmake/dca_config.cmake +++ b/cmake/dca_config.cmake @@ -416,6 +416,12 @@ else() set(TWO_PARTICLE_ALLOCATOR "dca::linalg::util::DeviceAllocator") endif() +option(DCA_WITH_CTAUX_TRACING "special debug tracing of of delayed spin updates in ctaux" OFF) +mark_as_advanced(DCA_WITH_CTAUX_TRACING) +if(DCA_WITH_CTAUX_TRACING) + add_compile_definitions(CTAUX_DEBUG_TRACING) +endif() + configure_file("${PROJECT_SOURCE_DIR}/include/dca/config/mc_options.hpp.in" "${CMAKE_BINARY_DIR}/include/dca/config/mc_options.hpp" @ONLY) diff --git a/cmake/dca_hip.cmake b/cmake/dca_hip.cmake index 2d3e0ebb3..9938676f3 100644 --- a/cmake/dca_hip.cmake +++ b/cmake/dca_hip.cmake @@ -74,6 +74,10 @@ if (CMAKE_HIP_COMPILER) set(DCA_HIP_PROPERTIES "CMAKE_HIP_ARCHITECTURES gfx908,gfx90a") set(CMAKE_HIP_STANDARD 17) list(APPEND HIP_HIPCC_FLAGS "-fPIC") + list(APPEND HIP_HIPCC_FLAGS "-mno-unsafe-fp-atomics") + list(APPEND HIP_HIPCC_FLAGS "-fgpu-default-stream=per-thread") + list(APPEND HIP_HIPCC_FLAGS_DEBUG "--save-temps -g") + # doesn't appear to work set(CMAKE_HIP_SOURCE_FILE_EXTENSIONS cu) message("Enabled HIP as a language") diff --git a/include/dca/linalg/matrix.hpp b/include/dca/linalg/matrix.hpp index e55181138..bd7d96ced 100644 --- a/include/dca/linalg/matrix.hpp +++ b/include/dca/linalg/matrix.hpp @@ -112,14 +112,22 @@ class Matrix : public ALLOC { // This method is available only if device_name == CPU. template > ScalarType& operator()(int i, int j) { - assert(i >= 0 && i < size_.first); - assert(j >= 0 && j < size_.second); +#ifndef NDEBUG + if(!(i >= 0 && i <= size_.first)) + throw std::runtime_error("assertion of i >= 0 && i <= size_.first failed!"); + if(!(j >= 0 && j <= size_.second)) + throw std::runtime_error("assertion of j >= 0 && j <= size_.second failed!"); +#endif return data_[i + j * leadingDimension()]; } template > const ScalarType& operator()(int i, int j) const { - assert(i >= 0 && i < size_.first); - assert(j >= 0 && j < size_.second); +#ifndef NDEBUG + if(!(i >= 0 && i <= size_.first)) + throw std::runtime_error("assertion of i >= 0 && i <= size_.first failed!"); + if(!(j >= 0 && j <= size_.second)) + throw std::runtime_error("assertion of j >= 0 && j <= size_.second failed!"); +#endif return data_[i + j * leadingDimension()]; } @@ -142,13 +150,22 @@ class Matrix : public ALLOC { // a pointer past the end of the range if i == size().first or j == size().second. // Preconditions: 0 <= i <= size().first, 0 <= j <= size().second. ValueType* ptr(int i, int j) { - assert(i >= 0 && i <= size_.first); - assert(j >= 0 && j <= size_.second); + //These can be an annoyance debugging so making them "manual" asserts +#ifndef NDEBUG + if(!(i >= 0 && i <= size_.first)) + throw std::runtime_error("assertion of i >= 0 && i <= size_.first failed!"); + if(!(j >= 0 && j <= size_.second)) + throw std::runtime_error("assertion of j >= 0 && j <= size_.second failed!"); +#endif return data_ + i + j * leadingDimension(); } const ValueType* ptr(int i, int j) const { - assert(i >= 0 && i <= size_.first); - assert(j >= 0 && j <= size_.second); +#ifndef NDEBUG + if(!(i >= 0 && i <= size_.first)) + throw std::runtime_error("assertion of i >= 0 && i <= size_.first failed!"); + if(!(j >= 0 && j <= size_.second)) + throw std::runtime_error("assertion of j >= 0 && j <= size_.second failed!"); +#endif return data_ + i + j * leadingDimension(); } diff --git a/include/dca/linalg/util/atomic_add_cuda.cu.hpp b/include/dca/linalg/util/atomic_add_cuda.cu.hpp index da7b35dc3..a0afb4b61 100644 --- a/include/dca/linalg/util/atomic_add_cuda.cu.hpp +++ b/include/dca/linalg/util/atomic_add_cuda.cu.hpp @@ -42,6 +42,52 @@ __device__ void inline atomicAdd(double* address, const double val) { atomicAddImpl(address, val); } +#elif defined(DCA_HAVE_HIP) +// HIP seems to have some horrible problem with concurrent atomic operations. +__device__ double inline atomicAddImpl(double* address, const double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } + } while (assumed != old); + return __longlong_as_double(old); +} + +__device__ double inline atomicAddImpl(float* address, const float val) { + unsigned long int* address_as_int = (unsigned long int*)address; + unsigned long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } + } while (assumed != old); + return __int_as_float(old); +} + +__device__ void inline atomicAdd(float* address, const float val) { + atomicAddImpl(address, val); +} + +__device__ void inline atomicAdd(double* address, const double val) { + atomicAddImpl(address, val); +} + +__device__ void inline atomicAdd(cuDoubleComplex* address, cuDoubleComplex val) { + double* a_d = reinterpret_cast(address); + atomicAddImpl(a_d, val.x); + atomicAddImpl(a_d + 1, val.y); +} + + __device__ void inline atomicAdd(magmaFloatComplex* const address, magmaFloatComplex val) { + double* a_d = reinterpret_cast(address); + atomicAddImpl(a_d, val.x); + atomicAddImpl(a_d + 1, val.y); +} + #else __device__ void inline atomicAdd(double* address, double val) { ::atomicAdd(address, val); @@ -62,7 +108,7 @@ __device__ void inline atomicAdd(cuDoubleComplex* address, cuDoubleComplex val) atomicAdd(a_d, val.x); atomicAdd(a_d + 1, val.y); } -#endif // __CUDA_ARCH__ +#endif // atomic operation help } // linalg } // dca diff --git a/include/dca/linalg/util/magma_queue.hpp b/include/dca/linalg/util/magma_queue.hpp index 01ccb017a..e9b68597e 100644 --- a/include/dca/linalg/util/magma_queue.hpp +++ b/include/dca/linalg/util/magma_queue.hpp @@ -42,6 +42,7 @@ class MagmaQueue { MagmaQueue& operator=(const MagmaQueue& rhs) = delete; MagmaQueue(MagmaQueue&& rhs) noexcept : queue_(std::move(rhs.queue_)) { + std::swap(stream_, rhs.stream_); std::swap(cublas_handle_, rhs.cublas_handle_); std::swap(cusparse_handle_, rhs.cusparse_handle_); std::swap(queue_, rhs.queue_); diff --git a/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_accumulator.hpp b/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_accumulator.hpp index 61c26c833..343b3f37e 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_accumulator.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_accumulator.hpp @@ -406,8 +406,8 @@ void CtauxAccumulator::updateFrom(walker_type& walker.get_error_distribution() = 0; #endif // DCA_WITH_QMC_BIT - single_particle_accumulator_obj.syncStreams(*event); - two_particle_accumulator_.syncStreams(*event); + //single_particle_accumulator_obj.syncStreams(*event); + //two_particle_accumulator_.syncStreams(*event); } template diff --git a/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_walker.hpp b/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_walker.hpp index 3e681495b..1d60d0522 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_walker.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_walker.hpp @@ -13,6 +13,7 @@ #ifndef DCA_PHYS_DCA_STEP_CLUSTER_SOLVER_CTAUX_CTAUX_WALKER_HPP #define DCA_PHYS_DCA_STEP_CLUSTER_SOLVER_CTAUX_CTAUX_WALKER_HPP +#include #include #include // uint64_t #include // std::size_t @@ -42,6 +43,7 @@ #include "dca/phys/dca_step/cluster_solver/ctaux/walker/walker_bit.hpp" #include "dca/phys/dca_step/cluster_solver/shared_tools/util/accumulator.hpp" #include "dca/util/print_time.hpp" +#include "dca/config/lattice_model.hpp" namespace dca { namespace phys { @@ -66,7 +68,7 @@ class CtauxWalker : public WalkerBIT, public CtauxWalkerData, public CtauxWalkerData trace_; +#ifndef CTAUX_DEBUG_TRACING + template + void pushOnTrace(const ARGS&... args) {} + + static void dumpTrace(const std::vector trace, const Parameters& parameters_ref, + int thread_id, int depth_limit) {} +#else + template + void pushOnTrace(const ARGS&... args) { + std::ostringstream trace; + (trace << ... << args); + trace_.push_back(trace.str()); + } + + static void dumpTrace(const std::vector trace, const Parameters& parameters_ref, + int thread_id, int depth_limit) { + auto iter = std::make_reverse_iterator(trace.end()); + auto end = std::make_reverse_iterator(trace.begin()); + for (int depth = 0; iter > end, depth < depth_limit; ++iter, ++depth) + std::cerr << "rank:" << parameters_ref.get_concurrency().get_id() << "thread:" << thread_id + << " " << *iter << '\n'; + } +#endif +#endif }; template @@ -527,6 +557,9 @@ void CtauxWalker::initialize(int iteration) { template void CtauxWalker::doSweep() { Profiler profiler("do_sweep", "CT-AUX walker", __LINE__, thread_id); +#ifndef NDEBUG + trace_.clear(); +#endif const double sweeps_per_measurement{thermalized_ ? sweeps_per_measurement_ : 1.}; // Do at least one single spin update per sweep. @@ -603,10 +636,6 @@ std::enable_if_t CtauxW read_Gamma_matrices(e_DN); actually_download_from_device(); - // Gamma_up_CPU.setAsync(Gamma_up, thread_id, stream_id); - // Gamma_dn_CPU.setAsync(Gamma_dn, thread_id, stream_id); - - // linalg::util::syncStream(thread_id, stream_id); } // In case Gamma_up and Gamma_down reside in the CPU memory, avoid the copies using swap. @@ -625,9 +654,6 @@ std::enable_if_t CtauxW read_Gamma_matrices(e_DN); actually_download_from_device(); - - // Gamma_up_CPU.swap(Gamma_up); - // Gamma_dn_CPU.swap(Gamma_dn); } // In case Gamma_up and Gamma_down do not reside in the CPU memory, copy them. @@ -824,6 +850,13 @@ int CtauxWalker::generateDelayedSpinsAbortAtBennett( } } +#ifndef NDEBUG + if (delayed_spins.size() > max_num_delayed_spins) { + std::cout << "Delayed spins = " << delayed_spins.size() << " max_num_delayed_spins = " << max_num_delayed_spins << '\n'; + if (delayed_spins.size() >= 256) + throw std::runtime_error("delayed spins hit 256 or greater bailing out!"); + } +#endif // We need to unmark all "virtual" interacting spins, that we have temporarily marked as // annihilatable in CT_AUX_HS_configuration::get_random_noninteracting_vertex(). // TODO: Eliminate the need to mark and unmark these spins. @@ -834,6 +867,12 @@ int CtauxWalker::generateDelayedSpinsAbortAtBennett( assert(single_spin_updates_proposed == currently_proposed_creations_ + currently_proposed_annihilations_ + num_statics); +#ifndef NDEBUG + if (single_spin_updates_proposed >= max_num_delayed_spins) { + std::cout << "single_spin_updates_proposed = " << single_spin_updates_proposed << " max_num_delayed_spins = " << max_num_delayed_spins << '\n'; + } +#endif + return single_spin_updates_proposed; } @@ -1010,13 +1049,23 @@ void CtauxWalker::read_Gamma_matrices(e_spin_states // Profiler profiler(concurrency_, __FUNCTION__, "CT-AUX walker", __LINE__, thread_id); switch (e_spin) { case e_DN: + linalg::util::syncStream(thread_id, stream_id); +#ifndef NDEBUG + if(Gamma_dn.capacity().first < vertex_indixes.size()) + std::cout << "gamma_dn.capacity().first == " << Gamma_dn.capacity().first << " vertex_indixes.size() == " << vertex_indixes.size() << '\n'; +#endif CT_AUX_WALKER_TOOLS::compute_Gamma( Gamma_dn, N_dn, G_dn, vertex_indixes, exp_V, exp_delta_V, thread_id, stream_id); + // assume we've no guarantee this will be allowed to finish before the async copy starts + linalg::util::syncStream(thread_id, stream_id); break; case e_UP: + linalg::util::syncStream(thread_id, stream_id); CT_AUX_WALKER_TOOLS::compute_Gamma( Gamma_up, N_up, G_up, vertex_indixes, exp_V, exp_delta_V, thread_id, stream_id); + // assume we've no guarantee this will be allowed to finish before the async copy starts + linalg::util::syncStream(thread_id, stream_id); break; default: @@ -1060,10 +1109,24 @@ void CtauxWalker::compute_Gamma_matrices() { else { add_delayed_spin(delayed_index, Gamma_up_size, Gamma_dn_size); } + // upload_to_device(); } +#ifndef NDEBUG + if (last_failed_min_max_) + std::cout << "out of delayed spins loop but last vertex failed min_max test!\n"; +#endif add_delayed_spins_to_the_configuration(); +#ifndef NDEBUG + std::ostringstream gamma_diag; + for (int i = 0; i < Gamma_dn_CPU.size().first; i++) { + auto Gamma_val = std::abs(Gamma_dn_CPU(i, i)); + gamma_diag << Gamma_val << ", "; + } + pushOnTrace("diag of Gamma_dn:", gamma_diag.str()); +#endif + remove_non_accepted_and_bennett_spins_from_Gamma(Gamma_up_size, Gamma_dn_size); // #ifdef DCA_WITH_QMC_BIT @@ -1087,22 +1150,34 @@ void CtauxWalker::neutralize_delayed_spin(int& delay Gamma_up_size += 1; CT_AUX_WALKER_TOOLS::set_to_identity( Gamma_up_CPU, delayed_spins[delayed_index].Gamma_index_HS_field_DN); +#ifndef NDEBUG + pushOnTrace("delayed_spins[", std::to_string(delayed_index), "].e_spin_HS_field_DN == e_UP"); +#endif } else { Gamma_dn_size += 1; CT_AUX_WALKER_TOOLS::set_to_identity( Gamma_dn_CPU, delayed_spins[delayed_index].Gamma_index_HS_field_DN); +#ifndef NDEBUG + pushOnTrace("delayed_spins[", std::to_string(delayed_index), "].e_spin_HS_field_DN == e_DN"); +#endif } if (delayed_spins[delayed_index].e_spin_HS_field_UP == e_UP) { Gamma_up_size += 1; CT_AUX_WALKER_TOOLS::set_to_identity( Gamma_up_CPU, delayed_spins[delayed_index].Gamma_index_HS_field_UP); +#ifndef NDEBUG + pushOnTrace("delayed_spins[", std::to_string(delayed_index), "].e_spin_HS_field_UP == e_UP"); +#endif } else { Gamma_dn_size += 1; CT_AUX_WALKER_TOOLS::set_to_identity( Gamma_dn_CPU, delayed_spins[delayed_index].Gamma_index_HS_field_UP); +#ifndef NDEBUG + pushOnTrace("delayed_spins[", std::to_string(delayed_index), "].e_spin_HS_field_UP == e_DN"); +#endif } // delayed_index += 1; @@ -1144,19 +1219,31 @@ void CtauxWalker::remove_non_accepted_and_bennett_sp if (e_spin_HS_field_UP == e_UP) { Gamma_up_size -= 1; dca::linalg::matrixop::removeRowAndCol(Gamma_up_CPU, Gamma_index_HS_field_UP); +#ifndef NDEBUG + pushOnTrace("gu-fu-eu:", std::to_string(Gamma_index_HS_field_UP)); +#endif } else { Gamma_dn_size -= 1; dca::linalg::matrixop::removeRowAndCol(Gamma_dn_CPU, Gamma_index_HS_field_UP); +#ifndef NDEBUG + pushOnTrace("gd-fu-ed:", std::to_string(Gamma_index_HS_field_UP)); +#endif } if (e_spin_HS_field_DN == e_UP) { Gamma_up_size -= 1; dca::linalg::matrixop::removeRowAndCol(Gamma_up_CPU, Gamma_index_HS_field_DN); +#ifndef NDEBUG + pushOnTrace("gu-fd-eu:", std::to_string(Gamma_index_HS_field_DN)); +#endif } else { Gamma_dn_size -= 1; dca::linalg::matrixop::removeRowAndCol(Gamma_dn_CPU, Gamma_index_HS_field_DN); +#ifndef NDEBUG + pushOnTrace("gd-fd-ed:", std::to_string(Gamma_index_HS_field_DN)); +#endif } } } @@ -1178,6 +1265,26 @@ void CtauxWalker::add_delayed_spin(int& delayed_inde delayed_spins[delayed_index].new_HS_spin_value, delayed_spins[delayed_index].exp_delta_V_HS_field_UP)); +#ifndef NDEBUG + // The espin_up corresponds the Gamma matrix and diags used. + auto isDebugTestMinMax = [this](bool HS_field_up, bool espin_up, auto& ratio_HS_field, auto& Gamma_index_HS_field, auto& Gamma_CPU, auto& Gamma_diag_max, auto& Gamma_diag_min, int trace_depth) { + if (std::abs(ratio_HS_field) <= std::abs(Scalar(1.e-16))) + this->pushOnTrace("gamma matrix was ill conditioned!"); + if (Gamma_index_HS_field > 0 && std::abs(ratio_HS_field) > 1.e-16 && + !ctaux_tools.test_max_min(Gamma_index_HS_field, Gamma_CPU, Gamma_diag_max, + Gamma_diag_min)) { + std::cerr + << "ratiod: " << ratio_HS_field + << "After e_spin_HS_field_ " << (HS_field_up ? "UP" : "DN") + << " == e_" << (espin_up ? "UP" : "DN") << ", solve_Gamma test_max_min on Gamma_LU failed!\n"; + this->dumpTrace(trace_, parameters_, thread_id, trace_depth); + this->last_failed_min_max_ = true; + } + else + this->last_failed_min_max_ = false; + }; +#endif + if (delayed_spins[delayed_index].HS_current_move == CREATION) number_of_creations += 1; @@ -1193,34 +1300,58 @@ void CtauxWalker::add_delayed_spin(int& delayed_inde Scalar ratio_HS_field_DN = 0.; Scalar ratio_HS_field_UP = 0.; + // So what are these really for, in the first block below I use them for the debug max_min check Real tmp_up_diag_max = Gamma_up_diag_max; Real tmp_up_diag_min = Gamma_up_diag_min; Real tmp_dn_diag_max = Gamma_dn_diag_max; Real tmp_dn_diag_min = Gamma_dn_diag_min; +#ifndef NDEBUG + int trace_depth = 64; +#endif + { if (delayed_spins[delayed_index].e_spin_HS_field_DN == e_UP) { // std::cout << "\t" << "e_UP" << "\t" << Gamma_index_HS_field_DN << "\t" << Gamma_up_size << // "\t|\t"; - assert(Gamma_index_HS_field_DN == Gamma_up_size); - - ratio_HS_field_DN = ctaux_tools.solve_Gamma_blocked(Gamma_index_HS_field_DN, Gamma_up_CPU, - exp_delta_V_HS_field_DN, - Gamma_up_diag_max, Gamma_up_diag_min); - +#ifndef NDEBUG + if (Gamma_index_HS_field_DN != Gamma_up_size) { + std::cerr << "Gamma_index_HS_field_DN = " << Gamma_index_HS_field_DN << " Gamma_up_size = " << Gamma_up_size << '\n'; + } +#endif + if constexpr (Lattice::spin_symmetric) + ratio_HS_field_DN = ctaux_tools.solve_Gamma_blocked(Gamma_index_HS_field_DN, Gamma_up_CPU, + exp_delta_V_HS_field_DN, + Gamma_up_diag_max, Gamma_up_diag_min); + else + std::cout << "warning Gamma_up delayed spin for spin_symmetric=false model"; + +#ifndef NDEBUG + isDebugTestMinMax(false, true, ratio_HS_field_DN, Gamma_index_HS_field_DN, Gamma_up_CPU, Gamma_up_diag_max, Gamma_up_diag_min, trace_depth); +#endif + Gamma_up_size += 1; } else { // std::cout << "\t" << "e_DN" << "\t" << Gamma_index_HS_field_DN << "\t" << Gamma_dn_size << // "\t|\t"; - assert(Gamma_index_HS_field_DN == Gamma_dn_size); +#ifndef NDEBUG + if (Gamma_index_HS_field_DN != Gamma_dn_size) { + std::cerr << "Gamma_index_HS_field_DN = " << Gamma_index_HS_field_DN << " Gamma_dn_size = " << Gamma_dn_size << '\n'; + } +#endif ratio_HS_field_DN = ctaux_tools.solve_Gamma_blocked(Gamma_index_HS_field_DN, Gamma_dn_CPU, exp_delta_V_HS_field_DN, Gamma_dn_diag_max, Gamma_dn_diag_min); + +#ifndef NDEBUG + isDebugTestMinMax(false, false, ratio_HS_field_DN, Gamma_index_HS_field_DN, Gamma_dn_CPU, Gamma_dn_diag_max, Gamma_dn_diag_min, trace_depth); +#endif + Gamma_dn_size += 1; } @@ -1228,24 +1359,41 @@ void CtauxWalker::add_delayed_spin(int& delayed_inde // std::cout << "\t" << "e_UP" << "\t" << Gamma_index_HS_field_UP << "\t" << Gamma_up_size << // "\t|\t"; - assert(Gamma_index_HS_field_UP == Gamma_up_size); - - ratio_HS_field_UP = ctaux_tools.solve_Gamma_blocked(Gamma_index_HS_field_UP, Gamma_up_CPU, - exp_delta_V_HS_field_UP, - Gamma_up_diag_max, Gamma_up_diag_min); - +#ifndef NDEBUG + if (Gamma_index_HS_field_UP != Gamma_up_size) { + std::cerr << "Gamma_index_HS_field_UP = " << Gamma_index_HS_field_UP << " Gamma_up_size = " << Gamma_up_size << '\n'; + } +#endif + + if constexpr (Lattice::spin_symmetric) + ratio_HS_field_UP = ctaux_tools.solve_Gamma_blocked(Gamma_index_HS_field_UP, Gamma_up_CPU, + exp_delta_V_HS_field_UP, + Gamma_up_diag_max, Gamma_up_diag_min); + else + std::cout << "warning Gamma_up delayed spin for spin_symmetric=false model"; + +#ifndef NDEBUG + isDebugTestMinMax(true, true, ratio_HS_field_UP, Gamma_index_HS_field_UP, Gamma_up_CPU, Gamma_up_diag_max, Gamma_up_diag_min, trace_depth); +#endif Gamma_up_size += 1; } else { // std::cout << "\t" << "e_DN" << "\t" << Gamma_index_HS_field_UP << "\t" << Gamma_dn_size << // "\t|\t"; - assert(Gamma_index_HS_field_UP == Gamma_dn_size); - +#ifndef NDEBUG + if (Gamma_index_HS_field_UP != Gamma_dn_size) { + std::cerr << "Gamma_index_HS_field_UP = " << Gamma_index_HS_field_UP << " Gamma_dn_size = " << Gamma_dn_size << '\n'; + } +#endif + ratio_HS_field_UP = ctaux_tools.solve_Gamma_blocked(Gamma_index_HS_field_UP, Gamma_dn_CPU, exp_delta_V_HS_field_UP, Gamma_dn_diag_max, Gamma_dn_diag_min); +#ifndef NDEBUG + isDebugTestMinMax(true, false, ratio_HS_field_UP, Gamma_index_HS_field_UP, Gamma_dn_CPU, Gamma_dn_diag_max, Gamma_dn_diag_min, trace_depth); +#endif Gamma_dn_size += 1; } @@ -1253,9 +1401,6 @@ void CtauxWalker::add_delayed_spin(int& delayed_inde } const auto determinant_ratio = ratio_HS_field_UP * ratio_HS_field_DN; - // std::cout << "ratio up: " << std::setprecision(16) << ratio_HS_field_UP - // << " ratio down: " << std::setprecision(16) << ratio_HS_field_DN << '\n'; - // std::cout << "determinant_ratio: " << determinant_ratio << '\n'; auto acceptance_ratio = calculate_acceptance_ratio(determinant_ratio, delayed_spins[delayed_index].HS_current_move, delayed_spins[delayed_index].QMC_factor); @@ -1269,6 +1414,10 @@ void CtauxWalker::add_delayed_spin(int& delayed_inde if (std::abs(acceptance_ratio) >= rng()) { delayed_spins[delayed_index].is_accepted_move = true; +#ifndef NDEBUG + if (last_failed_min_max_) + std::cout << "Accepted but last vertex insert failed min_max test!\n"; +#endif // Update Monte Carlo weight. mc_log_weight_ += std::log(std::abs(determinant_ratio)); // std::cout << "mc_log_weight: " << mc_log_weight_ << '\n'; @@ -1296,40 +1445,73 @@ void CtauxWalker::add_delayed_spin(int& delayed_inde number_of_interacting_spins -= 1; } else { - /* - Gamma_up_diag_max = tmp_up_diag_max<1. ? 1. : tmp_up_diag_max; - Gamma_dn_diag_max = tmp_dn_diag_max<1. ? 1. : tmp_dn_diag_max; - Gamma_up_diag_min = tmp_up_diag_min>1. ? 1. : tmp_up_diag_min; - Gamma_dn_diag_min = tmp_dn_diag_min>1. ? 1. : tmp_dn_diag_min; +#ifndef NDEBUG + if (last_failed_min_max_) + std::cout << "Rejected but last vertex insert failed min_max test!\n"; +#endif + + /* + Gamma_up_diag_max = tmp_up_diag_max<1. ? 1. : tmp_up_diag_max; + Gamma_dn_diag_max = tmp_dn_diag_max<1. ? 1. : tmp_dn_diag_max; + Gamma_up_diag_min = tmp_up_diag_min>1. ? 1. : tmp_up_diag_min; + Gamma_dn_diag_min = tmp_dn_diag_min>1. ? 1. : tmp_dn_diag_min; + + delayed_spins[delayed_index].is_accepted_move = false; + + if(delayed_spins[delayed_index].e_spin_HS_field_DN == e_UP) + { + CT_AUX_WALKER_TOOLS::set_to_identity(Gamma_up_CPU, Gamma_up_size-1); + } + else + { + CT_AUX_WALKER_TOOLS::set_to_identity(Gamma_dn_CPU, Gamma_dn_size-1); + } - delayed_spins[delayed_index].is_accepted_move = false; + if(delayed_spins[delayed_index].e_spin_HS_field_UP == e_UP) + { + CT_AUX_WALKER_TOOLS::set_to_identity(Gamma_up_CPU, Gamma_up_size-1); + } + else + { + CT_AUX_WALKER_TOOLS::set_to_identity(Gamma_dn_CPU, Gamma_dn_size-1); + } + */ - if(delayed_spins[delayed_index].e_spin_HS_field_DN == e_UP) - { - CT_AUX_WALKER_TOOLS::set_to_identity(Gamma_up_CPU, Gamma_up_size-1); - } - else - { - CT_AUX_WALKER_TOOLS::set_to_identity(Gamma_dn_CPU, Gamma_dn_size-1); - } + // delayed_spins[delayed_index].is_accepted_move = false; - if(delayed_spins[delayed_index].e_spin_HS_field_UP == e_UP) - { - CT_AUX_WALKER_TOOLS::set_to_identity(Gamma_up_CPU, Gamma_up_size-1); - } - else - { - CT_AUX_WALKER_TOOLS::set_to_identity(Gamma_dn_CPU, Gamma_dn_size-1); - } - */ +#ifndef NDEBUG + auto getNewDiagMin = [](auto& Gamma, auto gamma_size, auto& gamma_min) -> Scalar { + for (int i = 0; i < gamma_size - 1; ++i) + gamma_min = std::min(std::abs(gamma_min), std::abs(Gamma(i, i))); + }; +#endif if (delayed_spins[delayed_index].e_spin_HS_field_DN == e_UP and delayed_spins[delayed_index].e_spin_HS_field_UP == e_UP) { Gamma_up_diag_max = tmp_up_diag_max < 1. ? 1. : tmp_up_diag_max; Gamma_up_diag_min = tmp_up_diag_min > 1. ? 1. : tmp_up_diag_min; +#ifndef NDEBUG + bool was_lowest = false; + for (int n : {Gamma_up_size - 2, Gamma_up_size - 1}) + if (Gamma_up_CPU(n, n) == Gamma_up_diag_min) { + was_lowest = true; + break; + } +#endif + CT_AUX_WALKER_TOOLS::set_to_identity(Gamma_up_CPU, Gamma_up_size - 2); CT_AUX_WALKER_TOOLS::set_to_identity(Gamma_up_CPU, Gamma_up_size - 1); + +#ifndef NDEBUG + if (was_lowest) + getNewDiagMin(Gamma_up_CPU, Gamma_up_size, Gamma_up_diag_min); + // here we have to check if Gamma_up_diag_min was one of the values that just got set to 1 if + // it is well that's bad for passing test_max_min + pushOnTrace("double delayed_spins[", std::to_string(delayed_index), + "].e_spin_HS_field_[DN,UP] == [e_UP,e_UP] G_up:", std::to_string(Gamma_up_size - 2), + ", G_up:", std::to_string(Gamma_up_size - 1)); +#endif } if (delayed_spins[delayed_index].e_spin_HS_field_DN == e_DN and @@ -1341,6 +1523,11 @@ void CtauxWalker::add_delayed_spin(int& delayed_inde CT_AUX_WALKER_TOOLS::set_to_identity(Gamma_dn_CPU, Gamma_dn_size - 1); CT_AUX_WALKER_TOOLS::set_to_identity(Gamma_up_CPU, Gamma_up_size - 1); +#ifndef NDEBUG + pushOnTrace("double delayed_spins[", std::to_string(delayed_index), + "].e_spin_HS_field_[DN,UP] == [e_DN,e_UP] G_dn:", std::to_string(Gamma_dn_size - 1), + ", G_up:", std::to_string(Gamma_up_size - 1)); +#endif } if (delayed_spins[delayed_index].e_spin_HS_field_DN == e_UP and @@ -1352,18 +1539,43 @@ void CtauxWalker::add_delayed_spin(int& delayed_inde CT_AUX_WALKER_TOOLS::set_to_identity(Gamma_up_CPU, Gamma_up_size - 1); CT_AUX_WALKER_TOOLS::set_to_identity(Gamma_dn_CPU, Gamma_dn_size - 1); +#ifndef NDEBUG + pushOnTrace("double delayed_spins[", std::to_string(delayed_index), + "].e_spin_HS_field_[DN,UP] == [e_UP,e_DN] G_dn", + std::to_string(Gamma_dn_size - 1), ", G_up:", std::to_string(Gamma_up_size - 1)); +#endif //NDEBUG } if (delayed_spins[delayed_index].e_spin_HS_field_DN == e_DN and delayed_spins[delayed_index].e_spin_HS_field_UP == e_DN) { Gamma_dn_diag_max = tmp_dn_diag_max < 1. ? 1. : tmp_dn_diag_max; - Gamma_dn_diag_min = tmp_dn_diag_min > 1. ? 1. : tmp_dn_diag_min; + Gamma_dn_diag_min = tmp_dn_diag_min > 1 ? 1. : tmp_dn_diag_min; + +#ifndef NDEBUG + bool was_lowest = false; + for (int n : {Gamma_dn_size - 2, Gamma_dn_size - 1}) + if (Gamma_dn_CPU(n, n) == Gamma_dn_diag_min) { + was_lowest = true; + break; + } +#endif CT_AUX_WALKER_TOOLS::set_to_identity(Gamma_dn_CPU, Gamma_dn_size - 2); CT_AUX_WALKER_TOOLS::set_to_identity(Gamma_dn_CPU, Gamma_dn_size - 1); +#ifndef NDEBUG + if (was_lowest) + getNewDiagMin(Gamma_dn_CPU, Gamma_dn_size, Gamma_dn_diag_min); + pushOnTrace("double delayed_spins[", std::to_string(delayed_index), + "].e_spin_HS_field_[DN,UP] == [e_DN,e_DN] G_dn:", std::to_string(Gamma_dn_size - 2), + ", G_dn:", std::to_string(Gamma_dn_size - 1)); +#endif } + // Here I think Gamma has to go back to the GPU + // In the current code it doesn't until the loop over delayed spins ends. I don't know + // why this is wrong as everything seems to be happening on the CPU, but + // epirically it seems elements aren't ending up as identity on the GPU that should. } -} // namespace ctaux +} template void CtauxWalker::apply_bennett_on_Gamma_matrices(int& /*Gamma_up_size*/, @@ -1603,9 +1815,7 @@ template template const linalg::util::GpuEvent* CtauxWalker::computeM( std::array, 2>& Ms) { - // Stream 1 waits on stream 0. - sync_streams_event_.record(linalg::util::getStream(thread_id, 0)); - sync_streams_event_.block(linalg::util::getStream(thread_id, 1)); + linalg::util::syncStream(thread_id, stream_id); for (int s = 0; s < 2; ++s) { const auto& config = get_configuration().get(s == 0 ? e_DN : e_UP); @@ -1620,18 +1830,15 @@ const linalg::util::GpuEvent* CtauxWalker::computeM( M.resizeNoCopy(N.size()); if (device_t == linalg::GPU) { - exp_v_minus_one_dev_[s].setAsync(exp_v_minus_one_[s], thread_id, s); - dca::linalg::matrixop::multiplyDiagonalLeft(exp_v_minus_one_dev_[s], N, M, thread_id, s); + exp_v_minus_one_dev_[s].setAsync(exp_v_minus_one_[s], thread_id, stream_id); + dca::linalg::matrixop::multiplyDiagonalLeft(exp_v_minus_one_dev_[s], N, M, thread_id, + stream_id); } else { dca::linalg::matrixop::multiplyDiagonalLeft(exp_v_minus_one_[s], N, M); } } - - m_computed_events_[1].record(linalg::util::getStream(thread_id, 1)); - m_computed_events_[1].block(linalg::util::getStream(thread_id, 0)); - - m_computed_events_[0].record(linalg::util::getStream(thread_id, 0)); + m_computed_events_[0].record(linalg::util::getStream(thread_id, stream_id)); return &m_computed_events_[0]; } diff --git a/include/dca/phys/dca_step/cluster_solver/ctaux/structs/ct_aux_hs_configuration.hpp b/include/dca/phys/dca_step/cluster_solver/ctaux/structs/ct_aux_hs_configuration.hpp index d41523f84..c4cfe721d 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctaux/structs/ct_aux_hs_configuration.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctaux/structs/ct_aux_hs_configuration.hpp @@ -34,6 +34,8 @@ namespace solver { namespace ctaux { // dca::phys::solver::ctaux:: +/** Represents the Hubbard-Stratonovitch vertices in a particular configuration. + */ template class CT_AUX_HS_configuration { public: diff --git a/include/dca/phys/dca_step/cluster_solver/ctaux/walker/ct_aux_walker_tools.hpp b/include/dca/phys/dca_step/cluster_solver/ctaux/walker/ct_aux_walker_tools.hpp index 857e32e0a..63a5820b8 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctaux/walker/ct_aux_walker_tools.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctaux/walker/ct_aux_walker_tools.hpp @@ -82,24 +82,36 @@ class CT_AUX_WALKER_TOOLS { // exp_delta_V); auto solve_Gamma(int n, dca::linalg::Matrix& Gamma_LU, Scalar exp_delta_V, Real& max, Real& min) -> Real; - auto solve_Gamma_blocked(int n, dca::linalg::Matrix& Gamma_LU, + + /** Solve gamma using blocked submatrix updates + * \param[in] int + * \param[inout] Gamma_LU + * \param[ioout] max + * \param[inout] min + */ + auto solve_Gamma_blocked(const int n, dca::linalg::Matrix& Gamma_LU, Scalar exp_delta_V, Real& max, Real& min) -> Scalar; auto apply_bennett_on_Gamma(int k, int n, dca::linalg::Matrix& Gamma_LU, Scalar phani_gamma, Real& max, Real& min) -> Real; + /** Debugging function + * requires: n > 0 + */ + static bool test_max_min(int n, dca::linalg::Matrix& Gamma_LU, Real max, + Real min); + private: void solve_Gamma_slow(int n, dca::linalg::Matrix& Gamma_LU); + void solve_Gamma_slow(int n, Scalar* Gamma_LU, int lda); void solve_Gamma_fast(int n, dca::linalg::Matrix& Gamma_LU); + void solve_Gamma_fast(int n, Scalar* A, int LD); void solve_Gamma_BLAS(int n, dca::linalg::Matrix& Gamma_LU); + void solve_Gamma_BLAS(int n, Scalar* Gamma_LU, int lda); - void solve_Gamma_fast(int n, Scalar* A, int LD); void solve_Gamma_blocked(int n, dca::linalg::Matrix& Gamma_LU); - bool test_max_min(int n, dca::linalg::Matrix& Gamma_LU, Real max, - Real min); - /** Return x/y. * nothing but that is done for real number but for complex division an implementation consistent with cuda * and magma libraries is used. diff --git a/include/dca/phys/dca_step/cluster_solver/ctaux/walker/tools/n_matrix_tools/n_matrix_tools_gpu.inc b/include/dca/phys/dca_step/cluster_solver/ctaux/walker/tools/n_matrix_tools/n_matrix_tools_gpu.inc index b6aa9281a..25862b83d 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctaux/walker/tools/n_matrix_tools/n_matrix_tools_gpu.inc +++ b/include/dca/phys/dca_step/cluster_solver/ctaux/walker/tools/n_matrix_tools/n_matrix_tools_gpu.inc @@ -129,7 +129,7 @@ void N_MATRIX_TOOLS::copy_rows( assert(N_new_spins.nrCols() == N.nrCols()); assert(N_new_spins.nrRows() == permutation.size()); assert(permutation.size() <= identity.size()); - + dca::linalg::util::syncStream(thread_id, stream_id); dca::linalg::matrixop::copyRows(N, permutation, N_new_spins, identity, thread_id, stream_id); } @@ -139,6 +139,7 @@ void N_MATRIX_TOOLS::compute_G_cols( dca::linalg::Matrix& G, dca::linalg::Matrix& G_cols) { exp_V.setAsync(exp_V_CPU, linalg::util::getStream(thread_id, stream_id)); + dca::linalg::util::syncStream(thread_id, stream_id); assert(N.nrRows() == G.nrRows()); assert(N.nrRows() == G_cols.nrRows()); diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.hpp index b869876da..752c3283a 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.hpp @@ -53,7 +53,7 @@ class DMatrixBuilder final : public DMatrixBuilder::computeG0. // Out: G0. Device matrix void computeG0(Matrix& G0, const details::DeviceConfiguration& configuration, int n_init, - bool right_section, GpuStream stream) const override; + bool right_section, const GpuStream& stream) const; private: const G0Interpolation& g0_ref_; diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/kernels_interface.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/kernels_interface.hpp index 8157478a1..d749dfc93 100644 --- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/kernels_interface.hpp +++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/kernels_interface.hpp @@ -30,7 +30,17 @@ namespace details { template void buildG0Matrix(linalg::MatrixView G0, const int n_init, const bool right_section, DeviceConfiguration config, - DeviceInterpolationData g0_interp, dca::linalg::util::GpuStream stream); + DeviceInterpolationData g0_interp, const dca::linalg::util::GpuStream& stream); +extern template void buildG0Matrix(linalg::MatrixView, const int, const bool, + DeviceConfiguration, DeviceInterpolationData, const dca::linalg::util::GpuStream&); +extern template void buildG0Matrix(linalg::MatrixView, const int, const bool, + DeviceConfiguration, DeviceInterpolationData, const dca::linalg::util::GpuStream&); +extern template void buildG0Matrix(linalg::MatrixView, linalg::GPU>, const int, + const bool, DeviceConfiguration, + DeviceInterpolationData, std::complex>, const dca::linalg::util::GpuStream&); +extern template void buildG0Matrix(linalg::MatrixView, linalg::GPU>, const int, + const bool, DeviceConfiguration, + DeviceInterpolationData, std::complex>, const dca::linalg::util::GpuStream&); } // namespace details } // namespace ctint diff --git a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/kernels_interface.hpp b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/kernels_interface.hpp index 1984fe0c6..679941d6f 100644 --- a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/kernels_interface.hpp +++ b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/kernels_interface.hpp @@ -30,7 +30,7 @@ namespace details { template void computeG0(linalg::MatrixView& g0_mat, DeviceInterpolationData> g0, const Real* t_l, const int* b_l, - const int* r_lf, const Real* t_r, const int* b_r, const int* r_r, const dca::linalg::util::GpuStream stream); + const int* r_lf, const Real* t_r, const int* b_r, const int* r_r, const dca::linalg::util::GpuStream& stream); diff --git a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/sp/sp_accumulator_gpu.hpp b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/sp/sp_accumulator_gpu.hpp index e75b4fba6..ef891428d 100644 --- a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/sp/sp_accumulator_gpu.hpp +++ b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/sp/sp_accumulator_gpu.hpp @@ -96,7 +96,7 @@ class SpAccumulator } auto get_streams() { - return std::array{&streams_[0], &streams_[1]}; + return std::array{&streams_[0], &streams_[0]}; } // Returns the allocated device memory in bytes. @@ -122,7 +122,7 @@ class SpAccumulator */ void finalizeFunction(std::array& ft_objs, MFunction& function, bool m_sqr); - std::array streams_; + std::array streams_; /** gpu M_r_t */ std::array cached_nfft_obj_; /** \todo Don't always pay the memory cost even when not collect single measurement G's */ @@ -137,9 +137,9 @@ SpAccumulator::SpAccumulator(const Parameters& paramete : BaseClass(parameters_ref, accumulate_m_sqr), streams_(), cached_nfft_obj_{NfftType(parameters_.get_beta(), streams_[0], accumulate_m_sqr), - NfftType(parameters_.get_beta(), streams_[1], accumulate_m_sqr)}, + NfftType(parameters_.get_beta(), streams_[0], accumulate_m_sqr)}, single_measurement_M_r_t_device_{NfftType(parameters_.get_beta(), streams_[0], false), - NfftType(parameters_.get_beta(), streams_[1], false)} { + NfftType(parameters_.get_beta(), streams_[0], false)} { single_measurement_M_r_w_.reset(new MFunction("M_r_w")); } @@ -189,7 +189,7 @@ void SpAccumulator::accumulate( const std::array& configs, const Scalar factor) { std::array, 2> M_dev; for (int s = 0; s < 2; ++s) - M_dev[s].setAsync(Ms[s], streams_[s]); + M_dev[s].setAsync(Ms[s], streams_[0]); accumulate(M_dev, configs, factor); } diff --git a/include/dca/phys/dca_step/cluster_solver/stdthread_qmci/stdthread_qmci_cluster_solver.hpp b/include/dca/phys/dca_step/cluster_solver/stdthread_qmci/stdthread_qmci_cluster_solver.hpp index 7053def2f..825abdcba 100644 --- a/include/dca/phys/dca_step/cluster_solver/stdthread_qmci/stdthread_qmci_cluster_solver.hpp +++ b/include/dca/phys/dca_step/cluster_solver/stdthread_qmci/stdthread_qmci_cluster_solver.hpp @@ -310,9 +310,10 @@ double StdThreadQmciClusterSolver::finalize(dca_info_struct_t& dca_i // CTINT calculates its error here maybe double L2_Sigma_difference = QmciSolver::finalize(dca_info_struct); - if (dca_iteration_ == parameters_.get_dca_iterations() - 1) + if (dca_iteration_ == parameters_.get_dca_iterations() - 1) { + std::cout << "Writing Configurations.\n"; writeConfigurations(); - + } // autocorrelation_data_.sumConcurrency(concurrency_); if (BaseClass::writer_ && *BaseClass::writer_ && concurrency_.id() == concurrency_.first()) { diff --git a/include/dca/phys/models/general_interaction.hpp b/include/dca/phys/models/general_interaction.hpp index 50fa15a50..141e8e368 100644 --- a/include/dca/phys/models/general_interaction.hpp +++ b/include/dca/phys/models/general_interaction.hpp @@ -72,7 +72,7 @@ void general_interaction::set_vertex( func::dmn_variadic, RDmn>>& H_interaction) { // Create the vector of correlated spin-orbitals once. - static const std::vector correlated_orbitals = + const std::vector correlated_orbitals = general_interaction::make_correlated_orbitals(parameters, H_interaction); // Get a random pair of correlated spin-orbitals. diff --git a/include/dca/phys/parameters/model_parameters_bilayer_hubbard.inc b/include/dca/phys/parameters/model_parameters_bilayer_hubbard.inc index 18ad1e557..0c0a2eeba 100644 --- a/include/dca/phys/parameters/model_parameters_bilayer_hubbard.inc +++ b/include/dca/phys/parameters/model_parameters_bilayer_hubbard.inc @@ -156,5 +156,22 @@ void ModelParameters::compute_Gamma( Scalar N_ij = N(configuration_e_spin_index_i, configuration_e_spin_index_j); - Gamma(i, j) = (N_ij * exp_V[j] - delta) / (exp_V[j] - Real(1.)); + Gamma(i, j) = consistentScalarDiv((N_ij * exp_V[j] - delta), (exp_V[j] - Real(1.))); } else { Gamma(i, j) = @@ -100,10 +100,10 @@ void CT_AUX_WALKER_TOOLS::compute_Gamma( // return quot; // }; - Scalar gamma_k = exp_delta_V[j]; - auto y = gamma_k - dca::util::RealAlias(1.0); - Scalar inter_gamma = consistentScalarDiv(gamma_k, y); //(gamma_k) / (gamma_k - Real(1.)); - Gamma(i, j) -= inter_gamma; //(gamma_k) / (gamma_k - Real(1.)); + Scalar gamma_k = exp_delta_V[j]; + auto y = gamma_k - dca::util::RealAlias(1.0); + Scalar inter_gamma = consistentScalarDiv(gamma_k, y); //(gamma_k) / (gamma_k - Real(1.)); + Gamma(i, j) -= inter_gamma; //(gamma_k) / (gamma_k - Real(1.)); // } // else { // Scalar gamma_k = exp_delta_V[j]; @@ -149,24 +149,35 @@ bool CT_AUX_WALKER_TOOLS::test_max_min( min = Gamma_val < min ? Gamma_val : min; } - if (std::abs(max_ref - max) < 1.e-12 and std::fabs(min_ref - min) < 1.e-12) + // It's hard to get the min test right and it's not actually a failure state + // and std::fabs(min_ref - min) < 1.e-12) + if (std::abs(max_ref - max) < 1.e-12) return true; else { - std::cout << __FUNCTION__ << " for Gamma_LU has failed!\n"; - std::cout << "Has failed!\n"; + std::cout << __FUNCTION__ << " for Gamma_LU has failed.!\n"; std::cout.precision(16); std::cout << "\n\t n : " << n << "\n"; + for (int i = 1; i < n + 1; i++) { + Gamma_val = std::abs(Gamma_LU(i, i)); + std::cout << Gamma_val << ", "; + } + std::cout << '\n'; std::cout << std::scientific; std::cout << "max" - << "\t" + << "\t\t" << "max_ref" - << "\t" + << "\t\t" << "std::fabs(max_ref - max)" << '\n'; - std::cout << max << "\t" << max_ref << "\t" << std::fabs(max_ref - max) << '\n'; - std::cout << min << "\t" << min_ref << "\t" << std::fabs(min_ref - min) << '\n'; + std::cout << max_ref << "\t" << max << "\t" << std::fabs(max_ref - max) << '\n'; + std::cout << "min" + << "\t\t" + << "min_ref" + << "\t\t" + << "std::fabs(min_ref - min)" << '\n'; + std::cout << min_ref << "\t" << min << "\t" << std::fabs(min_ref - min) << '\n'; std::cout << std::endl; - Gamma_LU.print(); + // Gamma_LU.print(); return false; } @@ -177,8 +188,8 @@ auto CT_AUX_WALKER_TOOLS::solve_Gamma( int n, dca::linalg::Matrix& Gamma_LU, Scalar exp_delta_V, Real& max, Real& min) -> Real { // solve_Gamma_slow(n, Gamma_LU); - solve_Gamma_fast(n, Gamma_LU); - // solve_Gamma_BLAS(n, Gamma_LU); + // solve_Gamma_fast(n, Gamma_LU); + solve_Gamma_BLAS(n, Gamma_LU); Scalar Gamma_LU_n_n = Gamma_LU(n, n); Real Gamma_val = std::abs(Gamma_LU_n_n); @@ -199,10 +210,14 @@ auto CT_AUX_WALKER_TOOLS::solve_Gamma( min = Gamma_val; } + // Here this is fine since we don't reset Gamma_LU to identity as in the blocked solve #ifndef NDEBUG - if (!test_max_min(n, Gamma_LU, max, min)) - throw std::runtime_error("solve_Gamma_blocked test_max_min on Gamma_LU failed!"); + if (!test_max_min(n, Gamma_LU, max, min)) { + std::cerr << "solve_Gamma test_max_min on Gamma_LU failed!\n"; + throw std::runtime_error("solve_Gamma test_max_min on Gamma_LU failed!"); + } #endif + Scalar phani_gamma = exp_delta_V - Real(1.); Scalar determinant_ratio = -phani_gamma * Gamma_LU_n_n; @@ -268,6 +283,47 @@ void CT_AUX_WALKER_TOOLS::solve_Gamma_slow( } } +template +void CT_AUX_WALKER_TOOLS::solve_Gamma_slow(int n, Scalar* Gamma_LU, int LD) { + { + Scalar* y = &Gamma_LU[0 + n * LD]; // Gamma_LU.ptr(0, n); + Scalar* x = &Gamma_LU[n]; // Gamma_LU.ptr(n, 0); + + { + if (false) { // serial + for (int i = 0; i < n; i++) + for (int j = 0; j < i; j++) + y[i] -= Gamma_LU[i + j * LD] * y[j]; + } + else { // parallell + for (int j = 0; j < n; j++) + for (int i = j + 1; i < n; i++) + y[i] -= Gamma_LU[i + j * LD] * y[j]; + } + } + + { + if (true) { // serial + for (int j = 0; j < n; j++) { + for (int i = 0; i < j; i++) + x[j * LD] -= x[i * LD] * Gamma_LU[i + j * LD]; + x[j * LD] /= Gamma_LU[j + j * LD]; + } + } + else { // parallell + for (int i = 0; i < n; i++) { + x[i * LD] /= Gamma_LU[i + i * LD]; + for (int j = i + 1; j < n; j++) + x[j * LD] -= x[i * LD] * Gamma_LU[i + j * LD]; + } + } + } + + for (int i = 0; i < n; i++) + Gamma_LU[n + n * LD] -= x[i * LD] * y[i]; + } +} + /*! / | \ / | \ / | \ * | | | | | | | | | * \Gamma_{n+1} := | \Gamma_n | s | ---\ | L_n | 0 | | U_n | y | @@ -383,10 +439,19 @@ void CT_AUX_WALKER_TOOLS::solve_Gamma_BLAS( } } +template +void CT_AUX_WALKER_TOOLS::solve_Gamma_BLAS( + int n, Scalar* Gamma_LU /*, Scalar exp_delta_V*/, int lda) { + { + dca::linalg::blas::trsv("L", "N", "U", n, Gamma_LU, lda, Gamma_LU, 1); + dca::linalg::blas::trsv("U", "T", "N", n, Gamma_LU, lda, Gamma_LU, lda); + } +} + template auto CT_AUX_WALKER_TOOLS::solve_Gamma_blocked( - int n, dca::linalg::Matrix& Gamma_LU, Scalar exp_delta_V, Real& max, - Real& min) -> Scalar { + int const n, dca::linalg::Matrix& Gamma_LU, Scalar exp_delta_V, + Real& max, Real& min) -> Scalar { // std::cout << "\t(" << min << ", " << max << " ) "; solve_Gamma_blocked(n, Gamma_LU); @@ -407,13 +472,20 @@ auto CT_AUX_WALKER_TOOLS::solve_Gamma_blocked( // matrix. // Since the current diagonal element should not be considered for max/min, we need to already // update the Gamma matrix (which will set it to 1). - CT_AUX_WALKER_TOOLS::set_to_identity(Gamma_LU, n); - + // CT_AUX_WALKER_TOOLS::set_to_identity(Gamma_LU, n); return Scalar(1.e-16); } else { max = new_max; min = new_min; + +#ifndef NDEBUG + // // This has to be here since it will fail almost always when we set Gamma_LU to identity. + // if (!test_max_min(n, Gamma_LU, max, min)) { + // std::cerr << "solve_Gamma_blocked test_max_min on Gamma_LU failed!\n"; + // //throw std::runtime_error("solve_Gamma_blocked test_max_min on Gamma_LU failed!"); + // } +#endif } } else { @@ -421,13 +493,7 @@ auto CT_AUX_WALKER_TOOLS::solve_Gamma_blocked( min = Gamma_val; } - // std::cout << min << ", " << max << ")\t"; -#ifndef NDEBUG - if (!test_max_min(n, Gamma_LU, max, min)) - throw std::runtime_error("solve_Gamma_blocked test_max_min on Gamma_LU failed!"); -#endif - - auto phani_gamma = exp_delta_V - Real(1.); + auto phani_gamma = exp_delta_V - Scalar(1.); auto determinant_ratio = -phani_gamma * Gamma_LU_n_n; return determinant_ratio; @@ -503,6 +569,7 @@ void CT_AUX_WALKER_TOOLS::solve_Gamma_blocked( } solve_Gamma_fast(l, A_00, LD); + // solve_Gamma_slow(l, A_00, LD); { Scalar xy = 0; @@ -610,7 +677,7 @@ auto CT_AUX_WALKER_TOOLS::apply_bennett_on_Gamma( Scalar ratio = 1.; for (int i = 0; i < n; ++i) - ratio *= consistentScalarDiv(Gamma_LU(i,i), d_ptr[i]); + ratio *= consistentScalarDiv(Gamma_LU(i, i), d_ptr[i]); { Real Gamma_val = std::abs(Gamma_LU(0, 0)); @@ -630,7 +697,7 @@ auto CT_AUX_WALKER_TOOLS::apply_bennett_on_Gamma( } const Scalar phani_gamma = exp_delta_V - Real(1.); - const Scalar det_ratio = consistentScalarDiv(-ratio,phani_gamma); + const Scalar det_ratio = consistentScalarDiv(-ratio, phani_gamma); if (std::abs(std::imag(det_ratio)) > std::numeric_limits::epsilon() * 1000) { throw(std::logic_error("The determinant is complex.")); diff --git a/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.cpp b/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.cpp index 2a2f90837..2b59fffa6 100644 --- a/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.cpp +++ b/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.cpp @@ -40,7 +40,7 @@ template void DMatrixBuilder::computeG0(Matrix& G0, const details::DeviceConfiguration& configuration, const int n_init, bool right_section, - GpuStream stream) const { + const GpuStream& stream) const { if (G0.nrRows() * G0.nrCols() == 0) return; details::buildG0Matrix(linalg::MatrixView(G0), n_init, right_section, diff --git a/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_kernels.cu b/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_kernels.cu index efcf3fab6..632639e10 100644 --- a/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_kernels.cu +++ b/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_kernels.cu @@ -66,7 +66,7 @@ __global__ void buildG0MatrixKernel(linalg::MatrixView G0, template void buildG0Matrix(linalg::MatrixView g0, const int n_init, const bool right_section, DeviceConfiguration config, - DeviceInterpolationData g0_interp, GpuStream stream) { + DeviceInterpolationData g0_interp, const GpuStream& stream) { // assert(CtintHelper::is_initialized()); const auto blocks = dca::util::getBlockSize(g0.nrRows(), g0.nrCols()); @@ -75,16 +75,16 @@ void buildG0Matrix(linalg::MatrixView g0, const int n_init, checkErrorsCudaDebug(); } - template void buildG0Matrix(linalg::MatrixView, const int, const bool, - DeviceConfiguration, DeviceInterpolationData, GpuStream); - template void buildG0Matrix(linalg::MatrixView, const int, const bool, - DeviceConfiguration, DeviceInterpolationData, GpuStream); - template void buildG0Matrix(linalg::MatrixView, linalg::GPU>, const int, +template void buildG0Matrix(linalg::MatrixView, const int, const bool, + DeviceConfiguration, DeviceInterpolationData, const GpuStream&); +template void buildG0Matrix(linalg::MatrixView, const int, const bool, + DeviceConfiguration, DeviceInterpolationData, const GpuStream&); +template void buildG0Matrix(linalg::MatrixView, linalg::GPU>, const int, const bool, DeviceConfiguration, - DeviceInterpolationData, std::complex>, GpuStream); - template void buildG0Matrix(linalg::MatrixView, linalg::GPU>, const int, + DeviceInterpolationData, std::complex>, const GpuStream&); +template void buildG0Matrix(linalg::MatrixView, linalg::GPU>, const int, const bool, DeviceConfiguration, - DeviceInterpolationData, std::complex>, GpuStream); + DeviceInterpolationData, std::complex>, const GpuStream&); } // namespace details } // namespace ctint diff --git a/src/phys/dca_step/cluster_solver/shared_tools/accumulation/time_correlator_kernels.cu b/src/phys/dca_step/cluster_solver/shared_tools/accumulation/time_correlator_kernels.cu index ef1e431f5..5c9cb93fc 100644 --- a/src/phys/dca_step/cluster_solver/shared_tools/accumulation/time_correlator_kernels.cu +++ b/src/phys/dca_step/cluster_solver/shared_tools/accumulation/time_correlator_kernels.cu @@ -45,7 +45,7 @@ template void computeG0(linalg::MatrixView& g0_mat, const DeviceInterpolationData g0, const Real* t_l, const int* b_l, const int* r_l, const Real* t_r, const int* b_r, const int* r_r, - const GpuStream stream) { + const GpuStream& stream) { assert(SolverHelper::initialized()); auto blocks = dca::util::get2DBlockSize(g0_mat.nrRows(), g0_mat.nrCols(), 32); @@ -56,19 +56,19 @@ void computeG0(linalg::MatrixView& g0_mat, template<> void computeG0(linalg::MatrixView&, const DeviceInterpolationData, const double*, const int*, const int*, const double*, - const int*, const int*, const GpuStream); + const int*, const int*, const GpuStream&); template<> void computeG0(linalg::MatrixView&, const DeviceInterpolationData, const float*, const int*, const int*, const float*, const int*, const int*, - const GpuStream); + const GpuStream&); template<> void computeG0, double, std::complex>( linalg::MatrixView, linalg::GPU>&, const DeviceInterpolationData, std::complex>, const double*, const int*, - const int*, const double*, const int*, const int*, GpuStream); + const int*, const double*, const int*, const int*, const GpuStream&); template<> void computeG0, float, std::complex>( linalg::MatrixView, linalg::GPU>&, const DeviceInterpolationData, std::complex>, const float*, const int*, const int*, - const float*, const int*, const int*, GpuStream); + const float*, const int*, const int*, const GpuStream&); } // namespace details diff --git a/src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_kernels.cu b/src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_kernels.cu index 39259bc02..412cff348 100644 --- a/src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_kernels.cu +++ b/src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_kernels.cu @@ -983,77 +983,76 @@ template double updateG4, FourPointType::PARTICLE_PARTICLE_ // complex g0 -template <> -double updateG4, FourPointType::PARTICLE_HOLE_TRANSVERSE, std::complex>( +template double updateG4, FourPointType::PARTICLE_HOLE_TRANSVERSE, std::complex>( std::complex* G4, const std::complex* G_up, const int ldgu, const std::complex* G_down, const int ldgd, const std::complex factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); -template <> +template double updateG4, FourPointType::PARTICLE_HOLE_MAGNETIC, std::complex>( std::complex* G4, const std::complex* G_up, const int ldgu, const std::complex* G_down, const int ldgd, const std::complex factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); -template <> +template double updateG4, FourPointType::PARTICLE_HOLE_CHARGE, std::complex>( std::complex* G4, const std::complex* G_up, const int ldgu, const std::complex* G_down, const int ldgd, const std::complex factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); -template <> +template double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, std::complex>(std::complex* G4, const std::complex* G_up, const int ldgu, const std::complex* G_down, const int ldgd, const std::complex factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); -template <> +template double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, std::complex>(std::complex* G4, const std::complex* G_up, const int ldgu, const std::complex* G_down, const int ldgd, const std::complex factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); -template <> +template double updateG4, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::complex>( std::complex* G4, const std::complex* G_up, const int ldgu, const std::complex* G_down, const int ldgd, const std::complex factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); -template <> +template double updateG4, FourPointType::PARTICLE_HOLE_TRANSVERSE, std::complex>( std::complex* G4, const std::complex* G_up, const int ldgu, const std::complex* G_down, const int ldgd, const std::complex factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); -template <> +template double updateG4, FourPointType::PARTICLE_HOLE_MAGNETIC, std::complex>( std::complex* G4, const std::complex* G_up, const int ldgu, const std::complex* G_down, const int ldgd, const std::complex factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); -template <> +template double updateG4, FourPointType::PARTICLE_HOLE_CHARGE, std::complex>( std::complex* G4, const std::complex* G_up, const int ldgu, const std::complex* G_down, const int ldgd, const std::complex factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); -template <> +template double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP, std::complex>(std::complex* G4, const std::complex* G_up, const int ldgu, const std::complex* G_down, const int ldgd, const std::complex factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); -template <> +template double updateG4, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN, std::complex>(std::complex* G4, const std::complex* G_up, const int ldgu, const std::complex* G_down, const int ldgd, const std::complex factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); -template <> +template double updateG4, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::complex>( std::complex* G4, const std::complex* G_up, const int ldgu, const std::complex* G_down, const int ldgd, const std::complex factor, @@ -1070,6 +1069,11 @@ template double updateG4NoSpin, FourPointType::PARTICLE_PAR const std::complex factor, bool atomic, cudaStream_t stream, std::size_t start, std::size_t end); +template double updateG4NoSpin, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::int8_t>( + std::complex* G4, const std::complex* G_up, const int ldgu, + const std::int8_t factor, bool atomic, + cudaStream_t stream, std::size_t start, std::size_t end); + // template<> double updateG4< FourPointType::PARTICLE_HOLE_TRANSVERSE>( diff --git a/test/integration/cluster_solver/ctint/ctint_square_lattice_test_gpu.cpp b/test/integration/cluster_solver/ctint/ctint_square_lattice_test_gpu.cpp index 0a0bb8ef4..79464fcf9 100644 --- a/test/integration/cluster_solver/ctint/ctint_square_lattice_test_gpu.cpp +++ b/test/integration/cluster_solver/ctint/ctint_square_lattice_test_gpu.cpp @@ -10,7 +10,8 @@ // Compare GPU and CPU runs with CT-INT. // Model: square lattice with single band and double occupancy repulsion U. -#include +//#include +#include "dca/platform/dca_gpu.h" #include #include #include "dca/testing/gtest_h_w_warning_blocking.h" @@ -73,9 +74,9 @@ TEST(SquareLatticeTest, GpuSolver) { dca::phys::solver::CtintClusterSolver qmc_solver_gpu( parameters, data_gpu, nullptr); qmc_solver_gpu.initialize(0); - cudaProfilerStart(); + //cudaProfilerStart(); qmc_solver_gpu.integrate(); - cudaProfilerStop(); + //cudaProfilerStop(); qmc_solver_gpu.finalize(); EXPECT_NEAR(1.0, qmc_solver_gpu.computeDensity(), 1e-3); diff --git a/test/integration/cluster_solver/stdthread_qmci/gpu/stdthread_ctint_gpu_tp_test.cpp b/test/integration/cluster_solver/stdthread_qmci/gpu/stdthread_ctint_gpu_tp_test.cpp index 42bf103c2..004a48c49 100644 --- a/test/integration/cluster_solver/stdthread_qmci/gpu/stdthread_ctint_gpu_tp_test.cpp +++ b/test/integration/cluster_solver/stdthread_qmci/gpu/stdthread_ctint_gpu_tp_test.cpp @@ -13,7 +13,7 @@ #include #include "dca/testing/gtest_h_w_warning_blocking.h" - +#include "dca/platform/dca_gpu.h" #include "dca/platform/dca_gpu_complex.h" #include "dca/function/function.hpp" #include "dca/function/util/difference.hpp" diff --git a/test/unit/linalg/CMakeLists.txt b/test/unit/linalg/CMakeLists.txt index 4d13dae2d..daf550d02 100644 --- a/test/unit/linalg/CMakeLists.txt +++ b/test/unit/linalg/CMakeLists.txt @@ -49,7 +49,7 @@ dca_add_gtest(matrixop_complex_gpu_test CUDA LIBS ${DCA_LIBS} lapack lapack_kernels blas_kernels gpu_utils ${MAGMA_LIBARY} lapack_kernels) -dca_add_gtest(reshapable_matrix_cpu_test GTEST_MAIN LIBS ${UTIL_LIBS} ${DCA_LIBS} magma::magma) +dca_add_gtest(reshapable_matrix_cpu_test GTEST_MAIN LIBS ${UTIL_LIBS} ${DCA_LIBS}) dca_add_gtest(reshapable_matrix_gpu_test CUDA GTEST_MAIN LIBS ${DCA_LIBS} gpu_utils magma::magma) diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp index 106fa4a03..7f8478f60 100644 --- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp +++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp @@ -1,5 +1,5 @@ -// Copyright (C) 2018 ETH Zurich -// Copyright (C) 2018 UT-Battelle, LLC +// Copyright (C) 2024 ETH Zurich +// Copyright (C) 2024 UT-Battelle, LLC // All rights reserved. // // See LICENSE.txt for terms of usage. @@ -7,75 +7,98 @@ // // Author: Jérémie Bouquet (bouquetj@gmail.com). // Giovanni Balduzzi (gbalduzz@itp.phys.ethz.ch). +// Peter W. Doak (doakpw@ornl.gov) // // This class tests the GPU walker used by the ctint cluster solver by comparing it with the CPU // version. #include "dca/platform/dca_gpu.h" +using Scalar = double; #include "test/mock_mcconfig.hpp" namespace dca { namespace config { -using McOptions = MockMcOptions; +using McOptions = MockMcOptions; } // namespace config } // namespace dca +#include "test/unit/phys/dca_step/cluster_solver/test_setup.hpp" #include "dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_gpu_submatrix.hpp" - +#include "dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu_submatrix.hpp" #include "dca/testing/gtest_h_w_warning_blocking.h" +#include "walker_wrapper.hpp" +#include "walker_wrapper_submatrix.hpp" #include "dca/linalg/matrixop.hpp" -#include "dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu_submatrix.hpp" #include "dca/phys/dca_step/cluster_solver/ctint/details/solver_methods.hpp" -#include "test/unit/phys/dca_step/cluster_solver/test_setup.hpp" -#include "walker_wrapper_submatrix.hpp" + +constexpr char input_name[] = + DCA_SOURCE_DIR "/test/unit/phys/dca_step/cluster_solver/ctint/walker/submatrix_input.json"; using dca::linalg::CPU; using dca::linalg::GPU; -constexpr char input_name[] = - DCA_SOURCE_DIR "/test/unit/phys/dca_step/cluster_solver/ctint/walker/submatrix_input.json"; +template +struct CtINTWalkerSubmatrixGPUTestT : public ::testing::Test { + using G0Setup = dca::testing::G0SetupBare; + virtual void SetUp() { + host_setup.SetUp(); + gpu_setup.SetUp(); + } -template -using CtintWalkerSubmatrixGpuTest = - typename dca::testing::G0Setup; + virtual void TearDown() {} + G0Setup host_setup; + G0Setup gpu_setup; +}; using namespace dca::phys::solver; -using FloatingPointTypes = ::testing::Types; -TYPED_TEST_CASE(CtintWalkerSubmatrixGpuTest, FloatingPointTypes); +template +using CtintWalkerSubmatrixGpuTest = CtINTWalkerSubmatrixGPUTestT; + +// Currently testing float isn't really possible due to the way the Scalar type is +// carried through from mc_options. See test_setup.hpp PD +using ScalarTypes = ::testing::Types; //double, +TYPED_TEST_CASE(CtintWalkerSubmatrixGpuTest, ScalarTypes); // Compare the submatrix update with a direct computation of the M matrix, and compare the // acceptance probability to // the CTINT walker with no submatrix update. TYPED_TEST(CtintWalkerSubmatrixGpuTest, doSteps) { - using Real = TypeParam; - using Parameters = typename TestFixture::Parameters; - + using Scalar = TypeParam; + using Parameters = typename CtintWalkerSubmatrixGpuTest::G0Setup::Parameters; + using Walker = testing::phys::solver::ctint::WalkerWrapper; + using Matrix = typename Walker::Matrix; + using MatrixPair = std::array; using SbmWalkerCpu = - testing::phys::solver::ctint::WalkerWrapperSubmatrix; + testing::phys::solver::ctint::WalkerWrapperSubmatrix; using SbmWalkerGpu = - testing::phys::solver::ctint::WalkerWrapperSubmatrix; + testing::phys::solver::ctint::WalkerWrapperSubmatrix; std::vector setup_rngs{0., 0.00, 0.9, 0.5, 0.01, 0, 0.75, 0.02, 0, 0.6, 0.03, 1, 0.99, 0.04, 0.99}; - typename TestFixture::RngType rng(setup_rngs); + typename CtintWalkerSubmatrixGpuTest::G0Setup::RngType cpu_rng(setup_rngs); + typename CtintWalkerSubmatrixGpuTest::G0Setup::RngType gpu_rng(setup_rngs); - auto& data = *TestFixture::data_; - auto& parameters = TestFixture::parameters_; + auto& cpu_data = *this->host_setup.data_; + auto& gpu_data = *this->gpu_setup.data_; + auto& cpu_parameters = this->host_setup.parameters_; + auto& gpu_parameters = this->gpu_setup.parameters_; - const auto g0_func = dca::phys::solver::ctint::details::shrinkG0(data.G0_r_t); - G0Interpolation g0_cpu(g0_func); - G0Interpolation g0_gpu(g0_func); - typename TestFixture::LabelDomain label_dmn; + const auto g0_func_cpu = dca::phys::solver::ctint::details::shrinkG0(cpu_data.G0_r_t); + G0Interpolation g0_cpu(g0_func_cpu); + const auto g0_func_gpu = dca::phys::solver::ctint::details::shrinkG0(gpu_data.G0_r_t); + G0Interpolation g0_gpu(g0_func_gpu); + typename CtintWalkerSubmatrixGpuTest::G0Setup::LabelDomain label_dmn; // TODO: improve API. - SbmWalkerCpu::setDMatrixBuilder(g0_cpu); - SbmWalkerCpu::setDMatrixAlpha(parameters.getAlphas(), false); - SbmWalkerGpu::setDMatrixBuilder(g0_gpu); - SbmWalkerGpu::setDMatrixAlpha(parameters.getAlphas(), false); + Walker::setDMatrixBuilder(g0_cpu); + Walker::setDMatrixAlpha(cpu_parameters.getAlphas(), false); + Walker::setInteractionVertices(cpu_data, cpu_parameters); - SbmWalkerCpu::setInteractionVertices(data, parameters); - SbmWalkerGpu::setInteractionVertices(data, parameters); + SbmWalkerGpu::setDMatrixBuilder(g0_gpu); + SbmWalkerGpu::setDMatrixAlpha(gpu_parameters.getAlphas(), false); + SbmWalkerGpu::setInteractionVertices(gpu_data, gpu_parameters); // ************************************ // Test vertex insertion / removal **** @@ -99,20 +122,20 @@ TYPED_TEST(CtintWalkerSubmatrixGpuTest, doSteps) { }; for (const int initial_size : std::array{0, 5}) { - parameters.setInitialConfigurationSize(initial_size); - + cpu_parameters.setInitialConfigurationSize(initial_size); + gpu_parameters.setInitialConfigurationSize(initial_size); for (int steps = 1; steps <= 8; ++steps) { - rng.setNewValues(setup_rngs); - SbmWalkerCpu walker_cpu(parameters, rng); - rng.setNewValues(setup_rngs); - SbmWalkerGpu walker_gpu(parameters, rng); + cpu_rng.setNewValues(setup_rngs); + SbmWalkerCpu walker_cpu(cpu_parameters, cpu_rng); + gpu_rng.setNewValues(setup_rngs); + SbmWalkerGpu walker_gpu(gpu_parameters, gpu_rng); - rng.setNewValues(rng_vals); + cpu_rng.setNewValues(rng_vals); walker_cpu.doStep(steps); - rng.setNewValues(rng_vals); + gpu_rng.setNewValues(rng_vals); walker_gpu.doStep(steps); - constexpr Real tolerance = std::numeric_limits::epsilon() * 100; + constexpr Scalar tolerance = std::numeric_limits::epsilon() * 100; auto M_cpu = walker_cpu.getM(); auto M_gpu = walker_gpu.getM(); diff --git a/test/unit/phys/parameters/four_point_parameters/CMakeLists.txt b/test/unit/phys/parameters/four_point_parameters/CMakeLists.txt index d29861b2c..8b5395b3d 100644 --- a/test/unit/phys/parameters/four_point_parameters/CMakeLists.txt +++ b/test/unit/phys/parameters/four_point_parameters/CMakeLists.txt @@ -3,4 +3,4 @@ # why this test pulls in a bunch of magma references is a mystery dca_add_gtest(four_point_parameters_test GTEST_MAIN - LIBS json enumerations magma::magma ${DCA_GPU_LIBS}) + LIBS json enumerations ${DCA_GPU_LIBS})