diff --git a/build-aux/frontier_rocm6_build.sh b/build-aux/frontier_rocm6_build.sh
index cf68705ed..8a8e1e698 100644
--- a/build-aux/frontier_rocm6_build.sh
+++ b/build-aux/frontier_rocm6_build.sh
@@ -15,26 +15,9 @@ cmake -DDCA_WITH_CUDA=off -DDCA_WITH_HIP=ON \
       -DCMAKE_HIP_COMPILER=/opt/rocm-6.0.0/llvm/bin/clang++ \
       -DCMAKE_INSTALL_PREFIX=$INST \
       -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
+      -DCMAKE_HIP_LINK_FLAGS=--hip-link \
       -GNinja \
       ..
 
-#cmake -DDCA_WITH_CUDA=off -DDCA_WITH_HIP=ON \
-      -DFFTW_ROOT=$FFTW_PATH \
-      -DDCA_FIX_BROKEN_MPICH=ON \
-      -DROCM_ROOT=${ROCM_PATH} \
-      -DMAGMA_ROOT=${MAGMA_ROOT} \
-      -DLAPACK_ROOT=${OPENBLAS_ROOT} \
-      -DBLAS_ROOT=${OPENBLAS_ROOT} \
-      -DDCA_WITH_TESTS_FAST=ON \
-      -DTEST_RUNNER="srun" \
-      -DGPU_TARGETS=gfx90a \
-      -DAMDGPU_TARGETS=gfx90a \
-      -DCMAKE_C_COMPILER=mpicc \
-      -DCMAKE_CXX_COMPILER=mpic++ \
-      -DCMAKE_HIP_COMPILER=/opt/rocm-6.0.0/llvm/bin/clang++ \
-      -DCMAKE_INSTALL_PREFIX=$INST \
-      -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
-      -GNinja \
-      ..
 # cmake -DDCA_WITH_CUDA=off -DDCA_WITH_HIP=ON -DFFTW_ROOT=$FFTW_PATH -DDCA_FIX_BROKEN_MPICH=ON -DROCM_ROOT=${ROCM_PATH} -DMAGMA_ROOT=${MAGMA_ROOT} -DLAPACK_ROOT=${OPENBLAS_ROOT} -DBLAS_ROOT=${OPENBLAS_ROOT} -DDCA_WITH_TESTS_FAST=ON -DTEST_RUNNER="srun" -DGPU_TARGETS=gfx90a -DAMDGPU_TARGETS=gfx90a -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpic++ -DCMAKE_HIP_COMPILER=/opt/rocm-6.0.0/llvm/bin/clang++ -DCMAKE_INSTALL_PREFIX=$INST -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" -GNinja ..
-      ..
+
diff --git a/build-aux/frontier_rocm6_load_modules.sh b/build-aux/frontier_rocm6_load_modules.sh
index 44951217c..10f63107c 100644
--- a/build-aux/frontier_rocm6_load_modules.sh
+++ b/build-aux/frontier_rocm6_load_modules.sh
@@ -9,12 +9,14 @@
 
 module reset
 module load amd-mixed/6.0.0
-spack load cmake%gcc@11.2.0
-spack load ninja%gcc@11.2.0
-spack load magma@master amdgpu_target=gfx90a
-spack load hdf5@1.12.1 +cxx ~mpi api=v112 %rocmcc@6.0.0
-spack load fftw ~mpi %rocmcc@6.0.0
-spack load openblas@0.3.25 %gcc@11.2.0
+module load ninja
+module load cmake
+#spack load cmake%gcc@11.2.0
+#spack load ninja%gcc@11.2.0
+#spack load magma@master amdgpu_target=gfx90a
+#spack load hdf5@1.12.1 +cxx ~mpi api=v112 %rocmcc@6.0.0
+#spack load fftw ~mpi %rocmcc@6.0.0
+#spack load openblas@0.3.25 %gcc@11.2.0
 
 export CC=mpicc
 export CXX=mpicxx
diff --git a/cmake/dca_config.cmake b/cmake/dca_config.cmake
index 07f10d773..bcf8ff05f 100644
--- a/cmake/dca_config.cmake
+++ b/cmake/dca_config.cmake
@@ -416,6 +416,12 @@ else()
   set(TWO_PARTICLE_ALLOCATOR "dca::linalg::util::DeviceAllocator<T>")
 endif()
 
+option(DCA_WITH_CTAUX_TRACING "special debug tracing of of delayed spin updates in ctaux" OFF)
+mark_as_advanced(DCA_WITH_CTAUX_TRACING)
+if(DCA_WITH_CTAUX_TRACING)
+  add_compile_definitions(CTAUX_DEBUG_TRACING)
+endif()
+
 configure_file("${PROJECT_SOURCE_DIR}/include/dca/config/mc_options.hpp.in"
         "${CMAKE_BINARY_DIR}/include/dca/config/mc_options.hpp" @ONLY)
 
diff --git a/cmake/dca_hip.cmake b/cmake/dca_hip.cmake
index 2d3e0ebb3..9938676f3 100644
--- a/cmake/dca_hip.cmake
+++ b/cmake/dca_hip.cmake
@@ -74,6 +74,10 @@ if (CMAKE_HIP_COMPILER)
   set(DCA_HIP_PROPERTIES "CMAKE_HIP_ARCHITECTURES gfx908,gfx90a")
   set(CMAKE_HIP_STANDARD 17)
   list(APPEND HIP_HIPCC_FLAGS "-fPIC")
+  list(APPEND HIP_HIPCC_FLAGS "-mno-unsafe-fp-atomics")
+  list(APPEND HIP_HIPCC_FLAGS "-fgpu-default-stream=per-thread")
+  list(APPEND HIP_HIPCC_FLAGS_DEBUG "--save-temps -g")
+
   # doesn't appear to work
   set(CMAKE_HIP_SOURCE_FILE_EXTENSIONS cu)
   message("Enabled HIP as a language")
diff --git a/include/dca/linalg/matrix.hpp b/include/dca/linalg/matrix.hpp
index e55181138..bd7d96ced 100644
--- a/include/dca/linalg/matrix.hpp
+++ b/include/dca/linalg/matrix.hpp
@@ -112,14 +112,22 @@ class Matrix : public ALLOC {
   // This method is available only if device_name == CPU.
   template <DeviceType dn = device_name, typename = std::enable_if_t<dn == CPU>>
   ScalarType& operator()(int i, int j) {
-    assert(i >= 0 && i < size_.first);
-    assert(j >= 0 && j < size_.second);
+#ifndef NDEBUG
+    if(!(i >= 0 && i <= size_.first))
+      throw std::runtime_error("assertion of i >= 0 && i <= size_.first failed!");
+    if(!(j >= 0 && j <= size_.second))
+      throw std::runtime_error("assertion of j >= 0 && j <= size_.second failed!");
+#endif
     return data_[i + j * leadingDimension()];
   }
   template <DeviceType dn = device_name, typename = std::enable_if_t<dn == CPU>>
   const ScalarType& operator()(int i, int j) const {
-    assert(i >= 0 && i < size_.first);
-    assert(j >= 0 && j < size_.second);
+#ifndef NDEBUG
+    if(!(i >= 0 && i <= size_.first))
+      throw std::runtime_error("assertion of i >= 0 && i <= size_.first failed!");
+    if(!(j >= 0 && j <= size_.second))
+      throw std::runtime_error("assertion of j >= 0 && j <= size_.second failed!");
+#endif
     return data_[i + j * leadingDimension()];
   }
 
@@ -142,13 +150,22 @@ class Matrix : public ALLOC {
   // a pointer past the end of the range if i == size().first or j == size().second.
   // Preconditions: 0 <= i <= size().first, 0 <= j <= size().second.
   ValueType* ptr(int i, int j) {
-    assert(i >= 0 && i <= size_.first);
-    assert(j >= 0 && j <= size_.second);
+    //These can be an annoyance debugging so making them "manual" asserts
+#ifndef NDEBUG
+    if(!(i >= 0 && i <= size_.first))
+      throw std::runtime_error("assertion of i >= 0 && i <= size_.first failed!");
+    if(!(j >= 0 && j <= size_.second))
+      throw std::runtime_error("assertion of j >= 0 && j <= size_.second failed!");
+#endif
     return data_ + i + j * leadingDimension();
   }
   const ValueType* ptr(int i, int j) const {
-    assert(i >= 0 && i <= size_.first);
-    assert(j >= 0 && j <= size_.second);
+#ifndef NDEBUG
+    if(!(i >= 0 && i <= size_.first))
+      throw std::runtime_error("assertion of i >= 0 && i <= size_.first failed!");
+    if(!(j >= 0 && j <= size_.second))
+      throw std::runtime_error("assertion of j >= 0 && j <= size_.second failed!");
+#endif
     return data_ + i + j * leadingDimension();
   }
 
diff --git a/include/dca/linalg/util/atomic_add_cuda.cu.hpp b/include/dca/linalg/util/atomic_add_cuda.cu.hpp
index da7b35dc3..a0afb4b61 100644
--- a/include/dca/linalg/util/atomic_add_cuda.cu.hpp
+++ b/include/dca/linalg/util/atomic_add_cuda.cu.hpp
@@ -42,6 +42,52 @@ __device__ void inline atomicAdd(double* address, const double val) {
   atomicAddImpl(address, val);
 }
 
+#elif defined(DCA_HAVE_HIP)
+// HIP seems to have some horrible problem with concurrent atomic operations.
+__device__ double inline atomicAddImpl(double* address, const double val) {
+  unsigned long long int* address_as_ull = (unsigned long long int*)address;
+  unsigned long long int old = *address_as_ull, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull, assumed,
+                    __double_as_longlong(val + __longlong_as_double(assumed)));
+    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) }
+  } while (assumed != old);
+  return __longlong_as_double(old);
+}
+
+__device__ double inline atomicAddImpl(float* address, const float val) {
+  unsigned long int* address_as_int = (unsigned long int*)address;
+  unsigned long int old = *address_as_int, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_int, assumed,
+                    __float_as_int(val + __int_as_float(assumed)));
+    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) }
+  } while (assumed != old);
+  return __int_as_float(old);
+}
+  
+__device__ void inline atomicAdd(float* address, const float val) {
+  atomicAddImpl(address, val);
+}
+
+__device__ void inline atomicAdd(double* address, const double val) {
+  atomicAddImpl(address, val);
+}
+
+__device__ void inline atomicAdd(cuDoubleComplex* address, cuDoubleComplex val) {
+  double* a_d = reinterpret_cast<double*>(address);
+  atomicAddImpl(a_d, val.x);
+  atomicAddImpl(a_d + 1, val.y);
+}
+
+  __device__ void inline atomicAdd(magmaFloatComplex* const address, magmaFloatComplex val) {
+  double* a_d = reinterpret_cast<double*>(address);
+  atomicAddImpl(a_d, val.x);
+  atomicAddImpl(a_d + 1, val.y);
+}
+
 #else
 __device__ void inline atomicAdd(double* address, double val) {
   ::atomicAdd(address, val);
@@ -62,7 +108,7 @@ __device__ void inline atomicAdd(cuDoubleComplex* address, cuDoubleComplex val)
   atomicAdd(a_d, val.x);
   atomicAdd(a_d + 1, val.y);
 }
-#endif  // __CUDA_ARCH__
+#endif  // atomic operation help
 
 }  // linalg
 }  // dca
diff --git a/include/dca/linalg/util/magma_queue.hpp b/include/dca/linalg/util/magma_queue.hpp
index 01ccb017a..e9b68597e 100644
--- a/include/dca/linalg/util/magma_queue.hpp
+++ b/include/dca/linalg/util/magma_queue.hpp
@@ -42,6 +42,7 @@ class MagmaQueue {
   MagmaQueue& operator=(const MagmaQueue& rhs) = delete;
 
   MagmaQueue(MagmaQueue&& rhs) noexcept : queue_(std::move(rhs.queue_)) {
+    std::swap(stream_, rhs.stream_);
     std::swap(cublas_handle_, rhs.cublas_handle_);
     std::swap(cusparse_handle_, rhs.cusparse_handle_);
     std::swap(queue_, rhs.queue_);
diff --git a/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_accumulator.hpp b/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_accumulator.hpp
index 61c26c833..343b3f37e 100644
--- a/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_accumulator.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_accumulator.hpp
@@ -406,8 +406,8 @@ void CtauxAccumulator<device_t, Parameters, Data, DIST>::updateFrom(walker_type&
   walker.get_error_distribution() = 0;
 #endif  // DCA_WITH_QMC_BIT
 
-  single_particle_accumulator_obj.syncStreams(*event);
-  two_particle_accumulator_.syncStreams(*event);
+  //single_particle_accumulator_obj.syncStreams(*event);
+  //two_particle_accumulator_.syncStreams(*event);
 }
 
 template <dca::linalg::DeviceType device_t, class Parameters, class Data, DistType DIST>
diff --git a/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_walker.hpp b/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_walker.hpp
index 3e681495b..1d60d0522 100644
--- a/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_walker.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_walker.hpp
@@ -13,6 +13,7 @@
 #ifndef DCA_PHYS_DCA_STEP_CLUSTER_SOLVER_CTAUX_CTAUX_WALKER_HPP
 #define DCA_PHYS_DCA_STEP_CLUSTER_SOLVER_CTAUX_CTAUX_WALKER_HPP
 
+#include <algorithm>
 #include <cassert>
 #include <cstdint>  // uint64_t
 #include <cstdlib>  // std::size_t
@@ -42,6 +43,7 @@
 #include "dca/phys/dca_step/cluster_solver/ctaux/walker/walker_bit.hpp"
 #include "dca/phys/dca_step/cluster_solver/shared_tools/util/accumulator.hpp"
 #include "dca/util/print_time.hpp"
+#include "dca/config/lattice_model.hpp"
 
 namespace dca {
 namespace phys {
@@ -66,7 +68,7 @@ class CtauxWalker : public WalkerBIT<Parameters, Data>, public CtauxWalkerData<d
 
 public:
   /** constructor
-   *  param[in] id    thread id?
+   *  param[in] id    thread id
    */
   CtauxWalker(Parameters& parameters_ref, Data& MOMS_ref, Rng& rng_ref, int id);
 
@@ -351,6 +353,34 @@ class CtauxWalker : public WalkerBIT<Parameters, Data>, public CtauxWalkerData<d
   unsigned long n_steps_ = 0;
   linalg::util::GpuEvent sync_streams_event_;
   bool config_initialized_;
+
+#ifndef NDEBUG
+  bool last_failed_min_max_ = false;
+  std::vector<std::string> trace_;
+#ifndef CTAUX_DEBUG_TRACING
+  template <typename... ARGS>
+  void pushOnTrace(const ARGS&... args) {}
+
+  static void dumpTrace(const std::vector<std::string> trace, const Parameters& parameters_ref,
+                        int thread_id, int depth_limit) {}
+#else
+  template <typename... ARGS>
+  void pushOnTrace(const ARGS&... args) {
+    std::ostringstream trace;
+    (trace << ... << args);
+    trace_.push_back(trace.str());
+  }
+
+  static void dumpTrace(const std::vector<std::string> trace, const Parameters& parameters_ref,
+                        int thread_id, int depth_limit) {
+    auto iter = std::make_reverse_iterator(trace.end());
+    auto end = std::make_reverse_iterator(trace.begin());
+    for (int depth = 0; iter > end, depth < depth_limit; ++iter, ++depth)
+      std::cerr << "rank:" << parameters_ref.get_concurrency().get_id() << "thread:" << thread_id
+                << " " << *iter << '\n';
+  }
+#endif
+#endif
 };
 
 template <dca::linalg::DeviceType device_t, class Parameters, class Data>
@@ -527,6 +557,9 @@ void CtauxWalker<device_t, Parameters, Data>::initialize(int iteration) {
 template <dca::linalg::DeviceType device_t, class Parameters, class Data>
 void CtauxWalker<device_t, Parameters, Data>::doSweep() {
   Profiler profiler("do_sweep", "CT-AUX walker", __LINE__, thread_id);
+#ifndef NDEBUG
+  trace_.clear();
+#endif
   const double sweeps_per_measurement{thermalized_ ? sweeps_per_measurement_ : 1.};
 
   // Do at least one single spin update per sweep.
@@ -603,10 +636,6 @@ std::enable_if_t<dev_t == device_t && device_t != dca::linalg::CPU, void> CtauxW
   read_Gamma_matrices(e_DN);
 
   actually_download_from_device<device_t>();
-  // Gamma_up_CPU.setAsync(Gamma_up, thread_id, stream_id);
-  // Gamma_dn_CPU.setAsync(Gamma_dn, thread_id, stream_id);
-
-  // linalg::util::syncStream(thread_id, stream_id);
 }
 
 // In case Gamma_up and Gamma_down reside in the CPU memory, avoid the copies using swap.
@@ -625,9 +654,6 @@ std::enable_if_t<dev_t == device_t && device_t == dca::linalg::CPU, void> CtauxW
   read_Gamma_matrices(e_DN);
 
   actually_download_from_device<device_t>();
-
-  // Gamma_up_CPU.swap(Gamma_up);
-  // Gamma_dn_CPU.swap(Gamma_dn);
 }
 
 // In case Gamma_up and Gamma_down do not reside in the CPU memory, copy them.
@@ -824,6 +850,13 @@ int CtauxWalker<device_t, Parameters, Data>::generateDelayedSpinsAbortAtBennett(
     }
   }
 
+#ifndef NDEBUG
+  if (delayed_spins.size() > max_num_delayed_spins) {
+    std::cout << "Delayed spins = " << delayed_spins.size() << "  max_num_delayed_spins = " << max_num_delayed_spins << '\n';
+    if (delayed_spins.size() >= 256)
+      throw std::runtime_error("delayed spins hit 256 or greater bailing out!");
+  }
+#endif
   // We need to unmark all "virtual" interacting spins, that we have temporarily marked as
   // annihilatable in CT_AUX_HS_configuration::get_random_noninteracting_vertex().
   // TODO: Eliminate the need to mark and unmark these spins.
@@ -834,6 +867,12 @@ int CtauxWalker<device_t, Parameters, Data>::generateDelayedSpinsAbortAtBennett(
   assert(single_spin_updates_proposed ==
          currently_proposed_creations_ + currently_proposed_annihilations_ + num_statics);
 
+#ifndef NDEBUG
+  if (single_spin_updates_proposed >= max_num_delayed_spins) {
+    std::cout << "single_spin_updates_proposed = " << single_spin_updates_proposed << "  max_num_delayed_spins = " << max_num_delayed_spins << '\n';
+  }
+#endif
+
   return single_spin_updates_proposed;
 }
 
@@ -1010,13 +1049,23 @@ void CtauxWalker<device_t, Parameters, Data>::read_Gamma_matrices(e_spin_states
   // Profiler profiler(concurrency_, __FUNCTION__, "CT-AUX walker", __LINE__, thread_id);
   switch (e_spin) {
     case e_DN:
+      linalg::util::syncStream(thread_id, stream_id);
+#ifndef NDEBUG
+      if(Gamma_dn.capacity().first < vertex_indixes.size())
+	std::cout << "gamma_dn.capacity().first == " << Gamma_dn.capacity().first << "  vertex_indixes.size() == " << vertex_indixes.size() << '\n';
+#endif
       CT_AUX_WALKER_TOOLS<device_t, Scalar>::compute_Gamma(
           Gamma_dn, N_dn, G_dn, vertex_indixes, exp_V, exp_delta_V, thread_id, stream_id);
+      // assume we've no guarantee this will be allowed to finish before the async copy starts
+      linalg::util::syncStream(thread_id, stream_id);
       break;
 
     case e_UP:
+      linalg::util::syncStream(thread_id, stream_id);
       CT_AUX_WALKER_TOOLS<device_t, Scalar>::compute_Gamma(
           Gamma_up, N_up, G_up, vertex_indixes, exp_V, exp_delta_V, thread_id, stream_id);
+      // assume we've no guarantee this will be allowed to finish before the async copy starts
+      linalg::util::syncStream(thread_id, stream_id);
       break;
 
     default:
@@ -1060,10 +1109,24 @@ void CtauxWalker<device_t, Parameters, Data>::compute_Gamma_matrices() {
     else {
       add_delayed_spin(delayed_index, Gamma_up_size, Gamma_dn_size);
     }
+    // upload_to_device();
   }
+#ifndef NDEBUG
+  if (last_failed_min_max_)
+    std::cout << "out of delayed spins loop but last vertex failed min_max test!\n";
+#endif
 
   add_delayed_spins_to_the_configuration();
 
+#ifndef NDEBUG
+  std::ostringstream gamma_diag;
+  for (int i = 0; i < Gamma_dn_CPU.size().first; i++) {
+    auto Gamma_val = std::abs(Gamma_dn_CPU(i, i));
+    gamma_diag << Gamma_val << ", ";
+  }
+  pushOnTrace("diag of Gamma_dn:", gamma_diag.str());
+#endif
+
   remove_non_accepted_and_bennett_spins_from_Gamma(Gamma_up_size, Gamma_dn_size);
 
   // #ifdef DCA_WITH_QMC_BIT
@@ -1087,22 +1150,34 @@ void CtauxWalker<device_t, Parameters, Data>::neutralize_delayed_spin(int& delay
     Gamma_up_size += 1;
     CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::set_to_identity(
         Gamma_up_CPU, delayed_spins[delayed_index].Gamma_index_HS_field_DN);
+#ifndef NDEBUG
+    pushOnTrace("delayed_spins[", std::to_string(delayed_index), "].e_spin_HS_field_DN == e_UP");
+#endif
   }
   else {
     Gamma_dn_size += 1;
     CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::set_to_identity(
         Gamma_dn_CPU, delayed_spins[delayed_index].Gamma_index_HS_field_DN);
+#ifndef NDEBUG
+    pushOnTrace("delayed_spins[", std::to_string(delayed_index), "].e_spin_HS_field_DN == e_DN");
+#endif
   }
 
   if (delayed_spins[delayed_index].e_spin_HS_field_UP == e_UP) {
     Gamma_up_size += 1;
     CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::set_to_identity(
         Gamma_up_CPU, delayed_spins[delayed_index].Gamma_index_HS_field_UP);
+#ifndef NDEBUG
+    pushOnTrace("delayed_spins[", std::to_string(delayed_index), "].e_spin_HS_field_UP == e_UP");
+#endif
   }
   else {
     Gamma_dn_size += 1;
     CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::set_to_identity(
         Gamma_dn_CPU, delayed_spins[delayed_index].Gamma_index_HS_field_UP);
+#ifndef NDEBUG
+    pushOnTrace("delayed_spins[", std::to_string(delayed_index), "].e_spin_HS_field_UP == e_DN");
+#endif
   }
 
   //     delayed_index += 1;
@@ -1144,19 +1219,31 @@ void CtauxWalker<device_t, Parameters, Data>::remove_non_accepted_and_bennett_sp
       if (e_spin_HS_field_UP == e_UP) {
         Gamma_up_size -= 1;
         dca::linalg::matrixop::removeRowAndCol(Gamma_up_CPU, Gamma_index_HS_field_UP);
+#ifndef NDEBUG
+        pushOnTrace("gu-fu-eu:", std::to_string(Gamma_index_HS_field_UP));
+#endif
       }
       else {
         Gamma_dn_size -= 1;
         dca::linalg::matrixop::removeRowAndCol(Gamma_dn_CPU, Gamma_index_HS_field_UP);
+#ifndef NDEBUG
+        pushOnTrace("gd-fu-ed:", std::to_string(Gamma_index_HS_field_UP));
+#endif
       }
 
       if (e_spin_HS_field_DN == e_UP) {
         Gamma_up_size -= 1;
         dca::linalg::matrixop::removeRowAndCol(Gamma_up_CPU, Gamma_index_HS_field_DN);
+#ifndef NDEBUG
+        pushOnTrace("gu-fd-eu:", std::to_string(Gamma_index_HS_field_DN));
+#endif
       }
       else {
         Gamma_dn_size -= 1;
         dca::linalg::matrixop::removeRowAndCol(Gamma_dn_CPU, Gamma_index_HS_field_DN);
+#ifndef NDEBUG
+        pushOnTrace("gd-fd-ed:", std::to_string(Gamma_index_HS_field_DN));
+#endif
       }
     }
   }
@@ -1178,6 +1265,26 @@ void CtauxWalker<device_t, Parameters, Data>::add_delayed_spin(int& delayed_inde
                                   delayed_spins[delayed_index].new_HS_spin_value,
                                   delayed_spins[delayed_index].exp_delta_V_HS_field_UP));
 
+#ifndef NDEBUG
+  // The espin_up corresponds the Gamma matrix and diags used.
+  auto isDebugTestMinMax = [this](bool HS_field_up, bool espin_up, auto& ratio_HS_field,  auto& Gamma_index_HS_field, auto& Gamma_CPU, auto& Gamma_diag_max, auto& Gamma_diag_min, int trace_depth) {
+    if (std::abs(ratio_HS_field) <= std::abs(Scalar(1.e-16)))
+        this->pushOnTrace("gamma matrix was ill conditioned!");
+      if (Gamma_index_HS_field > 0 && std::abs(ratio_HS_field) > 1.e-16 &&
+          !ctaux_tools.test_max_min(Gamma_index_HS_field, Gamma_CPU, Gamma_diag_max,
+                                    Gamma_diag_min)) {
+        std::cerr
+            << "ratiod: " << ratio_HS_field
+            << "After e_spin_HS_field_ " << (HS_field_up ? "UP" : "DN")
+	    << " == e_" << (espin_up ? "UP" : "DN") << ", solve_Gamma test_max_min on Gamma_LU failed!\n";
+        this->dumpTrace(trace_, parameters_, thread_id, trace_depth);
+        this->last_failed_min_max_ = true;
+      }
+      else
+        this->last_failed_min_max_ = false;
+  };
+#endif
+
   if (delayed_spins[delayed_index].HS_current_move == CREATION)
     number_of_creations += 1;
 
@@ -1193,34 +1300,58 @@ void CtauxWalker<device_t, Parameters, Data>::add_delayed_spin(int& delayed_inde
   Scalar ratio_HS_field_DN = 0.;
   Scalar ratio_HS_field_UP = 0.;
 
+  // So what are these really for, in the first block below I use them for the debug max_min check
   Real tmp_up_diag_max = Gamma_up_diag_max;
   Real tmp_up_diag_min = Gamma_up_diag_min;
   Real tmp_dn_diag_max = Gamma_dn_diag_max;
   Real tmp_dn_diag_min = Gamma_dn_diag_min;
 
+#ifndef NDEBUG
+  int trace_depth = 64;
+#endif
+
   {
     if (delayed_spins[delayed_index].e_spin_HS_field_DN == e_UP) {
       // std::cout << "\t" << "e_UP" << "\t" << Gamma_index_HS_field_DN << "\t" << Gamma_up_size <<
       // "\t|\t";
 
-      assert(Gamma_index_HS_field_DN == Gamma_up_size);
-
-      ratio_HS_field_DN = ctaux_tools.solve_Gamma_blocked(Gamma_index_HS_field_DN, Gamma_up_CPU,
-                                                          exp_delta_V_HS_field_DN,
-                                                          Gamma_up_diag_max, Gamma_up_diag_min);
-
+#ifndef NDEBUG
+      if (Gamma_index_HS_field_DN != Gamma_up_size) {
+	std::cerr << "Gamma_index_HS_field_DN = " << Gamma_index_HS_field_DN << "  Gamma_up_size = " << Gamma_up_size << '\n';
+      }
+#endif
+      if constexpr (Lattice::spin_symmetric)
+        ratio_HS_field_DN = ctaux_tools.solve_Gamma_blocked(Gamma_index_HS_field_DN, Gamma_up_CPU,
+                                                            exp_delta_V_HS_field_DN,
+                                                            Gamma_up_diag_max, Gamma_up_diag_min);
+      else
+        std::cout << "warning Gamma_up delayed spin for spin_symmetric=false model";
+
+#ifndef NDEBUG
+      isDebugTestMinMax(false, true, ratio_HS_field_DN, Gamma_index_HS_field_DN, Gamma_up_CPU, Gamma_up_diag_max, Gamma_up_diag_min, trace_depth);
+#endif
+ 
       Gamma_up_size += 1;
     }
     else {
       // std::cout << "\t" << "e_DN" << "\t" << Gamma_index_HS_field_DN << "\t" << Gamma_dn_size <<
       // "\t|\t";
 
-      assert(Gamma_index_HS_field_DN == Gamma_dn_size);
+#ifndef NDEBUG
+      if (Gamma_index_HS_field_DN != Gamma_dn_size) {
+	std::cerr << "Gamma_index_HS_field_DN = " << Gamma_index_HS_field_DN << "  Gamma_dn_size = " << Gamma_dn_size << '\n';
+      }
+#endif
 
       ratio_HS_field_DN = ctaux_tools.solve_Gamma_blocked(Gamma_index_HS_field_DN, Gamma_dn_CPU,
                                                           exp_delta_V_HS_field_DN,
                                                           Gamma_dn_diag_max, Gamma_dn_diag_min);
 
+
+#ifndef NDEBUG
+      isDebugTestMinMax(false, false, ratio_HS_field_DN, Gamma_index_HS_field_DN, Gamma_dn_CPU, Gamma_dn_diag_max, Gamma_dn_diag_min, trace_depth);
+#endif
+
       Gamma_dn_size += 1;
     }
 
@@ -1228,24 +1359,41 @@ void CtauxWalker<device_t, Parameters, Data>::add_delayed_spin(int& delayed_inde
       // std::cout << "\t" << "e_UP" << "\t" << Gamma_index_HS_field_UP << "\t" << Gamma_up_size <<
       // "\t|\t";
 
-      assert(Gamma_index_HS_field_UP == Gamma_up_size);
-
-      ratio_HS_field_UP = ctaux_tools.solve_Gamma_blocked(Gamma_index_HS_field_UP, Gamma_up_CPU,
-                                                          exp_delta_V_HS_field_UP,
-                                                          Gamma_up_diag_max, Gamma_up_diag_min);
-
+#ifndef NDEBUG
+      if (Gamma_index_HS_field_UP != Gamma_up_size) {
+	std::cerr << "Gamma_index_HS_field_UP = " << Gamma_index_HS_field_UP << "  Gamma_up_size = " << Gamma_up_size << '\n';
+      }
+#endif
+
+      if constexpr (Lattice::spin_symmetric)
+        ratio_HS_field_UP = ctaux_tools.solve_Gamma_blocked(Gamma_index_HS_field_UP, Gamma_up_CPU,
+                                                            exp_delta_V_HS_field_UP,
+                                                            Gamma_up_diag_max, Gamma_up_diag_min);
+      else
+        std::cout << "warning Gamma_up delayed spin for spin_symmetric=false model";
+
+#ifndef NDEBUG
+      isDebugTestMinMax(true, true, ratio_HS_field_UP, Gamma_index_HS_field_UP, Gamma_up_CPU, Gamma_up_diag_max, Gamma_up_diag_min, trace_depth);
+#endif
       Gamma_up_size += 1;
     }
     else {
       // std::cout << "\t" << "e_DN" << "\t" << Gamma_index_HS_field_UP << "\t" << Gamma_dn_size <<
       // "\t|\t";
 
-      assert(Gamma_index_HS_field_UP == Gamma_dn_size);
-
+#ifndef NDEBUG
+      if (Gamma_index_HS_field_UP != Gamma_dn_size) {
+	std::cerr << "Gamma_index_HS_field_UP = " << Gamma_index_HS_field_UP << "  Gamma_dn_size = " << Gamma_dn_size << '\n';
+      }
+#endif
+ 
       ratio_HS_field_UP = ctaux_tools.solve_Gamma_blocked(Gamma_index_HS_field_UP, Gamma_dn_CPU,
                                                           exp_delta_V_HS_field_UP,
                                                           Gamma_dn_diag_max, Gamma_dn_diag_min);
 
+#ifndef NDEBUG
+      isDebugTestMinMax(true, false, ratio_HS_field_UP, Gamma_index_HS_field_UP, Gamma_dn_CPU, Gamma_dn_diag_max, Gamma_dn_diag_min, trace_depth);
+#endif
       Gamma_dn_size += 1;
     }
 
@@ -1253,9 +1401,6 @@ void CtauxWalker<device_t, Parameters, Data>::add_delayed_spin(int& delayed_inde
   }
 
   const auto determinant_ratio = ratio_HS_field_UP * ratio_HS_field_DN;
-  // std::cout << "ratio up: " << std::setprecision(16) << ratio_HS_field_UP
-  //          << "  ratio down: " << std::setprecision(16) << ratio_HS_field_DN << '\n';
-  // std::cout << "determinant_ratio: " << determinant_ratio << '\n';
   auto acceptance_ratio =
       calculate_acceptance_ratio(determinant_ratio, delayed_spins[delayed_index].HS_current_move,
                                  delayed_spins[delayed_index].QMC_factor);
@@ -1269,6 +1414,10 @@ void CtauxWalker<device_t, Parameters, Data>::add_delayed_spin(int& delayed_inde
   if (std::abs(acceptance_ratio) >= rng()) {
     delayed_spins[delayed_index].is_accepted_move = true;
 
+#ifndef NDEBUG
+    if (last_failed_min_max_)
+      std::cout << "Accepted but last vertex insert failed min_max test!\n";
+#endif
     // Update Monte Carlo weight.
     mc_log_weight_ += std::log(std::abs(determinant_ratio));
     // std::cout << "mc_log_weight: " << mc_log_weight_ << '\n';
@@ -1296,40 +1445,73 @@ void CtauxWalker<device_t, Parameters, Data>::add_delayed_spin(int& delayed_inde
       number_of_interacting_spins -= 1;
   }
   else {
-    /*
-    Gamma_up_diag_max = tmp_up_diag_max<1. ? 1. : tmp_up_diag_max;
-    Gamma_dn_diag_max = tmp_dn_diag_max<1. ? 1. : tmp_dn_diag_max;
-    Gamma_up_diag_min = tmp_up_diag_min>1. ? 1. : tmp_up_diag_min;
-    Gamma_dn_diag_min = tmp_dn_diag_min>1. ? 1. : tmp_dn_diag_min;
+#ifndef NDEBUG
+    if (last_failed_min_max_)
+      std::cout << "Rejected but last vertex insert failed min_max test!\n";
+#endif
+
+      /*
+      Gamma_up_diag_max = tmp_up_diag_max<1. ? 1. : tmp_up_diag_max;
+      Gamma_dn_diag_max = tmp_dn_diag_max<1. ? 1. : tmp_dn_diag_max;
+      Gamma_up_diag_min = tmp_up_diag_min>1. ? 1. : tmp_up_diag_min;
+      Gamma_dn_diag_min = tmp_dn_diag_min>1. ? 1. : tmp_dn_diag_min;
+
+      delayed_spins[delayed_index].is_accepted_move  = false;
+
+      if(delayed_spins[delayed_index].e_spin_HS_field_DN == e_UP)
+        {
+          CT_AUX_WALKER_TOOLS<device_t, Scalar>::set_to_identity(Gamma_up_CPU, Gamma_up_size-1);
+        }
+      else
+        {
+          CT_AUX_WALKER_TOOLS<device_t, Scalar>::set_to_identity(Gamma_dn_CPU, Gamma_dn_size-1);
+        }
 
-    delayed_spins[delayed_index].is_accepted_move  = false;
+      if(delayed_spins[delayed_index].e_spin_HS_field_UP == e_UP)
+        {
+          CT_AUX_WALKER_TOOLS<device_t, Scalar>::set_to_identity(Gamma_up_CPU, Gamma_up_size-1);
+        }
+      else
+        {
+          CT_AUX_WALKER_TOOLS<device_t, Scalar>::set_to_identity(Gamma_dn_CPU, Gamma_dn_size-1);
+        }
+      */
 
-    if(delayed_spins[delayed_index].e_spin_HS_field_DN == e_UP)
-      {
-        CT_AUX_WALKER_TOOLS<device_t, Scalar>::set_to_identity(Gamma_up_CPU, Gamma_up_size-1);
-      }
-    else
-      {
-        CT_AUX_WALKER_TOOLS<device_t, Scalar>::set_to_identity(Gamma_dn_CPU, Gamma_dn_size-1);
-      }
+      // delayed_spins[delayed_index].is_accepted_move  = false;
 
-    if(delayed_spins[delayed_index].e_spin_HS_field_UP == e_UP)
-      {
-        CT_AUX_WALKER_TOOLS<device_t, Scalar>::set_to_identity(Gamma_up_CPU, Gamma_up_size-1);
-      }
-    else
-      {
-        CT_AUX_WALKER_TOOLS<device_t, Scalar>::set_to_identity(Gamma_dn_CPU, Gamma_dn_size-1);
-      }
-    */
+#ifndef NDEBUG
+    auto getNewDiagMin = [](auto& Gamma, auto gamma_size, auto& gamma_min) -> Scalar {
+      for (int i = 0; i < gamma_size - 1; ++i)
+        gamma_min = std::min(std::abs(gamma_min), std::abs(Gamma(i, i)));
+    };
+#endif
 
     if (delayed_spins[delayed_index].e_spin_HS_field_DN == e_UP and
         delayed_spins[delayed_index].e_spin_HS_field_UP == e_UP) {
       Gamma_up_diag_max = tmp_up_diag_max < 1. ? 1. : tmp_up_diag_max;
       Gamma_up_diag_min = tmp_up_diag_min > 1. ? 1. : tmp_up_diag_min;
 
+#ifndef NDEBUG
+      bool was_lowest = false;
+      for (int n : {Gamma_up_size - 2, Gamma_up_size - 1})
+        if (Gamma_up_CPU(n, n) == Gamma_up_diag_min) {
+          was_lowest = true;
+          break;
+        }
+#endif
+
       CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::set_to_identity(Gamma_up_CPU, Gamma_up_size - 2);
       CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::set_to_identity(Gamma_up_CPU, Gamma_up_size - 1);
+
+#ifndef NDEBUG
+      if (was_lowest)
+        getNewDiagMin(Gamma_up_CPU, Gamma_up_size, Gamma_up_diag_min);
+      // here we have to check if Gamma_up_diag_min was one of the values that just got set to 1 if
+      // it is well that's bad for passing test_max_min
+      pushOnTrace("double delayed_spins[", std::to_string(delayed_index),
+                  "].e_spin_HS_field_[DN,UP] == [e_UP,e_UP] G_up:", std::to_string(Gamma_up_size - 2),
+                  ", G_up:", std::to_string(Gamma_up_size - 1));
+#endif
     }
 
     if (delayed_spins[delayed_index].e_spin_HS_field_DN == e_DN and
@@ -1341,6 +1523,11 @@ void CtauxWalker<device_t, Parameters, Data>::add_delayed_spin(int& delayed_inde
 
       CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::set_to_identity(Gamma_dn_CPU, Gamma_dn_size - 1);
       CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::set_to_identity(Gamma_up_CPU, Gamma_up_size - 1);
+#ifndef NDEBUG
+      pushOnTrace("double delayed_spins[", std::to_string(delayed_index),
+                  "].e_spin_HS_field_[DN,UP] == [e_DN,e_UP] G_dn:", std::to_string(Gamma_dn_size - 1),
+                  ", G_up:", std::to_string(Gamma_up_size - 1));
+#endif
     }
 
     if (delayed_spins[delayed_index].e_spin_HS_field_DN == e_UP and
@@ -1352,18 +1539,43 @@ void CtauxWalker<device_t, Parameters, Data>::add_delayed_spin(int& delayed_inde
 
       CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::set_to_identity(Gamma_up_CPU, Gamma_up_size - 1);
       CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::set_to_identity(Gamma_dn_CPU, Gamma_dn_size - 1);
+#ifndef NDEBUG
+      pushOnTrace("double delayed_spins[", std::to_string(delayed_index),
+                  "].e_spin_HS_field_[DN,UP] == [e_UP,e_DN] G_dn",
+                  std::to_string(Gamma_dn_size - 1), ", G_up:", std::to_string(Gamma_up_size - 1));
+#endif //NDEBUG
     }
 
     if (delayed_spins[delayed_index].e_spin_HS_field_DN == e_DN and
         delayed_spins[delayed_index].e_spin_HS_field_UP == e_DN) {
       Gamma_dn_diag_max = tmp_dn_diag_max < 1. ? 1. : tmp_dn_diag_max;
-      Gamma_dn_diag_min = tmp_dn_diag_min > 1. ? 1. : tmp_dn_diag_min;
+      Gamma_dn_diag_min = tmp_dn_diag_min > 1 ? 1. : tmp_dn_diag_min;
+
+#ifndef NDEBUG
+      bool was_lowest = false;
+      for (int n : {Gamma_dn_size - 2, Gamma_dn_size - 1})
+        if (Gamma_dn_CPU(n, n) == Gamma_dn_diag_min) {
+          was_lowest = true;
+          break;
+        }
+#endif
 
       CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::set_to_identity(Gamma_dn_CPU, Gamma_dn_size - 2);
       CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::set_to_identity(Gamma_dn_CPU, Gamma_dn_size - 1);
+#ifndef NDEBUG
+      if (was_lowest)
+        getNewDiagMin(Gamma_dn_CPU, Gamma_dn_size, Gamma_dn_diag_min);
+      pushOnTrace("double delayed_spins[", std::to_string(delayed_index),
+                  "].e_spin_HS_field_[DN,UP] == [e_DN,e_DN] G_dn:", std::to_string(Gamma_dn_size - 2),
+                  ", G_dn:", std::to_string(Gamma_dn_size - 1));
+#endif
     }
+    // Here I think Gamma has to go back to the GPU
+    // In the current code it doesn't until the loop over delayed spins ends.  I don't know
+    // why this is wrong as everything seems to be happening on the CPU, but
+    // epirically it seems elements aren't ending up as identity on the GPU that should.
   }
-}  // namespace ctaux
+}
 
 template <dca::linalg::DeviceType device_t, class Parameters, class Data>
 void CtauxWalker<device_t, Parameters, Data>::apply_bennett_on_Gamma_matrices(int& /*Gamma_up_size*/,
@@ -1603,9 +1815,7 @@ template <dca::linalg::DeviceType device_t, class Parameters, class Data>
 template <typename AccumType>
 const linalg::util::GpuEvent* CtauxWalker<device_t, Parameters, Data>::computeM(
     std::array<linalg::Matrix<AccumType, device_t>, 2>& Ms) {
-  // Stream 1 waits on stream 0.
-  sync_streams_event_.record(linalg::util::getStream(thread_id, 0));
-  sync_streams_event_.block(linalg::util::getStream(thread_id, 1));
+  linalg::util::syncStream(thread_id, stream_id);
 
   for (int s = 0; s < 2; ++s) {
     const auto& config = get_configuration().get(s == 0 ? e_DN : e_UP);
@@ -1620,18 +1830,15 @@ const linalg::util::GpuEvent* CtauxWalker<device_t, Parameters, Data>::computeM(
     M.resizeNoCopy(N.size());
 
     if (device_t == linalg::GPU) {
-      exp_v_minus_one_dev_[s].setAsync(exp_v_minus_one_[s], thread_id, s);
-      dca::linalg::matrixop::multiplyDiagonalLeft(exp_v_minus_one_dev_[s], N, M, thread_id, s);
+      exp_v_minus_one_dev_[s].setAsync(exp_v_minus_one_[s], thread_id, stream_id);
+      dca::linalg::matrixop::multiplyDiagonalLeft(exp_v_minus_one_dev_[s], N, M, thread_id,
+                                                  stream_id);
     }
     else {
       dca::linalg::matrixop::multiplyDiagonalLeft(exp_v_minus_one_[s], N, M);
     }
   }
-
-  m_computed_events_[1].record(linalg::util::getStream(thread_id, 1));
-  m_computed_events_[1].block(linalg::util::getStream(thread_id, 0));
-
-  m_computed_events_[0].record(linalg::util::getStream(thread_id, 0));
+  m_computed_events_[0].record(linalg::util::getStream(thread_id, stream_id));
   return &m_computed_events_[0];
 }
 
diff --git a/include/dca/phys/dca_step/cluster_solver/ctaux/structs/ct_aux_hs_configuration.hpp b/include/dca/phys/dca_step/cluster_solver/ctaux/structs/ct_aux_hs_configuration.hpp
index d41523f84..c4cfe721d 100644
--- a/include/dca/phys/dca_step/cluster_solver/ctaux/structs/ct_aux_hs_configuration.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/ctaux/structs/ct_aux_hs_configuration.hpp
@@ -34,6 +34,8 @@ namespace solver {
 namespace ctaux {
 // dca::phys::solver::ctaux::
 
+/** Represents the Hubbard-Stratonovitch vertices in a particular configuration.
+ */
 template <class parameters_type>
 class CT_AUX_HS_configuration {
 public:
diff --git a/include/dca/phys/dca_step/cluster_solver/ctaux/walker/ct_aux_walker_tools.hpp b/include/dca/phys/dca_step/cluster_solver/ctaux/walker/ct_aux_walker_tools.hpp
index 857e32e0a..63a5820b8 100644
--- a/include/dca/phys/dca_step/cluster_solver/ctaux/walker/ct_aux_walker_tools.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/ctaux/walker/ct_aux_walker_tools.hpp
@@ -82,24 +82,36 @@ class CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar> {
   // exp_delta_V);
   auto solve_Gamma(int n, dca::linalg::Matrix<Scalar, dca::linalg::CPU>& Gamma_LU,
                    Scalar exp_delta_V, Real& max, Real& min) -> Real;
-  auto solve_Gamma_blocked(int n, dca::linalg::Matrix<Scalar, dca::linalg::CPU>& Gamma_LU,
+
+  /** Solve gamma using blocked submatrix updates
+   *  \param[in] int
+   *  \param[inout] Gamma_LU
+   *  \param[ioout] max
+   *  \param[inout] min
+   */
+  auto solve_Gamma_blocked(const int n, dca::linalg::Matrix<Scalar, dca::linalg::CPU>& Gamma_LU,
                            Scalar exp_delta_V, Real& max, Real& min) -> Scalar;
 
   auto apply_bennett_on_Gamma(int k, int n, dca::linalg::Matrix<Scalar, dca::linalg::CPU>& Gamma_LU,
                               Scalar phani_gamma, Real& max, Real& min) -> Real;
 
+  /** Debugging function
+   *  requires: n > 0
+   */
+  static bool test_max_min(int n, dca::linalg::Matrix<Scalar, dca::linalg::CPU>& Gamma_LU, Real max,
+                    Real min);
+
 private:
   void solve_Gamma_slow(int n, dca::linalg::Matrix<Scalar, dca::linalg::CPU>& Gamma_LU);
+  void solve_Gamma_slow(int n, Scalar* Gamma_LU, int lda);
   void solve_Gamma_fast(int n, dca::linalg::Matrix<Scalar, dca::linalg::CPU>& Gamma_LU);
+  void solve_Gamma_fast(int n, Scalar* A, int LD);
   void solve_Gamma_BLAS(int n, dca::linalg::Matrix<Scalar, dca::linalg::CPU>& Gamma_LU);
+  void solve_Gamma_BLAS(int n, Scalar* Gamma_LU, int lda);
 
-  void solve_Gamma_fast(int n, Scalar* A, int LD);
 
   void solve_Gamma_blocked(int n, dca::linalg::Matrix<Scalar, dca::linalg::CPU>& Gamma_LU);
 
-  bool test_max_min(int n, dca::linalg::Matrix<Scalar, dca::linalg::CPU>& Gamma_LU, Real max,
-                    Real min);
-
   /** Return x/y.
    *  nothing but that is done for real number but for complex division an implementation consistent with cuda
    *  and magma libraries is used.
diff --git a/include/dca/phys/dca_step/cluster_solver/ctaux/walker/tools/n_matrix_tools/n_matrix_tools_gpu.inc b/include/dca/phys/dca_step/cluster_solver/ctaux/walker/tools/n_matrix_tools/n_matrix_tools_gpu.inc
index b6aa9281a..25862b83d 100644
--- a/include/dca/phys/dca_step/cluster_solver/ctaux/walker/tools/n_matrix_tools/n_matrix_tools_gpu.inc
+++ b/include/dca/phys/dca_step/cluster_solver/ctaux/walker/tools/n_matrix_tools/n_matrix_tools_gpu.inc
@@ -129,7 +129,7 @@ void N_MATRIX_TOOLS<dca::linalg::GPU, Parameters>::copy_rows(
   assert(N_new_spins.nrCols() == N.nrCols());
   assert(N_new_spins.nrRows() == permutation.size());
   assert(permutation.size() <= identity.size());
-
+  dca::linalg::util::syncStream(thread_id, stream_id);
   dca::linalg::matrixop::copyRows(N, permutation, N_new_spins, identity, thread_id, stream_id);
 }
 
@@ -139,6 +139,7 @@ void N_MATRIX_TOOLS<dca::linalg::GPU, Parameters>::compute_G_cols(
     dca::linalg::Matrix<Scalar, dca::linalg::GPU>& G,
     dca::linalg::Matrix<Scalar, dca::linalg::GPU>& G_cols) {
   exp_V.setAsync(exp_V_CPU, linalg::util::getStream(thread_id, stream_id));
+  dca::linalg::util::syncStream(thread_id, stream_id);
 
   assert(N.nrRows() == G.nrRows());
   assert(N.nrRows() == G_cols.nrRows());
diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.hpp
index b869876da..752c3283a 100644
--- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.hpp
@@ -53,7 +53,7 @@ class DMatrixBuilder<linalg::GPU, Scalar> final : public DMatrixBuilder<linalg::
   // See DMatrixBuilder<linalg::CPU, Scalar>::computeG0.
   // Out: G0. Device matrix
   void computeG0(Matrix& G0, const details::DeviceConfiguration& configuration, int n_init,
-                 bool right_section, GpuStream stream) const override;
+                 bool right_section, const GpuStream& stream) const;
 
 private:
   const G0Interpolation<linalg::GPU, Scalar>& g0_ref_;
diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/kernels_interface.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/kernels_interface.hpp
index 8157478a1..d749dfc93 100644
--- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/kernels_interface.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/kernels_interface.hpp
@@ -30,7 +30,17 @@ namespace details {
 template <typename Scalar, typename SignType>
 void buildG0Matrix(linalg::MatrixView<Scalar, linalg::GPU> G0, const int n_init,
                    const bool right_section, DeviceConfiguration config,
-                   DeviceInterpolationData<Scalar, SignType> g0_interp, dca::linalg::util::GpuStream stream);
+                   DeviceInterpolationData<Scalar, SignType> g0_interp, const dca::linalg::util::GpuStream& stream);
+extern  template void buildG0Matrix(linalg::MatrixView<float, linalg::GPU>, const int, const bool,
+                            DeviceConfiguration, DeviceInterpolationData<float, signed char>, const dca::linalg::util::GpuStream&);
+extern  template void buildG0Matrix(linalg::MatrixView<double, linalg::GPU>, const int, const bool,
+                            DeviceConfiguration, DeviceInterpolationData<double, std::int8_t>, const dca::linalg::util::GpuStream&);
+extern  template void buildG0Matrix(linalg::MatrixView<std::complex<float>, linalg::GPU>, const int,
+                            const bool, DeviceConfiguration,
+                            DeviceInterpolationData<std::complex<float>, std::complex<float>>, const dca::linalg::util::GpuStream&);
+extern  template void buildG0Matrix(linalg::MatrixView<std::complex<double>, linalg::GPU>, const int,
+                            const bool, DeviceConfiguration,
+                            DeviceInterpolationData<std::complex<double>, std::complex<double>>, const dca::linalg::util::GpuStream&);
 
 }  // namespace details
 }  // namespace ctint
diff --git a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/kernels_interface.hpp b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/kernels_interface.hpp
index 1984fe0c6..679941d6f 100644
--- a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/kernels_interface.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/kernels_interface.hpp
@@ -30,7 +30,7 @@ namespace details {
 template <typename Scalar, typename Real>
 void computeG0(linalg::MatrixView<Scalar, linalg::GPU>& g0_mat,
                DeviceInterpolationData<Scalar, SignType<Scalar>> g0, const Real* t_l, const int* b_l,
-               const int* r_lf, const Real* t_r, const int* b_r, const int* r_r, const dca::linalg::util::GpuStream stream);
+               const int* r_lf, const Real* t_r, const int* b_r, const int* r_r, const dca::linalg::util::GpuStream& stream);
 
   
 
diff --git a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/sp/sp_accumulator_gpu.hpp b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/sp/sp_accumulator_gpu.hpp
index e75b4fba6..ef891428d 100644
--- a/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/sp/sp_accumulator_gpu.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/shared_tools/accumulation/sp/sp_accumulator_gpu.hpp
@@ -96,7 +96,7 @@ class SpAccumulator<Parameters, linalg::GPU>
   }
 
   auto get_streams() {
-    return std::array<linalg::util::GpuStream*, 2>{&streams_[0], &streams_[1]};
+    return std::array<linalg::util::GpuStream*, 2>{&streams_[0], &streams_[0]};
   }
 
   // Returns the allocated device memory in bytes.
@@ -122,7 +122,7 @@ class SpAccumulator<Parameters, linalg::GPU>
    */
   void finalizeFunction(std::array<NfftType, 2>& ft_objs, MFunction& function, bool m_sqr);
 
-  std::array<linalg::util::GpuStream, 2> streams_;
+  std::array<linalg::util::GpuStream, 1> streams_;
   /** gpu M_r_t */
   std::array<NfftType, 2> cached_nfft_obj_;
   /** \todo Don't always pay the memory cost even when not collect single measurement G's */
@@ -137,9 +137,9 @@ SpAccumulator<Parameters, linalg::GPU>::SpAccumulator(const Parameters& paramete
     : BaseClass(parameters_ref, accumulate_m_sqr),
       streams_(),
       cached_nfft_obj_{NfftType(parameters_.get_beta(), streams_[0], accumulate_m_sqr),
-                       NfftType(parameters_.get_beta(), streams_[1], accumulate_m_sqr)},
+                       NfftType(parameters_.get_beta(), streams_[0], accumulate_m_sqr)},
       single_measurement_M_r_t_device_{NfftType(parameters_.get_beta(), streams_[0], false),
-                                       NfftType(parameters_.get_beta(), streams_[1], false)} {
+                                       NfftType(parameters_.get_beta(), streams_[0], false)} {
   single_measurement_M_r_w_.reset(new MFunction("M_r_w"));
 }
 
@@ -189,7 +189,7 @@ void SpAccumulator<Parameters, linalg::GPU>::accumulate(
     const std::array<Configuration, 2>& configs, const Scalar factor) {
   std::array<linalg::Matrix<Scalar, linalg::GPU>, 2> M_dev;
   for (int s = 0; s < 2; ++s)
-    M_dev[s].setAsync(Ms[s], streams_[s]);
+    M_dev[s].setAsync(Ms[s], streams_[0]);
 
   accumulate(M_dev, configs, factor);
 }
diff --git a/include/dca/phys/dca_step/cluster_solver/stdthread_qmci/stdthread_qmci_cluster_solver.hpp b/include/dca/phys/dca_step/cluster_solver/stdthread_qmci/stdthread_qmci_cluster_solver.hpp
index 7053def2f..825abdcba 100644
--- a/include/dca/phys/dca_step/cluster_solver/stdthread_qmci/stdthread_qmci_cluster_solver.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/stdthread_qmci/stdthread_qmci_cluster_solver.hpp
@@ -310,9 +310,10 @@ double StdThreadQmciClusterSolver<QmciSolver>::finalize(dca_info_struct_t& dca_i
   // CTINT calculates its error here maybe
   double L2_Sigma_difference = QmciSolver::finalize(dca_info_struct);
 
-  if (dca_iteration_ == parameters_.get_dca_iterations() - 1)
+  if (dca_iteration_ == parameters_.get_dca_iterations() - 1) {
+    std::cout << "Writing Configurations.\n";
     writeConfigurations();
-
+  }
   // autocorrelation_data_.sumConcurrency(concurrency_);
 
   if (BaseClass::writer_ && *BaseClass::writer_ && concurrency_.id() == concurrency_.first()) {
diff --git a/include/dca/phys/models/general_interaction.hpp b/include/dca/phys/models/general_interaction.hpp
index 50fa15a50..141e8e368 100644
--- a/include/dca/phys/models/general_interaction.hpp
+++ b/include/dca/phys/models/general_interaction.hpp
@@ -72,7 +72,7 @@ void general_interaction<parameters_type>::set_vertex(
                                                   func::dmn_variadic<BandDmn, SpinDmn>, RDmn>>&
         H_interaction) {
   // Create the vector of correlated spin-orbitals once.
-  static const std::vector<int> correlated_orbitals =
+  const std::vector<int> correlated_orbitals =
       general_interaction<parameters_type>::make_correlated_orbitals(parameters, H_interaction);
 
   // Get a random pair of correlated spin-orbitals.
diff --git a/include/dca/phys/parameters/model_parameters_bilayer_hubbard.inc b/include/dca/phys/parameters/model_parameters_bilayer_hubbard.inc
index 18ad1e557..0c0a2eeba 100644
--- a/include/dca/phys/parameters/model_parameters_bilayer_hubbard.inc
+++ b/include/dca/phys/parameters/model_parameters_bilayer_hubbard.inc
@@ -156,5 +156,22 @@ void ModelParameters<models::TightBindingModel<models::bilayer_lattice<PointGrou
     reader_or_writer.close_group();
   }
   catch (const std::exception& r_e) {
+    std::cout << "\n  bilayer-Hubbard-model parameter group is not defined!\n" << std::endl;
+    throw std::logic_error(__PRETTY_FUNCTION__);
   }
+
+  if (ReaderOrWriter::is_reader) {
+    std::cout << "\n\n"
+              << "\tbilayer-Hubbard-model : \n"
+              << "\t--------------- \n\n"
+              << "\t\t t  : " << t_ << "\n"
+              << "\t\t t_prime  : " << t_prime_ << "\n"
+              << "\t\t t_perp  : " << t_perp_ << "\n"
+              << "\t\t U : " << U_ << "\n"
+              << "\t\t V : " << V_ << "\n"
+              << "\t\t V-prime : " << V_prime_ << "\n"
+              << "\n"
+              << std::endl;
+  }
+
 }
diff --git a/src/phys/dca_step/cluster_solver/ctaux/walker/ct_aux_walker_tools.cpp b/src/phys/dca_step/cluster_solver/ctaux/walker/ct_aux_walker_tools.cpp
index d389cd387..5eca7770c 100644
--- a/src/phys/dca_step/cluster_solver/ctaux/walker/ct_aux_walker_tools.cpp
+++ b/src/phys/dca_step/cluster_solver/ctaux/walker/ct_aux_walker_tools.cpp
@@ -73,7 +73,7 @@ void CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::compute_Gamma(
 
         Scalar N_ij = N(configuration_e_spin_index_i, configuration_e_spin_index_j);
 
-        Gamma(i, j) = (N_ij * exp_V[j] - delta) / (exp_V[j] - Real(1.));
+        Gamma(i, j) = consistentScalarDiv((N_ij * exp_V[j] - delta), (exp_V[j] - Real(1.)));
       }
       else {
         Gamma(i, j) =
@@ -100,10 +100,10 @@ void CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::compute_Gamma(
         //     return quot;
         //   };
 
-          Scalar gamma_k = exp_delta_V[j];
-	  auto y = gamma_k - dca::util::RealAlias<Scalar>(1.0);
-          Scalar inter_gamma = consistentScalarDiv(gamma_k, y);  //(gamma_k) / (gamma_k - Real(1.));
-          Gamma(i, j) -= inter_gamma;                          //(gamma_k) / (gamma_k - Real(1.));
+        Scalar gamma_k = exp_delta_V[j];
+        auto y = gamma_k - dca::util::RealAlias<Scalar>(1.0);
+        Scalar inter_gamma = consistentScalarDiv(gamma_k, y);  //(gamma_k) / (gamma_k - Real(1.));
+        Gamma(i, j) -= inter_gamma;                            //(gamma_k) / (gamma_k - Real(1.));
         // }
         // else {
         //   Scalar gamma_k = exp_delta_V[j];
@@ -149,24 +149,35 @@ bool CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::test_max_min(
     min = Gamma_val < min ? Gamma_val : min;
   }
 
-  if (std::abs(max_ref - max) < 1.e-12 and std::fabs(min_ref - min) < 1.e-12)
+  // It's hard to get the min test right and it's not actually a failure state
+  // and std::fabs(min_ref - min) < 1.e-12)
+  if (std::abs(max_ref - max) < 1.e-12)
     return true;
   else {
-    std::cout << __FUNCTION__ << " for Gamma_LU has failed!\n";
-    std::cout << "Has failed!\n";
+    std::cout << __FUNCTION__ << " for Gamma_LU has failed.!\n";
     std::cout.precision(16);
     std::cout << "\n\t n : " << n << "\n";
+    for (int i = 1; i < n + 1; i++) {
+      Gamma_val = std::abs(Gamma_LU(i, i));
+      std::cout << Gamma_val << ", ";
+    }
+    std::cout << '\n';
     std::cout << std::scientific;
     std::cout << "max"
-              << "\t"
+              << "\t\t"
               << "max_ref"
-              << "\t"
+              << "\t\t"
               << "std::fabs(max_ref - max)" << '\n';
-    std::cout << max << "\t" << max_ref << "\t" << std::fabs(max_ref - max) << '\n';
-    std::cout << min << "\t" << min_ref << "\t" << std::fabs(min_ref - min) << '\n';
+    std::cout << max_ref << "\t" << max << "\t" << std::fabs(max_ref - max) << '\n';
+    std::cout << "min"
+              << "\t\t"
+              << "min_ref"
+              << "\t\t"
+              << "std::fabs(min_ref - min)" << '\n';
+    std::cout << min_ref << "\t" << min << "\t" << std::fabs(min_ref - min) << '\n';
     std::cout << std::endl;
 
-    Gamma_LU.print();
+    // Gamma_LU.print();
 
     return false;
   }
@@ -177,8 +188,8 @@ auto CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::solve_Gamma(
     int n, dca::linalg::Matrix<Scalar, dca::linalg::CPU>& Gamma_LU, Scalar exp_delta_V, Real& max,
     Real& min) -> Real {
   // solve_Gamma_slow(n, Gamma_LU);
-  solve_Gamma_fast(n, Gamma_LU);
-  // solve_Gamma_BLAS(n, Gamma_LU);
+  // solve_Gamma_fast(n, Gamma_LU);
+  solve_Gamma_BLAS(n, Gamma_LU);
 
   Scalar Gamma_LU_n_n = Gamma_LU(n, n);
   Real Gamma_val = std::abs(Gamma_LU_n_n);
@@ -199,10 +210,14 @@ auto CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::solve_Gamma(
     min = Gamma_val;
   }
 
+  // Here this is fine since we don't reset Gamma_LU to identity as in the blocked solve
 #ifndef NDEBUG
-  if (!test_max_min(n, Gamma_LU, max, min))
-    throw std::runtime_error("solve_Gamma_blocked test_max_min on Gamma_LU failed!");
+  if (!test_max_min(n, Gamma_LU, max, min)) {
+    std::cerr << "solve_Gamma test_max_min on Gamma_LU failed!\n";
+    throw std::runtime_error("solve_Gamma test_max_min on Gamma_LU failed!");
+  }
 #endif
+
   Scalar phani_gamma = exp_delta_V - Real(1.);
   Scalar determinant_ratio = -phani_gamma * Gamma_LU_n_n;
 
@@ -268,6 +283,47 @@ void CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::solve_Gamma_slow(
   }
 }
 
+template <typename Scalar>
+void CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::solve_Gamma_slow(int n, Scalar* Gamma_LU, int LD) {
+  {
+    Scalar* y = &Gamma_LU[0 + n * LD];  // Gamma_LU.ptr(0, n);
+    Scalar* x = &Gamma_LU[n];           // Gamma_LU.ptr(n, 0);
+
+    {
+      if (false) {  // serial
+        for (int i = 0; i < n; i++)
+          for (int j = 0; j < i; j++)
+            y[i] -= Gamma_LU[i + j * LD] * y[j];
+      }
+      else {  // parallell
+        for (int j = 0; j < n; j++)
+          for (int i = j + 1; i < n; i++)
+            y[i] -= Gamma_LU[i + j * LD] * y[j];
+      }
+    }
+
+    {
+      if (true) {  // serial
+        for (int j = 0; j < n; j++) {
+          for (int i = 0; i < j; i++)
+            x[j * LD] -= x[i * LD] * Gamma_LU[i + j * LD];
+          x[j * LD] /= Gamma_LU[j + j * LD];
+        }
+      }
+      else {  // parallell
+        for (int i = 0; i < n; i++) {
+          x[i * LD] /= Gamma_LU[i + i * LD];
+          for (int j = i + 1; j < n; j++)
+            x[j * LD] -= x[i * LD] * Gamma_LU[i + j * LD];
+        }
+      }
+    }
+
+    for (int i = 0; i < n; i++)
+      Gamma_LU[n + n * LD] -= x[i * LD] * y[i];
+  }
+}
+
 /*!                 /            |   \          /            |      \ /            |            \
  *                  |            |   |          |            |      | |            |      |
  *  \Gamma_{n+1} := |  \Gamma_n  | s |  ---\    |     L_n    |   0  | |     U_n    |   y  |
@@ -383,10 +439,19 @@ void CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::solve_Gamma_BLAS(
   }
 }
 
+template <typename Scalar>
+void CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::solve_Gamma_BLAS(
+    int n, Scalar* Gamma_LU /*, Scalar exp_delta_V*/, int lda) {
+  {
+    dca::linalg::blas::trsv("L", "N", "U", n, Gamma_LU, lda, Gamma_LU, 1);
+    dca::linalg::blas::trsv("U", "T", "N", n, Gamma_LU, lda, Gamma_LU, lda);
+  }
+}
+
 template <typename Scalar>
 auto CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::solve_Gamma_blocked(
-    int n, dca::linalg::Matrix<Scalar, dca::linalg::CPU>& Gamma_LU, Scalar exp_delta_V, Real& max,
-    Real& min) -> Scalar {
+    int const n, dca::linalg::Matrix<Scalar, dca::linalg::CPU>& Gamma_LU, Scalar exp_delta_V,
+    Real& max, Real& min) -> Scalar {
   // std::cout << "\t(" << min << ", " << max << " ) ";
 
   solve_Gamma_blocked(n, Gamma_LU);
@@ -407,13 +472,20 @@ auto CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::solve_Gamma_blocked(
       // matrix.
       // Since the current diagonal element should not be considered for max/min, we need to already
       // update the Gamma matrix (which will set it to 1).
-      CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::set_to_identity(Gamma_LU, n);
-
+      // CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::set_to_identity(Gamma_LU, n);
       return Scalar(1.e-16);
     }
     else {
       max = new_max;
       min = new_min;
+
+#ifndef NDEBUG
+      // // This has to be here since it will fail almost always when we set Gamma_LU to identity.
+      // if (!test_max_min(n, Gamma_LU, max, min)) {
+      // 	std::cerr << "solve_Gamma_blocked test_max_min on Gamma_LU failed!\n";
+      //   //throw std::runtime_error("solve_Gamma_blocked test_max_min on Gamma_LU failed!");
+      // }
+#endif
     }
   }
   else {
@@ -421,13 +493,7 @@ auto CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::solve_Gamma_blocked(
     min = Gamma_val;
   }
 
-  // std::cout << min << ", " << max << ")\t";
-#ifndef NDEBUG
-  if (!test_max_min(n, Gamma_LU, max, min))
-    throw std::runtime_error("solve_Gamma_blocked test_max_min on Gamma_LU failed!");
-#endif
-
-  auto phani_gamma = exp_delta_V - Real(1.);
+  auto phani_gamma = exp_delta_V - Scalar(1.);
   auto determinant_ratio = -phani_gamma * Gamma_LU_n_n;
 
   return determinant_ratio;
@@ -503,6 +569,7 @@ void CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::solve_Gamma_blocked(
     }
 
     solve_Gamma_fast(l, A_00, LD);
+    // solve_Gamma_slow(l, A_00, LD);
 
     {
       Scalar xy = 0;
@@ -610,7 +677,7 @@ auto CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::apply_bennett_on_Gamma(
 
   Scalar ratio = 1.;
   for (int i = 0; i < n; ++i)
-    ratio *= consistentScalarDiv(Gamma_LU(i,i), d_ptr[i]);
+    ratio *= consistentScalarDiv(Gamma_LU(i, i), d_ptr[i]);
 
   {
     Real Gamma_val = std::abs(Gamma_LU(0, 0));
@@ -630,7 +697,7 @@ auto CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar>::apply_bennett_on_Gamma(
   }
 
   const Scalar phani_gamma = exp_delta_V - Real(1.);
-  const Scalar det_ratio = consistentScalarDiv(-ratio,phani_gamma);
+  const Scalar det_ratio = consistentScalarDiv(-ratio, phani_gamma);
 
   if (std::abs(std::imag(det_ratio)) > std::numeric_limits<Real>::epsilon() * 1000) {
     throw(std::logic_error("The determinant is complex."));
diff --git a/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.cpp b/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.cpp
index 2a2f90837..2b59fffa6 100644
--- a/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.cpp
+++ b/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.cpp
@@ -40,7 +40,7 @@ template <typename Scalar>
 void DMatrixBuilder<GPU, Scalar>::computeG0(Matrix& G0,
                                             const details::DeviceConfiguration& configuration,
                                             const int n_init, bool right_section,
-                                            GpuStream stream) const {
+                                            const GpuStream& stream) const {
   if (G0.nrRows() * G0.nrCols() == 0)
     return;
   details::buildG0Matrix(linalg::MatrixView<Scalar, linalg::GPU>(G0), n_init, right_section,
diff --git a/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_kernels.cu b/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_kernels.cu
index efcf3fab6..632639e10 100644
--- a/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_kernels.cu
+++ b/src/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_kernels.cu
@@ -66,7 +66,7 @@ __global__ void buildG0MatrixKernel(linalg::MatrixView<Scalar, linalg::GPU> G0,
 template <typename Scalar, typename SignType>
 void buildG0Matrix(linalg::MatrixView<Scalar, linalg::GPU> g0, const int n_init,
                    const bool right_section, DeviceConfiguration config,
-                   DeviceInterpolationData<Scalar, SignType> g0_interp, GpuStream stream) {
+                   DeviceInterpolationData<Scalar, SignType> g0_interp, const GpuStream& stream) {
   // assert(CtintHelper::is_initialized());
   const auto blocks = dca::util::getBlockSize(g0.nrRows(), g0.nrCols());
 
@@ -75,16 +75,16 @@ void buildG0Matrix(linalg::MatrixView<Scalar, linalg::GPU> g0, const int n_init,
   checkErrorsCudaDebug();
 }
 
-  template void buildG0Matrix(linalg::MatrixView<float, linalg::GPU>, const int, const bool,
-                            DeviceConfiguration, DeviceInterpolationData<float, signed char>, GpuStream);
-  template void buildG0Matrix(linalg::MatrixView<double, linalg::GPU>, const int, const bool,
-                            DeviceConfiguration, DeviceInterpolationData<double, std::int8_t>, GpuStream);
-  template void buildG0Matrix(linalg::MatrixView<std::complex<float>, linalg::GPU>, const int,
+template void buildG0Matrix(linalg::MatrixView<float, linalg::GPU>, const int, const bool,
+                            DeviceConfiguration, DeviceInterpolationData<float, signed char>, const GpuStream&);
+template void buildG0Matrix(linalg::MatrixView<double, linalg::GPU>, const int, const bool,
+                            DeviceConfiguration, DeviceInterpolationData<double, std::int8_t>, const GpuStream&);
+template void buildG0Matrix(linalg::MatrixView<std::complex<float>, linalg::GPU>, const int,
                             const bool, DeviceConfiguration,
-                            DeviceInterpolationData<std::complex<float>, std::complex<float>>, GpuStream);
-  template void buildG0Matrix(linalg::MatrixView<std::complex<double>, linalg::GPU>, const int,
+                            DeviceInterpolationData<std::complex<float>, std::complex<float>>, const GpuStream&);
+template void buildG0Matrix(linalg::MatrixView<std::complex<double>, linalg::GPU>, const int,
                             const bool, DeviceConfiguration,
-                            DeviceInterpolationData<std::complex<double>, std::complex<double>>, GpuStream);
+                            DeviceInterpolationData<std::complex<double>, std::complex<double>>, const GpuStream&);
 
 }  // namespace details
 }  // namespace ctint
diff --git a/src/phys/dca_step/cluster_solver/shared_tools/accumulation/time_correlator_kernels.cu b/src/phys/dca_step/cluster_solver/shared_tools/accumulation/time_correlator_kernels.cu
index ef1e431f5..5c9cb93fc 100644
--- a/src/phys/dca_step/cluster_solver/shared_tools/accumulation/time_correlator_kernels.cu
+++ b/src/phys/dca_step/cluster_solver/shared_tools/accumulation/time_correlator_kernels.cu
@@ -45,7 +45,7 @@ template <typename Scalar, typename Real, typename SignType>
 void computeG0(linalg::MatrixView<Scalar, linalg::GPU>& g0_mat,
                const DeviceInterpolationData<Scalar, SignType> g0, const Real* t_l,
                const int* b_l, const int* r_l, const Real* t_r, const int* b_r, const int* r_r,
-               const GpuStream stream) {
+               const GpuStream& stream) {
   assert(SolverHelper::initialized());
   auto blocks = dca::util::get2DBlockSize(g0_mat.nrRows(), g0_mat.nrCols(), 32);
 
@@ -56,19 +56,19 @@ void computeG0(linalg::MatrixView<Scalar, linalg::GPU>& g0_mat,
   template<> void computeG0<double, double, std::int8_t>(linalg::MatrixView<double, linalg::GPU>&,
 					    const DeviceInterpolationData<double, std::int8_t>,
                                         const double*, const int*, const int*, const double*,
-                                        const int*, const int*, const GpuStream);
+                                        const int*, const int*, const GpuStream&);
   template<> void computeG0<float, float, std::int8_t>(linalg::MatrixView<float, linalg::GPU>&,
 					  const DeviceInterpolationData<float, std::int8_t>, const float*,
                                       const int*, const int*, const float*, const int*, const int*,
-                                      const GpuStream);
+                                      const GpuStream&);
   template<> void computeG0<std::complex<double>, double, std::complex<double>>(
     linalg::MatrixView<std::complex<double>, linalg::GPU>&,
     const DeviceInterpolationData<std::complex<double>, std::complex<double>>, const double*, const int*,
-    const int*, const double*, const int*, const int*, GpuStream);
+    const int*, const double*, const int*, const int*, const GpuStream&);
   template<> void computeG0<std::complex<float>, float, std::complex<float>>(
     linalg::MatrixView<std::complex<float>, linalg::GPU>&,
     const DeviceInterpolationData<std::complex<float>, std::complex<float>>, const float*, const int*, const int*,
-    const float*, const int*, const int*, GpuStream);
+    const float*, const int*, const int*, const GpuStream&);
 
 
 }  // namespace details
diff --git a/src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_kernels.cu b/src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_kernels.cu
index 39259bc02..412cff348 100644
--- a/src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_kernels.cu
+++ b/src/phys/dca_step/cluster_solver/shared_tools/accumulation/tp/tp_accumulator_kernels.cu
@@ -983,77 +983,76 @@ template double updateG4<std::complex<double>, FourPointType::PARTICLE_PARTICLE_
 
 // complex g0
 
-template <>
-double updateG4<std::complex<float>, FourPointType::PARTICLE_HOLE_TRANSVERSE, std::complex<float>>(
+template double updateG4<std::complex<float>, FourPointType::PARTICLE_HOLE_TRANSVERSE, std::complex<float>>(
     std::complex<float>* G4, const std::complex<float>* G_up, const int ldgu,
     const std::complex<float>* G_down, const int ldgd, const std::complex<float> factor,
     bool atomic, cudaStream_t stream, std::size_t start, std::size_t end);
 
-template <>
+template
 double updateG4<std::complex<float>, FourPointType::PARTICLE_HOLE_MAGNETIC, std::complex<float>>(
     std::complex<float>* G4, const std::complex<float>* G_up, const int ldgu,
     const std::complex<float>* G_down, const int ldgd, const std::complex<float> factor,
     bool atomic, cudaStream_t stream, std::size_t start, std::size_t end);
 
-template <>
+template
 double updateG4<std::complex<float>, FourPointType::PARTICLE_HOLE_CHARGE, std::complex<float>>(
     std::complex<float>* G4, const std::complex<float>* G_up, const int ldgu,
     const std::complex<float>* G_down, const int ldgd, const std::complex<float> factor,
     bool atomic, cudaStream_t stream, std::size_t start, std::size_t end);
 
-template <>
+template
 double updateG4<std::complex<float>, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP,
                 std::complex<float>>(std::complex<float>* G4, const std::complex<float>* G_up,
                                      const int ldgu, const std::complex<float>* G_down,
                                      const int ldgd, const std::complex<float> factor, bool atomic,
                                      cudaStream_t stream, std::size_t start, std::size_t end);
 
-template <>
+template
 double updateG4<std::complex<float>, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN,
                 std::complex<float>>(std::complex<float>* G4, const std::complex<float>* G_up,
                                      const int ldgu, const std::complex<float>* G_down,
                                      const int ldgd, const std::complex<float> factor, bool atomic,
                                      cudaStream_t stream, std::size_t start, std::size_t end);
 
-template <>
+template
 double updateG4<std::complex<float>, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::complex<float>>(
     std::complex<float>* G4, const std::complex<float>* G_up, const int ldgu,
     const std::complex<float>* G_down, const int ldgd, const std::complex<float> factor,
     bool atomic, cudaStream_t stream, std::size_t start, std::size_t end);
 
-template <>
+template
 double updateG4<std::complex<double>, FourPointType::PARTICLE_HOLE_TRANSVERSE, std::complex<double>>(
     std::complex<double>* G4, const std::complex<double>* G_up, const int ldgu,
     const std::complex<double>* G_down, const int ldgd, const std::complex<double> factor,
     bool atomic, cudaStream_t stream, std::size_t start, std::size_t end);
 
-template <>
+template
 double updateG4<std::complex<double>, FourPointType::PARTICLE_HOLE_MAGNETIC, std::complex<double>>(
     std::complex<double>* G4, const std::complex<double>* G_up, const int ldgu,
     const std::complex<double>* G_down, const int ldgd, const std::complex<double> factor,
     bool atomic, cudaStream_t stream, std::size_t start, std::size_t end);
 
-template <>
+template
 double updateG4<std::complex<double>, FourPointType::PARTICLE_HOLE_CHARGE, std::complex<double>>(
     std::complex<double>* G4, const std::complex<double>* G_up, const int ldgu,
     const std::complex<double>* G_down, const int ldgd, const std::complex<double> factor,
     bool atomic, cudaStream_t stream, std::size_t start, std::size_t end);
 
-template <>
+template
 double updateG4<std::complex<double>, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_UP,
                 std::complex<double>>(std::complex<double>* G4, const std::complex<double>* G_up,
                                       const int ldgu, const std::complex<double>* G_down,
                                       const int ldgd, const std::complex<double> factor, bool atomic,
                                       cudaStream_t stream, std::size_t start, std::size_t end);
 
-template <>
+template
 double updateG4<std::complex<double>, FourPointType::PARTICLE_HOLE_LONGITUDINAL_UP_DOWN,
                 std::complex<double>>(std::complex<double>* G4, const std::complex<double>* G_up,
                                       const int ldgu, const std::complex<double>* G_down,
                                       const int ldgd, const std::complex<double> factor, bool atomic,
                                       cudaStream_t stream, std::size_t start, std::size_t end);
 
-template <>
+template
 double updateG4<std::complex<double>, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::complex<double>>(
     std::complex<double>* G4, const std::complex<double>* G_up, const int ldgu,
     const std::complex<double>* G_down, const int ldgd, const std::complex<double> factor,
@@ -1070,6 +1069,11 @@ template double updateG4NoSpin<std::complex<double>, FourPointType::PARTICLE_PAR
     const std::complex<double> factor, bool atomic,
     cudaStream_t stream, std::size_t start, std::size_t end);
 
+template double updateG4NoSpin<std::complex<double>, FourPointType::PARTICLE_PARTICLE_UP_DOWN, std::int8_t>(
+    std::complex<double>* G4, const std::complex<double>* G_up, const int ldgu,
+    const std::int8_t factor, bool atomic,
+    cudaStream_t stream, std::size_t start, std::size_t end);
+
 
 
 // template<> double updateG4< FourPointType::PARTICLE_HOLE_TRANSVERSE>(
diff --git a/test/integration/cluster_solver/ctint/ctint_square_lattice_test_gpu.cpp b/test/integration/cluster_solver/ctint/ctint_square_lattice_test_gpu.cpp
index 0a0bb8ef4..79464fcf9 100644
--- a/test/integration/cluster_solver/ctint/ctint_square_lattice_test_gpu.cpp
+++ b/test/integration/cluster_solver/ctint/ctint_square_lattice_test_gpu.cpp
@@ -10,7 +10,8 @@
 // Compare GPU and CPU runs with CT-INT.
 // Model: square lattice with single band and double occupancy repulsion U.
 
-#include <cuda_profiler_api.h>
+//#include <cuda_profiler_api.h>
+#include "dca/platform/dca_gpu.h"
 #include <iostream>
 #include <string>
 #include "dca/testing/gtest_h_w_warning_blocking.h"
@@ -73,9 +74,9 @@ TEST(SquareLatticeTest, GpuSolver) {
   dca::phys::solver::CtintClusterSolver<dca::linalg::GPU, Parameters, true> qmc_solver_gpu(
       parameters, data_gpu, nullptr);
   qmc_solver_gpu.initialize(0);
-  cudaProfilerStart();
+  //cudaProfilerStart();
   qmc_solver_gpu.integrate();
-  cudaProfilerStop();
+  //cudaProfilerStop();
   qmc_solver_gpu.finalize();
   EXPECT_NEAR(1.0, qmc_solver_gpu.computeDensity(), 1e-3);
 
diff --git a/test/integration/cluster_solver/stdthread_qmci/gpu/stdthread_ctint_gpu_tp_test.cpp b/test/integration/cluster_solver/stdthread_qmci/gpu/stdthread_ctint_gpu_tp_test.cpp
index 42bf103c2..004a48c49 100644
--- a/test/integration/cluster_solver/stdthread_qmci/gpu/stdthread_ctint_gpu_tp_test.cpp
+++ b/test/integration/cluster_solver/stdthread_qmci/gpu/stdthread_ctint_gpu_tp_test.cpp
@@ -13,7 +13,7 @@
 #include <string>
 
 #include "dca/testing/gtest_h_w_warning_blocking.h"
-
+#include "dca/platform/dca_gpu.h"
 #include "dca/platform/dca_gpu_complex.h"
 #include "dca/function/function.hpp"
 #include "dca/function/util/difference.hpp"
diff --git a/test/unit/linalg/CMakeLists.txt b/test/unit/linalg/CMakeLists.txt
index 4d13dae2d..daf550d02 100644
--- a/test/unit/linalg/CMakeLists.txt
+++ b/test/unit/linalg/CMakeLists.txt
@@ -49,7 +49,7 @@ dca_add_gtest(matrixop_complex_gpu_test
               CUDA
               LIBS ${DCA_LIBS} lapack lapack_kernels blas_kernels gpu_utils ${MAGMA_LIBARY} lapack_kernels)
 
-dca_add_gtest(reshapable_matrix_cpu_test GTEST_MAIN LIBS ${UTIL_LIBS} ${DCA_LIBS} magma::magma)
+dca_add_gtest(reshapable_matrix_cpu_test GTEST_MAIN LIBS ${UTIL_LIBS} ${DCA_LIBS})
 
 dca_add_gtest(reshapable_matrix_gpu_test CUDA GTEST_MAIN LIBS ${DCA_LIBS} gpu_utils magma::magma)
 
diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp
index 106fa4a03..7f8478f60 100644
--- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp
+++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp
@@ -1,5 +1,5 @@
-// Copyright (C) 2018 ETH Zurich
-// Copyright (C) 2018 UT-Battelle, LLC
+// Copyright (C) 2024 ETH Zurich
+// Copyright (C) 2024 UT-Battelle, LLC
 // All rights reserved.
 //
 // See LICENSE.txt for terms of usage.
@@ -7,75 +7,98 @@
 //
 // Author: Jérémie Bouquet   (bouquetj@gmail.com).
 //         Giovanni Balduzzi (gbalduzz@itp.phys.ethz.ch).
+//         Peter W. Doak     (doakpw@ornl.gov)
 //
 // This class tests the GPU walker used by the ctint cluster solver by comparing it with the CPU
 // version.
 
 #include "dca/platform/dca_gpu.h"
+using Scalar = double;
 #include "test/mock_mcconfig.hpp"
 namespace dca {
 namespace config {
-using McOptions = MockMcOptions<double>;
+using McOptions = MockMcOptions<Scalar>;
 }  // namespace config
 }  // namespace dca
 
+#include "test/unit/phys/dca_step/cluster_solver/test_setup.hpp"
 #include "dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_gpu_submatrix.hpp"
-
+#include "dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu_submatrix.hpp"
 #include "dca/testing/gtest_h_w_warning_blocking.h"
 
+#include "walker_wrapper.hpp"
+#include "walker_wrapper_submatrix.hpp"
 #include "dca/linalg/matrixop.hpp"
-#include "dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu_submatrix.hpp"
 #include "dca/phys/dca_step/cluster_solver/ctint/details/solver_methods.hpp"
-#include "test/unit/phys/dca_step/cluster_solver/test_setup.hpp"
-#include "walker_wrapper_submatrix.hpp"
+
+constexpr char input_name[] =
+    DCA_SOURCE_DIR "/test/unit/phys/dca_step/cluster_solver/ctint/walker/submatrix_input.json";
 
 using dca::linalg::CPU;
 using dca::linalg::GPU;
 
-constexpr char input_name[] =
-    DCA_SOURCE_DIR "/test/unit/phys/dca_step/cluster_solver/ctint/walker/submatrix_input.json";
+template <typename SCALAR>
+struct CtINTWalkerSubmatrixGPUTestT : public ::testing::Test {
+  using G0Setup = dca::testing::G0SetupBare<SCALAR, dca::testing::LatticeBilayer,
+                                            dca::ClusterSolverId::CT_INT, input_name>;
+  virtual void SetUp() {
+    host_setup.SetUp();
+    gpu_setup.SetUp();
+  }
 
-template <typename Scalar>
-using CtintWalkerSubmatrixGpuTest =
-  typename dca::testing::G0Setup<Scalar, dca::testing::LatticeBilayer, dca::ClusterSolverId::CT_INT, input_name>;
+  virtual void TearDown() {}
+  G0Setup host_setup;
+  G0Setup gpu_setup;
+};
 
 using namespace dca::phys::solver;
 
-using FloatingPointTypes = ::testing::Types<double>;
-TYPED_TEST_CASE(CtintWalkerSubmatrixGpuTest, FloatingPointTypes);
+template <typename Scalar>
+using CtintWalkerSubmatrixGpuTest = CtINTWalkerSubmatrixGPUTestT<Scalar>;
+
+// Currently testing float isn't really possible due to the way the Scalar type is
+// carried through from mc_options. See test_setup.hpp PD
+using ScalarTypes = ::testing::Types<double>; //double,
+TYPED_TEST_CASE(CtintWalkerSubmatrixGpuTest, ScalarTypes);
 
 // Compare the submatrix update with a direct computation of the M matrix, and compare the
 // acceptance probability to
 // the CTINT walker with no submatrix update.
 TYPED_TEST(CtintWalkerSubmatrixGpuTest, doSteps) {
-  using Real = TypeParam;
-  using Parameters = typename TestFixture::Parameters;
-
+  using Scalar = TypeParam;
+  using Parameters = typename CtintWalkerSubmatrixGpuTest<Scalar>::G0Setup::Parameters;
+  using Walker = testing::phys::solver::ctint::WalkerWrapper<Scalar, Parameters>;
+  using Matrix = typename Walker::Matrix;
+  using MatrixPair = std::array<Matrix, 2>;
   using SbmWalkerCpu =
-    testing::phys::solver::ctint::WalkerWrapperSubmatrix<Real, Parameters, dca::linalg::CPU>;
+    testing::phys::solver::ctint::WalkerWrapperSubmatrix<Scalar, Parameters, dca::linalg::CPU>;
   using SbmWalkerGpu =
-    testing::phys::solver::ctint::WalkerWrapperSubmatrix<Real, Parameters, dca::linalg::GPU>;
+    testing::phys::solver::ctint::WalkerWrapperSubmatrix<Scalar, Parameters, dca::linalg::GPU>;
 
   std::vector<double> setup_rngs{0., 0.00, 0.9,  0.5, 0.01, 0,    0.75, 0.02,
                                  0,  0.6,  0.03, 1,   0.99, 0.04, 0.99};
-  typename TestFixture::RngType rng(setup_rngs);
+  typename CtintWalkerSubmatrixGpuTest<Scalar>::G0Setup::RngType cpu_rng(setup_rngs);
+  typename CtintWalkerSubmatrixGpuTest<Scalar>::G0Setup::RngType gpu_rng(setup_rngs);
 
-  auto& data = *TestFixture::data_;
-  auto& parameters = TestFixture::parameters_;
+  auto& cpu_data = *this->host_setup.data_;
+  auto& gpu_data = *this->gpu_setup.data_;
+  auto& cpu_parameters = this->host_setup.parameters_;
+  auto& gpu_parameters = this->gpu_setup.parameters_;
 
-  const auto g0_func = dca::phys::solver::ctint::details::shrinkG0(data.G0_r_t);
-  G0Interpolation<CPU, Real> g0_cpu(g0_func);
-  G0Interpolation<GPU, Real> g0_gpu(g0_func);
-  typename TestFixture::LabelDomain label_dmn;
+  const auto g0_func_cpu = dca::phys::solver::ctint::details::shrinkG0(cpu_data.G0_r_t);
+  G0Interpolation<CPU, Scalar> g0_cpu(g0_func_cpu);
+  const auto g0_func_gpu = dca::phys::solver::ctint::details::shrinkG0(gpu_data.G0_r_t);
+  G0Interpolation<GPU, Scalar> g0_gpu(g0_func_gpu);
+  typename CtintWalkerSubmatrixGpuTest<Scalar>::G0Setup::LabelDomain label_dmn;
 
   // TODO: improve API.
-  SbmWalkerCpu::setDMatrixBuilder(g0_cpu);
-  SbmWalkerCpu::setDMatrixAlpha(parameters.getAlphas(), false);
-  SbmWalkerGpu::setDMatrixBuilder(g0_gpu);
-  SbmWalkerGpu::setDMatrixAlpha(parameters.getAlphas(), false);
+  Walker::setDMatrixBuilder(g0_cpu);
+  Walker::setDMatrixAlpha(cpu_parameters.getAlphas(), false);
+  Walker::setInteractionVertices(cpu_data, cpu_parameters);
 
-  SbmWalkerCpu::setInteractionVertices(data, parameters);
-  SbmWalkerGpu::setInteractionVertices(data, parameters);
+  SbmWalkerGpu::setDMatrixBuilder(g0_gpu);
+  SbmWalkerGpu::setDMatrixAlpha(gpu_parameters.getAlphas(), false);
+  SbmWalkerGpu::setInteractionVertices(gpu_data, gpu_parameters);
 
   // ************************************
   // Test vertex insertion / removal ****
@@ -99,20 +122,20 @@ TYPED_TEST(CtintWalkerSubmatrixGpuTest, doSteps) {
   };
 
   for (const int initial_size : std::array<int, 2>{0, 5}) {
-    parameters.setInitialConfigurationSize(initial_size);
-
+    cpu_parameters.setInitialConfigurationSize(initial_size);
+    gpu_parameters.setInitialConfigurationSize(initial_size);
     for (int steps = 1; steps <= 8; ++steps) {
-      rng.setNewValues(setup_rngs);
-      SbmWalkerCpu walker_cpu(parameters, rng);
-      rng.setNewValues(setup_rngs);
-      SbmWalkerGpu walker_gpu(parameters, rng);
+      cpu_rng.setNewValues(setup_rngs);
+      SbmWalkerCpu walker_cpu(cpu_parameters, cpu_rng);
+      gpu_rng.setNewValues(setup_rngs);
+      SbmWalkerGpu walker_gpu(gpu_parameters, gpu_rng);
 
-      rng.setNewValues(rng_vals);
+      cpu_rng.setNewValues(rng_vals);
       walker_cpu.doStep(steps);
-      rng.setNewValues(rng_vals);
+      gpu_rng.setNewValues(rng_vals);
       walker_gpu.doStep(steps);
 
-      constexpr Real tolerance = std::numeric_limits<Real>::epsilon() * 100;
+      constexpr Scalar tolerance = std::numeric_limits<Scalar>::epsilon() * 100;
 
       auto M_cpu = walker_cpu.getM();
       auto M_gpu = walker_gpu.getM();
diff --git a/test/unit/phys/parameters/four_point_parameters/CMakeLists.txt b/test/unit/phys/parameters/four_point_parameters/CMakeLists.txt
index d29861b2c..8b5395b3d 100644
--- a/test/unit/phys/parameters/four_point_parameters/CMakeLists.txt
+++ b/test/unit/phys/parameters/four_point_parameters/CMakeLists.txt
@@ -3,4 +3,4 @@
 # why this test pulls in a bunch of magma references is a mystery
 dca_add_gtest(four_point_parameters_test
   GTEST_MAIN
-  LIBS json enumerations magma::magma ${DCA_GPU_LIBS})
+  LIBS json enumerations ${DCA_GPU_LIBS})