ValeevGroup
diff --git a/‎INSTALL.md
+1-1 b/‎INSTALL.md
+1-1
diff --git a/‎external/versions.cmake
+2-2 b/‎external/versions.cmake
+2-2
diff --git a/‎src/CMakeLists.txt
+2 b/‎src/CMakeLists.txt
+2
diff --git a/‎src/TiledArray/conversions/eigen.h
+3-5 b/‎src/TiledArray/conversions/eigen.h
+3-5
diff --git a/‎src/TiledArray/cp/cp.h
+29-50 b/‎src/TiledArray/cp/cp.h
+29-50
diff --git a/‎src/TiledArray/cp/cp_als.h
+1-1 b/‎src/TiledArray/cp/cp_als.h
+1-1
diff --git a/‎src/TiledArray/dist_array.h
+18-17 b/‎src/TiledArray/dist_array.h
+18-17
diff --git a/‎src/TiledArray/einsum/tiledarray.h
+2-2 b/‎src/TiledArray/einsum/tiledarray.h
+2-2
diff --git a/‎src/TiledArray/fwd.h
+2 b/‎src/TiledArray/fwd.h
+2
diff --git a/‎src/TiledArray/tensor.h
+14-2 b/‎src/TiledArray/tensor.h
+14-2
@@ -40,7 +40,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b
   - Boost.Container: header-only
   - Boost.Test: header-only or (optionally) as a compiled library, *only used for unit testing*
   - Boost.Range: header-only, *only used for unit testing*
-- [BTAS](http://github.com/ValeevGroup/BTAS), tag 474ddc095cbea12a1d28aca5435703dd9f69b166 . If usable BTAS installation is not found, TiledArray will download and compile
+- [BTAS](http://github.com/ValeevGroup/BTAS), tag 6fcb6451bc7ca46a00534a30c51dc5c230c39ac3 . If usable BTAS installation is not found, TiledArray will download and compile
   BTAS from source. *This is the recommended way to compile BTAS for all users*.
 - [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 0b44ef319643cb9721fbe17d294987c146e6460e .
   Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray.
 
@@ -24,8 +24,8 @@ set(TA_TRACKED_MADNESS_PREVIOUS_TAG 29a2bf3d3c2670c608b7bfdf2299d76fbc20e041)
 set(TA_TRACKED_MADNESS_VERSION 0.10.1)
 set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1)
 
-set(TA_TRACKED_BTAS_TAG 474ddc095cbea12a1d28aca5435703dd9f69b166)
-set(TA_TRACKED_BTAS_PREVIOUS_TAG 2917aa21465a93ae6f399874f247b5fe31d6b693)
+set(TA_TRACKED_BTAS_TAG 6fcb6451bc7ca46a00534a30c51dc5c230c39ac3)
+set(TA_TRACKED_BTAS_PREVIOUS_TAG 474ddc095cbea12a1d28aca5435703dd9f69b166)
 
 set(TA_TRACKED_LIBRETT_TAG 68abe31a9ec6fd2fd9ffbcd874daa80457f947da)
 set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 7e27ac766a9038df6aa05613784a54a036c4b796)
 
@@ -195,10 +195,12 @@ TiledArray/util/initializer_list.h
 TiledArray/util/logger.h
 TiledArray/util/ptr_registry.cpp
 TiledArray/util/ptr_registry.h
+TiledArray/util/random.cpp
 TiledArray/util/random.h
 TiledArray/util/singleton.h
 TiledArray/util/threads.h
 TiledArray/util/threads.cpp
+TiledArray/util/thread_specific.h
 TiledArray/util/time.h
 TiledArray/util/vector.h
 )
 
@@ -963,10 +963,10 @@ DistArray_ eigen_tensor_to_array(
 /// replicated TiledArray::DistArray. Usage:
 /// \code
 /// TiledArray::TArrayD
-/// array(world, trange);
+/// array(world, trange_3d);
 /// // Set tiles of array ...
 ///
-/// auto t = array_to_eigen_tensor(array);
+/// auto t = array_to_eigen_tensor<Eigen::Tensor<double, 3>(array);
 /// \endcode
 /// \tparam Tile the tile type of \c src
 /// \tparam Policy the policy type of \c src
@@ -980,13 +980,11 @@ DistArray_ eigen_tensor_to_array(
 ///            create the Eigen::Tensor on every rank (this requires
 ///            that \c src.is_replicated()==true )
 /// \return Eigen::Tensor object containing the data of \c src , if my rank
-/// equals
-///         \c target_rank or \c target_rank==-1 ,
+///         equals \c target_rank or \c target_rank==-1 ,
 ///         default-initialized Eigen::Tensor otherwise.
 template <typename Tensor, typename Tile, typename Policy>
 Tensor array_to_eigen_tensor(const TiledArray::DistArray<Tile, Policy>& src,
                              int target_rank = -1) {
-
   TA_ASSERT(src.tiles_range().rank() == Tensor::NumDimensions);
 
   // Test preconditions
 
@@ -8,42 +8,10 @@
 #include <TiledArray/conversions/btas.h>
 #include <TiledArray/expressions/einsum.h>
 #include <tiledarray.h>
-#include <random>
 
 namespace TiledArray::cp {
+
 namespace detail {
-// A seed for the random number generator.
-static inline unsigned int& random_seed_accessor() {
-  static unsigned int value = 3;
-  return value;
-}
-
-// given a rank and block size, this computes a
-// trange for the rank dimension to be used to make the CP factors.
-static inline TiledRange1 compute_trange1(size_t rank, size_t rank_block_size) {
-  std::size_t nblocks = (rank + rank_block_size - 1) / rank_block_size;
-  auto dv = std::div((int)(rank + nblocks - 1), (int)nblocks);
-  auto avg_block_size = dv.quot - 1, num_avg_plus_one = dv.rem + 1;
-
-  TiledArray::TiledRange1 new_trange1;
-  {
-    std::vector<std::size_t> new_trange1_v;
-    new_trange1_v.reserve(nblocks + 1);
-    auto block_counter = 0;
-    for (auto i = 0; i < num_avg_plus_one;
-         ++i, block_counter += avg_block_size + 1) {
-      new_trange1_v.emplace_back(block_counter);
-    }
-    for (auto i = num_avg_plus_one; i < nblocks;
-         ++i, block_counter += avg_block_size) {
-      new_trange1_v.emplace_back(block_counter);
-    }
-    new_trange1_v.emplace_back(rank);
-    new_trange1 =
-        TiledArray::TiledRange1(new_trange1_v.begin(), new_trange1_v.end());
-  }
-  return new_trange1;
-}
 
 static inline char intToAlphabet(int i) { return static_cast<char>('a' + i); }
 
@@ -111,13 +79,13 @@ class CP {
     if (build_rank) {
       size_t cur_rank = 1;
       do {
-        rank_trange = detail::compute_trange1(cur_rank, rank_block_size);
+        rank_trange = TiledArray::compute_trange1(cur_rank, rank_block_size);
         build_guess(cur_rank, rank_trange);
         ALS(cur_rank, 100, verbose);
         ++cur_rank;
       } while (cur_rank < rank);
     } else {
-      rank_trange = detail::compute_trange1(rank, rank_block_size);
+      rank_trange = TiledArray::compute_trange1(rank, rank_block_size);
       build_guess(rank, rank_trange);
       ALS(rank, 100, verbose);
     }
@@ -143,7 +111,7 @@ class CP {
     double epsilon = 1.0;
     fit_tol = epsilonALS;
     do {
-      auto rank_trange = detail::compute_trange1(cur_rank, rank_block_size);
+      auto rank_trange = compute_trange1(cur_rank, rank_block_size);
       build_guess(cur_rank, rank_trange);
       ALS(cur_rank, 100, verbose);
       ++cur_rank;
@@ -196,9 +164,10 @@ class CP {
       final_fit,          // The final fit of the ALS
                           // optimization at fixed rank.
       fit_tol,            // Tolerance for the ALS solver
-      converged_num,      // How many times the ALS solver
-                          // has changed less than the tolerance
       norm_reference;     // used in determining the CP fit.
+  std::size_t converged_num =
+      0;  // How many times the ALS solver
+          // has changed less than the tolerance in a row
 
   /// This function is determined by the specific CP solver.
   /// builds the rank @c rank CP approximation and stores
@@ -227,14 +196,12 @@ class CP {
     auto lambda = std::vector<typename Tile::value_type>(
         rank, (typename Tile::value_type)0);
     if (world.rank() == 0) {
-      std::mt19937 generator(detail::random_seed_accessor());
-      std::uniform_real_distribution<> distribution(-1.0, 1.0);
       auto factor_ptr = factor.data();
       size_t offset = 0;
       for (auto r = 0; r < rank; ++r, offset += mode_size) {
         auto lam_ptr = lambda.data() + r;
         for (auto m = offset; m < offset + mode_size; ++m) {
-          auto val = distribution(generator);
+          auto val = TiledArray::drand() * 2 - 1;  // random number in [-1,1]
           *(factor_ptr + m) = val;
           *lam_ptr += val * val;
         }
@@ -364,7 +331,7 @@ class CP {
   /// \returns bool : is the change in fit less than the ALS tolerance?
   virtual bool check_fit(bool verbose = false) {
     // Compute the inner product T * T_CP
-    double inner_prod = MTtKRP("r,n").dot(unNormalized_Factor("r,n"));
+    const auto ref_dot_cp = MTtKRP("r,n").dot(unNormalized_Factor("r,n"));
     // compute the square of the CP tensor (can use the grammian)
     auto factor_norm = [&]() {
       auto gram_ptr = partial_grammian.begin();
@@ -380,27 +347,39 @@ class CP {
       return result;
     };
     // compute the error in the loss function and find the fit
-    double normFactors = factor_norm(),
-           normResidual =
-               sqrt(abs(norm_reference * norm_reference +
-                        normFactors * normFactors - 2.0 * inner_prod)),
-           fit = 1.0 - (normResidual / norm_reference),
-           fit_change = abs(prev_fit - fit);
+    const auto norm_cp = factor_norm();  // ||T_CP||_2
+    const auto squared_norm_error = norm_reference * norm_reference +
+                                    norm_cp * norm_cp -
+                                    2.0 * ref_dot_cp;  // ||T - T_CP||_2^2
+    // N.B. squared_norm_error is very noisy
+    // TA_ASSERT(squared_norm_error >= - 1e-8);
+    const auto norm_error = sqrt(abs(squared_norm_error));
+    const auto fit = 1.0 - (norm_error / norm_reference);
+    const auto fit_change = fit - prev_fit;
     prev_fit = fit;
     // print fit data if required
     if (verbose) {
-      std::cout << fit << "\t" << fit_change << std::endl;
+      std::cout << MTtKRP.world().rank() << ": fit=" << fit
+                << " fit_change=" << fit_change << std::endl;
     }
 
     // if the change in fit is less than the tolerance try to return true.
-    if (fit_change < fit_tol) {
+    if (abs(fit_change) < fit_tol) {
       converged_num++;
       if (converged_num == 2) {
         converged_num = 0;
         final_fit = prev_fit;
         prev_fit = 1.0;
+        if (verbose)
+          std::cout << MTtKRP.world().rank() << ": converged" << std::endl;
         return true;
+      } else {
+        TA_ASSERT(converged_num == 1);
+        if (verbose)
+          std::cout << MTtKRP.world().rank() << ": pre-converged" << std::endl;
       }
+    } else {
+      converged_num = 0;
     }
     return false;
   }
 
@@ -12,7 +12,7 @@
  * takes a reference order-N tensor and decomposes it into a
  * set of order-2 tensors all coupled by a hyperdimension called the rank.
  * These factors are optimized using an alternating least squares
- * algorithm. This class is derived form the base CP class
+ * algorithm.
  *
  * @tparam Tile typing for the DistArray tiles
  * @tparam Policy policy of the DistArray
 
@@ -890,20 +890,16 @@ class DistArray : public madness::archive::ParallelSerializableObject {
   ///
   /// \tparam T The type of random value to generate. Defaults to
   ///           element_type.
-  /// \tparam <anonymous> A template type parameter which will be deduced as
-  ///                     void only if MakeRandom knows how to generate random
-  ///                     values of type T. If MakeRandom does not know how to
-  ///                     generate random values of type T, SFINAE will disable
-  ///                     this function.
   /// \param[in] skip_set If false, will throw if any tiles are already set
   /// \throw TiledArray::Exception if the PIMPL is not initialized. Strong
   ///                              throw guarantee.
   /// \throw TiledArray::Exception if skip_set is false and a local tile is
   ///                              already initialized. Weak throw guarantee.
-  template <typename T = element_type,
+  template <HostExecutor Exec = HostExecutor::Default,
+            typename T = element_type,
             typename = detail::enable_if_can_make_random_t<T>>
   void fill_random(bool skip_set = false) {
-    init_elements(
+    init_elements<Exec>(
         [](const auto&) { return detail::MakeRandom<T>::generate_value(); });
   }
 
@@ -943,7 +939,7 @@ class DistArray : public madness::archive::ParallelSerializableObject {
   ///                              guarantee.
   /// \throw TiledArray::Exception if a tile is already set and skip_set is
   ///                              false. Weak throw guarantee.
-  template <typename Op>
+  template <HostExecutor Exec = HostExecutor::Default, typename Op>
   void init_tiles(Op&& op, bool skip_set = false) {
     // lifetime management of op depends on whether it is a lvalue ref (i.e. has
     // an external owner) or an rvalue ref
@@ -957,15 +953,20 @@ class DistArray : public madness::archive::ParallelSerializableObject {
       const auto& index = *it;
       if (!pimpl_->is_zero(index)) {
         if (skip_set) {
-          auto fut = find(index);
+          auto fut = find_local(index);
           if (fut.probe()) continue;
         }
-        Future<value_type> tile = pimpl_->world().taskq.add(
-            [pimpl = pimpl_, index = ordinal_type(index),
-             op_shared_handle]() -> value_type {
-              return op_shared_handle(pimpl->trange().make_tile_range(index));
-            });
-        set(index, std::move(tile));
+        if constexpr (Exec == HostExecutor::MADWorld) {
+          Future<value_type> tile = pimpl_->world().taskq.add(
+              [pimpl = pimpl_, index = ordinal_type(index),
+               op_shared_handle]() -> value_type {
+                return op_shared_handle(pimpl->trange().make_tile_range(index));
+              });
+          set(index, std::move(tile));
+        } else {
+          static_assert(Exec == HostExecutor::Thread);
+          set(index, op_shared_handle(trange().make_tile_range(index)));
+        }
       }
     }
   }
@@ -994,10 +995,10 @@ class DistArray : public madness::archive::ParallelSerializableObject {
   /// \throw TiledArray::Exception if skip_set is false and a local, non-zero
   ///                              tile is already initialized. Weak throw
   ///                              guarantee.
-  template <typename Op>
+  template <HostExecutor Exec = HostExecutor::Default, typename Op>
   void init_elements(Op&& op, bool skip_set = false) {
     auto op_shared_handle = make_op_shared_handle(std::forward<Op>(op));
-    init_tiles(
+    init_tiles<Exec>(
         [op = std::move(op_shared_handle)](
             const TiledArray::Range& range) -> value_type {
           // Initialize the tile with the given range object
 
@@ -225,7 +225,7 @@ auto einsum(expressions::TsrExpr<Array_> A, expressions::TsrExpr<Array_> B,
         if (!term.array.is_local(idx)) continue;
         if (term.array.is_zero(idx)) continue;
         // TODO no need for immediate evaluation
-        auto tile = term.array.find(idx).get();
+        auto tile = term.array.find_local(idx).get();
         if (P) tile = tile.permute(P);
         auto shape = term.ei_tiled_range.tile(ei);
         tile = tile.reshape(shape, batch);
@@ -247,7 +247,7 @@ auto einsum(expressions::TsrExpr<Array_> A, expressions::TsrExpr<Array_> B,
       if (!C.ei.is_local(e)) continue;
       if (C.ei.is_zero(e)) continue;
       // TODO no need for immediate evaluation
-      auto tile = C.ei.find(e).get();
+      auto tile = C.ei.find_local(e).get();
       assert(tile.batch_size() == batch);
       const Permutation &P = C.permutation;
       auto c = apply(P, h + e);
 
@@ -149,6 +149,8 @@ using Array
     [[deprecated("use TiledArray::DistArray or TiledArray::TArray<T>")]] =
         DistArray<Tile, Policy>;
 
+enum class HostExecutor { Thread, MADWorld, Default = MADWorld };
+
 }  // namespace TiledArray
 
 #ifndef TILEDARRAY_DISABLE_NAMESPACE_TA
 
@@ -62,8 +62,20 @@ template <typename T, typename std::enable_if<detail::is_tensor<T>::value &&
 inline std::ostream& operator<<(std::ostream& os, const T& t) {
   os << t.range() << " { ";
   const auto n = t.range().volume();
-  for (auto ord = 0ul; ord < n; ++ord) os << t.at_ordinal(ord) << " ";
-
+  std::size_t offset = 0ul;
+  const auto more_than_1_batch = t.batch_size() > 1;
+  for (auto b = 0ul; b != t.batch_size(); ++b) {
+    if (more_than_1_batch) {
+      os << "[batch " << b << "]{ ";
+    }
+    for (auto ord = 0ul; ord < n; ++ord) {
+      os << t.data()[offset + ord] << " ";
+    }
+    if (more_than_1_batch) {
+      os << "} ";
+    }
+    offset += n;
+  }
   os << "}";
 
   return os;
Original file line number	Diff line number	Diff line change
`@@ -12,7 +12,7 @@`
`12`	`12`	`* takes a reference order-N tensor and decomposes it into a`
`13`	`13`	`* set of order-2 tensors all coupled by a hyperdimension called the rank.`
`14`	`14`	`* These factors are optimized using an alternating least squares`
`15`		`- * algorithm. This class is derived form the base CP class`
	`15`	`+ * algorithm.`
`16`	`16`	`*`
`17`	`17`	`* @tparam Tile typing for the DistArray tiles`
`18`	`18`	`* @tparam Policy policy of the DistArray`