@@ -889,45 +889,63 @@ class Summa
 
   /// Initialize reduce tasks and construct broadcast groups
   ordinal_type initialize(const DenseShape&) {
-    // Construct static broadcast groups for dense arguments
-    const madness::DistributedID col_did(DistEvalImpl_::id(), 0ul);
-    if (k_ > 0) col_group_ = proc_grid_.make_col_group(col_did);
-    const madness::DistributedID row_did(DistEvalImpl_::id(), k_);
-    if (k_ > 0) row_group_ = proc_grid_.make_row_group(row_did);
+    // if contraction is over zero-volume range just initialize tiles to zero
+    if (k_ == 0) {
+      ordinal_type tile_count = 0;
+      const auto& tiles_range = this->trange().tiles_range();
+      for (auto&& tile_idx : tiles_range) {
+        auto tile_ord = tiles_range.ordinal(tile_idx);
+        if (this->is_local(tile_ord)) {
+          this->world().taskq.add([this, tile_ord, tile_idx]() {
+            this->set_tile(tile_ord,
+                           value_type(this->trange().tile(tile_idx),
+                                      typename value_type::value_type{}));
+          });
+          ++tile_count;
+        }
+      }
+      return tile_count;
+    } else {
+      // Construct static broadcast groups for dense arguments
+      const madness::DistributedID col_did(DistEvalImpl_::id(), 0ul);
+      col_group_ = proc_grid_.make_col_group(col_did);
+      const madness::DistributedID row_did(DistEvalImpl_::id(), k_);
+      row_group_ = proc_grid_.make_row_group(row_did);
 
 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
-    std::stringstream ss;
-    ss << "init: rank=" << TensorImpl_::world().rank() << "\ncol_group_=("
-       << col_did.first << ", " << col_did.second << ") { ";
-    for (ProcessID gproc = 0ul; gproc < col_group_.size(); ++gproc)
-      ss << col_group_.world_rank(gproc) << " ";
-    ss << "}\nrow_group_=(" << row_did.first << ", " << row_did.second
-       << ") { ";
-    for (ProcessID gproc = 0ul; gproc < row_group_.size(); ++gproc)
-      ss << row_group_.world_rank(gproc) << " ";
-    ss << "}\n";
-    printf(ss.str().c_str());
+      std::stringstream ss;
+      ss << "init: rank=" << TensorImpl_::world().rank() << "\ncol_group_=("
+         << col_did.first << ", " << col_did.second << ") { ";
+      for (ProcessID gproc = 0ul; gproc < col_group_.size(); ++gproc)
+        ss << col_group_.world_rank(gproc) << " ";
+      ss << "}\nrow_group_=(" << row_did.first << ", " << row_did.second
+         << ") { ";
+      for (ProcessID gproc = 0ul; gproc < row_group_.size(); ++gproc)
+        ss << row_group_.world_rank(gproc) << " ";
+      ss << "}\n";
+      printf(ss.str().c_str());
 
 #endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
 
-    // Allocate memory for the reduce pair tasks.
-    std::allocator<ReducePairTask<op_type>> alloc;
-    reduce_tasks_ = alloc.allocate(proc_grid_.local_size());
+      // Allocate memory for the reduce pair tasks.
+      std::allocator<ReducePairTask<op_type>> alloc;
+      reduce_tasks_ = alloc.allocate(proc_grid_.local_size());
 
-    // Iterate over all local tiles
-    const ordinal_type n = proc_grid_.local_size();
-    for (ordinal_type t = 0ul; t < n; ++t) {
-      // Initialize the reduction task
-      ReducePairTask<op_type>* MADNESS_RESTRICT const reduce_task =
-          reduce_tasks_ + t;
-      new (reduce_task) ReducePairTask<op_type>(TensorImpl_::world(), op_
+      // Iterate over all local tiles
+      const ordinal_type n = proc_grid_.local_size();
+      for (ordinal_type t = 0ul; t < n; ++t) {
+        // Initialize the reduction task
+        ReducePairTask<op_type>* MADNESS_RESTRICT const reduce_task =
+            reduce_tasks_ + t;
+        new (reduce_task) ReducePairTask<op_type>(TensorImpl_::world(), op_
 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
-                                                ,
-                                                nullptr, t
+                                                  ,
+                                                  nullptr, t
 #endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
-      );
-    }
+        );
+      }
 
-    return proc_grid_.local_size();
+      return proc_grid_.local_size();
+    }
   }
 
   /// Initialize reduce tasks
@@ -938,6 +956,9 @@ class Summa
     ss << "initialize rank=" << TensorImpl_::world().rank() << " tiles={ ";
 #endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
 
+    // fast return if there is no work to do
+    if (k_ == 0) return 0;
+
     // Allocate memory for the reduce pair tasks.
     std::allocator<ReducePairTask<op_type>> alloc;
     reduce_tasks_ = alloc.allocate(proc_grid_.local_size());
@@ -1705,60 +1726,79 @@ class Summa
           std::max(ProcGrid::size_type(2),
                    std::min(proc_grid_.proc_rows(), proc_grid_.proc_cols()));
 
-      // corner case: empty result
-      if (k_ == 0) return 0;
-
-      // Construct the first SUMMA iteration task
-      if (TensorImpl_::shape().is_dense()) {
-        // We cannot have more iterations than there are blocks in the k
-        // dimension
-        if (depth > k_) depth = k_;
-
-        // Modify the number of concurrent iterations based on the available
-        // memory.
-        depth = mem_bound_depth(depth, 0.0f, 0.0f);
-
-        // Enforce user defined depth bound
-        if (max_depth_) depth = std::min(depth, max_depth_);
-
-        TensorImpl_::world().taskq.add(
-            new DenseStepTask(shared_from_this(), depth));
-      } else {
-        // Increase the depth based on the amount of sparsity in an iteration.
+      // watch out for the corner case: a contraction over a zero-volume range
+      // producing a nonzero-volume result ... in that case there is nothing to
+      // do here; the appropriate initialization was performed in initialize()
+      if (k_ != 0) {
+        // Construct the first SUMMA iteration task
+        if (TensorImpl_::shape().is_dense()) {
+          // We cannot have more iterations than there are blocks in the k
+          // dimension
+          if (depth > k_) depth = k_;
+
+          // Modify the number of concurrent iterations based on the available
+          // memory.
+          depth = mem_bound_depth(depth, 0.0f, 0.0f);
+
+          // Enforce user defined depth bound
+          if (max_depth_) depth = std::min(depth, max_depth_);
+
+          TensorImpl_::world().taskq.add(
+              new DenseStepTask(shared_from_this(), depth));
+        } else {
+          // Increase the depth based on the amount of sparsity in an iteration.
 
-        // Get the sparsity fractions for the left- and right-hand arguments.
-        const float left_sparsity = left_.shape().sparsity();
-        const float right_sparsity = right_.shape().sparsity();
+          // Get the sparsity fractions for the left- and right-hand arguments.
+          const float left_sparsity = left_.shape().sparsity();
+          const float right_sparsity = right_.shape().sparsity();
 
-        // Compute the fraction of non-zero result tiles in a single SUMMA
-        // iteration.
-        const float frac_non_zero = (1.0f - std::min(left_sparsity, 0.9f)) *
-                                    (1.0f - std::min(right_sparsity, 0.9f));
+          // Compute the fraction of non-zero result tiles in a single SUMMA
+          // iteration.
+          const float frac_non_zero = (1.0f - std::min(left_sparsity, 0.9f)) *
+                                      (1.0f - std::min(right_sparsity, 0.9f));
 
-        // Compute the new depth based on sparsity of the arguments
-        depth =
-            float(depth) * (1.0f - 1.35638f * std::log2(frac_non_zero)) + 0.5f;
+          // Compute the new depth based on sparsity of the arguments
+          depth = float(depth) * (1.0f - 1.35638f * std::log2(frac_non_zero)) +
+                  0.5f;
 
-        // We cannot have more iterations than there are blocks in the k
-        // dimension
-        if (depth > k_) depth = k_;
+          // We cannot have more iterations than there are blocks in the k
+          // dimension
+          if (depth > k_) depth = k_;
 
-        // Modify the number of concurrent iterations based on the available
-        // memory and sparsity of the argument tensors.
-        depth = mem_bound_depth(depth, left_sparsity, right_sparsity);
+          // Modify the number of concurrent iterations based on the available
+          // memory and sparsity of the argument tensors.
+          depth = mem_bound_depth(depth, left_sparsity, right_sparsity);
 
-        // Enforce user defined depth bound
-        if (max_depth_) depth = std::min(depth, max_depth_);
+          // Enforce user defined depth bound
+          if (max_depth_) depth = std::min(depth, max_depth_);
 
-        TensorImpl_::world().taskq.add(
-            new SparseStepTask(shared_from_this(), depth));
-      }
+          TensorImpl_::world().taskq.add(
+              new SparseStepTask(shared_from_this(), depth));
+        }
+      }  // k_ != 0
     }
 
 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
     printf("eval: start wait children rank=%i\n", TensorImpl_::world().rank());
 #endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
 
+    // corner case: if left or right is zero-volume, no tasks were scheduled,
+    // so we need to discard all of their tiles manually
+    if (left_.range().volume() == 0) {
+      for (auto&& tile_idx : right_.range()) {
+        auto tile_ord = right_.range().ordinal(tile_idx);
+        if (right_.is_local(tile_ord) && !right_.is_zero(tile_ord))
+          right_.discard(tile_ord);
+      }
+    }
+    if (right_.range().volume() == 0) {
+      for (auto&& tile_idx : left_.range()) {
+        auto tile_ord = left_.range().ordinal(tile_idx);
+        if (left_.is_local(tile_ord) && !left_.is_zero(tile_ord))
+          left_.discard(tile_ord);
+      }
+    }
+
     // Wait for child tensors to be evaluated, and process tasks while waiting.
     left_.wait();
    right_.wait();
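
Not part of the change itself, but for reference: the semantics that the new `k_ == 0` paths implement can be illustrated with a minimal standalone sketch (plain C++, no TiledArray types; the `gemm` helper is purely illustrative). When the contraction range has zero volume, the inner sum over `k` is empty, so the well-defined result is an all-zero tensor; that is exactly what the zero-filled tiles produced by the `k_ == 0` branch of `initialize(const DenseShape&)` materialize, and why no SUMMA iterations need to be scheduled.

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Naive row-major GEMM: C(m x n) = A(m x k) * B(k x n). When k == 0 the
// inner sum never accumulates anything, so every element of C is zero --
// the same result the k_ == 0 branch reproduces tile-by-tile.
void gemm(std::size_t m, std::size_t n, std::size_t k,
          const std::vector<double>& a, const std::vector<double>& b,
          std::vector<double>& c) {
  for (std::size_t i = 0; i < m; ++i)
    for (std::size_t j = 0; j < n; ++j) {
      double sum = 0.0;
      for (std::size_t p = 0; p < k; ++p) sum += a[i * k + p] * b[p * n + j];
      c[i * n + j] = sum;
    }
}

int main() {
  const std::size_t m = 3, n = 4, k = 0;   // contraction over a zero-volume range
  std::vector<double> a(m * k), b(k * n);  // both arguments are zero-volume
  std::vector<double> c(m * n, -1.0);      // pre-fill with garbage values
  gemm(m, n, k, a, b, c);
  for (double x : c) assert(x == 0.0);     // the nonzero-volume result is all zeros
  return 0;
}
```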