@@ -889,45 +889,63 @@ class Summa
 
   /// Initialize reduce tasks and construct broadcast groups
   ordinal_type initialize(const DenseShape&) {
-    // Construct static broadcast groups for dense arguments
-    const madness::DistributedID col_did(DistEvalImpl_::id(), 0ul);
-    if (k_ > 0) col_group_ = proc_grid_.make_col_group(col_did);
-    const madness::DistributedID row_did(DistEvalImpl_::id(), k_);
-    if (k_ > 0) row_group_ = proc_grid_.make_row_group(row_did);
+    // if contraction is over zero-volume range just initialize tiles to zero
+    if (k_ == 0) {
+      ordinal_type tile_count = 0;
+      const auto& tiles_range = this->trange().tiles_range();
+      for (auto&& tile_idx : tiles_range) {
+        auto tile_ord = tiles_range.ordinal(tile_idx);
+        if (this->is_local(tile_ord)) {
+          this->world().taskq.add([this, tile_ord, tile_idx]() {
+            this->set_tile(tile_ord,
+                           value_type(this->trange().tile(tile_idx),
+                                      typename value_type::value_type{}));
+          });
+          ++tile_count;
+        }
+      }
+      return tile_count;
+    } else {
+      // Construct static broadcast groups for dense arguments
+      const madness::DistributedID col_did(DistEvalImpl_::id(), 0ul);
+      col_group_ = proc_grid_.make_col_group(col_did);
+      const madness::DistributedID row_did(DistEvalImpl_::id(), k_);
+      row_group_ = proc_grid_.make_row_group(row_did);
 
 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
-    std::stringstream ss;
-    ss << "init: rank=" << TensorImpl_::world().rank() << "\ncol_group_=("
-       << col_did.first << ", " << col_did.second << ") { ";
-    for (ProcessID gproc = 0ul; gproc < col_group_.size(); ++gproc)
-      ss << col_group_.world_rank(gproc) << " ";
-    ss << "}\nrow_group_=(" << row_did.first << ", " << row_did.second
-       << ") { ";
-    for (ProcessID gproc = 0ul; gproc < row_group_.size(); ++gproc)
-      ss << row_group_.world_rank(gproc) << " ";
-    ss << "}\n";
-    printf(ss.str().c_str());
+      std::stringstream ss;
+      ss << "init: rank=" << TensorImpl_::world().rank() << "\ncol_group_=("
+         << col_did.first << ", " << col_did.second << ") { ";
+      for (ProcessID gproc = 0ul; gproc < col_group_.size(); ++gproc)
+        ss << col_group_.world_rank(gproc) << " ";
+      ss << "}\nrow_group_=(" << row_did.first << ", " << row_did.second
+         << ") { ";
+      for (ProcessID gproc = 0ul; gproc < row_group_.size(); ++gproc)
+        ss << row_group_.world_rank(gproc) << " ";
+      ss << "}\n";
+      printf(ss.str().c_str());
 
 #endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
 
-    // Allocate memory for the reduce pair tasks.
-    std::allocator<ReducePairTask<op_type>> alloc;
-    reduce_tasks_ = alloc.allocate(proc_grid_.local_size());
+      // Allocate memory for the reduce pair tasks.
+      std::allocator<ReducePairTask<op_type>> alloc;
+      reduce_tasks_ = alloc.allocate(proc_grid_.local_size());
 
-    // Iterate over all local tiles
-    const ordinal_type n = proc_grid_.local_size();
-    for (ordinal_type t = 0ul; t < n; ++t) {
-      // Initialize the reduction task
-      ReducePairTask<op_type>* MADNESS_RESTRICT const reduce_task =
-          reduce_tasks_ + t;
-      new (reduce_task) ReducePairTask<op_type>(TensorImpl_::world(), op_
+      // Iterate over all local tiles
+      const ordinal_type n = proc_grid_.local_size();
+      for (ordinal_type t = 0ul; t < n; ++t) {
+        // Initialize the reduction task
+        ReducePairTask<op_type>* MADNESS_RESTRICT const reduce_task =
+            reduce_tasks_ + t;
+        new (reduce_task) ReducePairTask<op_type>(TensorImpl_::world(), op_
 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
-                                                ,
-                                                nullptr, t
+                                                  ,
+                                                  nullptr, t
 #endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
-      );
-    }
+        );
+      }
 
-    return proc_grid_.local_size();
+      return proc_grid_.local_size();
+    }
   }
 
   /// Initialize reduce tasks
@@ -938,6 +956,9 @@ class Summa
     ss << "initialize rank=" << TensorImpl_::world().rank() << " tiles={ ";
 #endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
 
+    // fast return if there is no work to do
+    if (k_ == 0) return 0;
+
     // Allocate memory for the reduce pair tasks.
     std::allocator<ReducePairTask<op_type>> alloc;
     reduce_tasks_ = alloc.allocate(proc_grid_.local_size());
@@ -1705,60 +1726,79 @@ class Summa
           std::max(ProcGrid::size_type(2),
                    std::min(proc_grid_.proc_rows(), proc_grid_.proc_cols()));
 
-      // corner case: empty result
-      if (k_ == 0) return 0;
-
-      // Construct the first SUMMA iteration task
-      if (TensorImpl_::shape().is_dense()) {
-        // We cannot have more iterations than there are blocks in the k
-        // dimension
-        if (depth > k_) depth = k_;
-
-        // Modify the number of concurrent iterations based on the available
-        // memory.
-        depth = mem_bound_depth(depth, 0.0f, 0.0f);
-
-        // Enforce user defined depth bound
-        if (max_depth_) depth = std::min(depth, max_depth_);
-
-        TensorImpl_::world().taskq.add(
-            new DenseStepTask(shared_from_this(), depth));
-      } else {
-        // Increase the depth based on the amount of sparsity in an iteration.
+      // watch out for the corner case: a contraction over a zero-volume range
+      // producing a nonzero-volume result ... in that case there is nothing to
+      // do here; the appropriate initialization was performed in initialize()
+      if (k_ != 0) {
+        // Construct the first SUMMA iteration task
+        if (TensorImpl_::shape().is_dense()) {
+          // We cannot have more iterations than there are blocks in the k
+          // dimension
+          if (depth > k_) depth = k_;
+
+          // Modify the number of concurrent iterations based on the available
+          // memory.
+          depth = mem_bound_depth(depth, 0.0f, 0.0f);
+
+          // Enforce user defined depth bound
+          if (max_depth_) depth = std::min(depth, max_depth_);
+
+          TensorImpl_::world().taskq.add(
+              new DenseStepTask(shared_from_this(), depth));
+        } else {
+          // Increase the depth based on the amount of sparsity in an iteration.
 
-        // Get the sparsity fractions for the left- and right-hand arguments.
-        const float left_sparsity = left_.shape().sparsity();
-        const float right_sparsity = right_.shape().sparsity();
+          // Get the sparsity fractions for the left- and right-hand arguments.
+          const float left_sparsity = left_.shape().sparsity();
+          const float right_sparsity = right_.shape().sparsity();
 
-        // Compute the fraction of non-zero result tiles in a single SUMMA
-        // iteration.
-        const float frac_non_zero = (1.0f - std::min(left_sparsity, 0.9f)) *
-                                    (1.0f - std::min(right_sparsity, 0.9f));
+          // Compute the fraction of non-zero result tiles in a single SUMMA
+          // iteration.
+          const float frac_non_zero = (1.0f - std::min(left_sparsity, 0.9f)) *
+                                      (1.0f - std::min(right_sparsity, 0.9f));
 
-        // Compute the new depth based on sparsity of the arguments
-        depth =
-            float(depth) * (1.0f - 1.35638f * std::log2(frac_non_zero)) + 0.5f;
+          // Compute the new depth based on sparsity of the arguments
+          depth = float(depth) * (1.0f - 1.35638f * std::log2(frac_non_zero)) +
+                  0.5f;
 
-        // We cannot have more iterations than there are blocks in the k
-        // dimension
-        if (depth > k_) depth = k_;
+          // We cannot have more iterations than there are blocks in the k
+          // dimension
+          if (depth > k_) depth = k_;
 
-        // Modify the number of concurrent iterations based on the available
-        // memory and sparsity of the argument tensors.
-        depth = mem_bound_depth(depth, left_sparsity, right_sparsity);
+          // Modify the number of concurrent iterations based on the available
+          // memory and sparsity of the argument tensors.
+          depth = mem_bound_depth(depth, left_sparsity, right_sparsity);
 
-        // Enforce user defined depth bound
-        if (max_depth_) depth = std::min(depth, max_depth_);
+          // Enforce user defined depth bound
+          if (max_depth_) depth = std::min(depth, max_depth_);
 
-        TensorImpl_::world().taskq.add(
-            new SparseStepTask(shared_from_this(), depth));
-      }
+          TensorImpl_::world().taskq.add(
+              new SparseStepTask(shared_from_this(), depth));
+        }
+      }  // k_ != 0
     }
 
 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
     printf("eval: start wait children rank=%i\n", TensorImpl_::world().rank());
 #endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
 
+    // corner case: if left or right is zero-volume, no tasks were scheduled,
+    // so we need to discard all of their tiles manually
+    if (left_.range().volume() == 0) {
+      for (auto&& tile_idx : right_.range()) {
+        auto tile_ord = right_.range().ordinal(tile_idx);
+        if (right_.is_local(tile_ord) && !right_.is_zero(tile_ord))
+          right_.discard(tile_ord);
+      }
+    }
+    if (right_.range().volume() == 0) {
+      for (auto&& tile_idx : left_.range()) {
+        auto tile_ord = left_.range().ordinal(tile_idx);
+        if (left_.is_local(tile_ord) && !left_.is_zero(tile_ord))
+          left_.discard(tile_ord);
+      }
+    }
+
     // Wait for child tensors to be evaluated, and process tasks while waiting.
     left_.wait();
    right_.wait();
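
Not part of the change itself, but for reference: the semantics that the new `k_ == 0` paths implement can be illustrated with a minimal standalone sketch (plain C++, no TiledArray types; the `gemm` helper is purely illustrative). When the contraction range has zero volume, the inner sum over `k` is empty, so the well-defined result is an all-zero tensor; that is exactly what the zero-filled tiles produced by the `k_ == 0` branch of `initialize(const DenseShape&)` materialize, and why no SUMMA iterations need to be scheduled.

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Naive row-major GEMM: C(m x n) = A(m x k) * B(k x n). When k == 0 the
// inner sum never accumulates anything, so every element of C is zero --
// the same result the k_ == 0 branch reproduces tile-by-tile.
void gemm(std::size_t m, std::size_t n, std::size_t k,
          const std::vector<double>& a, const std::vector<double>& b,
          std::vector<double>& c) {
  for (std::size_t i = 0; i < m; ++i)
    for (std::size_t j = 0; j < n; ++j) {
      double sum = 0.0;
      for (std::size_t p = 0; p < k; ++p) sum += a[i * k + p] * b[p * n + j];
      c[i * n + j] = sum;
    }
}

int main() {
  const std::size_t m = 3, n = 4, k = 0;   // contraction over a zero-volume range
  std::vector<double> a(m * k), b(k * n);  // both arguments are zero-volume
  std::vector<double> c(m * n, -1.0);      // pre-fill with garbage values
  gemm(m, n, k, a, b, c);
  for (double x : c) assert(x == 0.0);     // the nonzero-volume result is all zeros
  return 0;
}
```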