From c6c220dcb9d7d8bac38ec4b78b26386c95fbf23e Mon Sep 17 00:00:00 2001 From: Tom Benson <30674819+benson31@users.noreply.github.com> Date: Thu, 10 Jun 2021 11:46:32 -0400 Subject: [PATCH] cleanup some unprotected GPU usage (#122) --- .../level1/Copy/TranslateBetweenGrids.hpp | 814 +++++++++--------- 1 file changed, 413 insertions(+), 401 deletions(-) diff --git a/include/El/blas_like/level1/Copy/TranslateBetweenGrids.hpp b/include/El/blas_like/level1/Copy/TranslateBetweenGrids.hpp index 5263d9539..c74cc2936 100644 --- a/include/El/blas_like/level1/Copy/TranslateBetweenGrids.hpp +++ b/include/El/blas_like/level1/Copy/TranslateBetweenGrids.hpp @@ -9,6 +9,7 @@ #ifndef EL_BLAS_COPY_TRANSLATEBETWEENGRIDS_HPP #define EL_BLAS_COPY_TRANSLATEBETWEENGRIDS_HPP +#include "core/environment/decl.hpp" namespace El { namespace copy @@ -245,9 +246,9 @@ void TranslateBetweenGridsAllreduceBasic { // /* - Logically, the values in B_vector are summed together and copied to A. + Logically, the values in B_vector are summed together and copied to A. This function is specific to the LBANN with implementation for specific cases. - + Subgrids in B_vector are assumed to evenly divide the grid in A. This is a basic allreduce implementation with no overlapped communication. */ @@ -281,7 +282,7 @@ void TranslateBetweenGridsAllreduceBasic const Int posInSubGrid = B->Grid().VCRank(); const Int myLocalRankB = posInSubGrid; const Int posInGrid = A.Grid().VCRank(); - A.Resize(m,n); + A.Resize(m,n); mpi::Comm const& viewingCommA = A.Grid().ViewingComm(); @@ -290,7 +291,7 @@ void TranslateBetweenGridsAllreduceBasic const Int rowGCD = GCD(rowStrideB, rowStrideA); const Int rowLCM = rowStrideB*rowStrideA / rowGCD; - //Asserting parent grid evenly divides number of columns + //Asserting parent grid evenly divides number of columns if(n%sizeA!=0) { LogicError("TranslateBetweenGridsAllreduceBasic: ", @@ -298,7 +299,7 @@ void TranslateBetweenGridsAllreduceBasic } // Parent Subgrid Size: 4 Child Subgrid Size: 3 - // Parent 0 1 2 3 0 1 2 3 0 1 2 3 + // Parent 0 1 2 3 0 1 2 3 0 1 2 3 // Child 0 1 2 0 1 2 0 1 2 0 1 2 std::vector require_data(sizeB,false); @@ -371,7 +372,7 @@ void TranslateBetweenGridsAllreduceBasic } Synchronize(syncInfoA); } - + } @@ -391,7 +392,7 @@ void TranslateBetweenGridsAllreduceOptComm Logically, the values in B_vector are summed together and copied to A. This function is specific to LBANN sub-graph parallelism with implementations for specific cases. - + Subgrids in B_vector are assumed to evenly divide the grid in A. This is a segmented allreduce implementation that requires a a communicator over corresponding processes in the subgrids. @@ -426,14 +427,14 @@ void TranslateBetweenGridsAllreduceOptComm const Int posInGrid = A.Grid().VCRank(); - A.Resize(m,n); - + A.Resize(m,n); + Int rowStrideA = A.RowStride(); const Int sizeA = A.Grid().VCSize(); const Int rowGCD = GCD(rowStrideB, rowStrideA); const Int rowLCM = rowStrideB*rowStrideA / rowGCD; - //Asserting parent grid evenly divides number of columns + //Asserting parent grid evenly divides number of columns if(n%sizeA!=0) { LogicError("TranslateBetweenGridsAllreduceOptComm: ", @@ -442,10 +443,10 @@ void TranslateBetweenGridsAllreduceOptComm // Parent Subgrid Size: 4 Child Subgrid Size: 3 - // Parent 0 1 2 3 0 1 2 3 0 1 2 3 + // Parent 0 1 2 3 0 1 2 3 0 1 2 3 // Child 0 1 2 0 1 2 0 1 2 0 1 2 - + const Int index_from = int(std::floor(posInGrid/sizeB)); SyncInfo syncInfoA = SyncInfoFromMatrix(A.LockedMatrix()); @@ -501,7 +502,7 @@ void TranslateBetweenGridsAllreduceOpt Logically, the values in B_vector are summed together and copied to A. This function is specific to LBANN sub-graph parallelism with implementations for specific cases. - + Subgrids in B_vector are assumed to evenly divide the grid in A. This is a segmented allreduce implementation that internally splits a communicator. Note that communicator splitting is expensive on GPU. @@ -538,8 +539,8 @@ void TranslateBetweenGridsAllreduceOpt const Int posInGrid = A.Grid().ViewingRank(); - A.Resize(m,n); - + A.Resize(m,n); + mpi::Comm const& viewingCommA = A.Grid().ViewingComm(); @@ -548,7 +549,7 @@ void TranslateBetweenGridsAllreduceOpt const Int rowGCD = GCD(rowStrideB, rowStrideA); const Int rowLCM = rowStrideB*rowStrideA / rowGCD; - //Asserting parent grid evenly divides number of columns + //Asserting parent grid evenly divides number of columns if(n%sizeA!=0) { LogicError("TranslateBetweenGridsAllreduceOpt: ", @@ -556,10 +557,10 @@ void TranslateBetweenGridsAllreduceOpt } // Parent Subgrid Size: 4 Child Subgrid Size: 3 - // Parent 0 1 2 3 0 1 2 3 0 1 2 3 + // Parent 0 1 2 3 0 1 2 3 0 1 2 3 // Child 0 1 2 0 1 2 0 1 2 0 1 2 - + const Int index_from = int(std::floor(posInGrid/sizeB)); @@ -614,11 +615,11 @@ void TranslateBetweenGridsAllreduce std::vector>>& B_Vector, int version) { - //Some better logic can be written here + //Some better logic can be written here // 0: Best Algo // 1: Basic Allreduce without communication overlap // 2: Opt Allreduce but have large overhead for GPU (spliting comm) - // + // if(version==0 || version == 1) @@ -644,20 +645,20 @@ void TranslateBetweenGridsAllreduce std::vector>>& B_Vector, mpi::Comm const& allreduceComm, SyncInfo & syncGeneral, int version) { - //Some better logic can be written here + //Some better logic can be written here // 0: Best Algo - // 1: if spliting comm is given then we have only one algo for now - // + // 1: if spliting comm is given then we have only one algo for now + // if(version==0 || version == 1) { TranslateBetweenGridsAllreduceOptComm(A, - B_Vector, + B_Vector, allreduceComm, syncGeneral); } - + else { LogicError("TranslateBetweenGridsAllreduce: ", @@ -671,15 +672,15 @@ void TranslateBetweenGridsScatterComm std::vector>>& B_Vector, int splitDim, mpi::Comm const& ScatterComm, SyncInfo & syncGeneral) { /* - Scatter data in Column-Major ordering along the last dimension + Scatter data in Column-Major ordering along the last dimension Last dimension should be divisible number of child layers - Size of B_vector is equal to the number of child layers + Size of B_vector is equal to the number of child layers + + Subgrids in B_vector are assumed to be subset of resources in A grid - Subgrids in B_vector are assumed to be subset of resources in A grid + Resources are assumed to be distribted equally among different subgrids - Resources are assumed to be distribted equally among different subgrids - */ EL_DEBUG_CSE @@ -701,7 +702,7 @@ void TranslateBetweenGridsScatterComm B_Vector[i]->Resize(int(m/numChildLayers),n); } - + if(m%splitDim != 0) { LogicError("TranslateBetweenGridsScatterComm: ", @@ -713,7 +714,7 @@ void TranslateBetweenGridsScatterComm "Split dimension must be divisible by number of children layers or number of splits"); } - //Asserting parent grid evenly divides number of columns + //Asserting parent grid evenly divides number of columns if(n%sizeA!=0) { LogicError("TranslateBetweenGridsScatterComm: ", @@ -755,19 +756,19 @@ void TranslateBetweenGridsScatterComm // Parent Subgrid Size: 4 Child Subgrid Size: 3 - // Parent 0 1 2 3 0 1 2 3 0 1 2 3 + // Parent 0 1 2 3 0 1 2 3 0 1 2 3 // Child 0 1 2 0 1 2 0 1 2 0 1 2 std::vector index_to_put(sizeA,-1); for(Int i = 0; i < int(rowLCM/sizeB); ++i) - { + { index_to_put[i] = i; } SyncInfo syncInfoB = SyncInfoFromMatrix(recvTransposedMatrix); - + int partialHeight = int(splitDim/ scatterCommSize); int partialChildHeight = int(partialHeight / numMatricesInSubGrid); @@ -776,11 +777,11 @@ void TranslateBetweenGridsScatterComm for(Int localDataRankA = 0; localDataRankA < int(rowLCM/sizeB); localDataRankA++) { - //comm is useless parameter in this function - //Aluminum infer comm from sync object - + //comm is useless parameter in this function + //Aluminum infer comm from sync object + + - mpi::Scatter((T *)transposedMatrix.Buffer(), sendCounts, (T *)recvTransposedMatrix.Buffer(), sendCounts, localDataRankA, ScatterComm, syncGeneral); @@ -799,7 +800,7 @@ void TranslateBetweenGridsScatterComm syncInfoB); } - + Synchronize(syncInfoB); @@ -817,15 +818,15 @@ void TranslateBetweenGridsScatterOptComm std::vector>>& B_Vector, int splitDim, mpi::Comm const& ScatterComm, SyncInfo & syncGeneral) { /* - Scatter data in Column-Major ordering along the last dimension + Scatter data in Column-Major ordering along the last dimension Last dimension should be divisible number of child layers - Size of B_vector is equal to the number of child layers + Size of B_vector is equal to the number of child layers - Subgrids in B_vector are assumed to be subset of resources in A grid + Subgrids in B_vector are assumed to be subset of resources in A grid - Resources are assumed to be distribted equally among different subgrids - + Resources are assumed to be distribted equally among different subgrids + */ EL_DEBUG_CSE @@ -839,7 +840,7 @@ void TranslateBetweenGridsScatterOptComm Int rowStrideA = A.RowStride(); - + const Int numChildLayers = int(B_Vector.size()); const Int sizeA = A.Grid().VCSize(); @@ -848,7 +849,7 @@ void TranslateBetweenGridsScatterOptComm B_Vector[i]->Resize(int(m/numChildLayers),n); } - + if(m%splitDim != 0) { LogicError("TranslateBetweenGridsScatterOptComm: ", @@ -860,7 +861,7 @@ void TranslateBetweenGridsScatterOptComm "Split dimension must be divisible by number of children layers or number of splits"); } - //Asserting parent grid evenly divides number of columns + //Asserting parent grid evenly divides number of columns if(n%sizeA!=0) { LogicError("TranslateBetweenGridsScatterOptComm: ", @@ -918,14 +919,14 @@ void TranslateBetweenGridsScatterOptComm // Parent Subgrid Size: 4 Child Subgrid Size: 3 - // Parent 0 1 2 3 0 1 2 3 0 1 2 3 + // Parent 0 1 2 3 0 1 2 3 0 1 2 3 // Child 0 1 2 0 1 2 0 1 2 0 1 2 std::vector index_to_put(sizeA,-1); for(Int i = 0; i < int(rowLCM/sizeB); ++i) - { + { index_to_put[i] = i; } @@ -955,7 +956,7 @@ void TranslateBetweenGridsScatterOptComm Synchronize(syncGeneral); - + for(Int childLayerSubGrid = 0; childLayerSubGrid < numMatricesInSubGrid; ++childLayerSubGrid) { @@ -988,14 +989,14 @@ void TranslateBetweenGridsSliceGatherOptComm std::vector>>& B_Vector, int splitDim, mpi::Comm const& gatherComm, SyncInfo & syncGeneral) { /* - Scatter data in Column-Major ordering along the last dimension + Scatter data in Column-Major ordering along the last dimension Last dimension should be divisible number of child layers - Size of B_vector is equal to the number of child layers + Size of B_vector is equal to the number of child layers + + Subgrids in B_vector are assumed to be subset of resources in A grid + Resources are assumed to be distribted equally among different subgrids - Subgrids in B_vector are assumed to be subset of resources in A grid - Resources are assumed to be distribted equally among different subgrids - - Uses Allgather to perform slice layer functionality as AllGather has better implementation + Uses Allgather to perform slice layer functionality as AllGather has better implementation */ EL_DEBUG_CSE const Int m = A.Height(); @@ -1006,7 +1007,7 @@ void TranslateBetweenGridsSliceGatherOptComm Int rowStrideA = A.RowStride(); - + const Int numChildLayers = int(B_Vector.size()); const Int sizeA = A.Grid().VCSize(); @@ -1015,7 +1016,7 @@ void TranslateBetweenGridsSliceGatherOptComm B_Vector[i]->Resize(int(m/numChildLayers),n); } - + if(m%splitDim != 0) { LogicError("TranslateBetweenGridsSliceGatherOptComm: ", @@ -1026,7 +1027,7 @@ void TranslateBetweenGridsSliceGatherOptComm LogicError("TranslateBetweenGridsSliceGatherOptComm: ", "Split dimension must be divisible by number of children layers or number of splits"); } - //Asserting parent grid evenly divides number of columns + //Asserting parent grid evenly divides number of columns if(n%sizeA!=0) { LogicError("TranslateBetweenGridsSliceGatherOptComm: ", @@ -1077,16 +1078,16 @@ void TranslateBetweenGridsSliceGatherOptComm // Parent Subgrid Size: 4 Child Subgrid Size: 3 - // Parent 0 1 2 3 0 1 2 3 0 1 2 3 + // Parent 0 1 2 3 0 1 2 3 0 1 2 3 // Child 0 1 2 0 1 2 0 1 2 0 1 2 std::vector index_to_put(sizeA,-1); - + for(Int i = 0; i < int(rowLCM/sizeB); ++i) - { + { index_to_put[i] = i; } @@ -1102,11 +1103,11 @@ void TranslateBetweenGridsSliceGatherOptComm } - mpi::AllGather(sendTransposedMatrix.Buffer(), mLocA*nLocA, - recvBuf, mLocA*nLocA, + mpi::AllGather(sendTransposedMatrix.Buffer(), mLocA*nLocA, + recvBuf, mLocA*nLocA, gatherComm, syncGeneral); - + Matrix tempMatrix(gatherCommSize , childLayerSplitHeight * nLocA * (mLocA/splitDim)); @@ -1141,16 +1142,16 @@ void TranslateBetweenGridsScatterCommParentSmall std::vector>>& B_Vector, int splitDim, mpi::Comm const& ScatterComm, SyncInfo & syncGeneral) { /* - Scatter data in Column-Major ordering along the last dimension + Scatter data in Column-Major ordering along the last dimension Last dimension should be divisible number of child layers - Size of B_vector is equal to the number of child layers + Size of B_vector is equal to the number of child layers A : 0 1 2 3 - B1: 0 1 + B1: 0 1 B2: 2 3 - B3: 4 5 - B4: 6 7 - + B3: 4 5 + B4: 6 7 + */ EL_DEBUG_CSE @@ -1173,17 +1174,17 @@ void TranslateBetweenGridsScatterCommParentSmall SyncInfo syncGeneralMetaData = SyncInfo(); const bool inAGrid = A.Participating(); - + Int recvMetaData[4]; Int metaData[4]; if(inAGrid) { - + metaData[0] = m; metaData[1] = n; metaData[2] = mLocA; - metaData[3] = nLocA; + metaData[3] = nLocA; } else { @@ -1193,14 +1194,14 @@ void TranslateBetweenGridsScatterCommParentSmall metaData[3] = 0; } - - + + const std::vector sendMetaData (metaData,metaData + 4 ); - + SyncInfo syncInfoA = SyncInfoFromMatrix(A.LockedMatrix()); Synchronize(syncGeneral); - + mpi::AllReduce( sendMetaData.data(), recvMetaData, 4, mpi::MAX, viewingCommA,syncGeneralMetaData); Synchronize(syncGeneralMetaData); @@ -1216,7 +1217,7 @@ void TranslateBetweenGridsScatterCommParentSmall B_Vector[i]->Resize(int(m/numChildLayers),n); } - + if(m%splitDim != 0) { LogicError("TranslateBetweenGridsScatterCommParentSmall: ", @@ -1227,7 +1228,7 @@ void TranslateBetweenGridsScatterCommParentSmall LogicError("TranslateBetweenGridsScatterCommParentSmall: ", "Split dimension must be divisible by number of children layers or number of splits"); } - //Asserting parent grid evenly divides number of columns + //Asserting parent grid evenly divides number of columns if(n%sizeA!=0) { LogicError("TranslateBetweenGridsScatterCommParentSmall: ", @@ -1263,7 +1264,7 @@ void TranslateBetweenGridsScatterCommParentSmall Transpose(conversionMatrix,transposedMatrix); } - + conversionMatrix.Resize(int(mLocA/scatterCommSize),nLocA); @@ -1283,19 +1284,19 @@ void TranslateBetweenGridsScatterCommParentSmall // Parent Subgrid Size: 4 Child Subgrid Size: 3 - // Parent 0 1 2 3 0 1 2 3 0 1 2 3 + // Parent 0 1 2 3 0 1 2 3 0 1 2 3 // Child 0 1 2 0 1 2 0 1 2 0 1 2 std::vector index_to_put(sizeA,-1); for(Int i = 0; i < int(rowLCM/sizeB); ++i) - { + { index_to_put[i] = i; } SyncInfo syncInfoB = SyncInfoFromMatrix(recvTransposedMatrix); - + int partialHeight = int(splitDim/ scatterCommSize); int partialChildHeight = int(partialHeight / numMatricesInSubGrid); @@ -1304,11 +1305,11 @@ void TranslateBetweenGridsScatterCommParentSmall for(Int localDataRankA = 0; localDataRankA < int(rowLCM/sizeB); localDataRankA++) { - //comm is useless parameter in this function - //Aluminum infer comm from sync object - + //comm is useless parameter in this function + //Aluminum infer comm from sync object + + - mpi::Scatter((T *)transposedMatrix.Buffer(), sendCounts, (T *)recvTransposedMatrix.Buffer(), sendCounts, localDataRankA, ScatterComm, syncGeneral); @@ -1327,7 +1328,7 @@ void TranslateBetweenGridsScatterCommParentSmall syncInfoB); } - + Synchronize(syncInfoB); @@ -1344,17 +1345,17 @@ void TranslateBetweenGridsSliceGatherParentSmall std::vector>>& B_Vector, int splitDim, mpi::Comm const& gatherComm, SyncInfo & syncGeneral) { /* - Scatter data in Column-Major ordering along the last dimension + Scatter data in Column-Major ordering along the last dimension Last dimension should be divisible number of child layers - Size of B_vector is equal to the number of child layers + Size of B_vector is equal to the number of child layers A : 0 1 2 3 - B1: 0 1 + B1: 0 1 B2: 2 3 - B3: 4 5 - B4: 6 7 - - Uses Allgather to perform slice layer functionality as AllGather has better implementation + B3: 4 5 + B4: 6 7 + + Uses Allgather to perform slice layer functionality as AllGather has better implementation */ EL_DEBUG_CSE Int m = A.Height(); @@ -1366,13 +1367,13 @@ void TranslateBetweenGridsSliceGatherParentSmall - + const Int numChildLayers = int(B_Vector.size()); const Int sizeA = A.Grid().VCSize(); - - + + if(m%splitDim != 0) { LogicError("TranslateBetweenGridsSliceGatherParentSmall: ", @@ -1386,17 +1387,17 @@ void TranslateBetweenGridsSliceGatherParentSmall SyncInfo syncGeneralMetaData = SyncInfo(); const bool inAGrid = A.Participating(); - + Int recvMetaData[6]; Int metaData[6]; if(inAGrid) { - + metaData[0] = m; metaData[1] = n; metaData[2] = mLocA; - metaData[3] = nLocA; + metaData[3] = nLocA; } else { @@ -1406,14 +1407,14 @@ void TranslateBetweenGridsSliceGatherParentSmall metaData[3] = 0; } - - + + const std::vector sendMetaData (metaData,metaData + 4 ); - + Synchronize(syncGeneral); - + mpi::AllReduce( sendMetaData.data(), recvMetaData, 4, mpi::MAX, viewingCommA,syncGeneralMetaData); Synchronize(syncGeneralMetaData); @@ -1424,7 +1425,7 @@ void TranslateBetweenGridsSliceGatherParentSmall nLocA = recvMetaData[3]; - //Asserting parent grid evenly divides number of columns + //Asserting parent grid evenly divides number of columns if(n%sizeA!=0) { LogicError("TranslateBetweenGridsSliceGatherParentSmall: ", @@ -1438,8 +1439,8 @@ void TranslateBetweenGridsSliceGatherParentSmall } const Int gatherCommSize = mpi::Size( gatherComm ); - //number of relevant ranks from which data should be received - + //number of relevant ranks from which data should be received + const int numMatricesInSubGrid = int(numChildLayers / gatherCommSize); std::vector indexBVec; @@ -1475,7 +1476,7 @@ void TranslateBetweenGridsSliceGatherParentSmall Transpose(A.LockedMatrix(),sendTransposedMatrix); } - + @@ -1500,7 +1501,7 @@ void TranslateBetweenGridsSliceGatherParentSmall // Parent Subgrid Size: 4 Child Subgrid Size: 3 - // Parent 0 1 2 3 0 1 2 3 0 1 2 3 + // Parent 0 1 2 3 0 1 2 3 0 1 2 3 // Child 0 1 2 0 1 2 0 1 2 0 1 2 @@ -1518,8 +1519,8 @@ void TranslateBetweenGridsSliceGatherParentSmall } - mpi::AllGather(sendTransposedMatrix.Buffer(), mLocA*nLocA, - recvBuf, mLocA*nLocA, + mpi::AllGather(sendTransposedMatrix.Buffer(), mLocA*nLocA, + recvBuf, mLocA*nLocA, gatherComm, syncGeneral); Matrix tempMatrix(numRanksToRecv , childLayerSplitHeight * nLocA * (mLocA/splitDim)); @@ -1554,17 +1555,17 @@ void TranslateBetweenGridsScatterCommSameSizeSubGrids std::vector>>& B_Vector, int splitDim, mpi::Comm const& ScatterComm, SyncInfo & syncGeneral) { /* - Scatter data in Column-Major ordering along the last dimension + Scatter data in Column-Major ordering along the last dimension Last dimension should be divisible number of child layers - Size of B_vector is equal to the number of child layers + Size of B_vector is equal to the number of child layers - Subgrids in B_vector are assumed to be subset of resources in A grid + Subgrids in B_vector are assumed to be subset of resources in A grid - Parent Grid: 1 2 3 4 (must match the ranks of subgrid1) - Subgrid1: 1 2 3 4 - Subgrid2: 5 6 7 8 - Subgrid3: 9 10 11 12 - Subgrid4: 13 14 15 16 + Parent Grid: 1 2 3 4 (must match the ranks of subgrid1) + Subgrid1: 1 2 3 4 + Subgrid2: 5 6 7 8 + Subgrid3: 9 10 11 12 + Subgrid4: 13 14 15 16 */ EL_DEBUG_CSE Int m = A.Height(); @@ -1586,18 +1587,18 @@ void TranslateBetweenGridsScatterCommSameSizeSubGrids SyncInfo syncGeneralMetaData = SyncInfo(); const bool inAGrid = A.Participating(); - + Int recvMetaData[6]; Int metaData[6]; if(inAGrid) { - + metaData[0] = m; metaData[1] = n; metaData[2] = mLocA; - metaData[3] = nLocA; + metaData[3] = nLocA; } else { @@ -1606,13 +1607,13 @@ void TranslateBetweenGridsScatterCommSameSizeSubGrids metaData[2] = 0; metaData[3] = 0; } - + const std::vector sendMetaData (metaData,metaData + 4 ); - + Synchronize(syncGeneral); - + mpi::AllReduce( sendMetaData.data(), recvMetaData, 4, mpi::MAX, viewingCommA,syncGeneralMetaData); Synchronize(syncGeneralMetaData); @@ -1630,15 +1631,15 @@ void TranslateBetweenGridsScatterCommSameSizeSubGrids } - if( (inAGrid ==true && B_Vector[0]->Participating()==false) || + if( (inAGrid ==true && B_Vector[0]->Participating()==false) || (inAGrid ==false && B_Vector[0]->Participating()==true) ) { - //A grid should have same ranks as of subgrid1 + //A grid should have same ranks as of subgrid1 LogicError("TranslateBetweenGridsScatteSameSizeSubGrids: ", "Owning ranks in Grid A should be same as of Owning Ranks in Subgrid 1"); } - + if(m%splitDim != 0) { LogicError("TranslateBetweenGridsScatteSameSizeSubGrids: ", @@ -1650,7 +1651,7 @@ void TranslateBetweenGridsScatterCommSameSizeSubGrids "Split dimension must be divisible by number of children layers or number of splits"); } - //Asserting parent grid evenly divides number of columns + //Asserting parent grid evenly divides number of columns if(n%sizeA!=0) { LogicError("TranslateBetweenGridsScatteSameSizeSubGrids: ", @@ -1689,7 +1690,7 @@ void TranslateBetweenGridsScatterCommSameSizeSubGrids Transpose(conversionMatrix,transposedMatrix); - + } @@ -1699,9 +1700,9 @@ void TranslateBetweenGridsScatterCommSameSizeSubGrids conversionMatrix.Resize((mLocA/splitDim) * nLocA, splitDim/numChildLayers); - - mpi::Scatter( (T *)transposedMatrix.Buffer(), sendCounts, - recvBuf, sendCounts, + + mpi::Scatter( (T *)transposedMatrix.Buffer(), sendCounts, + recvBuf, sendCounts, 0, ScatterComm, syncGeneral); @@ -1742,17 +1743,17 @@ void TranslateBetweenGridsSliceBroadcastCommSameSizeSubGrids std::vector>>& B_Vector, int splitDim, mpi::Comm const& ScatterComm, SyncInfo & syncGeneral) { /* - Scatter data in Column-Major ordering along the last dimension + Scatter data in Column-Major ordering along the last dimension Last dimension should be divisible number of child layers - Size of B_vector is equal to the number of child layers + Size of B_vector is equal to the number of child layers - Subgrids in B_vector are assumed to be subset of resources in A grid + Subgrids in B_vector are assumed to be subset of resources in A grid - Parent Grid: 1 2 3 4 (must match the ranks of subgrid1) - Subgrid1: 1 2 3 4 - Subgrid2: 5 6 7 8 - Subgrid3: 9 10 11 12 - Subgrid4: 13 14 15 16 + Parent Grid: 1 2 3 4 (must match the ranks of subgrid1) + Subgrid1: 1 2 3 4 + Subgrid2: 5 6 7 8 + Subgrid3: 9 10 11 12 + Subgrid4: 13 14 15 16 */ EL_DEBUG_CSE Int m = A.Height(); @@ -1774,18 +1775,18 @@ void TranslateBetweenGridsSliceBroadcastCommSameSizeSubGrids SyncInfo syncGeneralMetaData = SyncInfo(); const bool inAGrid = A.Participating(); - + Int recvMetaData[6]; Int metaData[6]; if(inAGrid) { - + metaData[0] = m; metaData[1] = n; metaData[2] = mLocA; - metaData[3] = nLocA; + metaData[3] = nLocA; } else { @@ -1794,13 +1795,13 @@ void TranslateBetweenGridsSliceBroadcastCommSameSizeSubGrids metaData[2] = 0; metaData[3] = 0; } - + const std::vector sendMetaData (metaData,metaData + 4 ); - + Synchronize(syncGeneral); - + mpi::AllReduce( sendMetaData.data(), recvMetaData, 4, mpi::MAX, viewingCommA,syncGeneralMetaData); Synchronize(syncGeneralMetaData); @@ -1818,15 +1819,15 @@ void TranslateBetweenGridsSliceBroadcastCommSameSizeSubGrids } - if( (inAGrid ==true && B_Vector[0]->Participating()==false) || + if( (inAGrid ==true && B_Vector[0]->Participating()==false) || (inAGrid ==false && B_Vector[0]->Participating()==true) ) { - //A gird should have same ranks as of subgrid1 + //A gird should have same ranks as of subgrid1 LogicError("TranslateBetweenGridsScatteSameSizeSubGrids: ", "Owning ranks in Grid A should be same as of Owning Ranks in Subgrid 1"); } - + if(m%splitDim != 0) { LogicError("TranslateBetweenGridsScatteSameSizeSubGrids: ", @@ -1837,7 +1838,7 @@ void TranslateBetweenGridsSliceBroadcastCommSameSizeSubGrids LogicError("TranslateBetweenGridsScatteSameSizeSubGrids: ", "Split dimension must be divisible by number of children layers or number of splits"); } - //Asserting parent grid evenly divides number of columns + //Asserting parent grid evenly divides number of columns if(n%sizeA!=0) { LogicError("TranslateBetweenGridsScatteSameSizeSubGrids: ", @@ -1876,11 +1877,11 @@ void TranslateBetweenGridsSliceBroadcastCommSameSizeSubGrids Transpose(conversionMatrix,transposedMatrix); - + } - + @@ -1897,8 +1898,8 @@ void TranslateBetweenGridsSliceBroadcastCommSameSizeSubGrids mpi::Broadcast((T *)transposedMatrix.Buffer(), sendCounts, 0, ScatterComm, syncGeneral); - - + + Synchronize(syncGeneral); int perChildLayerSize = (mLocA/splitDim) * nLocA * (splitDim/numChildLayers); @@ -1939,15 +1940,15 @@ void TranslateBetweenGridsSliceConcatAlongFirstDim std::vector>>& B_Vector, int splitDim, mpi::Comm const& gatherComm, SyncInfo & syncGeneral) { /* - Scatter data in Column-Major ordering along the last dimension + Scatter data in Column-Major ordering along the last dimension when given data is sliced along the first dimension Last dimension should be divisible number of child layers - Size of B_vector is equal to the number of child layers + Size of B_vector is equal to the number of child layers + + Subgrids in B_vector are assumed to be subset of resources in A grid + Resources are assumed to be distribted equally among different subgrids - Subgrids in B_vector are assumed to be subset of resources in A grid - Resources are assumed to be distribted equally among different subgrids - - Uses Allgather to perform slice layer functionality as AllGather has better implementation + Uses Allgather to perform slice layer functionality as AllGather has better implementation */ EL_DEBUG_CSE @@ -1961,7 +1962,7 @@ void TranslateBetweenGridsSliceConcatAlongFirstDim - + const Int numChildLayers = int(B_Vector.size()); const Int sizeA = A.Grid().VCSize(); @@ -1970,7 +1971,7 @@ void TranslateBetweenGridsSliceConcatAlongFirstDim B_Vector[i]->Resize(int(m/numChildLayers),n); } - + if(m%splitDim != 0) { LogicError("TranslateBetweenGridsSliceConcatAlongFirstDim: ", @@ -1981,7 +1982,7 @@ void TranslateBetweenGridsSliceConcatAlongFirstDim LogicError("TranslateBetweenGridsSliceConcatAlongFirstDim: ", "Split dimension must be divisible by number of children layers or number of splits"); } - //Asserting parent grid evenly divides number of columns + //Asserting parent grid evenly divides number of columns if(n%sizeA!=0) { LogicError("TranslateBetweenGridsSliceConcatAlongFirstDim: ", @@ -2022,7 +2023,7 @@ void TranslateBetweenGridsSliceConcatAlongFirstDim std::vector index_to_put(sizeA,-1); - + const Int childLayerSplitHeight = splitDim / numChildLayers; @@ -2032,17 +2033,17 @@ void TranslateBetweenGridsSliceConcatAlongFirstDim std::vector> conversionMatrixVector; for(Int childLayerSubGrid = 0; childLayerSubGrid < numMatricesInSubGrid; ++childLayerSubGrid) { - + // conversionMatrixVector.push_back(Matrix(nLocA, (mLocA*gatherCommSize)/numChildLayers)); conversionMatrixVector.push_back(Matrix(nLocA, (mLocA)/numChildLayers)); } - mpi::AllGather(sendTransposedMatrix.Buffer(), mLocA*nLocA, - recvTransposedMatrix.Buffer(), mLocA*nLocA, + mpi::AllGather(sendTransposedMatrix.Buffer(), mLocA*nLocA, + recvTransposedMatrix.Buffer(), mLocA*nLocA, gatherComm, syncGeneral); - + @@ -2089,34 +2090,34 @@ void TranslateBetweenGridsScatter std::vector>>& B_Vector, int splitDim, mpi::Comm const& ScatterComm, SyncInfo & syncGeneral , int version) { - //Some better logic can be written here + //Some better logic can be written here // 0: Best Algo // 1: Basic Scatter // 2: Broadcast - // + // DistMatrix* B = dynamic_cast*>( &(*B_Vector[0])); - + const Int sizeA = A.Grid().VCSize(); const Int sizeB = B->Grid().VCSize(); const Int commSize = El::mpi::Size(ScatterComm); if(sizeA == sizeB) { - //SubGrid VCSizc is equal to Parent Grid + //SubGrid VCSizc is equal to Parent Grid if(version==0 || version == 3) { TranslateBetweenGridsScatterCommSameSizeSubGrids(A, - B_Vector, - splitDim, + B_Vector, + splitDim, ScatterComm, syncGeneral); } else if (version == 2 || version == 1 ) { TranslateBetweenGridsSliceBroadcastCommSameSizeSubGrids(A, - B_Vector, - splitDim, + B_Vector, + splitDim, ScatterComm, syncGeneral); } @@ -2133,16 +2134,16 @@ void TranslateBetweenGridsScatter if(version==0 || version == 3) { TranslateBetweenGridsScatterCommParentSmall(A, - B_Vector, - splitDim, + B_Vector, + splitDim, ScatterComm, syncGeneral); } else if (version == 2 || version == 1 ) { TranslateBetweenGridsSliceGatherParentSmall(A, - B_Vector, - splitDim, + B_Vector, + splitDim, ScatterComm, syncGeneral); } @@ -2155,30 +2156,30 @@ void TranslateBetweenGridsScatter // 0: Best Algo // 1: Basic Scatter // 2: Scater with Opt (interleave for transpose) - // 3: Gather + // 3: Gather else { if(version==0 || version == 3) { TranslateBetweenGridsSliceGatherOptComm(A, - B_Vector, - splitDim, + B_Vector, + splitDim, ScatterComm, syncGeneral); } else if (version == 2) { TranslateBetweenGridsScatterOptComm(A, - B_Vector, - splitDim, + B_Vector, + splitDim, ScatterComm, syncGeneral); } else if(version == 1) { TranslateBetweenGridsScatterOptComm(A, - B_Vector, - splitDim, + B_Vector, + splitDim, ScatterComm, syncGeneral); } @@ -2188,7 +2189,7 @@ void TranslateBetweenGridsScatter "Invalid version, it has to be [0,1,2,3], 0: Default"); } } - + } template @@ -2199,13 +2200,13 @@ void TranslateBetweenGridsSliceCol /* Scatter data in Column-Major ordering along the Columns of Elemental matrix - Used to scatter data from input layer to subgrids in Topology aware design - + Used to scatter data from input layer to subgrids in Topology aware design + Size of B_vector is equal to the Number of subgraph subgrids (not number of branches in subgrpah) - Subgrids in B_vector are assumed to be subset of resources in A grid + Subgrids in B_vector are assumed to be subset of resources in A grid - Resources are assumed to be distributed equally among different subgrids + Resources are assumed to be distributed equally among different subgrids It is a local operation. No Communication needed. */ @@ -2233,7 +2234,11 @@ void TranslateBetweenGridsSliceCol } else { - SyncInfo syncInfoB; +#ifdef HYDROGEN_HAVE_GPU + auto const& syncInfoB = + SyncInfoFromMatrix( + dynamic_cast const&>( + B.LockedMatrix())); copy::util::InterleaveMatrix( mLocA, nLocA, @@ -2241,8 +2246,11 @@ void TranslateBetweenGridsSliceCol B.Buffer(), 1, mLocA, syncInfoB); +#else + LogicError("Invalid device"); +#endif } - + } @@ -2254,13 +2262,13 @@ void TranslateBetweenGridsSliceColVector /* Scatter data in Column-Major ordering along the Columns of Elemental matrix - Used to scatter data from input layer to subgrids in Topology aware design - + Used to scatter data from input layer to subgrids in Topology aware design + Size of B_vector is equal to the Number of subgraph subgrids (not number of branches in subgrpah) - Subgrids in B_vector are assumed to be subset of resources in A grid + Subgrids in B_vector are assumed to be subset of resources in A grid - Resources are assumed to be distribted equally among different subgrids + Resources are assumed to be distribted equally among different subgrids It is a local operation. No Communication needed. */ @@ -2311,22 +2319,22 @@ void TranslateBetweenGridsGatherComm std::vector>>& B_Vector, int splitDim, mpi::Comm const& gatherComm, SyncInfo & syncGeneral) { /* - Gather data in Column-Major ordering along the last dimension - - Size of B_vector is equal to the number of parent layers + Gather data in Column-Major ordering along the last dimension + + Size of B_vector is equal to the number of parent layers - Subgrids in B_vector are assumed to be subset of resources in A grid + Subgrids in B_vector are assumed to be subset of resources in A grid - Resources are assumed to be distribted equally among different subgrids + Resources are assumed to be distribted equally among different subgrids - or + or A : 0 1 2 3 - B1: 0 1 + B1: 0 1 B2: 2 3 - B3: 4 5 - B4: 6 7 - + B3: 4 5 + B4: 6 7 + */ EL_DEBUG_CSE @@ -2357,12 +2365,12 @@ void TranslateBetweenGridsGatherComm - + const Int sizeA = A.Grid().VCSize(); const Int index_from = int(std::floor(posInGrid/sizeB)); - A.Resize(m*numParentLayers,n); + A.Resize(m*numParentLayers,n); SyncInfo syncInfoA = SyncInfoFromMatrix(A.LockedMatrix()); @@ -2387,17 +2395,17 @@ void TranslateBetweenGridsGatherComm "Height in B matrix must be divisible by splitDim"); } - //Asserting parent grid evenly divides number of columns + //Asserting parent grid evenly divides number of columns if(n%sizeA!=0) { LogicError("TranslateBetweenGridsGatherComm: ", "Number of columns should be evenly divided by the size of the grid A (parent grid)"); } const int totalSizeComm = mLocB * numParentLayers * int(n/sizeB); - + const int maxSendSize = mLocB*nLocB * numMatricesInSubGrid; - simple_buffer send_buf(maxSendSize, syncInfoA); + simple_buffer send_buf(maxSendSize, syncInfoA); T* sendBuf = send_buf.data(); @@ -2418,8 +2426,8 @@ void TranslateBetweenGridsGatherComm } - mpi::AllGather(sendBuf, mLocB*nLocB*numMatricesInSubGrid, - recvTransposedMatrix.Buffer(), mLocB*nLocB*numMatricesInSubGrid, + mpi::AllGather(sendBuf, mLocB*nLocB*numMatricesInSubGrid, + recvTransposedMatrix.Buffer(), mLocB*nLocB*numMatricesInSubGrid, gatherComm, syncGeneral); @@ -2436,7 +2444,7 @@ void TranslateBetweenGridsGatherComm 1, m*numParentLayers, syncInfoA); } - + @@ -2449,17 +2457,17 @@ void TranslateBetweenGridsGatherOptComm std::vector>>& B_Vector, int splitDim, mpi::Comm const& gatherComm, SyncInfo & syncGeneral) { /* - Gather data in Column-Major ordering along the last dimension - - Size of B_vector is equal to the number of parent layers + Gather data in Column-Major ordering along the last dimension + + Size of B_vector is equal to the number of parent layers - Subgrids in B_vector are assumed to be subset of resources in A grid + Subgrids in B_vector are assumed to be subset of resources in A grid + + Resources are assumed to be distribted equally among different subgrids - Resources are assumed to be distribted equally among different subgrids - */ - // This Function has some bugs in Interleave function + // This Function has some bugs in Interleave function EL_DEBUG_CSE std::vector indexBVec; @@ -2481,7 +2489,7 @@ void TranslateBetweenGridsGatherOptComm const Int indexB = indexBVec[0]; DistMatrix* B = dynamic_cast*>( &(*B_Vector[indexB])); - + const Int sizeB = B->Grid().VCSize(); const Int m = B->Height(); @@ -2491,12 +2499,12 @@ void TranslateBetweenGridsGatherOptComm const Int posInGrid = A.Grid().VCRank(); - + const Int sizeA = A.Grid().VCSize(); const Int index_from = int(std::floor(posInGrid/sizeB)); - A.Resize(m*numParentLayers,n); + A.Resize(m*numParentLayers,n); SyncInfo syncInfoA = SyncInfoFromMatrix(A.LockedMatrix()); @@ -2523,11 +2531,11 @@ void TranslateBetweenGridsGatherOptComm } const int totalSizeComm = mLocB * numParentLayers * int(n/sizeB); - + const int maxSendSize = mLocB*nLocB * numMatricesInSubGrid; - simple_buffer send_buf(maxSendSize, syncGeneral); + simple_buffer send_buf(maxSendSize, syncGeneral); T* sendBuf = send_buf.data(); @@ -2537,20 +2545,20 @@ void TranslateBetweenGridsGatherOptComm for(Int parentLayerSubGrid = 0; parentLayerSubGrid < numMatricesInSubGrid; ++parentLayerSubGrid) { - + copy::util::InterleaveMatrix( splitDim, int((mLocB*nLocB)/splitDim), dynamic_cast*>( &(*B_Vector[indexBVec[parentLayerSubGrid]]))->LockedBuffer() , 1, splitDim, sendBuf + parentLayerSubGrid*mLocB*nLocB, - int((mLocB*nLocB)/splitDim), 1, + int((mLocB*nLocB)/splitDim), 1, syncInfoBVector[parentLayerSubGrid]); } Synchronize(syncGeneral); - mpi::AllGather(sendBuf, mLocB*nLocB*numMatricesInSubGrid, - recvTransposedMatrix.Buffer(), mLocB*nLocB*numMatricesInSubGrid, + mpi::AllGather(sendBuf, mLocB*nLocB*numMatricesInSubGrid, + recvTransposedMatrix.Buffer(), mLocB*nLocB*numMatricesInSubGrid, gatherComm, syncGeneral); @@ -2577,17 +2585,17 @@ void TranslateBetweenGridsGatherCommSameSizeSubGrids std::vector>>& B_Vector, int splitDim, mpi::Comm const& gatherComm, SyncInfo & syncGeneral) { /* - Gather data in Column-Major ordering along the last dimension - - Size of B_vector is equal to the number of parent layers - - Parent Grid: 1 2 3 4 (must match the ranks of subgrid1) - Subgrid1: 1 2 3 4 - Subgrid2: 5 6 7 8 - Subgrid3: 9 10 11 12 - Subgrid4: 13 14 15 16 - - + Gather data in Column-Major ordering along the last dimension + + Size of B_vector is equal to the number of parent layers + + Parent Grid: 1 2 3 4 (must match the ranks of subgrid1) + Subgrid1: 1 2 3 4 + Subgrid2: 5 6 7 8 + Subgrid3: 9 10 11 12 + Subgrid4: 13 14 15 16 + + */ EL_DEBUG_CSE @@ -2618,12 +2626,12 @@ void TranslateBetweenGridsGatherCommSameSizeSubGrids - + const Int sizeA = A.Grid().VCSize(); - A.Resize(m*numParentLayers,n); + A.Resize(m*numParentLayers,n); SyncInfo syncInfoA = SyncInfoFromMatrix(A.LockedMatrix()); @@ -2631,10 +2639,10 @@ void TranslateBetweenGridsGatherCommSameSizeSubGrids const int numMatricesInSubGrid = int(numParentLayers / gatherCommSize); - if( (inAGrid ==true && B_Vector[0]->Participating()==false) || + if( (inAGrid ==true && B_Vector[0]->Participating()==false) || (inAGrid ==false && B_Vector[0]->Participating()==true) ) { - //A gird should have same ranks as of subgrid1 + //A gird should have same ranks as of subgrid1 LogicError("TranslateBetweenGridsGatherCommSameSizeSubGrids: ", "Owning ranks in Grid A/Parent Grid should be same as of Owning Ranks in Subgrid 1"); @@ -2659,10 +2667,10 @@ void TranslateBetweenGridsGatherCommSameSizeSubGrids } const int totalSizeComm = mLocB * numParentLayers * int(n/sizeB); - + const int maxSendSize = mLocB*nLocB * numMatricesInSubGrid; - simple_buffer send_buf(maxSendSize, syncInfoA); + simple_buffer send_buf(maxSendSize, syncInfoA); T* sendBuf = send_buf.data(); @@ -2682,8 +2690,8 @@ void TranslateBetweenGridsGatherCommSameSizeSubGrids } - mpi::Gather(sendBuf, mLocB*nLocB*numMatricesInSubGrid, - recvTransposedMatrix.Buffer(), mLocB*nLocB*numMatricesInSubGrid, + mpi::Gather(sendBuf, mLocB*nLocB*numMatricesInSubGrid, + recvTransposedMatrix.Buffer(), mLocB*nLocB*numMatricesInSubGrid, 0, gatherComm, syncGeneral); @@ -2713,17 +2721,17 @@ void TranslateBetweenGridsAllGatherCommSameSizeSubGrids std::vector>>& B_Vector, int splitDim, mpi::Comm const& gatherComm, SyncInfo & syncGeneral) { /* - Gather data in Column-Major ordering along the last dimension - - Size of B_vector is equal to the number of parent layers - - Parent Grid: 1 2 3 4 (must match the ranks of subgrid1) - Subgrid1: 1 2 3 4 - Subgrid2: 5 6 7 8 - Subgrid3: 9 10 11 12 - Subgrid4: 13 14 15 16 - - + Gather data in Column-Major ordering along the last dimension + + Size of B_vector is equal to the number of parent layers + + Parent Grid: 1 2 3 4 (must match the ranks of subgrid1) + Subgrid1: 1 2 3 4 + Subgrid2: 5 6 7 8 + Subgrid3: 9 10 11 12 + Subgrid4: 13 14 15 16 + + */ EL_DEBUG_CSE @@ -2754,12 +2762,12 @@ void TranslateBetweenGridsAllGatherCommSameSizeSubGrids - + const Int sizeA = A.Grid().VCSize(); - A.Resize(m*numParentLayers,n); + A.Resize(m*numParentLayers,n); SyncInfo syncInfoA = SyncInfoFromMatrix(A.LockedMatrix()); @@ -2767,10 +2775,10 @@ void TranslateBetweenGridsAllGatherCommSameSizeSubGrids const int numMatricesInSubGrid = int(numParentLayers / gatherCommSize); - if( (inAGrid ==true && B_Vector[0]->Participating()==false) || + if( (inAGrid ==true && B_Vector[0]->Participating()==false) || (inAGrid ==false && B_Vector[0]->Participating()==true) ) { - //A gird should have same ranks as of subgrid1 + //A gird should have same ranks as of subgrid1 LogicError("TranslateBetweenGridsAllGatherCommSameSizeSubGrids: ", "Owning ranks in Grid A/Parent Grid should be same as of Owning Ranks in Subgrid 1"); @@ -2795,10 +2803,10 @@ void TranslateBetweenGridsAllGatherCommSameSizeSubGrids } const int totalSizeComm = mLocB * numParentLayers * int(n/sizeB); - + const int maxSendSize = mLocB*nLocB * numMatricesInSubGrid; - simple_buffer send_buf(maxSendSize, syncInfoA); + simple_buffer send_buf(maxSendSize, syncInfoA); T* sendBuf = send_buf.data(); @@ -2818,8 +2826,8 @@ void TranslateBetweenGridsAllGatherCommSameSizeSubGrids } - mpi::AllGather(sendBuf, mLocB*nLocB*numMatricesInSubGrid, - recvTransposedMatrix.Buffer(), mLocB*nLocB*numMatricesInSubGrid, + mpi::AllGather(sendBuf, mLocB*nLocB*numMatricesInSubGrid, + recvTransposedMatrix.Buffer(), mLocB*nLocB*numMatricesInSubGrid, gatherComm, syncGeneral); @@ -2843,15 +2851,15 @@ void TranslateBetweenGridsAllGatherCommSameSizeSubGrids template void TranslateBetweenConatSliceFirstChannel (DistMatrix & A, - std::vector>>& B_Vector, - int splitDim, - mpi::Comm const& gatherComm, + std::vector>>& B_Vector, + int splitDim, + mpi::Comm const& gatherComm, SyncInfo & syncGeneral) { - // it Slice the data along first channel of features + // it Slice the data along first channel of features // given the slices alog the last channel // Concat layer for D3 design - // + // EL_DEBUG_CSE std::vector indexBVec; const Int numParentLayers = int(B_Vector.size()); @@ -2885,7 +2893,7 @@ void TranslateBetweenConatSliceFirstChannel - A.Resize(m*numParentLayers,n); + A.Resize(m*numParentLayers,n); SyncInfo syncInfoA = SyncInfoFromMatrix(A.LockedMatrix()); @@ -2908,11 +2916,11 @@ void TranslateBetweenConatSliceFirstChannel } const int totalSizeComm = mLocB * numParentLayers * int(n/sizeB); - + const int maxSendSize = mLocB*nLocB * numMatricesInSubGrid; - simple_buffer send_buf(maxSendSize, syncInfoA); + simple_buffer send_buf(maxSendSize, syncInfoA); T* sendBuf = send_buf.data(); @@ -2936,8 +2944,8 @@ void TranslateBetweenConatSliceFirstChannel } - mpi::AllGather(sendBuf, mLocB*nLocB*numMatricesInSubGrid, - recvTransposedMatrix.Buffer(), mLocB*nLocB*numMatricesInSubGrid, + mpi::AllGather(sendBuf, mLocB*nLocB*numMatricesInSubGrid, + recvTransposedMatrix.Buffer(), mLocB*nLocB*numMatricesInSubGrid, gatherComm, syncGeneral); @@ -2967,14 +2975,14 @@ void TranslateBetweenGridsGather (DistMatrix & A, std::vector>>& B_Vector, int splitDim, mpi::Comm const& gatherComm, SyncInfo & syncGeneral, int version) { - //Some better logic can be written here + //Some better logic can be written here DistMatrix* B = dynamic_cast*>( &(*B_Vector[0])); // 0: Best Algo - // 1: Basic Gather + // 1: Basic Gather // 2: AllGather implementation (NCCL does not have Gather) const Int sizeA = A.Grid().VCSize(); @@ -2985,8 +2993,8 @@ void TranslateBetweenGridsGather if(version==0 || version == 1) { TranslateBetweenGridsGatherCommSameSizeSubGrids(A, - B_Vector, - splitDim, + B_Vector, + splitDim, gatherComm, syncGeneral); } @@ -2994,8 +3002,8 @@ void TranslateBetweenGridsGather { TranslateBetweenGridsAllGatherCommSameSizeSubGrids(A, - B_Vector, - splitDim, + B_Vector, + splitDim, gatherComm, syncGeneral); @@ -3007,32 +3015,32 @@ void TranslateBetweenGridsGather } } // 0: Best Algo - // 1: Basic Gather + // 1: Basic Gather // 2: Gather with Opt (interleave for transpose) (FIX IT) // else if (commSize > sizeA/sizeB) { TranslateBetweenGridsGatherComm(A, - B_Vector, - splitDim, + B_Vector, + splitDim, gatherComm, - syncGeneral); + syncGeneral); } else { if(version==0 || version == 1) { TranslateBetweenGridsGatherComm(A, - B_Vector, - splitDim, + B_Vector, + splitDim, gatherComm, syncGeneral); } else if (version == 2) { TranslateBetweenGridsGatherOptComm(A, - B_Vector, - splitDim, + B_Vector, + splitDim, gatherComm, syncGeneral); } @@ -3044,9 +3052,9 @@ void TranslateBetweenGridsGather } - - + + } @@ -3059,10 +3067,10 @@ void TranslateBetweenGridsBroadcastOptComm // /* This function is specific to the LBANN with implementation for specific cases - Subgrids in B_vector are assumed to be subset of resources in A grid + Subgrids in B_vector are assumed to be subset of resources in A grid Same terminology as Allreduce functions */ - + EL_DEBUG_CSE @@ -3078,7 +3086,7 @@ void TranslateBetweenGridsBroadcastOptComm const Int numSubGrids = int(B_Vector.size()); const Int sizeA = A.Grid().VCSize(); - //Asserting parent grid evenly divides number of columns + //Asserting parent grid evenly divides number of columns if(n%sizeA!=0) { LogicError("TranslateBetweenGridsBroadcastOptComm: ", @@ -3099,7 +3107,7 @@ void TranslateBetweenGridsBroadcastOptComm { LogicError("TranslateBetweenGridsBroadcast: ", "Error: rank is in multiple subgrids"); - + } indexB = i; } @@ -3118,18 +3126,18 @@ void TranslateBetweenGridsBroadcastOptComm SyncInfo syncGeneralMetaData = SyncInfo(); const bool inAGrid = A.Participating(); - + Int recvMetaData[6]; Int metaData[6]; if(inAGrid) { - + metaData[0] = m; metaData[1] = n; metaData[2] = mLocA; - metaData[3] = nLocA; + metaData[3] = nLocA; } else { @@ -3138,10 +3146,10 @@ void TranslateBetweenGridsBroadcastOptComm metaData[2] = 0; metaData[3] = 0; } - + const std::vector sendMetaData (metaData,metaData + 4 ); - + Synchronize(syncGeneral); if(sizeA != sizeB*B_Vector.size()) @@ -3159,14 +3167,14 @@ void TranslateBetweenGridsBroadcastOptComm { B_Vector[i]->Resize(m,n); } - - + + // Parent Subgrid Size: 4 Child Subgrid Size: 3 - // Parent 0 1 2 3 0 1 2 3 0 1 2 3 + // Parent 0 1 2 3 0 1 2 3 0 1 2 3 // Child 0 1 2 0 1 2 0 1 2 0 1 2 @@ -3174,7 +3182,7 @@ void TranslateBetweenGridsBroadcastOptComm for(Int i = 0; i < int(rowLCM/sizeB); ++i) - { + { index_to_put[i] = i; } @@ -3193,7 +3201,7 @@ void TranslateBetweenGridsBroadcastOptComm simple_buffer recv_buf(inBGrid ? maxSendSize : 0, syncInfoB); T* sendBuf = send_buf.data(); - + const int myLocalDataRankA = int(std::floor(posInGrid/sizeB)); for(Int localDataRankA = 0; localDataRankA < int(rowLCM/sizeB); localDataRankA++) @@ -3209,19 +3217,19 @@ void TranslateBetweenGridsBroadcastOptComm 1, A.LDim(), sendBuf, 1, mLocA, syncInfoA); - } + } } - - //comm is useless parameter in this function - //Aluminum infer comm from sunc object + + //comm is useless parameter in this function + //Aluminum infer comm from sunc object Broadcast(sendBuf, mLocA*nLocA, localDataRankA, broadcastComm, syncGeneral); Synchronize(syncGeneral); - - + + int sendWidth = int(n / rowLCM); copy::util::InterleaveMatrix( m, sendWidth, @@ -3230,7 +3238,7 @@ void TranslateBetweenGridsBroadcastOptComm 1, (rowLCM/sizeB)*B->LDim(), syncInfoB); - + Synchronize(syncInfoB); } @@ -3246,10 +3254,10 @@ void TranslateBetweenGridsBroadcastBasic // /* This function is specific to the LBANN with implementation for specific cases - Subgrids in B_vector are assumed to be subset of resources in A grid + Subgrids in B_vector are assumed to be subset of resources in A grid Same terminology as Allreduce functions */ - + EL_DEBUG_CSE @@ -3266,7 +3274,7 @@ void TranslateBetweenGridsBroadcastBasic const Int numSubGrids = int(B_Vector.size()); const Int sizeA = A.Grid().VCSize(); - //Asserting parent grid evenly divides number of columns + //Asserting parent grid evenly divides number of columns if(n%sizeA!=0) { LogicError("TranslateBetweenGridsBroadcastBasic: ", @@ -3291,7 +3299,7 @@ void TranslateBetweenGridsBroadcastBasic } DistMatrix* B = dynamic_cast*>( &(*B_Vector[indexB])); - const Int posInSubGrid = B->Grid().VCRank(); + const Int posInSubGrid = B->Grid().VCRank(); const Int rowStrideB = B->RowStride(); @@ -3302,18 +3310,18 @@ void TranslateBetweenGridsBroadcastBasic SyncInfo syncGeneralMetaData = SyncInfo(); - + Int recvMetaData[6]; Int metaData[6]; if(inAGrid) { - + metaData[0] = m; metaData[1] = n; metaData[2] = mLocA; - metaData[3] = nLocA; + metaData[3] = nLocA; } else { @@ -3322,10 +3330,10 @@ void TranslateBetweenGridsBroadcastBasic metaData[2] = 0; metaData[3] = 0; } - + const std::vector sendMetaData (metaData,metaData + 4 ); - + if(sizeA != sizeB*B_Vector.size()) @@ -3347,7 +3355,7 @@ void TranslateBetweenGridsBroadcastBasic // Parent Subgrid Size: 4 Child Subgrid Size: 3 - // Parent 0 1 2 3 0 1 2 3 0 1 2 3 + // Parent 0 1 2 3 0 1 2 3 0 1 2 3 // Child 0 1 2 0 1 2 0 1 2 0 1 2 std::vector require_data(sizeA,false); @@ -3370,8 +3378,8 @@ void TranslateBetweenGridsBroadcastBasic index_from[temp_require_data] = int(std::floor(total_iter/sizeA)); total_iter = total_iter + sizeB; temp_require_data = Mod(temp_require_data + sizeB, sizeA); - - + + } SyncInfo syncInfoA = SyncInfoFromMatrix(A.LockedMatrix()); @@ -3387,7 +3395,7 @@ void TranslateBetweenGridsBroadcastBasic T* sendBuf = send_buf.data(); //T* recvBuf = recv_buf.data(); - + for(Int localRankA = 0; localRankA < sizeA; localRankA++) @@ -3401,8 +3409,8 @@ void TranslateBetweenGridsBroadcastBasic sendBuf, 1, mLocA, syncInfoA); } - //comm is useless parameter in this function - //Aluminum infer comm from sunc object + //comm is useless parameter in this function + //Aluminum infer comm from sunc object Broadcast(sendBuf, mLocA*nLocA, localRankA, viewingCommA, syncInfoA); @@ -3433,11 +3441,11 @@ void TranslateBetweenGridsBroadcast (DistMatrix const& A, std::vector>>& B_Vector, int version) { - //Some better logic can be written here + //Some better logic can be written here // 0: Best Algo // 1: Basic Broadcast without communication overlap - // - // + // + // if(version==0 || version == 1) @@ -3459,20 +3467,20 @@ void TranslateBetweenGridsBroadcast std::vector>>& B_Vector, mpi::Comm const& broadcastComm, SyncInfo & syncGeneral, int version) { - //Some better logic can be written here + //Some better logic can be written here // 0: Best Algo - // 1: if spliting comm is given then we have only one algo for now - // + // 1: if spliting comm is given then we have only one algo for now + // if(version==0 || version == 1) { TranslateBetweenGridsBroadcastOptComm(A, - B_Vector, + B_Vector, broadcastComm, syncGeneral); } - + else { LogicError("TranslateBetweenGridsAllreduce: ", @@ -3561,7 +3569,7 @@ void TranslateBetweenGridsAsync const Int mLocA = A.LocalHeight(); const Int nLocA = A.LocalWidth(); - + mpi::Comm const& viewingCommB = B.Grid().ViewingComm(); mpi::Group owningGroupA = A.Grid().OwningGroup(); @@ -3570,7 +3578,7 @@ void TranslateBetweenGridsAsync // Compute the number of process rows and columns that each process // needs to send to. - + Int colStrideA = A.ColStride(); Int rowStrideA = A.RowStride(); @@ -3580,14 +3588,14 @@ void TranslateBetweenGridsAsync const bool inAGrid = A.Participating(); - + Int recvMetaData[6]; Int metaData[6]; if(inAGrid) { - + metaData[0] = m; metaData[1] = n; metaData[2] = colStrideA; @@ -3595,7 +3603,7 @@ void TranslateBetweenGridsAsync metaData[4] = colAlignA; metaData[5] = rowAlignA; - + } else { @@ -3606,13 +3614,13 @@ void TranslateBetweenGridsAsync metaData[4] = 0; metaData[5] = 0; } - + const std::vector sendMetaData (metaData,metaData + 6 ); - + SyncInfo syncInfoA = SyncInfoFromMatrix(A.LockedMatrix()); Synchronize(syncGeneral); - + mpi::AllReduce( sendMetaData.data(), recvMetaData, 6, mpi::MAX, viewingCommB,syncGeneral); Synchronize(syncGeneral); @@ -3675,7 +3683,7 @@ void TranslateBetweenGridsAsync const Int maxSendSize = (n/(rowStrideA*numRowSends)+1) * (m); - + // Translate the ranks from A's VC communicator to B's viewing so that // we can match send/recv communicators. Since A's VC communicator is not // necessarily defined on every process, we instead work with A's owning @@ -3710,17 +3718,17 @@ void TranslateBetweenGridsAsync if(inBGrid) requiredMemory += maxSendSize; - - + + std::vector> sendBufVector(numRowSends); - + for(Int i=0; i recv_buf(inBGrid ? maxSendSize : 0, syncInfoB); @@ -3728,8 +3736,8 @@ void TranslateBetweenGridsAsync - //Checking if process are in both A and B grids - // Just transfer the data directly + //Checking if process are in both A and B grids + // Just transfer the data directly for (Int rowSend = 0; rowSend < numRowSends; rowSend++) { const Int recvVCRank = Mod(A.Grid().Rank() + rowSend*rowStrideA, rowStrideB); @@ -3792,7 +3800,7 @@ void TranslateBetweenGridsAsync Synchronize(syncInfoA); sendRequestsUsed[rowSend] = true; - + mpi::ISend (sendBufVector[rowSend].data(), mLocA*sendWidth, recvViewingRank, viewingCommB, sendRequests[rowSend]); @@ -3805,7 +3813,7 @@ void TranslateBetweenGridsAsync //start receiving data from other processes sendRow = firstSendRow; - + for(Int rowRecv=0; rowRecv= Mod(n,rowLCM)) ? floor(n/rowLCM) : floor(n/rowLCM)+1; - + mpi::Recv( @@ -3838,7 +3846,7 @@ void TranslateBetweenGridsAsync sendCol = Mod(sendCol+rowStrideB,rowStrideA); sendRow = Mod(sendRow+rowStrideB,rowStrideA); - + } @@ -3849,7 +3857,7 @@ void TranslateBetweenGridsAsync { mpi::Wait(sendRequests[i]); } - + } sendBufVector.clear(); @@ -3868,7 +3876,7 @@ void TranslateBetweenGrids const Int mLocA = A.LocalHeight(); const Int nLocA = A.LocalWidth(); - + mpi::Comm const& viewingCommB = B.Grid().ViewingComm(); mpi::Group owningGroupA = A.Grid().OwningGroup(); @@ -3877,7 +3885,7 @@ void TranslateBetweenGrids // Compute the number of process rows and columns that each process // needs to send to. - + const Int colRankA = A.ColRank(); const Int rowRankA = A.RowRank(); Int colStrideA = A.ColStride(); @@ -3888,14 +3896,14 @@ void TranslateBetweenGrids const bool inAGrid = A.Participating(); - + Int recvMetaData[6]; Int metaData[6]; if(inAGrid) { - + metaData[0] = m; metaData[1] = n; metaData[2] = colStrideA; @@ -3903,7 +3911,7 @@ void TranslateBetweenGrids metaData[4] = colAlignA; metaData[5] = rowAlignA; - + } else { @@ -3914,13 +3922,13 @@ void TranslateBetweenGrids metaData[4] = 0; metaData[5] = 0; } - + const std::vector sendMetaData (metaData,metaData + 6 ); - + SyncInfo syncInfoA = SyncInfoFromMatrix(A.LockedMatrix()); Synchronize(syncGeneral); - + mpi::AllReduce( sendMetaData.data(), recvMetaData, 6, mpi::MAX, viewingCommB,syncGeneral); Synchronize(syncGeneral); @@ -3989,7 +3997,7 @@ void TranslateBetweenGrids const Int maxSendSize = (n/(rowStrideA*numRowSends)+1) * (m); - + // Translate the ranks from A's VC communicator to B's viewing so that // we can match send/recv communicators. Since A's VC communicator is not // necessarily defined on every process, we instead work with A's owning @@ -4032,7 +4040,7 @@ void TranslateBetweenGrids Int recvRow = 0; - //Ranks of processes to send data. + //Ranks of processes to send data. //Key: Process rank //value: column offset std::map sendProcessRanks; @@ -4054,7 +4062,7 @@ void TranslateBetweenGrids sendRow = Mod(sendRow+rowStrideB,rowStrideA); } - //Checking if process are in both A and B grids + //Checking if process are in both A and B grids for (Int rowSend = 0; rowSend < numRowSends; rowSend++) { const Int recvVCRank = Mod(A.Grid().Rank() + rowSend*rowStrideA, rowStrideB); @@ -4073,9 +4081,9 @@ void TranslateBetweenGrids if(rankMap[sendVCRank]==myRankViewing) break; } - - const Int recvWidth = ((rowRecv*rowStrideB + numInB)>= Mod(n,rowLCM)) ? + + const Int recvWidth = ((rowRecv*rowStrideB + numInB)>= Mod(n,rowLCM)) ? floor(n/rowLCM) : floor(n/rowLCM)+1; copy::util::InterleaveMatrix( @@ -4099,13 +4107,13 @@ void TranslateBetweenGrids { if(recvRankItr!= recvProcessRanks.end()) { - if( recvRankItr->first < myRankViewing || + if( recvRankItr->first < myRankViewing || (sendRankItr==sendProcessRanks.end() && recvRankItr->first > myRankViewing)) { - //Post recv operation + //Post recv operation if(inBGrid){ - const Int sendWidth = ((recvRankItr->second*rowStrideB + numInB)>= Mod(n,rowLCM)) ? + const Int sendWidth = ((recvRankItr->second*rowStrideB + numInB)>= Mod(n,rowLCM)) ? floor(n/rowLCM) : floor(n/rowLCM)+1; @@ -4121,17 +4129,17 @@ void TranslateBetweenGrids 1, (numRowRecvs)*B.LDim(), syncInfoB); - + } recvRankItr++; - + } else if (recvRankItr->first != myRankViewing && sendRankItr!=sendProcessRanks.end()) { //Post send operation if not done already - + //Pack Data if(sendRankItr->first!=myRankViewing && inAGrid) { @@ -4143,24 +4151,24 @@ void TranslateBetweenGrids 1, numRowSends*A.LDim(), sendBuf, 1, mLocA, syncInfoA); - + mpi::Send (sendBuf, mLocA*sendWidth, sendRankItr->first, viewingCommB,syncInfoA); - + } sendRankItr++; } else { - recvRankItr++; + recvRankItr++; } - }//only send operations are left + }//only send operations are left else { //Post send operation if not done already - + //Pack Data if(sendRankItr->first!=myRankViewing && inAGrid) { @@ -4173,12 +4181,12 @@ void TranslateBetweenGrids 1, numRowSends*A.LDim(), sendBuf, 1, mLocA, syncInfoA); - - + + mpi::Send (sendBuf, mLocA*sendWidth, sendRankItr->first, viewingCommB,syncInfoA); - + } sendRankItr++; @@ -4188,8 +4196,12 @@ void TranslateBetweenGrids } -template void TranslateBetweenGridsAsync (DistMatrix const& ,DistMatrix& ); +template void TranslateBetweenGridsAsync( + DistMatrix const&, + DistMatrix&); +#ifdef HYDROGEN_HAVE_GPU template void TranslateBetweenGridsAsync (DistMatrix const& ,DistMatrix& ); +#endif // HYDROGEN_HAVE_GPU template void TranslateBetweenGrids @@ -4223,11 +4235,11 @@ void TranslateBetweenGrids A.Grid().ViewingComm() : viewingCommSizeA == commSizeB ? A.Grid().ViewingComm(): - + commSizeA == viewingCommSizeB ? A.Grid().VCComm() : A.Grid().VCComm() - + ; @@ -4235,11 +4247,11 @@ void TranslateBetweenGrids B.Grid().ViewingComm(): viewingCommSizeA == commSizeB ? B.Grid().VCComm(): - + commSizeA == viewingCommSizeB ? B.Grid().ViewingComm() : B.Grid().VCComm() - + ; @@ -4247,11 +4259,11 @@ void TranslateBetweenGrids true : viewingCommSizeA == commSizeB ? true : - + commSizeA == viewingCommSizeB ? false : false - + ; @@ -4259,11 +4271,11 @@ void TranslateBetweenGrids true : viewingCommSizeA == commSizeB ? false : - + commSizeA == viewingCommSizeB ? true : false - + ; @@ -4272,7 +4284,7 @@ void TranslateBetweenGrids LogicError("communicators were not congruent"); - + const Int rankA = A.RedundantRank(); const Int rankB = B.RedundantRank();