Commit 28ee971

Avoid symmetric MPI_Igatherv

1 parent e12958a

File tree: 9 files changed, +57 -55 lines.

Two binary files changed (-16 KB each, contents not shown).

include/OGL/CommunicationPattern.hpp

Lines changed: 6 additions & 1 deletion

@@ -19,8 +19,13 @@ struct AllToAllPattern {
     std::vector<int> recv_offsets;
 };

-/* @brief computes AllToAllPattern for repart comm from global allToAll pattern
+/* @brief computes AllToAllPattern for the repartitioned communicator from the
+ * global allToAll pattern by discarding all zero communication before and
+ * after the repartitioner scope.
  *
+ * @param exec_handler The executor handler
+ * @param allToAll The original allToAll pattern
+ * @param start_rank the original comm_world rank
  */
 AllToAllPattern compute_repart_allToall(const ExecutorHandler &exec_handler,
                                         const AllToAllPattern allToAll,
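
For context on how such a pattern is consumed, the count/offset vectors map one-to-one onto the arguments of MPI's vector collectives. The following is a minimal sketch of that mapping, not OGL's actual helper: the Pattern struct and exchange() are illustrative stand-ins, and it assumes the send-side fields mirror recv_offsets and that each offsets vector carries a trailing total-size entry (as the send_offsets.back() usage elsewhere in this commit suggests).

    #include <mpi.h>

    #include <vector>

    // Illustrative stand-in for AllToAllPattern: counts[i]/offsets[i] describe
    // the chunk exchanged with rank i; offsets.back() is the total buffer size.
    struct Pattern {
        std::vector<int> send_counts, send_offsets;
        std::vector<int> recv_counts, recv_offsets;
    };

    // One all-to-all exchange driven by the pattern. MPI only reads the first
    // comm_size entries of each array, so a trailing total-size entry is
    // harmless.
    void exchange(const Pattern &p, const double *send_buf, double *recv_buf,
                  MPI_Comm comm)
    {
        MPI_Alltoallv(send_buf, p.send_counts.data(), p.send_offsets.data(),
                      MPI_DOUBLE, recv_buf, p.recv_counts.data(),
                      p.recv_offsets.data(), MPI_DOUBLE, comm);
    }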

include/OGL/DevicePersistent/ExecutorHandler.hpp

Lines changed: 4 additions & 1 deletion

@@ -311,7 +311,10 @@ class ExecutorHandler

     label get_ranks_per_gpu() const { return device_id_handler_.ranks_per_gpu; }

-    void set_ranks_per_gpu(label ranks_per_gpu) { device_id_handler_.ranks_per_gpu= ranks_per_gpu; }
+    void set_ranks_per_gpu(label ranks_per_gpu)
+    {
+        device_id_handler_.ranks_per_gpu = ranks_per_gpu;
+    }

     label get_owner_rank() const { return device_id_handler_.global_owner(); }

include/OGL/DevicePersistent/Vector.hpp

Lines changed: 5 additions & 22 deletions

@@ -171,7 +171,7 @@ class PersistentVector

     /** Copies the content of the distributed vector back to the original source
     **/
-    MPI_Request copy_back()
+    void copy_back()
     {
         auto exec = exec_.get_device_exec();
         auto rank = exec_.get_host_rank();

@@ -197,43 +197,26 @@
         // repartAllToAll.recv_counts[0] = 0;
         // }

+        // NOTE instead of all_to_all_v based communication MPI_Iscatterv
+        // seems to be preferable
         // communicate_values(exec, ref_exec, comm, comm_pattern,
         //                    get_vector()->get_local_values(),
         //                    const_cast<T *>(memory_), host_buffer);
-        // std::cout << __FILE__
-        //           << " owner_rank " << exec_.get_owner_rank()
-        //           << " rank " << Pstream::myProcNo()
-        //           << " comm_pattern.send_counts: " << comm_pattern.send_counts
-        //           << " comm_pattern.recv_counts: " << comm_pattern.recv_counts
-        //           << " send_counts: " << repartAllToAll.send_counts
-        //           << " send_offsets: " << repartAllToAll.send_offsets
-        //           << " recv_counts: " << repartAllToAll.recv_counts
-        //           << "\n";
-
-        // auto start_rep = std::chrono::steady_clock::now();
+
         label send_size = comm_pattern.send_offsets.back();
         auto send_view = gko::array<scalar>::const_view(
             exec, send_size, get_vector()->get_local_values());
         auto tmp = gko::array<scalar>(exec, send_size);

         tmp = send_view;
         tmp.set_executor(ref_exec);
-        // auto end_rep = std::chrono::steady_clock::now();
-        // auto delta_t_rep =
-        // std::chrono::duration_cast<std::chrono::microseconds>(end_rep -
-        // start_rep).count() /1000.0; std::cout << __FILE__ << " copy back: "
-        // << delta_t_rep << " [ms]\n";
-

         MPI_Request copy_back_req;
-        MPI_Iscatterv(tmp.get_data(),
-                      // get_vector()->get_local_values(),
-                      repartAllToAll.send_counts.data(),
+        MPI_Iscatterv(tmp.get_data(), repartAllToAll.send_counts.data(),
                       repartAllToAll.send_offsets.data(), MPI_DOUBLE,
                       const_cast<T *>(memory_), repartAllToAll.recv_counts[0],
                       MPI_DOUBLE, 0, repart_comm->get(), &copy_back_req);
         MPI_Wait(&copy_back_req, MPI_STATUS_IGNORE);
-        return copy_back_req;
     }

     /** Writes the content of the distributed vector to disk
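
The scatter side of the change, isolated: rank 0 of the repartitioned communicator stages the gathered values in a host buffer and scatters them back to the original ranks, waiting immediately, so the call is blocking overall (which is why copy_back() now returns void). A minimal sketch of that call pattern; the function and parameter names are illustrative, only the MPI_Iscatterv/MPI_Wait pairing mirrors the diff.

    #include <mpi.h>

    #include <vector>

    // Root 0 scatters `staged` (valid on root only) according to per-rank
    // counts and displacements; every rank receives my_count values into dst.
    void scatter_back(const std::vector<double> &staged,
                      const std::vector<int> &counts,
                      const std::vector<int> &offsets, double *dst,
                      int my_count, MPI_Comm repart_comm)
    {
        MPI_Request req;
        MPI_Iscatterv(staged.data(), counts.data(), offsets.data(), MPI_DOUBLE,
                      dst, my_count, MPI_DOUBLE, /*root=*/0, repart_comm, &req);
        // Waiting right away makes the non-blocking scatter effectively
        // blocking for the caller.
        MPI_Wait(&req, MPI_STATUS_IGNORE);
    }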

include/OGL/lduLduBase.hpp

Lines changed: 0 additions & 9 deletions

@@ -306,15 +306,8 @@ class lduLduBase : public OGL_Info,
         delta_t_solve_ = delta_t_solve;
     }

-    // auto start_rep = std::chrono::steady_clock::now();
     TIME_WITH_FIELDNAME(verbose_, copy_x_back, this->fieldName(),
                         dist_x.copy_back();)
-    // auto copy_back_req = dist_x.copy_back();
-    // auto end_rep = std::chrono::steady_clock::now();
-    // auto delta_t_rep =
-    // std::chrono::duration_cast<std::chrono::microseconds>(end_rep -
-    // start_rep).count() /1000.0; std::cout << __FILE__ << " full copy
-    // back: " << delta_t_rep << " [ms]\n";

     auto bandwidth_copy_back =
         sizeof(scalar) * psi.size() / delta_t_copy_x_back / 1000.0;

@@ -346,8 +339,6 @@
         std::to_string(bandwidth_copy_back) + std::string(" [GByte/s]");
     MLOG_0(verbose_, msg)

-    // MPI_Wait(&copy_back_req,MPI_STATUS_IGNORE);
-
     return solverPerf;
 }

src/MatrixWrapper/Distributed.cpp

Lines changed: 36 additions & 12 deletions

@@ -121,13 +121,15 @@ void generate_alltoall_update_data(
     std::vector<RepartDistMatrix::all_to_all_data> &update_data)
 {
     label linop_offset_store{0};
-    for (size_t i = 0; i < 3; i++) {
+    // NOTE in case of symmetric matrix 0 (upper) is same as 1 (lower)
+    // thus we can start at 1
+    label start = 0;
+    for (size_t i = start; i < 3; i++) {
         label interface_size = in->get_rows()[i].size();
         label linop_idx = (fuse) ? 0 : in->get_id()[i];
         label linop_offset = (fuse) ? linop_offset_store : 0;
         auto comm_pattern = compute_gather_to_owner_counts(
             exec_handler, ranks_per_owner, interface_size);
-
         size_t recv_size = comm_pattern.recv_offsets.back();

         // NOTE Probably dont need to store linops[linop-idx] because we can

@@ -395,6 +397,9 @@ void update_impl(
     auto all_to_all_update = [repart_comm, ref_exec, device_exec,
                               all_to_all_update_data, host_A, force_host_buffer,
                               exec_handler, rank]() {
+        // NOTE if symmetric (get it from host_A) we can skip id=0 and wait till
+        // id=1 has been copied to use device copy
+        //
         for (auto [id, comm_pattern, data_ptr] : all_to_all_update_data) {
             // auto start = std::chrono::steady_clock::now();
             auto repartAllToAll =

@@ -409,24 +414,43 @@
             // communicate_values(ref_exec, device_exec, repart_comm,
             //                    repartAllToAll,
             //                    send_data_ptr, data_ptr, force_host_buffer);
-            // if ( repart_comm->rank() == 0 ) {
             // std::cout << __FILE__ <<
             // " Pstream::rank " << Pstream::myProcNo() <<
             // " repart_rank() " << repart_comm->rank() <<
             // " send_offsets.back() " <<
+            // " id " << id <<
             // repartAllToAll.send_offsets.back() << " recv_counts: " <<
             // repartAllToAll.recv_counts << " recv_offsets: " <<
             // repartAllToAll.recv_offsets <<
             // std::endl;
-            // }
-            MPI_Request request;
-
-            MPI_Igatherv(send_data_ptr, repartAllToAll.send_offsets.back(),
-                         MPI_DOUBLE, data_ptr,
-                         repartAllToAll.recv_counts.data(),
-                         repartAllToAll.recv_offsets.data(), MPI_DOUBLE, 0,
-                         repart_comm->get(), &request);
-            MPI_Wait(&request, MPI_STATUS_IGNORE);
+
+            if (id == 0 && host_A->get_symmetric()) {
+            } else {
+                MPI_Request request;
+                MPI_Igatherv(send_data_ptr, repartAllToAll.send_offsets.back(),
+                             MPI_DOUBLE, data_ptr,
+                             repartAllToAll.recv_counts.data(),
+                             repartAllToAll.recv_offsets.data(), MPI_DOUBLE, 0,
+                             repart_comm->get(), &request);
+                MPI_Wait(&request, MPI_STATUS_IGNORE);
+            }
+
+            // Perform symmetric inter device copy
+            if (id == 1 && repart_comm->rank() == 0 &&
+                host_A->get_symmetric()) {
+                auto [zid, zcomm_pattern, zdata_ptr] =
+                    all_to_all_update_data[0];
+                // copy recv size data from data_ptr to zdata_ptr
+                //
+                label recv_buffer_size = repartAllToAll.recv_offsets.back();
+                auto l_view = gko::array<scalar>::view(
+                    device_exec, recv_buffer_size, data_ptr);
+
+                auto u_view = gko::array<scalar>::view(
+                    device_exec, recv_buffer_size, zdata_ptr);
+
+                u_view = l_view;
+            }
         }
     };
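
The shortcut introduced above, condensed: for a symmetric matrix the upper (id 0) and lower (id 1) off-diagonal coefficients are identical, so the id 0 MPI_Igatherv is skipped and, once the id 1 buffer has arrived on the owner rank, it is duplicated by a device-side Ginkgo array copy. A hedged sketch under those assumptions; gather_to_owner() and the buffer names are hypothetical, only the gko::array view-and-assign idiom matches the diff.

    #include <memory>

    #include <ginkgo/ginkgo.hpp>

    using scalar = double;

    // Placeholder for the MPI_Igatherv + MPI_Wait pair shown in the diff.
    void gather_to_owner(int /*id*/, scalar * /*recv_buf*/) {}

    void update_coeffs(bool symmetric, bool is_owner,
                       std::shared_ptr<const gko::Executor> device_exec,
                       scalar *upper_buf, scalar *lower_buf,
                       gko::size_type recv_size)
    {
        for (int id = 0; id < 2; id++) {  // id 0: upper, id 1: lower
            // Skip the redundant gather of the upper coefficients.
            if (!(id == 0 && symmetric)) {
                gather_to_owner(id, id == 0 ? upper_buf : lower_buf);
            }
            // Mirror lower -> upper entirely on the device: a device-to-device
            // copy of recv_size values replaces the second MPI_Igatherv.
            if (id == 1 && symmetric && is_owner) {
                auto l_view =
                    gko::array<scalar>::view(device_exec, recv_size, lower_buf);
                auto u_view =
                    gko::array<scalar>::view(device_exec, recv_size, upper_buf);
                u_view = l_view;
            }
        }
    }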

src/StoppingCriterion.cpp

Lines changed: 0 additions & 4 deletions

@@ -53,9 +53,6 @@ StoppingCriterion::OpenFOAMDistStoppingCriterion::compute_normfactor_dist(
                               end_axref - start_axref)
                               .count() /
                           1.0;
-    // std::cout << __FILE__ << " delta_t_axref " << delta_t_axref << " [mu
-    // s]\n";
-
     auto unity =
         gko::initialize<gko::matrix::Dense<scalar>>(1, {1.0}, device_exec);

@@ -177,7 +174,6 @@ bool StoppingCriterion::OpenFOAMDistStoppingCriterion::check_impl(
                               end_eval - start_eval)
                               .count() /
                           1.0;
-    // std::cout << __FILE__ << "time " << *(parameters_.time) << " [mu s]\n";
     return result;
 }

test/unit/MatrixWrapper/Distributed.cpp

Lines changed: 6 additions & 6 deletions

@@ -324,13 +324,13 @@ TEST_P(DistMatL2D, canApplyCorrectly)

     // Act
     bool active = repartitioner->get_repart_size() != 0;
-    if (active){
-    distributed->apply(b, x);
-    auto res_x = std::vector<scalar>(
-        x->get_local_vector()->get_const_values(),
-        x->get_local_vector()->get_const_values() + local_vec_dim[0]);
+    if (active) {
+        distributed->apply(b, x);
+        auto res_x = std::vector<scalar>(
+            x->get_local_vector()->get_const_values(),
+            x->get_local_vector()->get_const_values() + local_vec_dim[0]);

-    ASSERT_EQ(res_x, exp_x[name][fused][ranks_per_gpu][rank]);
+        ASSERT_EQ(res_x, exp_x[name][fused][ranks_per_gpu][rank]);
     }
 }
