Skip to content

Commit cad89f8

Browse files
committed
osc/rdma: fix errors in derived datatype handling for accumulate
This commit fixes a number of bugs in the handling of derived datatypes when using MPI__Accumulate, MP_Get_accumulate, and MPI_Fetch_and_op. The following bugs are fixed: - Incorrect results when using MPI_OP_NO_OP with a 0 origin count. osc/rdma was not ignoring the source address, count, and datatype in the MPI_OP_NO_OP case as is required by the standard. - Correctly handle result_buffer=MPI_BOTTOM in ompi_osc_rdma_gacc_local(). - Test result_datatype in order to figure out whether a get is performed. References #6275 Signed-off-by: Gilles Gouaillardet <gilles@rist.or.jp> Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
1 parent 352b667 commit cad89f8

File tree

3 files changed

+74
-131
lines changed

3 files changed

+74
-131
lines changed

ompi/mca/osc/rdma/osc_rdma_accumulate.c

Lines changed: 68 additions & 119 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,11 @@
22
/*
33
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
44
* reserved.
5-
* Copyright (c) 2016-2017 Research Organization for Information Science
6-
* and Technology (RIST). All rights reserved.
5+
* Copyright (c) 2016-2019 Research Organization for Information Science
6+
* and Technology (RIST). All rights reserved.
77
* Copyright (c) 2016-2018 Intel, Inc. All rights reserved.
8+
* Copyright (c) 2019 Triad National Security, LLC. All rights
9+
* reserved.
810
* $COPYRIGHT$
911
*
1012
* Additional copyrights may follow
@@ -50,71 +52,6 @@ struct ompi_osc_rdma_event_t {
5052

5153
typedef struct ompi_osc_rdma_event_t ompi_osc_rdma_event_t;
5254

53-
#if 0
54-
static void *ompi_osc_rdma_event_put (int fd, int flags, void *context)
55-
{
56-
ompi_osc_rdma_event_t *event = (ompi_osc_rdma_event_t *) context;
57-
int ret;
58-
59-
ret = event->module->selected_btl->btl_put (event->module->selected_btl, event->endpoint, event->local_address,
60-
event->remote_address, event->local_handle, event->remote_handle,
61-
event->length, 0, MCA_BTL_NO_ORDER, event->cbfunc, event->cbcontext,
62-
event->cbdata);
63-
if (OPAL_LIKELY(OPAL_SUCCESS == ret)) {
64-
/* done with this event */
65-
opal_event_del (&event->super);
66-
free (event);
67-
} else {
68-
/* re-activate the event */
69-
opal_event_active (&event->super, OPAL_EV_READ, 1);
70-
}
71-
72-
return NULL;
73-
}
74-
75-
static int ompi_osc_rdma_event_queue (ompi_osc_rdma_module_t *module, struct mca_btl_base_endpoint_t *endpoint,
76-
ompi_osc_rdma_event_type_t event_type, void *local_address, mca_btl_base_registration_handle_t *local_handle,
77-
uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
78-
uint64_t length, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext,
79-
void *cbdata)
80-
{
81-
ompi_osc_rdma_event_t *event = malloc (sizeof (*event));
82-
void *(*event_func) (int, int, void *);
83-
84-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "queueing event type %d", event_type);
85-
86-
if (OPAL_UNLIKELY(NULL == event)) {
87-
return OMPI_ERR_OUT_OF_RESOURCE;
88-
}
89-
90-
event->module = module;
91-
event->endpoint = endpoint;
92-
event->local_address = local_address;
93-
event->local_handle = local_handle;
94-
event->remote_address = remote_address;
95-
event->remote_handle = remote_handle;
96-
event->length = length;
97-
event->cbfunc = cbfunc;
98-
event->cbcontext = cbcontext;
99-
event->cbdata = cbdata;
100-
101-
switch (event_type) {
102-
case OMPI_OSC_RDMA_EVENT_TYPE_PUT:
103-
event_func = ompi_osc_rdma_event_put;
104-
break;
105-
default:
106-
opal_output(0, "osc/rdma: cannot queue unknown event type %d", event_type);
107-
abort ();
108-
}
109-
110-
opal_event_set (opal_sync_event_base, &event->super, -1, OPAL_EV_READ,
111-
event_func, event);
112-
opal_event_active (&event->super, OPAL_EV_READ, 1);
113-
114-
return OMPI_SUCCESS;
115-
}
116-
#endif
117-
11855
static int ompi_osc_rdma_gacc_local (const void *source_buffer, int source_count, ompi_datatype_t *source_datatype,
11956
void *result_buffer, int result_count, ompi_datatype_t *result_datatype,
12057
ompi_osc_rdma_peer_t *peer, uint64_t target_address,
@@ -127,7 +64,7 @@ static int ompi_osc_rdma_gacc_local (const void *source_buffer, int source_count
12764
do {
12865
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "performing accumulate with local region(s)");
12966

130-
if (NULL != result_buffer) {
67+
if (NULL != result_datatype) {
13168
/* get accumulate */
13269

13370
ret = ompi_datatype_sndrcv ((void *) (intptr_t) target_address, target_count, target_datatype,
@@ -184,7 +121,8 @@ static inline int ompi_osc_rdma_cas_local (const void *source_addr, const void *
184121

185122
static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const void *source, int source_count,
186123
ompi_datatype_t *source_datatype, void *result, int result_count,
187-
ompi_datatype_t *result_datatype, ompi_osc_rdma_peer_t *peer, uint64_t target_address,
124+
ompi_datatype_t *result_datatype, opal_convertor_t *result_convertor,
125+
ompi_osc_rdma_peer_t *peer, uint64_t target_address,
188126
mca_btl_base_registration_handle_t *target_handle, int target_count,
189127
ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_request_t *request)
190128
{
@@ -219,8 +157,7 @@ static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const v
219157
uint32_t iov_count = 1;
220158
size_t size = request->len;
221159

222-
opal_convertor_unpack (&request->convertor, &iov, &iov_count, &size);
223-
opal_convertor_cleanup (&request->convertor);
160+
opal_convertor_unpack (result_convertor, &iov, &iov_count, &size);
224161
} else {
225162
/* copy contiguous data to the result buffer */
226163
ompi_datatype_sndrcv (ptr, len, MPI_BYTE, result, result_count, result_datatype);
@@ -264,7 +201,7 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
264201
struct iovec source_iovec[OMPI_OSC_RDMA_DECODE_MAX], target_iovec[OMPI_OSC_RDMA_DECODE_MAX];
265202
const size_t acc_limit = (mca_osc_rdma_component.buffer_size >> 3);
266203
uint32_t source_primitive_count, target_primitive_count;
267-
opal_convertor_t source_convertor, target_convertor;
204+
opal_convertor_t source_convertor, target_convertor, result_convertor;
268205
uint32_t source_iov_count, target_iov_count;
269206
uint32_t source_iov_index, target_iov_index;
270207
ompi_datatype_t *source_primitive, *target_primitive;
@@ -281,6 +218,13 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
281218
request->internal = true;
282219
}
283220

221+
if (&ompi_mpi_op_no_op.op == op) {
222+
/* NTH: just zero these out to catch any coding errors (they should be ignored in the no-op case) */
223+
source_count = 0;
224+
source_datatype = NULL;
225+
source_addr = NULL;
226+
}
227+
284228
request->cleanup = ompi_osc_rdma_gacc_master_cleanup;
285229
request->type = result_datatype ? OMPI_OSC_RDMA_TYPE_GET_ACC : OMPI_OSC_RDMA_TYPE_ACC;
286230

@@ -303,7 +247,7 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
303247
}
304248

305249
ret = ompi_osc_rdma_gacc_contig (sync, source_addr, source_count, source_datatype, result_addr,
306-
result_count, result_datatype, peer, target_address,
250+
result_count, result_datatype, NULL, peer, target_address,
307251
target_handle, target_count, target_datatype, op,
308252
request);
309253
if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
@@ -357,6 +301,20 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
357301
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
358302
return ret;
359303
}
304+
source_iov_count = 0;
305+
} else {
306+
source_iovec[0].iov_len = (size_t) -1;
307+
source_iovec[0].iov_base = NULL;
308+
source_iov_count = 1;
309+
}
310+
311+
if (result_datatype) {
312+
OBJ_CONSTRUCT(&result_convertor, opal_convertor_t);
313+
ret = opal_convertor_copy_and_prepare_for_recv (ompi_mpi_local_convertor, &result_datatype->super, result_count, result_addr,
314+
0, &result_convertor);
315+
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
316+
return ret;
317+
}
360318
}
361319

362320
/* target_datatype can never be NULL */
@@ -372,59 +330,42 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
372330

373331
target_iov_index = 0;
374332
target_iov_count = 0;
333+
source_iov_index = 0;
375334
result_position = 0;
376335
subreq = NULL;
377336

378337
do {
379-
/* decode segments of the source data */
380-
source_iov_count = OMPI_OSC_RDMA_DECODE_MAX;
381-
source_iov_index = 0;
382-
/* opal_convertor_raw returns done when it has reached the end of the data */
383-
if (!source_datatype) {
384-
done = true;
385-
source_iovec[0].iov_len = (size_t) -1;
386-
source_iovec[0].iov_base = NULL;
387-
source_iov_count = 1;
388-
} else {
389-
done = opal_convertor_raw (&source_convertor, source_iovec, &source_iov_count, &source_size);
390-
}
391-
392-
/* loop on the target segments until we have exhaused the decoded source data */
393-
while (source_iov_index != source_iov_count) {
394-
if (target_iov_index == target_iov_count) {
395-
/* decode segments of the target buffer */
396-
target_iov_count = OMPI_OSC_RDMA_DECODE_MAX;
397-
target_iov_index = 0;
398-
(void) opal_convertor_raw (&target_convertor, target_iovec, &target_iov_count, &target_size);
338+
/* decode segments of the target buffer */
339+
target_iov_count = OMPI_OSC_RDMA_DECODE_MAX;
340+
target_iov_index = 0;
341+
done = opal_convertor_raw (&target_convertor, target_iovec, &target_iov_count, &target_size);
342+
343+
/* loop on the source segments (if any) until we have exhaused the decoded target data */
344+
while (target_iov_index != target_iov_count) {
345+
if (source_iov_count == source_iov_index) {
346+
/* decode segments of the source data */
347+
source_iov_count = OMPI_OSC_RDMA_DECODE_MAX;
348+
source_iov_index = 0;
349+
(void) opal_convertor_raw (&source_convertor, source_iovec, &source_iov_count, &source_size);
399350
}
400351

401352
/* we already checked that the target was large enough. this should be impossible */
402353
assert (0 != target_iov_count);
403354

404355
/* determine how much to put in this operation */
405-
acc_len = min(target_iovec[target_iov_index].iov_len, source_iovec[source_iov_index].iov_len);
406-
acc_len = min((size_t) acc_len, acc_limit);
356+
acc_len = min(min(target_iovec[target_iov_index].iov_len, source_iovec[source_iov_index].iov_len), acc_limit);
407357

408-
/* execute the get */
358+
/* execute the get-accumulate */
409359
if (!subreq) {
410360
OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, subreq);
411361
subreq->internal = true;
412362
subreq->parent_request = request;
363+
subreq->type = result_datatype ? OMPI_OSC_RDMA_TYPE_GET_ACC : OMPI_OSC_RDMA_TYPE_ACC;
413364
(void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, 1);
414365
}
415366

416-
if (result_datatype) {
417-
/* prepare a convertor for this part of the result */
418-
opal_convertor_copy_and_prepare_for_recv (ompi_mpi_local_convertor, &result_datatype->super, result_count,
419-
result_addr, 0, &subreq->convertor);
420-
opal_convertor_set_position (&subreq->convertor, &result_position);
421-
subreq->type = OMPI_OSC_RDMA_TYPE_GET_ACC;
422-
} else {
423-
subreq->type = OMPI_OSC_RDMA_TYPE_ACC;
424-
}
425-
426367
ret = ompi_osc_rdma_gacc_contig (sync, source_iovec[source_iov_index].iov_base, acc_len / target_primitive->super.size,
427-
target_primitive, NULL, 0, NULL, peer,
368+
target_primitive, NULL, 0, NULL, &result_convertor, peer,
428369
(uint64_t) (intptr_t) target_iovec[target_iov_index].iov_base, target_handle,
429370
acc_len / target_primitive->super.size, target_primitive, op, subreq);
430371
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
@@ -444,13 +385,16 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
444385

445386
/* adjust io vectors */
446387
target_iovec[target_iov_index].iov_len -= acc_len;
447-
source_iovec[source_iov_index].iov_len -= acc_len;
448388
target_iovec[target_iov_index].iov_base = (void *)((intptr_t) target_iovec[target_iov_index].iov_base + acc_len);
449-
source_iovec[source_iov_index].iov_base = (void *)((intptr_t) source_iovec[source_iov_index].iov_base + acc_len);
389+
target_iov_index += (0 == target_iovec[target_iov_index].iov_len);
390+
450391
result_position += acc_len;
451392

452-
source_iov_index += !source_datatype || (0 == source_iovec[source_iov_index].iov_len);
453-
target_iov_index += (0 == target_iovec[target_iov_index].iov_len);
393+
if (source_datatype) {
394+
source_iov_index += (0 == source_iovec[source_iov_index].iov_len);
395+
source_iovec[source_iov_index].iov_len -= acc_len;
396+
source_iovec[source_iov_index].iov_base = (void *)((intptr_t) source_iovec[source_iov_index].iov_base + acc_len);
397+
}
454398
}
455399
} while (!done);
456400

@@ -462,6 +406,11 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
462406
OBJ_DESTRUCT(&source_convertor);
463407
}
464408

409+
if (result_datatype) {
410+
opal_convertor_cleanup (&result_convertor);
411+
OBJ_DESTRUCT(&result_convertor);
412+
}
413+
465414
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "finished scheduling rdma on non-contiguous datatype(s)");
466415

467416
opal_convertor_cleanup (&target_convertor);
@@ -587,9 +536,9 @@ static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const voi
587536
new_value = old_value;
588537

589538
if (&ompi_mpi_op_replace.op == op) {
590-
memcpy ((void *)((intptr_t) &new_value + offset), origin_addr, extent);
539+
memcpy ((void *)((intptr_t) &new_value + offset), origin_addr + dt->super.true_lb, extent);
591540
} else if (&ompi_mpi_op_no_op.op != op) {
592-
ompi_op_reduce (op, (void *) origin_addr, (void*)((intptr_t) &new_value + offset), 1, dt);
541+
ompi_op_reduce (op, (void *) origin_addr + dt->super.true_lb, (void*)((intptr_t) &new_value + offset), 1, dt);
593542
}
594543

595544
ret = ompi_osc_rdma_btl_cswap (module, peer->data_endpoint, address, target_handle,
@@ -854,7 +803,7 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo
854803
ompi_osc_rdma_module_t *module = sync->module;
855804
mca_btl_base_registration_handle_t *target_handle;
856805
uint64_t target_address;
857-
ptrdiff_t lb, origin_extent, target_span;
806+
ptrdiff_t lb, target_lb, origin_extent, target_span;
858807
bool lock_acquired = false;
859808
int ret;
860809

@@ -867,11 +816,11 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo
867816
return OMPI_SUCCESS;
868817
}
869818

870-
target_span = opal_datatype_span(&target_datatype->super, target_count, &lb);
819+
target_span = opal_datatype_span(&target_datatype->super, target_count, &target_lb);
871820

872821
// a buffer defined by (buf, count, dt)
873822
// will have data starting at buf+offset and ending len bytes later:
874-
ret = osc_rdma_get_remote_segment (module, peer, target_disp, target_span+lb, &target_address, &target_handle);
823+
ret = osc_rdma_get_remote_segment (module, peer, target_disp, target_span+target_lb, &target_address, &target_handle);
875824
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
876825
return ret;
877826
}
@@ -895,10 +844,10 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo
895844
if (origin_extent <= 8 && 1 == origin_count && !ompi_osc_rdma_peer_local_base (peer)) {
896845
if (module->acc_use_amo && ompi_datatype_is_predefined (origin_datatype)) {
897846
if (NULL == result_addr) {
898-
ret = ompi_osc_rdma_acc_single_atomic (sync, origin_addr, origin_datatype, origin_extent, peer, target_address,
847+
ret = ompi_osc_rdma_acc_single_atomic (sync, origin_addr, origin_datatype, origin_extent, peer, target_address+target_lb,
899848
target_handle, op, request, lock_acquired);
900849
} else {
901-
ret = ompi_osc_rdma_fetch_and_op_atomic (sync, origin_addr, result_addr, origin_datatype, origin_extent, peer, target_address,
850+
ret = ompi_osc_rdma_fetch_and_op_atomic (sync, origin_addr, result_addr, origin_datatype, origin_extent, peer, target_address+target_lb,
902851
target_handle, op, request, lock_acquired);
903852
}
904853

@@ -907,7 +856,7 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo
907856
}
908857
}
909858

910-
ret = ompi_osc_rdma_fetch_and_op_cas (sync, origin_addr, result_addr, origin_datatype, origin_extent, peer, target_address,
859+
ret = ompi_osc_rdma_fetch_and_op_cas (sync, origin_addr, result_addr, origin_datatype, origin_extent, peer, target_address+target_lb,
911860
target_handle, op, request, lock_acquired);
912861
if (OMPI_SUCCESS == ret) {
913862
return OMPI_SUCCESS;

ompi/mca/osc/rdma/osc_rdma_request.c

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
* Copyright (c) 2016 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
9+
* Copyright (c) 2019 Triad National Security, LLC. All rights
10+
* reserved.
911
* $COPYRIGHT$
1012
*
1113
* Additional copyrights may follow
@@ -56,15 +58,7 @@ static void request_construct(ompi_osc_rdma_request_t *request)
5658
request->internal = false;
5759
request->cleanup = NULL;
5860
request->outstanding_requests = 0;
59-
OBJ_CONSTRUCT(&request->convertor, opal_convertor_t);
60-
}
61-
62-
static void request_destruct(ompi_osc_rdma_request_t *request)
63-
{
64-
OBJ_DESTRUCT(&request->convertor);
6561
}
6662

67-
OBJ_CLASS_INSTANCE(ompi_osc_rdma_request_t,
68-
ompi_request_t,
69-
request_construct,
70-
request_destruct);
63+
OBJ_CLASS_INSTANCE(ompi_osc_rdma_request_t, ompi_request_t,
64+
request_construct, NULL);

ompi/mca/osc/rdma/osc_rdma_request.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
44
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
55
* reserved.
6+
* Copyright (c) 2019 Triad National Security, LLC. All rights
7+
* reserved.
68
* $COPYRIGHT$
79
*
810
* Additional copyrights may follow
@@ -51,8 +53,6 @@ struct ompi_osc_rdma_request_t {
5153
uint64_t target_address;
5254

5355
struct ompi_osc_rdma_request_t *parent_request;
54-
/* used for non-contiguous get accumulate operations */
55-
opal_convertor_t convertor;
5656

5757
/** synchronization object */
5858
struct ompi_osc_rdma_sync_t *sync;

0 commit comments

Comments
 (0)