Commit 8b534e9

Author: rhc54 (committed)
Merge pull request #1668 from rhc54/topic/slurm
When direct launching applications, we must allow the MPI layer to pr…
2 parents: 9066d26 + 01ba861

9 files changed, +187 −69 lines

ompi/mca/rte/rte.h

Lines changed: 19 additions & 19 deletions

Both wait macros are re-indented; the removed and added lines carry identical text, so the change is a whitespace-only realignment of the line-continuation backslashes. Only the added side is shown below.

@@ -205,27 +205,27 @@ OMPI_DECLSPEC extern mca_base_framework_t ompi_rte_base_framework;
  * progress while waiting, so we loop over opal_progress, letting
  * the RTE progress thread move the RTE along
  */
+#define OMPI_WAIT_FOR_COMPLETION(flg)                                    \
+    do {                                                                 \
+        opal_output_verbose(1, ompi_rte_base_framework.framework_output, \
+                            "%s waiting on RTE event at %s:%d",          \
+                            OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),          \
+                            __FILE__, __LINE__);                         \
+        while ((flg)) {                                                  \
+            opal_progress();                                             \
+        }                                                                \
 }while(0);
 
+#define OMPI_LAZY_WAIT_FOR_COMPLETION(flg)                               \
+    do {                                                                 \
+        opal_output_verbose(1, ompi_rte_base_framework.framework_output, \
+                            "%s lazy waiting on RTE event at %s:%d",     \
+                            OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),          \
+                            __FILE__, __LINE__);                         \
+        while ((flg)) {                                                  \
+            opal_progress();                                             \
+            usleep(100);                                                 \
+        }                                                                \
 }while(0);
 
 typedef struct {
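
Elsewhere in this commit (see the ompi_mpi_init.c and ompi_mpi_finalize.c hunks below) these macros are paired with a PMIx completion callback that clears a volatile flag. A condensed sketch of that caller-side pattern, using the fence_release callback added in ompi_mpi_init.c:

    /* callback invoked by the PMIx component when the fence completes */
    static void fence_release(int status, void *cbdata)
    {
        volatile bool *active = (volatile bool*)cbdata;
        *active = false;                       /* lets the wait macro fall through */
    }

    /* caller: start a non-blocking fence, then progress MPI until released */
    volatile bool active = true;
    opal_pmix.fence_nb(NULL, opal_pmix_collect_all_data,
                       fence_release, (void*)&active);
    OMPI_WAIT_FOR_COMPLETION(active);          /* cycles opal_progress() */

ompi_mpi_finalize.c uses OMPI_LAZY_WAIT_FOR_COMPLETION instead, which adds a usleep(100) between progress cycles so the finalize-time wait does not busy-spin.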

ompi/runtime/ompi_mpi_finalize.c

Lines changed: 12 additions & 11 deletions

@@ -16,7 +16,7 @@
  * Copyright (c) 2006 University of Houston. All rights reserved.
  * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
  * Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
- * Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
+ * Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
  *
  * $COPYRIGHT$
  *
@@ -248,19 +248,20 @@ int ompi_mpi_finalize(void)
        more details). */
     if (NULL != opal_pmix.fence_nb) {
         active = true;
-        /* Note that the non-blocking PMIx fence will cycle calling
-           opal_progress(), which will allow any other pending
-           communications/actions to complete. See
-           https://github.com/open-mpi/ompi/issues/1576 for the
-           original bug report. */
+        /* Note that use of the non-blocking PMIx fence will
+         * allow us to lazily cycle calling
+         * opal_progress(), which will allow any other pending
+         * communications/actions to complete. See
+         * https://github.com/open-mpi/ompi/issues/1576 for the
+         * original bug report. */
         opal_pmix.fence_nb(NULL, 0, fence_cbfunc, (void*)&active);
-        OMPI_WAIT_FOR_COMPLETION(active);
+        OMPI_LAZY_WAIT_FOR_COMPLETION(active);
     } else {
         /* However, we cannot guarantee that the provided PMIx has
-           fence_nb. If it doesn't, then do the best we can: an MPI
-           barrier on COMM_WORLD (which isn't the best because of the
-           reasons cited above), followed by a blocking PMIx fence
-           (which may not necessarily call opal_progress()). */
+         * fence_nb. If it doesn't, then do the best we can: an MPI
+         * barrier on COMM_WORLD (which isn't the best because of the
+         * reasons cited above), followed by a blocking PMIx fence
+         * (which does not call opal_progress()). */
         ompi_communicator_t *comm = &ompi_mpi_comm_world.comm;
         comm->c_coll.coll_barrier(comm, comm->c_coll.coll_barrier_module);

ompi/runtime/ompi_mpi_init.c

Lines changed: 31 additions & 7 deletions

@@ -362,6 +362,12 @@ static int ompi_register_mca_variables(void)
     return OMPI_SUCCESS;
 }
 
+static void fence_release(int status, void *cbdata)
+{
+    volatile bool *active = (volatile bool*)cbdata;
+    *active = false;
+}
+
 int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
 {
     int ret;
@@ -370,6 +376,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
     char *error = NULL;
     char *cmd=NULL, *av=NULL;
     ompi_errhandler_errtrk_t errtrk;
+    volatile bool active;
     OPAL_TIMING_DECLARE(tm);
     OPAL_TIMING_INIT_EXT(&tm, OPAL_TIMING_GET_TIME_OF_DAY);
 
@@ -634,13 +641,23 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
      * if data exchange is required. The modex occurs solely across procs
      * in our job. If a barrier is required, the "modex" function will
      * perform it internally */
-    OPAL_MODEX();
+    active = true;
+    opal_pmix.commit();
+    if (!opal_pmix_base_async_modex) {
+        if (NULL != opal_pmix.fence_nb) {
+            opal_pmix.fence_nb(NULL, opal_pmix_collect_all_data,
+                               fence_release, (void*)&active);
+            OMPI_WAIT_FOR_COMPLETION(active);
+        } else {
+            opal_pmix.fence(NULL, opal_pmix_collect_all_data);
+        }
+    }
 
     OPAL_TIMING_MNEXT((&tm,"time from modex to first barrier"));
 
     /* select buffered send allocator component to be used */
     if( OMPI_SUCCESS !=
-        (ret = mca_pml_base_bsend_init(ompi_mpi_thread_multiple))) {
+       (ret = mca_pml_base_bsend_init(ompi_mpi_thread_multiple))) {
         error = "mca_pml_base_bsend_init() failed";
         goto error;
     }
@@ -802,7 +819,15 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
     /* wait for everyone to reach this point - this is a hard
      * barrier requirement at this time, though we hope to relax
      * it at a later point */
-    opal_pmix.fence(NULL, 0);
+    active = true;
+    opal_pmix.commit();
+    if (NULL != opal_pmix.fence_nb) {
+        opal_pmix.fence_nb(NULL, opal_pmix_collect_all_data,
+                           fence_release, (void*)&active);
+        OMPI_WAIT_FOR_COMPLETION(active);
+    } else {
+        opal_pmix.fence(NULL, opal_pmix_collect_all_data);
+    }
 
     /* check for timing request - get stop time and report elapsed
        time if so, then start the clock again */
@@ -839,10 +864,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
        e.g. hierarch, might create subcommunicators. The threadlevel
        requested by all processes is required in order to know
        which cid allocation algorithm can be used. */
-    if ( OMPI_SUCCESS !=
-         ( ret = ompi_comm_cid_init ())) {
-        error = "ompi_mpi_init: ompi_comm_cid_init failed";
-        goto error;
+    if (OMPI_SUCCESS != ( ret = ompi_comm_cid_init ())) {
+        error = "ompi_mpi_init: ompi_comm_cid_init failed";
+        goto error;
     }
 
     /* Init coll for the comms. This has to be after dpm_base_select,

opal/mca/pmix/cray/pmix_cray.c

Lines changed: 41 additions & 7 deletions

@@ -57,7 +57,8 @@ static int cray_resolve_peers(const char *nodename,
                               opal_list_t *procs);
 static int cray_resolve_nodes(opal_jobid_t jobid, char **nodelist);
 static int cray_put(opal_pmix_scope_t scope, opal_value_t *kv);
-static int cray_fence(opal_list_t *procs, int collect_data);
+static int cray_fencenb(opal_list_t *procs, int collect_data,
+                        opal_pmix_op_cbfunc_t cbfunc, void *cbdata);
 static int cray_commit(void);
 static int cray_get(const opal_process_name_t *id,
                     const char *key, opal_list_t *info,
@@ -90,8 +91,8 @@ const opal_pmix_base_module_t opal_pmix_cray_module = {
     .initialized = cray_initialized,
     .abort = cray_abort,
     .commit = cray_commit,
-    .fence = cray_fence,
-    .fence_nb = NULL,
+    .fence = NULL,
+    .fence_nb = cray_fencenb,
     .put = cray_put,
     .get = cray_get,
     .get_nb = cray_get_nb,
@@ -119,6 +120,17 @@ const opal_pmix_base_module_t opal_pmix_cray_module = {
 // usage accounting
 static int pmix_init_count = 0;
 
+// local object
+typedef struct {
+    opal_object_t super;
+    opal_event_t ev;
+    opal_pmix_op_cbfunc_t opcbfunc;
+    void *cbdata;
+} pmi_opcaddy_t;
+OBJ_CLASS_INSTANCE(pmi_opcaddy_t,
+                   opal_object_t,
+                   NULL, NULL);
+
 // PMI constant values:
 static int pmix_kvslen_max = 0;
 static int pmix_keylen_max = 0;
@@ -524,8 +536,9 @@ static int cray_commit(void)
     return OPAL_SUCCESS;
 }
 
-static int cray_fence(opal_list_t *procs, int collect_data)
+static void fencenb(int sd, short args, void *cbdata)
 {
+    pmi_opcaddy_t *op = (pmi_opcaddy_t*)cbdata;
     int rc, cnt;
     int32_t i;
     int *all_lens = NULL;
@@ -562,7 +575,8 @@ static int cray_fence(opal_list_t *procs, int collect_data)
 
     send_buffer = OBJ_NEW(opal_buffer_t);
     if (NULL == send_buffer) {
-        return OPAL_ERR_OUT_OF_RESOURCE;
+        rc = OPAL_ERR_OUT_OF_RESOURCE;
+        goto fn_exit;
     }
 
     opal_dss.copy_payload(send_buffer, mca_pmix_cray_component.cache_global);
@@ -680,7 +694,7 @@ static int cray_fence(opal_list_t *procs, int collect_data)
      * for every process in the job.
      *
      * we only need to set locality for each local rank as "not found"
-     * equates to "non-local"
+     * equates to "non-local"
      */
 
     for (i=0; i < pmix_nlranks; i++) {
@@ -744,7 +758,27 @@ static int cray_fence(opal_list_t *procs, int collect_data)
     if (r_bytes_and_ranks != NULL) {
         free(r_bytes_and_ranks);
     }
-    return rc;
+    if (NULL != op->opcbfunc) {
+        op->opcbfunc(rc, op->cbdata);
+    }
+    OBJ_RELEASE(op);
+    return;
+}
+
+static int cray_fencenb(opal_list_t *procs, int collect_data,
+                        opal_pmix_op_cbfunc_t cbfunc, void *cbdata)
+{
+    pmi_opcaddy_t *op;
+
+    /* thread-shift this so we don't block in Cray's barrier */
+    op = OBJ_NEW(pmi_opcaddy_t);
+    op->opcbfunc = cbfunc;
+    op->cbdata = cbdata;
+    event_assign(&op->ev, opal_pmix_base.evbase, -1,
+                 EV_WRITE, fencenb, op);
+    event_active(&op->ev, EV_WRITE, 1);
+
+    return OPAL_SUCCESS;
 }
 
 static int cray_get(const opal_process_name_t *id, const char *key, opal_list_t *info, opal_value_t **kv)
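
The cray_fencenb/fencenb pair above (mirrored in the s1 component below) is a thread-shift: the potentially blocking PMI exchange is pushed onto the OPAL event base so the non-blocking entry point can return immediately and report completion through a callback. A minimal, self-contained libevent sketch of the same idiom; all names and the placeholder work here are illustrative, not part of the commit:

    #include <event2/event.h>
    #include <stdio.h>
    #include <stdlib.h>

    typedef void (*op_cbfunc_t)(int status, void *cbdata);

    /* caddy carrying the deferred work's event and completion callback,
     * analogous to the pmi_opcaddy_t object above */
    typedef struct {
        struct event ev;
        op_cbfunc_t cbfunc;
        void *cbdata;
    } opcaddy_t;

    /* runs inside the event loop: do the (potentially blocking) work,
     * then report completion through the stored callback */
    static void do_blocking_work(evutil_socket_t sd, short args, void *cbdata)
    {
        opcaddy_t *op = (opcaddy_t *)cbdata;
        /* a real component would run its blocking barrier/exchange here */
        if (NULL != op->cbfunc) {
            op->cbfunc(0, op->cbdata);
        }
        free(op);
    }

    /* non-blocking entry point: thread-shift the work onto the event base
     * and return immediately, as cray_fencenb/s1_fencenb do */
    static int work_nb(struct event_base *base, op_cbfunc_t cbfunc, void *cbdata)
    {
        opcaddy_t *op = calloc(1, sizeof(*op));
        op->cbfunc = cbfunc;
        op->cbdata = cbdata;
        event_assign(&op->ev, base, -1, EV_WRITE, do_blocking_work, op);
        event_active(&op->ev, EV_WRITE, 1);    /* queue it for the loop */
        return 0;
    }

    static void done(int status, void *cbdata)
    {
        printf("fence complete, status %d\n", status);
        event_base_loopbreak((struct event_base *)cbdata);
    }

    int main(void)
    {
        struct event_base *base = event_base_new();
        work_nb(base, done, base);
        event_base_dispatch(base);    /* stands in for the OPAL progress thread */
        event_base_free(base);
        return 0;
    }

Here event_base_dispatch() plays the role of the progress thread that drains opal_pmix_base.evbase, so the caller can keep cycling opal_progress() until the callback fires.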

opal/mca/pmix/external/pmix_ext_client.c

Lines changed: 2 additions & 0 deletions

@@ -369,6 +369,8 @@ int pmix1_fencenb(opal_list_t *procs, int collect_data,
     if (collect_data) {
         PMIX_INFO_CONSTRUCT(&info);
         (void)strncpy(info.key, PMIX_COLLECT_DATA, PMIX_MAX_KEYLEN);
+        info.value.type = PMIX_BOOL;
+        info.value.data.flag = true;
         iptr = &info;
         n = 1;
     } else {

opal/mca/pmix/pmix.h

Lines changed: 0 additions & 15 deletions

@@ -250,21 +250,6 @@ extern int opal_pmix_base_exchange(opal_value_t *info,
         }                                               \
     } while(0);
 
-
-/**
- * Provide a simplified macro for calling the fence function
- * that takes into account directives and availability of
- * non-blocking operations
- */
-#define OPAL_MODEX()                                        \
-    do {                                                    \
-        opal_pmix.commit();                                 \
-        if (!opal_pmix_base_async_modex) {                  \
-            opal_pmix.fence(NULL,                           \
-                            opal_pmix_collect_all_data);    \
-        }                                                   \
-    } while(0);
-
 /**
  * Provide a macro for accessing a base function that exchanges
  * data values between two procs using the PMIx Publish/Lookup
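
With the OPAL_MODEX macro removed, the commit/fence decision is made inline at the call site so the non-blocking fence can be preferred when the active PMIx component provides one. Condensed from the ompi_mpi_init.c hunk above (fence_release and active are defined there):

    active = true;
    opal_pmix.commit();
    if (!opal_pmix_base_async_modex) {
        if (NULL != opal_pmix.fence_nb) {
            /* non-blocking fence: MPI keeps progressing while we wait */
            opal_pmix.fence_nb(NULL, opal_pmix_collect_all_data,
                               fence_release, (void*)&active);
            OMPI_WAIT_FOR_COMPLETION(active);
        } else {
            /* fall back to the blocking fence */
            opal_pmix.fence(NULL, opal_pmix_collect_all_data);
        }
    }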

opal/mca/pmix/pmix114/pmix1_client.c

Lines changed: 2 additions & 0 deletions

@@ -364,6 +364,8 @@ int pmix1_fencenb(opal_list_t *procs, int collect_data,
     if (collect_data) {
         PMIX_INFO_CONSTRUCT(&info);
         (void)strncpy(info.key, PMIX_COLLECT_DATA, PMIX_MAX_KEYLEN);
+        info.value.type = PMIX_BOOL;
+        info.value.data.flag = true;
         iptr = &info;
         n = 1;
     } else {

opal/mca/pmix/s1/pmix_s1.c

Lines changed: 40 additions & 5 deletions

@@ -36,7 +36,8 @@ static int s1_initialized(void);
 static int s1_abort(int flag, const char msg[],
                     opal_list_t *procs);
 static int s1_commit(void);
-static int s1_fence(opal_list_t *procs, int collect_data);
+static int s1_fencenb(opal_list_t *procs, int collect_data,
+                      opal_pmix_op_cbfunc_t cbfunc, void *cbdata);
 static int s1_put(opal_pmix_scope_t scope,
                   opal_value_t *kv);
 static int s1_get(const opal_process_name_t *id,
@@ -59,7 +60,7 @@ const opal_pmix_base_module_t opal_pmix_s1_module = {
     .initialized = s1_initialized,
     .abort = s1_abort,
     .commit = s1_commit,
-    .fence = s1_fence,
+    .fence_nb = s1_fencenb,
     .put = s1_put,
     .get = s1_get,
     .publish = s1_publish,
@@ -78,6 +79,17 @@ const opal_pmix_base_module_t opal_pmix_s1_module = {
 // usage accounting
 static int pmix_init_count = 0;
 
+// local object
+typedef struct {
+    opal_object_t super;
+    opal_event_t ev;
+    opal_pmix_op_cbfunc_t opcbfunc;
+    void *cbdata;
+} pmi_opcaddy_t;
+OBJ_CLASS_INSTANCE(pmi_opcaddy_t,
+                   opal_object_t,
+                   NULL, NULL);
+
 // PMI constant values:
 static int pmix_kvslen_max = 0;
 static int pmix_keylen_max = 0;
@@ -512,8 +524,9 @@ static int s1_commit(void)
     return OPAL_SUCCESS;
 }
 
-static int s1_fence(opal_list_t *procs, int collect_data)
+static void fencenb(int sd, short args, void *cbdata)
 {
+    pmi_opcaddy_t *op = (pmi_opcaddy_t*)cbdata;
     int rc;
     int32_t i;
     opal_value_t *kp, kvn;
@@ -527,7 +540,8 @@ static int s1_fence(opal_list_t *procs, int collect_data)
     /* use the PMI barrier function */
     if (PMI_SUCCESS != (rc = PMI_Barrier())) {
         OPAL_PMI_ERROR(rc, "PMI_Barrier");
-        return OPAL_ERROR;
+        rc = OPAL_ERROR;
+        goto cleanup;
     }
 
     opal_output_verbose(2, opal_pmix_base_framework.framework_output,
@@ -548,7 +562,7 @@ static int s1_fence(opal_list_t *procs, int collect_data)
                         &kp, pmix_kvs_name, pmix_vallen_max, kvs_get);
     if (OPAL_SUCCESS != rc) {
         OPAL_ERROR_LOG(rc);
-        return rc;
+        goto cleanup;
     }
     if (NULL == kp || NULL == kp->data.string) {
         /* if we share a node, but we don't know anything more, then
@@ -579,6 +593,27 @@ static int s1_fence(opal_list_t *procs, int collect_data)
         }
     }
 
+ cleanup:
+    if (NULL != op->opcbfunc) {
+        op->opcbfunc(rc, op->cbdata);
+    }
+    OBJ_RELEASE(op);
+    return;
+}
+
+static int s1_fencenb(opal_list_t *procs, int collect_data,
+                      opal_pmix_op_cbfunc_t cbfunc, void *cbdata)
+{
+    pmi_opcaddy_t *op;
+
+    /* thread-shift this so we don't block in SLURM's barrier */
+    op = OBJ_NEW(pmi_opcaddy_t);
+    op->opcbfunc = cbfunc;
+    op->cbdata = cbdata;
+    event_assign(&op->ev, opal_pmix_base.evbase, -1,
+                 EV_WRITE, fencenb, op);
+    event_active(&op->ev, EV_WRITE, 1);
+
     return OPAL_SUCCESS;
 }
