@@ -116,6 +116,8 @@ static int ompi_proc_allocate (ompi_jobid_t jobid, ompi_vpid_t vpid, ompi_proc_t
116116 opal_hash_table_set_value_ptr (& ompi_proc_hash , & proc -> super .proc_name , sizeof (proc -> super .proc_name ),
117117 proc );
118118
 119+ /* by default we consider the process to be remote */
 120+ proc -> super .proc_flags = OPAL_PROC_NON_LOCAL ;
119121 * procp = proc ;
120122
121123 return OMPI_SUCCESS ;
@@ -133,26 +135,14 @@ static int ompi_proc_allocate (ompi_jobid_t jobid, ompi_vpid_t vpid, ompi_proc_t
133135 */
134136int ompi_proc_complete_init_single (ompi_proc_t * proc )
135137{
136- uint16_t u16 , * u16ptr ;
137138 int ret ;
138139
139- u16ptr = & u16 ;
140-
141140 if ((OMPI_CAST_RTE_NAME (& proc -> super .proc_name )-> jobid == OMPI_PROC_MY_NAME -> jobid ) &&
142141 (OMPI_CAST_RTE_NAME (& proc -> super .proc_name )-> vpid == OMPI_PROC_MY_NAME -> vpid )) {
143142 /* nothing else to do */
144143 return OMPI_SUCCESS ;
145144 }
146145
147- /* get the locality information - all RTEs are required
148- * to provide this information at startup */
149- OPAL_MODEX_RECV_VALUE_OPTIONAL (ret , OPAL_PMIX_LOCALITY , & proc -> super .proc_name , & u16ptr , OPAL_UINT16 );
150- if (OPAL_SUCCESS != ret ) {
151- proc -> super .proc_flags = OPAL_PROC_NON_LOCAL ;
152- } else {
153- proc -> super .proc_flags = u16 ;
154- }
155-
156146 /* we can retrieve the hostname at no cost because it
157147 * was provided at startup - but make it optional so
158148 * we don't chase after it if some system doesn't
@@ -287,20 +277,6 @@ int ompi_proc_init(void)
287277 }
288278#endif
289279
290- if (ompi_process_info .num_procs < ompi_add_procs_cutoff ) {
291- /* create proc structures and find self */
292- for (ompi_vpid_t i = 0 ; i < ompi_process_info .num_procs ; ++ i ) {
293- if (i == OMPI_PROC_MY_NAME -> vpid ) {
294- continue ;
295- }
296-
297- ret = ompi_proc_allocate (OMPI_PROC_MY_NAME -> jobid , i , & proc );
298- if (OMPI_SUCCESS != ret ) {
299- return ret ;
300- }
301- }
302- }
303-
304280 return OMPI_SUCCESS ;
305281}
306282
@@ -329,47 +305,77 @@ static int ompi_proc_compare_vid (opal_list_item_t **a, opal_list_item_t **b)
329305 */
330306int ompi_proc_complete_init (void )
331307{
308+ opal_process_name_t wildcard_rank ;
332309 ompi_proc_t * proc ;
333310 int ret , errcode = OMPI_SUCCESS ;
311+ char * val ;
334312
335313 opal_mutex_lock (& ompi_proc_lock );
336314
315+ /* Add all local peers first */
316+ wildcard_rank .jobid = OMPI_PROC_MY_NAME -> jobid ;
317+ wildcard_rank .vpid = OMPI_NAME_WILDCARD -> vpid ;
318+ /* retrieve the local peers */
319+ OPAL_MODEX_RECV_VALUE (ret , OPAL_PMIX_LOCAL_PEERS ,
320+ & wildcard_rank , & val , OPAL_STRING );
321+ if (OPAL_SUCCESS == ret && NULL != val ) {
322+ char * * peers = opal_argv_split (val , ',' );
323+ int i ;
324+ free (val );
325+ for (i = 0 ; NULL != peers [i ]; i ++ ) {
326+ ompi_vpid_t local_rank = strtoul (peers [i ], NULL , 10 );
327+ uint16_t u16 , * u16ptr = & u16 ;
328+ if (OMPI_PROC_MY_NAME -> vpid == local_rank ) {
329+ continue ;
330+ }
331+ ret = ompi_proc_allocate (OMPI_PROC_MY_NAME -> jobid , local_rank , & proc );
332+ if (OMPI_SUCCESS != ret ) {
333+ return ret ;
334+ }
335+ /* get the locality information - all RTEs are required
336+ * to provide this information at startup */
337+ OPAL_MODEX_RECV_VALUE_OPTIONAL (ret , OPAL_PMIX_LOCALITY , & proc -> super .proc_name , & u16ptr , OPAL_UINT16 );
338+ if (OPAL_SUCCESS == ret ) {
339+ proc -> super .proc_flags = u16 ;
340+ }
341+ }
342+ opal_argv_free (peers );
343+ }
344+
345+ /* Complete initialization of node-local procs */
337346 OPAL_LIST_FOREACH (proc , & ompi_proc_list , ompi_proc_t ) {
338347 ret = ompi_proc_complete_init_single (proc );
339348 if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
340349 errcode = ret ;
341350 break ;
342351 }
343352 }
344- opal_mutex_unlock (& ompi_proc_lock );
345353
346- if (ompi_process_info .num_procs >= ompi_add_procs_cutoff ) {
347- char * val = NULL ;
348- opal_process_name_t wildcard_rank ;
349- wildcard_rank .jobid = OMPI_PROC_MY_NAME -> jobid ;
350- wildcard_rank .vpid = OMPI_NAME_WILDCARD -> vpid ;
351- /* retrieve the local peers */
352- OPAL_MODEX_RECV_VALUE (ret , OPAL_PMIX_LOCAL_PEERS ,
353- & wildcard_rank , & val , OPAL_STRING );
354- if (OPAL_SUCCESS == ret && NULL != val ) {
355- char * * peers = opal_argv_split (val , ',' );
356- int i ;
357- free (val );
358- for (i = 0 ; NULL != peers [i ]; i ++ ) {
359- ompi_vpid_t local_rank = strtoul (peers [i ], NULL , 10 );
360- opal_process_name_t proc_name = {.vpid = local_rank , .jobid = OMPI_PROC_MY_NAME -> jobid };
361-
362- if (OMPI_PROC_MY_NAME -> vpid == local_rank ) {
363- continue ;
364- }
365- (void ) ompi_proc_for_name (proc_name );
366- }
367- opal_argv_free (peers );
354+ /* if cutoff is larger than # of procs - add all processes
355+ * NOTE that local procs will be automatically skipped as they
356+ * are already in the hash table
357+ */
358+ if (ompi_process_info .num_procs < ompi_add_procs_cutoff ) {
 359+ /* since ompi_proc_for_name locks internally,
 360+ * we need to release the lock here
 361+ */
362+ opal_mutex_unlock (& ompi_proc_lock );
363+
364+ for (ompi_vpid_t i = 0 ; i < ompi_process_info .num_procs ; ++ i ) {
365+ opal_process_name_t proc_name ;
366+ proc_name .jobid = OMPI_PROC_MY_NAME -> jobid ;
367+ proc_name .vpid = i ;
368+ (void ) ompi_proc_for_name (proc_name );
368369 }
370+
 371+ /* re-acquire the lock for the next step - the sort below */
 372+ opal_mutex_lock (& ompi_proc_lock );
369373 }
370374
371375 opal_list_sort (& ompi_proc_list , ompi_proc_compare_vid );
372376
377+ opal_mutex_unlock (& ompi_proc_lock );
378+
373379 return errcode ;
374380}
375381
0 commit comments