1212 * All rights reserved.
1313 * Copyright (c) 2012 Los Alamos National Security, LLC. All rights
1414 * reserved.
15- * Copyright (c) 2013-2014 Intel, Inc. All rights reserved
16- * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
15+ * Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
16+ * Copyright (c) 2015-2020 Cisco Systems, Inc. All rights reserved.
17+ * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. All Rights
18+ * reserved.
19+ * Copyright (c) 2021 Nanook Consulting. All rights reserved.
1720 * $COPYRIGHT$
1821 *
1922 * Additional copyrights may follow
@@ -44,8 +47,6 @@ typedef struct opened_component_t {
4447 mca_pml_base_component_t * om_component ;
4548} opened_component_t ;
4649
47- static bool modex_reqd = false;
48-
4950/**
5051 * Function for selecting one component from all those that are
5152 * available.
@@ -59,7 +60,7 @@ static bool modex_reqd=false;
5960int mca_pml_base_select (bool enable_progress_threads ,
6061 bool enable_mpi_threads )
6162{
62- int i , priority = 0 , best_priority = 0 , num_pml = 0 ;
63+ int i , priority = 0 , best_priority = 0 , num_pml = 0 , ret = 0 ;
6364 opal_list_item_t * item = NULL ;
6465 mca_base_component_list_item_t * cli = NULL ;
6566 mca_pml_base_component_t * component = NULL , * best_component = NULL ;
@@ -186,12 +187,13 @@ int mca_pml_base_select(bool enable_progress_threads,
186187 "selected %s best priority %d\n" ,
187188 best_component -> pmlm_version .mca_component_name , best_priority );
188189
189- /* if more than one PML could be considered, then we still need the
190- * modex since we cannot know which one will be selected on all procs
191- */
192- if (1 < num_pml ) {
193- modex_reqd = true;
194- }
190+ /* Save the winner */
191+
192+ mca_pml_base_selected_component = * best_component ;
193+ mca_pml = * best_module ;
194+ opal_output_verbose ( 10 , ompi_pml_base_framework .framework_output ,
195+ "select: component %s selected" ,
196+ mca_pml_base_selected_component .pmlm_version .mca_component_name );
195197
196198 /* Finalize all non-selected components */
197199
@@ -239,14 +241,6 @@ int mca_pml_base_select(bool enable_progress_threads,
239241 }
240242#endif
241243
242- /* Save the winner */
243-
244- mca_pml_base_selected_component = * best_component ;
245- mca_pml = * best_module ;
246- opal_output_verbose ( 10 , ompi_pml_base_framework .framework_output ,
247- "select: component %s selected" ,
248- mca_pml_base_selected_component .pmlm_version .mca_component_name );
249-
250244 /* This base function closes, unloads, and removes from the
251245 available list all unselected components. The available list will
252246 contain only the selected component. */
@@ -287,13 +281,11 @@ int mca_pml_base_select(bool enable_progress_threads,
287281 }
288282
289283 /* register winner in the modex */
290- if (modex_reqd && 0 == OMPI_PROC_MY_NAME -> vpid ) {
291- mca_pml_base_pml_selected (best_component -> pmlm_version .mca_component_name );
292- }
284+ ret = mca_pml_base_pml_selected (best_component -> pmlm_version .mca_component_name );
293285
294286 /* All done */
295287
296- return OMPI_SUCCESS ;
288+ return ret ;
297289}
298290
299291/* need a "commonly" named PML structure so everything ends up in the
@@ -307,49 +299,56 @@ static mca_base_component_t pml_base_component = {
307299};
308300
309301
302+ /*
303+ * If direct modex, then publish PML for all procs. If full modex then
304+ * publish PML for rank 0 only. This information is used during add_procs
305+ * to perform PML check.
306+ * During PML check, for direct modex, compare our PML with the peer's
307+ * PML for all procs in the add_procs call. This does not change the
308+ * connection complexity of modex transfers, since adding the proc is
309+ * going to get the peer information in the MTL/PML/BTL anyway.
310+ * For full modex, compare our PML with rank 0.
311+ * Direct Modex is performed when collect_all_data is false, as we do
312+ * not perform a fence operation during MPI_Init if async_modex is true.
313+ * If async_modex is false and collect_all_data is false then we do a
314+ * zero-byte barrier and we would still require direct modex during
315+ * add_procs
316+ */
310317int
311318mca_pml_base_pml_selected (const char * name )
312319{
313- int rc ;
320+ int rc = 0 ;
321+
322+ if (!opal_pmix_collect_all_data || 0 == OMPI_PROC_MY_NAME -> vpid ) {
323+ OPAL_MODEX_SEND (rc , OPAL_PMIX_GLOBAL , & pml_base_component , name ,
324+ strlen (name ) + 1 );
325+ }
314326
315- OPAL_MODEX_SEND (rc , OPAL_PMIX_GLOBAL , & pml_base_component , name , strlen (name ) + 1 );
316327 return rc ;
317328}
318329
319- int
320- mca_pml_base_pml_check_selected (const char * my_pml ,
321- ompi_proc_t * * procs ,
322- size_t nprocs )
330+ static int
331+ mca_pml_base_pml_check_selected_impl (const char * my_pml ,
332+ opal_process_name_t proc_name )
323333{
324334 size_t size ;
325- int ret ;
335+ int ret = 0 ;
326336 char * remote_pml ;
327337
328- /* if no modex was required by the PML, then
329- * we can assume success
330- */
331- if (!modex_reqd ) {
338+ /* if we are proc_name=OMPI_PROC_MY_NAME, then we can also assume success */
339+ if (0 == opal_compare_proc (ompi_proc_local ()-> super .proc_name , proc_name )) {
332340 opal_output_verbose ( 10 , ompi_pml_base_framework .framework_output ,
333- "check:select: modex not reqd " );
341+ "check:select: PML check not necessary on self " );
334342 return OMPI_SUCCESS ;
335343 }
336-
337- /* if we are rank=0, then we can also assume success */
338- if (0 == OMPI_PROC_MY_NAME -> vpid ) {
344+ OPAL_MODEX_RECV_STRING (ret ,
345+ mca_base_component_to_string (& pml_base_component ),
346+ & proc_name , (void * * ) & remote_pml , & size );
347+ if (OPAL_ERR_NOT_FOUND == ret ) {
339348 opal_output_verbose ( 10 , ompi_pml_base_framework .framework_output ,
340- "check:select: rank=0" );
341- return OMPI_SUCCESS ;
342- }
343-
344- /* get the name of the PML module selected by rank=0 */
345- OPAL_MODEX_RECV (ret , & pml_base_component ,
346- & procs [0 ]-> super .proc_name , (void * * ) & remote_pml , & size );
347-
348- /* if this key wasn't found, then just assume all is well... */
349- if (OMPI_SUCCESS != ret ) {
350- opal_output_verbose ( 10 , ompi_pml_base_framework .framework_output ,
351- "check:select: modex data not found" );
352- return OMPI_SUCCESS ;
349+ "check:select: PML modex for process %s not found" ,
350+ OMPI_NAME_PRINT (& proc_name ));
351+ return OMPI_ERR_NOT_FOUND ;
353352 }
354353
355354 /* the remote pml returned should never be NULL if an error
@@ -358,26 +357,68 @@ mca_pml_base_pml_check_selected(const char *my_pml,
358357 */
359358 if (NULL == remote_pml ) {
360359 opal_output_verbose ( 10 , ompi_pml_base_framework .framework_output ,
361- "check:select: got a NULL pml from rank=0" );
360+ "check:select: got a NULL pml from process %s" ,
361+ OMPI_NAME_PRINT (& proc_name ));
362362 return OMPI_ERR_UNREACH ;
363363 }
364364
365365 opal_output_verbose ( 10 , ompi_pml_base_framework .framework_output ,
366- "check:select: checking my pml %s against rank=0 pml %s" ,
367- my_pml , remote_pml );
366+ "check:select: checking my pml %s against process %s"
367+ " pml %s" , my_pml , OMPI_NAME_PRINT (& proc_name ),
368+ remote_pml );
368369
369370 /* if that module doesn't match my own, return an error */
370371 if ((size != strlen (my_pml ) + 1 ) ||
371372 (0 != strcmp (my_pml , remote_pml ))) {
373+ char * errhost = NULL ;
374+ OPAL_MODEX_RECV_VALUE_OPTIONAL (ret , OPAL_PMIX_HOSTNAME , & proc_name ,
375+ & (errhost ), OPAL_STRING );
372376 opal_output (0 , "%s selected pml %s, but peer %s on %s selected pml %s" ,
373377 OMPI_NAME_PRINT (& ompi_proc_local ()-> super .proc_name ),
374- my_pml , OMPI_NAME_PRINT (& procs [ 0 ] -> super . proc_name ),
375- (NULL == procs [ 0 ] -> super . proc_hostname ) ? "unknown" : procs [ 0 ] -> super . proc_hostname ,
378+ my_pml , OMPI_NAME_PRINT (& proc_name ),
379+ (NULL == errhost ) ? "unknown" : errhost ,
376380 remote_pml );
377- free (remote_pml ); /* cleanup before returning */
381+ free (remote_pml );
382+ free (errhost );
383+ /* cleanup before returning */
378384 return OMPI_ERR_UNREACH ;
379385 }
380386
381387 free (remote_pml );
382388 return OMPI_SUCCESS ;
383389}
390+
391+ int
392+ mca_pml_base_pml_check_selected (const char * my_pml ,
393+ ompi_proc_t * * procs ,
394+ size_t nprocs )
395+ {
396+ int ret = 0 ;
397+ size_t i ;
398+
399+ if (!opal_pmix_collect_all_data ) {
400+ /*
401+ * If direct modex, then compare our PML with the peer's PML
402+ * for all procs
403+ */
404+ for (i = 0 ; i < nprocs ; i ++ ) {
405+ ret = mca_pml_base_pml_check_selected_impl (
406+ my_pml ,
407+ procs [i ]-> super .proc_name );
408+ if (ret ) {
409+ return ret ;
410+ }
411+ }
412+ } else {
413+ /* else if full modex compare our PML with rank 0 */
414+ opal_process_name_t proc_name = {
415+ .jobid = ompi_proc_local ()-> super .proc_name .jobid ,
416+ .vpid = 0
417+ };
418+ ret = mca_pml_base_pml_check_selected_impl (
419+ my_pml ,
420+ proc_name );
421+ }
422+
423+ return ret ;
424+ }
0 commit comments