Skip to content

comm: add pmix timeout knob to group ops #13010

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions ompi/communicator/comm_cid.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
* Copyright (c) 2017 Mellanox Technologies. All rights reserved.
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
* Copyright (c) 2020-2024 Triad National Security, LLC. All rights
* Copyright (c) 2020-2025 Triad National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
Expand Down Expand Up @@ -320,6 +320,7 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
pmix_proc_t *procs = NULL;
void *grpinfo = NULL, *list = NULL;
pmix_data_array_t darray;
pmix_info_t tinfo;

switch (mode) {
case OMPI_COMM_CID_GROUP_NEW:
Expand Down Expand Up @@ -349,6 +350,13 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
goto fn_exit;
}

rc = PMIx_Info_list_add(grpinfo, PMIX_TIMEOUT, &ompi_pmix_connect_timeout, PMIX_UINT32);
if (PMIX_SUCCESS != rc) {
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_add failed %s %d", PMIx_Error_string(rc), __LINE__));
rc = OMPI_ERR_OUT_OF_RESOURCE;
goto fn_exit;
}

list = PMIx_Info_list_start();

size_t c_index = (size_t)newcomm->c_index;
Expand Down Expand Up @@ -450,7 +458,10 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
tag, tproc_count, ninfo, cid_base));

/* destruct the group */
rc = PMIx_Group_destruct (tag, NULL, 0);
/* Attach the user-configurable timeout to the group destruct call.
 * The last argument of PMIx_Group_destruct is the number of elements in
 * the info array; it must be 1 here, since passing 0 would mean the
 * PMIX_TIMEOUT directive loaded into tinfo is never handed to PMIx. */
PMIX_INFO_CONSTRUCT(&tinfo);
PMIX_INFO_LOAD(&tinfo, PMIX_TIMEOUT, &ompi_pmix_connect_timeout, PMIX_UINT32);
rc = PMIx_Group_destruct (tag, &tinfo, 1);
PMIX_INFO_DESTRUCT(&tinfo);
if(PMIX_SUCCESS != rc) {
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Group_destruct failed %s", PMIx_Error_string(rc)));
rc = opal_pmix_convert_status(rc);
Expand Down
4 changes: 2 additions & 2 deletions ompi/runtime/ompi_mpi_params.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
* All rights reserved.
* Copyright (c) 2016-2021 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2018-2024 Triad National Security, LLC. All rights
* Copyright (c) 2018-2025 Triad National Security, LLC. All rights
* reserved.
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
* Copyright (c) 2022 IBM Corporation. All rights reserved.
Expand Down Expand Up @@ -391,7 +391,7 @@ int ompi_mpi_register_params(void)

ompi_pmix_connect_timeout = 0; /* infinite timeout - see PMIx standard */
(void) mca_base_var_register ("ompi", "mpi", NULL, "pmix_connect_timeout",
"Timeout(secs) for calls to PMIx_Connect. Default is no timeout.",
"Timeout(secs) for calls to PMIx_Connect and PMIx_Group_construct/destruct. Default is no timeout.",
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL,
0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
&ompi_pmix_connect_timeout);
Expand Down
Loading