Skip to content

Commit 56307af

Browse files
committed
v4.1.x: osc/ucx fix osc ucx component priority selection
Main commit equivalent: c2e6cd9. Signed-off-by: Tomislav Janjusic <tomislavj@nvidia.com>. Co-authored-by: Mamzi Bayatpour <mbayatpour@nvidia.com>. bot:notacherrypick
1 parent 1c67bf1 commit 56307af

File tree

1 file changed

+63
-33
lines changed

1 file changed

+63
-33
lines changed

ompi/mca/osc/ucx/osc_ucx_component.c

Lines changed: 63 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -142,11 +142,69 @@ static int progress_callback(void) {
142142
return 0;
143143
}
144144

145+
static int ucp_context_init(void) {
146+
int ret = OMPI_SUCCESS;
147+
ucs_status_t status;
148+
ucp_config_t *config = NULL;
149+
ucp_params_t context_params;
150+
151+
status = ucp_config_read("MPI", NULL, &config);
152+
if (UCS_OK != status) {
153+
OSC_UCX_VERBOSE(1, "ucp_config_read failed: %d", status);
154+
return OMPI_ERROR;
155+
}
156+
157+
/* initialize UCP context */
158+
memset(&context_params, 0, sizeof(context_params));
159+
context_params.field_mask = UCP_PARAM_FIELD_FEATURES |
160+
UCP_PARAM_FIELD_MT_WORKERS_SHARED |
161+
UCP_PARAM_FIELD_ESTIMATED_NUM_EPS |
162+
UCP_PARAM_FIELD_REQUEST_INIT |
163+
UCP_PARAM_FIELD_REQUEST_SIZE;
164+
context_params.features = UCP_FEATURE_RMA | UCP_FEATURE_AMO32 | UCP_FEATURE_AMO64;
165+
context_params.mt_workers_shared = 0;
166+
context_params.estimated_num_eps = ompi_proc_world_size();
167+
context_params.request_init = internal_req_init;
168+
context_params.request_size = sizeof(ompi_osc_ucx_internal_request_t);
169+
170+
status = ucp_init(&context_params, config, &mca_osc_ucx_component.ucp_context);
171+
ucp_config_release(config);
172+
if (UCS_OK != status) {
173+
OSC_UCX_VERBOSE(1, "ucp_init failed: %d", status);
174+
ret = OMPI_ERROR;
175+
}
176+
177+
return ret;
178+
}
145179
static int component_init(bool enable_progress_threads, bool enable_mpi_threads) {
180+
int ret = OMPI_SUCCESS;
181+
opal_common_ucx_support_level_t support_level;
182+
146183
mca_osc_ucx_component.enable_mpi_threads = enable_mpi_threads;
147184

148185
opal_common_ucx_mca_register();
149-
return OMPI_SUCCESS;
186+
187+
ret = ucp_context_init();
188+
if (OMPI_ERROR == ret) {
189+
return OMPI_ERR_NOT_AVAILABLE;
190+
}
191+
192+
support_level = opal_common_ucx_support_level(mca_osc_ucx_component.ucp_context);
193+
if (OPAL_COMMON_UCX_SUPPORT_NONE == support_level) {
194+
ucp_cleanup(mca_osc_ucx_component.ucp_context);
195+
mca_osc_ucx_component.ucp_context = NULL;
196+
return OMPI_ERR_NOT_AVAILABLE;
197+
}
198+
199+
/*
200+
* Retain priority if we have supported devices and transports.
201+
* Lower priority if we have supported transports, but not supported devices.
202+
*/
203+
mca_osc_ucx_component.priority = (support_level == OPAL_COMMON_UCX_SUPPORT_DEVICE) ?
204+
mca_osc_ucx_component.priority : 19;
205+
OSC_UCX_VERBOSE(2, "returning priority %d", mca_osc_ucx_component.priority);
206+
207+
return ret;
150208
}
151209

152210
static int component_finalize(void) {
@@ -165,7 +223,10 @@ static int component_finalize(void) {
165223
assert(mca_osc_ucx_component.num_incomplete_req_ops == 0);
166224
if (mca_osc_ucx_component.env_initialized == true) {
167225
OBJ_DESTRUCT(&mca_osc_ucx_component.requests);
168-
ucp_cleanup(mca_osc_ucx_component.ucp_context);
226+
if (NULL != mca_osc_ucx_component.ucp_context) {
227+
ucp_cleanup(mca_osc_ucx_component.ucp_context);
228+
mca_osc_ucx_component.ucp_context = NULL;
229+
}
169230
mca_osc_ucx_component.env_initialized = false;
170231
}
171232
opal_common_ucx_mca_deregister();
@@ -317,18 +378,9 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in
317378
_osc_ucx_init_lock();
318379

319380
if (mca_osc_ucx_component.env_initialized == false) {
320-
ucp_config_t *config = NULL;
321-
ucp_params_t context_params;
322381
ucp_worker_params_t worker_params;
323382
ucp_worker_attr_t worker_attr;
324383

325-
status = ucp_config_read("MPI", NULL, &config);
326-
if (UCS_OK != status) {
327-
OSC_UCX_VERBOSE(1, "ucp_config_read failed: %d", status);
328-
ret = OMPI_ERROR;
329-
goto select_unlock;
330-
}
331-
332384
OBJ_CONSTRUCT(&mca_osc_ucx_component.requests, opal_free_list_t);
333385
ret = opal_free_list_init (&mca_osc_ucx_component.requests,
334386
sizeof(ompi_osc_ucx_request_t),
@@ -340,28 +392,6 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in
340392
goto select_unlock;
341393
}
342394

343-
/* initialize UCP context */
344-
345-
memset(&context_params, 0, sizeof(context_params));
346-
context_params.field_mask = UCP_PARAM_FIELD_FEATURES |
347-
UCP_PARAM_FIELD_MT_WORKERS_SHARED |
348-
UCP_PARAM_FIELD_ESTIMATED_NUM_EPS |
349-
UCP_PARAM_FIELD_REQUEST_INIT |
350-
UCP_PARAM_FIELD_REQUEST_SIZE;
351-
context_params.features = UCP_FEATURE_RMA | UCP_FEATURE_AMO32 | UCP_FEATURE_AMO64;
352-
context_params.mt_workers_shared = 0;
353-
context_params.estimated_num_eps = ompi_proc_world_size();
354-
context_params.request_init = internal_req_init;
355-
context_params.request_size = sizeof(ompi_osc_ucx_internal_request_t);
356-
357-
status = ucp_init(&context_params, config, &mca_osc_ucx_component.ucp_context);
358-
ucp_config_release(config);
359-
if (UCS_OK != status) {
360-
OSC_UCX_VERBOSE(1, "ucp_init failed: %d", status);
361-
ret = OMPI_ERROR;
362-
goto select_unlock;
363-
}
364-
365395
assert(mca_osc_ucx_component.ucp_worker == NULL);
366396
memset(&worker_params, 0, sizeof(worker_params));
367397
worker_params.field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE;

0 commit comments

Comments
 (0)