Skip to content

Commit bbcab84

Browse files
authored
Merge pull request #10448 from janjust/v4.1.x-osc-ucx-priority-fix
v4.1.x: osc/ucx fix osc ucx component priority selection
2 parents 0e4fd94 + 5ff040f commit bbcab84

File tree

1 file changed

+72
-33
lines changed

1 file changed

+72
-33
lines changed

ompi/mca/osc/ucx/osc_ucx_component.c

Lines changed: 72 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -142,11 +142,78 @@ static int progress_callback(void) {
142142
return 0;
143143
}
144144

145+
static int ucp_context_init(void) {
146+
int ret = OMPI_SUCCESS;
147+
ucs_status_t status;
148+
ucp_config_t *config = NULL;
149+
ucp_params_t context_params;
150+
151+
status = ucp_config_read("MPI", NULL, &config);
152+
if (UCS_OK != status) {
153+
OSC_UCX_VERBOSE(1, "ucp_config_read failed: %d", status);
154+
return OMPI_ERROR;
155+
}
156+
157+
/* initialize UCP context */
158+
memset(&context_params, 0, sizeof(context_params));
159+
context_params.field_mask = UCP_PARAM_FIELD_FEATURES |
160+
UCP_PARAM_FIELD_MT_WORKERS_SHARED |
161+
UCP_PARAM_FIELD_ESTIMATED_NUM_EPS |
162+
UCP_PARAM_FIELD_REQUEST_INIT |
163+
UCP_PARAM_FIELD_REQUEST_SIZE;
164+
context_params.features = UCP_FEATURE_RMA | UCP_FEATURE_AMO32 | UCP_FEATURE_AMO64;
165+
context_params.mt_workers_shared = 0;
166+
context_params.estimated_num_eps = ompi_proc_world_size();
167+
context_params.request_init = internal_req_init;
168+
context_params.request_size = sizeof(ompi_osc_ucx_internal_request_t);
169+
170+
status = ucp_init(&context_params, config, &mca_osc_ucx_component.ucp_context);
171+
ucp_config_release(config);
172+
if (UCS_OK != status) {
173+
OSC_UCX_VERBOSE(1, "ucp_init failed: %d", status);
174+
ret = OMPI_ERROR;
175+
}
176+
177+
return ret;
178+
}
145179
static int component_init(bool enable_progress_threads, bool enable_mpi_threads) {
180+
opal_common_ucx_support_level_t support_level = OPAL_COMMON_UCX_SUPPORT_NONE;
181+
mca_base_var_source_t param_source = MCA_BASE_VAR_SOURCE_DEFAULT;
182+
int ret = OMPI_SUCCESS,
183+
param = -1;
184+
146185
mca_osc_ucx_component.enable_mpi_threads = enable_mpi_threads;
147186

148187
opal_common_ucx_mca_register();
149-
return OMPI_SUCCESS;
188+
189+
ret = ucp_context_init();
190+
if (OMPI_ERROR == ret) {
191+
return OMPI_ERR_NOT_AVAILABLE;
192+
}
193+
194+
support_level = opal_common_ucx_support_level(mca_osc_ucx_component.ucp_context);
195+
if (OPAL_COMMON_UCX_SUPPORT_NONE == support_level) {
196+
ucp_cleanup(mca_osc_ucx_component.ucp_context);
197+
mca_osc_ucx_component.ucp_context = NULL;
198+
return OMPI_ERR_NOT_AVAILABLE;
199+
}
200+
201+
param = mca_base_var_find("ompi","osc","ucx","priority");
202+
if (0 <= param) {
203+
(void) mca_base_var_get_value(param, NULL, &param_source, NULL);
204+
}
205+
206+
/*
207+
* Retain priority if we have supported devices and transports.
208+
* Lower priority if we have supported transports, but not supported devices.
209+
*/
210+
if (MCA_BASE_VAR_SOURCE_DEFAULT == param_source) {
211+
mca_osc_ucx_component.priority = (support_level == OPAL_COMMON_UCX_SUPPORT_DEVICE) ?
212+
mca_osc_ucx_component.priority : 9;
213+
OSC_UCX_VERBOSE(2, "returning priority %d", mca_osc_ucx_component.priority);
214+
}
215+
216+
return ret;
150217
}
151218

152219
static int component_finalize(void) {
@@ -165,7 +232,10 @@ static int component_finalize(void) {
165232
assert(mca_osc_ucx_component.num_incomplete_req_ops == 0);
166233
if (mca_osc_ucx_component.env_initialized == true) {
167234
OBJ_DESTRUCT(&mca_osc_ucx_component.requests);
168-
ucp_cleanup(mca_osc_ucx_component.ucp_context);
235+
if (NULL != mca_osc_ucx_component.ucp_context) {
236+
ucp_cleanup(mca_osc_ucx_component.ucp_context);
237+
mca_osc_ucx_component.ucp_context = NULL;
238+
}
169239
mca_osc_ucx_component.env_initialized = false;
170240
}
171241
opal_common_ucx_mca_deregister();
@@ -317,18 +387,9 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in
317387
_osc_ucx_init_lock();
318388

319389
if (mca_osc_ucx_component.env_initialized == false) {
320-
ucp_config_t *config = NULL;
321-
ucp_params_t context_params;
322390
ucp_worker_params_t worker_params;
323391
ucp_worker_attr_t worker_attr;
324392

325-
status = ucp_config_read("MPI", NULL, &config);
326-
if (UCS_OK != status) {
327-
OSC_UCX_VERBOSE(1, "ucp_config_read failed: %d", status);
328-
ret = OMPI_ERROR;
329-
goto select_unlock;
330-
}
331-
332393
OBJ_CONSTRUCT(&mca_osc_ucx_component.requests, opal_free_list_t);
333394
ret = opal_free_list_init (&mca_osc_ucx_component.requests,
334395
sizeof(ompi_osc_ucx_request_t),
@@ -340,28 +401,6 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in
340401
goto select_unlock;
341402
}
342403

343-
/* initialize UCP context */
344-
345-
memset(&context_params, 0, sizeof(context_params));
346-
context_params.field_mask = UCP_PARAM_FIELD_FEATURES |
347-
UCP_PARAM_FIELD_MT_WORKERS_SHARED |
348-
UCP_PARAM_FIELD_ESTIMATED_NUM_EPS |
349-
UCP_PARAM_FIELD_REQUEST_INIT |
350-
UCP_PARAM_FIELD_REQUEST_SIZE;
351-
context_params.features = UCP_FEATURE_RMA | UCP_FEATURE_AMO32 | UCP_FEATURE_AMO64;
352-
context_params.mt_workers_shared = 0;
353-
context_params.estimated_num_eps = ompi_proc_world_size();
354-
context_params.request_init = internal_req_init;
355-
context_params.request_size = sizeof(ompi_osc_ucx_internal_request_t);
356-
357-
status = ucp_init(&context_params, config, &mca_osc_ucx_component.ucp_context);
358-
ucp_config_release(config);
359-
if (UCS_OK != status) {
360-
OSC_UCX_VERBOSE(1, "ucp_init failed: %d", status);
361-
ret = OMPI_ERROR;
362-
goto select_unlock;
363-
}
364-
365404
assert(mca_osc_ucx_component.ucp_worker == NULL);
366405
memset(&worker_params, 0, sizeof(worker_params));
367406
worker_params.field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE;

0 commit comments

Comments
 (0)