Skip to content

btl/openib: do not initialize devices that have no allowed ports #6184

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions opal/mca/btl/openib/btl_openib.c
Original file line number Diff line number Diff line change
Expand Up @@ -1047,7 +1047,8 @@ int mca_btl_openib_add_procs(
opal_bitmap_clear_all_bits(reachable);
opal_show_help("help-mpi-btl-openib.txt", "ib port not selected",
true, opal_process_info.nodename,
ibv_get_device_name(openib_btl->device->ib_dev), openib_btl->port_num);
openib_btl->device_name, openib_btl->port_num);
return OPAL_SUCCESS;
}

btl_rank = get_openib_btl_params(openib_btl, &lcl_subnet_id_port_cnt);
Expand Down Expand Up @@ -1719,11 +1720,11 @@ static int mca_btl_openib_finalize_resources(struct mca_btl_base_module_t* btl)
free(openib_btl->cpcs[i]);
}
free(openib_btl->cpcs);
}

/* Release device if there are no more users */
if(!(--openib_btl->device->btls)) {
OBJ_RELEASE(openib_btl->device);
/* Release device if there are no more users */
if(!(--openib_btl->device->allowed_btls)) {
OBJ_RELEASE(openib_btl->device);
}
}

if (NULL != openib_btl->qps) {
Expand Down
2 changes: 2 additions & 0 deletions opal/mca/btl/openib/btl_openib.h
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,7 @@ typedef struct mca_btl_openib_device_t {
/* Whether this device supports eager RDMA */
uint8_t use_eager_rdma;
uint8_t btls; /** < number of btls using this device */
uint8_t allowed_btls; /** < number of allowed btls using this device */
opal_pointer_array_t *endpoints;
opal_pointer_array_t *device_btls;
uint16_t hp_cq_polls;
Expand Down Expand Up @@ -483,6 +484,7 @@ struct mca_btl_openib_module_t {
uint8_t num_cpcs;

mca_btl_openib_device_t *device;
char * device_name;
uint8_t port_num; /**< ID of the PORT */
uint16_t pkey_index;
struct ibv_port_attr ib_port_attr;
Expand Down
14 changes: 11 additions & 3 deletions opal/mca/btl/openib/btl_openib_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -650,9 +650,10 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
sizeof(mca_btl_openib_module));
ib_selected = OBJ_NEW(mca_btl_base_selected_module_t);
ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl;
openib_btl->device = device;
openib_btl->port_num = (uint8_t) port_num;
openib_btl->allowed = false;
openib_btl->device = NULL;
openib_btl->device_name = strdup(ibv_get_device_name(device->ib_dev));
OBJ_CONSTRUCT(&openib_btl->ib_lock, opal_mutex_t);
opal_list_append(btl_list, (opal_list_item_t*) ib_selected);
opal_pointer_array_add(device->device_btls, (void*) openib_btl);
Expand Down Expand Up @@ -786,6 +787,7 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
ib_selected = OBJ_NEW(mca_btl_base_selected_module_t);
ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl;
openib_btl->device = device;
openib_btl->device_name = NULL;
openib_btl->port_num = (uint8_t) port_num;
openib_btl->pkey_index = pkey_index;
openib_btl->lid = lid;
Expand Down Expand Up @@ -906,6 +908,7 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
opal_list_append(btl_list, (opal_list_item_t*) ib_selected);
opal_pointer_array_add(device->device_btls, (void*) openib_btl);
++device->btls;
++device->allowed_btls;
++mca_btl_openib_component.ib_num_btls;
++mca_btl_openib_component.ib_allowed_btls;
if (-1 != mca_btl_openib_component.ib_max_btls &&
Expand Down Expand Up @@ -1935,7 +1938,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
if (ib_port_attr.active_mtu < device->mtu){
device->mtu = ib_port_attr.active_mtu;
}
if (mca_btl_openib_component.apm_ports && device->btls > 0) {
if (mca_btl_openib_component.apm_ports && device->allowed_btls > 0) {
init_apm_port(device, i, ib_port_attr.lid);
break;
}
Expand Down Expand Up @@ -1971,7 +1974,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)

/* If we made a BTL, check APM status and return. Otherwise, fall
through and destroy everything */
if (device->btls > 0) {
if (device->allowed_btls > 0) {
/* if apm was enabled it should be > 1 */
if (1 == mca_btl_openib_component.apm_ports) {
opal_show_help("help-mpi-btl-openib.txt",
Expand Down Expand Up @@ -2292,6 +2295,11 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
good:
mca_btl_openib_component.devices_count++;
return OPAL_SUCCESS;
} else if (device->btls > 0) {
/* no port is allowed to be used by btl/openib,
* so release the device right away */
OBJ_RELEASE(device);
return OPAL_SUCCESS;
}

error:
Expand Down
1 change: 0 additions & 1 deletion opal/mca/btl/openib/btl_openib_proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,6 @@ mca_btl_openib_proc_t* mca_btl_openib_proc_get_locked(opal_proc_t* proc)

if (0 == ib_proc->proc_port_count) {
ib_proc->proc_endpoints = NULL;
goto no_err_exit;
} else {
ib_proc->proc_endpoints = (volatile mca_btl_base_endpoint_t**)
malloc(ib_proc->proc_port_count *
Expand Down