Skip to content

Commit 621e55f

Browse files
jgunthorpedledford
authored andcommitted
RDMA/devices: Do not deadlock during client removal
lockdep reports: WARNING: possible circular locking dependency detected modprobe/302 is trying to acquire lock: 0000000007c8919c ((wq_completion)ib_cm){+.+.}, at: flush_workqueue+0xdf/0x990 but task is already holding lock: 000000002d3d2ca9 (&device->client_data_rwsem){++++}, at: remove_client_context+0x79/0xd0 [ib_core] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (&device->client_data_rwsem){++++}: down_read+0x3f/0x160 ib_get_net_dev_by_params+0xd5/0x200 [ib_core] cma_ib_req_handler+0x5f6/0x2090 [rdma_cm] cm_process_work+0x29/0x110 [ib_cm] cm_req_handler+0x10f5/0x1c00 [ib_cm] cm_work_handler+0x54c/0x311d [ib_cm] process_one_work+0x4aa/0xa30 worker_thread+0x62/0x5b0 kthread+0x1ca/0x1f0 ret_from_fork+0x24/0x30 -> #1 ((work_completion)(&(&work->work)->work)){+.+.}: process_one_work+0x45f/0xa30 worker_thread+0x62/0x5b0 kthread+0x1ca/0x1f0 ret_from_fork+0x24/0x30 -> #0 ((wq_completion)ib_cm){+.+.}: lock_acquire+0xc8/0x1d0 flush_workqueue+0x102/0x990 cm_remove_one+0x30e/0x3c0 [ib_cm] remove_client_context+0x94/0xd0 [ib_core] disable_device+0x10a/0x1f0 [ib_core] __ib_unregister_device+0x5a/0xe0 [ib_core] ib_unregister_device+0x21/0x30 [ib_core] mlx5_ib_stage_ib_reg_cleanup+0x9/0x10 [mlx5_ib] __mlx5_ib_remove+0x3d/0x70 [mlx5_ib] mlx5_ib_remove+0x12e/0x140 [mlx5_ib] mlx5_remove_device+0x144/0x150 [mlx5_core] mlx5_unregister_interface+0x3f/0xf0 [mlx5_core] mlx5_ib_cleanup+0x10/0x3a [mlx5_ib] __x64_sys_delete_module+0x227/0x350 do_syscall_64+0xc3/0x6a4 entry_SYSCALL_64_after_hwframe+0x49/0xbe Which is due to the read side of the client_data_rwsem being obtained recursively through a work queue flush during cm client removal. The lock is being held across the remove in remove_client_context() so that the function is a fence, once it returns the client is removed. This is required so that the two callers do not proceed with destruction until the client completes removal. Instead of using client_data_rwsem use the existing device unregistration refcount and add a similar client unregistration (client->uses) refcount. This will fence the two unregistration paths without holding any locks. Cc: <stable@vger.kernel.org> Fixes: 921eab1 ("RDMA/devices: Re-organize device.c locking") Signed-off-by: Jason Gunthorpe <jgg@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Link: https://lore.kernel.org/r/20190731081841.32345-2-leon@kernel.org Signed-off-by: Doug Ledford <dledford@redhat.com>
1 parent 61f2598 commit 621e55f

File tree

2 files changed

+44
-13
lines changed

2 files changed

+44
-13
lines changed

drivers/infiniband/core/device.c

Lines changed: 41 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,12 @@ static LIST_HEAD(client_list);
9999
static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
100100
static DECLARE_RWSEM(clients_rwsem);
101101

102+
static void ib_client_put(struct ib_client *client)
103+
{
104+
if (refcount_dec_and_test(&client->uses))
105+
complete(&client->uses_zero);
106+
}
107+
102108
/*
103109
* If client_data is registered then the corresponding client must also still
104110
* be registered.
@@ -660,6 +666,14 @@ static int add_client_context(struct ib_device *device,
660666
return 0;
661667

662668
down_write(&device->client_data_rwsem);
669+
/*
670+
* So long as the client is registered hold both the client and device
671+
* unregistration locks.
672+
*/
673+
if (!refcount_inc_not_zero(&client->uses))
674+
goto out_unlock;
675+
refcount_inc(&device->refcount);
676+
663677
/*
664678
* Another caller to add_client_context got here first and has already
665679
* completely initialized context.
@@ -683,6 +697,9 @@ static int add_client_context(struct ib_device *device,
683697
return 0;
684698

685699
out:
700+
ib_device_put(device);
701+
ib_client_put(client);
702+
out_unlock:
686703
up_write(&device->client_data_rwsem);
687704
return ret;
688705
}
@@ -702,7 +719,7 @@ static void remove_client_context(struct ib_device *device,
702719
client_data = xa_load(&device->client_data, client_id);
703720
xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED);
704721
client = xa_load(&clients, client_id);
705-
downgrade_write(&device->client_data_rwsem);
722+
up_write(&device->client_data_rwsem);
706723

707724
/*
708725
* Notice we cannot be holding any exclusive locks when calling the
@@ -712,17 +729,13 @@ static void remove_client_context(struct ib_device *device,
712729
*
713730
* For this reason clients and drivers should not call the
714731
* unregistration functions will holdling any locks.
715-
*
716-
* It tempting to drop the client_data_rwsem too, but this is required
717-
* to ensure that unregister_client does not return until all clients
718-
* are completely unregistered, which is required to avoid module
719-
* unloading races.
720732
*/
721733
if (client->remove)
722734
client->remove(device, client_data);
723735

724736
xa_erase(&device->client_data, client_id);
725-
up_read(&device->client_data_rwsem);
737+
ib_device_put(device);
738+
ib_client_put(client);
726739
}
727740

728741
static int alloc_port_data(struct ib_device *device)
@@ -1705,6 +1718,8 @@ int ib_register_client(struct ib_client *client)
17051718
unsigned long index;
17061719
int ret;
17071720

1721+
refcount_set(&client->uses, 1);
1722+
init_completion(&client->uses_zero);
17081723
ret = assign_client_id(client);
17091724
if (ret)
17101725
return ret;
@@ -1740,16 +1755,29 @@ void ib_unregister_client(struct ib_client *client)
17401755
unsigned long index;
17411756

17421757
down_write(&clients_rwsem);
1758+
ib_client_put(client);
17431759
xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED);
17441760
up_write(&clients_rwsem);
1761+
1762+
/* We do not want to have locks while calling client->remove() */
1763+
rcu_read_lock();
1764+
xa_for_each (&devices, index, device) {
1765+
if (!ib_device_try_get(device))
1766+
continue;
1767+
rcu_read_unlock();
1768+
1769+
remove_client_context(device, client->client_id);
1770+
1771+
ib_device_put(device);
1772+
rcu_read_lock();
1773+
}
1774+
rcu_read_unlock();
1775+
17451776
/*
1746-
* Every device still known must be serialized to make sure we are
1747-
* done with the client callbacks before we return.
1777+
* remove_client_context() is not a fence, it can return even though a
1778+
* removal is ongoing. Wait until all removals are completed.
17481779
*/
1749-
down_read(&devices_rwsem);
1750-
xa_for_each (&devices, index, device)
1751-
remove_client_context(device, client->client_id);
1752-
up_read(&devices_rwsem);
1780+
wait_for_completion(&client->uses_zero);
17531781

17541782
down_write(&clients_rwsem);
17551783
list_del(&client->list);

include/rdma/ib_verbs.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2647,6 +2647,9 @@ struct ib_client {
26472647
const union ib_gid *gid,
26482648
const struct sockaddr *addr,
26492649
void *client_data);
2650+
2651+
refcount_t uses;
2652+
struct completion uses_zero;
26502653
struct list_head list;
26512654
u32 client_id;
26522655

0 commit comments

Comments
 (0)