Skip to content

Commit b85f751

Browse files
Dust Li authored and gregkh committed
net/smc: fix kernel panic caused by race of smc_sock
[ Upstream commit 349d431 ] A crash occurs when smc_cdc_tx_handler() tries to access smc_sock but smc_release() has already freed it. [ 4570.695099] BUG: unable to handle page fault for address: 000000002eae9e88 [ 4570.696048] #PF: supervisor write access in kernel mode [ 4570.696728] #PF: error_code(0x0002) - not-present page [ 4570.697401] PGD 0 P4D 0 [ 4570.697716] Oops: 0002 [#1] PREEMPT SMP NOPTI [ 4570.698228] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.16.0-rc4+ #111 [ 4570.699013] Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS 8c24b4c 04/0 [ 4570.699933] RIP: 0010:_raw_spin_lock+0x1a/0x30 <...> [ 4570.711446] Call Trace: [ 4570.711746] <IRQ> [ 4570.711992] smc_cdc_tx_handler+0x41/0xc0 [ 4570.712470] smc_wr_tx_tasklet_fn+0x213/0x560 [ 4570.712981] ? smc_cdc_tx_dismisser+0x10/0x10 [ 4570.713489] tasklet_action_common.isra.17+0x66/0x140 [ 4570.714083] __do_softirq+0x123/0x2f4 [ 4570.714521] irq_exit_rcu+0xc4/0xf0 [ 4570.714934] common_interrupt+0xba/0xe0 Though smc_cdc_tx_handler() checked the existence of smc connection, smc_release() may have already dismissed and released the smc socket before smc_cdc_tx_handler() further visits it. smc_cdc_tx_handler() |smc_release() if (!conn) | | |smc_cdc_tx_dismiss_slots() | smc_cdc_tx_dismisser() | |sock_put(&smc->sk) <- last sock_put, | smc_sock freed bh_lock_sock(&smc->sk) (panic) | To make sure we won't receive any CDC messages after we free the smc_sock, add a refcount on the smc_connection for inflight CDC messages (posted to the QP but haven't received related CQE), and don't release the smc_connection until all the inflight CDC messages have been done, for both success or failed ones. Using refcount on CDC messages brings another problem: when the link is going to be destroyed, smcr_link_clear() will reset the QP, which then removes all the pending CQEs related to the QP in the CQ.
To make sure all the CQEs will always come back so the refcount on the smc_connection can always reach 0, smc_ib_modify_qp_reset() was replaced by smc_ib_modify_qp_error(). And remove the timeout in smc_wr_tx_wait_no_pending_sends() since we need to wait for all pending WQEs done, or we may encounter use-after-free when handling CQEs. For the IB device removal routine, we need to wait for all the QPs on that device to be destroyed before we can destroy CQs on the device, or the refcount on smc_connection won't reach 0 and smc_sock cannot be released. Fixes: 5f08318 ("smc: connection data control (CDC)") Reported-by: Wen Gu <guwen@linux.alibaba.com> Signed-off-by: Dust Li <dust.li@linux.alibaba.com> Signed-off-by: David S. Miller <davem@davemloft.net> Signed-off-by: Sasha Levin <sashal@kernel.org>
1 parent 85ce259 commit b85f751

File tree

8 files changed

+57
-76
lines changed

8 files changed

+57
-76
lines changed

net/smc/smc.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,11 @@ struct smc_connection {
170170
u16 tx_cdc_seq; /* sequence # for CDC send */
171171
u16 tx_cdc_seq_fin; /* sequence # - tx completed */
172172
spinlock_t send_lock; /* protect wr_sends */
173+
atomic_t cdc_pend_tx_wr; /* number of pending tx CDC wqe
174+
* - inc when post wqe,
175+
* - dec on polled tx cqe
176+
*/
177+
wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/
173178
struct delayed_work tx_work; /* retry of smc_cdc_msg_send */
174179
u32 tx_off; /* base offset in peer rmb */
175180

net/smc/smc_cdc.c

Lines changed: 24 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,6 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
3131
struct smc_sock *smc;
3232
int diff;
3333

34-
if (!conn)
35-
/* already dismissed */
36-
return;
37-
3834
smc = container_of(conn, struct smc_sock, conn);
3935
bh_lock_sock(&smc->sk);
4036
if (!wc_status) {
@@ -51,6 +47,12 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
5147
conn);
5248
conn->tx_cdc_seq_fin = cdcpend->ctrl_seq;
5349
}
50+
51+
if (atomic_dec_and_test(&conn->cdc_pend_tx_wr) &&
52+
unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq)))
53+
wake_up(&conn->cdc_pend_tx_wq);
54+
WARN_ON(atomic_read(&conn->cdc_pend_tx_wr) < 0);
55+
5456
smc_tx_sndbuf_nonfull(smc);
5557
bh_unlock_sock(&smc->sk);
5658
}
@@ -107,13 +109,18 @@ int smc_cdc_msg_send(struct smc_connection *conn,
107109
conn->tx_cdc_seq++;
108110
conn->local_tx_ctrl.seqno = conn->tx_cdc_seq;
109111
smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, conn, &cfed);
112+
113+
atomic_inc(&conn->cdc_pend_tx_wr);
114+
smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */
115+
110116
rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend);
111117
if (!rc) {
112118
smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn);
113119
conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0;
114120
} else {
115121
conn->tx_cdc_seq--;
116122
conn->local_tx_ctrl.seqno = conn->tx_cdc_seq;
123+
atomic_dec(&conn->cdc_pend_tx_wr);
117124
}
118125

119126
return rc;
@@ -136,7 +143,18 @@ int smcr_cdc_msg_send_validation(struct smc_connection *conn,
136143
peer->token = htonl(local->token);
137144
peer->prod_flags.failover_validation = 1;
138145

146+
/* We need to set pend->conn here to make sure smc_cdc_tx_handler()
147+
* can handle properly
148+
*/
149+
smc_cdc_add_pending_send(conn, pend);
150+
151+
atomic_inc(&conn->cdc_pend_tx_wr);
152+
smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */
153+
139154
rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend);
155+
if (unlikely(rc))
156+
atomic_dec(&conn->cdc_pend_tx_wr);
157+
140158
return rc;
141159
}
142160

@@ -193,31 +211,9 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
193211
return rc;
194212
}
195213

196-
static bool smc_cdc_tx_filter(struct smc_wr_tx_pend_priv *tx_pend,
197-
unsigned long data)
214+
void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn)
198215
{
199-
struct smc_connection *conn = (struct smc_connection *)data;
200-
struct smc_cdc_tx_pend *cdc_pend =
201-
(struct smc_cdc_tx_pend *)tx_pend;
202-
203-
return cdc_pend->conn == conn;
204-
}
205-
206-
static void smc_cdc_tx_dismisser(struct smc_wr_tx_pend_priv *tx_pend)
207-
{
208-
struct smc_cdc_tx_pend *cdc_pend =
209-
(struct smc_cdc_tx_pend *)tx_pend;
210-
211-
cdc_pend->conn = NULL;
212-
}
213-
214-
void smc_cdc_tx_dismiss_slots(struct smc_connection *conn)
215-
{
216-
struct smc_link *link = conn->lnk;
217-
218-
smc_wr_tx_dismiss_slots(link, SMC_CDC_MSG_TYPE,
219-
smc_cdc_tx_filter, smc_cdc_tx_dismisser,
220-
(unsigned long)conn);
216+
wait_event(conn->cdc_pend_tx_wq, !atomic_read(&conn->cdc_pend_tx_wr));
221217
}
222218

223219
/* Send a SMC-D CDC header.

net/smc/smc_cdc.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,7 @@ int smc_cdc_get_free_slot(struct smc_connection *conn,
291291
struct smc_wr_buf **wr_buf,
292292
struct smc_rdma_wr **wr_rdma_buf,
293293
struct smc_cdc_tx_pend **pend);
294-
void smc_cdc_tx_dismiss_slots(struct smc_connection *conn);
294+
void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn);
295295
int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf,
296296
struct smc_cdc_tx_pend *pend);
297297
int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn);

net/smc/smc_core.c

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1056,7 +1056,7 @@ void smc_conn_free(struct smc_connection *conn)
10561056
smc_ism_unset_conn(conn);
10571057
tasklet_kill(&conn->rx_tsklet);
10581058
} else {
1059-
smc_cdc_tx_dismiss_slots(conn);
1059+
smc_cdc_wait_pend_tx_wr(conn);
10601060
if (current_work() != &conn->abort_work)
10611061
cancel_work_sync(&conn->abort_work);
10621062
}
@@ -1133,7 +1133,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
11331133
smc_llc_link_clear(lnk, log);
11341134
smcr_buf_unmap_lgr(lnk);
11351135
smcr_rtoken_clear_link(lnk);
1136-
smc_ib_modify_qp_reset(lnk);
1136+
smc_ib_modify_qp_error(lnk);
11371137
smc_wr_free_link(lnk);
11381138
smc_ib_destroy_queue_pair(lnk);
11391139
smc_ib_dealloc_protection_domain(lnk);
@@ -1264,7 +1264,7 @@ static void smc_conn_kill(struct smc_connection *conn, bool soft)
12641264
else
12651265
tasklet_unlock_wait(&conn->rx_tsklet);
12661266
} else {
1267-
smc_cdc_tx_dismiss_slots(conn);
1267+
smc_cdc_wait_pend_tx_wr(conn);
12681268
}
12691269
smc_lgr_unregister_conn(conn);
12701270
smc_close_active_abort(smc);
@@ -1387,11 +1387,16 @@ void smc_smcd_terminate_all(struct smcd_dev *smcd)
13871387
/* Called when an SMCR device is removed or the smc module is unloaded.
13881388
* If smcibdev is given, all SMCR link groups using this device are terminated.
13891389
* If smcibdev is NULL, all SMCR link groups are terminated.
1390+
*
1391+
* We must wait here for QPs been destroyed before we destroy the CQs,
1392+
* or we won't received any CQEs and cdc_pend_tx_wr cannot reach 0 thus
1393+
* smc_sock cannot be released.
13901394
*/
13911395
void smc_smcr_terminate_all(struct smc_ib_device *smcibdev)
13921396
{
13931397
struct smc_link_group *lgr, *lg;
13941398
LIST_HEAD(lgr_free_list);
1399+
LIST_HEAD(lgr_linkdown_list);
13951400
int i;
13961401

13971402
spin_lock_bh(&smc_lgr_list.lock);
@@ -1403,7 +1408,7 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev)
14031408
list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) {
14041409
for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
14051410
if (lgr->lnk[i].smcibdev == smcibdev)
1406-
smcr_link_down_cond_sched(&lgr->lnk[i]);
1411+
list_move_tail(&lgr->list, &lgr_linkdown_list);
14071412
}
14081413
}
14091414
}
@@ -1415,6 +1420,16 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev)
14151420
__smc_lgr_terminate(lgr, false);
14161421
}
14171422

1423+
list_for_each_entry_safe(lgr, lg, &lgr_linkdown_list, list) {
1424+
for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
1425+
if (lgr->lnk[i].smcibdev == smcibdev) {
1426+
mutex_lock(&lgr->llc_conf_mutex);
1427+
smcr_link_down_cond(&lgr->lnk[i]);
1428+
mutex_unlock(&lgr->llc_conf_mutex);
1429+
}
1430+
}
1431+
}
1432+
14181433
if (smcibdev) {
14191434
if (atomic_read(&smcibdev->lnk_cnt))
14201435
wait_event(smcibdev->lnks_deleted,
@@ -1514,7 +1529,6 @@ static void smcr_link_down(struct smc_link *lnk)
15141529
if (!lgr || lnk->state == SMC_LNK_UNUSED || list_empty(&lgr->list))
15151530
return;
15161531

1517-
smc_ib_modify_qp_reset(lnk);
15181532
to_lnk = smc_switch_conns(lgr, lnk, true);
15191533
if (!to_lnk) { /* no backup link available */
15201534
smcr_link_clear(lnk, true);
@@ -1742,6 +1756,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
17421756
conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
17431757
conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
17441758
conn->urg_state = SMC_URG_READ;
1759+
init_waitqueue_head(&conn->cdc_pend_tx_wq);
17451760
INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work);
17461761
if (ini->is_smcd) {
17471762
conn->rx_off = sizeof(struct smcd_cdc_msg);

net/smc/smc_ib.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,12 +101,12 @@ int smc_ib_modify_qp_rts(struct smc_link *lnk)
101101
IB_QP_MAX_QP_RD_ATOMIC);
102102
}
103103

104-
int smc_ib_modify_qp_reset(struct smc_link *lnk)
104+
int smc_ib_modify_qp_error(struct smc_link *lnk)
105105
{
106106
struct ib_qp_attr qp_attr;
107107

108108
memset(&qp_attr, 0, sizeof(qp_attr));
109-
qp_attr.qp_state = IB_QPS_RESET;
109+
qp_attr.qp_state = IB_QPS_ERR;
110110
return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
111111
}
112112

net/smc/smc_ib.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ int smc_ib_create_queue_pair(struct smc_link *lnk);
7979
int smc_ib_ready_link(struct smc_link *lnk);
8080
int smc_ib_modify_qp_rts(struct smc_link *lnk);
8181
int smc_ib_modify_qp_reset(struct smc_link *lnk);
82+
int smc_ib_modify_qp_error(struct smc_link *lnk);
8283
long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev);
8384
int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
8485
struct smc_buf_desc *buf_slot, u8 link_idx);

net/smc/smc_wr.c

Lines changed: 3 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,9 @@ static inline bool smc_wr_is_tx_pend(struct smc_link *link)
6262
}
6363

6464
/* wait till all pending tx work requests on the given link are completed */
65-
int smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
65+
void smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
6666
{
67-
if (wait_event_timeout(link->wr_tx_wait, !smc_wr_is_tx_pend(link),
68-
SMC_WR_TX_WAIT_PENDING_TIME))
69-
return 0;
70-
else /* timeout */
71-
return -EPIPE;
67+
wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link));
7268
}
7369

7470
static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
@@ -87,7 +83,6 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
8783
struct smc_wr_tx_pend pnd_snd;
8884
struct smc_link *link;
8985
u32 pnd_snd_idx;
90-
int i;
9186

9287
link = wc->qp->qp_context;
9388

@@ -115,14 +110,6 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
115110
if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
116111
return;
117112
if (wc->status) {
118-
for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
119-
/* clear full struct smc_wr_tx_pend including .priv */
120-
memset(&link->wr_tx_pends[i], 0,
121-
sizeof(link->wr_tx_pends[i]));
122-
memset(&link->wr_tx_bufs[i], 0,
123-
sizeof(link->wr_tx_bufs[i]));
124-
clear_bit(i, link->wr_tx_mask);
125-
}
126113
/* terminate link */
127114
smcr_link_down_cond_sched(link);
128115
}
@@ -351,25 +338,6 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
351338
return rc;
352339
}
353340

354-
void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_tx_hdr_type,
355-
smc_wr_tx_filter filter,
356-
smc_wr_tx_dismisser dismisser,
357-
unsigned long data)
358-
{
359-
struct smc_wr_tx_pend_priv *tx_pend;
360-
struct smc_wr_rx_hdr *wr_tx;
361-
int i;
362-
363-
for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
364-
wr_tx = (struct smc_wr_rx_hdr *)&link->wr_tx_bufs[i];
365-
if (wr_tx->type != wr_tx_hdr_type)
366-
continue;
367-
tx_pend = &link->wr_tx_pends[i].priv;
368-
if (filter(tx_pend, data))
369-
dismisser(tx_pend);
370-
}
371-
}
372-
373341
/****************************** receive queue ********************************/
374342

375343
int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
@@ -574,10 +542,7 @@ void smc_wr_free_link(struct smc_link *lnk)
574542
smc_wr_wakeup_reg_wait(lnk);
575543
smc_wr_wakeup_tx_wait(lnk);
576544

577-
if (smc_wr_tx_wait_no_pending_sends(lnk))
578-
memset(lnk->wr_tx_mask, 0,
579-
BITS_TO_LONGS(SMC_WR_BUF_CNT) *
580-
sizeof(*lnk->wr_tx_mask));
545+
smc_wr_tx_wait_no_pending_sends(lnk);
581546
wait_event(lnk->wr_reg_wait, (!atomic_read(&lnk->wr_reg_refcnt)));
582547
wait_event(lnk->wr_tx_wait, (!atomic_read(&lnk->wr_tx_refcnt)));
583548

net/smc/smc_wr.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */
2323

2424
#define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ)
25-
#define SMC_WR_TX_WAIT_PENDING_TIME (5 * HZ)
2625

2726
#define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */
2827

@@ -122,7 +121,7 @@ void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type,
122121
smc_wr_tx_filter filter,
123122
smc_wr_tx_dismisser dismisser,
124123
unsigned long data);
125-
int smc_wr_tx_wait_no_pending_sends(struct smc_link *link);
124+
void smc_wr_tx_wait_no_pending_sends(struct smc_link *link);
126125

127126
int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler);
128127
int smc_wr_rx_post_init(struct smc_link *link);

0 commit comments

Comments
 (0)