Skip to content

Commit 05077e2

Browse files
committed
Be more careful with locking around db.db_data
Lock db_mtx in some places that access db->db_data. But don't lock it in free_children, even though it does access db->db_data, because that leads to a recurse-on-non-recursive panic. Lock db_rwlock in some places that access db->db.db_data's contents. Closes #16626 Sponsored by: ConnectWise Signed-off-by: Alan Somers <asomers@gmail.com>
1 parent 68817d2 commit 05077e2

File tree

4 files changed

+112
-16
lines changed

4 files changed

+112
-16
lines changed

module/zfs/dbuf.c

Lines changed: 58 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1200,7 +1200,8 @@ dbuf_verify(dmu_buf_impl_t *db)
12001200
if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
12011201
(db->db_buf == NULL || db->db_buf->b_data) &&
12021202
db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
1203-
db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg)) {
1203+
db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg) &&
1204+
RW_LOCK_HELD(&db->db_rwlock)) {
12041205
/*
12051206
* If the blkptr isn't set but they have nonzero data,
12061207
* it had better be dirty, otherwise we'll lose that
@@ -1704,7 +1705,9 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
17041705
int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
17051706
dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP);
17061707
arc_space_consume(bonuslen, ARC_SPACE_BONUS);
1708+
rw_enter(&db->db_rwlock, RW_READER);
17071709
memcpy(dr->dt.dl.dr_data, db->db.db_data, bonuslen);
1710+
rw_exit(&db->db_rwlock);
17081711
} else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {
17091712
dnode_t *dn = DB_DNODE(db);
17101713
int size = arc_buf_size(db->db_buf);
@@ -1734,7 +1737,9 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
17341737
} else {
17351738
dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
17361739
}
1740+
rw_enter(&db->db_rwlock, RW_READER);
17371741
memcpy(dr->dt.dl.dr_data->b_data, db->db.db_data, size);
1742+
rw_exit(&db->db_rwlock);
17381743
} else {
17391744
db->db_buf = NULL;
17401745
dbuf_clear_data(db);
@@ -3006,7 +3011,9 @@ dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)
30063011
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
30073012
/* we were freed while filling */
30083013
/* XXX dbuf_undirty? */
3014+
rw_enter(&db->db_rwlock, RW_WRITER);
30093015
memset(db->db.db_data, 0, db->db.db_size);
3016+
rw_exit(&db->db_rwlock);
30103017
db->db_freed_in_flight = FALSE;
30113018
db->db_state = DB_CACHED;
30123019
DTRACE_SET_STATE(db,
@@ -3381,12 +3388,14 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
33813388
*parentp = NULL;
33823389
return (err);
33833390
}
3391+
mutex_enter(&(*parentp)->db_mtx);
33843392
rw_enter(&(*parentp)->db_rwlock, RW_READER);
33853393
*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
33863394
(blkid & ((1ULL << epbs) - 1));
33873395
if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))
33883396
ASSERT(BP_IS_HOLE(*bpp));
33893397
rw_exit(&(*parentp)->db_rwlock);
3398+
mutex_exit(&(*parentp)->db_mtx);
33903399
return (0);
33913400
} else {
33923401
/* the block is referenced from the dnode */
@@ -4570,10 +4579,12 @@ dbuf_lightweight_bp(dbuf_dirty_record_t *dr)
45704579
return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]);
45714580
} else {
45724581
dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf;
4582+
ASSERT(MUTEX_HELD(&parent_db->db_mtx));
45734583
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
45744584
VERIFY3U(parent_db->db_level, ==, 1);
45754585
VERIFY3P(DB_DNODE(parent_db), ==, dn);
45764586
VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid);
4587+
ASSERT(RW_LOCK_HELD(&parent_db->db_rwlock));
45774588
blkptr_t *bp = parent_db->db.db_data;
45784589
return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]);
45794590
}
@@ -4584,12 +4595,22 @@ dbuf_lightweight_ready(zio_t *zio)
45844595
{
45854596
dbuf_dirty_record_t *dr = zio->io_private;
45864597
blkptr_t *bp = zio->io_bp;
4598+
dmu_buf_impl_t *parent_db = NULL;
45874599

45884600
if (zio->io_error != 0)
45894601
return;
45904602

45914603
dnode_t *dn = dr->dr_dnode;
45924604

4605+
EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
4606+
if (dr->dr_parent == NULL) {
4607+
parent_db = dn->dn_dbuf;
4608+
} else {
4609+
parent_db = dr->dr_parent->dr_dbuf;
4610+
}
4611+
mutex_enter(&parent_db->db_mtx);
4612+
4613+
rw_enter(&parent_db->db_rwlock, RW_READER);
45934614
blkptr_t *bp_orig = dbuf_lightweight_bp(dr);
45944615
spa_t *spa = dmu_objset_spa(dn->dn_objset);
45954616
int64_t delta = bp_get_dsize_sync(spa, bp) -
@@ -4609,16 +4630,13 @@ dbuf_lightweight_ready(zio_t *zio)
46094630
BP_SET_FILL(bp, fill);
46104631
}
46114632

4612-
dmu_buf_impl_t *parent_db;
4613-
EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
4614-
if (dr->dr_parent == NULL) {
4615-
parent_db = dn->dn_dbuf;
4616-
} else {
4617-
parent_db = dr->dr_parent->dr_dbuf;
4633+
if (!rw_tryupgrade(&parent_db->db_rwlock)) {
4634+
rw_exit(&parent_db->db_rwlock);
4635+
rw_enter(&parent_db->db_rwlock, RW_WRITER);
46184636
}
4619-
rw_enter(&parent_db->db_rwlock, RW_WRITER);
46204637
*bp_orig = *bp;
46214638
rw_exit(&parent_db->db_rwlock);
4639+
mutex_exit(&parent_db->db_mtx);
46224640
}
46234641

46244642
static void
@@ -4650,6 +4668,7 @@ noinline static void
46504668
dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
46514669
{
46524670
dnode_t *dn = dr->dr_dnode;
4671+
dmu_buf_impl_t *parent_db = NULL;
46534672
zio_t *pio;
46544673
if (dn->dn_phys->dn_nlevels == 1) {
46554674
pio = dn->dn_zio;
@@ -4668,6 +4687,11 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
46684687
* See comment in dbuf_write(). This is so that zio->io_bp_orig
46694688
* will have the old BP in dbuf_lightweight_done().
46704689
*/
4690+
if (dr->dr_dnode->dn_phys->dn_nlevels != 1) {
4691+
parent_db = dr->dr_parent->dr_dbuf;
4692+
mutex_enter(&parent_db->db_mtx);
4693+
rw_enter(&parent_db->db_rwlock, RW_READER);
4694+
}
46714695
dr->dr_bp_copy = *dbuf_lightweight_bp(dr);
46724696

46734697
dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset),
@@ -4677,6 +4701,11 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
46774701
dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE,
46784702
ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);
46794703

4704+
if (parent_db) {
4705+
rw_exit(&parent_db->db_rwlock);
4706+
mutex_exit(&parent_db->db_mtx);
4707+
}
4708+
46804709
zio_nowait(dr->dr_zio);
46814710
}
46824711

@@ -4833,7 +4862,9 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
48334862
} else {
48344863
*datap = arc_alloc_buf(os->os_spa, db, type, psize);
48354864
}
4865+
rw_enter(&db->db_rwlock, RW_READER);
48364866
memcpy((*datap)->b_data, db->db.db_data, psize);
4867+
rw_exit(&db->db_rwlock);
48374868
}
48384869
db->db_data_pending = dr;
48394870

@@ -4939,6 +4970,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
49394970

49404971
if (dn->dn_type == DMU_OT_DNODE) {
49414972
i = 0;
4973+
rw_enter(&db->db_rwlock, RW_READER);
49424974
while (i < db->db.db_size) {
49434975
dnode_phys_t *dnp =
49444976
(void *)(((char *)db->db.db_data) + i);
@@ -4964,6 +4996,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
49644996
DNODE_MIN_SIZE;
49654997
}
49664998
}
4999+
rw_exit(&db->db_rwlock);
49675000
} else {
49685001
if (BP_IS_HOLE(bp)) {
49695002
fill = 0;
@@ -4972,6 +5005,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
49725005
}
49735006
}
49745007
} else {
5008+
rw_enter(&db->db_rwlock, RW_READER);
49755009
blkptr_t *ibp = db->db.db_data;
49765010
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
49775011
for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
@@ -4981,6 +5015,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
49815015
BLK_CONFIG_SKIP, BLK_VERIFY_HALT);
49825016
fill += BP_GET_FILL(ibp);
49835017
}
5018+
rw_exit(&db->db_rwlock);
49845019
}
49855020
DB_DNODE_EXIT(db);
49865021

@@ -5015,6 +5050,8 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
50155050
DB_DNODE_EXIT(db);
50165051
ASSERT3U(epbs, <, 31);
50175052

5053+
mutex_enter(&db->db_mtx);
5054+
rw_enter(&db->db_rwlock, RW_READER);
50185055
/* Determine if all our children are holes */
50195056
for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) {
50205057
if (!BP_IS_HOLE(bp))
@@ -5031,10 +5068,14 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
50315068
* anybody from reading the blocks we're about to
50325069
* zero out.
50335070
*/
5034-
rw_enter(&db->db_rwlock, RW_WRITER);
5071+
if (!rw_tryupgrade(&db->db_rwlock)) {
5072+
rw_exit(&db->db_rwlock);
5073+
rw_enter(&db->db_rwlock, RW_WRITER);
5074+
}
50355075
memset(db->db.db_data, 0, db->db.db_size);
5036-
rw_exit(&db->db_rwlock);
50375076
}
5077+
rw_exit(&db->db_rwlock);
5078+
mutex_exit(&db->db_mtx);
50385079
}
50395080

50405081
static void
@@ -5230,11 +5271,11 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
52305271
* avoid lock contention, only grab it when we are actually
52315272
* changing the BP.
52325273
*/
5233-
if (rw != NULL)
5274+
if (rw != NULL && !rw_tryupgrade(rw)) {
5275+
rw_exit(rw);
52345276
rw_enter(rw, RW_WRITER);
5277+
}
52355278
*bp = bp_copy;
5236-
if (rw != NULL)
5237-
rw_exit(rw);
52385279
}
52395280
}
52405281

@@ -5250,6 +5291,8 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
52505291
if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
52515292
return;
52525293

5294+
mutex_enter(&db->db_mtx);
5295+
rw_enter(&db->db_rwlock, RW_READER);
52535296
if (db->db_level > 0) {
52545297
blkptr_t *bp = db->db.db_data;
52555298
for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
@@ -5268,6 +5311,8 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
52685311
}
52695312
}
52705313
}
5314+
rw_exit(&db->db_rwlock);
5315+
mutex_exit(&db->db_mtx);
52715316
}
52725317

52735318

module/zfs/dmu_objset.c

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2296,6 +2296,7 @@ void
22962296
dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
22972297
{
22982298
objset_t *os = dn->dn_objset;
2299+
krwlock_t *rw = NULL;
22992300
void *data = NULL;
23002301
dmu_buf_impl_t *db = NULL;
23012302
int flags = dn->dn_id_flags;
@@ -2340,8 +2341,12 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
23402341
FTAG, (dmu_buf_t **)&db);
23412342
ASSERT(error == 0);
23422343
mutex_enter(&db->db_mtx);
2343-
data = (before) ? db->db.db_data :
2344-
dmu_objset_userquota_find_data(db, tx);
2344+
if (before) {
2345+
rw = &db->db_rwlock;
2346+
data = db->db.db_data;
2347+
} else {
2348+
data = dmu_objset_userquota_find_data(db, tx);
2349+
}
23452350
have_spill = B_TRUE;
23462351
} else {
23472352
mutex_enter(&dn->dn_mtx);
@@ -2355,7 +2360,11 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
23552360
* type has changed and that type isn't an object type to track
23562361
*/
23572362
zfs_file_info_t zfi;
2363+
if (rw)
2364+
rw_enter(rw, RW_READER);
23582365
error = file_cbs[os->os_phys->os_type](dn->dn_bonustype, data, &zfi);
2366+
if (rw)
2367+
rw_exit(rw);
23592368

23602369
if (before) {
23612370
ASSERT(data);

0 commit comments

Comments
 (0)