Skip to content

Be more careful with locking db.db_mtx #17418

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 58 additions & 13 deletions module/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -1193,7 +1193,8 @@ dbuf_verify(dmu_buf_impl_t *db)
if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
(db->db_buf == NULL || db->db_buf->b_data) &&
db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg)) {
db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg) &&
RW_LOCK_HELD(&db->db_rwlock)) {
/*
* If the blkptr isn't set but they have nonzero data,
* it had better be dirty, otherwise we'll lose that
Expand Down Expand Up @@ -1697,7 +1698,9 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP);
arc_space_consume(bonuslen, ARC_SPACE_BONUS);
rw_enter(&db->db_rwlock, RW_READER);
memcpy(dr->dt.dl.dr_data, db->db.db_data, bonuslen);
rw_exit(&db->db_rwlock);
} else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {
dnode_t *dn = DB_DNODE(db);
int size = arc_buf_size(db->db_buf);
Expand Down Expand Up @@ -1727,7 +1730,9 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
} else {
dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
}
rw_enter(&db->db_rwlock, RW_READER);
memcpy(dr->dt.dl.dr_data->b_data, db->db.db_data, size);
rw_exit(&db->db_rwlock);
} else {
db->db_buf = NULL;
dbuf_clear_data(db);
Expand Down Expand Up @@ -2999,7 +3004,9 @@ dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
/* we were freed while filling */
/* XXX dbuf_undirty? */
rw_enter(&db->db_rwlock, RW_WRITER);
memset(db->db.db_data, 0, db->db.db_size);
rw_exit(&db->db_rwlock);
db->db_freed_in_flight = FALSE;
db->db_state = DB_CACHED;
DTRACE_SET_STATE(db,
Expand Down Expand Up @@ -3374,8 +3381,10 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
*parentp = NULL;
return (err);
}
mutex_enter(&(*parentp)->db_mtx);
*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
(blkid & ((1ULL << epbs) - 1));
mutex_exit(&(*parentp)->db_mtx);
return (0);
} else {
/* the block is referenced from the dnode */
Expand Down Expand Up @@ -4560,10 +4569,12 @@ dbuf_lightweight_bp(dbuf_dirty_record_t *dr)
return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]);
} else {
dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf;
ASSERT(MUTEX_HELD(&parent_db->db_mtx));
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
VERIFY3U(parent_db->db_level, ==, 1);
VERIFY3P(DB_DNODE(parent_db), ==, dn);
VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid);
ASSERT(RW_LOCK_HELD(&parent_db->db_rwlock));
blkptr_t *bp = parent_db->db.db_data;
return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]);
}
Expand All @@ -4574,12 +4585,22 @@ dbuf_lightweight_ready(zio_t *zio)
{
dbuf_dirty_record_t *dr = zio->io_private;
blkptr_t *bp = zio->io_bp;
dmu_buf_impl_t *parent_db = NULL;

if (zio->io_error != 0)
return;

dnode_t *dn = dr->dr_dnode;

EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
if (dr->dr_parent == NULL) {
parent_db = dn->dn_dbuf;
} else {
parent_db = dr->dr_parent->dr_dbuf;
}
mutex_enter(&parent_db->db_mtx);

rw_enter(&parent_db->db_rwlock, RW_READER);
blkptr_t *bp_orig = dbuf_lightweight_bp(dr);
spa_t *spa = dmu_objset_spa(dn->dn_objset);
int64_t delta = bp_get_dsize_sync(spa, bp) -
Expand All @@ -4599,16 +4620,13 @@ dbuf_lightweight_ready(zio_t *zio)
BP_SET_FILL(bp, fill);
}

dmu_buf_impl_t *parent_db;
EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
if (dr->dr_parent == NULL) {
parent_db = dn->dn_dbuf;
} else {
parent_db = dr->dr_parent->dr_dbuf;
if (!rw_tryupgrade(&parent_db->db_rwlock)) {
rw_exit(&parent_db->db_rwlock);
rw_enter(&parent_db->db_rwlock, RW_WRITER);
}
rw_enter(&parent_db->db_rwlock, RW_WRITER);
*bp_orig = *bp;
rw_exit(&parent_db->db_rwlock);
mutex_exit(&parent_db->db_mtx);
}

static void
Expand Down Expand Up @@ -4640,6 +4658,7 @@ noinline static void
dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
dnode_t *dn = dr->dr_dnode;
dmu_buf_impl_t *parent_db = NULL;
zio_t *pio;
if (dn->dn_phys->dn_nlevels == 1) {
pio = dn->dn_zio;
Expand All @@ -4658,6 +4677,11 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
* See comment in dbuf_write(). This is so that zio->io_bp_orig
* will have the old BP in dbuf_lightweight_done().
*/
if (dr->dr_dnode->dn_phys->dn_nlevels != 1) {
parent_db = dr->dr_parent->dr_dbuf;
mutex_enter(&parent_db->db_mtx);
rw_enter(&parent_db->db_rwlock, RW_READER);
}
dr->dr_bp_copy = *dbuf_lightweight_bp(dr);

dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset),
Expand All @@ -4667,6 +4691,11 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);

if (parent_db) {
rw_exit(&parent_db->db_rwlock);
mutex_exit(&parent_db->db_mtx);
}

zio_nowait(dr->dr_zio);
}

Expand Down Expand Up @@ -4823,7 +4852,9 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
} else {
*datap = arc_alloc_buf(os->os_spa, db, type, psize);
}
rw_enter(&db->db_rwlock, RW_READER);
memcpy((*datap)->b_data, db->db.db_data, psize);
rw_exit(&db->db_rwlock);
}
db->db_data_pending = dr;

Expand Down Expand Up @@ -4929,6 +4960,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)

if (dn->dn_type == DMU_OT_DNODE) {
i = 0;
rw_enter(&db->db_rwlock, RW_READER);
while (i < db->db.db_size) {
dnode_phys_t *dnp =
(void *)(((char *)db->db.db_data) + i);
Expand All @@ -4954,6 +4986,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
DNODE_MIN_SIZE;
}
}
rw_exit(&db->db_rwlock);
} else {
if (BP_IS_HOLE(bp)) {
fill = 0;
Expand All @@ -4962,6 +4995,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
}
}
} else {
rw_enter(&db->db_rwlock, RW_READER);
blkptr_t *ibp = db->db.db_data;
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
Expand All @@ -4971,6 +5005,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
BLK_CONFIG_SKIP, BLK_VERIFY_HALT);
fill += BP_GET_FILL(ibp);
}
rw_exit(&db->db_rwlock);
}
DB_DNODE_EXIT(db);

Expand Down Expand Up @@ -5005,6 +5040,8 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
DB_DNODE_EXIT(db);
ASSERT3U(epbs, <, 31);

mutex_enter(&db->db_mtx);
rw_enter(&db->db_rwlock, RW_READER);
/* Determine if all our children are holes */
for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) {
if (!BP_IS_HOLE(bp))
Expand All @@ -5021,10 +5058,14 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
* anybody from reading the blocks we're about to
* zero out.
*/
rw_enter(&db->db_rwlock, RW_WRITER);
if (!rw_tryupgrade(&db->db_rwlock)) {
rw_exit(&db->db_rwlock);
rw_enter(&db->db_rwlock, RW_WRITER);
}
memset(db->db.db_data, 0, db->db.db_size);
rw_exit(&db->db_rwlock);
}
rw_exit(&db->db_rwlock);
mutex_exit(&db->db_mtx);
}

static void
Expand Down Expand Up @@ -5220,11 +5261,11 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
* avoid lock contention, only grab it when we are actually
* changing the BP.
*/
if (rw != NULL)
if (rw != NULL && !rw_tryupgrade(rw)) {
rw_exit(rw);
rw_enter(rw, RW_WRITER);
}
*bp = bp_copy;
if (rw != NULL)
rw_exit(rw);
}
}

Expand All @@ -5240,6 +5281,8 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
return;

mutex_enter(&db->db_mtx);
rw_enter(&db->db_rwlock, RW_READER);
if (db->db_level > 0) {
blkptr_t *bp = db->db.db_data;
for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
Expand All @@ -5258,6 +5301,8 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
}
}
}
rw_exit(&db->db_rwlock);
mutex_exit(&db->db_mtx);
}


Expand Down
13 changes: 11 additions & 2 deletions module/zfs/dmu_objset.c
Original file line number Diff line number Diff line change
Expand Up @@ -2190,6 +2190,7 @@ void
dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
{
objset_t *os = dn->dn_objset;
krwlock_t *rw = NULL;
void *data = NULL;
dmu_buf_impl_t *db = NULL;
int flags = dn->dn_id_flags;
Expand Down Expand Up @@ -2234,8 +2235,12 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
FTAG, (dmu_buf_t **)&db);
ASSERT(error == 0);
mutex_enter(&db->db_mtx);
data = (before) ? db->db.db_data :
dmu_objset_userquota_find_data(db, tx);
if (before) {
rw = &db->db_rwlock;
data = db->db.db_data;
} else {
data = dmu_objset_userquota_find_data(db, tx);
}
have_spill = B_TRUE;
} else {
mutex_enter(&dn->dn_mtx);
Expand All @@ -2249,7 +2254,11 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
* type has changed and that type isn't an object type to track
*/
zfs_file_info_t zfi;
if (rw)
rw_enter(rw, RW_READER);
error = file_cbs[os->os_phys->os_type](dn->dn_bonustype, data, &zfi);
if (rw)
rw_exit(rw);

if (before) {
ASSERT(data);
Expand Down
Loading
Loading