Skip to content

Commit 3d00930

Browse files
committed
deadlock between spa_errlog_lock and dp_config_rwlock
There is a lock order inversion deadlock between `spa_errlog_lock` and `dp_config_rwlock`: A thread in `spa_delete_dataset_errlog()` is running from a sync task. It is holding the `dp_config_rwlock` for writer (see `dsl_sync_task_sync()`), and waiting for the `spa_errlog_lock`. A thread in `dsl_pool_config_enter()` is holding the `spa_errlog_lock` (see `spa_get_errlog_size()`) and waiting for the `dp_config_rwlock` (as reader). Note that this was introduced by #12812. This commit addresses this by defining the lock ordering to be dp_config_rwlock first, then spa_errlog_lock / spa_errlist_lock. spa_get_errlog() and spa_get_errlog_size() can acquire the locks in this order, and then process_error_block() and get_head_and_birth_txg() can verify that the dp_config_rwlock is already held. Additionally, a buffer overrun in `spa_get_errlog()` is corrected. Many code paths didn't check if `*count` got to zero, instead continuing to overwrite past the beginning of the userspace buffer at `uaddr`. Tested by having some errors in the pool (via `zinject -t data /path/to/file`), one thread running `zpool iostat 0.001`, and another thread running `zfs destroy` (in a loop, although it hits the first time). This reproduces the problem easily without the fix, and works with the fix. Closes #14239 Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
1 parent 5f73bbb commit 3d00930

File tree

8 files changed

+132
-204
lines changed

8 files changed

+132
-204
lines changed

cmd/ztest.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6313,7 +6313,7 @@ ztest_scrub_impl(spa_t *spa)
63136313
while (dsl_scan_scrubbing(spa_get_dsl(spa)))
63146314
txg_wait_synced(spa_get_dsl(spa), 0);
63156315

6316-
if (spa_get_errlog_size(spa) > 0)
6316+
if (spa_approx_errlog_size(spa) > 0)
63176317
return (ECKSUM);
63186318

63196319
ztest_pool_scrubbed = B_TRUE;

include/sys/spa.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1146,7 +1146,7 @@ extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type,
11461146
extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
11471147
extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate);
11481148
extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd);
1149-
extern uint64_t spa_get_errlog_size(spa_t *spa);
1149+
extern uint64_t spa_approx_errlog_size(spa_t *spa);
11501150
extern int spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count);
11511151
extern void spa_errlog_rotate(spa_t *spa);
11521152
extern void spa_errlog_drain(spa_t *spa);

lib/libzfs/libzfs_pool.c

Lines changed: 20 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4133,33 +4133,28 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
41334133
{
41344134
zfs_cmd_t zc = {"\0"};
41354135
libzfs_handle_t *hdl = zhp->zpool_hdl;
4136-
uint64_t count;
4137-
zbookmark_phys_t *zb = NULL;
4138-
int i;
4136+
zbookmark_phys_t *buf;
4137+
uint64_t buflen = 10000; /* approx. 1MB of RAM */
4138+
4139+
if (fnvlist_lookup_uint64(zhp->zpool_config,
4140+
ZPOOL_CONFIG_ERRCOUNT) == 0)
4141+
return (0);
41394142

41404143
/*
4141-
* Retrieve the raw error list from the kernel. If the number of errors
4142-
* has increased, allocate more space and continue until we get the
4143-
* entire list.
4144+
* Retrieve the raw error list from the kernel. If it doesn't fit,
4145+
* allocate a larger buffer and retry.
41444146
*/
4145-
count = fnvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_ERRCOUNT);
4146-
if (count == 0)
4147-
return (0);
4148-
zc.zc_nvlist_dst = (uintptr_t)zfs_alloc(zhp->zpool_hdl,
4149-
count * sizeof (zbookmark_phys_t));
4150-
zc.zc_nvlist_dst_size = count;
41514147
(void) strcpy(zc.zc_name, zhp->zpool_name);
41524148
for (;;) {
4149+
buf = zfs_alloc(zhp->zpool_hdl,
4150+
buflen * sizeof (zbookmark_phys_t));
4151+
zc.zc_nvlist_dst = (uintptr_t)buf;
4152+
zc.zc_nvlist_dst_size = buflen;
41534153
if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_ERROR_LOG,
41544154
&zc) != 0) {
4155-
free((void *)(uintptr_t)zc.zc_nvlist_dst);
4155+
free(buf);
41564156
if (errno == ENOMEM) {
4157-
void *dst;
4158-
4159-
count = zc.zc_nvlist_dst_size;
4160-
dst = zfs_alloc(zhp->zpool_hdl, count *
4161-
sizeof (zbookmark_phys_t));
4162-
zc.zc_nvlist_dst = (uintptr_t)dst;
4157+
buflen *= 2;
41634158
} else {
41644159
return (zpool_standard_error_fmt(hdl, errno,
41654160
dgettext(TEXT_DOMAIN, "errors: List of "
@@ -4177,18 +4172,17 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
41774172
* _not_ copied as part of the process. So we point the start of our
41784173
array appropriately and decrement the total number of elements.
41794174
*/
4180-
zb = ((zbookmark_phys_t *)(uintptr_t)zc.zc_nvlist_dst) +
4181-
zc.zc_nvlist_dst_size;
4182-
count -= zc.zc_nvlist_dst_size;
4175+
zbookmark_phys_t *zb = buf + zc.zc_nvlist_dst_size;
4176+
uint64_t zblen = buflen - zc.zc_nvlist_dst_size;
41834177

4184-
qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_mem_compare);
4178+
qsort(zb, zblen, sizeof (zbookmark_phys_t), zbookmark_mem_compare);
41854179

41864180
verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0);
41874181

41884182
/*
41894183
* Fill in the nverrlistp with nvlist's of dataset and object numbers.
41904184
*/
4191-
for (i = 0; i < count; i++) {
4185+
for (uint64_t i = 0; i < zblen; i++) {
41924186
nvlist_t *nv;
41934187

41944188
/* ignoring zb_blkid and zb_level for now */
@@ -4215,11 +4209,11 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
42154209
nvlist_free(nv);
42164210
}
42174211

4218-
free((void *)(uintptr_t)zc.zc_nvlist_dst);
4212+
free(buf);
42194213
return (0);
42204214

42214215
nomem:
4222-
free((void *)(uintptr_t)zc.zc_nvlist_dst);
4216+
free(buf);
42234217
return (no_memory(zhp->zpool_hdl));
42244218
}
42254219

lib/libzfs/libzfs_status.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,6 @@ check_status(nvlist_t *config, boolean_t isimport,
222222
{
223223
pool_scan_stat_t *ps = NULL;
224224
uint_t vsc, psc;
225-
uint64_t nerr;
226225
uint64_t suspended;
227226
uint64_t hostid = 0;
228227
uint64_t errata = 0;
@@ -392,6 +391,7 @@ check_status(nvlist_t *config, boolean_t isimport,
392391
* Persistent data errors.
393392
*/
394393
if (!isimport) {
394+
uint64_t nerr;
395395
if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
396396
&nerr) == 0 && nerr != 0)
397397
return (ZPOOL_STATUS_CORRUPT_DATA);

module/zfs/dsl_scan.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -944,13 +944,13 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
944944

945945
if (dsl_scan_restarting(scn, tx))
946946
spa_history_log_internal(spa, "scan aborted, restarting", tx,
947-
"errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
947+
"errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
948948
else if (!complete)
949949
spa_history_log_internal(spa, "scan cancelled", tx,
950-
"errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
950+
"errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
951951
else
952952
spa_history_log_internal(spa, "scan done", tx,
953-
"errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
953+
"errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
954954

955955
if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
956956
spa->spa_scrub_active = B_FALSE;
@@ -1013,7 +1013,7 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
10131013
vdev_clear_resilver_deferred(spa->spa_root_vdev, tx)) {
10141014
spa_history_log_internal(spa,
10151015
"starting deferred resilver", tx, "errors=%llu",
1016-
(u_longlong_t)spa_get_errlog_size(spa));
1016+
(u_longlong_t)spa_approx_errlog_size(spa));
10171017
spa_async_request(spa, SPA_ASYNC_RESILVER);
10181018
}
10191019

module/zfs/spa.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5543,7 +5543,7 @@ spa_get_stats(const char *name, nvlist_t **config,
55435543

55445544
fnvlist_add_uint64(*config,
55455545
ZPOOL_CONFIG_ERRCOUNT,
5546-
spa_get_errlog_size(spa));
5546+
spa_approx_errlog_size(spa));
55475547

55485548
if (spa_suspended(spa)) {
55495549
fnvlist_add_uint64(*config,

0 commit comments

Comments
 (0)