Skip to content

Commit 77e4878

Browse files
committed
Detect a slow raidz child during reads
A single slow responding disk can affect the overall read performance of a raidz group. When a raidz child disk is determined to be a persistent slow outlier, then have it sit out during reads for a period of time. The raidz group can use parity to reconstruct the data that was skipped. Each time a slow disk is placed into a sit out period, its `vdev_stat.vs_slow_ios` count is incremented and a zevent class `ereport.fs.zfs.delay` is posted. The length of the sit out period can be changed using the `raidz_read_sit_out_secs` module parameter. Setting it to zero disables slow outlier detection. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Don Brady <don.brady@klarasystems.com>
1 parent 18c67d2 commit 77e4878

File tree

12 files changed

+345
-1
lines changed

12 files changed

+345
-1
lines changed

include/sys/fs/zfs.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,7 @@ typedef enum {
379379
VDEV_PROP_TRIM_SUPPORT,
380380
VDEV_PROP_TRIM_ERRORS,
381381
VDEV_PROP_SLOW_IOS,
382+
VDEV_PROP_SIT_OUT_READS,
382383
VDEV_NUM_PROPS
383384
} vdev_prop_t;
384385

include/sys/vdev_impl.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,7 @@ struct vdev {
285285
boolean_t vdev_ishole; /* is a hole in the namespace */
286286
uint64_t vdev_top_zap;
287287
vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */
288+
uint64_t vdev_last_latency_check;
288289

289290
/* pool checkpoint related */
290291
space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */
@@ -432,6 +433,9 @@ struct vdev {
432433
hrtime_t vdev_mmp_pending; /* 0 if write finished */
433434
uint64_t vdev_mmp_kstat_id; /* to find kstat entry */
434435
uint64_t vdev_expansion_time; /* vdev's last expansion time */
436+
uint64_t vdev_outlier_count; /* read outlier amongst peers */
437+
uint64_t vdev_ewma_latency; /* moving average read latency */
438+
hrtime_t vdev_read_sit_out_expire; /* end of sit out period */
435439
list_node_t vdev_leaf_node; /* leaf vdev list */
436440

437441
/*

include/sys/vdev_raidz.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ void vdev_raidz_checksum_error(zio_t *, struct raidz_col *, abd_t *);
6060
struct raidz_row *vdev_raidz_row_alloc(int, zio_t *);
6161
void vdev_raidz_reflow_copy_scratch(spa_t *);
6262
void raidz_dtl_reassessed(vdev_t *);
63+
boolean_t vdev_sit_out_reads(vdev_t *, zio_flag_t);
6364

6465
extern const zio_vsd_ops_t vdev_raidz_vsd_ops;
6566

include/sys/vdev_raidz_impl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ typedef struct raidz_col {
118118
uint8_t rc_need_orig_restore:1; /* need to restore from orig_data? */
119119
uint8_t rc_force_repair:1; /* Write good data to this column */
120120
uint8_t rc_allow_repair:1; /* Allow repair I/O to this column */
121+
uint8_t rc_latency_outlier:1; /* Latency outlier for this device */
121122
int rc_shadow_devidx; /* for double write during expansion */
122123
int rc_shadow_error; /* for double write during expansion */
123124
uint64_t rc_shadow_offset; /* for double write during expansion */
@@ -132,6 +133,7 @@ typedef struct raidz_row {
132133
int rr_firstdatacol; /* First data column/parity count */
133134
abd_t *rr_abd_empty; /* dRAID empty sector buffer */
134135
int rr_nempty; /* empty sectors included in parity */
136+
int rr_outlier_cnt; /* Count of latency outlier devices */
135137
#ifdef ZFS_DEBUG
136138
uint64_t rr_offset; /* Logical offset for *_io_verify() */
137139
uint64_t rr_size; /* Physical size for *_io_verify() */

lib/libzfs/libzfs.abi

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5917,7 +5917,8 @@
59175917
<enumerator name='VDEV_PROP_TRIM_SUPPORT' value='49'/>
59185918
<enumerator name='VDEV_PROP_TRIM_ERRORS' value='50'/>
59195919
<enumerator name='VDEV_PROP_SLOW_IOS' value='51'/>
5920-
<enumerator name='VDEV_NUM_PROPS' value='52'/>
5920+
<enumerator name='VDEV_PROP_SIT_OUT_READS' value='52'/>
5921+
<enumerator name='VDEV_NUM_PROPS' value='53'/>
59215922
</enum-decl>
59225923
<typedef-decl name='vdev_prop_t' type-id='1573bec8' id='5aa5c90c'/>
59235924
<class-decl name='zpool_load_policy' size-in-bits='256' is-struct='yes' visibility='default' id='2f65b36f'>

lib/libzfs/libzfs_pool.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5478,6 +5478,8 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name,
54785478
/* Only use if provided by the RAIDZ VDEV above */
54795479
if (prop == VDEV_PROP_RAIDZ_EXPANDING)
54805480
return (ENOENT);
5481+
if (prop == VDEV_PROP_SIT_OUT_READS)
5482+
return (ENOENT);
54815483
}
54825484
if (vdev_prop_index_to_string(prop, intval,
54835485
(const char **)&strval) != 0)

man/man4/zfs.4

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -501,6 +501,18 @@ For testing, pause RAID-Z expansion when reflow amount reaches this value.
501501
.It Sy raidz_io_aggregate_rows Ns = Ns Sy 4 Pq ulong
502502
For expanded RAID-Z, aggregate reads that have more rows than this.
503503
.
504+
.It Sy raidz_read_sit_out_secs Ns = Ns Sy 600 Ns s Po 10 min Pc Pq ulong
505+
When a slow disk outlier is detected it is placed in a sit out state.
506+
While sitting out, the disk will not participate in normal reads, instead its
507+
data will be reconstructed as needed from parity.
508+
Resilver and scrub operations will always read from a disk, even if it's
509+
sitting out.
510+
Only a single disk in a RAID-Z or dRAID vdev may sit out at the same time.
511+
Writes will still be issued to a disk which is sitting out to maintain full
512+
redundancy.
513+
Defaults to 600 seconds and a value of zero disables slow disk outlier
514+
detection.
515+
.
504516
.It Sy reference_history Ns = Ns Sy 3 Pq int
505517
Maximum reference holders being tracked when reference_tracking_enable is
506518
active.

man/man7/vdevprops.7

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,12 +104,19 @@ Comma separated list of children of this vdev
104104
The number of children belonging to this vdev
105105
.It Sy read_errors , write_errors , checksum_errors , initialize_errors , trim_errors
106106
The number of errors of each type encountered by this vdev
107+
.It Sy sit_out_reads
108+
True when a slow disk outlier was detected and the vdev is currently in a sit
109+
out state.
110+
While sitting out, the vdev will not participate in normal reads, instead its
111+
data will be reconstructed as needed from parity.
107112
.It Sy slow_ios
108113
The number of slow I/Os encountered by this vdev.
109114
These represent I/O operations that didn't complete in
110115
.Sy zio_slow_io_ms
111116
milliseconds
112117
.Pq Sy 30000 No by default .
118+
Can also be incremented when a vdev was determined to be a raidz leaf latency
119+
outlier.
113120
.It Sy null_ops , read_ops , write_ops , free_ops , claim_ops , trim_ops
114121
The number of I/O operations of each type performed by this vdev
115122
.It Xo

module/zcommon/zpool_prop.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -466,6 +466,9 @@ vdev_prop_init(void)
466466
zprop_register_index(VDEV_PROP_RAIDZ_EXPANDING, "raidz_expanding", 0,
467467
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "RAIDZ_EXPANDING",
468468
boolean_table, sfeatures);
469+
zprop_register_index(VDEV_PROP_SIT_OUT_READS, "sit_out_reads", 0,
470+
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "SIT_OUT_READS",
471+
boolean_table, sfeatures);
469472
zprop_register_index(VDEV_PROP_TRIM_SUPPORT, "trim_support", 0,
470473
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "TRIMSUP",
471474
boolean_table, sfeatures);

module/zfs/vdev.c

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4521,6 +4521,8 @@ vdev_clear(spa_t *spa, vdev_t *vd)
45214521
vd->vdev_stat.vs_checksum_errors = 0;
45224522
vd->vdev_stat.vs_dio_verify_errors = 0;
45234523
vd->vdev_stat.vs_slow_ios = 0;
4524+
atomic_store_64(&vd->vdev_outlier_count, 0);
4525+
vd->vdev_read_sit_out_expire = 0;
45244526

45254527
for (int c = 0; c < vd->vdev_children; c++)
45264528
vdev_clear(spa, vd->vdev_child[c]);
@@ -6361,6 +6363,19 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
63616363
ZPROP_SRC_NONE);
63626364
}
63636365
continue;
6366+
case VDEV_PROP_SIT_OUT_READS:
6367+
/* Only expose this for a draid or raidz leaf */
6368+
if (vd->vdev_ops->vdev_op_leaf &&
6369+
vd->vdev_top != NULL &&
6370+
(vd->vdev_top->vdev_ops ==
6371+
&vdev_raidz_ops ||
6372+
vd->vdev_top->vdev_ops ==
6373+
&vdev_draid_ops)) {
6374+
vdev_prop_add_list(outnvl, propname,
6375+
NULL, vdev_sit_out_reads(vd, 0),
6376+
ZPROP_SRC_NONE);
6377+
}
6378+
continue;
63646379
case VDEV_PROP_TRIM_SUPPORT:
63656380
/* only valid for leaf vdevs */
63666381
if (vd->vdev_ops->vdev_op_leaf) {

module/zfs/vdev_draid.c

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1993,6 +1993,29 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
19931993
rc->rc_force_repair = 1;
19941994
rc->rc_allow_repair = 1;
19951995
}
1996+
} else if (vdev_sit_out_reads(cvd, zio->io_flags)) {
1997+
rr->rr_outlier_cnt++;
1998+
rc->rc_latency_outlier = 1;
1999+
}
2000+
}
2001+
2002+
/*
2003+
* When the row contains a latency outlier and sufficient parity
2004+
* exists to reconstruct the column data, then skip reading the
2005+
* known slow child vdev as a performance optimization.
2006+
*/
2007+
if (rr->rr_outlier_cnt > 0 && rr->rr_missingdata == 0 &&
2008+
(rr->rr_firstdatacol - rr->rr_missingparity) > 0) {
2009+
2010+
for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) {
2011+
raidz_col_t *rc = &rr->rr_col[c];
2012+
2013+
if (rc->rc_latency_outlier) {
2014+
rr->rr_missingdata++;
2015+
rc->rc_error = SET_ERROR(EAGAIN);
2016+
rc->rc_skipped = 1;
2017+
break;
2018+
}
19962019
}
19972020
}
19982021

0 commit comments

Comments
 (0)