Skip to content

Add vdev property to toggle adding io to vdev queue #17358

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions include/sys/fs/zfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -385,9 +385,23 @@ typedef enum {
VDEV_PROP_TRIM_SUPPORT,
VDEV_PROP_TRIM_ERRORS,
VDEV_PROP_SLOW_IOS,
VDEV_PROP_SCHEDULER,
VDEV_NUM_PROPS
} vdev_prop_t;

/*
* Different scheduling behaviors for vdev prop io_scheduler.
* VDEV_SCHEDULER_AUTO = Don't queue if vdev is nonrot and backed by blkdev,
* queue otherwise.
* VDEV_SCHEDULER_CLASSIC = Always queue.
* VDEV_SCHEDULER_NONE = Never Queue.
*/
typedef enum {
VDEV_SCHEDULER_AUTO,
VDEV_SCHEDULER_CLASSIC,
VDEV_SCHEDULER_NONE
} vdev_scheduler_type_t;

/*
* Dataset property functions shared between libzfs and kernel.
*/
Expand Down
2 changes: 2 additions & 0 deletions include/sys/vdev_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,7 @@ struct vdev {
boolean_t vdev_resilver_deferred; /* resilver deferred */
boolean_t vdev_kobj_flag; /* kobj event record */
boolean_t vdev_attaching; /* vdev attach ashift handling */
boolean_t vdev_is_blkdev; /* vdev is backed by block device */
vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */
zio_t *vdev_probe_zio; /* root of current probe */
Expand Down Expand Up @@ -466,6 +467,7 @@ struct vdev {
uint64_t vdev_io_t;
uint64_t vdev_slow_io_n;
uint64_t vdev_slow_io_t;
uint64_t vdev_scheduler; /* control how I/Os are submitted */
};

#define VDEV_PAD_SIZE (8 << 10)
Expand Down
3 changes: 2 additions & 1 deletion lib/libzfs/libzfs.abi
Original file line number Diff line number Diff line change
Expand Up @@ -5929,7 +5929,8 @@
<enumerator name='VDEV_PROP_TRIM_SUPPORT' value='49'/>
<enumerator name='VDEV_PROP_TRIM_ERRORS' value='50'/>
<enumerator name='VDEV_PROP_SLOW_IOS' value='51'/>
<enumerator name='VDEV_NUM_PROPS' value='52'/>
<enumerator name='VDEV_PROP_SCHEDULER' value='52'/>
<enumerator name='VDEV_NUM_PROPS' value='53'/>
</enum-decl>
<typedef-decl name='vdev_prop_t' type-id='1573bec8' id='5aa5c90c'/>
<class-decl name='zpool_load_policy' size-in-bits='256' is-struct='yes' visibility='default' id='2f65b36f'>
Expand Down
16 changes: 16 additions & 0 deletions man/man7/vdevprops.7
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,22 @@ If this device should perform new allocations, used to disable a device
when it is scheduled for later removal.
See
.Xr zpool-remove 8 .
.It Sy io_scheduler Ns = Ns Sy auto Ns | Ns Sy classic Ns | Ns Sy none
Controls how I/O requests are added to the vdev queue when reading or
writing to this vdev.
.It Sy auto
Does not add I/O requests to the vdev queue if the vdev is backed by a
non-rotational block device.
This can sometimes improve performance for direct IOs.
I/O requests will be added to the vdev queue if the vdev is backed by a
rotational block device or file.
This is the default behavior.
.It Sy classic
Always adds I/O requests to the vdev queue.
.It Sy none
Never adds I/O requests to the vdev queue.
This is not recommended for vdevs backed by spinning disks as it could
result in starvation.
.El
.Ss User Properties
In addition to the standard native properties, ZFS supports arbitrary user
Expand Down
3 changes: 3 additions & 0 deletions module/os/freebsd/zfs/vdev_geom.c
Original file line number Diff line number Diff line change
Expand Up @@ -968,6 +968,9 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
else
vd->vdev_nonrot = B_FALSE;

/* Is backed by a block device. */
vd->vdev_is_blkdev = B_TRUE;

/* Set when device reports it supports TRIM. */
error = g_getattr("GEOM::candelete", cp, &has_trim);
vd->vdev_has_trim = (error == 0 && has_trim);
Expand Down
3 changes: 3 additions & 0 deletions module/os/linux/zfs/vdev_disk.c
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,9 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
/* Inform the ZIO pipeline that we are non-rotational */
v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev));

/* Is backed by a block device. */
v->vdev_is_blkdev = B_TRUE;

/* Physical volume size in bytes for the partition */
*psize = bdev_capacity(bdev);

Expand Down
11 changes: 11 additions & 0 deletions module/zcommon/zpool_prop.c
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,13 @@ vdev_prop_init(void)
{ NULL }
};

static const zprop_index_t vdevschedulertype_table[] = {
{ "auto", VDEV_SCHEDULER_AUTO },
{ "classic", VDEV_SCHEDULER_CLASSIC },
{ "none", VDEV_SCHEDULER_NONE },
{ NULL }
};

struct zfs_mod_supported_features *sfeatures =
zfs_mod_list_supported(ZFS_SYSFS_VDEV_PROPERTIES);

Expand Down Expand Up @@ -470,6 +477,10 @@ vdev_prop_init(void)
zprop_register_index(VDEV_PROP_TRIM_SUPPORT, "trim_support", 0,
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "TRIMSUP",
boolean_table, sfeatures);
zprop_register_index(VDEV_PROP_SCHEDULER, "scheduler",
VDEV_SCHEDULER_AUTO, PROP_DEFAULT, ZFS_TYPE_VDEV,
"auto | classic | none", "IO_SCHEDULER", vdevschedulertype_table,
sfeatures);

/* default index properties */
zprop_register_index(VDEV_PROP_FAILFAST, "failfast", B_TRUE,
Expand Down
18 changes: 18 additions & 0 deletions module/zfs/vdev.c
Original file line number Diff line number Diff line change
Expand Up @@ -718,6 +718,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N);
vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T);

vd->vdev_scheduler = vdev_prop_default_numeric(VDEV_PROP_SCHEDULER);

list_link_init(&vd->vdev_config_dirty_node);
list_link_init(&vd->vdev_state_dirty_node);
list_link_init(&vd->vdev_initialize_node);
Expand Down Expand Up @@ -3860,6 +3862,12 @@ vdev_load(vdev_t *vd)
if (error && error != ENOENT)
vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
"failed [error=%d]", (u_longlong_t)zapobj, error);

error = vdev_prop_get_int(vd, VDEV_PROP_SCHEDULER,
&vd->vdev_scheduler);
if (error && error != ENOENT)
vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
"failed [error=%d]", (u_longlong_t)zapobj, error);
}

/*
Expand Down Expand Up @@ -6095,6 +6103,15 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
}
vd->vdev_slow_io_t = intval;
break;
case VDEV_PROP_SCHEDULER:
if (nvpair_value_uint64(elem, &intval) != 0) {
error = EINVAL;
break;
}
if (vd->vdev_ops->vdev_op_leaf) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only allowing this to be set on leaves has the complication that when the drive is replaced it will revert back to the default value for the property. The VDEV_PROP_CHECKSUM/IO/SLOW_N/T properties handle this with a workaround which inherits non-default values if they're set on any of the parents (see vdev_prop_get_inherited). We should consider doing the same thing. The only wrinkle would be you'd want to cache to value in the vdev_t.

vd->vdev_scheduler = intval;
}
break;
default:
/* Most processing is done in vdev_props_set_sync */
break;
Expand Down Expand Up @@ -6458,6 +6475,7 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
case VDEV_PROP_IO_T:
case VDEV_PROP_SLOW_IO_N:
case VDEV_PROP_SLOW_IO_T:
case VDEV_PROP_SCHEDULER:
err = vdev_prop_get_int(vd, prop, &intval);
if (err && err != ENOENT)
break;
Expand Down
3 changes: 3 additions & 0 deletions module/zfs/vdev_file.c
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,9 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
*/
vd->vdev_nonrot = B_TRUE;

/* Is not backed by a block device. */
vd->vdev_is_blkdev = B_FALSE;

/*
* Allow TRIM on file based vdevs. This may not always be supported,
* since it depends on your kernel version and underlying filesystem
Expand Down
41 changes: 41 additions & 0 deletions module/zfs/vdev_queue.c
Original file line number Diff line number Diff line change
Expand Up @@ -879,6 +879,38 @@ vdev_queue_io_to_issue(vdev_queue_t *vq)
return (zio);
}

static boolean_t
vdev_should_queue_zio(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
boolean_t should_queue = B_TRUE;

/*
* Add zio with ZIO_FLAG_NODATA to queue as bypass code
* currently does not handle certain cases (gang abd, raidz
* write aggregation).
*/
if (zio->io_flags & ZIO_FLAG_NODATA)
return (B_TRUE);

switch (vd->vdev_scheduler) {
case VDEV_SCHEDULER_AUTO:
if (vd->vdev_nonrot && vd->vdev_is_blkdev)
should_queue = B_FALSE;
break;
case VDEV_SCHEDULER_CLASSIC:
should_queue = B_TRUE;
break;
case VDEV_SCHEDULER_NONE:
should_queue = B_FALSE;
break;
default:
should_queue = B_TRUE;
break;
}
return (should_queue);
}

zio_t *
vdev_queue_io(zio_t *zio)
{
Expand Down Expand Up @@ -922,6 +954,11 @@ vdev_queue_io(zio_t *zio)
zio->io_flags |= ZIO_FLAG_DONT_QUEUE;
zio->io_timestamp = gethrtime();

if (!vdev_should_queue_zio(zio)) {
zio->io_queue_state = ZIO_QS_NONE;
return (zio);
}

mutex_enter(&vq->vq_lock);
vdev_queue_io_add(vq, zio);
nio = vdev_queue_io_to_issue(vq);
Expand Down Expand Up @@ -954,6 +991,10 @@ vdev_queue_io_done(zio_t *zio)
vq->vq_io_complete_ts = now;
vq->vq_io_delta_ts = zio->io_delta = now - zio->io_timestamp;

if (zio->io_queue_state == ZIO_QS_NONE) {
return;
}

mutex_enter(&vq->vq_lock);
vdev_queue_pending_remove(vq, zio);

Expand Down
2 changes: 1 addition & 1 deletion tests/runfiles/common.run
Original file line number Diff line number Diff line change
Expand Up @@ -550,7 +550,7 @@ tags = ['functional', 'cli_root', 'zpool_scrub']
tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg',
'zpool_set_ashift', 'zpool_set_features', 'vdev_set_001_pos',
'user_property_001_pos', 'user_property_002_neg',
'zpool_set_clear_userprop']
'zpool_set_clear_userprop','vdev_set_io_scheduler']
tags = ['functional', 'cli_root', 'zpool_set']

[tests/functional/cli_root/zpool_split]
Expand Down
1 change: 1 addition & 0 deletions tests/zfs-tests/tests/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -1248,6 +1248,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/cli_root/zpool_set/setup.ksh \
functional/cli_root/zpool/setup.ksh \
functional/cli_root/zpool_set/vdev_set_001_pos.ksh \
functional/cli_root/zpool_set/vdev_set_io_scheduler.ksh \
functional/cli_root/zpool_set/zpool_set_common.kshlib \
functional/cli_root/zpool_set/zpool_set_001_pos.ksh \
functional/cli_root/zpool_set/zpool_set_002_neg.ksh \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,4 +76,5 @@ typeset -a properties=(
trim_support
trim_errors
slow_ios
scheduler
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2025 by Triad National Security, LLC.
#

. $STF_SUITE/include/libtest.shlib

#
# DESCRIPTION:
# Setting vdev io_scheduler property while reading from vdev should not cause panic.
#
# STRATEGY:
# 1. Create a zpool
# 2. Write a file to the pool.
# 3. Start reading from file, while also setting the io_scheduler property.
#

verify_runnable "global"

command -v fio > /dev/null || log_unsupported "fio missing"
log_must save_tunable DIO_ENABLED
log_must set_tunable32 DIO_ENABLED 1

function set_io_scheduler
{
zpool set scheduler=none $TESTPOOL1 $FILEDEV
sleep 0.1
zpool set scheduler=classic $TESTPOOL1 $FILEDEV
sleep 0.1
}

function cleanup
{
destroy_pool $TESTPOOL1
log_must rm -f $FILEDEV
log_must restore_tunable DIO_ENABLED
}

log_assert "Toggling vdev scheduler property while reading from vdev should not cause panic"
log_onexit cleanup

# 1. Create a pool

FILEDEV="$TEST_BASE_DIR/filedev.$$"
log_must truncate -s $(($MINVDEVSIZE * 2)) $FILEDEV
create_pool $TESTPOOL1 $FILEDEV

mntpnt=$(get_prop mountpoint $TESTPOOL1)

# 2. Write a file to the pool, while also setting the io_scheduler property.

log_must eval "fio --filename=$mntpnt/foobar --name=write-file \
--rw=write --size=$MINVDEVSIZE --bs=128k --numjobs=1 --direct=1 \
--ioengine=sync --time_based --runtime=10 &"

ITERATIONS=30

for i in $(seq $ITERATIONS); do
log_must set_io_scheduler
done;
wait

# 3. Starting reading from file, while also setting the io_scheduler property.

log_must eval "fio --filename=$mntpnt/foobar --name=read-file \
--rw=read --size=$MINVDEVSIZE --bs=128k --numjobs=1 --direct=1 \
--ioengine=sync --time_based --runtime=10 &"

for i in $(seq $ITERATIONS); do
log_must set_io_scheduler
done;
wait

log_pass "Setting vdev scheduler property while reading from vdev does not cause panic"
Loading