
Unified allocation throttling #17020

Merged
merged 1 commit on Mar 24, 2025
2 changes: 2 additions & 0 deletions include/sys/dsl_pool.h
@@ -64,6 +64,8 @@ extern uint64_t zfs_wrlog_data_max;
extern uint_t zfs_dirty_data_max_percent;
extern uint_t zfs_dirty_data_max_max_percent;
extern uint_t zfs_delay_min_dirty_percent;
extern uint_t zfs_vdev_async_write_active_min_dirty_percent;
extern uint_t zfs_vdev_async_write_active_max_dirty_percent;
extern uint64_t zfs_delay_scale;

/* These macros are for indexing into the zfs_all_blkstats_t. */
28 changes: 12 additions & 16 deletions include/sys/metaslab.h
@@ -75,18 +75,13 @@ uint64_t metaslab_largest_allocatable(metaslab_t *);
/*
* metaslab alloc flags
*/
#define METASLAB_HINTBP_FAVOR 0x0
#define METASLAB_HINTBP_AVOID 0x1
#define METASLAB_ZIL 0x1
#define METASLAB_GANG_HEADER 0x2
#define METASLAB_GANG_CHILD 0x4
#define METASLAB_ASYNC_ALLOC 0x8
#define METASLAB_DONT_THROTTLE 0x10
#define METASLAB_MUST_RESERVE 0x20
#define METASLAB_ZIL 0x80

int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *,
int);
int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, blkptr_t *, int,
uint64_t, blkptr_t *, int, zio_alloc_list_t *, int, const void *);
int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t,
dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *, int);
void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
@@ -103,15 +98,17 @@ void metaslab_stat_fini(void);
void metaslab_trace_init(zio_alloc_list_t *);
void metaslab_trace_fini(zio_alloc_list_t *);

metaslab_class_t *metaslab_class_create(spa_t *, const metaslab_ops_t *);
metaslab_class_t *metaslab_class_create(spa_t *, const metaslab_ops_t *,
boolean_t);
void metaslab_class_destroy(metaslab_class_t *);
int metaslab_class_validate(metaslab_class_t *);
void metaslab_class_validate(metaslab_class_t *);
void metaslab_class_balance(metaslab_class_t *mc, boolean_t onsync);
void metaslab_class_histogram_verify(metaslab_class_t *);
uint64_t metaslab_class_fragmentation(metaslab_class_t *);
uint64_t metaslab_class_expandable_space(metaslab_class_t *);
boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int,
zio_t *, int);
void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *);
boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, zio_t *,
boolean_t, boolean_t *);
boolean_t metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *);
void metaslab_class_evict_old(metaslab_class_t *, uint64_t);
uint64_t metaslab_class_get_alloc(metaslab_class_t *);
uint64_t metaslab_class_get_space(metaslab_class_t *);
@@ -130,9 +127,8 @@ uint64_t metaslab_group_get_space(metaslab_group_t *);
void metaslab_group_histogram_verify(metaslab_group_t *);
uint64_t metaslab_group_fragmentation(metaslab_group_t *);
void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
void metaslab_group_alloc_decrement(spa_t *, uint64_t, const void *, int, int,
boolean_t);
void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, const void *, int);
void metaslab_group_alloc_decrement(spa_t *, uint64_t, int, int, uint64_t,
const void *);
void metaslab_recalculate_weight_and_sort(metaslab_t *);
void metaslab_disable(metaslab_t *);
void metaslab_enable(metaslab_t *, boolean_t, boolean_t);
61 changes: 16 additions & 45 deletions include/sys/metaslab_impl.h
@@ -141,23 +141,24 @@ typedef enum trace_alloc_type {
* Per-allocator data structure.
*/
typedef struct metaslab_class_allocator {
kmutex_t mca_lock;
avl_tree_t mca_tree;

metaslab_group_t *mca_rotor;
uint64_t mca_aliquot;

/*
* The allocation throttle works on a reservation system. Whenever
* an asynchronous zio wants to perform an allocation it must
* first reserve the number of blocks that it wants to allocate.
* first reserve the number of bytes that it wants to allocate.
* If there aren't sufficient slots available for the pending zio
* then that I/O is throttled until more slots free up. The current
* number of reserved allocations is maintained by the mca_alloc_slots
* refcount. The mca_alloc_max_slots value determines the maximum
* number of allocations that the system allows. Gang blocks are
* allowed to reserve slots even if we've reached the maximum
* number of allocations allowed.
* size of reserved allocations is maintained by mca_reserved.
* The maximum total size of reserved allocations is determined by
* mc_alloc_max in the metaslab_class_t. Gang blocks are allowed
* to reserve for their headers even if we've reached the maximum.
*/
uint64_t mca_alloc_max_slots;
zfs_refcount_t mca_alloc_slots;
uint64_t mca_reserved;
} ____cacheline_aligned metaslab_class_allocator_t;
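To make the reservation scheme described in the comment above concrete, here is a minimal sketch of a byte-based reserve check. It is an illustration only, written against the usual ZFS kernel context (`boolean_t`, `mutex_enter`); the function name and the `class_max`/`must` parameters are invented for this example, while `mca_lock` and `mca_reserved` are the fields declared in the struct above. The PR's actual `metaslab_class_throttle_reserve()` interface differs.

```c
/*
 * Illustrative sketch of a byte-based reservation check: reserve `size`
 * bytes against the class-wide limit `class_max`, but let callers that
 * must not fail (e.g. gang headers) exceed it.  Names other than
 * mca_lock/mca_reserved are hypothetical.
 */
static boolean_t
example_throttle_reserve(metaslab_class_allocator_t *mca, uint64_t class_max,
    uint64_t size, boolean_t must)
{
	boolean_t granted = B_FALSE;

	mutex_enter(&mca->mca_lock);
	if (must || mca->mca_reserved + size <= class_max) {
		mca->mca_reserved += size;	/* track reserved bytes */
		granted = B_TRUE;
	}
	mutex_exit(&mca->mca_lock);
	return (granted);
}
```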

/*
@@ -190,10 +191,10 @@ struct metaslab_class {
*/
uint64_t mc_groups;

/*
* Toggle to enable/disable the allocation throttle.
*/
boolean_t mc_is_log;
boolean_t mc_alloc_throttle_enabled;
uint64_t mc_alloc_io_size;
uint64_t mc_alloc_max;

uint64_t mc_alloc_groups; /* # of allocatable groups */

@@ -216,11 +217,10 @@ struct metaslab_class {
* Per-allocator data structure.
*/
typedef struct metaslab_group_allocator {
uint64_t mga_cur_max_alloc_queue_depth;
zfs_refcount_t mga_alloc_queue_depth;
zfs_refcount_t mga_queue_depth;
metaslab_t *mga_primary;
metaslab_t *mga_secondary;
} metaslab_group_allocator_t;
} ____cacheline_aligned metaslab_group_allocator_t;

/*
* Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs)
Expand All @@ -235,6 +235,7 @@ struct metaslab_group {
kmutex_t mg_lock;
avl_tree_t mg_metaslab_tree;
uint64_t mg_aliquot;
uint64_t mg_queue_target;
boolean_t mg_allocatable; /* can we allocate? */
uint64_t mg_ms_ready;

@@ -246,40 +247,12 @@ struct metaslab_group {
*/
boolean_t mg_initialized;

uint64_t mg_free_capacity; /* percentage free */
int64_t mg_bias;
int64_t mg_activation_count;
metaslab_class_t *mg_class;
vdev_t *mg_vd;
metaslab_group_t *mg_prev;
metaslab_group_t *mg_next;

/*
* In order for the allocation throttle to function properly, we cannot
* have too many IOs going to each disk by default; the throttle
* operates by allocating more work to disks that finish quickly, so
* allocating larger chunks to each disk reduces its effectiveness.
* However, if the number of IOs going to each allocator is too small,
* we will not perform proper aggregation at the vdev_queue layer,
* also resulting in decreased performance. Therefore, we will use a
* ramp-up strategy.
*
* Each allocator in each metaslab group has a current queue depth
* (mg_alloc_queue_depth[allocator]) and a current max queue depth
* (mga_cur_max_alloc_queue_depth[allocator]), and each metaslab group
* has an absolute max queue depth (mg_max_alloc_queue_depth). We
* add IOs to an allocator until the mg_alloc_queue_depth for that
* allocator hits the cur_max. Every time an IO completes for a given
* allocator on a given metaslab group, we increment its cur_max until
* it reaches mg_max_alloc_queue_depth. The cur_max resets every txg to
* help protect against disks that decrease in performance over time.
*
* It's possible for an allocator to handle more allocations than
* its max. This can occur when gang blocks are required or when other
* groups are unable to handle their share of allocations.
*/
uint64_t mg_max_alloc_queue_depth;

/*
A metaslab group that can no longer allocate the minimum block
* size will set mg_no_free_space. Once a metaslab group is out
Expand All @@ -288,8 +261,6 @@ struct metaslab_group {
*/
boolean_t mg_no_free_space;

uint64_t mg_allocations;
uint64_t mg_failed_allocations;
uint64_t mg_fragmentation;
uint64_t mg_histogram[ZFS_RANGE_TREE_HISTOGRAM_SIZE];

@@ -508,7 +479,7 @@ struct metaslab {
*/
hrtime_t ms_load_time; /* time last loaded */
hrtime_t ms_unload_time; /* time last unloaded */
hrtime_t ms_selected_time; /* time last allocated from */
uint64_t ms_selected_time; /* time last allocated from (secs) */

uint64_t ms_alloc_txg; /* last successful alloc (debug only) */
uint64_t ms_max_size; /* maximum allocatable size */
11 changes: 0 additions & 11 deletions include/sys/spa_impl.h
@@ -59,11 +59,6 @@
extern "C" {
#endif

typedef struct spa_alloc {
kmutex_t spaa_lock;
avl_tree_t spaa_tree;
} ____cacheline_aligned spa_alloc_t;

typedef struct spa_allocs_use {
kmutex_t sau_lock;
uint_t sau_rotor;
@@ -273,12 +268,6 @@ struct spa {
uint64_t spa_last_synced_guid; /* last synced guid */
list_t spa_config_dirty_list; /* vdevs with dirty config */
list_t spa_state_dirty_list; /* vdevs with dirty state */
/*
* spa_allocs is an array whose length is stored in spa_alloc_count.
* There is one tree and one lock for each allocator, to help improve
* allocation performance in write-heavy workloads.
*/
spa_alloc_t *spa_allocs;
spa_allocs_use_t *spa_allocs_use;
int spa_alloc_count;
int spa_active_allocator; /* selectable allocator */
4 changes: 0 additions & 4 deletions include/sys/vdev_impl.h
@@ -60,10 +60,6 @@ extern "C" {
typedef struct vdev_queue vdev_queue_t;
struct abd;

extern uint_t zfs_vdev_queue_depth_pct;
extern uint_t zfs_vdev_def_queue_depth;
extern uint_t zfs_vdev_async_write_max_active;

/*
* Virtual device operations
*/
44 changes: 17 additions & 27 deletions man/man4/zfs.4
@@ -245,16 +245,26 @@ For L2ARC devices less than 1 GiB, the amount of data
evicts is significant compared to the amount of restored L2ARC data.
In this case, do not write log blocks in L2ARC in order not to waste space.
.
.It Sy metaslab_aliquot Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64
Metaslab granularity, in bytes.
.It Sy metaslab_aliquot Ns = Ns Sy 2097152 Ns B Po 2 MiB Pc Pq u64
Metaslab group's per child vdev allocation granularity, in bytes.
This is roughly similar to what would be referred to as the "stripe size"
in traditional RAID arrays.
In normal operation, ZFS will try to write this amount of data to each disk
before moving on to the next top-level vdev.
In normal operation, ZFS will try to write this amount of data to each child
of a top-level vdev before moving on to the next top-level vdev.
.
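As a rough sketch of the behaviour described above (a toy model, not ZFS code; every name in it is hypothetical): allocations keep going to the current child until roughly one aliquot has accumulated there, then the rotor advances to the next child.

```c
/* Toy model of aliquot-style rotoring; illustration only, not ZFS code. */
#include <stdint.h>

typedef struct toy_rotor {
	int		tr_children;	/* number of child vdevs */
	int		tr_current;	/* child currently being filled */
	uint64_t	tr_filled;	/* bytes allocated to current child */
	uint64_t	tr_aliquot;	/* e.g. 2097152 (2 MiB) */
} toy_rotor_t;

/* Return the child index to use for an allocation of `size` bytes. */
static int
toy_pick_child(toy_rotor_t *tr, uint64_t size)
{
	if (tr->tr_filled + size > tr->tr_aliquot) {
		tr->tr_current = (tr->tr_current + 1) % tr->tr_children;
		tr->tr_filled = 0;
	}
	tr->tr_filled += size;
	return (tr->tr_current);
}
```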
.It Sy metaslab_bias_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
Enable metaslab group biasing based on their vdevs' over- or under-utilization
relative to the pool.
Enable metaslab group biasing based on their over- or under-utilization
relative to the metaslab class average.
If disabled, each metaslab group will receive allocations proportional to its
capacity.
.
.It Sy metaslab_perf_bias Ns = Ns Sy 1 Ns | Ns 0 Ns | Ns 2 Pq int
Controls metaslab group biasing based on write performance.
Setting to 0 makes all metaslab groups receive fixed amounts of allocations.
Setting to 2 allows faster metaslab groups to allocate more.
Setting to 1 behaves like 2 if the pool is write-bound, and like 0 otherwise.
That is, if the pool is limited by write throughput, then allocate more from
faster metaslab groups, but if not, try to evenly distribute the allocations.
.
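The mode selection above could be summarized by a small decision helper such as the sketch below. This is an illustration with made-up names, not code from the module; `pool_write_bound` stands in for whatever internal signal ZFS uses to detect a write-bound pool.

```c
#include <stdbool.h>

/*
 * Hypothetical helper, illustration only: reduce the three
 * metaslab_perf_bias settings to a yes/no "use performance bias" decision.
 */
static bool
perf_bias_active(int perf_bias_setting, bool pool_write_bound)
{
	switch (perf_bias_setting) {
	case 0:
		return (false);		/* fixed, even distribution */
	case 2:
		return (true);		/* always favor faster groups */
	default:			/* 1: automatic */
		return (pool_write_bound);
	}
}
```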
.It Sy metaslab_force_ganging Ns = Ns Sy 16777217 Ns B Po 16 MiB + 1 B Pc Pq u64
Make some blocks above a certain size be gang blocks.
@@ -1527,23 +1537,6 @@ This enforced wait ensures the HDD services the interactive I/O
within a reasonable amount of time.
.No See Sx ZFS I/O SCHEDULER .
.
.It Sy zfs_vdev_queue_depth_pct Ns = Ns Sy 1000 Ns % Pq uint
Maximum number of queued allocations per top-level vdev expressed as
a percentage of
.Sy zfs_vdev_async_write_max_active ,
which allows the system to detect devices that are more capable
of handling allocations and to allocate more blocks to those devices.
This allows for dynamic allocation distribution when devices are imbalanced,
as fuller devices will tend to be slower than empty devices.
.Pp
Also see
.Sy zio_dva_throttle_enabled .
.
.It Sy zfs_vdev_def_queue_depth Ns = Ns Sy 32 Pq uint
Default queue depth for each vdev IO allocator.
Higher values allow for better coalescing of sequential writes before sending
them to the disk, but can increase transaction commit times.
.
.It Sy zfs_vdev_failfast_mask Ns = Ns Sy 1 Pq uint
Defines if the driver should retire on a given error type.
The following options may be bitwise-ored together:
@@ -2488,10 +2481,7 @@ Slow I/O counters can be seen with
.
.It Sy zio_dva_throttle_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
Throttle block allocations in the I/O pipeline.
This allows for dynamic allocation distribution when devices are imbalanced.
When enabled, the maximum number of pending allocations per top-level vdev
is limited by
.Sy zfs_vdev_queue_depth_pct .
This allows for dynamic allocation distribution based on device performance.
.
.It Sy zfs_xattr_compat Ns = Ns 0 Ns | Ns 1 Pq int
Control the naming scheme used when setting new xattrs in the user namespace.