
Optimize allocation throttling. #12314


Merged: 1 commit, Jul 21, 2021
2 changes: 1 addition & 1 deletion include/sys/metaslab_impl.h
@@ -157,7 +157,7 @@ typedef struct metaslab_class_allocator {
*/
uint64_t mca_alloc_max_slots;
zfs_refcount_t mca_alloc_slots;
} metaslab_class_allocator_t;
} ____cacheline_aligned metaslab_class_allocator_t;

/*
* A metaslab class encompasses a category of allocatable top-level vdevs.
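The ____cacheline_aligned annotation pads and aligns each per-allocator structure to a CPU cache line so that refcount updates made on behalf of different allocators never contend on the same line. A portable sketch of the same idea, assuming a 64-byte line (illustration only, not the ZFS/SPL macro):

/*
 * Illustration only: align each per-allocator slot to an assumed 64-byte
 * cache line so two CPUs updating adjacent allocators never share a line.
 */
#include <stdalign.h>
#include <stdint.h>

#define	EXAMPLE_CACHE_LINE	64	/* assumption; the real size is per-arch */

typedef struct example_allocator {
	alignas(EXAMPLE_CACHE_LINE) uint64_t alloc_max_slots;
	uint64_t alloc_slots;		/* stands in for zfs_refcount_t */
} example_allocator_t;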
15 changes: 9 additions & 6 deletions include/sys/spa_impl.h
@@ -57,6 +57,11 @@
extern "C" {
#endif

typedef struct spa_alloc {
kmutex_t spaa_lock;
avl_tree_t spaa_tree;
} ____cacheline_aligned spa_alloc_t;

typedef struct spa_error_entry {
zbookmark_phys_t se_bookmark;
char *se_name;
@@ -250,13 +255,11 @@ struct spa {
list_t spa_config_dirty_list; /* vdevs with dirty config */
list_t spa_state_dirty_list; /* vdevs with dirty state */
/*
* spa_alloc_locks and spa_alloc_trees are arrays, whose lengths are
* stored in spa_alloc_count. There is one tree and one lock for each
* allocator, to help improve allocation performance in write-heavy
* workloads.
* spa_allocs is an array whose length is stored in spa_alloc_count.
* There is one tree and one lock for each allocator, to help improve
* allocation performance in write-heavy workloads.
*/
kmutex_t *spa_alloc_locks;
avl_tree_t *spa_alloc_trees;
spa_alloc_t *spa_allocs;
int spa_alloc_count;

spa_aux_vdev_t spa_spares; /* hot spares */
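This hunk replaces the two parallel per-allocator arrays (spa_alloc_locks and spa_alloc_trees) with a single array of cache-line-aligned spa_alloc_t elements, so each allocator's lock and queue tree sit together and neighboring allocators no longer false-share. A minimal sketch of how a caller reaches one allocator's state under the new layout (the helper name is hypothetical; the types and field names are those added above and assume the usual ZFS headers):

/*
 * Hypothetical helper: one indexed element now holds both the lock and
 * the queue tree for an allocator.
 */
static void
example_with_allocator_locked(spa_t *spa, int allocator,
    void (*func)(avl_tree_t *))
{
	spa_alloc_t *spaa = &spa->spa_allocs[allocator];

	mutex_enter(&spaa->spaa_lock);
	func(&spaa->spaa_tree);
	mutex_exit(&spaa->spaa_lock);
}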
20 changes: 4 additions & 16 deletions module/zfs/metaslab.c
@@ -5611,31 +5611,21 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
zio_t *zio, int flags)
{
metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
uint64_t available_slots = 0;
boolean_t slot_reserved = B_FALSE;
uint64_t max = mca->mca_alloc_max_slots;

ASSERT(mc->mc_alloc_throttle_enabled);
mutex_enter(&mc->mc_lock);

uint64_t reserved_slots = zfs_refcount_count(&mca->mca_alloc_slots);
if (reserved_slots < max)
available_slots = max - reserved_slots;

if (slots <= available_slots || GANG_ALLOCATION(flags) ||
flags & METASLAB_MUST_RESERVE) {
if (GANG_ALLOCATION(flags) || (flags & METASLAB_MUST_RESERVE) ||
zfs_refcount_count(&mca->mca_alloc_slots) + slots <= max) {
/*
* We reserve the slots individually so that we can unreserve
* them individually when an I/O completes.
*/
for (int d = 0; d < slots; d++)
zfs_refcount_add(&mca->mca_alloc_slots, zio);
zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
slot_reserved = B_TRUE;
return (B_TRUE);
}

mutex_exit(&mc->mc_lock);
return (slot_reserved);
return (B_FALSE);
}

void
@@ -5645,10 +5635,8 @@ metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];

ASSERT(mc->mc_alloc_throttle_enabled);
mutex_enter(&mc->mc_lock);
for (int d = 0; d < slots; d++)
zfs_refcount_remove(&mca->mca_alloc_slots, zio);
mutex_exit(&mc->mc_lock);
}

static int
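The reserve path above no longer takes mc_lock: it reads the refcount, admits the reservation if it still fits under mca_alloc_max_slots (or the I/O is a gang allocation or marked METASLAB_MUST_RESERVE), and then bumps the refcount once per slot. Concurrent reservers can both pass the check, so the cap becomes a soft limit; that small overshoot is the trade-off for dropping the class-wide mutex. A simplified, self-contained model of the pattern using plain C11 atomics rather than the zfs_refcount API:

/*
 * Simplified model of the lockless throttle check.  The load and the
 * increments are not one atomic step, so the limit may be exceeded by a
 * few slots under contention, mirroring the behavior the patch accepts.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static bool
throttle_reserve(atomic_uint_fast64_t *reserved, uint64_t max_slots,
    uint64_t slots, bool must_reserve)
{
	if (must_reserve ||
	    atomic_load(reserved) + slots <= max_slots) {
		for (uint64_t d = 0; d < slots; d++)
			atomic_fetch_add(reserved, 1);	/* one per slot */
		return (true);
	}
	return (false);
}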
12 changes: 6 additions & 6 deletions module/zfs/spa.c
@@ -9197,9 +9197,9 @@ spa_sync(spa_t *spa, uint64_t txg)
spa->spa_sync_pass = 0;

for (int i = 0; i < spa->spa_alloc_count; i++) {
mutex_enter(&spa->spa_alloc_locks[i]);
VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
mutex_exit(&spa->spa_alloc_locks[i]);
mutex_enter(&spa->spa_allocs[i].spaa_lock);
VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree));
mutex_exit(&spa->spa_allocs[i].spaa_lock);
}

/*
@@ -9309,9 +9309,9 @@ spa_sync(spa_t *spa, uint64_t txg)
dsl_pool_sync_done(dp, txg);

for (int i = 0; i < spa->spa_alloc_count; i++) {
mutex_enter(&spa->spa_alloc_locks[i]);
VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
mutex_exit(&spa->spa_alloc_locks[i]);
mutex_enter(&spa->spa_allocs[i].spaa_lock);
VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree));
mutex_exit(&spa->spa_allocs[i].spaa_lock);
}

/*
21 changes: 9 additions & 12 deletions module/zfs/spa_misc.c
@@ -700,13 +700,12 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa->spa_root = spa_strdup(altroot);

spa->spa_alloc_count = spa_allocators;
spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count *
sizeof (kmutex_t), KM_SLEEP);
spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count *
sizeof (avl_tree_t), KM_SLEEP);
spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count *
sizeof (spa_alloc_t), KM_SLEEP);
for (int i = 0; i < spa->spa_alloc_count; i++) {
mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL);
avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare,
mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT,
NULL);
avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
sizeof (zio_t), offsetof(zio_t, io_alloc_node));
}
avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
@@ -799,13 +798,11 @@ spa_remove(spa_t *spa)
}

for (int i = 0; i < spa->spa_alloc_count; i++) {
avl_destroy(&spa->spa_alloc_trees[i]);
mutex_destroy(&spa->spa_alloc_locks[i]);
avl_destroy(&spa->spa_allocs[i].spaa_tree);
mutex_destroy(&spa->spa_allocs[i].spaa_lock);
}
kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count *
sizeof (kmutex_t));
kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count *
sizeof (avl_tree_t));
kmem_free(spa->spa_allocs, spa->spa_alloc_count *
sizeof (spa_alloc_t));

avl_destroy(&spa->spa_metaslabs_by_flushed);
avl_destroy(&spa->spa_sm_logs_by_txg);
33 changes: 16 additions & 17 deletions module/zfs/zio.c
@@ -877,8 +877,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_bookmark = *zb;

if (pio != NULL) {
if (zio->io_metaslab_class == NULL)
zio->io_metaslab_class = pio->io_metaslab_class;
zio->io_metaslab_class = pio->io_metaslab_class;
if (zio->io_logical == NULL)
zio->io_logical = pio->io_logical;
if (zio->io_child_type == ZIO_CHILD_GANG)
@@ -3379,9 +3378,9 @@ zio_io_to_allocate(spa_t *spa, int allocator)
{
zio_t *zio;

ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator]));
ASSERT(MUTEX_HELD(&spa->spa_allocs[allocator].spaa_lock));

zio = avl_first(&spa->spa_alloc_trees[allocator]);
zio = avl_first(&spa->spa_allocs[allocator].spaa_tree);
if (zio == NULL)
return (NULL);

@@ -3393,11 +3392,11 @@ zio_io_to_allocate(spa_t *spa, int allocator)
*/
ASSERT3U(zio->io_allocator, ==, allocator);
if (!metaslab_class_throttle_reserve(zio->io_metaslab_class,
zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) {
zio->io_prop.zp_copies, allocator, zio, 0)) {
return (NULL);
}

avl_remove(&spa->spa_alloc_trees[allocator], zio);
avl_remove(&spa->spa_allocs[allocator].spaa_tree, zio);
ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);

return (zio);
@@ -3421,8 +3420,8 @@ zio_dva_throttle(zio_t *zio)
return (zio);
}

ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

ASSERT3U(zio->io_queued_timestamp, >, 0);
ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);

@@ -3434,14 +3433,14 @@
* into 2^20 block regions, and then hash based on the objset, object,
* level, and region to accomplish both of these goals.
*/
zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object,
int allocator = (uint_t)cityhash4(bm->zb_objset, bm->zb_object,
bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]);
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
zio->io_allocator = allocator;
zio->io_metaslab_class = mc;
avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio);
nio = zio_io_to_allocate(spa, zio->io_allocator);
mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]);
mutex_enter(&spa->spa_allocs[allocator].spaa_lock);
avl_add(&spa->spa_allocs[allocator].spaa_tree, zio);
nio = zio_io_to_allocate(spa, allocator);
mutex_exit(&spa->spa_allocs[allocator].spaa_lock);
return (nio);
}

@@ -3450,9 +3449,9 @@ zio_allocate_dispatch(spa_t *spa, int allocator)
{
zio_t *zio;

mutex_enter(&spa->spa_alloc_locks[allocator]);
mutex_enter(&spa->spa_allocs[allocator].spaa_lock);
zio = zio_io_to_allocate(spa, allocator);
mutex_exit(&spa->spa_alloc_locks[allocator]);
mutex_exit(&spa->spa_allocs[allocator].spaa_lock);
if (zio == NULL)
return;

@@ -3642,8 +3641,8 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
* some parallelism.
*/
int flags = METASLAB_FASTWRITE | METASLAB_ZIL;
int allocator = cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) %
spa->spa_alloc_count;
int allocator = (uint_t)cityhash4(0, 0, 0,
os->os_dsl_dataset->ds_object) % spa->spa_alloc_count;
error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
txg, NULL, flags, &io_alloc_list, NULL, allocator);
*slog = (error == 0);
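The comment in zio_dva_throttle() above describes the allocator-selection scheme these hunks preserve: block ids are grouped into 2^20-block regions, the bookmark fields are hashed, and the hash is reduced modulo spa_alloc_count, so writes to nearby offsets queue on the same allocator while unrelated streams spread across all of them. A self-contained sketch of that mapping, with hash_mix() standing in for the real cityhash4():

#include <stdint.h>

/* Stand-in mixer; ZFS uses cityhash4() here. */
static uint64_t
hash_mix(uint64_t a, uint64_t b, uint64_t c, uint64_t d)
{
	uint64_t h = a * 0x9e3779b97f4a7c15ULL;
	h ^= b + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
	h ^= c + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
	h ^= d + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
	return (h);
}

static int
pick_allocator(uint64_t objset, uint64_t object, uint64_t level,
    uint64_t blkid, int alloc_count)
{
	/* Writes within the same 2^20-block region map to one allocator. */
	return ((unsigned int)hash_mix(objset, object, level, blkid >> 20) %
	    alloc_count);
}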