
Commit a608884

Jianchao Wang authored and axboe committed
block: kyber: make kyber more friendly with merging
Currently, kyber is very unfriendly to merging. kyber depends on the ctx
rq_list to do merging, but most of the time it will not leave any requests
in the ctx rq_list. This is because even if the tokens of one domain are
used up, kyber will still try to dispatch requests from the other domains
and flush the whole rq_list in the process, so nothing is left behind to
merge against.

To improve this, we set up a kyber_ctx_queue (kcq), which is similar to a
ctx but has one rq_list per domain, and build the same mapping between kcq
and khd as between ctx and hctx. We can then merge, insert and dispatch
for each domain separately. At the same time, the rq_list of a kcq is only
flushed once a domain token has been obtained, so if one domain's tokens
are used up, its requests stay on that domain's rq_list and may be merged
with following IO.

Following is my test result on a machine with 8 cores and an NVMe card
(INTEL SSDPEKKR128G7):

fio size=256m ioengine=libaio iodepth=64 direct=1 numjobs=8, seq/random

+------+----------+-----------+------------+-----------------+---------+
|patch?| bw(MB/s) | iops      | slat(usec) | clat(usec)      | merge   |
+------+----------+-----------+------------+-----------------+---------+
| w/o  | 606/612  | 151k/153k | 6.89/7.03  | 3349.21/3305.40 | 0/0     |
+------+----------+-----------+------------+-----------------+---------+
| w/   | 1083/616 | 277k/154k | 4.93/6.95  | 1830.62/3279.95 | 223k/3k |
+------+----------+-----------+------------+-----------------+---------+

With numjobs set to 16, bw and iops reach 1662MB/s and 425k on my platform.

Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
1 parent 9c55873 commit a608884
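
To make the diff easier to follow, here is a self-contained userspace C sketch of the core idea: each software queue gets a kcq with one pending-request list per scheduling domain, and an incoming bio can merge into a request that is still parked there. All toy_* names and the simplistic back-merge check are invented for this illustration only; the real patch uses struct kyber_ctx_queue, a per-kcq spinlock and blk_mq_bio_list_merge(), as the hunks below show.

/*
 * Toy, single-threaded userspace model of the kcq idea in this commit.
 * The toy_* names are invented for illustration and are not kernel APIs.
 */
#include <stdbool.h>
#include <stdio.h>

enum { KYBER_READ, KYBER_SYNC_WRITE, KYBER_OTHER, KYBER_NUM_DOMAINS };

struct toy_request {
        unsigned long long sector;      /* first sector of the request */
        unsigned int nr_sectors;        /* request length in sectors */
        struct toy_request *next;
};

/* One "kcq": a pending-request list per scheduling domain. */
struct toy_kcq {
        struct toy_request *rq_list[KYBER_NUM_DOMAINS];
};

/* Park a request on its domain's list instead of dispatching it at once. */
static void toy_insert(struct toy_kcq *kcq, int domain, struct toy_request *rq)
{
        rq->next = kcq->rq_list[domain];
        kcq->rq_list[domain] = rq;
}

/*
 * Try to back-merge an incoming bio (sector, nr_sectors) into a request that
 * is still waiting on the per-domain list; keeping requests parked there when
 * a domain is out of tokens is what makes this hit often.
 */
static bool toy_bio_merge(struct toy_kcq *kcq, int domain,
                          unsigned long long sector, unsigned int nr_sectors)
{
        struct toy_request *rq;

        for (rq = kcq->rq_list[domain]; rq; rq = rq->next) {
                if (rq->sector + rq->nr_sectors == sector) {
                        rq->nr_sectors += nr_sectors;   /* contiguous: extend */
                        return true;
                }
        }
        return false;
}

int main(void)
{
        struct toy_kcq kcq = { { NULL } };
        struct toy_request rq = { .sector = 0, .nr_sectors = 8 };

        /* No read token available: the request stays queued in the kcq... */
        toy_insert(&kcq, KYBER_READ, &rq);

        /* ...so the next contiguous read bio merges instead of queueing. */
        if (toy_bio_merge(&kcq, KYBER_READ, 8, 8))
                printf("merged: request now spans %u sectors\n", rq.nr_sectors);

        return 0;
}

The point of the sketch: as long as requests can sit on a per-domain list, a later contiguous bio has something to merge with; the patch makes that the normal case whenever a domain has run out of tokens.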

File tree

1 file changed (+158, -32 lines)


block/kyber-iosched.c

Lines changed: 158 additions & 32 deletions
@@ -72,6 +72,19 @@ static const unsigned int kyber_batch_size[] = {
         [KYBER_OTHER] = 8,
 };
 
+/*
+ * There is a same mapping between ctx & hctx and kcq & khd,
+ * we use request->mq_ctx->index_hw to index the kcq in khd.
+ */
+struct kyber_ctx_queue {
+        /*
+         * Used to ensure operations on rq_list and kcq_map to be an atomic one.
+         * Also protect the rqs on rq_list when merge.
+         */
+        spinlock_t lock;
+        struct list_head rq_list[KYBER_NUM_DOMAINS];
+} ____cacheline_aligned_in_smp;
+
 struct kyber_queue_data {
         struct request_queue *q;
 
@@ -99,6 +112,8 @@ struct kyber_hctx_data {
         struct list_head rqs[KYBER_NUM_DOMAINS];
         unsigned int cur_domain;
         unsigned int batching;
+        struct kyber_ctx_queue *kcqs;
+        struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
         wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
         struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
         atomic_t wait_index[KYBER_NUM_DOMAINS];
@@ -107,10 +122,8 @@ struct kyber_hctx_data {
 static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
                              void *key);
 
-static int rq_sched_domain(const struct request *rq)
+static unsigned int kyber_sched_domain(unsigned int op)
 {
-        unsigned int op = rq->cmd_flags;
-
         if ((op & REQ_OP_MASK) == REQ_OP_READ)
                 return KYBER_READ;
         else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
@@ -284,6 +297,11 @@ static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
         return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
 }
 
+static int kyber_bucket_fn(const struct request *rq)
+{
+        return kyber_sched_domain(rq->cmd_flags);
+}
+
 static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
 {
         struct kyber_queue_data *kqd;
@@ -297,7 +315,7 @@ static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
                 goto err;
         kqd->q = q;
 
-        kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, rq_sched_domain,
+        kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn,
                                           KYBER_NUM_DOMAINS, kqd);
         if (!kqd->cb)
                 goto err_kqd;
@@ -376,6 +394,15 @@ static void kyber_exit_sched(struct elevator_queue *e)
         kfree(kqd);
 }
 
+static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq)
+{
+        unsigned int i;
+
+        spin_lock_init(&kcq->lock);
+        for (i = 0; i < KYBER_NUM_DOMAINS; i++)
+                INIT_LIST_HEAD(&kcq->rq_list[i]);
+}
+
 static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 {
         struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
@@ -386,6 +413,24 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
         if (!khd)
                 return -ENOMEM;
 
+        khd->kcqs = kmalloc_array_node(hctx->nr_ctx,
+                                       sizeof(struct kyber_ctx_queue),
+                                       GFP_KERNEL, hctx->numa_node);
+        if (!khd->kcqs)
+                goto err_khd;
+
+        for (i = 0; i < hctx->nr_ctx; i++)
+                kyber_ctx_queue_init(&khd->kcqs[i]);
+
+        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
+                if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx,
+                                      ilog2(8), GFP_KERNEL, hctx->numa_node)) {
+                        while (--i >= 0)
+                                sbitmap_free(&khd->kcq_map[i]);
+                        goto err_kcqs;
+                }
+        }
+
         spin_lock_init(&khd->lock);
 
         for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
@@ -405,10 +450,22 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
                                         kqd->async_depth);
 
         return 0;
+
+err_kcqs:
+        kfree(khd->kcqs);
+err_khd:
+        kfree(khd);
+        return -ENOMEM;
 }
 
 static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 {
+        struct kyber_hctx_data *khd = hctx->sched_data;
+        int i;
+
+        for (i = 0; i < KYBER_NUM_DOMAINS; i++)
+                sbitmap_free(&khd->kcq_map[i]);
+        kfree(khd->kcqs);
         kfree(hctx->sched_data);
 }
 
@@ -430,7 +487,7 @@ static void rq_clear_domain_token(struct kyber_queue_data *kqd,
 
         nr = rq_get_domain_token(rq);
         if (nr != -1) {
-                sched_domain = rq_sched_domain(rq);
+                sched_domain = kyber_sched_domain(rq->cmd_flags);
                 sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
                                     rq->mq_ctx->cpu);
         }
@@ -449,11 +506,51 @@ static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
         }
 }
 
+static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
+{
+        struct kyber_hctx_data *khd = hctx->sched_data;
+        struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
+        struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw];
+        unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
+        struct list_head *rq_list = &kcq->rq_list[sched_domain];
+        bool merged;
+
+        spin_lock(&kcq->lock);
+        merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio);
+        spin_unlock(&kcq->lock);
+        blk_mq_put_ctx(ctx);
+
+        return merged;
+}
+
 static void kyber_prepare_request(struct request *rq, struct bio *bio)
 {
         rq_set_domain_token(rq, -1);
 }
 
+static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
+                                  struct list_head *rq_list, bool at_head)
+{
+        struct kyber_hctx_data *khd = hctx->sched_data;
+        struct request *rq, *next;
+
+        list_for_each_entry_safe(rq, next, rq_list, queuelist) {
+                unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
+                struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw];
+                struct list_head *head = &kcq->rq_list[sched_domain];
+
+                spin_lock(&kcq->lock);
+                if (at_head)
+                        list_move(&rq->queuelist, head);
+                else
+                        list_move_tail(&rq->queuelist, head);
+                sbitmap_set_bit(&khd->kcq_map[sched_domain],
+                                rq->mq_ctx->index_hw);
+                blk_mq_sched_request_inserted(rq);
+                spin_unlock(&kcq->lock);
+        }
+}
+
 static void kyber_finish_request(struct request *rq)
 {
         struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
@@ -472,7 +569,7 @@ static void kyber_completed_request(struct request *rq)
          * Check if this request met our latency goal. If not, quickly gather
          * some statistics and start throttling.
          */
-        sched_domain = rq_sched_domain(rq);
+        sched_domain = kyber_sched_domain(rq->cmd_flags);
         switch (sched_domain) {
         case KYBER_READ:
                 target = kqd->read_lat_nsec;
@@ -498,19 +595,38 @@ static void kyber_completed_request(struct request *rq)
                 blk_stat_activate_msecs(kqd->cb, 10);
 }
 
-static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd,
-                                  struct blk_mq_hw_ctx *hctx)
+struct flush_kcq_data {
+        struct kyber_hctx_data *khd;
+        unsigned int sched_domain;
+        struct list_head *list;
+};
+
+static bool flush_busy_kcq(struct sbitmap *sb, unsigned int bitnr, void *data)
 {
-        LIST_HEAD(rq_list);
-        struct request *rq, *next;
+        struct flush_kcq_data *flush_data = data;
+        struct kyber_ctx_queue *kcq = &flush_data->khd->kcqs[bitnr];
 
-        blk_mq_flush_busy_ctxs(hctx, &rq_list);
-        list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
-                unsigned int sched_domain;
+        spin_lock(&kcq->lock);
+        list_splice_tail_init(&kcq->rq_list[flush_data->sched_domain],
+                              flush_data->list);
+        sbitmap_clear_bit(sb, bitnr);
+        spin_unlock(&kcq->lock);
 
-                sched_domain = rq_sched_domain(rq);
-                list_move_tail(&rq->queuelist, &khd->rqs[sched_domain]);
-        }
+        return true;
+}
+
+static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd,
+                                  unsigned int sched_domain,
+                                  struct list_head *list)
+{
+        struct flush_kcq_data data = {
+                .khd = khd,
+                .sched_domain = sched_domain,
+                .list = list,
+        };
+
+        sbitmap_for_each_set(&khd->kcq_map[sched_domain],
+                             flush_busy_kcq, &data);
 }
 
 static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
@@ -573,26 +689,23 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
 static struct request *
 kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
                           struct kyber_hctx_data *khd,
-                          struct blk_mq_hw_ctx *hctx,
-                          bool *flushed)
+                          struct blk_mq_hw_ctx *hctx)
 {
         struct list_head *rqs;
         struct request *rq;
         int nr;
 
         rqs = &khd->rqs[khd->cur_domain];
-        rq = list_first_entry_or_null(rqs, struct request, queuelist);
 
         /*
-         * If there wasn't already a pending request and we haven't flushed the
-         * software queues yet, flush the software queues and check again.
+         * If we already have a flushed request, then we just need to get a
+         * token for it. Otherwise, if there are pending requests in the kcqs,
+         * flush the kcqs, but only if we can get a token. If not, we should
+         * leave the requests in the kcqs so that they can be merged. Note that
+         * khd->lock serializes the flushes, so if we observed any bit set in
+         * the kcq_map, we will always get a request.
          */
-        if (!rq && !*flushed) {
-                kyber_flush_busy_ctxs(khd, hctx);
-                *flushed = true;
-                rq = list_first_entry_or_null(rqs, struct request, queuelist);
-        }
-
+        rq = list_first_entry_or_null(rqs, struct request, queuelist);
         if (rq) {
                 nr = kyber_get_domain_token(kqd, khd, hctx);
                 if (nr >= 0) {
@@ -601,6 +714,16 @@ kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
                         list_del_init(&rq->queuelist);
                         return rq;
                 }
+        } else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
+                nr = kyber_get_domain_token(kqd, khd, hctx);
+                if (nr >= 0) {
+                        kyber_flush_busy_kcqs(khd, khd->cur_domain, rqs);
+                        rq = list_first_entry(rqs, struct request, queuelist);
+                        khd->batching++;
+                        rq_set_domain_token(rq, nr);
+                        list_del_init(&rq->queuelist);
+                        return rq;
+                }
         }
 
         /* There were either no pending requests or no tokens. */
@@ -611,7 +734,6 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
 {
         struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
         struct kyber_hctx_data *khd = hctx->sched_data;
-        bool flushed = false;
         struct request *rq;
         int i;
 
@@ -622,7 +744,7 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
          * from the batch.
          */
         if (khd->batching < kyber_batch_size[khd->cur_domain]) {
-                rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
+                rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
                 if (rq)
                         goto out;
         }
@@ -643,7 +765,7 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
                 else
                         khd->cur_domain++;
 
-                rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
+                rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
                 if (rq)
                         goto out;
         }
@@ -660,10 +782,12 @@ static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
         int i;
 
         for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
-                if (!list_empty_careful(&khd->rqs[i]))
+                if (!list_empty_careful(&khd->rqs[i]) ||
+                    sbitmap_any_bit_set(&khd->kcq_map[i]))
                         return true;
         }
-        return sbitmap_any_bit_set(&hctx->ctx_map);
+
+        return false;
 }
 
 #define KYBER_LAT_SHOW_STORE(op)                                        \
@@ -834,7 +958,9 @@ static struct elevator_type kyber_sched = {
                 .init_hctx = kyber_init_hctx,
                 .exit_hctx = kyber_exit_hctx,
                 .limit_depth = kyber_limit_depth,
+                .bio_merge = kyber_bio_merge,
                 .prepare_request = kyber_prepare_request,
+                .insert_requests = kyber_insert_requests,
                 .finish_request = kyber_finish_request,
                 .requeue_request = kyber_finish_request,
                 .completed_request = kyber_completed_request,
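
To close, a second standalone toy (again with invented names, and plain counters in place of real request lists) that mirrors the token-gated flush in kyber_dispatch_cur_domain() above: a kcq is only drained into the dispatch list once a domain token is actually available; otherwise its requests stay put and remain merge candidates.

/*
 * Standalone toy model of the token-gated flush. Counters stand in for the
 * kcq rq_lists and khd->rqs; none of these names exist in the kernel.
 */
#include <stdbool.h>
#include <stdio.h>

enum { KYBER_READ, KYBER_SYNC_WRITE, KYBER_OTHER, KYBER_NUM_DOMAINS };

struct toy_sched {
        int kcq_pending[KYBER_NUM_DOMAINS];     /* requests still parked in kcqs */
        int dispatch_ready[KYBER_NUM_DOMAINS];  /* requests already flushed */
        int tokens[KYBER_NUM_DOMAINS];          /* free domain tokens */
};

/* Return true if a request from @domain was dispatched. */
static bool toy_dispatch(struct toy_sched *t, int domain)
{
        /* Nothing flushed yet: only drain the kcq if a token is available. */
        if (!t->dispatch_ready[domain] && t->kcq_pending[domain]) {
                if (!t->tokens[domain])
                        return false;   /* no token: leave rqs parked, still mergeable */
                t->dispatch_ready[domain] = t->kcq_pending[domain];
                t->kcq_pending[domain] = 0;
        }

        if (t->dispatch_ready[domain] && t->tokens[domain]) {
                t->tokens[domain]--;
                t->dispatch_ready[domain]--;
                return true;
        }
        return false;
}

int main(void)
{
        struct toy_sched t = { .kcq_pending = { [KYBER_READ] = 4 } };

        /* No tokens yet: the 4 reads stay in the kcq as merge candidates. */
        printf("dispatch without tokens: %d\n", toy_dispatch(&t, KYBER_READ));

        /* A token frees up: the kcq is flushed and one read is dispatched. */
        t.tokens[KYBER_READ] = 1;
        printf("dispatch with a token:   %d\n", toy_dispatch(&t, KYBER_READ));
        printf("left in kcq: %d, flushed and waiting: %d\n",
               t.kcq_pending[KYBER_READ], t.dispatch_ready[KYBER_READ]);
        return 0;
}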
