
Commit 86ff7c2

Ming Lei authored and axboe committed
blk-mq: introduce BLK_STS_DEV_RESOURCE
This status is returned from the driver to the block layer if a device
related resource is unavailable, but the driver can guarantee that IO
dispatch will be triggered in the future when the resource becomes
available. Convert some drivers to return BLK_STS_DEV_RESOURCE. Also, if
the driver returns BLK_STS_RESOURCE and SCHED_RESTART is set, rerun the
queue after a delay (BLK_MQ_RESOURCE_DELAY) to avoid IO stalls.
BLK_MQ_RESOURCE_DELAY is 3 ms because both scsi-mq and nvmefc were
already using that magic value.

If a driver can make sure there is in-flight IO, it is safe to return
BLK_STS_DEV_RESOURCE because:

1) If all in-flight IOs complete before examining SCHED_RESTART in
   blk_mq_dispatch_rq_list(), SCHED_RESTART must be cleared, so the
   queue is run immediately in this case by blk_mq_dispatch_rq_list().

2) If there is any in-flight IO after/when examining SCHED_RESTART in
   blk_mq_dispatch_rq_list():
   - if SCHED_RESTART isn't set, the queue is run immediately as
     handled in 1)
   - otherwise, this request will be dispatched after any in-flight IO
     completes, via blk_mq_sched_restart()

3) If SCHED_RESTART is set concurrently in another context because of
   BLK_STS_RESOURCE, blk_mq_delay_run_hw_queue() covers the above two
   cases and makes sure IO hangs are avoided.

One invariant is that the queue will be rerun if SCHED_RESTART is set.

Suggested-by: Jens Axboe <axboe@kernel.dk>
Tested-by: Laurence Oberman <loberman@redhat.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
1 parent 0a4b6e2 commit 86ff7c2
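
To make the rule in the commit message concrete, here is a minimal userspace sketch (the enum values, struct my_dev, and queue_one() are illustrative stand-ins for this note, not kernel APIs): a driver should report the new device-resource status only when it can point at in-flight IO whose completion is guaranteed to rerun the queue; otherwise BLK_STS_RESOURCE remains the safe answer, and blk-mq reruns the queue after BLK_MQ_RESOURCE_DELAY when SCHED_RESTART is set.

#include <stdio.h>

/* Illustrative stand-ins for blk_status_t values; not the kernel definitions. */
enum blk_status { BLK_STS_OK, BLK_STS_RESOURCE, BLK_STS_DEV_RESOURCE };

struct my_dev {                 /* hypothetical device state */
        int free_tags;          /* device-specific resource, freed on IO completion */
        int inflight;           /* IOs currently owned by the hardware */
};

/* Model of the resource handling in a driver's ->queue_rq() hook. */
static enum blk_status queue_one(struct my_dev *dev)
{
        if (dev->free_tags > 0) {
                dev->free_tags--;
                dev->inflight++;
                return BLK_STS_OK;
        }
        /*
         * Out of device resources. Claim BLK_STS_DEV_RESOURCE only if
         * in-flight IO is guaranteed to complete and rerun the queue;
         * otherwise return BLK_STS_RESOURCE so the block layer can fall
         * back to its delayed rerun.
         */
        return dev->inflight ? BLK_STS_DEV_RESOURCE : BLK_STS_RESOURCE;
}

int main(void)
{
        struct my_dev dev = { .free_tags = 1, .inflight = 0 };
        enum blk_status a = queue_one(&dev);    /* BLK_STS_OK: tag available    */
        enum blk_status b = queue_one(&dev);    /* BLK_STS_DEV_RESOURCE: busy,  */
        enum blk_status c = queue_one(&dev);    /* in-flight IO will rerun us   */

        printf("%d %d %d\n", a, b, c);          /* prints "0 2 2" */
        return 0;
}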

File tree

9 files changed: +45, -23 lines


block/blk-core.c
Lines changed: 1 addition & 0 deletions

@@ -145,6 +145,7 @@ static const struct {
         [BLK_STS_MEDIUM]        = { -ENODATA,  "critical medium" },
         [BLK_STS_PROTECTION]    = { -EILSEQ,   "protection" },
         [BLK_STS_RESOURCE]      = { -ENOMEM,   "kernel resource" },
+        [BLK_STS_DEV_RESOURCE]  = { -EBUSY,    "device resource" },
         [BLK_STS_AGAIN]         = { -EAGAIN,   "nonblocking retry" },
 
         /* device mapper special case, should not leak out: */

block/blk-mq.c
Lines changed: 16 additions & 4 deletions

@@ -1162,13 +1162,16 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
         return true;
 }
 
+#define BLK_MQ_RESOURCE_DELAY   3               /* ms units */
+
 bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                              bool got_budget)
 {
         struct blk_mq_hw_ctx *hctx;
         struct request *rq, *nxt;
         bool no_tag = false;
         int errors, queued;
+        blk_status_t ret = BLK_STS_OK;
 
         if (list_empty(list))
                 return false;
@@ -1181,7 +1184,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
         errors = queued = 0;
         do {
                 struct blk_mq_queue_data bd;
-                blk_status_t ret;
 
                 rq = list_first_entry(list, struct request, queuelist);
                 if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
@@ -1226,7 +1228,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                 }
 
                 ret = q->mq_ops->queue_rq(hctx, &bd);
-                if (ret == BLK_STS_RESOURCE) {
+                if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
                         /*
                          * If an I/O scheduler has been configured and we got a
                          * driver tag for the next request already, free it
@@ -1257,6 +1259,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
          * that is where we will continue on next queue run.
          */
         if (!list_empty(list)) {
+                bool needs_restart;
+
                 spin_lock(&hctx->lock);
                 list_splice_init(list, &hctx->dispatch);
                 spin_unlock(&hctx->lock);
@@ -1280,10 +1284,17 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                  * - Some but not all block drivers stop a queue before
                  *   returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
                  *   and dm-rq.
+                 *
+                 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
+                 * bit is set, run queue after a delay to avoid IO stalls
+                 * that could otherwise occur if the queue is idle.
                  */
-                if (!blk_mq_sched_needs_restart(hctx) ||
+                needs_restart = blk_mq_sched_needs_restart(hctx);
+                if (!needs_restart ||
                     (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
                         blk_mq_run_hw_queue(hctx, true);
+                else if (needs_restart && (ret == BLK_STS_RESOURCE))
+                        blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
         }
 
         return (queued + errors) != 0;
@@ -1764,6 +1775,7 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
                 *cookie = new_cookie;
                 break;
         case BLK_STS_RESOURCE:
+        case BLK_STS_DEV_RESOURCE:
                 __blk_mq_requeue_request(rq);
                 break;
         default:
@@ -1826,7 +1838,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
         hctx_lock(hctx, &srcu_idx);
 
         ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
-        if (ret == BLK_STS_RESOURCE)
+        if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
                 blk_mq_sched_insert_request(rq, false, true, false);
         else if (ret != BLK_STS_OK)
                 blk_mq_end_request(rq, ret);
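
For readers tracing the new tail of blk_mq_dispatch_rq_list() above, this is a small standalone model of the rerun decision (decide_rerun() and the rerun enum are illustrative; only the BLK_STS_* names mirror the kernel's): the hardware queue is rerun immediately unless SCHED_RESTART is pending, and the delayed rerun (BLK_MQ_RESOURCE_DELAY, 3 ms) applies only when the last driver status was BLK_STS_RESOURCE.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative model of the decision at the end of blk_mq_dispatch_rq_list();
 * not the kernel code itself. */
enum blk_status { BLK_STS_OK, BLK_STS_RESOURCE, BLK_STS_DEV_RESOURCE };
enum rerun { RUN_NOW, RUN_AFTER_DELAY, WAIT_FOR_COMPLETION };

static enum rerun decide_rerun(bool needs_restart, bool no_tag,
                               bool dispatch_wait_empty, enum blk_status ret)
{
        /* No SCHED_RESTART pending, or we failed to get a driver tag and
         * nobody is parked on the tag waitqueue: rerun right away. */
        if (!needs_restart || (no_tag && dispatch_wait_empty))
                return RUN_NOW;

        /* SCHED_RESTART is set but the failure was not tied to in-flight IO:
         * schedule a rerun after BLK_MQ_RESOURCE_DELAY (3 ms) so an idle
         * queue cannot stall. */
        if (ret == BLK_STS_RESOURCE)
                return RUN_AFTER_DELAY;

        /* BLK_STS_DEV_RESOURCE: completion of in-flight IO restarts the queue. */
        return WAIT_FOR_COMPLETION;
}

int main(void)
{
        printf("%d\n", decide_rerun(true, false, false, BLK_STS_RESOURCE));     /* 1 */
        printf("%d\n", decide_rerun(true, false, false, BLK_STS_DEV_RESOURCE)); /* 2 */
        printf("%d\n", decide_rerun(false, false, false, BLK_STS_RESOURCE));    /* 0 */
        return 0;
}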

drivers/block/null_blk.c
Lines changed: 1 addition & 1 deletion

@@ -1230,7 +1230,7 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd)
                         return BLK_STS_OK;
                 } else
                         /* requeue request */
-                        return BLK_STS_RESOURCE;
+                        return BLK_STS_DEV_RESOURCE;
                 }
         }

drivers/block/virtio_blk.c
Lines changed: 1 addition & 1 deletion

@@ -276,7 +276,7 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
                 /* Out of mem doesn't actually happen, since we fall back
                  * to direct descriptors */
                 if (err == -ENOMEM || err == -ENOSPC)
-                        return BLK_STS_RESOURCE;
+                        return BLK_STS_DEV_RESOURCE;
                 return BLK_STS_IOERR;
         }

drivers/block/xen-blkfront.c
Lines changed: 1 addition & 1 deletion

@@ -911,7 +911,7 @@ static blk_status_t blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
 out_busy:
         blk_mq_stop_hw_queue(hctx);
         spin_unlock_irqrestore(&rinfo->ring_lock, flags);
-        return BLK_STS_RESOURCE;
+        return BLK_STS_DEV_RESOURCE;
 }
 
 static void blkif_complete_rq(struct request *rq)

drivers/md/dm-rq.c
Lines changed: 2 additions & 3 deletions

@@ -404,7 +404,7 @@ static blk_status_t dm_dispatch_clone_request(struct request *clone, struct requ
 
         clone->start_time = jiffies;
         r = blk_insert_cloned_request(clone->q, clone);
-        if (r != BLK_STS_OK && r != BLK_STS_RESOURCE)
+        if (r != BLK_STS_OK && r != BLK_STS_RESOURCE && r != BLK_STS_DEV_RESOURCE)
                 /* must complete clone in terms of original request */
                 dm_complete_request(rq, r);
         return r;
@@ -496,7 +496,7 @@ static int map_request(struct dm_rq_target_io *tio)
                 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
                                      blk_rq_pos(rq));
                 ret = dm_dispatch_clone_request(clone, rq);
-                if (ret == BLK_STS_RESOURCE) {
+                if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
                         blk_rq_unprep_clone(clone);
                         tio->ti->type->release_clone_rq(clone);
                         tio->clone = NULL;
@@ -769,7 +769,6 @@ static blk_status_t dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
                 /* Undo dm_start_request() before requeuing */
                 rq_end_stats(md, rq);
                 rq_completed(md, rq_data_dir(rq), false);
-                blk_mq_delay_run_hw_queue(hctx, 100/*ms*/);
                 return BLK_STS_RESOURCE;
         }
 
drivers/nvme/host/fc.c

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,6 @@ enum nvme_fc_queue_flags {
3535
NVME_FC_Q_LIVE,
3636
};
3737

38-
#define NVMEFC_QUEUE_DELAY 3 /* ms units */
39-
4038
#define NVME_FC_DEFAULT_DEV_LOSS_TMO 60 /* seconds */
4139

4240
struct nvme_fc_queue {
@@ -2231,7 +2229,7 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
22312229
* the target device is present
22322230
*/
22332231
if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE)
2234-
goto busy;
2232+
return BLK_STS_RESOURCE;
22352233

22362234
if (!nvme_fc_ctrl_get(ctrl))
22372235
return BLK_STS_IOERR;
@@ -2311,16 +2309,10 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
23112309
ret != -EBUSY)
23122310
return BLK_STS_IOERR;
23132311

2314-
goto busy;
2312+
return BLK_STS_RESOURCE;
23152313
}
23162314

23172315
return BLK_STS_OK;
2318-
2319-
busy:
2320-
if (!(op->flags & FCOP_FLAGS_AEN) && queue->hctx)
2321-
blk_mq_delay_run_hw_queue(queue->hctx, NVMEFC_QUEUE_DELAY);
2322-
2323-
return BLK_STS_RESOURCE;
23242316
}
23252317

23262318
static inline blk_status_t nvme_fc_is_ready(struct nvme_fc_queue *queue,

drivers/scsi/scsi_lib.c
Lines changed: 3 additions & 3 deletions

@@ -2030,9 +2030,9 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
         case BLK_STS_OK:
                 break;
         case BLK_STS_RESOURCE:
-                if (atomic_read(&sdev->device_busy) == 0 &&
-                    !scsi_device_blocked(sdev))
-                        blk_mq_delay_run_hw_queue(hctx, SCSI_QUEUE_DELAY);
+                if (atomic_read(&sdev->device_busy) ||
+                    scsi_device_blocked(sdev))
+                        ret = BLK_STS_DEV_RESOURCE;
                 break;
         default:
                 /*
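
The scsi-mq hunk above inverts the old condition: instead of scheduling its own delayed rerun when the device was idle and not blocked, scsi_queue_rq() now upgrades the status to BLK_STS_DEV_RESOURCE whenever something is guaranteed to rerun the queue later (outstanding IO on the device, or a blocked device), and otherwise leaves BLK_STS_RESOURCE so blk-mq's generic delayed rerun takes over. A minimal model of that decision follows (scsi_busy_status() is an illustrative stand-in, not the driver function):

#include <stdbool.h>
#include <stdio.h>

enum blk_status { BLK_STS_OK, BLK_STS_RESOURCE, BLK_STS_DEV_RESOURCE };

/* Illustrative model of the scsi_queue_rq() change: upgrade the status only
 * when something will rerun the queue later without further help. */
static enum blk_status scsi_busy_status(int device_busy, bool device_blocked)
{
        if (device_busy || device_blocked)
                return BLK_STS_DEV_RESOURCE;
        /* Idle, unblocked device: keep BLK_STS_RESOURCE so blk-mq applies its
         * own delayed rerun instead of the old in-driver SCSI_QUEUE_DELAY. */
        return BLK_STS_RESOURCE;
}

int main(void)
{
        printf("%d %d %d\n",
               scsi_busy_status(2, false),      /* in-flight IO   -> 2 */
               scsi_busy_status(0, true),       /* blocked device -> 2 */
               scsi_busy_status(0, false));     /* idle and clear -> 1 */
        return 0;
}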

include/linux/blk_types.h
Lines changed: 18 additions & 0 deletions

@@ -39,6 +39,24 @@ typedef u8 __bitwise blk_status_t;
 
 #define BLK_STS_AGAIN           ((__force blk_status_t)12)
 
+/*
+ * BLK_STS_DEV_RESOURCE is returned from the driver to the block layer if
+ * device related resources are unavailable, but the driver can guarantee
+ * that the queue will be rerun in the future once resources become
+ * available again. This is typically the case for device specific
+ * resources that are consumed for IO. If the driver fails allocating these
+ * resources, we know that inflight (or pending) IO will free these
+ * resource upon completion.
+ *
+ * This is different from BLK_STS_RESOURCE in that it explicitly references
+ * a device specific resource. For resources of wider scope, allocation
+ * failure can happen without having pending IO. This means that we can't
+ * rely on request completions freeing these resources, as IO may not be in
+ * flight. Examples of that are kernel memory allocations, DMA mappings, or
+ * any other system wide resources.
+ */
+#define BLK_STS_DEV_RESOURCE    ((__force blk_status_t)13)
+
 /**
  * blk_path_error - returns true if error may be path related
  * @error: status the request was completed with
