Skip to content

Commit

Permalink
Merge tag 'for-6.12/io_uring-discard-20240913' of git://git.kernel.dk/linux
Browse files Browse the repository at this point in the history

Pull io_uring async discard support from Jens Axboe:
 "Sitting on top of both the 6.12 block and io_uring core branches,
  here's support for async discard through io_uring.

  This allows applications to issue async discards, rather than rely on
  the blocking sync ioctl discards we already have. The sync support is
  difficult to use outside of idle/cleanup periods.

  On a real (but slow) device, testing shows the following results when
  compared to sync discard:

	qd64 sync discard: 21K IOPS, lat avg 3 msec (max 21 msec)
	qd64 async discard: 76K IOPS, lat avg 845 usec (max 2.2 msec)

	qd64 sync discard: 14K IOPS, lat avg 5 msec (max 25 msec)
	qd64 async discard: 56K IOPS, lat avg 1153 usec (max 3.6 msec)

  and synthetic null_blk testing with the same queue depth and block
  size settings as above shows:

	Type    Trim size       IOPS    Lat avg (usec)  Lat Max (usec)
	==============================================================
	sync    4k               144K       444            20314
	async   4k              1353K        47              595
	sync    1M                56K      1136            21031
	async   1M                94K       680              760"

* tag 'for-6.12/io_uring-discard-20240913' of git://git.kernel.dk/linux:
  block: implement async io_uring discard cmd
  block: introduce blk_validate_byte_range()
  filemap: introduce filemap_invalidate_pages
  io_uring/cmd: give inline space in request to cmds
  io_uring/cmd: expose iowq to cmds
  • Loading branch information
torvalds committed Sep 16, 2024
2 parents 26bb0d3 + 50c5225 commit adfc3de
Show file tree
Hide file tree
Showing 10 changed files with 209 additions and 24 deletions.
1 change: 1 addition & 0 deletions block/blk.h
Original file line number Diff line number Diff line change
Expand Up @@ -609,6 +609,7 @@ blk_mode_t file_to_blk_mode(struct file *file);
int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode,
loff_t lstart, loff_t lend);
long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);

extern const struct address_space_operations def_blk_aops;
Expand Down
2 changes: 2 additions & 0 deletions block/fops.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/module.h>
#include <linux/io_uring/cmd.h>
#include "blk.h"

static inline struct inode *bdev_file_inode(struct file *file)
Expand Down Expand Up @@ -865,6 +866,7 @@ const struct file_operations def_blk_fops = {
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = blkdev_fallocate,
.uring_cmd = blkdev_uring_cmd,
.fop_flags = FOP_BUFFER_RASYNC,
};

Expand Down
163 changes: 144 additions & 19 deletions block/ioctl.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
#include <linux/blktrace_api.h>
#include <linux/pr.h>
#include <linux/uaccess.h>
#include <linux/pagemap.h>
#include <linux/io_uring/cmd.h>
#include <uapi/linux/blkdev.h>
#include "blk.h"

static int blkpg_do_ioctl(struct block_device *bdev,
Expand Down Expand Up @@ -92,41 +95,54 @@ static int compat_blkpg_ioctl(struct block_device *bdev,
}
#endif

/*
 * Validate the byte range [start, start + len) against @bdev.
 *
 * The range is rejected with -EINVAL when @start or @len has any bit set
 * under the (logical block size - 1) mask, when @len is zero, when
 * start + len overflows, or when the end of the range exceeds the value
 * returned by bdev_nr_bytes(). Returns 0 otherwise.
 */
static int blk_validate_byte_range(struct block_device *bdev,
				   uint64_t start, uint64_t len)
{
	const unsigned int align_mask = bdev_logical_block_size(bdev) - 1;
	uint64_t range_end;

	/* Neither the offset nor the length may be misaligned. */
	if ((start & align_mask) || (len & align_mask))
		return -EINVAL;

	/* An empty range is not a valid request. */
	if (len == 0)
		return -EINVAL;

	/* The exclusive end must be representable ... */
	if (check_add_overflow(start, len, &range_end))
		return -EINVAL;

	/* ... and must not lie beyond what bdev_nr_bytes() reports. */
	if (range_end > bdev_nr_bytes(bdev))
		return -EINVAL;

	return 0;
}

static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
unsigned long arg)
{
unsigned int bs_mask = bdev_logical_block_size(bdev) - 1;
uint64_t range[2], start, len, end;
uint64_t range[2], start, len;
struct bio *prev = NULL, *bio;
sector_t sector, nr_sects;
struct blk_plug plug;
int err;

if (!(mode & BLK_OPEN_WRITE))
return -EBADF;

if (!bdev_max_discard_sectors(bdev))
return -EOPNOTSUPP;
if (bdev_read_only(bdev))
return -EPERM;

if (copy_from_user(range, (void __user *)arg, sizeof(range)))
return -EFAULT;

start = range[0];
len = range[1];

if (!len)
return -EINVAL;
if ((start | len) & bs_mask)
return -EINVAL;
if (!bdev_max_discard_sectors(bdev))
return -EOPNOTSUPP;

if (check_add_overflow(start, len, &end) ||
end > bdev_nr_bytes(bdev))
return -EINVAL;
if (!(mode & BLK_OPEN_WRITE))
return -EBADF;
if (bdev_read_only(bdev))
return -EPERM;
err = blk_validate_byte_range(bdev, start, len);
if (err)
return err;

filemap_invalidate_lock(bdev->bd_mapping);
err = truncate_bdev_range(bdev, mode, start, end - 1);
err = truncate_bdev_range(bdev, mode, start, start + len - 1);
if (err)
goto fail;

Expand Down Expand Up @@ -735,3 +751,112 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return ret;
}
#endif

/*
 * Per-command state for block io_uring commands. Instances are obtained
 * with io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd).
 */
struct blk_iou_cmd {
/* Result passed to io_uring_cmd_done(); set to 0 at issue, later an errno. */
int res;
/* Set from (issue_flags & IO_URING_F_NONBLOCK) when the command is issued. */
bool nowait;
};

/*
 * Completion step for a block io_uring command; bio_cmd_bio_end_io() hands
 * this to io_uring_cmd_do_in_task_lazy().
 *
 * A command that was issued with nowait set and ended up with -EAGAIN is
 * re-issued through io_uring_cmd_issue_blocking(); every other outcome is
 * reported with io_uring_cmd_done() using the stored result.
 */
static void blk_cmd_complete(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
	struct blk_iou_cmd *pdu = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);

	if (pdu->nowait && pdu->res == -EAGAIN) {
		io_uring_cmd_issue_blocking(cmd);
		return;
	}

	io_uring_cmd_done(cmd, pdu->res, 0, issue_flags);
}

/*
 * bi_end_io handler installed by blkdev_cmd_discard() on the bio it
 * submits last. bio->bi_private carries the owning io_uring command.
 */
static void bio_cmd_bio_end_io(struct bio *bio)
{
	struct io_uring_cmd *ioucmd = bio->bi_private;
	struct blk_iou_cmd *pdu = io_uring_cmd_to_pdu(ioucmd, struct blk_iou_cmd);

	/* Keep an already recorded result; otherwise translate the bio status. */
	if (unlikely(bio->bi_status)) {
		if (!pdu->res)
			pdu->res = blk_status_to_errno(bio->bi_status);
	}

	/* Finish the command via blk_cmd_complete(), then drop the bio. */
	io_uring_cmd_do_in_task_lazy(ioucmd, blk_cmd_complete);
	bio_put(bio);
}

/*
 * Issue a discard for the byte range [start, start + len) of @bdev on
 * behalf of the io_uring command @cmd.
 *
 * Returns:
 *   -EOPNOTSUPP   bdev_max_discard_sectors() is zero for @bdev
 *   -EBADF        cmd->file was not opened with BLK_OPEN_WRITE
 *   -EPERM        bdev_read_only() is true for @bdev
 *   -EAGAIN       no bio could be allocated, or @nowait is set and the
 *                 range did not fit in a single bio
 *   -EIOCBQUEUED  the final bio was submitted; the command is finished
 *                 later from bio_cmd_bio_end_io()
 *   or the error returned by blk_validate_byte_range() or
 *   filemap_invalidate_pages().
 */
static int blkdev_cmd_discard(struct io_uring_cmd *cmd,
struct block_device *bdev,
uint64_t start, uint64_t len, bool nowait)
{
struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
/* Allocation mode follows the caller's blocking mode. */
gfp_t gfp = nowait ? GFP_NOWAIT : GFP_KERNEL;
/* Byte offset and length expressed in SECTOR_SHIFT-sized units. */
sector_t sector = start >> SECTOR_SHIFT;
sector_t nr_sects = len >> SECTOR_SHIFT;
struct bio *prev = NULL, *bio;
int err;

if (!bdev_max_discard_sectors(bdev))
return -EOPNOTSUPP;
if (!(file_to_blk_mode(cmd->file) & BLK_OPEN_WRITE))
return -EBADF;
if (bdev_read_only(bdev))
return -EPERM;
err = blk_validate_byte_range(bdev, start, len);
if (err)
return err;

/* Invalidate the affected range of the device mapping (inclusive end). */
err = filemap_invalidate_pages(bdev->bd_mapping, start,
start + len - 1, nowait);
if (err)
return err;

/*
 * NOTE(review): blk_alloc_discard_bio() receives &sector and &nr_sects and
 * presumably advances them for each bio it hands back - confirm against
 * its definition. The loop ends when it returns NULL.
 */
while (true) {
bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects, gfp);
if (!bio)
break;
if (nowait) {
/*
* Don't allow multi-bio non-blocking submissions as
* subsequent bios may fail but we won't get a direct
* indication of that. Normally, the caller should
* retry from a blocking context.
*/
if (unlikely(nr_sects)) {
bio_put(bio);
return -EAGAIN;
}
bio->bi_opf |= REQ_NOWAIT;
}

prev = bio_chain_and_submit(prev, bio);
}
/* Not a single bio was obtained: nothing is in flight, fail directly. */
if (unlikely(!prev))
return -EAGAIN;
/*
 * Sectors are left over although the loop ended: record -EAGAIN so it is
 * the result seen by blk_cmd_complete() unless a bio error is stored first.
 */
if (unlikely(nr_sects))
bic->res = -EAGAIN;

/* Only the last bio carries the completion callback for the command. */
prev->bi_private = cmd;
prev->bi_end_io = bio_cmd_bio_end_io;
submit_bio(prev);
return -EIOCBQUEUED;
}

/*
 * ->uring_cmd handler for block device files (installed in def_blk_fops).
 *
 * Rejects SQEs that set ioprio, __pad1, len, rw_flags or file_index, resets
 * the per-command state, and reads the byte range from the SQE: the start
 * offset from sqe->addr and the length from sqe->addr3. The only command
 * handled is BLOCK_URING_CMD_DISCARD; any other cmd_op yields -EINVAL.
 */
int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
	struct block_device *bdev = I_BDEV(cmd->file->f_mapping->host);
	struct blk_iou_cmd *pdu = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
	const struct io_uring_sqe *sqe = cmd->sqe;
	u32 opcode = cmd->cmd_op;
	uint64_t start, len;

	/* These SQE fields must all be zero for block commands. */
	if (unlikely(sqe->ioprio || sqe->__pad1 || sqe->len ||
		     sqe->rw_flags || sqe->file_index))
		return -EINVAL;

	pdu->res = 0;
	pdu->nowait = issue_flags & IO_URING_F_NONBLOCK;

	start = READ_ONCE(sqe->addr);
	len = READ_ONCE(sqe->addr3);

	if (opcode != BLOCK_URING_CMD_DISCARD)
		return -EINVAL;

	return blkdev_cmd_discard(cmd, bdev, start, len, pdu->nowait);
}
15 changes: 15 additions & 0 deletions include/linux/io_uring/cmd.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,15 @@ static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
return sqe->cmd;
}

/*
 * Build-time guard: fails the build (BUILD_BUG_ON) when @cmd_sz is larger
 * than the pdu field of struct io_uring_cmd. Invoked by
 * io_uring_cmd_to_pdu() with sizeof(pdu_type).
 */
static inline void io_uring_cmd_private_sz_check(size_t cmd_sz)
{
BUILD_BUG_ON(cmd_sz > sizeof_field(struct io_uring_cmd, pdu));
}
/*
 * Yield a pointer to the pdu area of @cmd, typed as (pdu_type *).
 * The comma expression first runs io_uring_cmd_private_sz_check() on
 * sizeof(pdu_type), then evaluates to the cast address of (cmd)->pdu.
 */
#define io_uring_cmd_to_pdu(cmd, pdu_type) ( \
io_uring_cmd_private_sz_check(sizeof(pdu_type)), \
((pdu_type *)&(cmd)->pdu) \
)

#if defined(CONFIG_IO_URING)
int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
struct iov_iter *iter, void *ioucmd);
Expand All @@ -48,6 +57,9 @@ void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
unsigned int issue_flags);

/* Execute the request from a blocking context */
void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd);

#else
static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
struct iov_iter *iter, void *ioucmd)
Expand All @@ -67,6 +79,9 @@ static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
unsigned int issue_flags)
{
}
/*
 * No-op variant of io_uring_cmd_issue_blocking().
 * NOTE(review): appears to sit in the #else branch of the
 * CONFIG_IO_URING conditional in this header - confirm.
 */
static inline void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd)
{
}
#endif

/*
Expand Down
2 changes: 2 additions & 0 deletions include/linux/pagemap.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
pgoff_t start, pgoff_t end);
int kiocb_invalidate_pages(struct kiocb *iocb, size_t count);
void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count);
int filemap_invalidate_pages(struct address_space *mapping,
loff_t pos, loff_t end, bool nowait);

int write_inode_now(struct inode *, int sync);
int filemap_fdatawrite(struct address_space *);
Expand Down
14 changes: 14 additions & 0 deletions include/uapi/linux/blkdev.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _UAPI_LINUX_BLKDEV_H
#define _UAPI_LINUX_BLKDEV_H

#include <linux/ioctl.h>
#include <linux/types.h>

/*
* io_uring block file commands, see IORING_OP_URING_CMD.
* These live in a different number space from ioctl(), so the block
* layer's ioctl code 0x12 is reused.
*/
#define BLOCK_URING_CMD_DISCARD _IO(0x12, 0)

#endif
11 changes: 11 additions & 0 deletions io_uring/io_uring.c
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,17 @@ static void io_queue_iowq(struct io_kiocb *req)
io_queue_linked_timeout(link);
}

/*
 * Task-work callback installed by io_req_queue_iowq(): passes @req to
 * io_queue_iowq(). @ts is not used.
 */
static void io_req_queue_iowq_tw(struct io_kiocb *req, struct io_tw_state *ts)
{
io_queue_iowq(req);
}

/*
 * Arrange for @req to be passed to io_queue_iowq() from task work rather
 * than from the caller's context.
 */
void io_req_queue_iowq(struct io_kiocb *req)
{
/* The callback must be set before the request is added as task work. */
req->io_task_work.func = io_req_queue_iowq_tw;
io_req_task_work_add(req);
}

static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
{
while (!list_empty(&ctx->defer_list)) {
Expand Down
1 change: 1 addition & 0 deletions io_uring/io_uring.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ int io_uring_alloc_task_context(struct task_struct *task,

int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file,
int start, int end);
void io_req_queue_iowq(struct io_kiocb *req);

int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts);
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
Expand Down
7 changes: 7 additions & 0 deletions io_uring/uring_cmd.c
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,13 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
}
EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed);

/*
 * Re-issue @ioucmd from a blocking context: the io_kiocb that embeds the
 * command is handed to io_req_queue_iowq().
 */
void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd)
{
	io_req_queue_iowq(cmd_to_io_kiocb(ioucmd));
}

static inline int io_uring_cmd_getsockopt(struct socket *sock,
struct io_uring_cmd *cmd,
unsigned int issue_flags)
Expand Down
17 changes: 12 additions & 5 deletions mm/filemap.c
Original file line number Diff line number Diff line change
Expand Up @@ -2712,14 +2712,12 @@ int kiocb_write_and_wait(struct kiocb *iocb, size_t count)
}
EXPORT_SYMBOL_GPL(kiocb_write_and_wait);

int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
int filemap_invalidate_pages(struct address_space *mapping,
loff_t pos, loff_t end, bool nowait)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
loff_t pos = iocb->ki_pos;
loff_t end = pos + count - 1;
int ret;

if (iocb->ki_flags & IOCB_NOWAIT) {
if (nowait) {
/* we could block if there are any pages in the range */
if (filemap_range_has_page(mapping, pos, end))
return -EAGAIN;
Expand All @@ -2738,6 +2736,15 @@ int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
end >> PAGE_SHIFT);
}

int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;

return filemap_invalidate_pages(mapping, iocb->ki_pos,
iocb->ki_pos + count - 1,
iocb->ki_flags & IOCB_NOWAIT);
}
EXPORT_SYMBOL_GPL(kiocb_invalidate_pages);

/**
Expand Down

0 comments on commit adfc3de

Please sign in to comment.