Skip to content

Commit 27cb27b

Browse files
keithbusch authored and axboe committed
io_uring: add support for kernel registered bvecs
Provide an interface for the kernel to leverage the existing pre-registered buffers that io_uring provides. User space can reference these later to achieve zero-copy IO.

User space must register an empty fixed buffer table with io_uring in order for the kernel to make use of it.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Link: https://lore.kernel.org/r/20250227223916.143006-5-kbusch@meta.com
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
1 parent 99fde89 commit 27cb27b

File tree

5 files changed

+138
-7
lines changed

5 files changed

+138
-7
lines changed

include/linux/io_uring/cmd.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
#include <uapi/linux/io_uring.h>
66
#include <linux/io_uring_types.h>
7+
#include <linux/blk-mq.h>
78

89
/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
910
#define IORING_URING_CMD_CANCELABLE (1U << 30)
@@ -125,4 +126,10 @@ static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_ur
125126
return cmd_to_io_kiocb(cmd)->async_data;
126127
}
127128

129+
int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
130+
void (*release)(void *), unsigned int index,
131+
unsigned int issue_flags);
132+
void io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
133+
unsigned int issue_flags);
134+
128135
#endif /* _LINUX_IO_URING_CMD_H */

io_uring/io_uring.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3940,6 +3940,9 @@ static int __init io_uring_init(void)
39403940

39413941
io_uring_optable_init();
39423942

3943+
/* imu->dir is u8 */
3944+
BUILD_BUG_ON((IO_IMU_DEST | IO_IMU_SOURCE) > U8_MAX);
3945+
39433946
/*
39443947
* Allow user copy in the per-command field, which starts after the
39453948
* file in io_kiocb and until the opcode field. The openat2 handling

io_uring/rsrc.c

Lines changed: 116 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include <linux/hugetlb.h>
1010
#include <linux/compat.h>
1111
#include <linux/io_uring.h>
12+
#include <linux/io_uring/cmd.h>
1213

1314
#include <uapi/linux/io_uring.h>
1415

@@ -101,17 +102,23 @@ static int io_buffer_validate(struct iovec *iov)
101102
return 0;
102103
}
103104

104-
static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
105+
static void io_release_ubuf(void *priv)
105106
{
106-
struct io_mapped_ubuf *imu = node->buf;
107+
struct io_mapped_ubuf *imu = priv;
107108
unsigned int i;
108109

109-
if (!refcount_dec_and_test(&imu->refs))
110-
return;
111110
for (i = 0; i < imu->nr_bvecs; i++)
112111
unpin_user_page(imu->bvec[i].bv_page);
112+
}
113+
114+
static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
115+
{
116+
if (!refcount_dec_and_test(&imu->refs))
117+
return;
118+
113119
if (imu->acct_pages)
114120
io_unaccount_mem(ctx, imu->acct_pages);
121+
imu->release(imu->priv);
115122
kvfree(imu);
116123
}
117124

@@ -451,7 +458,7 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
451458
break;
452459
case IORING_RSRC_BUFFER:
453460
if (node->buf)
454-
io_buffer_unmap(ctx, node);
461+
io_buffer_unmap(ctx, node->buf);
455462
break;
456463
default:
457464
WARN_ON_ONCE(1);
@@ -761,6 +768,10 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
761768
imu->len = iov->iov_len;
762769
imu->nr_bvecs = nr_pages;
763770
imu->folio_shift = PAGE_SHIFT;
771+
imu->release = io_release_ubuf;
772+
imu->priv = imu;
773+
imu->is_kbuf = false;
774+
imu->dir = IO_IMU_DEST | IO_IMU_SOURCE;
764775
if (coalesced)
765776
imu->folio_shift = data.folio_shift;
766777
refcount_set(&imu->refs, 1);
@@ -857,6 +868,95 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
857868
return ret;
858869
}
859870

871+
int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
872+
void (*release)(void *), unsigned int index,
873+
unsigned int issue_flags)
874+
{
875+
struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
876+
struct io_rsrc_data *data = &ctx->buf_table;
877+
struct req_iterator rq_iter;
878+
struct io_mapped_ubuf *imu;
879+
struct io_rsrc_node *node;
880+
struct bio_vec bv, *bvec;
881+
u16 nr_bvecs;
882+
int ret = 0;
883+
884+
io_ring_submit_lock(ctx, issue_flags);
885+
if (index >= data->nr) {
886+
ret = -EINVAL;
887+
goto unlock;
888+
}
889+
index = array_index_nospec(index, data->nr);
890+
891+
if (data->nodes[index]) {
892+
ret = -EBUSY;
893+
goto unlock;
894+
}
895+
896+
node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
897+
if (!node) {
898+
ret = -ENOMEM;
899+
goto unlock;
900+
}
901+
902+
nr_bvecs = blk_rq_nr_phys_segments(rq);
903+
imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
904+
if (!imu) {
905+
kfree(node);
906+
ret = -ENOMEM;
907+
goto unlock;
908+
}
909+
910+
imu->ubuf = 0;
911+
imu->len = blk_rq_bytes(rq);
912+
imu->acct_pages = 0;
913+
imu->folio_shift = PAGE_SHIFT;
914+
imu->nr_bvecs = nr_bvecs;
915+
refcount_set(&imu->refs, 1);
916+
imu->release = release;
917+
imu->priv = rq;
918+
imu->is_kbuf = true;
919+
920+
if (op_is_write(req_op(rq)))
921+
imu->dir = IO_IMU_SOURCE;
922+
else
923+
imu->dir = IO_IMU_DEST;
924+
925+
bvec = imu->bvec;
926+
rq_for_each_bvec(bv, rq, rq_iter)
927+
*bvec++ = bv;
928+
929+
node->buf = imu;
930+
data->nodes[index] = node;
931+
unlock:
932+
io_ring_submit_unlock(ctx, issue_flags);
933+
return ret;
934+
}
935+
EXPORT_SYMBOL_GPL(io_buffer_register_bvec);
936+
937+
void io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
938+
unsigned int issue_flags)
939+
{
940+
struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
941+
struct io_rsrc_data *data = &ctx->buf_table;
942+
struct io_rsrc_node *node;
943+
944+
io_ring_submit_lock(ctx, issue_flags);
945+
if (index >= data->nr)
946+
goto unlock;
947+
index = array_index_nospec(index, data->nr);
948+
949+
node = data->nodes[index];
950+
if (!node || !node->buf->is_kbuf)
951+
goto unlock;
952+
953+
io_put_rsrc_node(ctx, node);
954+
data->nodes[index] = NULL;
955+
unlock:
956+
io_ring_submit_unlock(ctx, issue_flags);
957+
}
958+
EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);
959+
860960
static int io_import_fixed(int ddir, struct iov_iter *iter,
861961
struct io_mapped_ubuf *imu,
862962
u64 buf_addr, size_t len)
@@ -871,6 +971,8 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
871971
/* not inside the mapped region */
872972
if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
873973
return -EFAULT;
974+
if (!(imu->dir & (1 << ddir)))
975+
return -EFAULT;
874976

875977
/*
876978
* Might not be a start of buffer, set size appropriately
@@ -883,8 +985,8 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
883985
/*
884986
* Don't use iov_iter_advance() here, as it's really slow for
885987
* using the latter parts of a big fixed buffer - it iterates
886-
* over each segment manually. We can cheat a bit here, because
887-
* we know that:
988+
* over each segment manually. We can cheat a bit here for user
989+
* registered nodes, because we know that:
888990
*
889991
* 1) it's a BVEC iter, we set it up
890992
* 2) all bvecs are the same in size, except potentially the
@@ -898,8 +1000,15 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
8981000
*/
8991001
const struct bio_vec *bvec = imu->bvec;
9001002

1003+
/*
1004+
* Kernel buffer bvecs, on the other hand, don't necessarily
1005+
* have the size property of user registered ones, so we have
1006+
* to use the slow iter advance.
1007+
*/
9011008
if (offset < bvec->bv_len) {
9021009
iter->iov_offset = offset;
1010+
} else if (imu->is_kbuf) {
1011+
iov_iter_advance(iter, offset);
9031012
} else {
9041013
unsigned long seg_skip;
9051014

io_uring/rsrc.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,22 @@ struct io_rsrc_node {
2020
};
2121
};
2222

23+
enum {
24+
IO_IMU_DEST = 1 << ITER_DEST,
25+
IO_IMU_SOURCE = 1 << ITER_SOURCE,
26+
};
27+
2328
struct io_mapped_ubuf {
2429
u64 ubuf;
2530
unsigned int len;
2631
unsigned int nr_bvecs;
2732
unsigned int folio_shift;
2833
refcount_t refs;
2934
unsigned long acct_pages;
35+
void (*release)(void *);
36+
void *priv;
37+
bool is_kbuf;
38+
u8 dir;
3039
struct bio_vec bvec[] __counted_by(nr_bvecs);
3140
};
3241

io_uring/rw.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -629,6 +629,7 @@ static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
629629
*/
630630
static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
631631
{
632+
struct io_kiocb *req = cmd_to_io_kiocb(rw);
632633
struct kiocb *kiocb = &rw->kiocb;
633634
struct file *file = kiocb->ki_filp;
634635
ssize_t ret = 0;
@@ -644,6 +645,8 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
644645
if ((kiocb->ki_flags & IOCB_NOWAIT) &&
645646
!(kiocb->ki_filp->f_flags & O_NONBLOCK))
646647
return -EAGAIN;
648+
if ((req->flags & REQ_F_BUF_NODE) && req->buf_node->buf->is_kbuf)
649+
return -EFAULT;
647650

648651
ppos = io_kiocb_ppos(kiocb);
649652

0 commit comments

Comments
 (0)