From 9fe99eed91e8273d3750367af759fe11e9512759 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Sat, 29 Mar 2025 10:15:24 -0600 Subject: [PATCH 01/59] io_uring/wq: avoid indirect do_work/free_work calls struct io_wq stores do_work and free_work function pointers which are called on each work item. But these function pointers are always set to io_wq_submit_work and io_wq_free_work, respectively. So remove these function pointers and just call the functions directly. Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250329161527.3281314-1-csander@purestorage.com Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 15 ++++----------- io_uring/io-wq.h | 5 ----- io_uring/io_uring.c | 2 +- io_uring/tctx.c | 2 -- 4 files changed, 5 insertions(+), 19 deletions(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 04a75d666195..d52069b1177b 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -114,9 +114,6 @@ enum { struct io_wq { unsigned long state; - free_work_fn *free_work; - io_wq_work_fn *do_work; - struct io_wq_hash *hash; atomic_t worker_refs; @@ -612,10 +609,10 @@ static void io_worker_handle_work(struct io_wq_acct *acct, if (do_kill && (work_flags & IO_WQ_WORK_UNBOUND)) atomic_or(IO_WQ_WORK_CANCEL, &work->flags); - wq->do_work(work); + io_wq_submit_work(work); io_assign_current_work(worker, NULL); - linked = wq->free_work(work); + linked = io_wq_free_work(work); work = next_hashed; if (!work && linked && !io_wq_is_hashed(linked)) { work = linked; @@ -934,8 +931,8 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq) { do { atomic_or(IO_WQ_WORK_CANCEL, &work->flags); - wq->do_work(work); - work = wq->free_work(work); + io_wq_submit_work(work); + work = io_wq_free_work(work); } while (work); } @@ -1195,8 +1192,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) int ret, i; struct io_wq *wq; - if (WARN_ON_ONCE(!data->free_work || !data->do_work)) - return ERR_PTR(-EINVAL); if (WARN_ON_ONCE(!bounded)) return ERR_PTR(-EINVAL); @@ -1206,8 +1201,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) refcount_inc(&data->hash->refs); wq->hash = data->hash; - wq->free_work = data->free_work; - wq->do_work = data->do_work; ret = -ENOMEM; diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h index d4fb2940e435..774abab54732 100644 --- a/io_uring/io-wq.h +++ b/io_uring/io-wq.h @@ -21,9 +21,6 @@ enum io_wq_cancel { IO_WQ_CANCEL_NOTFOUND, /* work not found */ }; -typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *); -typedef void (io_wq_work_fn)(struct io_wq_work *); - struct io_wq_hash { refcount_t refs; unsigned long map; @@ -39,8 +36,6 @@ static inline void io_wq_put_hash(struct io_wq_hash *hash) struct io_wq_data { struct io_wq_hash *hash; struct task_struct *task; - io_wq_work_fn *do_work; - free_work_fn *free_work; }; struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c6209fe44cb1..61514b14ee3f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1812,7 +1812,7 @@ void io_wq_submit_work(struct io_wq_work *work) bool needs_poll = false; int ret = 0, err = -ECANCELED; - /* one will be dropped by ->io_wq_free_work() after returning to io-wq */ + /* one will be dropped by io_wq_free_work() after returning to io-wq */ if (!(req->flags & REQ_F_REFCOUNT)) __io_req_set_refcount(req, 2); else diff --git a/io_uring/tctx.c b/io_uring/tctx.c index adc6e42c14df..5b66755579c0 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ 
-35,8 +35,6 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, data.hash = hash; data.task = task; - data.free_work = io_wq_free_work; - data.do_work = io_wq_submit_work; /* Do QD, or 4 * CPUS, whatever is smallest */ concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); From e9ff9ae103573c7393d80533214a567654987ed5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 31 Mar 2025 17:17:59 +0100 Subject: [PATCH 02/59] io_uring/net: don't use io_do_buffer_select at prep Prep code is interested in whether it's a selected buffer request, not whether a buffer has already been selected, which is what io_do_buffer_select() returns. Check for REQ_F_BUFFER_SELECT directly. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4488a029ac698554bebf732263fe19d7734affa6.1743437358.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 24040bc3916a..b9a03aeda4bb 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -361,13 +361,9 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) } if (sr->flags & IORING_RECVSEND_FIXED_BUF) return 0; - if (!io_do_buffer_select(req)) { - ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, - &kmsg->msg.msg_iter); - if (unlikely(ret < 0)) - return ret; - } - return 0; + if (req->flags & REQ_F_BUFFER_SELECT) + return 0; + return import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); } static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -723,7 +719,6 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg; - int ret; kmsg = io_msg_alloc_async(req); if (unlikely(!kmsg)) @@ -739,13 +734,10 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req) kmsg->msg.msg_iocb = NULL; kmsg->msg.msg_ubuf = NULL; - if (!io_do_buffer_select(req)) { - ret = import_ubuf(ITER_DEST, sr->buf, sr->len, - &kmsg->msg.msg_iter); - if (unlikely(ret)) - return ret; - } - return 0; + if (req->flags & REQ_F_BUFFER_SELECT) + return 0; + return import_ubuf(ITER_DEST, sr->buf, sr->len, + &kmsg->msg.msg_iter); } return io_recvmsg_copy_hdr(req, kmsg); From e6f74fd67d50c8a938fcfa83c97cd06995f6aaa1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 31 Mar 2025 17:18:00 +0100 Subject: [PATCH 03/59] io_uring: set IMPORT_BUFFER in generic send setup Move REQ_F_IMPORT_BUFFER to the common send setup. Currently, the only user is send zc, but we'll want normal sends to support that in the future.
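For reference, a minimal userspace sketch of a request that takes this path (liburing-style; ring setup and buffer registration via io_uring_register_buffers() are assumed):

	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

	/* zerocopy send from registered buffer slot 0; prep sees
	 * IORING_RECVSEND_FIXED_BUF and now sets REQ_F_IMPORT_BUFFER */
	io_uring_prep_send_zc_fixed(sqe, sockfd, buf, len, 0, 0, 0);
	io_uring_submit(&ring);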
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/18b74edfc61d8a1b6c9fed3b78a9276fe80f8ced.1743437358.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index b9a03aeda4bb..eb2130112e0e 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -359,8 +359,10 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) kmsg->msg.msg_name = &kmsg->addr; kmsg->msg.msg_namelen = addr_len; } - if (sr->flags & IORING_RECVSEND_FIXED_BUF) + if (sr->flags & IORING_RECVSEND_FIXED_BUF) { + req->flags |= REQ_F_IMPORT_BUFFER; return 0; + } if (req->flags & REQ_F_BUFFER_SELECT) return 0; return import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); @@ -1313,8 +1315,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -ENOMEM; if (req->opcode == IORING_OP_SEND_ZC) { - if (zc->flags & IORING_RECVSEND_FIXED_BUF) - req->flags |= REQ_F_IMPORT_BUFFER; ret = io_send_setup(req, sqe); } else { if (unlikely(sqe->addr2 || sqe->file_index)) From c0e965052149c883317774711205456d08285741 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 31 Mar 2025 17:18:01 +0100 Subject: [PATCH 04/59] io_uring/kbuf: pass bgid to io_buffer_select() The current situation with buffer group id juggling is not ideal. req->buf_index first stores the bgid, then it's overwritten by a buffer id, and then it can get restored back on recycling / etc. It's not so easy to control, and it's not handled consistently across request types with receive requests saving and restoring the bgid by hand. It's a prep patch that adds a buffer group id argument to io_buffer_select(). The caller will be responsible for stashing a copy somewhere and passing it into the function.
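A condensed sketch of the caller pattern this enables, with names taken from the diffs below (error handling trimmed):

	/* prep: stash the group id while req->buf_index still holds it */
	sr->buf_group = req->buf_index;

	/* issue: select from the stashed group rather than req->buf_index */
	buf = io_buffer_select(req, &len, sr->buf_group, issue_flags);
	if (!buf)
		return -ENOBUFS;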
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a210d6427cc3f4f42271a6853274cd5a50e56820.1743437358.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 4 ++-- io_uring/kbuf.h | 2 +- io_uring/net.c | 9 ++++----- io_uring/rw.c | 5 ++++- io_uring/rw.h | 2 ++ 5 files changed, 13 insertions(+), 9 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 953d5e742569..f195876732be 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -193,7 +193,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, } void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags) + unsigned buf_group, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; @@ -201,7 +201,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, io_ring_submit_lock(req->ctx, issue_flags); - bl = io_buffer_get_list(ctx, req->buf_index); + bl = io_buffer_get_list(ctx, buf_group); if (likely(bl)) { if (bl->flags & IOBL_BUF_RING) ret = io_ring_buffer_select(req, len, bl, issue_flags); diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 2ec0b983ce24..09129115f3ef 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -58,7 +58,7 @@ struct buf_sel_arg { }; void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags); + unsigned buf_group, unsigned int issue_flags); int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, unsigned int issue_flags); int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg); diff --git a/io_uring/net.c b/io_uring/net.c index eb2130112e0e..6314b1583c8c 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -407,13 +407,12 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; + if (req->flags & REQ_F_BUFFER_SELECT) + sr->buf_group = req->buf_index; if (sr->flags & IORING_RECVSEND_BUNDLE) { if (req->opcode == IORING_OP_SENDMSG) return -EINVAL; - if (!(req->flags & REQ_F_BUFFER_SELECT)) - return -EINVAL; sr->msg_flags |= MSG_WAITALL; - sr->buf_group = req->buf_index; req->buf_list = NULL; req->flags |= REQ_F_MULTISHOT; } @@ -979,7 +978,7 @@ retry_multishot: void __user *buf; size_t len = sr->len; - buf = io_buffer_select(req, &len, issue_flags); + buf = io_buffer_select(req, &len, sr->buf_group, issue_flags); if (!buf) return -ENOBUFS; @@ -1089,7 +1088,7 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg void __user *buf; *len = sr->len; - buf = io_buffer_select(req, len, issue_flags); + buf = io_buffer_select(req, len, sr->buf_group, issue_flags); if (!buf) return -ENOBUFS; sr->buf = buf; diff --git a/io_uring/rw.c b/io_uring/rw.c index 039e063f7091..17a12a1cf3a6 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -119,7 +119,7 @@ static int __io_import_rw_buffer(int ddir, struct io_kiocb *req, return io_import_vec(ddir, req, io, buf, sqe_len); if (io_do_buffer_select(req)) { - buf = io_buffer_select(req, &sqe_len, issue_flags); + buf = io_buffer_select(req, &sqe_len, io->buf_group, issue_flags); if (!buf) return -ENOBUFS; rw->addr = (unsigned long) buf; @@ -253,16 +253,19 @@ static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, int ddir) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + struct io_async_rw *io; unsigned ioprio; u64 attr_type_mask; int ret; if (io_rw_alloc_async(req)) return -ENOMEM; + io = 
req->async_data; rw->kiocb.ki_pos = READ_ONCE(sqe->off); /* used for fixed read/write too - just read unconditionally */ req->buf_index = READ_ONCE(sqe->buf_index); + io->buf_group = req->buf_index; ioprio = READ_ONCE(sqe->ioprio); if (ioprio) { diff --git a/io_uring/rw.h b/io_uring/rw.h index 81d6d9a8cf69..129a53fe5482 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -16,6 +16,8 @@ struct io_async_rw { struct iov_iter iter; struct iov_iter_state iter_state; struct iovec fast_iov; + unsigned buf_group; + /* * wpq is for buffered io, while meta fields are used with * direct io From bd32923e5f02fa7b04d487ec265dc8080d27a257 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 31 Mar 2025 17:18:02 +0100 Subject: [PATCH 05/59] io_uring: don't store bgid in req->buf_index Pass buffer group id into the rest of helpers via struct buf_sel_arg and remove all reassignments of req->buf_index back to bgid. Now, it only stores buffer indexes, and the group is provided by callers. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3ea9fa08113ecb4d9224b943e7806e80a324bdf9.1743437358.git.asml.silence@gmail.com Link: https://lore.kernel.org/io-uring/0c01d76ff12986c2f48614db8610caff8f78c869.1743500909.git.asml.silence@gmail.com/ [axboe: fold in patch from second link] Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +-- io_uring/kbuf.c | 11 ++++------- io_uring/kbuf.h | 2 +- io_uring/net.c | 3 ++- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index b44d201520d8..3b467879bca8 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -653,8 +653,7 @@ struct io_kiocb { u8 iopoll_completed; /* * Can be either a fixed buffer index, or used with provided buffers. - * For the latter, before issue it points to the buffer group ID, - * and after selection it points to the buffer ID itself. + * For the latter, it points to the selected buffer ID. 
*/ u16 buf_index; diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index f195876732be..1cf0d2c01287 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -92,7 +92,6 @@ void io_kbuf_drop_legacy(struct io_kiocb *req) { if (WARN_ON_ONCE(!(req->flags & REQ_F_BUFFER_SELECTED))) return; - req->buf_index = req->kbuf->bgid; req->flags &= ~REQ_F_BUFFER_SELECTED; kfree(req->kbuf); req->kbuf = NULL; @@ -110,7 +109,6 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) bl = io_buffer_get_list(ctx, buf->bgid); list_add(&buf->list, &bl->buf_list); req->flags &= ~REQ_F_BUFFER_SELECTED; - req->buf_index = buf->bgid; io_ring_submit_unlock(ctx, issue_flags); return true; @@ -302,7 +300,7 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, int ret = -ENOENT; io_ring_submit_lock(ctx, issue_flags); - bl = io_buffer_get_list(ctx, req->buf_index); + bl = io_buffer_get_list(ctx, arg->buf_group); if (unlikely(!bl)) goto out_unlock; @@ -335,7 +333,7 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg) lockdep_assert_held(&ctx->uring_lock); - bl = io_buffer_get_list(ctx, req->buf_index); + bl = io_buffer_get_list(ctx, arg->buf_group); if (unlikely(!bl)) return -ENOENT; @@ -355,10 +353,9 @@ static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) struct io_buffer_list *bl = req->buf_list; bool ret = true; - if (bl) { + if (bl) ret = io_kbuf_commit(req, bl, len, nr); - req->buf_index = bl->bgid; - } + req->flags &= ~REQ_F_BUFFER_RING; return ret; } diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 09129115f3ef..0798a732e6cb 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -55,6 +55,7 @@ struct buf_sel_arg { size_t max_len; unsigned short nr_iovs; unsigned short mode; + unsigned buf_group; }; void __user *io_buffer_select(struct io_kiocb *req, size_t *len, @@ -94,7 +95,6 @@ static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) * to monopolize the buffer. */ if (req->buf_list) { - req->buf_index = req->buf_list->bgid; req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT); return true; } diff --git a/io_uring/net.c b/io_uring/net.c index 6314b1583c8c..5f1a519d1fc6 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -190,7 +190,6 @@ static inline void io_mshot_prep_retry(struct io_kiocb *req, sr->done_io = 0; sr->retry = false; sr->len = 0; /* get from the provided buffer */ - req->buf_index = sr->buf_group; } static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg, @@ -568,6 +567,7 @@ static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags, .iovs = &kmsg->fast_iov, .max_len = min_not_zero(sr->len, INT_MAX), .nr_iovs = 1, + .buf_group = sr->buf_group, }; if (kmsg->vec.iovec) { @@ -1056,6 +1056,7 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg .iovs = &kmsg->fast_iov, .nr_iovs = 1, .mode = KBUF_MODE_EXPAND, + .buf_group = sr->buf_group, }; if (kmsg->vec.iovec) { From 53db8a71ecb42c2ec5e9c6925269a750255f9af5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 4 Apr 2025 14:50:59 -0600 Subject: [PATCH 06/59] io_uring: add support for IORING_OP_PIPE This works just like pipe2(2), except it also supports fixed file descriptors. Used in a similar fashion as for other fd instantiating opcodes (like accept, socket, open, etc), where sqe->file_slot is set appropriately if two direct descriptors are desired rather than a set of normal file descriptors. 
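As a raw-SQE usage sketch (the fields are described below; ring setup is assumed and the flag choice is illustrative):

	int fds[2];
	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_PIPE;
	sqe->addr = (unsigned long) fds;	/* completion copies the two fds here */
	sqe->pipe_flags = O_CLOEXEC;		/* same flags pipe2(2) accepts */
	io_uring_submit(&ring);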
sqe->addr must be set to a pointer to an array of 2 integers, which is where the fixed/normal file descriptors are copied to. sqe->pipe_flags contains flags, same as what is allowed for pipe2(2). Future expansion of per-op private flags can go in sqe->ioprio, like we do for other opcodes that take both a "syscall" flag set and an io_uring opcode specific flag set. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 + io_uring/opdef.c | 7 ++ io_uring/openclose.c | 133 ++++++++++++++++++++++++++++++++++ io_uring/openclose.h | 3 + 4 files changed, 145 insertions(+) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 8f1fc12bac46..130f3bc71a69 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -73,6 +73,7 @@ struct io_uring_sqe { __u32 futex_flags; __u32 install_fd_flags; __u32 nop_flags; + __u32 pipe_flags; }; __u64 user_data; /* data to be passed back at completion time */ /* pack this to avoid bogus arm OABI complaints */ @@ -283,6 +284,7 @@ enum io_uring_op { IORING_OP_EPOLL_WAIT, IORING_OP_READV_FIXED, IORING_OP_WRITEV_FIXED, + IORING_OP_PIPE, /* this goes last, obviously */ IORING_OP_LAST, diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 489384c0438b..db36433c2294 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -569,6 +569,10 @@ const struct io_issue_def io_issue_defs[] = { .prep = io_prep_writev_fixed, .issue = io_write, }, + [IORING_OP_PIPE] = { + .prep = io_pipe_prep, + .issue = io_pipe, + }, }; const struct io_cold_def io_cold_defs[] = { @@ -815,6 +819,9 @@ const struct io_cold_def io_cold_defs[] = { .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, + [IORING_OP_PIPE] = { + .name = "PIPE", + }, }; const char *io_uring_get_opcode(u8 opcode) diff --git a/io_uring/openclose.c b/io_uring/openclose.c index e3357dfa14ca..4dd461163457 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include #include @@ -302,3 +304,134 @@ int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags) io_req_set_res(req, ret, 0); return IOU_OK; } + +struct io_pipe { + struct file *file; + int __user *fds; + int flags; + int file_slot; + unsigned long nofile; +}; + +int io_pipe_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + + if (sqe->fd || sqe->off || sqe->addr3) + return -EINVAL; + + p->fds = u64_to_user_ptr(READ_ONCE(sqe->addr)); + p->flags = READ_ONCE(sqe->pipe_flags); + if (p->flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE)) + return -EINVAL; + + p->file_slot = READ_ONCE(sqe->file_index); + p->nofile = rlimit(RLIMIT_NOFILE); + return 0; +} + +static int io_pipe_fixed(struct io_kiocb *req, struct file **files, + unsigned int issue_flags) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + struct io_ring_ctx *ctx = req->ctx; + int ret, fds[2] = { -1, -1 }; + int slot = p->file_slot; + + if (p->flags & O_CLOEXEC) + return -EINVAL; + + io_ring_submit_lock(ctx, issue_flags); + + ret = __io_fixed_fd_install(ctx, files[0], slot); + if (ret < 0) + goto err; + fds[0] = ret; + files[0] = NULL; + + /* + * If a specific slot is given, next one will be used for + * the write side. 
+ */ + if (slot != IORING_FILE_INDEX_ALLOC) + slot++; + + ret = __io_fixed_fd_install(ctx, files[1], slot); + if (ret < 0) + goto err; + fds[1] = ret; + files[1] = NULL; + + io_ring_submit_unlock(ctx, issue_flags); + + if (!copy_to_user(p->fds, fds, sizeof(fds))) + return 0; + + ret = -EFAULT; + io_ring_submit_lock(ctx, issue_flags); +err: + if (fds[0] != -1) + io_fixed_fd_remove(ctx, fds[0]); + if (fds[1] != -1) + io_fixed_fd_remove(ctx, fds[1]); + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} + +static int io_pipe_fd(struct io_kiocb *req, struct file **files) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + int ret, fds[2] = { -1, -1 }; + + ret = __get_unused_fd_flags(p->flags, p->nofile); + if (ret < 0) + goto err; + fds[0] = ret; + + ret = __get_unused_fd_flags(p->flags, p->nofile); + if (ret < 0) + goto err; + fds[1] = ret; + + if (!copy_to_user(p->fds, fds, sizeof(fds))) { + fd_install(fds[0], files[0]); + fd_install(fds[1], files[1]); + return 0; + } + ret = -EFAULT; +err: + if (fds[0] != -1) + put_unused_fd(fds[0]); + if (fds[1] != -1) + put_unused_fd(fds[1]); + return ret; +} + +int io_pipe(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + struct file *files[2]; + int ret; + + ret = create_pipe_files(files, p->flags); + if (ret) + return ret; + files[0]->f_mode |= FMODE_NOWAIT; + files[1]->f_mode |= FMODE_NOWAIT; + + if (!!p->file_slot) + ret = io_pipe_fixed(req, files, issue_flags); + else + ret = io_pipe_fd(req, files); + + io_req_set_res(req, ret, 0); + if (!ret) + return IOU_OK; + + req_set_fail(req); + if (files[0]) + fput(files[0]); + if (files[1]) + fput(files[1]); + return ret; +} diff --git a/io_uring/openclose.h b/io_uring/openclose.h index 8a93c98ad0ad..4ca2a9935abc 100644 --- a/io_uring/openclose.h +++ b/io_uring/openclose.h @@ -13,5 +13,8 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags); int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_close(struct io_kiocb *req, unsigned int issue_flags); +int io_pipe_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_pipe(struct io_kiocb *req, unsigned int issue_flags); + int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags); From 8a2dacd49f1d19f4e3a4ebc99d11622cafd1659e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 16 Apr 2025 13:25:03 -0600 Subject: [PATCH 07/59] io_uring/rsrc: remove node assignment helpers There are two helpers here, one assigns and increments the node ref count, and the other is simply a wrapper around that for the buffer node handling. The buffer node assignment benefits from checking and setting REQ_F_BUF_NODE together, otherwise stalls have been observed on setting that flag later in the process. Hence re-do it so that it's set when checked, and cleared in case of (unlikely) failure. With that, the buffer node helper can go, and then drop the generic io_req_assign_rsrc_node() helper as well, since there's only a single user of it left at that point.
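Condensed from the rsrc.c hunk below (locking elided), the lookup now follows a set-then-undo pattern so the flag is written together with the check:

	req->flags |= REQ_F_BUF_NODE;
	node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
	if (node) {
		node->refs++;
		req->buf_node = node;
		return node;
	}
	/* unlikely lookup failure: undo the speculatively set flag */
	req->flags &= ~REQ_F_BUF_NODE;
	return NULL;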
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 ++- io_uring/rsrc.c | 12 +++++++++--- io_uring/rsrc.h | 14 -------------- 3 files changed, 11 insertions(+), 18 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 61514b14ee3f..75c022526548 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1912,7 +1912,8 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, io_ring_submit_lock(ctx, issue_flags); node = io_rsrc_node_lookup(&ctx->file_table.data, fd); if (node) { - io_req_assign_rsrc_node(&req->file_node, node); + node->refs++; + req->file_node = node; req->flags |= io_slot_flags(node); file = io_slot_file(node); } diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index f80a77c4973f..107064c9c6fc 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1110,13 +1110,19 @@ inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, if (req->flags & REQ_F_BUF_NODE) return req->buf_node; + req->flags |= REQ_F_BUF_NODE; io_ring_submit_lock(ctx, issue_flags); node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index); - if (node) - io_req_assign_buf_node(req, node); + if (node) { + node->refs++; + req->buf_node = node; + io_ring_submit_unlock(ctx, issue_flags); + return node; + } + req->flags &= ~REQ_F_BUF_NODE; io_ring_submit_unlock(ctx, issue_flags); - return node; + return NULL; } int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index b52242852ff3..6008ad2e6d9e 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -127,20 +127,6 @@ static inline void io_req_put_rsrc_nodes(struct io_kiocb *req) } } -static inline void io_req_assign_rsrc_node(struct io_rsrc_node **dst_node, - struct io_rsrc_node *node) -{ - node->refs++; - *dst_node = node; -} - -static inline void io_req_assign_buf_node(struct io_kiocb *req, - struct io_rsrc_node *node) -{ - io_req_assign_rsrc_node(&req->buf_node, node); - req->flags |= REQ_F_BUF_NODE; -} - int io_files_update(struct io_kiocb *req, unsigned int issue_flags); int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); From ea76925614189bdcb6571e2ea8de68af409ebd56 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 19 Apr 2025 18:47:04 +0100 Subject: [PATCH 08/59] io_uring/rsrc: use unpin_user_folio We want to have a full folio to be left pinned but with only one reference, for that we "unpin" all but the first page with unpin_user_pages(), which can be confusing. There is a new helper to achieve that called unpin_user_folio(), so use that. Signed-off-by: Pavel Begunkov Reviewed-by: Anuj Gupta Link: https://lore.kernel.org/r/e0b2be8f9ea68f6b351ec3bb046f04f437f68491.1745083025.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 107064c9c6fc..6bf8dff4adf3 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -699,10 +699,9 @@ static bool io_coalesce_buffer(struct page ***pages, int *nr_pages, * The pages are bound to the folio, it doesn't * actually unpin them but drops all but one reference, * which is usually put down by io_buffer_unmap(). - * Note, needs a better helper. 
*/ if (data->nr_pages_head > 1) - unpin_user_pages(&page_array[1], data->nr_pages_head - 1); + unpin_user_folio(page_folio(new_array[0]), data->nr_pages_head - 1); j = data->nr_pages_head; nr_pages_left -= data->nr_pages_head; @@ -713,7 +712,7 @@ static bool io_coalesce_buffer(struct page ***pages, int *nr_pages, nr_unpin = min_t(unsigned int, nr_pages_left - 1, data->nr_pages_mid - 1); if (nr_unpin) - unpin_user_pages(&page_array[j+1], nr_unpin); + unpin_user_folio(page_folio(new_array[i]), nr_unpin); j += data->nr_pages_mid; nr_pages_left -= data->nr_pages_mid; } From 9cebcf7b0c38bca1b501d8716163aa254b230559 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 19 Apr 2025 18:47:05 +0100 Subject: [PATCH 09/59] io_uring/rsrc: clean up io_coalesce_buffer() We don't need special handling for the first page in io_coalesce_buffer(), move it inside the loop. Signed-off-by: Pavel Begunkov Reviewed-by: Anuj Gupta Link: https://lore.kernel.org/r/ad698cddc1eadb3d92a7515e95bb13f79420323d.1745083025.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 6bf8dff4adf3..5d25f3391650 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -685,37 +685,34 @@ static bool io_coalesce_buffer(struct page ***pages, int *nr_pages, struct io_imu_folio_data *data) { struct page **page_array = *pages, **new_array = NULL; - int nr_pages_left = *nr_pages, i, j; - int nr_folios = data->nr_folios; + unsigned nr_pages_left = *nr_pages; + unsigned nr_folios = data->nr_folios; + unsigned i, j; /* Store head pages only*/ - new_array = kvmalloc_array(nr_folios, sizeof(struct page *), - GFP_KERNEL); + new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL); if (!new_array) return false; - new_array[0] = compound_head(page_array[0]); - /* - * The pages are bound to the folio, it doesn't - * actually unpin them but drops all but one reference, - * which is usually put down by io_buffer_unmap(). - */ - if (data->nr_pages_head > 1) - unpin_user_folio(page_folio(new_array[0]), data->nr_pages_head - 1); + for (i = 0, j = 0; i < nr_folios; i++) { + struct page *p = compound_head(page_array[j]); + struct folio *folio = page_folio(p); + unsigned int nr; - j = data->nr_pages_head; - nr_pages_left -= data->nr_pages_head; - for (i = 1; i < nr_folios; i++) { - unsigned int nr_unpin; + WARN_ON_ONCE(i > 0 && p != page_array[j]); - new_array[i] = page_array[j]; - nr_unpin = min_t(unsigned int, nr_pages_left - 1, - data->nr_pages_mid - 1); - if (nr_unpin) - unpin_user_folio(page_folio(new_array[i]), nr_unpin); - j += data->nr_pages_mid; - nr_pages_left -= data->nr_pages_mid; + nr = i ? data->nr_pages_mid : data->nr_pages_head; + nr = min(nr, nr_pages_left); + /* Drop all but one ref, the entire folio will remain pinned. */ + if (nr > 1) + unpin_user_folio(folio, nr - 1); + j += nr; + nr_pages_left -= nr; + new_array[i] = p; } + + WARN_ON_ONCE(j != *nr_pages); + kvfree(page_array); *pages = new_array; *nr_pages = nr_folios; From be6bad57b217491733754ae8113eec94a90a2769 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 19 Apr 2025 18:47:06 +0100 Subject: [PATCH 10/59] io_uring/rsrc: remove null check on import WARN_ON_ONCE() checking imu for NULL in io_import_fixed() is in the hot path and too protective, it's time to get rid of it. 
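The check is redundant because the only way into io_import_fixed() already guarantees a valid imu; a sketch of that caller, assuming the current io_import_reg_buf() shape:

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	/* node->buf is known valid here, no need to re-check in the hot path */
	return io_import_fixed(ddir, iter, node->buf, buf_addr, len);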
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3782c01e311dbd474bca45aefaf79a2f2822fafb.1745083025.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 5d25f3391650..b4c5f3ee8855 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1058,8 +1058,6 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, size_t offset; int ret; - if (WARN_ON_ONCE(!imu)) - return -EFAULT; ret = validate_fixed_range(buf_addr, len, imu); if (unlikely(ret)) return ret; From 37d26edd6bb4984849a71e21c6824c961fcd19db Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 20 Apr 2025 10:31:15 +0100 Subject: [PATCH 11/59] io_uring/zcrx: remove duplicated freelist init Several lines below we already initialise the freelist, don't do it twice. Reviewed-by: David Wei Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/71b07c4e00d1db65899d1ed8d603d961fe7d1c28.1745141261.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index fe86606b9f30..79f58c94494b 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -245,9 +245,6 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, if (!area->freelist) goto err; - for (i = 0; i < nr_iovs; i++) - area->freelist[i] = i; - area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]), GFP_KERNEL | __GFP_ZERO); if (!area->user_refs) From a79154ae5df9e21dbacb1eb77fad984fd4c45cca Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 20 Apr 2025 10:31:16 +0100 Subject: [PATCH 12/59] io_uring/zcrx: move io_zcrx_iov_page We'll need io_zcrx_iov_page at the top to keep offset calculations closer together, move it there. 
Reviewed-by: David Wei Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/575617033a8b84a5985c7eb760b7121efdbe7e56.1745141261.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 79f58c94494b..ecfe080b24c5 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -31,6 +31,20 @@ static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp) return pp->mp_priv; } +static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov) +{ + struct net_iov_area *owner = net_iov_owner(niov); + + return container_of(owner, struct io_zcrx_area, nia); +} + +static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) +{ + struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); + + return area->pages[net_iov_idx(niov)]; +} + #define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, @@ -118,13 +132,6 @@ struct io_zcrx_args { static const struct memory_provider_ops io_uring_pp_zc_ops; -static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov) -{ - struct net_iov_area *owner = net_iov_owner(niov); - - return container_of(owner, struct io_zcrx_area, nia); -} - static inline atomic_t *io_get_user_counter(struct net_iov *niov) { struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); @@ -147,13 +154,6 @@ static void io_zcrx_get_niov_uref(struct net_iov *niov) atomic_inc(io_get_user_counter(niov)); } -static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) -{ - struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); - - return area->pages[net_iov_idx(niov)]; -} - static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, struct io_uring_zcrx_ifq_reg *reg, struct io_uring_region_desc *rd) From 59bc1ab922bbb36558292c204e56ab951e833384 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 20 Apr 2025 10:31:17 +0100 Subject: [PATCH 13/59] io_uring/zcrx: remove sqe->file_index check sqe->file_index and sqe->zcrx_ifq_idx are aliased, so even though io_recvzc_prep() has all necessary handling and checking for ->zcrx_ifq_idx, it's already rejected a couple of lines above. It's not a real problem, however, as we can only have one ifq at the moment, whose index is always zero, and it returns correct error codes to the user. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b51acedd326d1a87bf53e33303e6fc87661a3e3f.1745141261.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 5f1a519d1fc6..782f8e76c5c7 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1185,8 +1185,7 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc); unsigned ifq_idx; - if (unlikely(sqe->file_index || sqe->addr2 || sqe->addr || - sqe->addr3)) + if (unlikely(sqe->addr2 || sqe->addr || sqe->addr3)) return -EINVAL; ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx); From 77231d4e46555d30289b1909c2a2f26bcf00f08c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 20 Apr 2025 10:31:18 +0100 Subject: [PATCH 14/59] io_uring/zcrx: let zcrx choose region for mmaping In preparation for adding multiple ifqs, add a helper returning a region for mmaping the zcrx refill queue. For now it's trivial and returns the same ctx global ->zcrx_region.
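To illustrate the offset encoding this prepares for (an illustration only; it assumes the id lands in the low offset bits, as in the memmap.c hunk below):

	/* zcrx id 2 with IORING_OFF_ZCRX_SHIFT == 16:
	 * 0x30000000 | (2 << 16) == 0x30020000
	 */
	id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_ZCRX_SHIFT;
	return io_zcrx_get_region(ctx, id);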
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d9006e2ef8cd5e5b337c2ba2228ede8409eb60f2.1745141261.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/memmap.c | 11 +++++++---- io_uring/memmap.h | 2 ++ io_uring/zcrx.c | 10 ++++++++++ io_uring/zcrx.h | 7 +++++++ 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 76fcc79656b0..5cf3f23e751b 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -13,6 +13,7 @@ #include "memmap.h" #include "kbuf.h" #include "rsrc.h" +#include "zcrx.h" static void *io_mem_alloc_compound(struct page **pages, int nr_pages, size_t size, gfp_t gfp) @@ -258,7 +259,8 @@ static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx, loff_t pgoff) { loff_t offset = pgoff << PAGE_SHIFT; - unsigned int bgid; + unsigned int id; + switch (offset & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: @@ -267,12 +269,13 @@ static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx, case IORING_OFF_SQES: return &ctx->sq_region; case IORING_OFF_PBUF_RING: - bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; - return io_pbuf_get_region(ctx, bgid); + id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; + return io_pbuf_get_region(ctx, id); case IORING_MAP_OFF_PARAM_REGION: return &ctx->param_region; case IORING_MAP_OFF_ZCRX_REGION: - return &ctx->zcrx_region; + id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_ZCRX_SHIFT; + return io_zcrx_get_region(ctx, id); } return NULL; } diff --git a/io_uring/memmap.h b/io_uring/memmap.h index dad0aa5b1b45..24afb298e974 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -4,6 +4,8 @@ #define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL #define IORING_MAP_OFF_ZCRX_REGION 0x30000000ULL +#define IORING_OFF_ZCRX_SHIFT 16 + struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); #ifndef CONFIG_MMU diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index ecfe080b24c5..b7e65ae413b6 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -338,6 +338,16 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) kfree(ifq); } +struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, + unsigned int id) +{ + lockdep_assert_held(&ctx->mmap_lock); + + if (id != 0) + return NULL; + return &ctx->zcrx_region; +} + int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg) { diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index f2bc811f022c..43134e5c9213 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -49,6 +49,8 @@ void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx); int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, struct socket *sock, unsigned int flags, unsigned issue_flags, unsigned int *len); +struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, + unsigned int id); #else static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg) @@ -67,6 +69,11 @@ static inline int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, { return -EOPNOTSUPP; } +static inline struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, + unsigned int id) +{ + return NULL; +} #endif int io_recvzc(struct io_kiocb *req, unsigned int issue_flags); From 632b3186726984319e2337987de86a442407f30e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 20 Apr 2025 10:31:19 +0100 Subject: [PATCH 15/59] io_uring/zcrx: move zcrx region to struct io_zcrx_ifq Refill queue region is a part 
of zcrx and should stay in struct io_zcrx_ifq. We can't have multiple queues without it, so move it there. As a result there is no context global zcrx region anymore, and the region is looked up together with its ifq. To protect a concurrent mmap from seeing an inconsistent region we were protecting changes to ->zcrx_region with mmap_lock, but now it protects the publishing of the ifq. Reviewed-by: David Wei Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/24f1a728fc03d0166f16d099575457e10d9d90f2.1745141261.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 -- io_uring/zcrx.c | 20 ++++++++++++-------- io_uring/zcrx.h | 1 + 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3b467879bca8..06d722289fc5 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -448,8 +448,6 @@ struct io_ring_ctx { struct io_mapped_region ring_region; /* used for optimised request parameter and wait argument passing */ struct io_mapped_region param_region; - /* just one zcrx per ring for now, will move to io_zcrx_ifq eventually */ - struct io_mapped_region zcrx_region; }; /* diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index b7e65ae413b6..033284b695c7 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -167,12 +167,11 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, if (size > rd->size) return -EINVAL; - ret = io_create_region_mmap_safe(ifq->ctx, &ifq->ctx->zcrx_region, rd, - IORING_MAP_OFF_ZCRX_REGION); + ret = io_create_region(ifq->ctx, &ifq->region, rd, IORING_MAP_OFF_ZCRX_REGION); if (ret < 0) return ret; - ptr = io_region_get_ptr(&ifq->ctx->zcrx_region); + ptr = io_region_get_ptr(&ifq->region); ifq->rq_ring = (struct io_uring *)ptr; ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off); return 0; @@ -180,7 +179,10 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) { - io_free_region(ifq->ctx, &ifq->ctx->zcrx_region); + if (WARN_ON_ONCE(ifq->ctx->ifq)) + return; + + io_free_region(ifq->ctx, &ifq->region); ifq->rq_ring = NULL; ifq->rqes = NULL; } @@ -343,9 +345,9 @@ struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, { lockdep_assert_held(&ctx->mmap_lock); - if (id != 0) + if (id != 0 || !ctx->ifq) return NULL; - return &ctx->zcrx_region; + return &ctx->ifq->region; } int io_register_zcrx_ifq(struct io_ring_ctx *ctx, @@ -433,7 +435,8 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, ret = -EFAULT; goto err; } - ctx->ifq = ifq; + scoped_guard(mutex, &ctx->mmap_lock) + ctx->ifq = ifq; return 0; err: io_zcrx_ifq_free(ifq); @@ -449,7 +452,8 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) if (!ifq) return; - ctx->ifq = NULL; + scoped_guard(mutex, &ctx->mmap_lock) + ctx->ifq = NULL; io_zcrx_ifq_free(ifq); } diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 43134e5c9213..e3c7c4e647f1 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -39,6 +39,7 @@ struct io_zcrx_ifq { netdevice_tracker netdev_tracker; spinlock_t lock; struct mutex dma_lock; + struct io_mapped_region region; }; #if defined(CONFIG_IO_URING_ZCRX) From 76f1cc98b23cefd1f0ae90c51f1fb837e5f46528 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 20 Apr 2025 10:31:20 +0100 Subject: [PATCH 16/59] io_uring/zcrx: add support for multiple ifqs Allow the user to register multiple ifqs / zcrx contexts.
With that we can use multiple interfaces / interface queues in a single io_uring instance. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/668b03bee03b5216564482edcfefbc2ee337dd30.1745141261.git.asml.silence@gmail.com [axboe: fold in fix] Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 +-- io_uring/io_uring.c | 3 +- io_uring/net.c | 5 +-- io_uring/zcrx.c | 73 +++++++++++++++++++++++----------- 4 files changed, 56 insertions(+), 30 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 06d722289fc5..7e23e993280e 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -40,8 +40,6 @@ enum io_uring_cmd_flags { IO_URING_F_TASK_DEAD = (1 << 13), }; -struct io_zcrx_ifq; - struct io_wq_work_node { struct io_wq_work_node *next; }; @@ -394,7 +392,8 @@ struct io_ring_ctx { struct wait_queue_head poll_wq; struct io_restriction restrictions; - struct io_zcrx_ifq *ifq; + /* Stores zcrx object pointers of type struct io_zcrx_ifq */ + struct xarray zcrx_ctxs; u32 pers_next; struct xarray personalities; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 75c022526548..0dc6c2f1295e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -359,6 +359,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->tctx_list); ctx->submit_state.free_list.next = NULL; INIT_HLIST_HEAD(&ctx->waitid_list); + xa_init_flags(&ctx->zcrx_ctxs, XA_FLAGS_ALLOC); #ifdef CONFIG_FUTEX INIT_HLIST_HEAD(&ctx->futex_list); #endif @@ -2889,7 +2890,7 @@ static __cold void io_ring_exit_work(struct work_struct *work) io_cqring_overflow_kill(ctx); mutex_unlock(&ctx->uring_lock); } - if (ctx->ifq) { + if (!xa_empty(&ctx->zcrx_ctxs)) { mutex_lock(&ctx->uring_lock); io_shutdown_zcrx_ifqs(ctx); mutex_unlock(&ctx->uring_lock); diff --git a/io_uring/net.c b/io_uring/net.c index 782f8e76c5c7..b3a643675ce8 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1189,11 +1189,10 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx); - if (ifq_idx != 0) - return -EINVAL; - zc->ifq = req->ctx->ifq; + zc->ifq = xa_load(&req->ctx->zcrx_ctxs, ifq_idx); if (!zc->ifq) return -EINVAL; + zc->len = READ_ONCE(sqe->len); zc->flags = READ_ONCE(sqe->ioprio); zc->msg_flags = READ_ONCE(sqe->msg_flags); diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 033284b695c7..22f420d6fbb9 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -156,8 +156,10 @@ static void io_zcrx_get_niov_uref(struct net_iov *niov) static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, struct io_uring_zcrx_ifq_reg *reg, - struct io_uring_region_desc *rd) + struct io_uring_region_desc *rd, + u32 id) { + u64 mmap_offset; size_t off, size; void *ptr; int ret; @@ -167,7 +169,10 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, if (size > rd->size) return -EINVAL; - ret = io_create_region(ifq->ctx, &ifq->region, rd, IORING_MAP_OFF_ZCRX_REGION); + mmap_offset = IORING_MAP_OFF_ZCRX_REGION; + mmap_offset += id << IORING_OFF_PBUF_SHIFT; + + ret = io_create_region(ifq->ctx, &ifq->region, rd, mmap_offset); if (ret < 0) return ret; @@ -179,9 +184,6 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) { - if (WARN_ON_ONCE(ifq->ctx->ifq)) - return; - io_free_region(ifq->ctx, &ifq->region); ifq->rq_ring = NULL; ifq->rqes = NULL; @@ -343,11 +345,11 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq 
*ifq) struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, unsigned int id) { + struct io_zcrx_ifq *ifq = xa_load(&ctx->zcrx_ctxs, id); + lockdep_assert_held(&ctx->mmap_lock); - if (id != 0 || !ctx->ifq) - return NULL; - return &ctx->ifq->region; + return ifq ? &ifq->region : NULL; } int io_register_zcrx_ifq(struct io_ring_ctx *ctx, @@ -359,6 +361,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_region_desc rd; struct io_zcrx_ifq *ifq; int ret; + u32 id; /* * 1. Interface queue allocation. @@ -371,8 +374,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN && ctx->flags & IORING_SETUP_CQE32)) return -EINVAL; - if (ctx->ifq) - return -EBUSY; if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) @@ -396,7 +397,14 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, if (!ifq) return -ENOMEM; - ret = io_allocate_rbuf_ring(ifq, ®, &rd); + scoped_guard(mutex, &ctx->mmap_lock) { + /* preallocate id */ + ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL); + if (ret) + goto ifq_free; + } + + ret = io_allocate_rbuf_ring(ifq, ®, &rd, id); if (ret) goto err; @@ -428,6 +436,14 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, reg.offsets.rqes = sizeof(struct io_uring); reg.offsets.head = offsetof(struct io_uring, head); reg.offsets.tail = offsetof(struct io_uring, tail); + reg.zcrx_id = id; + + scoped_guard(mutex, &ctx->mmap_lock) { + /* publish ifq */ + ret = -ENOMEM; + if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL)) + goto err; + } if (copy_to_user(arg, ®, sizeof(reg)) || copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) || @@ -435,26 +451,34 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, ret = -EFAULT; goto err; } - scoped_guard(mutex, &ctx->mmap_lock) - ctx->ifq = ifq; return 0; err: + scoped_guard(mutex, &ctx->mmap_lock) + xa_erase(&ctx->zcrx_ctxs, id); +ifq_free: io_zcrx_ifq_free(ifq); return ret; } void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) { - struct io_zcrx_ifq *ifq = ctx->ifq; + struct io_zcrx_ifq *ifq; + unsigned long id; lockdep_assert_held(&ctx->uring_lock); - if (!ifq) - return; + while (1) { + scoped_guard(mutex, &ctx->mmap_lock) { + ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT); + if (ifq) + xa_erase(&ctx->zcrx_ctxs, id); + } + if (!ifq) + break; + io_zcrx_ifq_free(ifq); + } - scoped_guard(mutex, &ctx->mmap_lock) - ctx->ifq = NULL; - io_zcrx_ifq_free(ifq); + xa_destroy(&ctx->zcrx_ctxs); } static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area) @@ -511,12 +535,15 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq) void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) { + struct io_zcrx_ifq *ifq; + unsigned long index; + lockdep_assert_held(&ctx->uring_lock); - if (!ctx->ifq) - return; - io_zcrx_scrub(ctx->ifq); - io_close_queue(ctx->ifq); + xa_for_each(&ctx->zcrx_ctxs, index, ifq) { + io_zcrx_scrub(ifq); + io_close_queue(ifq); + } } static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq) From 62f666df765ecaf9cc1892ca056d5c071a335d85 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Apr 2025 12:31:16 +0100 Subject: [PATCH 17/59] io_uring/eventfd: dedup signalling helpers Consolidate io_eventfd_flush_signal() and io_eventfd_signal(). Not much of a difference for now, but it prepares it for following changes. 
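The two call sites after consolidation, from the io_uring.c hunk below:

	/* CQ commit flush: only signal if the CQ tail actually moved */
	io_eventfd_signal(ctx, true);

	/* local task_work added: plain wakeup, no CQE bookkeeping */
	io_eventfd_signal(ctx, false);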
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/5beecd4da65d8d2d83df499196f84b329387f6a2.1745493845.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/eventfd.c | 26 +++++++++----------------- io_uring/eventfd.h | 3 +-- io_uring/io_uring.c | 4 ++-- 3 files changed, 12 insertions(+), 21 deletions(-) diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c index 100d5da94cb9..a9da2d0d7510 100644 --- a/io_uring/eventfd.c +++ b/io_uring/eventfd.c @@ -112,23 +112,16 @@ static struct io_ev_fd *io_eventfd_grab(struct io_ring_ctx *ctx) return NULL; } -void io_eventfd_signal(struct io_ring_ctx *ctx) +void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event) { + bool skip = false, put_ref = true; struct io_ev_fd *ev_fd; ev_fd = io_eventfd_grab(ctx); - if (ev_fd) - io_eventfd_release(ev_fd, __io_eventfd_signal(ev_fd)); -} - -void io_eventfd_flush_signal(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd; - - ev_fd = io_eventfd_grab(ctx); - if (ev_fd) { - bool skip, put_ref = true; + if (!ev_fd) + return; + if (cqe_event) { /* * Eventfd should only get triggered when at least one event * has been posted. Some applications rely on the eventfd @@ -142,12 +135,11 @@ void io_eventfd_flush_signal(struct io_ring_ctx *ctx) skip = ctx->cached_cq_tail == ev_fd->last_cq_tail; ev_fd->last_cq_tail = ctx->cached_cq_tail; spin_unlock(&ctx->completion_lock); - - if (!skip) - put_ref = __io_eventfd_signal(ev_fd); - - io_eventfd_release(ev_fd, put_ref); } + + if (!skip) + put_ref = __io_eventfd_signal(ev_fd); + io_eventfd_release(ev_fd, put_ref); } int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, diff --git a/io_uring/eventfd.h b/io_uring/eventfd.h index d394f49c6321..e2f1985c2cf9 100644 --- a/io_uring/eventfd.h +++ b/io_uring/eventfd.h @@ -4,5 +4,4 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, unsigned int eventfd_async); int io_eventfd_unregister(struct io_ring_ctx *ctx); -void io_eventfd_flush_signal(struct io_ring_ctx *ctx); -void io_eventfd_signal(struct io_ring_ctx *ctx); +void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0dc6c2f1295e..2e4d8e76316a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -584,7 +584,7 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) if (ctx->drain_active) io_queue_deferred(ctx); if (ctx->has_evfd) - io_eventfd_flush_signal(ctx); + io_eventfd_signal(ctx, true); } static inline void __io_cq_lock(struct io_ring_ctx *ctx) @@ -1199,7 +1199,7 @@ static void io_req_local_work_add(struct io_kiocb *req, unsigned flags) if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); if (ctx->has_evfd) - io_eventfd_signal(ctx); + io_eventfd_signal(ctx, false); } nr_wait = atomic_read(&ctx->cq_wait_nr); From da01f60f8ad144a8a0844833a8d0f0005b0a7c51 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Apr 2025 12:31:17 +0100 Subject: [PATCH 18/59] io_uring/eventfd: clean up rcu locking Conditional locking is never welcome if there are better options. Move rcu locking into io_eventfd_signal(), make it unconditional and use guards. It also helps with sparse warnings. 
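For reference, the guard-based entry this converts to (condensed from the diff below; guard(rcu)() drops the RCU read lock automatically on every return path):

	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
		return;

	guard(rcu)();
	ev_fd = io_eventfd_grab(ctx);
	if (!ev_fd)
		return;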
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/91a925e708ca8a5aa7fee61f96d29b24ea9adeaf.1745493845.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/eventfd.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c index a9da2d0d7510..8c2835ac17a0 100644 --- a/io_uring/eventfd.c +++ b/io_uring/eventfd.c @@ -47,13 +47,6 @@ static void io_eventfd_do_signal(struct rcu_head *rcu) io_eventfd_put(ev_fd); } -static void io_eventfd_release(struct io_ev_fd *ev_fd, bool put_ref) -{ - if (put_ref) - io_eventfd_put(ev_fd); - rcu_read_unlock(); -} - /* * Returns true if the caller should put the ev_fd reference, false if not. */ @@ -89,11 +82,6 @@ static struct io_ev_fd *io_eventfd_grab(struct io_ring_ctx *ctx) { struct io_ev_fd *ev_fd; - if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) - return NULL; - - rcu_read_lock(); - /* * rcu_dereference ctx->io_ev_fd once and use it for both for checking * and eventfd_signal @@ -108,15 +96,18 @@ static struct io_ev_fd *io_eventfd_grab(struct io_ring_ctx *ctx) if (io_eventfd_trigger(ev_fd) && refcount_inc_not_zero(&ev_fd->refs)) return ev_fd; - rcu_read_unlock(); return NULL; } void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event) { - bool skip = false, put_ref = true; + bool skip = false; struct io_ev_fd *ev_fd; + if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) + return; + + guard(rcu)(); ev_fd = io_eventfd_grab(ctx); if (!ev_fd) return; @@ -137,9 +128,8 @@ void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event) spin_unlock(&ctx->completion_lock); } - if (!skip) - put_ref = __io_eventfd_signal(ev_fd); - io_eventfd_release(ev_fd, put_ref); + if (skip || __io_eventfd_signal(ev_fd)) + io_eventfd_put(ev_fd); } int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, From f6da4fee69860d4a02d14b016021ffb516a25f38 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Apr 2025 12:31:18 +0100 Subject: [PATCH 19/59] io_uring/eventfd: open code io_eventfd_grab() io_eventfd_grab() doesn't help with understanding the path, it'll be simpler to keep the helper open coded. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/5cb53ce3876c2819db9e8055cf41dca4398521db.1745493845.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/eventfd.c | 40 ++++++++++------------------------------ 1 file changed, 10 insertions(+), 30 deletions(-) diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c index 8c2835ac17a0..78f8ab7db104 100644 --- a/io_uring/eventfd.c +++ b/io_uring/eventfd.c @@ -65,38 +65,11 @@ static bool __io_eventfd_signal(struct io_ev_fd *ev_fd) /* * Trigger if eventfd_async isn't set, or if it's set and the caller is - * an async worker. If ev_fd isn't valid, obviously return false. + * an async worker. */ static bool io_eventfd_trigger(struct io_ev_fd *ev_fd) { - if (ev_fd) - return !ev_fd->eventfd_async || io_wq_current_is_worker(); - return false; -} - -/* - * On success, returns with an ev_fd reference grabbed and the RCU read - * lock held. - */ -static struct io_ev_fd *io_eventfd_grab(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd; - - /* - * rcu_dereference ctx->io_ev_fd once and use it for both for checking - * and eventfd_signal - */ - ev_fd = rcu_dereference(ctx->io_ev_fd); - - /* - * Check again if ev_fd exists in case an io_eventfd_unregister call - * completed between the NULL check of ctx->io_ev_fd at the start of - * the function and rcu_read_lock.
- */ - if (io_eventfd_trigger(ev_fd) && refcount_inc_not_zero(&ev_fd->refs)) - return ev_fd; - - return NULL; + return !ev_fd->eventfd_async || io_wq_current_is_worker(); } void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event) @@ -108,9 +81,16 @@ void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event) return; guard(rcu)(); - ev_fd = io_eventfd_grab(ctx); + ev_fd = rcu_dereference(ctx->io_ev_fd); + /* + * Check again if ev_fd exists in case an io_eventfd_unregister call + * completed between the NULL check of ctx->io_ev_fd at the start of + * the function and rcu_read_lock. + */ if (!ev_fd) return; + if (!io_eventfd_trigger(ev_fd) || !refcount_inc_not_zero(&ev_fd->refs)) + return; if (cqe_event) { /* From 27d2fed790ce6407e321e89aac3c8c0e28986fff Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 28 Apr 2025 13:52:32 +0100 Subject: [PATCH 20/59] io_uring: delete misleading comment in io_fill_cqe_aux() io_fill_cqe_aux() doesn't overflow completions; however, it might fail them and let the caller handle it. Remove the comment, which doesn't make any sense. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/021aa8c1d8f20ef2b66da6aeabb6b511938fd2c5.1745843119.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2e4d8e76316a..703251f6f4d8 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -814,11 +814,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, ctx->cq_extra++; - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. - */ if (likely(io_get_cqe(ctx, &cqe))) { WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); From 91db6edc573bf238c277602b2ea4b4f4688fdedc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 28 Apr 2025 13:52:33 +0100 Subject: [PATCH 21/59] io_uring/cmd: move net cmd into a separate file We keep the socket io_uring command implementation in io_uring/uring_cmd.c. Separate it from generic command code into a separate file.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/747d0519a2255bd055ae76b691d38d2b4c311001.1745843119.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/Makefile | 1 + io_uring/cmd_net.c | 83 ++++++++++++++++++++++++++++++++++++++++++++ io_uring/uring_cmd.c | 83 -------------------------------------------- 3 files changed, 84 insertions(+), 83 deletions(-) create mode 100644 io_uring/cmd_net.c diff --git a/io_uring/Makefile b/io_uring/Makefile index 3e28a741ca15..75e0ca795685 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_EPOLL) += epoll.o obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o +obj-$(CONFIG_NET) += cmd_net.o diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c new file mode 100644 index 000000000000..e99170c7d41a --- /dev/null +++ b/io_uring/cmd_net.c @@ -0,0 +1,83 @@ +#include +#include +#include + +#include "uring_cmd.h" + +static inline int io_uring_cmd_getsockopt(struct socket *sock, + struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + const struct io_uring_sqe *sqe = cmd->sqe; + bool compat = !!(issue_flags & IO_URING_F_COMPAT); + int optlen, optname, level, err; + void __user *optval; + + level = READ_ONCE(sqe->level); + if (level != SOL_SOCKET) + return -EOPNOTSUPP; + + optval = u64_to_user_ptr(READ_ONCE(sqe->optval)); + optname = READ_ONCE(sqe->optname); + optlen = READ_ONCE(sqe->optlen); + + err = do_sock_getsockopt(sock, compat, level, optname, + USER_SOCKPTR(optval), + KERNEL_SOCKPTR(&optlen)); + if (err) + return err; + + /* On success, return optlen */ + return optlen; +} + +static inline int io_uring_cmd_setsockopt(struct socket *sock, + struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + const struct io_uring_sqe *sqe = cmd->sqe; + bool compat = !!(issue_flags & IO_URING_F_COMPAT); + int optname, optlen, level; + void __user *optval; + sockptr_t optval_s; + + optval = u64_to_user_ptr(READ_ONCE(sqe->optval)); + optname = READ_ONCE(sqe->optname); + optlen = READ_ONCE(sqe->optlen); + level = READ_ONCE(sqe->level); + optval_s = USER_SOCKPTR(optval); + + return do_sock_setsockopt(sock, compat, level, optname, optval_s, + optlen); +} + +int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct socket *sock = cmd->file->private_data; + struct sock *sk = sock->sk; + struct proto *prot = READ_ONCE(sk->sk_prot); + int ret, arg = 0; + + if (!prot || !prot->ioctl) + return -EOPNOTSUPP; + + switch (cmd->cmd_op) { + case SOCKET_URING_OP_SIOCINQ: + ret = prot->ioctl(sk, SIOCINQ, &arg); + if (ret) + return ret; + return arg; + case SOCKET_URING_OP_SIOCOUTQ: + ret = prot->ioctl(sk, SIOCOUTQ, &arg); + if (ret) + return ret; + return arg; + case SOCKET_URING_OP_GETSOCKOPT: + return io_uring_cmd_getsockopt(sock, cmd, issue_flags); + case SOCKET_URING_OP_SETSOCKOPT: + return io_uring_cmd_setsockopt(sock, cmd, issue_flags); + default: + return -EOPNOTSUPP; + } +} +EXPORT_SYMBOL_GPL(io_uring_cmd_sock); diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index a9ea7d29cdd9..34b450c78e2b 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -3,13 +3,10 @@ #include #include #include -#include #include #include -#include #include -#include #include "io_uring.h" #include "alloc_cache.h" @@ -302,83 +299,3 @@ void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd) io_req_queue_iowq(req); } - -static inline int io_uring_cmd_getsockopt(struct socket *sock, - struct io_uring_cmd *cmd, - unsigned int issue_flags) -{ 
- const struct io_uring_sqe *sqe = cmd->sqe; - bool compat = !!(issue_flags & IO_URING_F_COMPAT); - int optlen, optname, level, err; - void __user *optval; - - level = READ_ONCE(sqe->level); - if (level != SOL_SOCKET) - return -EOPNOTSUPP; - - optval = u64_to_user_ptr(READ_ONCE(sqe->optval)); - optname = READ_ONCE(sqe->optname); - optlen = READ_ONCE(sqe->optlen); - - err = do_sock_getsockopt(sock, compat, level, optname, - USER_SOCKPTR(optval), - KERNEL_SOCKPTR(&optlen)); - if (err) - return err; - - /* On success, return optlen */ - return optlen; -} - -static inline int io_uring_cmd_setsockopt(struct socket *sock, - struct io_uring_cmd *cmd, - unsigned int issue_flags) -{ - const struct io_uring_sqe *sqe = cmd->sqe; - bool compat = !!(issue_flags & IO_URING_F_COMPAT); - int optname, optlen, level; - void __user *optval; - sockptr_t optval_s; - - optval = u64_to_user_ptr(READ_ONCE(sqe->optval)); - optname = READ_ONCE(sqe->optname); - optlen = READ_ONCE(sqe->optlen); - level = READ_ONCE(sqe->level); - optval_s = USER_SOCKPTR(optval); - - return do_sock_setsockopt(sock, compat, level, optname, optval_s, - optlen); -} - -#if defined(CONFIG_NET) -int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) -{ - struct socket *sock = cmd->file->private_data; - struct sock *sk = sock->sk; - struct proto *prot = READ_ONCE(sk->sk_prot); - int ret, arg = 0; - - if (!prot || !prot->ioctl) - return -EOPNOTSUPP; - - switch (cmd->cmd_op) { - case SOCKET_URING_OP_SIOCINQ: - ret = prot->ioctl(sk, SIOCINQ, &arg); - if (ret) - return ret; - return arg; - case SOCKET_URING_OP_SIOCOUTQ: - ret = prot->ioctl(sk, SIOCOUTQ, &arg); - if (ret) - return ret; - return arg; - case SOCKET_URING_OP_GETSOCKOPT: - return io_uring_cmd_getsockopt(sock, cmd, issue_flags); - case SOCKET_URING_OP_SETSOCKOPT: - return io_uring_cmd_setsockopt(sock, cmd, issue_flags); - default: - return -EOPNOTSUPP; - } -} -EXPORT_SYMBOL_GPL(io_uring_cmd_sock); -#endif From d760d3f59f0d8d0df2895db30d36cf23106d6b05 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 May 2025 13:17:14 +0100 Subject: [PATCH 22/59] io_uring/zcrx: improve area validation A dmabuf-backed area will take an offset instead of addresses, and io_buffer_validate() is not flexible enough to accommodate that. It also takes an iovec, which may truncate the u64 length zcrx takes. Add a new helper function for validation.
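To illustrate the truncation concern with hypothetical values (iov_len is a __kernel_size_t, which is 32 bits wide on 32-bit kernels):

	u64 len = 0x100000001ULL;	/* just over 4GiB */
	struct iovec iov = {
		.iov_len = len,		/* silently becomes 1 on 32-bit */
	};

Taking the length as a raw u64, as the new helper does, avoids that intermediate narrowing.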
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0b3b735391a0a8f8971bf0121c19765131fddd3b.1746097431.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 27 +++++++++++++++------------ io_uring/rsrc.h | 2 +- io_uring/zcrx.c | 7 +++---- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index b4c5f3ee8855..1657d775c8ba 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -80,10 +80,21 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) return 0; } -int io_buffer_validate(struct iovec *iov) +int io_validate_user_buf_range(u64 uaddr, u64 ulen) { - unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); + unsigned long tmp, base = (unsigned long)uaddr; + unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen); + /* arbitrary limit, but we need something */ + if (ulen > SZ_1G || !ulen) + return -EFAULT; + if (check_add_overflow(base, acct_len, &tmp)) + return -EOVERFLOW; + return 0; +} + +static int io_buffer_validate(struct iovec *iov) +{ /* * Don't impose further limits on the size and buffer * constraints here, we'll -EINVAL later when IO is @@ -91,17 +102,9 @@ int io_buffer_validate(struct iovec *iov) */ if (!iov->iov_base) return iov->iov_len ? -EFAULT : 0; - if (!iov->iov_len) - return -EFAULT; - /* arbitrary limit, but we need something */ - if (iov->iov_len > SZ_1G) - return -EFAULT; - - if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) - return -EOVERFLOW; - - return 0; + return io_validate_user_buf_range((unsigned long)iov->iov_base, + iov->iov_len); } static void io_release_ubuf(void *priv) diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 6008ad2e6d9e..2818aa0d0472 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -83,7 +83,7 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, unsigned size, unsigned type); int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type); -int io_buffer_validate(struct iovec *iov); +int io_validate_user_buf_range(u64 uaddr, u64 ulen); bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, struct io_imu_folio_data *data); diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 22f420d6fbb9..5e918587fdc5 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -209,7 +209,6 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, { struct io_zcrx_area *area; int i, ret, nr_pages, nr_iovs; - struct iovec iov; if (area_reg->flags || area_reg->rq_area_token) return -EINVAL; @@ -218,11 +217,11 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) return -EINVAL; - iov.iov_base = u64_to_user_ptr(area_reg->addr); - iov.iov_len = area_reg->len; - ret = io_buffer_validate(&iov); + ret = io_validate_user_buf_range(area_reg->addr, area_reg->len); if (ret) return ret; + if (!area_reg->addr) + return -EFAULT; ret = -ENOMEM; area = kzalloc(sizeof(*area), GFP_KERNEL); From 6c9589aa08471f8984cdb5e743d2a2c048dc2403 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 May 2025 13:17:15 +0100 Subject: [PATCH 23/59] io_uring/zcrx: resolve netdev before area creation Some area types will require a valid struct device to be created, so resolve netdev and struct device before creating an area. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ac8c1482be22acfe9ca788d2c3ce31b7451ce488.1746097431.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 5e918587fdc5..b5335dd4f5b1 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -395,6 +395,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, ifq = io_zcrx_ifq_alloc(ctx); if (!ifq) return -ENOMEM; + ifq->rq_entries = reg.rq_entries; scoped_guard(mutex, &ctx->mmap_lock) { /* preallocate id */ @@ -407,24 +408,24 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, if (ret) goto err; + ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx, + &ifq->netdev_tracker, GFP_KERNEL); + if (!ifq->netdev) { + ret = -ENODEV; + goto err; + } + + ifq->dev = ifq->netdev->dev.parent; + if (!ifq->dev) { + ret = -EOPNOTSUPP; + goto err; + } + get_device(ifq->dev); + ret = io_zcrx_create_area(ifq, &ifq->area, &area); if (ret) goto err; - ifq->rq_entries = reg.rq_entries; - - ret = -ENODEV; - ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx, - &ifq->netdev_tracker, GFP_KERNEL); - if (!ifq->netdev) - goto err; - - ifq->dev = ifq->netdev->dev.parent; - ret = -EOPNOTSUPP; - if (!ifq->dev) - goto err; - get_device(ifq->dev); - mp_param.mp_ops = &io_uring_pp_zc_ops; mp_param.mp_priv = ifq; ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param); From 782dfa329ac9d1b5ca7b6df56a7696bac58cb829 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 May 2025 13:17:16 +0100 Subject: [PATCH 24/59] io_uring/zcrx: split out memory holders from area In the data path users of struct io_zcrx_area don't need to know what kind of memory it's backed by. Only keep the generic bits in there and split out the memory-type-dependent fields into a new structure. It also logically separates the step that actually imports the memory, e.g. pinning user pages, from the generic area initialisation.
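A condensed sketch of the end state (the diff below has the full layout):

	struct io_zcrx_mem {
		unsigned long	size;
		struct page	**pages;	/* memory-type specifics... */
		unsigned long	nr_folios;
	};

	struct io_zcrx_area {
		...
		struct io_zcrx_mem	mem;	/* ...now sit behind one field */
	};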
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b60fc09c76921bf69e77eb17e07eb4decedb3bf4.1746097431.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 71 ++++++++++++++++++++++++++++++++----------------- io_uring/zcrx.h | 11 ++++++-- 2 files changed, 56 insertions(+), 26 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index b5335dd4f5b1..8d4cfd957e38 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -26,6 +26,8 @@ #include "zcrx.h" #include "rsrc.h" +#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) + static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp) { return pp->mp_priv; @@ -42,10 +44,43 @@ static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) { struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); - return area->pages[net_iov_idx(niov)]; + return area->mem.pages[net_iov_idx(niov)]; } -#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) +static void io_release_area_mem(struct io_zcrx_mem *mem) +{ + if (mem->pages) { + unpin_user_pages(mem->pages, mem->nr_folios); + kvfree(mem->pages); + } +} + +static int io_import_area(struct io_zcrx_ifq *ifq, + struct io_zcrx_mem *mem, + struct io_uring_zcrx_area_reg *area_reg) +{ + struct page **pages; + int nr_pages; + int ret; + + ret = io_validate_user_buf_range(area_reg->addr, area_reg->len); + if (ret) + return ret; + if (!area_reg->addr) + return -EFAULT; + if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) + return -EINVAL; + + pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, + &nr_pages); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + mem->pages = pages; + mem->nr_folios = nr_pages; + mem->size = area_reg->len; + return 0; +} static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area, int nr_mapped) @@ -84,8 +119,8 @@ static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) struct net_iov *niov = &area->nia.niovs[i]; dma_addr_t dma; - dma = dma_map_page_attrs(ifq->dev, area->pages[i], 0, PAGE_SIZE, - DMA_FROM_DEVICE, IO_DMA_ATTR); + dma = dma_map_page_attrs(ifq->dev, area->mem.pages[i], 0, + PAGE_SIZE, DMA_FROM_DEVICE, IO_DMA_ATTR); if (dma_mapping_error(ifq->dev, dma)) break; if (net_mp_niov_set_dma_addr(niov, dma)) { @@ -192,14 +227,11 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) static void io_zcrx_free_area(struct io_zcrx_area *area) { io_zcrx_unmap_area(area->ifq, area); + io_release_area_mem(&area->mem); kvfree(area->freelist); kvfree(area->nia.niovs); kvfree(area->user_refs); - if (area->pages) { - unpin_user_pages(area->pages, area->nr_folios); - kvfree(area->pages); - } kfree(area); } @@ -208,36 +240,27 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, struct io_uring_zcrx_area_reg *area_reg) { struct io_zcrx_area *area; - int i, ret, nr_pages, nr_iovs; + unsigned nr_iovs; + int i, ret; if (area_reg->flags || area_reg->rq_area_token) return -EINVAL; if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1]) return -EINVAL; - if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) - return -EINVAL; - - ret = io_validate_user_buf_range(area_reg->addr, area_reg->len); - if (ret) - return ret; - if (!area_reg->addr) - return -EFAULT; ret = -ENOMEM; area = kzalloc(sizeof(*area), GFP_KERNEL); if (!area) goto err; - area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, - &nr_pages); - if (IS_ERR(area->pages)) { - ret = PTR_ERR(area->pages); - area->pages = NULL; + ret = 
io_import_area(ifq, &area->mem, area_reg); + if (ret) goto err; - } - area->nr_folios = nr_iovs = nr_pages; + + nr_iovs = area->mem.size >> PAGE_SHIFT; area->nia.num_niovs = nr_iovs; + ret = -ENOMEM; area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]), GFP_KERNEL | __GFP_ZERO); if (!area->nia.niovs) diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index e3c7c4e647f1..9c22807af807 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -7,6 +7,13 @@ #include #include +struct io_zcrx_mem { + unsigned long size; + + struct page **pages; + unsigned long nr_folios; +}; + struct io_zcrx_area { struct net_iov_area nia; struct io_zcrx_ifq *ifq; @@ -14,13 +21,13 @@ struct io_zcrx_area { bool is_mapped; u16 area_id; - struct page **pages; - unsigned long nr_folios; /* freelist */ spinlock_t freelist_lock ____cacheline_aligned_in_smp; u32 free_count; u32 *freelist; + + struct io_zcrx_mem mem; }; struct io_zcrx_ifq { From 8a62804248fff77749048a0f5511649b2569bba9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 May 2025 13:17:17 +0100 Subject: [PATCH 25/59] io_uring/zcrx: split common area map/unmap parts Extract the area-type-dependent parts of io_zcrx_[un]map_area from the generic path. It'll be helpful once there are more area memory types and not only user pages. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/50f6e893e2d20f937e628196cbf528d15f81c289.1746097431.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 46 +++++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 8d4cfd957e38..34b09beba992 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -82,20 +82,29 @@ static int io_import_area(struct io_zcrx_ifq *ifq, return 0; } +static void io_zcrx_unmap_umem(struct io_zcrx_ifq *ifq, + struct io_zcrx_area *area, int nr_mapped) +{ + int i; + + for (i = 0; i < nr_mapped; i++) { + netmem_ref netmem = net_iov_to_netmem(&area->nia.niovs[i]); + dma_addr_t dma = page_pool_get_dma_addr_netmem(netmem); + + dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE, + DMA_FROM_DEVICE, IO_DMA_ATTR); + } +} + static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area, int nr_mapped) { int i; - for (i = 0; i < nr_mapped; i++) { - struct net_iov *niov = &area->nia.niovs[i]; - dma_addr_t dma; + io_zcrx_unmap_umem(ifq, area, nr_mapped); - dma = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov)); - dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE, - DMA_FROM_DEVICE, IO_DMA_ATTR); - net_mp_niov_set_dma_addr(niov, 0); - } + for (i = 0; i < area->nia.num_niovs; i++) + net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0); } static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) @@ -107,14 +116,10 @@ static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_are area->is_mapped = false; } -static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) +static int io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) { int i; - guard(mutex)(&ifq->dma_lock); - if (area->is_mapped) - return 0; - for (i = 0; i < area->nia.num_niovs; i++) { struct net_iov *niov = &area->nia.niovs[i]; dma_addr_t dma; @@ -129,9 +134,20 @@ static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) break; } } + return i; +} - if (i != area->nia.num_niovs) { - __io_zcrx_unmap_area(ifq, area, i); +static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) +{ + unsigned
nr; + + guard(mutex)(&ifq->dma_lock); + if (area->is_mapped) + return 0; + + nr = io_zcrx_map_area_umem(ifq, area); + if (nr != area->nia.num_niovs) { + __io_zcrx_unmap_area(ifq, area, nr); return -EINVAL; } From a5c98e9424573649e59988199a3356a79c9e1fd9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 May 2025 13:17:18 +0100 Subject: [PATCH 26/59] io_uring/zcrx: dmabuf backed zerocopy receive Add support for dmabuf backed zcrx areas. To use it, the user should pass IORING_ZCRX_AREA_DMABUF in the struct io_uring_zcrx_area_reg flags field and pass a dmabuf fd in the dmabuf_fd field. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/20bb1890e60a82ec945ab36370d1fd54be414ab6.1746097431.git.asml.silence@gmail.com Link: https://lore.kernel.org/io-uring/6e37db97303212bbd8955f9501cf99b579f8aece.1746547722.git.asml.silence@gmail.com [axboe: fold in fixup] Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 6 +- io_uring/zcrx.c | 165 ++++++++++++++++++++++++++++++---- io_uring/zcrx.h | 7 ++ 3 files changed, 160 insertions(+), 18 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 130f3bc71a69..5ce096090b0c 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -990,12 +990,16 @@ struct io_uring_zcrx_offsets { __u64 __resv[2]; }; +enum io_uring_zcrx_area_flags { + IORING_ZCRX_AREA_DMABUF = 1, +}; + struct io_uring_zcrx_area_reg { __u64 addr; __u64 len; __u64 rq_area_token; __u32 flags; - __u32 __resv1; + __u32 dmabuf_fd; __u64 __resv2[2]; }; diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 34b09beba992..9a568d049204 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -47,8 +47,135 @@ static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) return area->mem.pages[net_iov_idx(niov)]; } +static void io_release_dmabuf(struct io_zcrx_mem *mem) +{ + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return; + + if (mem->sgt) + dma_buf_unmap_attachment_unlocked(mem->attach, mem->sgt, + DMA_FROM_DEVICE); + if (mem->attach) + dma_buf_detach(mem->dmabuf, mem->attach); + if (mem->dmabuf) + dma_buf_put(mem->dmabuf); + + mem->sgt = NULL; + mem->attach = NULL; + mem->dmabuf = NULL; +} + +static int io_import_dmabuf(struct io_zcrx_ifq *ifq, + struct io_zcrx_mem *mem, + struct io_uring_zcrx_area_reg *area_reg) +{ + unsigned long off = (unsigned long)area_reg->addr; + unsigned long len = (unsigned long)area_reg->len; + unsigned long total_size = 0; + struct scatterlist *sg; + int dmabuf_fd = area_reg->dmabuf_fd; + int i, ret; + + if (WARN_ON_ONCE(!ifq->dev)) + return -EFAULT; + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return -EINVAL; + + mem->is_dmabuf = true; + mem->dmabuf = dma_buf_get(dmabuf_fd); + if (IS_ERR(mem->dmabuf)) { + ret = PTR_ERR(mem->dmabuf); + mem->dmabuf = NULL; + goto err; + } + + mem->attach = dma_buf_attach(mem->dmabuf, ifq->dev); + if (IS_ERR(mem->attach)) { + ret = PTR_ERR(mem->attach); + mem->attach = NULL; + goto err; + } + + mem->sgt = dma_buf_map_attachment_unlocked(mem->attach, DMA_FROM_DEVICE); + if (IS_ERR(mem->sgt)) { + ret = PTR_ERR(mem->sgt); + mem->sgt = NULL; + goto err; + } + + for_each_sgtable_dma_sg(mem->sgt, sg, i) + total_size += sg_dma_len(sg); + + if (total_size < off + len) + return -EINVAL; + + mem->dmabuf_offset = off; + mem->size = len; + return 0; +err: + io_release_dmabuf(mem); + return ret; +} + +static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) +{ + unsigned long off = area->mem.dmabuf_offset; + struct scatterlist 
*sg; + unsigned i, niov_idx = 0; + + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return -EINVAL; + + for_each_sgtable_dma_sg(area->mem.sgt, sg, i) { + dma_addr_t dma = sg_dma_address(sg); + unsigned long sg_len = sg_dma_len(sg); + unsigned long sg_off = min(sg_len, off); + + off -= sg_off; + sg_len -= sg_off; + dma += sg_off; + + while (sg_len && niov_idx < area->nia.num_niovs) { + struct net_iov *niov = &area->nia.niovs[niov_idx]; + + if (net_mp_niov_set_dma_addr(niov, dma)) + return 0; + sg_len -= PAGE_SIZE; + dma += PAGE_SIZE; + niov_idx++; + } + } + return niov_idx; +} + +static int io_import_umem(struct io_zcrx_ifq *ifq, + struct io_zcrx_mem *mem, + struct io_uring_zcrx_area_reg *area_reg) +{ + struct page **pages; + int nr_pages; + + if (area_reg->dmabuf_fd) + return -EINVAL; + if (!area_reg->addr) + return -EFAULT; + pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, + &nr_pages); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + mem->pages = pages; + mem->nr_folios = nr_pages; + mem->size = area_reg->len; + return 0; +} + static void io_release_area_mem(struct io_zcrx_mem *mem) { + if (mem->is_dmabuf) { + io_release_dmabuf(mem); + return; + } if (mem->pages) { unpin_user_pages(mem->pages, mem->nr_folios); kvfree(mem->pages); @@ -59,27 +186,17 @@ static int io_import_area(struct io_zcrx_ifq *ifq, struct io_zcrx_mem *mem, struct io_uring_zcrx_area_reg *area_reg) { - struct page **pages; - int nr_pages; int ret; ret = io_validate_user_buf_range(area_reg->addr, area_reg->len); if (ret) return ret; - if (!area_reg->addr) - return -EFAULT; if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) return -EINVAL; - pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, - &nr_pages); - if (IS_ERR(pages)) - return PTR_ERR(pages); - - mem->pages = pages; - mem->nr_folios = nr_pages; - mem->size = area_reg->len; - return 0; + if (area_reg->flags & IORING_ZCRX_AREA_DMABUF) + return io_import_dmabuf(ifq, mem, area_reg); + return io_import_umem(ifq, mem, area_reg); } static void io_zcrx_unmap_umem(struct io_zcrx_ifq *ifq, @@ -101,7 +218,10 @@ static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, { int i; - io_zcrx_unmap_umem(ifq, area, nr_mapped); + if (area->mem.is_dmabuf) + io_release_dmabuf(&area->mem); + else + io_zcrx_unmap_umem(ifq, area, nr_mapped); for (i = 0; i < area->nia.num_niovs; i++) net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0); @@ -145,7 +265,11 @@ static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) if (area->is_mapped) return 0; - nr = io_zcrx_map_area_umem(ifq, area); + if (area->mem.is_dmabuf) + nr = io_zcrx_map_area_dmabuf(ifq, area); + else + nr = io_zcrx_map_area_umem(ifq, area); + if (nr != area->nia.num_niovs) { __io_zcrx_unmap_area(ifq, area, nr); return -EINVAL; @@ -251,6 +375,8 @@ static void io_zcrx_free_area(struct io_zcrx_area *area) kfree(area); } +#define IO_ZCRX_AREA_SUPPORTED_FLAGS (IORING_ZCRX_AREA_DMABUF) + static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area **res, struct io_uring_zcrx_area_reg *area_reg) @@ -259,9 +385,11 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, unsigned nr_iovs; int i, ret; - if (area_reg->flags || area_reg->rq_area_token) + if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS) return -EINVAL; - if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1]) + if (area_reg->rq_area_token) + return -EINVAL; + if (area_reg->__resv2[0] || area_reg->__resv2[1]) return -EINVAL; ret = -ENOMEM; @@ -819,6 +947,9 @@ static ssize_t 
io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq, size_t copied = 0; int ret = 0; + if (area->mem.is_dmabuf) + return -EFAULT; + while (len) { size_t copy_size = min_t(size_t, PAGE_SIZE, len); const int dst_off = 0; diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 9c22807af807..2f5e26389f22 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -3,15 +3,22 @@ #define IOU_ZC_RX_H #include +#include #include #include #include struct io_zcrx_mem { unsigned long size; + bool is_dmabuf; struct page **pages; unsigned long nr_folios; + + struct dma_buf_attachment *attach; + struct dma_buf *dmabuf; + struct sg_table *sgt; + unsigned long dmabuf_offset; }; struct io_zcrx_area { From 78967aabf6138bd43798c966a75167579ce42955 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 6 May 2025 13:30:47 +0100 Subject: [PATCH 27/59] io_uring/timeout: don't export link t-out disarm helper [__]io_disarm_linked_timeout() are only used inside timeout.c, so confine them inside the file. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1eb200911255e643bf252a8e65fb2c787340cf18.1746533800.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/timeout.c | 11 ++++++++--- io_uring/timeout.h | 13 ------------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 2a107665230b..a6ff8c026b1f 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -35,6 +35,9 @@ struct io_timeout_rem { bool ltimeout; }; +static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, + struct io_kiocb *link); + static inline bool io_is_timeout_noseq(struct io_kiocb *req) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); @@ -218,7 +221,9 @@ void io_disarm_next(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; raw_spin_lock_irq(&ctx->timeout_lock); - link = io_disarm_linked_timeout(req); + if (req->link && req->link->opcode == IORING_OP_LINK_TIMEOUT) + link = __io_disarm_linked_timeout(req, req->link); + raw_spin_unlock_irq(&ctx->timeout_lock); if (link) io_req_queue_tw_complete(link, -ECANCELED); @@ -228,8 +233,8 @@ void io_disarm_next(struct io_kiocb *req) io_fail_links(req); } -struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, - struct io_kiocb *link) +static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, + struct io_kiocb *link) __must_hold(&req->ctx->completion_lock) __must_hold(&req->ctx->timeout_lock) { diff --git a/io_uring/timeout.h b/io_uring/timeout.h index e91b32448dcf..2b7c9ad72992 100644 --- a/io_uring/timeout.h +++ b/io_uring/timeout.h @@ -8,19 +8,6 @@ struct io_timeout_data { u32 flags; }; -struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, - struct io_kiocb *link); - -static inline struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) -{ - struct io_kiocb *link = req->link; - - if (link && link->opcode == IORING_OP_LINK_TIMEOUT) - return __io_disarm_linked_timeout(req, link); - - return NULL; -} - __cold void io_flush_timeouts(struct io_ring_ctx *ctx); struct io_cancel_data; int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd); From 9c2ff3f9b5e0202d1cc1f6193b1e96df203ae4a4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 6 May 2025 13:31:07 +0100 Subject: [PATCH 28/59] io_uring: remove io_preinit_req() Apart from setting ->ctx, io_preinit_req() zeroes a bunch of fields of a request, of which only ->file_node is mandatory. Remove the function and zero the entire request on first allocation.
With that, we also need to initialise ->ctx every time, which might be a good thing for performance as now we're likely overwriting the entire cache line, and so it can be write-combined and avoid RMW. Suggested-by: Jens Axboe Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ba5485dc913f1e275862ce88f5169d4ac4a33836.1746533807.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 21 ++------------------- io_uring/notif.c | 1 + 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 703251f6f4d8..3d20f3b63443 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -927,22 +927,6 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res) io_req_complete_defer(req); } -/* - * Don't initialise the fields below on every allocation, but do that in - * advance and keep them valid across allocations. - */ -static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) -{ - req->ctx = ctx; - req->buf_node = NULL; - req->file_node = NULL; - req->link = NULL; - req->async_data = NULL; - /* not necessary, but safer to zero */ - memset(&req->cqe, 0, sizeof(req->cqe)); - memset(&req->big_cqe, 0, sizeof(req->big_cqe)); -} - /* * A request might get retired back into the request caches even before opcode * handlers and io_issue_sqe() are done with it, e.g. inline completion path. @@ -952,7 +936,7 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { - gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO; void *reqs[IO_REQ_ALLOC_BATCH]; int ret; @@ -973,7 +957,6 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) while (ret--) { struct io_kiocb *req = reqs[ret]; - io_preinit_req(req, ctx); io_req_add_to_cache(req, ctx); } return true; @@ -2049,7 +2032,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, int personality; u8 opcode; - /* req is partially pre-initialised, see io_preinit_req() */ + req->ctx = ctx; req->opcode = opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ sqe_flags = READ_ONCE(sqe->flags); diff --git a/io_uring/notif.c b/io_uring/notif.c index 7bd92538dccb..9a6f6e92d742 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -112,6 +112,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) if (unlikely(!io_alloc_req(ctx, &notif))) return NULL; + notif->ctx = ctx; notif->opcode = IORING_OP_NOP; notif->flags = 0; notif->file = NULL; From 35adea1d018ab1e450ea2304e58dc2f987a639d3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 6 May 2025 13:31:16 +0100 Subject: [PATCH 29/59] io_uring: move io_req_put_rsrc_nodes() It'd be nice to hide details of how rsrc nodes are used by a request from rsrc.c, specifically which request fields store them, and which bits signify whether there is a node in a request. It rather belongs to generic request handling, so move the helper to io_uring.c. While doing so remove clearing of ->buf_node as it's controlled by REQ_F_BUF_NODE and doesn't require zeroing.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/bb73fb42baf825edb39344365aff48cdfdd4c692.1746533789.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 10 ++++++++++ io_uring/rsrc.h | 12 ------------ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3d20f3b63443..0d051476008c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1369,6 +1369,16 @@ void io_queue_next(struct io_kiocb *req) io_req_task_queue(nxt); } +static inline void io_req_put_rsrc_nodes(struct io_kiocb *req) +{ + if (req->file_node) { + io_put_rsrc_node(req->ctx, req->file_node); + req->file_node = NULL; + } + if (req->flags & REQ_F_BUF_NODE) + io_put_rsrc_node(req->ctx, req->buf_node); +} + static void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) __must_hold(&ctx->uring_lock) diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 2818aa0d0472..0d2138f16322 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -115,18 +115,6 @@ static inline bool io_reset_rsrc_node(struct io_ring_ctx *ctx, return true; } -static inline void io_req_put_rsrc_nodes(struct io_kiocb *req) -{ - if (req->file_node) { - io_put_rsrc_node(req->ctx, req->file_node); - req->file_node = NULL; - } - if (req->flags & REQ_F_BUF_NODE) { - io_put_rsrc_node(req->ctx, req->buf_node); - req->buf_node = NULL; - } -} - int io_files_update(struct io_kiocb *req, unsigned int issue_flags); int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); From 6ae4308116f1033ceb11b419c01e9c5f17a35633 Mon Sep 17 00:00:00 2001 From: Long Li Date: Fri, 9 May 2025 14:30:15 +0800 Subject: [PATCH 30/59] io_uring: update parameter name in io_pin_pages function declaration Rename first parameter in io_pin_pages from ubuf to uaddr for consistency between declaration and implementation. Signed-off-by: Long Li Link: https://lore.kernel.org/r/20250509063015.3799255-1-leo.lilong@huaweicloud.com Signed-off-by: Jens Axboe --- io_uring/memmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/memmap.h b/io_uring/memmap.h index 24afb298e974..08419684e4bc 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -6,7 +6,7 @@ #define IORING_OFF_ZCRX_SHIFT 16 -struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); +struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages); #ifndef CONFIG_MMU unsigned int io_uring_nommu_mmap_capabilities(struct file *file); From 28b8cd864da516ea87162015e490b1dea444888f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 9 May 2025 12:03:28 +0100 Subject: [PATCH 31/59] io_uring/net: move CONFIG_NET guards to Makefile Instruct Makefile to never try to compile net.c without CONFIG_NET and kill ifdefs in the file. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f466400e20c3f536191bfd559b1f3cd2a2ab5a1e.1746788579.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/Makefile | 4 ++-- io_uring/net.c | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/io_uring/Makefile b/io_uring/Makefile index 75e0ca795685..11a739927a62 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -7,7 +7,7 @@ GCOV_PROFILE := y endif obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ - tctx.o filetable.o rw.o net.o poll.o \ + tctx.o filetable.o rw.o poll.o \ eventfd.o uring_cmd.o openclose.o \ sqpoll.o xattr.o nop.o fs.o splice.o \ sync.o msg_ring.o advise.o openclose.o \ @@ -19,4 +19,4 @@ obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_EPOLL) += epoll.o obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o -obj-$(CONFIG_NET) += cmd_net.o +obj-$(CONFIG_NET) += net.o cmd_net.o diff --git a/io_uring/net.c b/io_uring/net.c index b3a643675ce8..1fbdb2bbb3f3 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -18,7 +18,6 @@ #include "rsrc.h" #include "zcrx.h" -#if defined(CONFIG_NET) struct io_shutdown { struct file *file; int how; @@ -1836,4 +1835,3 @@ void io_netmsg_cache_free(const void *entry) io_vec_free(&kmsg->vec); kfree(kmsg); } -#endif From 81a22c86ec7060be43404d4e6d68fca03dbafcf4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 9 May 2025 12:03:43 +0100 Subject: [PATCH 32/59] io_uring: add lockdep asserts to io_add_aux_cqe io_add_aux_cqe() can only be called for rings with uring_lock-protected completion queues; add a couple of assertions to that effect. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c010eab7b94a187c00a9d46d8b67bf7fcad18af4.1746788592.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0d051476008c..4034b1db7135 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -848,6 +848,9 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags */ void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { + lockdep_assert_held(&ctx->uring_lock); + lockdep_assert(ctx->lockless_cq); + if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) { spin_lock(&ctx->completion_lock); io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); From f979c20547e72568e3c793bc92c7522bc3166246 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 9 May 2025 12:12:47 +0100 Subject: [PATCH 33/59] io_uring: account drain memory to cgroup Account drain allocations against memcg. It's not a big problem as each such allocation is paired with a request, which is accounted, but it's nicer to follow the limits more closely.
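(For reference, GFP_KERNEL_ACCOUNT is defined as GFP_KERNEL | __GFP_ACCOUNT, i.e. the allocation semantics are unchanged and the allocation is merely charged against the allocating task's memory cgroup, so the only behavioural change in the diff below is the accounting.)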
Cc: stable@vger.kernel.org # 6.1 Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f8dfdbd755c41fd9c75d12b858af07dfba5bbb68.1746788718.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4034b1db7135..76b33762a28d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1675,7 +1675,7 @@ queue: spin_unlock(&ctx->completion_lock); io_prep_async_link(req); - de = kmalloc(sizeof(*de), GFP_KERNEL); + de = kmalloc(sizeof(*de), GFP_KERNEL_ACCOUNT); if (!de) { ret = -ENOMEM; io_req_defer_failed(req, ret); From fde04c7e2775feb0746301e0ef86a04d3598c3fe Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 9 May 2025 12:12:48 +0100 Subject: [PATCH 34/59] io_uring: fix spurious drain flushing io_queue_deferred() is not tolerant of spurious calls that don't complete some requests. You can have an inflight drain-marked request and another request that came after and got queued into the drain list. Now, if io_queue_deferred() is called before the first request completes, it'll check the 2nd req with req_need_defer(), find that there is no drain flag set, and queue it for execution. To make io_queue_deferred() work, it should at least check sequences for the first request, and then we also need to check if there is another drain request creating another bubble. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/972bde11b7d4ef25b3f5e3fd34f80e4d2aa345b8.1746788718.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 76b33762a28d..7491793c80ba 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -559,18 +559,30 @@ void io_req_queue_iowq(struct io_kiocb *req) io_req_task_work_add(req); } +static bool io_drain_defer_seq(struct io_kiocb *req, u32 seq) +{ + struct io_ring_ctx *ctx = req->ctx; + + return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; +} + static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) { + bool drain_seen = false, first = true; + spin_lock(&ctx->completion_lock); while (!list_empty(&ctx->defer_list)) { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); - if (req_need_defer(de->req, de->seq)) + drain_seen |= de->req->flags & REQ_F_IO_DRAIN; + if ((drain_seen || first) && io_drain_defer_seq(de->req, de->seq)) break; + list_del_init(&de->list); io_req_task_queue(de->req); kfree(de); + first = false; } spin_unlock(&ctx->completion_lock); } From 05b334110fdc85f536d7dd573120d573801fb2d1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 9 May 2025 12:12:49 +0100 Subject: [PATCH 35/59] io_uring: simplify drain ret passing "ret" in io_drain_req() is only used in one place; remove it and pass -ENOMEM directly.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ece724b77e66e6caabcc215e0032ee7ff140f289.1746788718.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7491793c80ba..6b606e9dffa3 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1672,7 +1672,6 @@ static __cold void io_drain_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct io_defer_entry *de; - int ret; u32 seq = io_get_sequence(req); /* Still need defer if there is pending req in defer list. */ @@ -1689,8 +1688,7 @@ queue: io_prep_async_link(req); de = kmalloc(sizeof(*de), GFP_KERNEL_ACCOUNT); if (!de) { - ret = -ENOMEM; - io_req_defer_failed(req, ret); + io_req_defer_failed(req, -ENOMEM); return; } From e91e4f692f7993d5d192228c5f8a9a2e12ff5250 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 9 May 2025 12:12:50 +0100 Subject: [PATCH 36/59] io_uring: remove drain prealloc checks Currently io_drain_req() has two steps. The first is a fast-path check of sequence numbers. The second is allocations, rechecking and actual queuing. Further simplify it by removing the first step. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4d06e89ed07611993d7bf89182de2300858379bd.1746788718.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 6b606e9dffa3..f83abdf8a056 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1674,17 +1674,6 @@ static __cold void io_drain_req(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; struct io_defer_entry *de; u32 seq = io_get_sequence(req); - /* Still need defer if there is pending req in defer list. */ - spin_lock(&ctx->completion_lock); - if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) { - spin_unlock(&ctx->completion_lock); -queue: - ctx->drain_active = false; - io_req_task_queue(req); - return; - } - spin_unlock(&ctx->completion_lock); - io_prep_async_link(req); de = kmalloc(sizeof(*de), GFP_KERNEL_ACCOUNT); if (!de) { @@ -1696,7 +1685,9 @@ queue: if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { spin_unlock(&ctx->completion_lock); kfree(de); - goto queue; + ctx->drain_active = false; + io_req_task_queue(req); + return; } trace_io_uring_defer(req); From 19a94da447f832ee614f8f5532d31c1c70061520 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 9 May 2025 12:12:51 +0100 Subject: [PATCH 37/59] io_uring: consolidate drain seq checking We check sequences when queuing drained requests as well as when flushing them. Instead, always queue and immediately try to flush, so that all seq handling can be kept contained in the flushing code.
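The resulting flow, sketched (simplified; the diff below has the real code):

	/* io_drain_req(): queue unconditionally, then flush */
	list_add_tail(&de->list, &ctx->defer_list);
	__io_queue_deferred(ctx);	/* runs whatever is ready now */

which leaves the flushing helper as the single place that inspects drain sequence state.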
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d4651f742e671af5b3216581e539ea5d31bc7125.1746788718.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 45 +++++++++++++++++---------------------------- 1 file changed, 17 insertions(+), 28 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f83abdf8a056..3d1f4b2e4536 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -389,17 +389,6 @@ static void io_account_cq_overflow(struct io_ring_ctx *ctx) ctx->cq_extra--; } -static bool req_need_defer(struct io_kiocb *req, u32 seq) -{ - if (unlikely(req->flags & REQ_F_IO_DRAIN)) { - struct io_ring_ctx *ctx = req->ctx; - - return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; - } - - return false; -} - static void io_clean_op(struct io_kiocb *req) { if (unlikely(req->flags & REQ_F_BUFFER_SELECTED)) @@ -566,11 +555,10 @@ static bool io_drain_defer_seq(struct io_kiocb *req, u32 seq) return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; } -static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) +static __cold noinline void __io_queue_deferred(struct io_ring_ctx *ctx) { bool drain_seen = false, first = true; - spin_lock(&ctx->completion_lock); while (!list_empty(&ctx->defer_list)) { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); @@ -584,7 +572,12 @@ static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) kfree(de); first = false; } - spin_unlock(&ctx->completion_lock); +} + +static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) +{ + guard(spinlock)(&ctx->completion_lock); + __io_queue_deferred(ctx); } void __io_commit_cqring_flush(struct io_ring_ctx *ctx) @@ -1671,30 +1664,26 @@ static __cold void io_drain_req(struct io_kiocb *req) __must_hold(&ctx->uring_lock) { struct io_ring_ctx *ctx = req->ctx; + bool drain = req->flags & IOSQE_IO_DRAIN; struct io_defer_entry *de; - u32 seq = io_get_sequence(req); - io_prep_async_link(req); de = kmalloc(sizeof(*de), GFP_KERNEL_ACCOUNT); if (!de) { io_req_defer_failed(req, -ENOMEM); return; } - spin_lock(&ctx->completion_lock); - if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { - spin_unlock(&ctx->completion_lock); - kfree(de); - ctx->drain_active = false; - io_req_task_queue(req); - return; - } - + io_prep_async_link(req); trace_io_uring_defer(req); de->req = req; - de->seq = seq; - list_add_tail(&de->list, &ctx->defer_list); - spin_unlock(&ctx->completion_lock); + de->seq = io_get_sequence(req); + + scoped_guard(spinlock, &ctx->completion_lock) { + list_add_tail(&de->list, &ctx->defer_list); + __io_queue_deferred(ctx); + if (!drain && list_empty(&ctx->defer_list)) + ctx->drain_active = false; + } } static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, From b0c8a6401fbca91da4fe0dc10d61a770f1581e45 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 9 May 2025 12:12:52 +0100 Subject: [PATCH 38/59] io_uring: open code io_account_cq_overflow() io_account_cq_overflow() doesn't help explain what's going on in there, and it'll become even smaller with the following patches, so open code it.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e4333fa0d371f519e52a71148ebdffed4b8d3aa9.1746788718.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3d1f4b2e4536..6efecb46c828 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -381,14 +381,6 @@ err: return NULL; } -static void io_account_cq_overflow(struct io_ring_ctx *ctx) -{ - struct io_rings *r = ctx->rings; - - WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); - ctx->cq_extra--; -} - static void io_clean_op(struct io_kiocb *req) { if (unlikely(req->flags & REQ_F_BUFFER_SELECTED)) @@ -742,12 +734,15 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT); trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); if (!ocqe) { + struct io_rings *r = ctx->rings; + /* * If we're in ring overflow flush mode, or in task cancel mode, * or cannot allocate an overflow entry, then we need to drop it * on the floor. */ - io_account_cq_overflow(ctx); + WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); + ctx->cq_extra--; set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); return false; } From 63de899cb6220357dea9d0f4e5aa459ff5193bb0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 9 May 2025 12:12:53 +0100 Subject: [PATCH 39/59] io_uring: count allocated requests Keep track of the number of requests a ring currently has allocated (and not freed); it'll be needed in the next patch. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c8f8308294dc2a1cb8925d984d937d4fc14ab5d4.1746788718.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + io_uring/io_uring.c | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 7e23e993280e..73b289b48280 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -435,6 +435,7 @@ struct io_ring_ctx { /* protected by ->completion_lock */ unsigned evfd_last_cq_tail; + unsigned nr_req_allocated; /* * Protection for resize vs mmap races - both the mmap and resize diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 6efecb46c828..714b66ab34b0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -957,6 +957,8 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) } percpu_ref_get_many(&ctx->refs, ret); + ctx->nr_req_allocated += ret; + while (ret--) { struct io_kiocb *req = reqs[ret]; @@ -2694,8 +2696,10 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) kmem_cache_free(req_cachep, req); nr++; } - if (nr) + if (nr) { + ctx->nr_req_allocated -= nr; percpu_ref_put_many(&ctx->refs, nr); + } mutex_unlock(&ctx->uring_lock); } @@ -2732,6 +2736,9 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) percpu_ref_exit(&ctx->refs); free_uid(ctx->user); io_req_caches_free(ctx); + + WARN_ON_ONCE(ctx->nr_req_allocated); + if (ctx->hash_map) io_wq_put_hash(ctx->hash_map); io_napi_free(ctx); From 8fb7aee05591fd4d3dca1460448a59e95fa821c3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 9 May 2025 12:12:54 +0100 Subject: [PATCH 40/59] io_uring: drain based on allocated reqs Don't rely on CQ sequence numbers for draining, as it has become messy and needs cq_extra adjustments.
Instead, base it on the number of allocated requests and only allow flushing when all requests are in the drain list. As a result, cq_extra is gone, no overhead for its accounting in aux cqe posting, less bloating as it was inlined before, and it's in general simpler than trying to track where we should bump it and where it should be put back like in cases of overflow. Also, it'll likely help with cleaning and unifying some of the CQ posting helpers. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/46ece1e34320b046c06fee2498d6b4cd12a700f2.1746788718.git.asml.silence@gmail.com Link: https://lore.kernel.org/r/24497b04b004bceada496033d3c9d09ff8e81ae9.1746944903.git.asml.silence@gmail.com [axboe: fold in fix from link2] Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/io_uring.c | 79 ++++++++++++++-------------------- io_uring/io_uring.h | 3 +- 3 files changed, 34 insertions(+), 50 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 73b289b48280..00dbd7cd0e7d 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -341,7 +341,6 @@ struct io_ring_ctx { unsigned cached_cq_tail; unsigned cq_entries; struct io_ev_fd __rcu *io_ev_fd; - unsigned cq_extra; void *cq_wait_arg; size_t cq_wait_size; @@ -417,6 +416,7 @@ struct io_ring_ctx { struct callback_head poll_wq_task_work; struct list_head defer_list; + unsigned nr_drained; struct io_alloc_cache msg_cache; spinlock_t msg_lock; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 714b66ab34b0..9a9b8d35349b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -129,7 +129,6 @@ struct io_defer_entry { struct list_head list; struct io_kiocb *req; - u32 seq; }; /* requests with any of those set should undergo io_disarm_next() */ @@ -149,6 +148,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, bool is_sqpoll_thread); static void io_queue_sqe(struct io_kiocb *req); +static void __io_req_caches_free(struct io_ring_ctx *ctx); static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray); @@ -540,46 +540,45 @@ void io_req_queue_iowq(struct io_kiocb *req) io_req_task_work_add(req); } -static bool io_drain_defer_seq(struct io_kiocb *req, u32 seq) +static unsigned io_linked_nr(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *tmp; + unsigned nr = 0; - return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; + io_for_each_link(tmp, req) + nr++; + return nr; } -static __cold noinline void __io_queue_deferred(struct io_ring_ctx *ctx) +static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) { bool drain_seen = false, first = true; + lockdep_assert_held(&ctx->uring_lock); + __io_req_caches_free(ctx); + while (!list_empty(&ctx->defer_list)) { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); drain_seen |= de->req->flags & REQ_F_IO_DRAIN; - if ((drain_seen || first) && io_drain_defer_seq(de->req, de->seq)) - break; + if ((drain_seen || first) && ctx->nr_req_allocated != ctx->nr_drained) + return; list_del_init(&de->list); + ctx->nr_drained -= io_linked_nr(de->req); io_req_task_queue(de->req); kfree(de); first = false; } } -static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) -{ - guard(spinlock)(&ctx->completion_lock); - __io_queue_deferred(ctx); -} - void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { if (ctx->poll_activated) io_poll_wq_wake(ctx); if (ctx->off_timeout_used) io_flush_timeouts(ctx); - if 
(ctx->drain_active) - io_queue_deferred(ctx); if (ctx->has_evfd) io_eventfd_signal(ctx, true); } @@ -742,7 +741,6 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, * on the floor. */ WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); - ctx->cq_extra--; set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); return false; } @@ -812,8 +810,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, { struct io_uring_cqe *cqe; - ctx->cq_extra++; - if (likely(io_get_cqe(ctx, &cqe))) { WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); @@ -1459,6 +1455,10 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); } + + if (unlikely(ctx->drain_active)) + io_queue_deferred(ctx); + ctx->submit_state.cq_flush = false; } @@ -1646,17 +1646,6 @@ io_req_flags_t io_file_get_flags(struct file *file) return res; } -static u32 io_get_sequence(struct io_kiocb *req) -{ - u32 seq = req->ctx->cached_sq_head; - struct io_kiocb *cur; - - /* need original cached_sq_head, but it was increased for each req */ - io_for_each_link(cur, req) - seq--; - return seq; -} - static __cold void io_drain_req(struct io_kiocb *req) __must_hold(&ctx->uring_lock) { @@ -1673,14 +1662,12 @@ static __cold void io_drain_req(struct io_kiocb *req) io_prep_async_link(req); trace_io_uring_defer(req); de->req = req; - de->seq = io_get_sequence(req); - scoped_guard(spinlock, &ctx->completion_lock) { - list_add_tail(&de->list, &ctx->defer_list); - __io_queue_deferred(ctx); - if (!drain && list_empty(&ctx->defer_list)) - ctx->drain_active = false; - } + ctx->nr_drained += io_linked_nr(req); + list_add_tail(&de->list, &ctx->defer_list); + io_queue_deferred(ctx); + if (!drain && list_empty(&ctx->defer_list)) + ctx->drain_active = false; } static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, @@ -2263,10 +2250,6 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) (!(ctx->flags & IORING_SETUP_NO_SQARRAY))) { head = READ_ONCE(ctx->sq_array[head]); if (unlikely(head >= ctx->sq_entries)) { - /* drop invalid entries */ - spin_lock(&ctx->completion_lock); - ctx->cq_extra--; - spin_unlock(&ctx->completion_lock); WRITE_ONCE(ctx->rings->sq_dropped, READ_ONCE(ctx->rings->sq_dropped) + 1); return false; @@ -2684,13 +2667,11 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries, return off; } -static void io_req_caches_free(struct io_ring_ctx *ctx) +static __cold void __io_req_caches_free(struct io_ring_ctx *ctx) { struct io_kiocb *req; int nr = 0; - mutex_lock(&ctx->uring_lock); - while (!io_req_cache_empty(ctx)) { req = io_extract_req(ctx); kmem_cache_free(req_cachep, req); @@ -2700,7 +2681,12 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) ctx->nr_req_allocated -= nr; percpu_ref_put_many(&ctx->refs, nr); } - mutex_unlock(&ctx->uring_lock); +} + +static __cold void io_req_caches_free(struct io_ring_ctx *ctx) +{ + guard(mutex)(&ctx->uring_lock); + __io_req_caches_free(ctx); } static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) @@ -3005,20 +2991,19 @@ static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, struct io_defer_entry *de; LIST_HEAD(list); - spin_lock(&ctx->completion_lock); list_for_each_entry_reverse(de, &ctx->defer_list, list) { if (io_match_task_safe(de->req, tctx, cancel_all)) { list_cut_position(&list, &ctx->defer_list, &de->list); break; } } - spin_unlock(&ctx->completion_lock); if 
(list_empty(&list)) return false; while (!list_empty(&list)) { de = list_first_entry(&list, struct io_defer_entry, list); list_del_init(&de->list); + ctx->nr_drained -= io_linked_nr(de->req); io_req_task_queue_fail(de->req, -ECANCELED); kfree(de); } @@ -3093,8 +3078,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && io_allowed_defer_tw_run(ctx)) ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0; - ret |= io_cancel_defer_files(ctx, tctx, cancel_all); mutex_lock(&ctx->uring_lock); + ret |= io_cancel_defer_files(ctx, tctx, cancel_all); ret |= io_poll_remove_all(ctx, tctx, cancel_all); ret |= io_waitid_remove_all(ctx, tctx, cancel_all); ret |= io_futex_remove_all(ctx, tctx, cancel_all); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index e4050b2d0821..81f22196a57d 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -196,7 +196,6 @@ static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx, { io_lockdep_assert_cq_locked(ctx); - ctx->cq_extra++; ctx->submit_state.cq_flush = true; return io_get_cqe(ctx, cqe_ret); } @@ -414,7 +413,7 @@ static inline void io_req_complete_defer(struct io_kiocb *req) static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) { - if (unlikely(ctx->off_timeout_used || ctx->drain_active || + if (unlikely(ctx->off_timeout_used || ctx->has_evfd || ctx->poll_activated)) __io_commit_cqring_flush(ctx); } From 475a8d30371604a6363da8e304a608a5959afc40 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 May 2025 18:26:46 +0100 Subject: [PATCH 41/59] io_uring/kbuf: account ring io_buffer_list memory Follow the non-ringed pbuf struct io_buffer_list allocations and account them against the memcg. There is a low chance of this being an actual problem, as a ring-provided buffer should either pin user memory or allocate it, which is already accounted. Cc: stable@vger.kernel.org # 6.1 Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3985218b50d341273cafff7234e1a7e6d0db9808.1747150490.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 1cf0d2c01287..446207db1edf 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -621,7 +621,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) io_destroy_bl(ctx, bl); } - free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); + free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); if (!bl) return -ENOMEM; From 1724849072854a66861d461b298b04612702d685 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 May 2025 18:26:47 +0100 Subject: [PATCH 42/59] io_uring/kbuf: use mem_is_zero() Make use of mem_is_zero() for reserved-field checking.
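As a reference for the pattern, a minimal userspace model of the helper and the converted check might look like the sketch below; the struct layout and names are illustrative stand-ins, not the kernel definitions.

/* Model of validating a reserved-field array with one call instead of
 * per-index comparisons; mem_is_zero_model() stands in for the kernel's
 * mem_is_zero(), which is built on memchr_inv(). */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct buf_reg_model {
        uint64_t ring_addr;
        uint32_t ring_entries;
        uint16_t bgid;
        uint16_t flags;
        uint64_t resv[3];       /* must be zero from userspace */
};

static bool mem_is_zero_model(const void *s, size_t n)
{
        const unsigned char *p = s;

        while (n--) {
                if (*p++)
                        return false;
        }
        return true;
}

static int validate_reg_model(const struct buf_reg_model *reg)
{
        if (!mem_is_zero_model(reg->resv, sizeof(reg->resv)))
                return -1;      /* would be -EINVAL in the kernel */
        return 0;
}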
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/11fe27b7a831329bcdb4ea087317ef123ba7c171.1747150490.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 446207db1edf..344517d1d921 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -602,8 +602,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; - - if (reg.resv[0] || reg.resv[1] || reg.resv[2]) + if (!mem_is_zero(reg.resv, sizeof(reg.resv))) return -EINVAL; if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC)) return -EINVAL; @@ -679,9 +678,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; - if (reg.resv[0] || reg.resv[1] || reg.resv[2]) - return -EINVAL; - if (reg.flags) + if (!mem_is_zero(reg.resv, sizeof(reg.resv)) || reg.flags) return -EINVAL; bl = io_buffer_get_list(ctx, reg.bgid); @@ -701,14 +698,11 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_buf_status buf_status; struct io_buffer_list *bl; - int i; if (copy_from_user(&buf_status, arg, sizeof(buf_status))) return -EFAULT; - - for (i = 0; i < ARRAY_SIZE(buf_status.resv); i++) - if (buf_status.resv[i]) - return -EINVAL; + if (!mem_is_zero(buf_status.resv, sizeof(buf_status.resv))) + return -EINVAL; bl = io_buffer_get_list(ctx, buf_status.buf_group); if (!bl) From 4e9fda29d66b06caf5c81b8acbe0a504effc73fb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 May 2025 18:26:48 +0100 Subject: [PATCH 43/59] io_uring/kbuf: drop extra vars in io_register_pbuf_ring bl and free_bl variables in io_register_pbuf_ring() always point to the same list since we started to reallocate the pre-existent list. Drop free_bl. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d45c3342d74c9030f99376c777a4b3d59089074d.1747150490.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 344517d1d921..406e8a9b42c3 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -591,7 +591,7 @@ err: int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_buf_reg reg; - struct io_buffer_list *bl, *free_bl = NULL; + struct io_buffer_list *bl; struct io_uring_region_desc rd; struct io_uring_buf_ring *br; unsigned long mmap_offset; @@ -620,7 +620,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) io_destroy_bl(ctx, bl); } - free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); + bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); if (!bl) return -ENOMEM; @@ -665,7 +665,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return 0; fail: io_free_region(ctx, &bl->region); - kfree(free_bl); + kfree(bl); return ret; } From 52a05d0cf8f3b4569c525153132a90661c32fe11 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 May 2025 18:26:49 +0100 Subject: [PATCH 44/59] io_uring/kbuf: don't compute size twice on prep The size in prep is calculated by io_provide_buffers_prep(), so remove the recomputation a few lines after. 
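For context, the prep helper validates that single computation with overflow checks up front; a simplified userspace sketch of the pattern (hypothetical names, with __builtin_*_overflow standing in for the kernel's check_mul_overflow()/check_add_overflow()):

/* Sketch: compute size = len * nbufs exactly once, rejecting overflow,
 * then reuse it for the address-range check. */
#include <errno.h>
#include <stdint.h>

static int validate_bufs_model(uint32_t len, uint32_t nbufs, uint64_t addr)
{
        unsigned long size, end;

        if (__builtin_mul_overflow((unsigned long)len, nbufs, &size))
                return -EOVERFLOW;
        if (__builtin_add_overflow((unsigned long)addr, size, &end))
                return -EOVERFLOW;
        /* size is reused for later checks; no second multiplication */
        return 0;
}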
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7c97206561b74fce245cb22449c6082d2e066844.1747150490.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 406e8a9b42c3..eb666c02f488 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -509,8 +509,6 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return -EOVERFLOW; if (check_add_overflow((unsigned long)p->addr, size, &tmp_check)) return -EOVERFLOW; - - size = (unsigned long)p->len * p->nbufs; if (!access_ok(u64_to_user_ptr(p->addr), size)) return -EFAULT; From c724e801239ffc3714afe65cf6e721ddd04199d0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 May 2025 18:26:50 +0100 Subject: [PATCH 45/59] io_uring/kbuf: refactor __io_remove_buffers __io_remove_buffers is used for two purposes: the first is removing buffers for non-ring-based lists, which implies that it can be called multiple times for the same list; the second is destroying lists, which is not perfectly reentrant for ring-based lists. It's confusing, so just have a helper for legacy pbuf buffer removal, make sure it's not called for ring pbufs, and open code all ring pbuf destruction into io_put_bl(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0ae416b099d311ad23f285cea02f2c94c8ae9a6c.1747150490.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index eb666c02f488..df8aeb42e910 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -376,45 +376,33 @@ unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs) return ret; } -static int __io_remove_buffers(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned nbufs) +static int io_remove_buffers_legacy(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, + unsigned long nbufs) { - unsigned i = 0; - - /* shouldn't happen */ - if (!nbufs) - return 0; - - if (bl->flags & IOBL_BUF_RING) { - i = bl->buf_ring->tail - bl->head; - io_free_region(ctx, &bl->region); - /* make sure it's seen as empty */ - INIT_LIST_HEAD(&bl->buf_list); - bl->flags &= ~IOBL_BUF_RING; - return i; - } + unsigned long i = 0; + struct io_buffer *nxt; /* protects io_buffers_cache */ lockdep_assert_held(&ctx->uring_lock); + WARN_ON_ONCE(bl->flags & IOBL_BUF_RING); - while (!list_empty(&bl->buf_list)) { - struct io_buffer *nxt; - + for (i = 0; i < nbufs && !list_empty(&bl->buf_list); i++) { nxt = list_first_entry(&bl->buf_list, struct io_buffer, list); list_del(&nxt->list); kfree(nxt); - - if (++i == nbufs) - return i; cond_resched(); } - return i; } static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) { - __io_remove_buffers(ctx, bl, -1U); + if (bl->flags & IOBL_BUF_RING) + io_free_region(ctx, &bl->region); + else + io_remove_buffers_legacy(ctx, bl, -1U); + kfree(bl); } @@ -477,7 +465,7 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) ret = -EINVAL; /* can't use provide/remove buffers command on mapped buffers */ if (!(bl->flags & IOBL_BUF_RING)) - ret = __io_remove_buffers(ctx, bl, p->nbufs); + ret = io_remove_buffers_legacy(ctx, bl, p->nbufs); } io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) From 2b61bb1d9aa601ec393054a61be0a707a5bea928 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 May 2025 18:26:51 +0100 Subject: [PATCH 46/59]
io_uring/kbuf: unify legacy buf provision and removal Combine IORING_OP_PROVIDE_BUFFERS and IORING_OP_REMOVE_BUFFERS ->issue(), so that we can deduplicate ring locking and list lookups. This way we further reduce code for legacy provided buffers. Locking is also separated from buffer related handling, which makes it a bit simpler with label jumps. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f61af131622ad4337c2fb9f7c453d5b0102c7b90.1747150490.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 79 +++++++++++++++++++----------------------------- io_uring/kbuf.h | 4 +-- io_uring/opdef.c | 4 +-- 3 files changed, 34 insertions(+), 53 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index df8aeb42e910..823e7eb15fb2 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -450,30 +450,6 @@ int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); - - ret = -ENOENT; - bl = io_buffer_get_list(ctx, p->bgid); - if (bl) { - ret = -EINVAL; - /* can't use provide/remove buffers command on mapped buffers */ - if (!(bl->flags & IOBL_BUF_RING)) - ret = io_remove_buffers_legacy(ctx, bl, p->nbufs); - } - io_ring_submit_unlock(ctx, issue_flags); - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { unsigned long size, tmp_check; @@ -535,37 +511,44 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, return i ? 
0 : -ENOMEM; } -int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) +static int __io_manage_buffers_legacy(struct io_kiocb *req, + struct io_buffer_list *bl) +{ + struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); + int ret; + + if (!bl) { + if (req->opcode != IORING_OP_PROVIDE_BUFFERS) + return -ENOENT; + bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); + if (!bl) + return -ENOMEM; + + INIT_LIST_HEAD(&bl->buf_list); + ret = io_buffer_add_list(req->ctx, bl, p->bgid); + if (ret) { + kfree(bl); + return ret; + } + } + /* can't use provide/remove buffers command on mapped buffers */ + if (bl->flags & IOBL_BUF_RING) + return -EINVAL; + if (req->opcode == IORING_OP_PROVIDE_BUFFERS) + return io_add_buffers(req->ctx, p, bl); + return io_remove_buffers_legacy(req->ctx, bl, p->nbufs); +} + +int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags) { struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; - int ret = 0; + int ret; io_ring_submit_lock(ctx, issue_flags); - bl = io_buffer_get_list(ctx, p->bgid); - if (unlikely(!bl)) { - bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); - if (!bl) { - ret = -ENOMEM; - goto err; - } - INIT_LIST_HEAD(&bl->buf_list); - ret = io_buffer_add_list(ctx, bl, p->bgid); - if (ret) { - kfree(bl); - goto err; - } - } - /* can't add buffers via this command for a mapped buffer ring */ - if (bl->flags & IOBL_BUF_RING) { - ret = -EINVAL; - goto err; - } - - ret = io_add_buffers(ctx, p, bl); -err: + ret = __io_manage_buffers_legacy(req, bl); io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 0798a732e6cb..4d2c209d1a41 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -66,10 +66,8 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg); void io_destroy_buffers(struct io_ring_ctx *ctx); int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags); - int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags); +int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags); int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index db36433c2294..6e0882b051f9 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -333,13 +333,13 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .iopoll = 1, .prep = io_provide_buffers_prep, - .issue = io_provide_buffers, + .issue = io_manage_buffers_legacy, }, [IORING_OP_REMOVE_BUFFERS] = { .audit_skip = 1, .iopoll = 1, .prep = io_remove_buffers_prep, - .issue = io_remove_buffers, + .issue = io_manage_buffers_legacy, }, [IORING_OP_TEE] = { .needs_file = 1, From 9a109266278f200ae0b64508273fea3db8af7a9e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 May 2025 12:10:12 -0600 Subject: [PATCH 47/59] io_uring/fdinfo: only compile if CONFIG_PROC_FS is set Rather than wrap fdinfo.c in one big if, handle it on the Makefile side instead. io_uring.c already conditionally sets fops->fdinfo() anyway. 
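For reference, the conditional hookup on the io_uring.c side follows the usual pattern; roughly (an illustrative sketch, not the exact io_uring code):

/* Sketch: the fdinfo callback is only wired into the fops when procfs
 * is available, which is what lets fdinfo.o drop out of the build. */
static const struct file_operations uring_fops_sketch = {
        /* ... other entries ... */
#ifdef CONFIG_PROC_FS
        .show_fdinfo    = io_uring_show_fdinfo,
#endif
};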
Signed-off-by: Jens Axboe --- io_uring/Makefile | 3 ++- io_uring/fdinfo.c | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/io_uring/Makefile b/io_uring/Makefile index 11a739927a62..d97c6b51d584 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -11,7 +11,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ eventfd.o uring_cmd.o openclose.o \ sqpoll.o xattr.o nop.o fs.o splice.o \ sync.o msg_ring.o advise.o openclose.o \ - statx.o timeout.o fdinfo.o cancel.o \ + statx.o timeout.o cancel.o \ waitid.o register.o truncate.o \ memmap.o alloc_cache.o obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o @@ -20,3 +20,4 @@ obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_EPOLL) += epoll.o obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o obj-$(CONFIG_NET) += net.o cmd_net.o +obj-$(CONFIG_PROC_FS) += fdinfo.o diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index e0d6a59a89fa..b83296eee5f8 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -15,7 +15,6 @@ #include "cancel.h" #include "rsrc.h" -#ifdef CONFIG_PROC_FS static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, const struct cred *cred) { @@ -264,4 +263,3 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) mutex_unlock(&ctx->uring_lock); } } -#endif From 16256648cd0877aed9ede41d5d4ad3c1d65d9b2b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 May 2025 12:33:28 -0600 Subject: [PATCH 48/59] io_uring/fdinfo: get rid of dumping credentials It's a fairly obscure feature, and registered credentials would mostly be a static thing for it. Don't bother including code to dump the personality indices. Signed-off-by: Jens Axboe --- io_uring/fdinfo.c | 38 -------------------------------------- 1 file changed, 38 deletions(-) diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index b83296eee5f8..e9355276ab5d 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -15,36 +15,6 @@ #include "cancel.h" #include "rsrc.h" -static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, - const struct cred *cred) -{ - struct user_namespace *uns = seq_user_ns(m); - struct group_info *gi; - kernel_cap_t cap; - int g; - - seq_printf(m, "%5d\n", id); - seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid)); - seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid)); - seq_puts(m, "\n\tGroups:\t"); - gi = cred->group_info; - for (g = 0; g < gi->ngroups; g++) { - seq_put_decimal_ull(m, g ? 
" " : "", - from_kgid_munged(uns, gi->gid[g])); - } - seq_puts(m, "\n\tCapEff:\t"); - cap = cred->cap_effective; - seq_put_hex_ll(m, NULL, cap.val, 16); - seq_putc(m, '\n'); - return 0; -} - #ifdef CONFIG_NET_RX_BUSY_POLL static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m, @@ -213,14 +183,6 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) else seq_printf(m, "%5u: \n", i); } - if (!xa_empty(&ctx->personalities)) { - unsigned long index; - const struct cred *cred; - - seq_printf(m, "Personalities:\n"); - xa_for_each(&ctx->personalities, index, cred) - io_uring_show_cred(m, index, cred); - } seq_puts(m, "PollList:\n"); for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) { From 5288b9e28f8a6f464746ddabcf9bf49d1323acfc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 14 May 2025 09:07:20 +0100 Subject: [PATCH 49/59] io_uring: open code io_req_cqe_overflow() A preparation patch, just open code io_req_cqe_overflow(). Signed-off-by: Pavel Begunkov Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 43c285cd2294..e4d6e572eabc 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -739,14 +739,6 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, return true; } -static void io_req_cqe_overflow(struct io_kiocb *req) -{ - io_cqring_event_overflow(req->ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - req->big_cqe.extra1, req->big_cqe.extra2); - memset(&req->big_cqe, 0, sizeof(req->big_cqe)); -} - /* * writes to the cq entry need to come after reading head; the * control dependency is enough as we're using WRITE_ONCE to @@ -1435,11 +1427,19 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) unlikely(!io_fill_cqe_req(ctx, req))) { if (ctx->lockless_cq) { spin_lock(&ctx->completion_lock); - io_req_cqe_overflow(req); + io_cqring_event_overflow(req->ctx, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + req->big_cqe.extra1, + req->big_cqe.extra2); spin_unlock(&ctx->completion_lock); } else { - io_req_cqe_overflow(req); + io_cqring_event_overflow(req->ctx, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + req->big_cqe.extra1, + req->big_cqe.extra2); } + + memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } } __io_cq_unlock_post(ctx); From 10f466abc404443cb72ab3384f297345ac7415e1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 May 2025 09:39:14 -0600 Subject: [PATCH 50/59] io_uring: split alloc and add of overflow Add a new helper, io_alloc_ocqe(), that simply allocates and fills an overflow entry. Then it can get done outside of the locking section, and hence use more appropriate gfp_t allocation flags rather than always default to GFP_ATOMIC. 
Inspired by a previous series from Pavel: https://lore.kernel.org/io-uring/cover.1747209332.git.asml.silence@gmail.com/ Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 74 +++++++++++++++++++++++++++------------------ 1 file changed, 45 insertions(+), 29 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e4d6e572eabc..c2f9610fa891 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -697,20 +697,11 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task) } } -static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags, u64 extra1, u64 extra2) +static bool io_cqring_add_overflow(struct io_ring_ctx *ctx, + struct io_overflow_cqe *ocqe) { - struct io_overflow_cqe *ocqe; - size_t ocq_size = sizeof(struct io_overflow_cqe); - bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); - lockdep_assert_held(&ctx->completion_lock); - if (is_cqe32) - ocq_size += sizeof(struct io_uring_cqe); - - ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT); - trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); if (!ocqe) { struct io_rings *r = ctx->rings; @@ -728,17 +719,35 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } - ocqe->cqe.user_data = user_data; - ocqe->cqe.res = res; - ocqe->cqe.flags = cflags; - if (is_cqe32) { - ocqe->cqe.big_cqe[0] = extra1; - ocqe->cqe.big_cqe[1] = extra2; - } list_add_tail(&ocqe->list, &ctx->cq_overflow_list); return true; } +static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, + u64 user_data, s32 res, u32 cflags, + u64 extra1, u64 extra2, gfp_t gfp) +{ + struct io_overflow_cqe *ocqe; + size_t ocq_size = sizeof(struct io_overflow_cqe); + bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); + + if (is_cqe32) + ocq_size += sizeof(struct io_uring_cqe); + + ocqe = kmalloc(ocq_size, gfp | __GFP_ACCOUNT); + trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); + if (ocqe) { + ocqe->cqe.user_data = user_data; + ocqe->cqe.res = res; + ocqe->cqe.flags = cflags; + if (is_cqe32) { + ocqe->cqe.big_cqe[0] = extra1; + ocqe->cqe.big_cqe[1] = extra2; + } + } + return ocqe; +} + /* * writes to the cq entry need to come after reading head; the * control dependency is enough as we're using WRITE_ONCE to @@ -803,8 +812,12 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags io_cq_lock(ctx); filled = io_fill_cqe_aux(ctx, user_data, res, cflags); - if (!filled) - filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + if (unlikely(!filled)) { + struct io_overflow_cqe *ocqe; + + ocqe = io_alloc_ocqe(ctx, user_data, res, cflags, 0, 0, GFP_ATOMIC); + filled = io_cqring_add_overflow(ctx, ocqe); + } io_cq_unlock_post(ctx); return filled; } @@ -819,8 +832,11 @@ void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) lockdep_assert(ctx->lockless_cq); if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) { + struct io_overflow_cqe *ocqe; + + ocqe = io_alloc_ocqe(ctx, user_data, res, cflags, 0, 0, GFP_KERNEL); spin_lock(&ctx->completion_lock); - io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + io_cqring_add_overflow(ctx, ocqe); spin_unlock(&ctx->completion_lock); } ctx->submit_state.cq_flush = true; @@ -1425,18 +1441,18 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) */ if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) && unlikely(!io_fill_cqe_req(ctx, req))) { + gfp_t 
gfp = ctx->lockless_cq ? GFP_KERNEL : GFP_ATOMIC; + struct io_overflow_cqe *ocqe; + + ocqe = io_alloc_ocqe(ctx, req->cqe.user_data, req->cqe.res, + req->cqe.flags, req->big_cqe.extra1, + req->big_cqe.extra2, gfp); if (ctx->lockless_cq) { spin_lock(&ctx->completion_lock); - io_cqring_event_overflow(req->ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - req->big_cqe.extra1, - req->big_cqe.extra2); + io_cqring_add_overflow(ctx, ocqe); spin_unlock(&ctx->completion_lock); } else { - io_cqring_event_overflow(req->ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - req->big_cqe.extra1, - req->big_cqe.extra2); + io_cqring_add_overflow(ctx, ocqe); } memset(&req->big_cqe, 0, sizeof(req->big_cqe)); From 072d37b52c914271319b9f7e596ff3cba02e249c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 May 2025 09:45:24 -0600 Subject: [PATCH 51/59] io_uring: make io_alloc_ocqe() take a struct io_cqe pointer The number of arguments to io_alloc_ocqe() is a bit unwieldy. Make it take a struct io_cqe pointer rather than three separate CQE args. One path already has that readily available, add an io_init_cqe() helper for the remaining two. Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c2f9610fa891..02d597716467 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -724,8 +724,8 @@ static bool io_cqring_add_overflow(struct io_ring_ctx *ctx, } static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, - u64 user_data, s32 res, u32 cflags, - u64 extra1, u64 extra2, gfp_t gfp) + struct io_cqe *cqe, u64 extra1, + u64 extra2, gfp_t gfp) { struct io_overflow_cqe *ocqe; size_t ocq_size = sizeof(struct io_overflow_cqe); @@ -735,11 +735,11 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, ocq_size += sizeof(struct io_uring_cqe); ocqe = kmalloc(ocq_size, gfp | __GFP_ACCOUNT); - trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); + trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe); if (ocqe) { - ocqe->cqe.user_data = user_data; - ocqe->cqe.res = res; - ocqe->cqe.flags = cflags; + ocqe->cqe.user_data = cqe->user_data; + ocqe->cqe.res = cqe->res; + ocqe->cqe.flags = cqe->flags; if (is_cqe32) { ocqe->cqe.big_cqe[0] = extra1; ocqe->cqe.big_cqe[1] = extra2; @@ -806,6 +806,11 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, return false; } +static inline struct io_cqe io_init_cqe(u64 user_data, s32 res, u32 cflags) +{ + return (struct io_cqe) { .user_data = user_data, .res = res, .flags = cflags }; +} + bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { bool filled; @@ -814,8 +819,9 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags filled = io_fill_cqe_aux(ctx, user_data, res, cflags); if (unlikely(!filled)) { struct io_overflow_cqe *ocqe; + struct io_cqe cqe = io_init_cqe(user_data, res, cflags); - ocqe = io_alloc_ocqe(ctx, user_data, res, cflags, 0, 0, GFP_ATOMIC); + ocqe = io_alloc_ocqe(ctx, &cqe, 0, 0, GFP_ATOMIC); filled = io_cqring_add_overflow(ctx, ocqe); } io_cq_unlock_post(ctx); @@ -833,8 +839,9 @@ void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) { struct io_overflow_cqe *ocqe; + struct io_cqe cqe = io_init_cqe(user_data, res, cflags); - ocqe = io_alloc_ocqe(ctx, 
user_data, res, cflags, 0, 0, GFP_KERNEL); + ocqe = io_alloc_ocqe(ctx, &cqe, 0, 0, GFP_KERNEL); spin_lock(&ctx->completion_lock); io_cqring_add_overflow(ctx, ocqe); spin_unlock(&ctx->completion_lock); @@ -1444,8 +1451,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) gfp_t gfp = ctx->lockless_cq ? GFP_KERNEL : GFP_ATOMIC; struct io_overflow_cqe *ocqe; - ocqe = io_alloc_ocqe(ctx, req->cqe.user_data, req->cqe.res, - req->cqe.flags, req->big_cqe.extra1, + ocqe = io_alloc_ocqe(ctx, &req->cqe, req->big_cqe.extra1, req->big_cqe.extra2, gfp); if (ctx->lockless_cq) { spin_lock(&ctx->completion_lock); From c80bdb1c55719cd6308d648a7920272a3be09e34 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 May 2025 13:16:44 -0600 Subject: [PATCH 52/59] io_uring: pass in struct io_big_cqe to io_alloc_ocqe() Rather than pass extra1/extra2 separately, just pass in the (now) named io_big_cqe struct instead. The callers that don't use/support CQE32 will now just pass a single NULL, rather than two separate mystery zero values. Move the clearing of the big_cqe elements into io_alloc_ocqe() as well, so it can get moved out of the generic code. Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/io_uring.c | 23 +++++++++++------------ 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 00dbd7cd0e7d..2922635986f5 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -710,7 +710,7 @@ struct io_kiocb { const struct cred *creds; struct io_wq_work work; - struct { + struct io_big_cqe { u64 extra1; u64 extra2; } big_cqe; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 02d597716467..4081ffd890af 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -724,8 +724,8 @@ static bool io_cqring_add_overflow(struct io_ring_ctx *ctx, } static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, - struct io_cqe *cqe, u64 extra1, - u64 extra2, gfp_t gfp) + struct io_cqe *cqe, + struct io_big_cqe *big_cqe, gfp_t gfp) { struct io_overflow_cqe *ocqe; size_t ocq_size = sizeof(struct io_overflow_cqe); @@ -734,17 +734,19 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, if (is_cqe32) ocq_size += sizeof(struct io_uring_cqe); - ocqe = kmalloc(ocq_size, gfp | __GFP_ACCOUNT); + ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT); trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe); if (ocqe) { ocqe->cqe.user_data = cqe->user_data; ocqe->cqe.res = cqe->res; ocqe->cqe.flags = cqe->flags; - if (is_cqe32) { - ocqe->cqe.big_cqe[0] = extra1; - ocqe->cqe.big_cqe[1] = extra2; + if (is_cqe32 && big_cqe) { + ocqe->cqe.big_cqe[0] = big_cqe->extra1; + ocqe->cqe.big_cqe[1] = big_cqe->extra2; } } + if (big_cqe) + big_cqe->extra1 = big_cqe->extra2 = 0; return ocqe; } @@ -821,7 +823,7 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags struct io_overflow_cqe *ocqe; struct io_cqe cqe = io_init_cqe(user_data, res, cflags); - ocqe = io_alloc_ocqe(ctx, &cqe, 0, 0, GFP_ATOMIC); + ocqe = io_alloc_ocqe(ctx, &cqe, NULL, GFP_ATOMIC); filled = io_cqring_add_overflow(ctx, ocqe); } io_cq_unlock_post(ctx); @@ -841,7 +843,7 @@ void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) struct io_overflow_cqe *ocqe; struct io_cqe cqe = io_init_cqe(user_data, res, cflags); - ocqe = io_alloc_ocqe(ctx, &cqe, 0, 0, GFP_KERNEL); + ocqe = io_alloc_ocqe(ctx, &cqe, NULL, 
GFP_KERNEL); spin_lock(&ctx->completion_lock); io_cqring_add_overflow(ctx, ocqe); spin_unlock(&ctx->completion_lock); @@ -1451,8 +1453,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) gfp_t gfp = ctx->lockless_cq ? GFP_KERNEL : GFP_ATOMIC; struct io_overflow_cqe *ocqe; - ocqe = io_alloc_ocqe(ctx, &req->cqe, req->big_cqe.extra1, - req->big_cqe.extra2, gfp); + ocqe = io_alloc_ocqe(ctx, &req->cqe, &req->big_cqe, gfp); if (ctx->lockless_cq) { spin_lock(&ctx->completion_lock); io_cqring_add_overflow(ctx, ocqe); @@ -1460,8 +1461,6 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) } else { io_cqring_add_overflow(ctx, ocqe); } - - memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } } __io_cq_unlock_post(ctx); From f660fd2ca15a3743f65f6110ae60d5b80500d856 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 May 2025 13:39:32 -0600 Subject: [PATCH 53/59] io_uring: add new helpers for posting overflows Add two helpers, one for posting overflows for lockless_cq rings, and one for non-lockless_cq rings. The former can allocate sanely with GFP_KERNEL, but needs to grab the completion lock for posting, while the latter must do non-sleeping allocs as it already holds the completion lock. While at it, mark the overflow handling functions as __cold as well, as they should not generally be called during normal operations of the ring. Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 50 ++++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4081ffd890af..3c4a9561941f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -697,8 +697,8 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task) } } -static bool io_cqring_add_overflow(struct io_ring_ctx *ctx, - struct io_overflow_cqe *ocqe) +static __cold bool io_cqring_add_overflow(struct io_ring_ctx *ctx, + struct io_overflow_cqe *ocqe) { lockdep_assert_held(&ctx->completion_lock); @@ -813,6 +813,27 @@ static inline struct io_cqe io_init_cqe(u64 user_data, s32 res, u32 cflags) return (struct io_cqe) { .user_data = user_data, .res = res, .flags = cflags }; } +static __cold void io_cqe_overflow(struct io_ring_ctx *ctx, struct io_cqe *cqe, + struct io_big_cqe *big_cqe) +{ + struct io_overflow_cqe *ocqe; + + ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_KERNEL); + spin_lock(&ctx->completion_lock); + io_cqring_add_overflow(ctx, ocqe); + spin_unlock(&ctx->completion_lock); +} + +static __cold bool io_cqe_overflow_locked(struct io_ring_ctx *ctx, + struct io_cqe *cqe, + struct io_big_cqe *big_cqe) +{ + struct io_overflow_cqe *ocqe; + + ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_ATOMIC); + return io_cqring_add_overflow(ctx, ocqe); +} + bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { bool filled; @@ -820,11 +841,9 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags io_cq_lock(ctx); filled = io_fill_cqe_aux(ctx, user_data, res, cflags); if (unlikely(!filled)) { - struct io_overflow_cqe *ocqe; struct io_cqe cqe = io_init_cqe(user_data, res, cflags); - ocqe = io_alloc_ocqe(ctx, &cqe, NULL, GFP_ATOMIC); - filled = io_cqring_add_overflow(ctx, ocqe); + filled = io_cqe_overflow_locked(ctx, &cqe, NULL); } io_cq_unlock_post(ctx); return filled; @@ -840,13 +859,9 @@ void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) lockdep_assert(ctx->lockless_cq); if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) { 
- struct io_overflow_cqe *ocqe; struct io_cqe cqe = io_init_cqe(user_data, res, cflags); - ocqe = io_alloc_ocqe(ctx, &cqe, NULL, GFP_KERNEL); - spin_lock(&ctx->completion_lock); - io_cqring_add_overflow(ctx, ocqe); - spin_unlock(&ctx->completion_lock); + io_cqe_overflow(ctx, &cqe, NULL); } ctx->submit_state.cq_flush = true; } @@ -1450,17 +1465,10 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) */ if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) && unlikely(!io_fill_cqe_req(ctx, req))) { - gfp_t gfp = ctx->lockless_cq ? GFP_KERNEL : GFP_ATOMIC; - struct io_overflow_cqe *ocqe; - - ocqe = io_alloc_ocqe(ctx, &req->cqe, &req->big_cqe, gfp); - if (ctx->lockless_cq) { - spin_lock(&ctx->completion_lock); - io_cqring_add_overflow(ctx, ocqe); - spin_unlock(&ctx->completion_lock); - } else { - io_cqring_add_overflow(ctx, ocqe); - } + if (ctx->lockless_cq) + io_cqe_overflow(ctx, &req->cqe, &req->big_cqe); + else + io_cqe_overflow_locked(ctx, &req->cqe, &req->big_cqe); } } __io_cq_unlock_post(ctx); From 8bb9d6ccd36062d16baa707b759809e1f494017e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 8 May 2025 14:48:33 -0600 Subject: [PATCH 54/59] io_uring: finish IOU_OK -> IOU_COMPLETE transition IOU_COMPLETE is more descriptive, in that it explicitly says that the return value means "please post a completion for this request". This patch completes the transition from IOU_OK to IOU_COMPLETE, replacing existing IOU_OK users. This is a purely mechanical change. Signed-off-by: Jens Axboe --- io_uring/advise.c | 4 ++-- io_uring/cancel.c | 2 +- io_uring/epoll.c | 4 ++-- io_uring/fs.c | 10 +++++----- io_uring/futex.c | 6 +++--- io_uring/io_uring.c | 2 +- io_uring/io_uring.h | 1 - io_uring/kbuf.c | 2 +- io_uring/msg_ring.c | 2 +- io_uring/net.c | 14 +++++++------- io_uring/nop.c | 2 +- io_uring/openclose.c | 8 ++++---- io_uring/poll.c | 4 ++-- io_uring/rsrc.c | 2 +- io_uring/rw.c | 2 +- io_uring/splice.c | 4 ++-- io_uring/statx.c | 2 +- io_uring/sync.c | 6 +++--- io_uring/timeout.c | 2 +- io_uring/truncate.c | 2 +- io_uring/uring_cmd.c | 2 +- io_uring/waitid.c | 2 +- io_uring/xattr.c | 8 ++++---- 23 files changed, 46 insertions(+), 47 deletions(-) diff --git a/io_uring/advise.c b/io_uring/advise.c index cb7b881665e5..0073f74e3658 100644 --- a/io_uring/advise.c +++ b/io_uring/advise.c @@ -58,7 +58,7 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags) ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; #else return -EOPNOTSUPP; #endif @@ -104,5 +104,5 @@ int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 0870060bac7c..6d57602304df 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -229,7 +229,7 @@ done: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } static int __io_sync_cancel(struct io_uring_task *tctx, diff --git a/io_uring/epoll.c b/io_uring/epoll.c index 6d2c48ba1923..8d4610246ba0 100644 --- a/io_uring/epoll.c +++ b/io_uring/epoll.c @@ -61,7 +61,7 @@ int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -88,5 +88,5 @@ int io_epoll_wait(struct io_kiocb *req, unsigned int 
issue_flags) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/fs.c b/io_uring/fs.c index eccea851dd5a..37079a414eab 100644 --- a/io_uring/fs.c +++ b/io_uring/fs.c @@ -90,7 +90,7 @@ int io_renameat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_renameat_cleanup(struct io_kiocb *req) @@ -141,7 +141,7 @@ int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_unlinkat_cleanup(struct io_kiocb *req) @@ -185,7 +185,7 @@ int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_mkdirat_cleanup(struct io_kiocb *req) @@ -235,7 +235,7 @@ int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -281,7 +281,7 @@ int io_linkat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_link_cleanup(struct io_kiocb *req) diff --git a/io_uring/futex.c b/io_uring/futex.c index 0ea4820cd8ff..b34695022baa 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -234,7 +234,7 @@ int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags) kfree(futexv); req->async_data = NULL; req->flags &= ~REQ_F_ASYNC_DATA; - return IOU_OK; + return IOU_COMPLETE; } /* @@ -311,7 +311,7 @@ done: req_set_fail(req); io_req_set_res(req, ret, 0); kfree(ifd); - return IOU_OK; + return IOU_COMPLETE; } int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags) @@ -328,5 +328,5 @@ int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3c4a9561941f..5cdccf65c652 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1751,7 +1751,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) ret = __io_issue_sqe(req, issue_flags, def); - if (ret == IOU_OK) { + if (ret == IOU_COMPLETE) { if (issue_flags & IO_URING_F_COMPLETE_DEFER) io_req_complete_defer(req); else diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 81f22196a57d..0ea7a435d1de 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -19,7 +19,6 @@ #endif enum { - IOU_OK = 0, /* deprecated, use IOU_COMPLETE */ IOU_COMPLETE = 0, IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 823e7eb15fb2..8cce3ebd813f 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -554,7 +554,7 @@ int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 50a958e9c921..71400d6cefc8 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -328,7 +328,7 @@ done: req_set_fail(req); } io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_uring_sync_msg_ring(struct 
io_uring_sqe *sqe) diff --git a/io_uring/net.c b/io_uring/net.c index 1fbdb2bbb3f3..ee3f721ad758 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -128,7 +128,7 @@ int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) ret = __sys_shutdown_sock(sock, shutdown->how); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } static bool io_net_retry(struct socket *sock, int flags) @@ -502,7 +502,7 @@ static inline bool io_send_finish(struct io_kiocb *req, int *ret, /* Otherwise stop bundle and use the current result. */ finish: io_req_set_res(req, *ret, cflags); - *ret = IOU_OK; + *ret = IOU_COMPLETE; return true; } @@ -553,7 +553,7 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) else if (sr->done_io) ret = sr->done_io; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags, @@ -1459,7 +1459,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) io_req_msg_cleanup(req, 0); } io_req_set_res(req, ret, IORING_CQE_F_MORE); - return IOU_OK; + return IOU_COMPLETE; } int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) @@ -1530,7 +1530,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) io_req_msg_cleanup(req, 0); } io_req_set_res(req, ret, IORING_CQE_F_MORE); - return IOU_OK; + return IOU_COMPLETE; } void io_sendrecv_fail(struct io_kiocb *req) @@ -1694,7 +1694,7 @@ int io_socket(struct io_kiocb *req, unsigned int issue_flags) sock->file_slot); } io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -1761,7 +1761,7 @@ out: req_set_fail(req); io_req_msg_cleanup(req, issue_flags); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) diff --git a/io_uring/nop.c b/io_uring/nop.c index 28f06285fdc2..6ac2de761fd3 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -68,5 +68,5 @@ done: if (ret < 0) req_set_fail(req); io_req_set_res(req, nop->result, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/openclose.c b/io_uring/openclose.c index 4dd461163457..83e36ad4e31b 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -171,7 +171,7 @@ err: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_openat(struct io_kiocb *req, unsigned int issue_flags) @@ -259,7 +259,7 @@ err: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -302,7 +302,7 @@ int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } struct io_pipe { @@ -426,7 +426,7 @@ int io_pipe(struct io_kiocb *req, unsigned int issue_flags) io_req_set_res(req, ret, 0); if (!ret) - return IOU_OK; + return IOU_COMPLETE; req_set_fail(req); if (files[0]) diff --git a/io_uring/poll.c b/io_uring/poll.c index 8eb744eb9f4c..0526062e2f81 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -893,7 +893,7 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags); if (ret > 0) { io_req_set_res(req, ipt.result_mask, 0); - return IOU_OK; + return IOU_COMPLETE; } return ret ?: 
IOU_ISSUE_SKIP_COMPLETE; } @@ -948,5 +948,5 @@ out: } /* complete update request, we're done with it */ io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 1657d775c8ba..c592ceace97d 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -500,7 +500,7 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) diff --git a/io_uring/rw.c b/io_uring/rw.c index 17a12a1cf3a6..8857b8445e46 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -660,7 +660,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, io_req_io_end(req); io_req_set_res(req, final_ret, io_put_kbuf(req, ret, issue_flags)); io_req_rw_cleanup(req, issue_flags); - return IOU_OK; + return IOU_COMPLETE; } else { io_rw_done(req, ret); } diff --git a/io_uring/splice.c b/io_uring/splice.c index 7b89bd84d486..35ce4e60b495 100644 --- a/io_uring/splice.c +++ b/io_uring/splice.c @@ -103,7 +103,7 @@ done: if (ret != sp->len) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -144,5 +144,5 @@ done: if (ret != sp->len) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/statx.c b/io_uring/statx.c index 6bc4651700a2..5111e9befbfe 100644 --- a/io_uring/statx.c +++ b/io_uring/statx.c @@ -59,7 +59,7 @@ int io_statx(struct io_kiocb *req, unsigned int issue_flags) ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_statx_cleanup(struct io_kiocb *req) diff --git a/io_uring/sync.c b/io_uring/sync.c index 255f68c37e55..cea2d381ffd2 100644 --- a/io_uring/sync.c +++ b/io_uring/sync.c @@ -47,7 +47,7 @@ int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) ret = sync_file_range(req->file, sync->off, sync->len, sync->flags); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -79,7 +79,7 @@ int io_fsync(struct io_kiocb *req, unsigned int issue_flags) ret = vfs_fsync_range(req->file, sync->off, end > 0 ? 
end : LLONG_MAX, sync->flags & IORING_FSYNC_DATASYNC); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -108,5 +108,5 @@ int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) if (ret >= 0) fsnotify_modify(req->file); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/timeout.c b/io_uring/timeout.c index a6ff8c026b1f..7f13bfa9f2b6 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -505,7 +505,7 @@ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } static int __io_timeout_prep(struct io_kiocb *req, diff --git a/io_uring/truncate.c b/io_uring/truncate.c index 62ee73d34d72..487baf23b44e 100644 --- a/io_uring/truncate.c +++ b/io_uring/truncate.c @@ -44,5 +44,5 @@ int io_ftruncate(struct io_kiocb *req, unsigned int issue_flags) ret = do_ftruncate(req->file, ft->len, 1); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 8a6b0ddef796..fe84c934734e 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -265,7 +265,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) req_set_fail(req); io_req_uring_cleanup(req, issue_flags); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, diff --git a/io_uring/waitid.c b/io_uring/waitid.c index 54e69984cd8a..e07a94694397 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -323,5 +323,5 @@ done: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/xattr.c b/io_uring/xattr.c index de5064fcae8a..322b94ff9e4b 100644 --- a/io_uring/xattr.c +++ b/io_uring/xattr.c @@ -109,7 +109,7 @@ int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) ret = file_getxattr(req->file, &ix->ctx); io_xattr_finish(req, ret); - return IOU_OK; + return IOU_COMPLETE; } int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) @@ -122,7 +122,7 @@ int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) ret = filename_getxattr(AT_FDCWD, ix->filename, LOOKUP_FOLLOW, &ix->ctx); ix->filename = NULL; io_xattr_finish(req, ret); - return IOU_OK; + return IOU_COMPLETE; } static int __io_setxattr_prep(struct io_kiocb *req, @@ -190,7 +190,7 @@ int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) ret = file_setxattr(req->file, &ix->ctx); io_xattr_finish(req, ret); - return IOU_OK; + return IOU_COMPLETE; } int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) @@ -203,5 +203,5 @@ int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) ret = filename_setxattr(AT_FDCWD, ix->filename, LOOKUP_FOLLOW, &ix->ctx); ix->filename = NULL; io_xattr_finish(req, ret); - return IOU_OK; + return IOU_COMPLETE; } From 28be240c763a44932bfe573f09e145d182e52609 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 22 May 2025 09:04:50 -0600 Subject: [PATCH 55/59] trace/io_uring: fix io_uring_local_work_run ctx documentation The comment for the tracepoint io_uring_local_work_run refers to a field "tctx" and a type "io_uring_ctx", neither of which exist. "tctx" looks to mean "ctx" and "io_uring_ctx" should be "io_ring_ctx". 
Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250522150451.2385652-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/trace/events/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index fb81c533b310..178ab6f611be 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -645,7 +645,7 @@ TRACE_EVENT(io_uring_short_write, /* * io_uring_local_work_run - ran ring local task work * - * @tctx: pointer to a io_uring_ctx + * @ctx: pointer to an io_ring_ctx * @count: how many functions it ran * @loops: how many loops it ran * From e37dfc0530815ead6e8ddab2a4ccce3be31af954 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 23 May 2025 06:04:29 -0600 Subject: [PATCH 56/59] io_uring/io-wq: move hash helpers to the top Just in preparation for using them higher up in the function, move these generic helpers. Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index d52069b1177b..d36d0bd9847d 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -150,6 +150,16 @@ static bool io_acct_cancel_pending_work(struct io_wq *wq, static void create_worker_cb(struct callback_head *cb); static void io_wq_cancel_tw_create(struct io_wq *wq); +static inline unsigned int __io_get_work_hash(unsigned int work_flags) +{ + return work_flags >> IO_WQ_HASH_SHIFT; +} + +static inline unsigned int io_get_work_hash(struct io_wq_work *work) +{ + return __io_get_work_hash(atomic_read(&work->flags)); +} + static bool io_worker_get(struct io_worker *worker) { return refcount_inc_not_zero(&worker->ref); @@ -454,16 +464,6 @@ static void __io_worker_idle(struct io_wq_acct *acct, struct io_worker *worker) } } -static inline unsigned int __io_get_work_hash(unsigned int work_flags) -{ - return work_flags >> IO_WQ_HASH_SHIFT; -} - -static inline unsigned int io_get_work_hash(struct io_wq_work *work) -{ - return __io_get_work_hash(atomic_read(&work->flags)); -} - static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash) { bool ret = false; From 8343cae362e147a5d4505c2da0e161a4d9e9fbde Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 23 May 2025 06:05:37 -0600 Subject: [PATCH 57/59] io_uring/io-wq: ignore non-busy worker going to sleep When an io-wq worker goes to sleep, it checks if there's work to do. If there is, it'll create a new worker. But if this worker is currently idle, it'll either get woken right back up immediately, or someone else has already created the necessary worker to handle this work. Only go through the worker creation logic if the current worker is currently handling a work item. That means it's being scheduled out as part of handling that work, not just going to sleep on its own. 
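In sketch form, the decision in io_wq_dec_running() becomes the following (simplified; the actual two-line addition is in the diff below):

/* Sketch: only consider creating a replacement worker when the last
 * running worker blocks in the middle of a work item and there is
 * queued work it could have handled. */
static void dec_running_sketch(struct io_worker *worker,
                               struct io_wq_acct *acct)
{
        if (!atomic_dec_and_test(&acct->nr_running))
                return;         /* other workers are still running */
        if (!worker->cur_work)
                return;         /* just going idle, nothing is blocked */
        if (!io_acct_run_queue(acct))
                return;         /* no pending work to pick up */
        /* ... create a new worker ... */
}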
Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index d36d0bd9847d..c4af99460399 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -429,6 +429,8 @@ static void io_wq_dec_running(struct io_worker *worker) if (!atomic_dec_and_test(&acct->nr_running)) return; + if (!worker->cur_work) + return; if (!io_acct_run_queue(acct)) return; From 0b2b066f8a854ff485319f697d16c5f565c32112 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 23 May 2025 06:08:49 -0600 Subject: [PATCH 58/59] io_uring/io-wq: only create a new worker if it can make progress Hashed work is serialized by io-wq, intended to be used for cases like serializing buffered writes to a regular file, where the file system will serialize the workers anyway with a mutex or similar. Since they would be forcibly serialized and blocked, it's more efficient for io-wq to handle these individually rather than issue them in parallel. If a worker is currently handling a hashed work item and gets blocked, don't create a new worker if the next work item is also hashed and mapped to the same bucket. That new worker would not be able to make any progress anyway. Reported-by: Fengnan Chang Reported-by: Diangang Li Link: https://lore.kernel.org/io-uring/20250522090909.73212-1-changfengnan@bytedance.com/ Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index c4af99460399..cd1fcb115739 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -419,6 +419,30 @@ fail: return false; } +/* Defer if current and next work are both hashed to the same chain */ +static bool io_wq_hash_defer(struct io_wq_work *work, struct io_wq_acct *acct) +{ + unsigned int hash, work_flags; + struct io_wq_work *next; + + lockdep_assert_held(&acct->lock); + + work_flags = atomic_read(&work->flags); + if (!__io_wq_is_hashed(work_flags)) + return false; + + /* should not happen, io_acct_run_queue() said we had work */ + if (wq_list_empty(&acct->work_list)) + return true; + + hash = __io_get_work_hash(work_flags); + next = container_of(acct->work_list.first, struct io_wq_work, list); + work_flags = atomic_read(&next->flags); + if (!__io_wq_is_hashed(work_flags)) + return false; + return hash == __io_get_work_hash(work_flags); +} + static void io_wq_dec_running(struct io_worker *worker) { struct io_wq_acct *acct = io_wq_get_acct(worker); @@ -433,6 +457,10 @@ static void io_wq_dec_running(struct io_worker *worker) return; if (!io_acct_run_queue(acct)) return; + if (io_wq_hash_defer(worker->cur_work, acct)) { + raw_spin_unlock(&acct->lock); + return; + } raw_spin_unlock(&acct->lock); atomic_inc(&acct->nr_running); From 6faaf6e0faf1cc9a1359cfe6ecb4d9711b4a9f29 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 23 May 2025 10:04:46 +0100 Subject: [PATCH 59/59] io_uring/cmd: warn on reg buf imports by ineligible cmds For IORING_URING_CMD_FIXED-less commands io_uring doesn't pull buf_index from the sqe, so imports might succeed if the index coincides, e.g. when it's 0, but otherwise it's error-prone. Warn if someone tries to import without the flag.
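From the driver side, the contract is that only commands submitted with IORING_URING_CMD_FIXED may call the import helpers; a hypothetical handler (made-up names for the command payload fields) would look roughly like:

/* Hypothetical uring_cmd handler sketch: ubuf/len would come from the
 * driver's command payload; the flag check mirrors the new WARN. */
static int drv_uring_cmd_sketch(struct io_uring_cmd *ioucmd,
                                unsigned int issue_flags)
{
        struct iov_iter iter;
        u64 ubuf = 0;           /* placeholder payload address */
        unsigned long len = 0;  /* placeholder payload length */
        int ret;

        if (ioucmd->flags & IORING_URING_CMD_FIXED) {
                ret = io_uring_cmd_import_fixed(ubuf, len, READ, &iter,
                                                ioucmd, issue_flags);
                if (ret)
                        return ret;
        }
        /* non-FIXED commands must use plain user copies instead */
        return 0;
}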
Signed-off-by: Pavel Begunkov Reviewed-by: Anuj Gupta Link: https://lore.kernel.org/r/a1c2c88e53c3fe96978f23d50c6bc66c2c79c337.1747991070.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index fe84c934734e..929cad6ee326 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -275,6 +275,9 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + if (WARN_ON_ONCE(!(ioucmd->flags & IORING_URING_CMD_FIXED))) + return -EINVAL; + return io_import_reg_buf(req, iter, ubuf, len, rw, issue_flags); } EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed); @@ -289,6 +292,9 @@ int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd, struct io_async_cmd *ac = req->async_data; int ret; + if (WARN_ON_ONCE(!(ioucmd->flags & IORING_URING_CMD_FIXED))) + return -EINVAL; + ret = io_prep_reg_iovec(req, &ac->vec, uvec, uvec_segs); if (ret) return ret;