Merge tag 'for-6.12/io_uring-20240913' of git://git.kernel.dk/linux

Pull io_uring updates from Jens Axboe:

 - NAPI fixes and cleanups (Pavel, Olivier)

 - Add support for absolute timeouts (Pavel)

 - Fixes for io-wq/sqpoll affinities (Felix)

 - Efficiency improvements for dealing with huge pages (Chenliang)

 - Support for a minwait mode, where the application essentially has two
   timeouts - one smaller one that defines the batch timeout, and the
   larger overall one similar to what we had before. This enables
   efficient use of batching based on count + timeout, while still
   working well with periods of less intensive workloads (see the sketch
   after this list)

 - Use ITER_UBUF for single segment sends

 - Add support for incremental buffer consumption. Right now each
   operation will always consume a full buffer. With incremental
   consumption, a recv/read operation only consumes the part of the
   buffer that it needs to satisfy the operation

 - Add support for GCOV for io_uring, to help maintain high test coverage
   of the code

 - Fix regression with ocfs2, where an odd -EOPNOTSUPP wasn't correctly
   converted to a blocking retry

 - Add support for cloning registered buffers from one ring to another

 - Misc cleanups (Anuj, me)
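
As a rough illustration of the minwait mode (a minimal sketch, assuming the
6.12 uapi headers and a kernel reporting IORING_FEAT_MIN_TIMEOUT; the
wait_batch() helper and the 100 usec / 1 sec values are made up for the
example, not part of this series):

#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>
#include <linux/time_types.h>

/* Wait up to min_wait_usec for a full batch of 'want' CQEs; if that window
 * expires and at least one CQE is available, return early rather than
 * waiting out the full overall timeout in 'ts'. */
static int wait_batch(int ring_fd, unsigned want)
{
	struct __kernel_timespec ts = { .tv_sec = 1 };	/* overall timeout */
	struct io_uring_getevents_arg arg = {
		.min_wait_usec = 100,			/* batch timeout */
		.ts = (uint64_t)(uintptr_t)&ts,
	};

	return syscall(__NR_io_uring_enter, ring_fd, 0, want,
		       IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
		       &arg, sizeof(arg));
}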

* tag 'for-6.12/io_uring-20240913' of git://git.kernel.dk/linux: (35 commits)
  io_uring: add IORING_REGISTER_COPY_BUFFERS method
  io_uring/register: provide helper to get io_ring_ctx from 'fd'
  io_uring/rsrc: add reference count to struct io_mapped_ubuf
  io_uring/rsrc: clear 'slot' entry upfront
  io_uring/io-wq: inherit cpuset of cgroup in io worker
  io_uring/io-wq: do not allow pinning outside of cpuset
  io_uring/rw: drop -EOPNOTSUPP check in __io_complete_rw_common()
  io_uring/rw: treat -EOPNOTSUPP for IOCB_NOWAIT like -EAGAIN
  io_uring/sqpoll: do not allow pinning outside of cpuset
  io_uring/eventfd: move refs to refcount_t
  io_uring: remove unused rsrc_put_fn
  io_uring: add new line after variable declaration
  io_uring: add GCOV_PROFILE_URING Kconfig option
  io_uring/kbuf: add support for incremental buffer consumption
  io_uring/kbuf: pass in 'len' argument for buffer commit
  Revert "io_uring: Require zeroed sqe->len on provided-buffers send"
  io_uring/kbuf: move io_ring_head_to_buf() to kbuf.h
  io_uring/kbuf: add io_kbuf_commit() helper
  io_uring/kbuf: shrink nr_iovs/mode in struct buf_sel_arg
  io_uring: wire up min batch wake timeout
  ...
Merged by Linus Torvalds, 2024-09-16 13:29:00 +02:00
20 changed files with 725 additions and 262 deletions

include/uapi/linux/io_uring.h: +41 -1
@@ -440,11 +440,21 @@ struct io_uring_cqe {
* IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv
* IORING_CQE_F_NOTIF Set for notification CQEs. Can be used to distinguish
* them from sends.
* IORING_CQE_F_BUF_MORE If set, the buffer ID set in the completion will get
* more completions. In other words, the buffer is being
* partially consumed, and will be used by the kernel for
* more completions. This is only set for buffers used via
* incremental buffer consumption, as provided by a ring
* buffer set up with IOU_PBUF_RING_INC. For any other
* provided buffer type, any buffer passed back in a
* completion is automatically returned to the application.
*/
#define IORING_CQE_F_BUFFER (1U << 0)
#define IORING_CQE_F_MORE (1U << 1)
#define IORING_CQE_F_SOCK_NONEMPTY (1U << 2)
#define IORING_CQE_F_NOTIF (1U << 3)
#define IORING_CQE_F_BUF_MORE (1U << 4)
#define IORING_CQE_BUFFER_SHIFT 16
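
For instance, a recv completion from an incrementally-consumed ring might be
handled roughly like this (a sketch; consume() and the per-buffer offset
table are hypothetical application-side bookkeeping, not anything this
series mandates):

#include <stdint.h>
#include <linux/io_uring.h>

extern void consume(unsigned bid, uint64_t off, int len);	/* hypothetical */

/* 'off' is a per-buffer-ID offset table the application keeps, since with
 * IOU_PBUF_RING_INC both the kernel and the app track the current position. */
static void handle_recv_cqe(struct io_uring_cqe *cqe, uint64_t *off)
{
	unsigned bid;

	if (!(cqe->flags & IORING_CQE_F_BUFFER) || cqe->res < 0)
		return;

	bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
	consume(bid, off[bid], cqe->res);

	if (cqe->flags & IORING_CQE_F_BUF_MORE)
		off[bid] += cqe->res;	/* partially consumed, kernel keeps it */
	else
		off[bid] = 0;		/* fully consumed, returned to the app */
}
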
@@ -507,6 +517,7 @@ struct io_cqring_offsets {
#define IORING_ENTER_SQ_WAIT (1U << 2)
#define IORING_ENTER_EXT_ARG (1U << 3)
#define IORING_ENTER_REGISTERED_RING (1U << 4)
#define IORING_ENTER_ABS_TIMER (1U << 5)
/*
* Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -542,6 +553,7 @@ struct io_uring_params {
#define IORING_FEAT_LINKED_FILE (1U << 12)
#define IORING_FEAT_REG_REG_RING (1U << 13)
#define IORING_FEAT_RECVSEND_BUNDLE (1U << 14)
#define IORING_FEAT_MIN_TIMEOUT (1U << 15)
/*
* io_uring_register(2) opcodes and arguments
@@ -595,6 +607,11 @@ enum io_uring_register_op {
	IORING_REGISTER_NAPI = 27,
	IORING_UNREGISTER_NAPI = 28,
	IORING_REGISTER_CLOCK = 29,

	/* copy registered buffers from source ring to current ring */
	IORING_REGISTER_COPY_BUFFERS = 30,

	/* this goes last */
	IORING_REGISTER_LAST,
@@ -675,6 +692,21 @@ struct io_uring_restriction {
	__u32 resv2[3];
};

struct io_uring_clock_register {
	__u32	clockid;
	__u32	__resv[3];
};

enum {
	IORING_REGISTER_SRC_REGISTERED = 1,
};

struct io_uring_copy_buffers {
	__u32	src_fd;
	__u32	flags;
	__u32	pad[6];
};

struct io_uring_buf {
	__u64	addr;
	__u32	len;
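
Hooking the copy up from userspace might look like the following (a sketch;
clone_src_buffers() is a hypothetical wrapper around io_uring_register(2)):

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

/* Copy the buffers registered on 'src_fd' into the ring behind 'dst_fd'. */
static int clone_src_buffers(int dst_fd, int src_fd)
{
	struct io_uring_copy_buffers buf = {
		.src_fd	= src_fd,
		.flags	= 0,	/* or IORING_REGISTER_SRC_REGISTERED if src_fd
				 * is a registered ring fd */
	};

	return syscall(__NR_io_uring_register, dst_fd,
		       IORING_REGISTER_COPY_BUFFERS, &buf, 1);
}
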
@@ -707,9 +739,17 @@ struct io_uring_buf_ring {
* mmap(2) with the offset set as:
* IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT)
* to get a virtual mapping for the ring.
* IOU_PBUF_RING_INC: If set, buffers consumed from this buffer ring can be
* consumed incrementally. Normally one (or more) buffers
* are fully consumed. With incremental consumption, it's
* feasible to register big ranges of buffers, and each
* use will consume only as much as it needs. This
* requires that both the kernel and application keep
* track of where the current read/recv index is.
*/
enum io_uring_register_pbuf_ring_flags {
	IOU_PBUF_RING_MMAP = 1,
	IOU_PBUF_RING_INC = 2,
};
/* argument for IORING_(UN)REGISTER_PBUF_RING */
@@ -758,7 +798,7 @@ enum io_uring_register_restriction_op {
struct io_uring_getevents_arg {
	__u64	sigmask;
	__u32	sigmask_sz;
-	__u32	pad;
+	__u32	min_wait_usec;
	__u64	ts;
};