io_uring: add support for fixed wait regions
Generally applications have one or just a few types of waits, yet they pass
in a struct io_uring_getevents_arg every time. This needs to get copied
and, in turn, the timeout value needs to get copied.
Rather than do this for every invocation, allow the application to
register a fixed set of wait regions that can simply be indexed when
asking the kernel to wait on events.
At ring setup time, the application can register a number of these wait
regions and initialize region/index 0 upfront:
struct io_uring_reg_wait *reg;
reg = io_uring_setup_reg_wait(ring, nr_regions, &ret);
/* set timeout and mark as set, sigmask/sigmask_sz as needed */
reg->ts.tv_sec = 0;
reg->ts.tv_nsec = 100000;
reg->flags = IORING_REG_WAIT_TS;
where nr_regions >= 1 && nr_regions <= PAGE_SIZE / sizeof(*reg). The
above initializes index 0, but 63 other regions can be initialized,
if needed. Now, instead of doing:
struct __kernel_timespec timeout = { .tv_nsec = 100000, };
io_uring_submit_and_wait_timeout(ring, &cqe, nr, &timeout, NULL);
to wait for events for each submit_and_wait, or just wait, operation, it
can just reference the above region at offset 0 and do:
io_uring_submit_and_wait_reg(ring, &cqe, nr, 0);
to achieve the same goal of waiting 100usec without needing to copy
both struct io_uring_getevents_arg (24b) and struct __kernel_timespec
(16b) for each invocation. Struct io_uring_reg_wait looks as follows:
struct io_uring_reg_wait {
struct __kernel_timespec ts;
__u32 min_wait_usec;
__u32 flags;
__u64 sigmask;
__u32 sigmask_sz;
__u32 pad[3];
__u64 pad2[2];
};
embedding the timeout itself in the region, rather than passing it as
a pointer as well. Note that the signal mask is still passed as a
pointer, both for compatibility reasons and because there don't
seem to be many high-frequency wait scenarios that involve setting
and resetting the signal mask for each wait.
The application is free to modify any region before a wait call, or it
can keep multiple regions with different settings to avoid needing to
modify the same one for wait calls. Up to a page size of regions is mapped
by default, allowing PAGE_SIZE / 64 available regions for use.
The registered region must fit within a page. On a 4kb page size system,
that allows for 64 wait regions if a full page is used, as the size of
struct io_uring_reg_wait is 64b. The region registered must be aligned
to io_uring_reg_wait in size. It's valid to register less than 64
entries.
In network performance testing with zero-copy, this reduced the time
spent waiting on the TX side from 3.12% to 0.3% and the RX side from 4.4%
to 0.3%.
Wait regions are fixed for the lifetime of the ring - once registered,
they are persistent until the ring is torn down. The regions support
minimum wait timeout as well as the regular waits.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
@@ -518,6 +518,7 @@ struct io_cqring_offsets {
 #define IORING_ENTER_EXT_ARG		(1U << 3)
 #define IORING_ENTER_REGISTERED_RING	(1U << 4)
 #define IORING_ENTER_ABS_TIMER		(1U << 5)
 #define IORING_ENTER_EXT_ARG_REG	(1U << 6)

 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -620,6 +621,9 @@ enum io_uring_register_op {
	/* resize CQ ring */
	IORING_REGISTER_RESIZE_RINGS		= 33,

	/* register fixed io_uring_reg_wait arguments */
	IORING_REGISTER_CQWAIT_REG		= 34,

	/* this goes last */
	IORING_REGISTER_LAST,
@@ -803,6 +807,43 @@ enum io_uring_register_restriction_op {
	IORING_RESTRICTION_LAST
};

enum {
	IORING_REG_WAIT_TS		= (1U << 0),
};

/*
 * Argument for IORING_REGISTER_CQWAIT_REG, registering a region of
 * struct io_uring_reg_wait that can be indexed when io_uring_enter(2) is
 * called rather than pass in a wait argument structure separately.
 */
struct io_uring_cqwait_reg_arg {
	__u32		flags;
	__u32		struct_size;
	__u32		nr_entries;
	__u32		pad;
	__u64		user_addr;
	__u64		pad2[3];
};

/*
 * Argument for io_uring_enter(2) with
 * IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG set, where the actual argument
 * is an index into a previously registered fixed wait region described by
 * the below structure.
 */
struct io_uring_reg_wait {
	struct __kernel_timespec	ts;
	__u32				min_wait_usec;
	__u32				flags;
	__u64				sigmask;
	__u32				sigmask_sz;
	__u32				pad[3];
	__u64				pad2[2];
};

 /*
  * Argument for io_uring_enter(2) with IORING_GETEVENTS | IORING_ENTER_EXT_ARG
  */
 struct io_uring_getevents_arg {
	__u64	sigmask;
	__u32	sigmask_sz;
Reference in New Issue
Block a user