io_uring: add option to remove SQ indirection
author	Pavel Begunkov <asml.silence@gmail.com>
Thu, 24 Aug 2023 22:53:32 +0000 (23:53 +0100)
committer	Jens Axboe <axboe@kernel.dk>
Thu, 24 Aug 2023 23:16:19 +0000 (17:16 -0600)
Not many are aware, but the io_uring submission queue has two levels. The first
level usually appears as sq_array and stores indexes into the actual SQ.

To my knowledge, no one has ever seriously used it, nor does liburing expose
it to users. Add IORING_SETUP_NO_SQARRAY: when it is set, we don't bother
creating and using the sq_array, and the SQ head/tail index directly into the
SQ. This improves the memory footprint, in terms of both allocations and
cache usage, and should also make io_get_sqe() less branchy in the end.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0ffa3268a5ef61d326201ff43a233315c96312e0.1692916914.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
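
For context, here is a minimal userspace sketch (not part of the patch) of how a ring
created with the new flag could be driven over the raw syscalls: the application fills
the SQE at tail & ring_mask and bumps the tail, with no sq_array[] store and no use of
sq_off.array. The helper names (sys_io_uring_setup(), submit_one_nop()) are made up for
illustration; the sketch assumes a kernel with this patch plus IORING_FEAT_SINGLE_MMAP,
and trims error handling and unmapping.

/*
 * Illustration only: set up a ring with IORING_SETUP_NO_SQARRAY and
 * submit a single NOP over the raw syscalls.
 */
#include <linux/io_uring.h>
#include <stdatomic.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static int sys_io_uring_setup(unsigned entries, struct io_uring_params *p)
{
	return (int)syscall(__NR_io_uring_setup, entries, p);
}

static int sys_io_uring_enter(int fd, unsigned to_submit,
			      unsigned min_complete, unsigned flags)
{
	return (int)syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
			    flags, NULL, 0);
}

int submit_one_nop(void)
{
	struct io_uring_params p = { .flags = IORING_SETUP_NO_SQARRAY };
	int fd = sys_io_uring_setup(8, &p);

	if (fd < 0)
		return -1;
	if (!(p.features & IORING_FEAT_SINGLE_MMAP))
		return -1;	/* sketch assumes the shared SQ/CQ mapping */

	/* SQ and CQ rings share one mapping; the SQE array is mapped separately. */
	size_t ring_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);
	void *ring = mmap(NULL, ring_sz, PROT_READ | PROT_WRITE,
			  MAP_SHARED, fd, IORING_OFF_SQ_RING);
	struct io_uring_sqe *sqes = mmap(NULL, p.sq_entries * sizeof(*sqes),
					 PROT_READ | PROT_WRITE,
					 MAP_SHARED, fd, IORING_OFF_SQES);
	if (ring == MAP_FAILED || sqes == MAP_FAILED)
		return -1;

	_Atomic unsigned *sq_tail = (void *)((char *)ring + p.sq_off.tail);
	unsigned *sq_mask = (void *)((char *)ring + p.sq_off.ring_mask);
	/* Note: p.sq_off.array is not filled in with IORING_SETUP_NO_SQARRAY. */

	/* The tail indexes the SQE array directly; no sq_array[] store needed. */
	unsigned tail = atomic_load_explicit(sq_tail, memory_order_relaxed);
	struct io_uring_sqe *sqe = &sqes[tail & *sq_mask];

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_NOP;
	atomic_store_explicit(sq_tail, tail + 1, memory_order_release);

	return sys_io_uring_enter(fd, 1, 1, IORING_ENTER_GETEVENTS);
}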
include/uapi/linux/io_uring.h
io_uring/io_uring.c

index 9fc7195..8e61f8b 100644 (file)
@@ -185,6 +185,11 @@ enum {
  */
 #define IORING_SETUP_REGISTERED_FD_ONLY        (1U << 15)
 
+/*
+ * Removes indirection through the SQ index array.
+ */
+#define IORING_SETUP_NO_SQARRAY                (1U << 16)
+
 enum io_uring_op {
        IORING_OP_NOP,
        IORING_OP_READV,
index e832190..a6eea39 100644 (file)
@@ -2339,8 +2339,21 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
  */
 static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
 {
-       unsigned head, mask = ctx->sq_entries - 1;
-       unsigned sq_idx = ctx->cached_sq_head++ & mask;
+       unsigned mask = ctx->sq_entries - 1;
+       unsigned head = ctx->cached_sq_head++ & mask;
+
+       if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) {
+               head = READ_ONCE(ctx->sq_array[head]);
+               if (unlikely(head >= ctx->sq_entries)) {
+                       /* drop invalid entries */
+                       spin_lock(&ctx->completion_lock);
+                       ctx->cq_extra--;
+                       spin_unlock(&ctx->completion_lock);
+                       WRITE_ONCE(ctx->rings->sq_dropped,
+                                  READ_ONCE(ctx->rings->sq_dropped) + 1);
+                       return false;
+               }
+       }
 
        /*
         * The cached sq head (or cq tail) serves two purposes:
@@ -2350,22 +2363,12 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
         * 2) allows the kernel side to track the head on its own, even
         *    though the application is the one updating it.
         */
-       head = READ_ONCE(ctx->sq_array[sq_idx]);
-       if (likely(head < ctx->sq_entries)) {
-               /* double index for 128-byte SQEs, twice as long */
-               if (ctx->flags & IORING_SETUP_SQE128)
-                       head <<= 1;
-               *sqe = &ctx->sq_sqes[head];
-               return true;
-       }
 
-       /* drop invalid entries */
-       spin_lock(&ctx->completion_lock);
-       ctx->cq_extra--;
-       spin_unlock(&ctx->completion_lock);
-       WRITE_ONCE(ctx->rings->sq_dropped,
-                  READ_ONCE(ctx->rings->sq_dropped) + 1);
-       return false;
+       /* double index for 128-byte SQEs, twice as long */
+       if (ctx->flags & IORING_SETUP_SQE128)
+               head <<= 1;
+       *sqe = &ctx->sq_sqes[head];
+       return true;
 }
 
 int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
@@ -2734,6 +2737,12 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries
                return SIZE_MAX;
 #endif
 
+       if (ctx->flags & IORING_SETUP_NO_SQARRAY) {
+               if (sq_offset)
+                       *sq_offset = SIZE_MAX;
+               return off;
+       }
+
        if (sq_offset)
                *sq_offset = off;
 
@@ -3710,7 +3719,8 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
                return PTR_ERR(rings);
 
        ctx->rings = rings;
-       ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
+       if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
+               ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
        rings->sq_ring_mask = p->sq_entries - 1;
        rings->cq_ring_mask = p->cq_entries - 1;
        rings->sq_ring_entries = p->sq_entries;
@@ -3921,7 +3931,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
        p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
        p->sq_off.flags = offsetof(struct io_rings, sq_flags);
        p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
-       p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
+       if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
+               p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
        p->sq_off.resv1 = 0;
        if (!(ctx->flags & IORING_SETUP_NO_MMAP))
                p->sq_off.user_addr = 0;
@@ -4010,7 +4021,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
                        IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
                        IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
                        IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
-                       IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY))
+                       IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
+                       IORING_SETUP_NO_SQARRAY))
                return -EINVAL;
 
        return io_uring_create(entries, &p, params);
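
To make the userspace-visible difference concrete, a hedged before/after sketch of the
per-submission publish step follows. struct sq_view and the publish_*() helpers are
placeholders standing in for the usual mmap'ed SQ ring fields; real code also needs a
release store (or write barrier) when bumping the tail.

#include <linux/io_uring.h>

/* Hypothetical view of the mmap'ed SQ ring, for illustration only. */
struct sq_view {
	unsigned *tail;			/* ring + sq_off.tail */
	unsigned *mask;			/* ring + sq_off.ring_mask */
	unsigned *array;		/* ring + sq_off.array, classic layout only */
	struct io_uring_sqe *sqes;	/* IORING_OFF_SQES mapping */
};

/* Classic layout: store the SQE index into sq_array[] before bumping the tail. */
static void publish_classic(struct sq_view *sq, const struct io_uring_sqe *src)
{
	unsigned tail = *sq->tail, idx = tail & *sq->mask;

	sq->sqes[idx] = *src;
	sq->array[idx] = idx;	/* extra indirection the kernel must chase */
	*sq->tail = tail + 1;
}

/* IORING_SETUP_NO_SQARRAY: the tail indexes the SQE array directly. */
static void publish_no_sqarray(struct sq_view *sq, const struct io_uring_sqe *src)
{
	unsigned tail = *sq->tail, idx = tail & *sq->mask;

	sq->sqes[idx] = *src;
	*sq->tail = tail + 1;
}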