io_uring: save atomic dec for inline executed reqs
authorPavel Begunkov <asml.silence@gmail.com>
Tue, 19 Jan 2021 13:32:47 +0000 (13:32 +0000)
committerJens Axboe <axboe@kernel.dk>
Mon, 1 Feb 2021 17:02:42 +0000 (10:02 -0700)
When a request is completed with comp_state, its completion reference
put is deferred to io_submit_flush_completions(), but the submission
reference is put not far from there, so put them together to save one atomic dec per
request. That targets requests that complete inline, e.g. buffered rw,
send/recv.

Proper benchmarking hasn't been conducted, but for nops(batch=32) it was
around 7901 vs 8117 KIOPS (~2.7%), or ~4% per perf profiling.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
fs/io_uring.c

index 935a16a..3f6d055 100644 (file)
@@ -629,6 +629,7 @@ enum {
        REQ_F_NO_FILE_TABLE_BIT,
        REQ_F_WORK_INITIALIZED_BIT,
        REQ_F_LTIMEOUT_ACTIVE_BIT,
+       REQ_F_COMPLETE_INLINE_BIT,
 
        /* not a real bit, just to check we're not overflowing the space */
        __REQ_F_LAST_BIT,
@@ -672,6 +673,8 @@ enum {
        REQ_F_WORK_INITIALIZED  = BIT(REQ_F_WORK_INITIALIZED_BIT),
        /* linked timeout is active, i.e. prepared by link's head */
        REQ_F_LTIMEOUT_ACTIVE   = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
+       /* completion is deferred through io_comp_state */
+       REQ_F_COMPLETE_INLINE   = BIT(REQ_F_COMPLETE_INLINE_BIT),
 };
 
 struct async_poll {
@@ -1917,14 +1920,15 @@ static void io_submit_flush_completions(struct io_comp_state *cs)
                 * io_free_req() doesn't care about completion_lock unless one
                 * of these flags is set. REQ_F_WORK_INITIALIZED is in the list
                 * because of a potential deadlock with req->work.fs->lock
+                * We defer both, completion and submission refs.
                 */
                if (req->flags & (REQ_F_FAIL_LINK|REQ_F_LINK_TIMEOUT
                                 |REQ_F_WORK_INITIALIZED)) {
                        spin_unlock_irq(&ctx->completion_lock);
-                       io_put_req(req);
+                       io_double_put_req(req);
                        spin_lock_irq(&ctx->completion_lock);
                } else {
-                       io_put_req(req);
+                       io_double_put_req(req);
                }
        }
        io_commit_cqring(ctx);
@@ -1940,8 +1944,7 @@ static void io_req_complete_state(struct io_kiocb *req, long res,
        io_clean_op(req);
        req->result = res;
        req->compl.cflags = cflags;
-       list_add_tail(&req->compl.list, &cs->list);
-       cs->nr++;
+       req->flags |= REQ_F_COMPLETE_INLINE;
 }
 
 static inline void __io_req_complete(struct io_kiocb *req, long res,
@@ -6576,9 +6579,9 @@ again:
                        io_queue_linked_timeout(linked_timeout);
        } else if (likely(!ret)) {
                /* drop submission reference */
-               if (cs) {
-                       io_put_req(req);
-                       if (cs->nr >= 32)
+               if (req->flags & REQ_F_COMPLETE_INLINE) {
+                       list_add_tail(&req->compl.list, &cs->list);
+                       if (++cs->nr >= 32)
                                io_submit_flush_completions(cs);
                        req = NULL;
                } else {