io_uring: move rsrc related data, core, and commands
author Jens Axboe <axboe@kernel.dk>
Mon, 13 Jun 2022 13:12:45 +0000 (07:12 -0600)
committer Jens Axboe <axboe@kernel.dk>
Mon, 25 Jul 2022 00:39:12 +0000 (18:39 -0600)
Signed-off-by: Jens Axboe <axboe@kernel.dk>
io_uring/Makefile
io_uring/io_uring.c
io_uring/io_uring.h
io_uring/openclose.c
io_uring/rsrc.c [new file with mode: 0644]
io_uring/rsrc.h [new file with mode: 0644]

index b85418b64e8241140b39c41d095dfc042a2e0a2f..360a83039c2a4f78ec943e2cd11480746400274d 100644 (file)
@@ -7,5 +7,5 @@ obj-$(CONFIG_IO_URING)          += io_uring.o xattr.o nop.o fs.o splice.o \
                                        openclose.o uring_cmd.o epoll.o \
                                        statx.o net.o msg_ring.o timeout.o \
                                        sqpoll.o fdinfo.o tctx.o poll.o \
-                                       cancel.o kbuf.o
+                                       cancel.o kbuf.o rsrc.o
 obj-$(CONFIG_IO_WQ)            += io-wq.o
index e395167999edfb150a9eb85cd11d402471b4b725..0c47c919887f527596c8c762fff4070377da418d 100644 (file)
@@ -68,7 +68,6 @@
 #include <linux/uaccess.h>
 #include <linux/nospec.h>
 #include <linux/sizes.h>
-#include <linux/hugetlb.h>
 #include <linux/highmem.h>
 #include <linux/fsnotify.h>
 #include <linux/fadvise.h>
@@ -94,6 +93,7 @@
 #include "sqpoll.h"
 #include "fdinfo.h"
 #include "kbuf.h"
+#include "rsrc.h"
 
 #include "xattr.h"
 #include "nop.h"
 #define IORING_MAX_ENTRIES     32768
 #define IORING_MAX_CQ_ENTRIES  (2 * IORING_MAX_ENTRIES)
 
-/* only define max */
-#define IORING_MAX_FIXED_FILES (1U << 20)
 #define IORING_MAX_RESTRICTIONS        (IORING_RESTRICTION_LAST + \
                                 IORING_REGISTER_LAST + IORING_OP_LAST)
 
-#define IO_RSRC_TAG_TABLE_SHIFT        (PAGE_SHIFT - 3)
-#define IO_RSRC_TAG_TABLE_MAX  (1U << IO_RSRC_TAG_TABLE_SHIFT)
-#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1)
-
-#define IORING_MAX_REG_BUFFERS (1U << 14)
-
 #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
                          IOSQE_IO_HARDLINK | IOSQE_ASYNC)
 
 
 #define IO_TCTX_REFS_CACHE_NR  (1U << 10)
 
-struct io_rsrc_put {
-       struct list_head list;
-       u64 tag;
-       union {
-               void *rsrc;
-               struct file *file;
-               struct io_mapped_ubuf *buf;
-       };
-};
-
-struct io_rsrc_node {
-       struct percpu_ref               refs;
-       struct list_head                node;
-       struct list_head                rsrc_list;
-       struct io_rsrc_data             *rsrc_data;
-       struct llist_node               llist;
-       bool                            done;
-};
-
-typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
-
-struct io_rsrc_data {
-       struct io_ring_ctx              *ctx;
-
-       u64                             **tags;
-       unsigned int                    nr;
-       rsrc_put_fn                     *do_put;
-       atomic_t                        refs;
-       struct completion               done;
-       bool                            quiesce;
-};
-
 #define IO_COMPL_BATCH                 32
 #define IO_REQ_CACHE_SIZE              32
 #define IO_REQ_ALLOC_BATCH             8
@@ -188,13 +148,6 @@ struct io_rw {
        rwf_t                           flags;
 };
 
-struct io_rsrc_update {
-       struct file                     *file;
-       u64                             arg;
-       u32                             nr_args;
-       u32                             offset;
-};
-
 struct io_rw_state {
        struct iov_iter                 iter;
        struct iov_iter_state           iter_state;
@@ -208,11 +161,6 @@ struct io_async_rw {
        struct wait_page_queue          wpq;
 };
 
-enum {
-       IORING_RSRC_FILE                = 0,
-       IORING_RSRC_BUFFER              = 1,
-};
-
 enum {
        IO_CHECK_CQ_OVERFLOW_BIT,
        IO_CHECK_CQ_DROPPED_BIT,
@@ -233,12 +181,8 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
                                         bool cancel_all);
 
 static void io_dismantle_req(struct io_kiocb *req);
-static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
-                                    struct io_uring_rsrc_update2 *up,
-                                    unsigned nr_args);
 static void io_clean_op(struct io_kiocb *req);
 static void io_queue_sqe(struct io_kiocb *req);
-static void io_rsrc_put_work(struct work_struct *work);
 
 static void io_req_task_queue(struct io_kiocb *req);
 static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
@@ -268,22 +212,6 @@ struct sock *io_uring_get_socket(struct file *file)
 }
 EXPORT_SYMBOL(io_uring_get_socket);
 
-#if defined(CONFIG_UNIX)
-static inline bool io_file_need_scm(struct file *filp)
-{
-#if defined(IO_URING_SCM_ALL)
-       return true;
-#else
-       return !!unix_get_socket(filp);
-#endif
-}
-#else
-static inline bool io_file_need_scm(struct file *filp)
-{
-       return false;
-}
-#endif
-
 static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
 {
        if (!*locked) {
@@ -298,67 +226,6 @@ static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
                __io_submit_flush_completions(ctx);
 }
 
-#define IO_RSRC_REF_BATCH      100
-
-static void io_rsrc_put_node(struct io_rsrc_node *node, int nr)
-{
-       percpu_ref_put_many(&node->refs, nr);
-}
-
-static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
-                                         struct io_ring_ctx *ctx)
-       __must_hold(&ctx->uring_lock)
-{
-       struct io_rsrc_node *node = req->rsrc_node;
-
-       if (node) {
-               if (node == ctx->rsrc_node)
-                       ctx->rsrc_cached_refs++;
-               else
-                       io_rsrc_put_node(node, 1);
-       }
-}
-
-static inline void io_req_put_rsrc(struct io_kiocb *req)
-{
-       if (req->rsrc_node)
-               io_rsrc_put_node(req->rsrc_node, 1);
-}
-
-static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
-       __must_hold(&ctx->uring_lock)
-{
-       if (ctx->rsrc_cached_refs) {
-               io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs);
-               ctx->rsrc_cached_refs = 0;
-       }
-}
-
-static void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
-       __must_hold(&ctx->uring_lock)
-{
-       ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
-       percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
-}
-
-static inline void io_req_set_rsrc_node(struct io_kiocb *req,
-                                       struct io_ring_ctx *ctx,
-                                       unsigned int issue_flags)
-{
-       if (!req->rsrc_node) {
-               req->rsrc_node = ctx->rsrc_node;
-
-               if (!(issue_flags & IO_URING_F_UNLOCKED)) {
-                       lockdep_assert_held(&ctx->uring_lock);
-                       ctx->rsrc_cached_refs--;
-                       if (unlikely(ctx->rsrc_cached_refs < 0))
-                               io_rsrc_refs_refill(ctx);
-               } else {
-                       percpu_ref_get(&req->rsrc_node->refs);
-               }
-       }
-}
-
 static bool io_match_linked(struct io_kiocb *head)
 {
        struct io_kiocb *req;
@@ -2870,92 +2737,6 @@ static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb,
        return -EOPNOTSUPP;
 }
 
-static int io_files_update_prep(struct io_kiocb *req,
-                               const struct io_uring_sqe *sqe)
-{
-       struct io_rsrc_update *up = io_kiocb_to_cmd(req);
-
-       if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
-               return -EINVAL;
-       if (sqe->rw_flags || sqe->splice_fd_in)
-               return -EINVAL;
-
-       up->offset = READ_ONCE(sqe->off);
-       up->nr_args = READ_ONCE(sqe->len);
-       if (!up->nr_args)
-               return -EINVAL;
-       up->arg = READ_ONCE(sqe->addr);
-       return 0;
-}
-
-static int io_files_update_with_index_alloc(struct io_kiocb *req,
-                                           unsigned int issue_flags)
-{
-       struct io_rsrc_update *up = io_kiocb_to_cmd(req);
-       __s32 __user *fds = u64_to_user_ptr(up->arg);
-       unsigned int done;
-       struct file *file;
-       int ret, fd;
-
-       if (!req->ctx->file_data)
-               return -ENXIO;
-
-       for (done = 0; done < up->nr_args; done++) {
-               if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
-                       ret = -EFAULT;
-                       break;
-               }
-
-               file = fget(fd);
-               if (!file) {
-                       ret = -EBADF;
-                       break;
-               }
-               ret = io_fixed_fd_install(req, issue_flags, file,
-                                         IORING_FILE_INDEX_ALLOC);
-               if (ret < 0)
-                       break;
-               if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
-                       __io_close_fixed(req, issue_flags, ret);
-                       ret = -EFAULT;
-                       break;
-               }
-       }
-
-       if (done)
-               return done;
-       return ret;
-}
-
-static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
-{
-       struct io_rsrc_update *up = io_kiocb_to_cmd(req);
-       struct io_ring_ctx *ctx = req->ctx;
-       struct io_uring_rsrc_update2 up2;
-       int ret;
-
-       up2.offset = up->offset;
-       up2.data = up->arg;
-       up2.nr = 0;
-       up2.tags = 0;
-       up2.resv = 0;
-       up2.resv2 = 0;
-
-       if (up->offset == IORING_FILE_INDEX_ALLOC) {
-               ret = io_files_update_with_index_alloc(req, issue_flags);
-       } else {
-               io_ring_submit_lock(ctx, issue_flags);
-               ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
-                                               &up2, up->nr_args);
-               io_ring_submit_unlock(ctx, issue_flags);
-       }
-
-       if (ret < 0)
-               req_set_fail(req);
-       io_req_set_res(req, ret, 0);
-       return IOU_OK;
-}
-
 static int io_req_prep_async(struct io_kiocb *req)
 {
        const struct io_op_def *def = &io_op_defs[req->opcode];
@@ -3696,7 +3477,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
        return -1;
 }
 
-static int io_run_task_work_sig(void)
+int io_run_task_work_sig(void)
 {
        if (io_run_task_work())
                return 1;
@@ -3798,1265 +3579,164 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
        return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
 }
 
-static void io_free_page_table(void **table, size_t size)
+int io_install_fixed_file(struct io_kiocb *req, struct file *file,
+                         unsigned int issue_flags, u32 slot_index)
+       __must_hold(&req->ctx->uring_lock)
 {
-       unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
+       struct io_ring_ctx *ctx = req->ctx;
+       bool needs_switch = false;
+       struct io_fixed_file *file_slot;
+       int ret;
 
-       for (i = 0; i < nr_tables; i++)
-               kfree(table[i]);
-       kfree(table);
-}
+       if (io_is_uring_fops(file))
+               return -EBADF;
+       if (!ctx->file_data)
+               return -ENXIO;
+       if (slot_index >= ctx->nr_user_files)
+               return -EINVAL;
 
-static __cold void **io_alloc_page_table(size_t size)
-{
-       unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
-       size_t init_size = size;
-       void **table;
+       slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
+       file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
 
-       table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
-       if (!table)
-               return NULL;
+       if (file_slot->file_ptr) {
+               struct file *old_file;
 
-       for (i = 0; i < nr_tables; i++) {
-               unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
+               ret = io_rsrc_node_switch_start(ctx);
+               if (ret)
+                       goto err;
 
-               table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
-               if (!table[i]) {
-                       io_free_page_table(table, init_size);
-                       return NULL;
-               }
-               size -= this_size;
+               old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
+               ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
+                                           ctx->rsrc_node, old_file);
+               if (ret)
+                       goto err;
+               file_slot->file_ptr = 0;
+               io_file_bitmap_clear(&ctx->file_table, slot_index);
+               needs_switch = true;
        }
-       return table;
-}
 
-static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
-{
-       percpu_ref_exit(&ref_node->refs);
-       kfree(ref_node);
+       ret = io_scm_file_account(ctx, file);
+       if (!ret) {
+               *io_get_tag_slot(ctx->file_data, slot_index) = 0;
+               io_fixed_file_set(file_slot, file);
+               io_file_bitmap_set(&ctx->file_table, slot_index);
+       }
+err:
+       if (needs_switch)
+               io_rsrc_node_switch(ctx, ctx->file_data);
+       if (ret)
+               fput(file);
+       return ret;
 }
 
-static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
+static void io_mem_free(void *ptr)
 {
-       struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
-       struct io_ring_ctx *ctx = node->rsrc_data->ctx;
-       unsigned long flags;
-       bool first_add = false;
-       unsigned long delay = HZ;
-
-       spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
-       node->done = true;
-
-       /* if we are mid-quiesce then do not delay */
-       if (node->rsrc_data->quiesce)
-               delay = 0;
+       struct page *page;
 
-       while (!list_empty(&ctx->rsrc_ref_list)) {
-               node = list_first_entry(&ctx->rsrc_ref_list,
-                                           struct io_rsrc_node, node);
-               /* recycle ref nodes in order */
-               if (!node->done)
-                       break;
-               list_del(&node->node);
-               first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
-       }
-       spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
+       if (!ptr)
+               return;
 
-       if (first_add)
-               mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
+       page = virt_to_head_page(ptr);
+       if (put_page_testzero(page))
+               free_compound_page(page);
 }
 
-static struct io_rsrc_node *io_rsrc_node_alloc(void)
+static void *io_mem_alloc(size_t size)
 {
-       struct io_rsrc_node *ref_node;
-
-       ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
-       if (!ref_node)
-               return NULL;
+       gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
 
-       if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
-                           0, GFP_KERNEL)) {
-               kfree(ref_node);
-               return NULL;
-       }
-       INIT_LIST_HEAD(&ref_node->node);
-       INIT_LIST_HEAD(&ref_node->rsrc_list);
-       ref_node->done = false;
-       return ref_node;
+       return (void *) __get_free_pages(gfp, get_order(size));
 }
 
-void io_rsrc_node_switch(struct io_ring_ctx *ctx,
-                        struct io_rsrc_data *data_to_kill)
-       __must_hold(&ctx->uring_lock)
+static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
+                               unsigned int cq_entries, size_t *sq_offset)
 {
-       WARN_ON_ONCE(!ctx->rsrc_backup_node);
-       WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
+       struct io_rings *rings;
+       size_t off, sq_array_size;
 
-       io_rsrc_refs_drop(ctx);
+       off = struct_size(rings, cqes, cq_entries);
+       if (off == SIZE_MAX)
+               return SIZE_MAX;
+       if (ctx->flags & IORING_SETUP_CQE32) {
+               if (check_shl_overflow(off, 1, &off))
+                       return SIZE_MAX;
+       }
 
-       if (data_to_kill) {
-               struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
+#ifdef CONFIG_SMP
+       off = ALIGN(off, SMP_CACHE_BYTES);
+       if (off == 0)
+               return SIZE_MAX;
+#endif
 
-               rsrc_node->rsrc_data = data_to_kill;
-               spin_lock_irq(&ctx->rsrc_ref_lock);
-               list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
-               spin_unlock_irq(&ctx->rsrc_ref_lock);
+       if (sq_offset)
+               *sq_offset = off;
 
-               atomic_inc(&data_to_kill->refs);
-               percpu_ref_kill(&rsrc_node->refs);
-               ctx->rsrc_node = NULL;
-       }
+       sq_array_size = array_size(sizeof(u32), sq_entries);
+       if (sq_array_size == SIZE_MAX)
+               return SIZE_MAX;
 
-       if (!ctx->rsrc_node) {
-               ctx->rsrc_node = ctx->rsrc_backup_node;
-               ctx->rsrc_backup_node = NULL;
-       }
-}
+       if (check_add_overflow(off, sq_array_size, &off))
+               return SIZE_MAX;
 
-int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
-{
-       if (ctx->rsrc_backup_node)
-               return 0;
-       ctx->rsrc_backup_node = io_rsrc_node_alloc();
-       return ctx->rsrc_backup_node ? 0 : -ENOMEM;
+       return off;
 }
 
-static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
-                                     struct io_ring_ctx *ctx)
+static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
+                              unsigned int eventfd_async)
 {
-       int ret;
-
-       /* As we may drop ->uring_lock, other task may have started quiesce */
-       if (data->quiesce)
-               return -ENXIO;
-
-       data->quiesce = true;
-       do {
-               ret = io_rsrc_node_switch_start(ctx);
-               if (ret)
-                       break;
-               io_rsrc_node_switch(ctx, data);
+       struct io_ev_fd *ev_fd;
+       __s32 __user *fds = arg;
+       int fd;
 
-               /* kill initial ref, already quiesced if zero */
-               if (atomic_dec_and_test(&data->refs))
-                       break;
-               mutex_unlock(&ctx->uring_lock);
-               flush_delayed_work(&ctx->rsrc_put_work);
-               ret = wait_for_completion_interruptible(&data->done);
-               if (!ret) {
-                       mutex_lock(&ctx->uring_lock);
-                       if (atomic_read(&data->refs) > 0) {
-                               /*
-                                * it has been revived by another thread while
-                                * we were unlocked
-                                */
-                               mutex_unlock(&ctx->uring_lock);
-                       } else {
-                               break;
-                       }
-               }
+       ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
+                                       lockdep_is_held(&ctx->uring_lock));
+       if (ev_fd)
+               return -EBUSY;
 
-               atomic_inc(&data->refs);
-               /* wait for all works potentially completing data->done */
-               flush_delayed_work(&ctx->rsrc_put_work);
-               reinit_completion(&data->done);
+       if (copy_from_user(&fd, fds, sizeof(*fds)))
+               return -EFAULT;
 
-               ret = io_run_task_work_sig();
-               mutex_lock(&ctx->uring_lock);
-       } while (ret >= 0);
-       data->quiesce = false;
+       ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
+       if (!ev_fd)
+               return -ENOMEM;
 
-       return ret;
+       ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
+       if (IS_ERR(ev_fd->cq_ev_fd)) {
+               int ret = PTR_ERR(ev_fd->cq_ev_fd);
+               kfree(ev_fd);
+               return ret;
+       }
+       ev_fd->eventfd_async = eventfd_async;
+       ctx->has_evfd = true;
+       rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
+       return 0;
 }
 
-static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
+static void io_eventfd_put(struct rcu_head *rcu)
 {
-       unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
-       unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
+       struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
 
-       return &data->tags[table_idx][off];
+       eventfd_ctx_put(ev_fd->cq_ev_fd);
+       kfree(ev_fd);
 }
 
-static void io_rsrc_data_free(struct io_rsrc_data *data)
+static int io_eventfd_unregister(struct io_ring_ctx *ctx)
 {
-       size_t size = data->nr * sizeof(data->tags[0][0]);
+       struct io_ev_fd *ev_fd;
+
+       ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
+                                       lockdep_is_held(&ctx->uring_lock));
+       if (ev_fd) {
+               ctx->has_evfd = false;
+               rcu_assign_pointer(ctx->io_ev_fd, NULL);
+               call_rcu(&ev_fd->rcu, io_eventfd_put);
+               return 0;
+       }
 
-       if (data->tags)
-               io_free_page_table((void **)data->tags, size);
-       kfree(data);
+       return -ENXIO;
 }
 
-static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
-                                    u64 __user *utags, unsigned nr,
-                                    struct io_rsrc_data **pdata)
-{
-       struct io_rsrc_data *data;
-       int ret = -ENOMEM;
-       unsigned i;
-
-       data = kzalloc(sizeof(*data), GFP_KERNEL);
-       if (!data)
-               return -ENOMEM;
-       data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
-       if (!data->tags) {
-               kfree(data);
-               return -ENOMEM;
-       }
-
-       data->nr = nr;
-       data->ctx = ctx;
-       data->do_put = do_put;
-       if (utags) {
-               ret = -EFAULT;
-               for (i = 0; i < nr; i++) {
-                       u64 *tag_slot = io_get_tag_slot(data, i);
-
-                       if (copy_from_user(tag_slot, &utags[i],
-                                          sizeof(*tag_slot)))
-                               goto fail;
-               }
-       }
-
-       atomic_set(&data->refs, 1);
-       init_completion(&data->done);
-       *pdata = data;
-       return 0;
-fail:
-       io_rsrc_data_free(data);
-       return ret;
-}
-
-static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
-{
-#if !defined(IO_URING_SCM_ALL)
-       int i;
-
-       for (i = 0; i < ctx->nr_user_files; i++) {
-               struct file *file = io_file_from_index(&ctx->file_table, i);
-
-               if (!file)
-                       continue;
-               if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM)
-                       continue;
-               io_file_bitmap_clear(&ctx->file_table, i);
-               fput(file);
-       }
-#endif
-
-#if defined(CONFIG_UNIX)
-       if (ctx->ring_sock) {
-               struct sock *sock = ctx->ring_sock->sk;
-               struct sk_buff *skb;
-
-               while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
-                       kfree_skb(skb);
-       }
-#endif
-       io_free_file_tables(&ctx->file_table);
-       io_rsrc_data_free(ctx->file_data);
-       ctx->file_data = NULL;
-       ctx->nr_user_files = 0;
-}
-
-static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
-{
-       unsigned nr = ctx->nr_user_files;
-       int ret;
-
-       if (!ctx->file_data)
-               return -ENXIO;
-
-       /*
-        * Quiesce may unlock ->uring_lock, and while it's not held
-        * prevent new requests using the table.
-        */
-       ctx->nr_user_files = 0;
-       ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
-       ctx->nr_user_files = nr;
-       if (!ret)
-               __io_sqe_files_unregister(ctx);
-       return ret;
-}
-
-/*
- * Ensure the UNIX gc is aware of our file set, so we are certain that
- * the io_uring can be safely unregistered on process exit, even if we have
- * loops in the file referencing. We account only files that can hold other
- * files because otherwise they can't form a loop and so are not interesting
- * for GC.
- */
-static int io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
-{
-#if defined(CONFIG_UNIX)
-       struct sock *sk = ctx->ring_sock->sk;
-       struct sk_buff_head *head = &sk->sk_receive_queue;
-       struct scm_fp_list *fpl;
-       struct sk_buff *skb;
-
-       if (likely(!io_file_need_scm(file)))
-               return 0;
-
-       /*
-        * See if we can merge this file into an existing skb SCM_RIGHTS
-        * file set. If there's no room, fall back to allocating a new skb
-        * and filling it in.
-        */
-       spin_lock_irq(&head->lock);
-       skb = skb_peek(head);
-       if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
-               __skb_unlink(skb, head);
-       else
-               skb = NULL;
-       spin_unlock_irq(&head->lock);
-
-       if (!skb) {
-               fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
-               if (!fpl)
-                       return -ENOMEM;
-
-               skb = alloc_skb(0, GFP_KERNEL);
-               if (!skb) {
-                       kfree(fpl);
-                       return -ENOMEM;
-               }
-
-               fpl->user = get_uid(current_user());
-               fpl->max = SCM_MAX_FD;
-               fpl->count = 0;
-
-               UNIXCB(skb).fp = fpl;
-               skb->sk = sk;
-               skb->destructor = unix_destruct_scm;
-               refcount_add(skb->truesize, &sk->sk_wmem_alloc);
-       }
-
-       fpl = UNIXCB(skb).fp;
-       fpl->fp[fpl->count++] = get_file(file);
-       unix_inflight(fpl->user, file);
-       skb_queue_head(head, skb);
-       fput(file);
-#endif
-       return 0;
-}
-
-static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
-{
-       struct file *file = prsrc->file;
-#if defined(CONFIG_UNIX)
-       struct sock *sock = ctx->ring_sock->sk;
-       struct sk_buff_head list, *head = &sock->sk_receive_queue;
-       struct sk_buff *skb;
-       int i;
-
-       if (!io_file_need_scm(file)) {
-               fput(file);
-               return;
-       }
-
-       __skb_queue_head_init(&list);
-
-       /*
-        * Find the skb that holds this file in its SCM_RIGHTS. When found,
-        * remove this entry and rearrange the file array.
-        */
-       skb = skb_dequeue(head);
-       while (skb) {
-               struct scm_fp_list *fp;
-
-               fp = UNIXCB(skb).fp;
-               for (i = 0; i < fp->count; i++) {
-                       int left;
-
-                       if (fp->fp[i] != file)
-                               continue;
-
-                       unix_notinflight(fp->user, fp->fp[i]);
-                       left = fp->count - 1 - i;
-                       if (left) {
-                               memmove(&fp->fp[i], &fp->fp[i + 1],
-                                               left * sizeof(struct file *));
-                       }
-                       fp->count--;
-                       if (!fp->count) {
-                               kfree_skb(skb);
-                               skb = NULL;
-                       } else {
-                               __skb_queue_tail(&list, skb);
-                       }
-                       fput(file);
-                       file = NULL;
-                       break;
-               }
-
-               if (!file)
-                       break;
-
-               __skb_queue_tail(&list, skb);
-
-               skb = skb_dequeue(head);
-       }
-
-       if (skb_peek(&list)) {
-               spin_lock_irq(&head->lock);
-               while ((skb = __skb_dequeue(&list)) != NULL)
-                       __skb_queue_tail(head, skb);
-               spin_unlock_irq(&head->lock);
-       }
-#else
-       fput(file);
-#endif
-}
-
-static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
-{
-       struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
-       struct io_ring_ctx *ctx = rsrc_data->ctx;
-       struct io_rsrc_put *prsrc, *tmp;
-
-       list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
-               list_del(&prsrc->list);
-
-               if (prsrc->tag) {
-                       if (ctx->flags & IORING_SETUP_IOPOLL)
-                               mutex_lock(&ctx->uring_lock);
-
-                       spin_lock(&ctx->completion_lock);
-                       io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);
-                       io_commit_cqring(ctx);
-                       spin_unlock(&ctx->completion_lock);
-                       io_cqring_ev_posted(ctx);
-
-                       if (ctx->flags & IORING_SETUP_IOPOLL)
-                               mutex_unlock(&ctx->uring_lock);
-               }
-
-               rsrc_data->do_put(ctx, prsrc);
-               kfree(prsrc);
-       }
-
-       io_rsrc_node_destroy(ref_node);
-       if (atomic_dec_and_test(&rsrc_data->refs))
-               complete(&rsrc_data->done);
-}
-
-static void io_rsrc_put_work(struct work_struct *work)
-{
-       struct io_ring_ctx *ctx;
-       struct llist_node *node;
-
-       ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
-       node = llist_del_all(&ctx->rsrc_put_llist);
-
-       while (node) {
-               struct io_rsrc_node *ref_node;
-               struct llist_node *next = node->next;
-
-               ref_node = llist_entry(node, struct io_rsrc_node, llist);
-               __io_rsrc_put_work(ref_node);
-               node = next;
-       }
-}
-
-static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
-                                unsigned nr_args, u64 __user *tags)
-{
-       __s32 __user *fds = (__s32 __user *) arg;
-       struct file *file;
-       int fd, ret;
-       unsigned i;
-
-       if (ctx->file_data)
-               return -EBUSY;
-       if (!nr_args)
-               return -EINVAL;
-       if (nr_args > IORING_MAX_FIXED_FILES)
-               return -EMFILE;
-       if (nr_args > rlimit(RLIMIT_NOFILE))
-               return -EMFILE;
-       ret = io_rsrc_node_switch_start(ctx);
-       if (ret)
-               return ret;
-       ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
-                                &ctx->file_data);
-       if (ret)
-               return ret;
-
-       if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
-               io_rsrc_data_free(ctx->file_data);
-               ctx->file_data = NULL;
-               return -ENOMEM;
-       }
-
-       for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
-               struct io_fixed_file *file_slot;
-
-               if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
-                       ret = -EFAULT;
-                       goto fail;
-               }
-               /* allow sparse sets */
-               if (!fds || fd == -1) {
-                       ret = -EINVAL;
-                       if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
-                               goto fail;
-                       continue;
-               }
-
-               file = fget(fd);
-               ret = -EBADF;
-               if (unlikely(!file))
-                       goto fail;
-
-               /*
-                * Don't allow io_uring instances to be registered. If UNIX
-                * isn't enabled, then this causes a reference cycle and this
-                * instance can never get freed. If UNIX is enabled we'll
-                * handle it just fine, but there's still no point in allowing
-                * a ring fd as it doesn't support regular read/write anyway.
-                */
-               if (io_is_uring_fops(file)) {
-                       fput(file);
-                       goto fail;
-               }
-               ret = io_scm_file_account(ctx, file);
-               if (ret) {
-                       fput(file);
-                       goto fail;
-               }
-               file_slot = io_fixed_file_slot(&ctx->file_table, i);
-               io_fixed_file_set(file_slot, file);
-               io_file_bitmap_set(&ctx->file_table, i);
-       }
-
-       io_rsrc_node_switch(ctx, NULL);
-       return 0;
-fail:
-       __io_sqe_files_unregister(ctx);
-       return ret;
-}
-
-/*
- * Queue @rsrc on @node's rsrc_list so it is released along with that node.
- * The tag stored in slot @idx of @data is moved into the new put entry and
- * the slot itself is reset to 0.
- *
- * Returns 0 on success or -ENOMEM if the put entry cannot be allocated.
- */
-int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
-                         struct io_rsrc_node *node, void *rsrc)
-{
-       struct io_rsrc_put *put;
-       u64 *slot;
-
-       slot = io_get_tag_slot(data, idx);
-
-       put = kzalloc(sizeof(*put), GFP_KERNEL);
-       if (put == NULL)
-               return -ENOMEM;
-
-       put->rsrc = rsrc;
-       /* the tag travels with the resource being removed */
-       put->tag = *slot;
-       *slot = 0;
-
-       list_add(&put->list, &node->rsrc_list);
-       return 0;
-}
-
-/*
- * Install @file into fixed-file slot @slot_index of the request's ring,
- * replacing whatever the slot currently holds.
- *
- * Returns 0 on success, -EBADF if @file is itself an io_uring file, -ENXIO if
- * no file table is registered, -EINVAL if @slot_index is out of range, or the
- * error from switching the rsrc node, queueing the old file for removal, or
- * io_scm_file_account().
- *
- * Errors that reach the err label drop the reference on @file with fput();
- * the three early checks return without touching it.
- * NOTE(review): confirm against the callers that this split is intended.
- */
-int io_install_fixed_file(struct io_kiocb *req, struct file *file,
-                         unsigned int issue_flags, u32 slot_index)
-       __must_hold(&req->ctx->uring_lock)
-{
-       struct io_ring_ctx *ctx = req->ctx;
-       bool needs_switch = false;
-       struct io_fixed_file *file_slot;
-       int ret;
-
-       if (io_is_uring_fops(file))
-               return -EBADF;
-       if (!ctx->file_data)
-               return -ENXIO;
-       if (slot_index >= ctx->nr_user_files)
-               return -EINVAL;
-
-       /* clamp the already bounds-checked index against speculation */
-       slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
-       file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
-
-       /* slot is occupied: queue the old file for removal, then clear it */
-       if (file_slot->file_ptr) {
-               struct file *old_file;
-
-               ret = io_rsrc_node_switch_start(ctx);
-               if (ret)
-                       goto err;
-
-               /* file_ptr carries extra bits; FFS_MASK leaves the pointer */
-               old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
-               ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
-                                           ctx->rsrc_node, old_file);
-               if (ret)
-                       goto err;
-               file_slot->file_ptr = 0;
-               io_file_bitmap_clear(&ctx->file_table, slot_index);
-               needs_switch = true;
-       }
-
-       ret = io_scm_file_account(ctx, file);
-       if (!ret) {
-               /* a file installed this way always gets a zero tag */
-               *io_get_tag_slot(ctx->file_data, slot_index) = 0;
-               io_fixed_file_set(file_slot, file);
-               io_file_bitmap_set(&ctx->file_table, slot_index);
-       }
-err:
-       /* a removal was queued, so the node must be switched even on error */
-       if (needs_switch)
-               io_rsrc_node_switch(ctx, ctx->file_data);
-       if (ret)
-               fput(file);
-       return ret;
-}
-
-/*
- * Apply @nr_args fixed-file updates starting at slot up->offset. Each entry
- * is an fd read from up->data plus, when up->tags is set, a tag:
- *   - IORING_REGISTER_FILES_SKIP leaves the slot untouched,
- *   - -1 only removes whatever the slot currently holds,
- *   - any other value replaces the slot contents with that file.
- *
- * Returns -ENXIO if no file table is registered and -EINVAL if the range
- * exceeds the table. Otherwise returns the number of entries handled before
- * the loop stopped when that is non-zero, else the value of err.
- */
-static int __io_sqe_files_update(struct io_ring_ctx *ctx,
-                                struct io_uring_rsrc_update2 *up,
-                                unsigned nr_args)
-{
-       u64 __user *tags = u64_to_user_ptr(up->tags);
-       __s32 __user *fds = u64_to_user_ptr(up->data);
-       struct io_rsrc_data *data = ctx->file_data;
-       struct io_fixed_file *file_slot;
-       struct file *file;
-       int fd, i, err = 0;
-       unsigned int done;
-       bool needs_switch = false;
-
-       if (!ctx->file_data)
-               return -ENXIO;
-       if (up->offset + nr_args > ctx->nr_user_files)
-               return -EINVAL;
-
-       for (done = 0; done < nr_args; done++) {
-               u64 tag = 0;
-
-               if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
-                   copy_from_user(&fd, &fds[done], sizeof(fd))) {
-                       err = -EFAULT;
-                       break;
-               }
-               /* a tag is only accepted together with a file to install */
-               if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
-                       err = -EINVAL;
-                       break;
-               }
-               if (fd == IORING_REGISTER_FILES_SKIP)
-                       continue;
-
-               i = array_index_nospec(up->offset + done, ctx->nr_user_files);
-               file_slot = io_fixed_file_slot(&ctx->file_table, i);
-
-               /* queue the current occupant for removal before reusing the slot */
-               if (file_slot->file_ptr) {
-                       file = (struct file *)(file_slot->file_ptr & FFS_MASK);
-                       err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
-                       if (err)
-                               break;
-                       file_slot->file_ptr = 0;
-                       io_file_bitmap_clear(&ctx->file_table, i);
-                       needs_switch = true;
-               }
-               if (fd != -1) {
-                       file = fget(fd);
-                       if (!file) {
-                               err = -EBADF;
-                               break;
-                       }
-                       /*
-                        * Don't allow io_uring instances to be registered. If
-                        * UNIX isn't enabled, then this causes a reference
-                        * cycle and this instance can never get freed. If UNIX
-                        * is enabled we'll handle it just fine, but there's
-                        * still no point in allowing a ring fd as it doesn't
-                        * support regular read/write anyway.
-                        */
-                       if (io_is_uring_fops(file)) {
-                               fput(file);
-                               err = -EBADF;
-                               break;
-                       }
-                       err = io_scm_file_account(ctx, file);
-                       if (err) {
-                               fput(file);
-                               break;
-                       }
-                       *io_get_tag_slot(data, i) = tag;
-                       io_fixed_file_set(file_slot, file);
-                       io_file_bitmap_set(&ctx->file_table, i);
-               }
-       }
-
-       /* at least one removal was queued on the current node */
-       if (needs_switch)
-               io_rsrc_node_switch(ctx, data);
-       /* partial progress is reported as a count and hides the error */
-       return done ? done : err;
-}
-
-/* Subtract @nr_pages from @user's locked_vm counter. */
-static inline void __io_unaccount_mem(struct user_struct *user,
-                                     unsigned long nr_pages)
-{
-       atomic_long_sub(nr_pages, &user->locked_vm);
-}
-
-/*
- * Add @nr_pages to @user's locked_vm counter unless the new total would
- * exceed RLIMIT_MEMLOCK expressed in pages.
- *
- * Returns 0 on success or -ENOMEM if the limit would be exceeded, in which
- * case the counter is left unchanged.
- */
-static inline int __io_account_mem(struct user_struct *user,
-                                  unsigned long nr_pages)
-{
-       unsigned long page_limit, cur_pages, new_pages;
-
-       /* Don't allow more pages than we can safely lock */
-       page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-       /* retry whenever locked_vm changed between the read and the cmpxchg */
-       do {
-               cur_pages = atomic_long_read(&user->locked_vm);
-               new_pages = cur_pages + nr_pages;
-               if (new_pages > page_limit)
-                       return -ENOMEM;
-       } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
-                                       new_pages) != cur_pages);
-
-       return 0;
-}
-
-/*
- * Give back @nr_pages of accounted memory: the pages are subtracted from the
- * ring's user (locked_vm) and from the ring's mm_account (pinned_vm), for
- * whichever of the two is set on @ctx.
- */
-static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
-{
-       struct user_struct *user = ctx->user;
-
-       if (user)
-               __io_unaccount_mem(user, nr_pages);
-
-       if (!ctx->mm_account)
-               return;
-       atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
-}
-
-/*
- * Account @nr_pages against the ring. If the ring has a user, the pages are
- * first charged to it through __io_account_mem() and a failure there is
- * returned as is. If the ring has an mm_account, the pages are then added to
- * its pinned_vm counter.
- *
- * Returns 0 on success or the error from __io_account_mem().
- */
-static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
-{
-       struct user_struct *user = ctx->user;
-
-       if (user) {
-               int err = __io_account_mem(user, nr_pages);
-
-               if (err)
-                       return err;
-       }
-
-       if (ctx->mm_account)
-               atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
-       return 0;
-}
-
-/*
- * Drop one reference on the head page behind @ptr and free the compound page
- * once that reference was the last one. A NULL @ptr is ignored.
- */
-static void io_mem_free(void *ptr)
-{
-       if (ptr) {
-               struct page *head = virt_to_head_page(ptr);
-
-               if (put_page_testzero(head))
-                       free_compound_page(head);
-       }
-}
-
-/*
- * Allocate enough pages to hold @size bytes. The pages are requested zeroed,
- * as one compound page, accounted, and without an allocation-failure warning.
- * Returns whatever __get_free_pages() returns, cast to a pointer.
- */
-static void *io_mem_alloc(size_t size)
-{
-       const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO |
-                           __GFP_NOWARN | __GFP_COMP;
-
-       return (void *) __get_free_pages(flags, get_order(size));
-}
-
-/*
- * Compute the number of bytes needed for struct io_rings with @cq_entries
- * CQEs followed by an SQ index array of @sq_entries u32 entries.
- *
- * If @sq_offset is non-NULL it receives the byte offset at which the SQ array
- * starts. Returns the total size, or SIZE_MAX if any step overflows.
- */
-static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
-                               unsigned int cq_entries, size_t *sq_offset)
-{
-       struct io_rings *rings;
-       size_t off, sq_array_size;
-
-       off = struct_size(rings, cqes, cq_entries);
-       if (off == SIZE_MAX)
-               return SIZE_MAX;
-       /* with IORING_SETUP_CQE32 the size computed so far is doubled */
-       if (ctx->flags & IORING_SETUP_CQE32) {
-               if (check_shl_overflow(off, 1, &off))
-                       return SIZE_MAX;
-       }
-
-#ifdef CONFIG_SMP
-       off = ALIGN(off, SMP_CACHE_BYTES);
-       /* a result of 0 means the round-up wrapped */
-       if (off == 0)
-               return SIZE_MAX;
-#endif
-
-       if (sq_offset)
-               *sq_offset = off;
-
-       sq_array_size = array_size(sizeof(u32), sq_entries);
-       if (sq_array_size == SIZE_MAX)
-               return SIZE_MAX;
-
-       if (check_add_overflow(off, sq_array_size, &off))
-               return SIZE_MAX;
-
-       return off;
-}
-
-/*
- * Tear down the buffer stored in *@slot and set the slot to NULL: every page
- * in its bvec array is unpinned, its accounted pages (if any) are returned
- * through io_unaccount_mem(), and the descriptor is freed.
- *
- * When the slot holds ctx->dummy_ubuf only the slot is cleared.
- */
-static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
-{
-       struct io_mapped_ubuf *buf = *slot;
-       unsigned int n;
-
-       /* the dummy entry is left alone: nothing to unpin, unaccount or free */
-       if (buf == ctx->dummy_ubuf)
-               goto clear;
-
-       for (n = 0; n < buf->nr_bvecs; n++)
-               unpin_user_page(buf->bvec[n].bv_page);
-       if (buf->acct_pages)
-               io_unaccount_mem(ctx, buf->acct_pages);
-       kvfree(buf);
-clear:
-       *slot = NULL;
-}
-
-/*
- * Put callback for buffer resources (it is the callback handed to
- * io_rsrc_data_alloc() when buffers are registered): unmap the buffer carried
- * by @prsrc.
- */
-static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
-{
-       io_buffer_unmap(ctx, &prsrc->buf);
-       /* io_buffer_unmap() has already cleared the slot; this is redundant */
-       prsrc->buf = NULL;
-}
-
-/*
- * Unmap every buffer in ctx->user_bufs, free the table and the buffer rsrc
- * data, and reset the corresponding @ctx fields to their empty state.
- */
-static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
-{
-       unsigned int i;
-
-       /* only the first nr_user_bufs slots are walked */
-       for (i = 0; i < ctx->nr_user_bufs; i++)
-               io_buffer_unmap(ctx, &ctx->user_bufs[i]);
-       kfree(ctx->user_bufs);
-       io_rsrc_data_free(ctx->buf_data);
-       ctx->user_bufs = NULL;
-       ctx->buf_data = NULL;
-       ctx->nr_user_bufs = 0;
-}
-
-/*
- * Unregister all registered buffers once outstanding references to the
- * buffer rsrc data have been quiesced.
- *
- * Returns -ENXIO if no buffers are registered, the error from
- * io_rsrc_ref_quiesce() if quiescing fails (the buffers then stay
- * registered), or 0 on success.
- */
-static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
-{
-       unsigned int saved_nr = ctx->nr_user_bufs;
-       int err;
-
-       if (!ctx->buf_data)
-               return -ENXIO;
-
-       /*
-        * Quiescing may drop ->uring_lock. Advertise an empty table for the
-        * duration so new requests cannot start using it, then restore the
-        * real count.
-        */
-       ctx->nr_user_bufs = 0;
-       err = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
-       ctx->nr_user_bufs = saved_nr;
-       if (err)
-               return err;
-
-       __io_sqe_buffers_unregister(ctx);
-       return 0;
-}
-
-/*
- * Copy entry @index of the user iovec array at @arg into @dst.
- *
- * With CONFIG_COMPAT and a compat ring the array is read as struct
- * compat_iovec and widened into the native layout; otherwise it is read as
- * struct iovec directly. Returns 0 on success or -EFAULT if the copy from
- * user space fails.
- */
-static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
-                      void __user *arg, unsigned index)
-{
-       struct iovec __user *src;
-
-#ifdef CONFIG_COMPAT
-       if (ctx->compat) {
-               struct compat_iovec __user *ciovs;
-               struct compat_iovec ciov;
-
-               ciovs = (struct compat_iovec __user *) arg;
-               if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
-                       return -EFAULT;
-
-               /* widen the compat base to a native user pointer */
-               dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
-               dst->iov_len = ciov.iov_len;
-               return 0;
-       }
-#endif
-       src = (struct iovec __user *) arg;
-       if (copy_from_user(dst, &src[index], sizeof(*dst)))
-               return -EFAULT;
-       return 0;
-}
-
-/*
- * Report whether compound head @hpage already backs a page we know about,
- * either among the first @nr_pages entries of @pages or in any buffer found
- * in ctx->user_bufs. Such a head must not be accounted a second time, which
- * is what allows a huge page to be charged once at its full size rather than
- * per constituent page.
- *
- * This is a plain linear scan. That is fine for a registration-time path,
- * and the caller remembers the last compound head it saw so the full search
- * is usually only needed when the head changes.
- */
-static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
-                                 int nr_pages, struct page *hpage)
-{
-       int cur, seg;
-
-       /* pages of the buffer that is being registered right now */
-       for (cur = 0; cur < nr_pages; cur++) {
-               struct page *page = pages[cur];
-
-               if (PageCompound(page) && compound_head(page) == hpage)
-                       return true;
-       }
-
-       /* pages of the buffers registered before it */
-       for (cur = 0; cur < ctx->nr_user_bufs; cur++) {
-               struct io_mapped_ubuf *imu = ctx->user_bufs[cur];
-
-               for (seg = 0; seg < imu->nr_bvecs; seg++) {
-                       struct page *page = imu->bvec[seg].bv_page;
-
-                       if (PageCompound(page) && compound_head(page) == hpage)
-                               return true;
-               }
-       }
-
-       return false;
-}
-
-/*
- * Work out how many pages must be accounted for the @nr_pages pinned pages in
- * @pages, store the count in imu->acct_pages and charge it to the ring.
- *
- * A non-compound page counts as one page. A compound page is charged at the
- * full size of its head, once: it is skipped when the head equals
- * *@last_hpage or when headpage_already_acct() finds it. *@last_hpage is
- * updated as new heads are seen.
- *
- * Returns 0 on success (including when nothing needs accounting) or the error
- * from io_account_mem(), in which case imu->acct_pages is reset to 0.
- */
-static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
-                                int nr_pages, struct io_mapped_ubuf *imu,
-                                struct page **last_hpage)
-{
-       int i, ret;
-
-       imu->acct_pages = 0;
-       for (i = 0; i < nr_pages; i++) {
-               if (!PageCompound(pages[i])) {
-                       imu->acct_pages++;
-               } else {
-                       struct page *hpage;
-
-                       hpage = compound_head(pages[i]);
-                       /* cheap check first: same head as the previous page */
-                       if (hpage == *last_hpage)
-                               continue;
-                       *last_hpage = hpage;
-                       /* only pages[0..i) of this buffer are searched */
-                       if (headpage_already_acct(ctx, pages, i, hpage))
-                               continue;
-                       imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
-               }
-       }
-
-       if (!imu->acct_pages)
-               return 0;
-
-       ret = io_account_mem(ctx, imu->acct_pages);
-       /* nothing was charged, so nothing may be uncharged at unmap time */
-       if (ret)
-               imu->acct_pages = 0;
-       return ret;
-}
-
-/*
- * Pin the user pages backing [@ubuf, @ubuf + @len) with FOLL_WRITE and
- * FOLL_LONGTERM.
- *
- * On success returns a kvmalloc'ed array holding the pinned pages and stores
- * the page count in *@npages. On failure returns an ERR_PTR():
- *   -EOVERFLOW   if @ubuf + @len, rounded up to a page, wraps;
- *   -ENOMEM      if a temporary array cannot be allocated;
- *   -EOPNOTSUPP  if a page sits in a file-backed mapping that is neither
- *                shmem nor hugetlbfs;
- *   -EFAULT      if only part of the range could be pinned;
- *   or the negative value returned by pin_user_pages().
- * Any pages pinned before a failure are unpinned again.
- */
-struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
-{
-       unsigned long start, end, nr_pages;
-       struct vm_area_struct **vmas = NULL;
-       struct page **pages = NULL;
-       int i, pret, ret = -ENOMEM;
-
-       /*
-        * Not every caller validates the range first, so refuse one whose
-        * page-rounded end wraps instead of deriving a bogus page count.
-        */
-       if (check_add_overflow(ubuf, len, &end))
-               return ERR_PTR(-EOVERFLOW);
-       if (check_add_overflow(end, (unsigned long)PAGE_SIZE - 1, &end))
-               return ERR_PTR(-EOVERFLOW);
-
-       end >>= PAGE_SHIFT;
-       start = ubuf >> PAGE_SHIFT;
-       nr_pages = end - start;
-
-       pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
-       if (!pages)
-               goto done;
-
-       vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
-                             GFP_KERNEL);
-       if (!vmas)
-               goto done;
-
-       ret = 0;
-       mmap_read_lock(current->mm);
-       pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
-                             pages, vmas);
-       if (pret == nr_pages) {
-               /* don't support file backed memory */
-               for (i = 0; i < nr_pages; i++) {
-                       struct vm_area_struct *vma = vmas[i];
-
-                       if (vma_is_shmem(vma))
-                               continue;
-                       if (vma->vm_file &&
-                           !is_file_hugepages(vma->vm_file)) {
-                               ret = -EOPNOTSUPP;
-                               break;
-                       }
-               }
-               *npages = nr_pages;
-       } else {
-               ret = pret < 0 ? pret : -EFAULT;
-       }
-       mmap_read_unlock(current->mm);
-       if (ret) {
-               /*
-                * if we did partial map, or found file backed vmas,
-                * release any pages we did get
-                */
-               if (pret > 0)
-                       unpin_user_pages(pages, pret);
-               goto done;
-       }
-       ret = 0;
-done:
-       /* the vma array is only needed for the file-backed check above */
-       kvfree(vmas);
-       if (ret < 0) {
-               kvfree(pages);
-               pages = ERR_PTR(ret);
-       }
-       return pages;
-}
-
-/*
- * Pin and map the user buffer described by @iov and store the resulting
- * descriptor in *@pimu.
- *
- * An iovec with a NULL base maps to ctx->dummy_ubuf and pins nothing.
- * Otherwise the pages are pinned with io_pin_pages(), accounted with
- * io_buffer_account_pin() (@last_hpage carries the last compound head seen
- * across calls), and described by one bvec per page.
- *
- * Returns 0 on success. On failure returns a negative error, leaves *@pimu
- * NULL and releases everything acquired here, including the page pins.
- */
-static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
-                                 struct io_mapped_ubuf **pimu,
-                                 struct page **last_hpage)
-{
-       struct io_mapped_ubuf *imu = NULL;
-       struct page **pages = NULL;
-       unsigned long off;
-       size_t size;
-       int ret, nr_pages, i;
-
-       if (!iov->iov_base) {
-               *pimu = ctx->dummy_ubuf;
-               return 0;
-       }
-
-       *pimu = NULL;
-       ret = -ENOMEM;
-
-       pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
-                               &nr_pages);
-       if (IS_ERR(pages)) {
-               ret = PTR_ERR(pages);
-               pages = NULL;
-               goto done;
-       }
-
-       imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
-       if (!imu) {
-               /* the pages are pinned by now; drop the pins before bailing */
-               unpin_user_pages(pages, nr_pages);
-               goto done;
-       }
-
-       ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
-       if (ret) {
-               unpin_user_pages(pages, nr_pages);
-               goto done;
-       }
-
-       /* only the first bvec starts at a non-zero offset within its page */
-       off = (unsigned long) iov->iov_base & ~PAGE_MASK;
-       size = iov->iov_len;
-       for (i = 0; i < nr_pages; i++) {
-               size_t vec_len;
-
-               vec_len = min_t(size_t, size, PAGE_SIZE - off);
-               imu->bvec[i].bv_page = pages[i];
-               imu->bvec[i].bv_len = vec_len;
-               imu->bvec[i].bv_offset = off;
-               off = 0;
-               size -= vec_len;
-       }
-       /* store original address for later verification */
-       imu->ubuf = (unsigned long) iov->iov_base;
-       imu->ubuf_end = imu->ubuf + iov->iov_len;
-       imu->nr_bvecs = nr_pages;
-       *pimu = imu;
-       ret = 0;
-done:
-       if (ret)
-               kvfree(imu);
-       /* the page pointer array is scratch space in every case */
-       kvfree(pages);
-       return ret;
-}
-
-/*
- * Allocate a zeroed table of @nr_args buffer slots and store it in
- * ctx->user_bufs. Returns 0 on success or -ENOMEM if the allocation fails.
- */
-static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
-{
-       struct io_mapped_ubuf **table;
-
-       table = kcalloc(nr_args, sizeof(*table), GFP_KERNEL);
-       ctx->user_bufs = table;
-       if (!table)
-               return -ENOMEM;
-       return 0;
-}
-
-/*
- * Sanity-check a user iovec before it is registered.
- *
- * Only basic limits are enforced here; further size and buffer constraints
- * are not imposed at this point, a wrong buffer gets -EINVAL later when IO is
- * submitted against it.
- *
- * Returns 0 if the iovec is acceptable, -EFAULT for a NULL base with a
- * non-zero length, a zero length with a non-NULL base, or a length above 1G,
- * and -EOVERFLOW if the end of the buffer would wrap.
- */
-static int io_buffer_validate(struct iovec *iov)
-{
-       unsigned long end, span;
-
-       /* a NULL base is only valid together with a zero length */
-       if (!iov->iov_base)
-               return iov->iov_len ? -EFAULT : 0;
-
-       /* empty buffers are refused; the 1G cap is arbitrary but needed */
-       if (!iov->iov_len || iov->iov_len > SZ_1G)
-               return -EFAULT;
-
-       /* base + len + (PAGE_SIZE - 1) must not wrap */
-       span = iov->iov_len + (PAGE_SIZE - 1);
-       if (check_add_overflow((unsigned long)iov->iov_base, span, &end))
-               return -EOVERFLOW;
-
-       return 0;
-}
-
-/*
- * Register @nr_args buffers described by the user iovec array @arg, with
- * optional per-buffer @tags. A NULL @arg registers @nr_args empty slots.
- *
- * Returns 0 on success, -EBUSY if buffers are already registered, -EINVAL if
- * @nr_args is 0 or above IORING_MAX_REG_BUFFERS or if an empty slot carries a
- * tag, or the error of whichever step failed. On failure everything
- * registered so far is torn down again.
- */
-static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
-                                  unsigned int nr_args, u64 __user *tags)
-{
-       struct page *last_hpage = NULL;
-       struct io_rsrc_data *data;
-       int i, ret;
-       struct iovec iov;
-
-       if (ctx->user_bufs)
-               return -EBUSY;
-       if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
-               return -EINVAL;
-       ret = io_rsrc_node_switch_start(ctx);
-       if (ret)
-               return ret;
-       ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
-       if (ret)
-               return ret;
-       ret = io_buffers_map_alloc(ctx, nr_args);
-       if (ret) {
-               io_rsrc_data_free(data);
-               return ret;
-       }
-
-       /* nr_user_bufs grows with i, so it always counts the filled slots */
-       for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
-               if (arg) {
-                       ret = io_copy_iov(ctx, &iov, arg, i);
-                       if (ret)
-                               break;
-                       ret = io_buffer_validate(&iov);
-                       if (ret)
-                               break;
-               } else {
-                       /* no array given: register an empty slot */
-                       memset(&iov, 0, sizeof(iov));
-               }
-
-               /* an empty slot must not carry a tag */
-               if (!iov.iov_base && *io_get_tag_slot(data, i)) {
-                       ret = -EINVAL;
-                       break;
-               }
-
-               ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
-                                            &last_hpage);
-               if (ret)
-                       break;
-       }
-
-       WARN_ON_ONCE(ctx->buf_data);
-
-       /* published before the error path so unregister frees it as well */
-       ctx->buf_data = data;
-       if (ret)
-               __io_sqe_buffers_unregister(ctx);
-       else
-               io_rsrc_node_switch(ctx, NULL);
-       return ret;
-}
-
-/*
- * Replace @nr_args registered buffers starting at index up->offset with the
- * iovecs found at up->data, taking their tags from up->tags when it is set.
- *
- * For each entry the new buffer is registered first; the buffer it replaces
- * is then queued for removal unless the slot held ctx->dummy_ubuf.
- *
- * Returns -ENXIO if no buffers are registered and -EINVAL if the range
- * exceeds the table. Otherwise returns the number of entries handled before
- * the loop stopped when that is non-zero, else the error of the first entry.
- */
-static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
-                                  struct io_uring_rsrc_update2 *up,
-                                  unsigned int nr_args)
-{
-       u64 __user *tags = u64_to_user_ptr(up->tags);
-       struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
-       struct page *last_hpage = NULL;
-       bool needs_switch = false;
-       __u32 done;
-       int i, err;
-
-       if (!ctx->buf_data)
-               return -ENXIO;
-       if (up->offset + nr_args > ctx->nr_user_bufs)
-               return -EINVAL;
-
-       for (done = 0; done < nr_args; done++) {
-               struct io_mapped_ubuf *imu;
-               int offset = up->offset + done;
-               u64 tag = 0;
-
-               err = io_copy_iov(ctx, &iov, iovs, done);
-               if (err)
-                       break;
-               if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
-                       err = -EFAULT;
-                       break;
-               }
-               err = io_buffer_validate(&iov);
-               if (err)
-                       break;
-               /* an empty slot must not carry a tag */
-               if (!iov.iov_base && tag) {
-                       err = -EINVAL;
-                       break;
-               }
-               err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
-               if (err)
-                       break;
-
-               i = array_index_nospec(offset, ctx->nr_user_bufs);
-               if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
-                       err = io_queue_rsrc_removal(ctx->buf_data, i,
-                                                   ctx->rsrc_node, ctx->user_bufs[i]);
-                       if (unlikely(err)) {
-                               /* the new buffer was never installed; undo it */
-                               io_buffer_unmap(ctx, &imu);
-                               break;
-                       }
-                       ctx->user_bufs[i] = NULL;
-                       needs_switch = true;
-               }
-
-               ctx->user_bufs[i] = imu;
-               /*
-                * Index the tag table with the speculation-safe index as
-                * well, not with the raw user-controlled offset.
-                */
-               *io_get_tag_slot(ctx->buf_data, i) = tag;
-       }
-
-       /* at least one removal was queued on the current node */
-       if (needs_switch)
-               io_rsrc_node_switch(ctx, ctx->buf_data);
-       /* partial progress is reported as a count and hides the error */
-       return done ? done : err;
-}
-
-/*
- * Attach the eventfd whose descriptor is read from user pointer @arg to the
- * ring. @eventfd_async is stored in the new entry as is.
- *
- * Returns 0 on success, -EBUSY if an eventfd is already registered, -EFAULT
- * if the descriptor cannot be read, -ENOMEM on allocation failure, or the
- * error from eventfd_ctx_fdget().
- */
-static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
-                              unsigned int eventfd_async)
-{
-       struct io_ev_fd *ev_fd;
-       __s32 __user *fds = arg;
-       int fd;
-
-       /* the lockdep expression documents that ->uring_lock is expected */
-       ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
-                                       lockdep_is_held(&ctx->uring_lock));
-       if (ev_fd)
-               return -EBUSY;
-
-       if (copy_from_user(&fd, fds, sizeof(*fds)))
-               return -EFAULT;
-
-       ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
-       if (!ev_fd)
-               return -ENOMEM;
-
-       ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
-       if (IS_ERR(ev_fd->cq_ev_fd)) {
-               int ret = PTR_ERR(ev_fd->cq_ev_fd);
-               kfree(ev_fd);
-               return ret;
-       }
-       ev_fd->eventfd_async = eventfd_async;
-       ctx->has_evfd = true;
-       /* publish only after the entry is fully initialised */
-       rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
-       return 0;
-}
-
-/*
- * RCU callback (queued with call_rcu() by io_eventfd_unregister()): drop the
- * eventfd context reference held by the entry embedding @rcu and free the
- * entry.
- */
-static void io_eventfd_put(struct rcu_head *rcu)
-{
-       struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
-
-       eventfd_ctx_put(ev_fd->cq_ev_fd);
-       kfree(ev_fd);
-}
-
-/*
- * Detach the ring's eventfd, if any. The entry is unpublished right away and
- * released through io_eventfd_put() from an RCU callback.
- *
- * Returns 0 on success or -ENXIO if no eventfd is registered.
- */
-static int io_eventfd_unregister(struct io_ring_ctx *ctx)
-{
-       struct io_ev_fd *ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
-                                       lockdep_is_held(&ctx->uring_lock));
-
-       if (!ev_fd)
-               return -ENXIO;
-
-       ctx->has_evfd = false;
-       rcu_assign_pointer(ctx->io_ev_fd, NULL);
-       call_rcu(&ev_fd->rcu, io_eventfd_put);
-       return 0;
-}
-
-static void io_req_caches_free(struct io_ring_ctx *ctx)
+static void io_req_caches_free(struct io_ring_ctx *ctx)
 {
        struct io_submit_state *state = &ctx->submit_state;
        int nr = 0;
@@ -5078,12 +3758,6 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
        mutex_unlock(&ctx->uring_lock);
 }
 
-/*
- * Drop one reference on @data and, unless that was the last one, block until
- * data->done is completed. A NULL @data is ignored.
- */
-static void io_wait_rsrc_data(struct io_rsrc_data *data)
-{
-       if (!data)
-               return;
-       if (atomic_dec_and_test(&data->refs))
-               return;
-       wait_for_completion(&data->done);
-}
-
 static void io_flush_apoll_cache(struct io_ring_ctx *ctx)
 {
        struct async_poll *apoll;
@@ -6228,89 +4902,6 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx)
        return 0;
 }
 
-/*
- * Common tail of the resource update paths: validate the range, prepare a
- * rsrc node switch and dispatch on @type to the file or buffer update.
- *
- * Returns -EOVERFLOW if up->offset + @nr_args overflows 32 bits, the error
- * from io_rsrc_node_switch_start(), -EINVAL for an unknown @type, or the
- * result of the type specific update.
- */
-static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
-                                    struct io_uring_rsrc_update2 *up,
-                                    unsigned nr_args)
-{
-       __u32 end;
-       int ret;
-
-       /* only the overflow matters here; the sum itself is not used */
-       if (check_add_overflow(up->offset, nr_args, &end))
-               return -EOVERFLOW;
-
-       ret = io_rsrc_node_switch_start(ctx);
-       if (ret)
-               return ret;
-
-       if (type == IORING_RSRC_FILE)
-               return __io_sqe_files_update(ctx, up, nr_args);
-       if (type == IORING_RSRC_BUFFER)
-               return __io_sqe_buffers_update(ctx, up, nr_args);
-       return -EINVAL;
-}
-
-/*
- * Files update taking the shorter struct io_uring_rsrc_update layout from
- * user pointer @arg, with the entry count passed separately as @nr_args.
- *
- * Returns -EINVAL if @nr_args is 0 or a reserved field is set, -EFAULT if
- * @arg cannot be read, otherwise the result of the file update.
- */
-static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
-                                   unsigned nr_args)
-{
-       struct io_uring_rsrc_update2 up;
-
-       if (!nr_args)
-               return -EINVAL;
-       /*
-        * Only the shorter struct is copied in, so zero the larger one first;
-        * whatever lies beyond the copied bytes then stays 0.
-        */
-       memset(&up, 0, sizeof(up));
-       if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
-               return -EFAULT;
-       if (up.resv || up.resv2)
-               return -EINVAL;
-       return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
-}
-
-/*
- * Resource update taking a full struct io_uring_rsrc_update2 from user
- * pointer @arg; the entry count comes from the struct's nr field.
- *
- * Returns -EINVAL if @size is not exactly the struct size, if nr is 0 or if
- * a reserved field is set, -EFAULT if @arg cannot be read, otherwise the
- * result of the update for @type.
- */
-static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
-                                  unsigned size, unsigned type)
-{
-       struct io_uring_rsrc_update2 up;
-
-       if (size != sizeof(up))
-               return -EINVAL;
-       if (copy_from_user(&up, arg, sizeof(up)))
-               return -EFAULT;
-       if (!up.nr || up.resv || up.resv2)
-               return -EINVAL;
-       return __io_register_rsrc_update(ctx, type, &up, up.nr);
-}
-
-/*
- * Register files or buffers (selected by @type) from the struct
- * io_uring_rsrc_register at user pointer @arg.
- *
- * Returns -EINVAL if @size is not exactly the struct size, if nr is 0, if
- * resv2 is set, if a flag other than IORING_RSRC_REGISTER_SPARSE is set, if
- * the sparse flag is combined with a non-zero data pointer, or if @type is
- * unknown. Returns -EFAULT if @arg cannot be read, otherwise the result of
- * the type specific registration.
- */
-static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
-                           unsigned int size, unsigned int type)
-{
-       struct io_uring_rsrc_register rr;
-
-       /* keep it extendible */
-       if (size != sizeof(rr))
-               return -EINVAL;
-
-       memset(&rr, 0, sizeof(rr));
-       if (copy_from_user(&rr, arg, size))
-               return -EFAULT;
-       if (!rr.nr || rr.resv2)
-               return -EINVAL;
-       if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
-               return -EINVAL;
-
-       switch (type) {
-       case IORING_RSRC_FILE:
-               /* sparse registration takes no data; fall out to -EINVAL */
-               if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
-                       break;
-               return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
-                                            rr.nr, u64_to_user_ptr(rr.tags));
-       case IORING_RSRC_BUFFER:
-               /* sparse registration takes no data; fall out to -EINVAL */
-               if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
-                       break;
-               return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
-                                              rr.nr, u64_to_user_ptr(rr.tags));
-       }
-       return -EINVAL;
-}
-
 static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
                                       void __user *arg, unsigned len)
 {
@@ -7103,7 +5694,6 @@ static int __init io_uring_init(void)
                     sizeof(struct io_uring_rsrc_update2));
 
        /* ->buf_index is u16 */
-       BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
        BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0);
        BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) !=
                     offsetof(struct io_uring_buf_ring, tail));
index 172defdcfdbe4081f3637d8f7ec5d3575106da6b..090c17deba9db250b8d81ee23fcfe86df9f8e9bb 100644 (file)
@@ -92,6 +92,7 @@ static inline bool io_run_task_work(void)
        return false;
 }
 
+int io_run_task_work_sig(void);
 void io_req_complete_failed(struct io_kiocb *req, s32 res);
 void __io_req_complete(struct io_kiocb *req, unsigned issue_flags);
 void io_req_complete_post(struct io_kiocb *req);
@@ -110,11 +111,6 @@ int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
 int io_install_fixed_file(struct io_kiocb *req, struct file *file,
                          unsigned int issue_flags, u32 slot_index);
 
-int io_rsrc_node_switch_start(struct io_ring_ctx *ctx);
-int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
-                         struct io_rsrc_node *node, void *rsrc);
-void io_rsrc_node_switch(struct io_ring_ctx *ctx,
-                        struct io_rsrc_data *data_to_kill);
 bool io_is_uring_fops(struct file *file);
 bool io_alloc_async_data(struct io_kiocb *req);
 void io_req_task_work_add(struct io_kiocb *req);
index fa35bd56a33086271da91376ec4a57d1966342b1..1cbf3903097053f96d91bf76af48493462b2a7cf 100644 (file)
@@ -14,6 +14,7 @@
 
 #include "io_uring_types.h"
 #include "io_uring.h"
+#include "rsrc.h"
 #include "openclose.h"
 
 struct io_open {
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
new file mode 100644 (file)
index 0000000..8c40b20
--- /dev/null
@@ -0,0 +1,1320 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/nospec.h>
+#include <linux/hugetlb.h>
+#include <linux/compat.h>
+#include <linux/io_uring.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "io_uring_types.h"
+#include "io_uring.h"
+#include "openclose.h"
+#include "rsrc.h"
+
+/*
+ * Per-request state of a files-update command. io_files_update_prep() fills
+ * it from the SQE: ->offset from sqe->off, ->nr_args from sqe->len and ->arg
+ * from sqe->addr (the user address of the fd array).
+ */
+struct io_rsrc_update {
+       struct file                     *file;
+       u64                             arg;
+       u32                             nr_args;
+       u32                             offset;
+};
+
+static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
+                                 struct io_mapped_ubuf **pimu,
+                                 struct page **last_hpage);
+
+#define IO_RSRC_REF_BATCH      100
+
+/* upper bounds on the number of registered files and registered buffers */
+#define IORING_MAX_FIXED_FILES (1U << 20)
+#define IORING_MAX_REG_BUFFERS (1U << 14)
+
+/*
+ * Hand all locally cached rsrc node references back to the current node and
+ * reset the cache counter. Does nothing when no references are cached.
+ */
+void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
+       __must_hold(&ctx->uring_lock)
+{
+       if (!ctx->rsrc_cached_refs)
+               return;
+
+       io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs);
+       ctx->rsrc_cached_refs = 0;
+}
+
+/* Subtract nr_pages from the user's locked_vm counter. */
+static inline void __io_unaccount_mem(struct user_struct *user,
+                                     unsigned long nr_pages)
+{
+       atomic_long_sub(nr_pages, &user->locked_vm);
+}
+
+/*
+ * Add nr_pages to user->locked_vm unless the new total would exceed the
+ * RLIMIT_MEMLOCK limit expressed in pages.
+ *
+ * Returns 0 on success, -ENOMEM if the limit would be exceeded (in which
+ * case the counter is left unchanged).
+ */
+static inline int __io_account_mem(struct user_struct *user,
+                                  unsigned long nr_pages)
+{
+       unsigned long page_limit, cur_pages, new_pages;
+
+       /* Don't allow more pages than we can safely lock */
+       page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+       /* retry whenever locked_vm changed between the read and the cmpxchg */
+       do {
+               cur_pages = atomic_long_read(&user->locked_vm);
+               new_pages = cur_pages + nr_pages;
+               if (new_pages > page_limit)
+                       return -ENOMEM;
+       } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
+                                       new_pages) != cur_pages);
+
+       return 0;
+}
+
+/*
+ * Undo io_account_mem(): give nr_pages back to the ring's user (if any) and
+ * to the pinned_vm counter of the accounted mm (if any).
+ */
+static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
+{
+       struct user_struct *user = ctx->user;
+
+       if (user)
+               __io_unaccount_mem(user, nr_pages);
+
+       if (!ctx->mm_account)
+               return;
+
+       atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
+}
+
+/*
+ * Charge nr_pages to the ring's user (if any) and to the pinned_vm counter
+ * of the accounted mm (if any).
+ *
+ * Returns 0 on success or the error from __io_account_mem(); on error
+ * pinned_vm is not touched.
+ */
+static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
+{
+       if (ctx->user) {
+               int err = __io_account_mem(ctx->user, nr_pages);
+
+               if (err)
+                       return err;
+       }
+
+       if (!ctx->mm_account)
+               return 0;
+
+       atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
+       return 0;
+}
+
+/*
+ * Copy element @index of the user iovec array at @arg into @dst. For compat
+ * rings the array holds struct compat_iovec entries, which are widened into
+ * the native layout.
+ *
+ * Returns 0 on success, -EFAULT if the user copy fails.
+ */
+static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
+                      void __user *arg, unsigned index)
+{
+       struct iovec __user *uiov = arg;
+
+#ifdef CONFIG_COMPAT
+       if (ctx->compat) {
+               struct compat_iovec __user *cuiov = arg;
+               struct compat_iovec tmp;
+
+               if (copy_from_user(&tmp, cuiov + index, sizeof(tmp)))
+                       return -EFAULT;
+
+               dst->iov_len = tmp.iov_len;
+               dst->iov_base = u64_to_user_ptr((u64)tmp.iov_base);
+               return 0;
+       }
+#endif
+       return copy_from_user(dst, uiov + index, sizeof(*dst)) ? -EFAULT : 0;
+}
+
+/*
+ * Sanity-check an iovec before it is registered as a fixed buffer.
+ *
+ * Only coarse checks are done here; size and alignment constraints that
+ * depend on the IO are left to submission time, which fails with -EINVAL.
+ *
+ * Returns 0 if acceptable, -EFAULT for a NULL/empty mismatch or a length
+ * above 1G, -EOVERFLOW if base plus page-rounded length wraps.
+ */
+static int io_buffer_validate(struct iovec *iov)
+{
+       unsigned long end, acct_len;
+
+       /* a NULL base is only acceptable together with a zero length */
+       if (!iov->iov_base) {
+               if (iov->iov_len)
+                       return -EFAULT;
+               return 0;
+       }
+
+       /* reject empty buffers; the 1G cap is arbitrary, but we need one */
+       if (!iov->iov_len || iov->iov_len > SZ_1G)
+               return -EFAULT;
+
+       acct_len = iov->iov_len + (PAGE_SIZE - 1);
+       if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &end))
+               return -EOVERFLOW;
+
+       return 0;
+}
+
+/*
+ * Release the mapped buffer stored in *slot and set the slot to NULL.
+ *
+ * The shared ctx->dummy_ubuf placeholder is never unpinned or freed; for it
+ * only the slot is cleared.
+ */
+static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
+{
+       struct io_mapped_ubuf *imu = *slot;
+       unsigned int i;
+
+       if (imu != ctx->dummy_ubuf) {
+               /* drop the pin on every page referenced by the bvec array */
+               for (i = 0; i < imu->nr_bvecs; i++)
+                       unpin_user_page(imu->bvec[i].bv_page);
+               /* give back whatever was accounted for this buffer */
+               if (imu->acct_pages)
+                       io_unaccount_mem(ctx, imu->acct_pages);
+               kvfree(imu);
+       }
+       *slot = NULL;
+}
+
+/*
+ * Take IO_RSRC_REF_BATCH references on the current rsrc node in one go and
+ * record them in ctx->rsrc_cached_refs.
+ */
+void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
+       __must_hold(&ctx->uring_lock)
+{
+       ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
+       percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
+}
+
+/*
+ * Release every resource queued on @ref_node.
+ *
+ * For each queued entry: if it carries a tag, a CQE is posted through
+ * io_fill_cqe_aux() with that tag; then the owning data's ->do_put()
+ * callback is invoked and the entry is freed. Afterwards the node itself is
+ * destroyed and one reference on the rsrc data is dropped, completing
+ * ->done when it was the last one.
+ */
+static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
+{
+       struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
+       struct io_ring_ctx *ctx = rsrc_data->ctx;
+       struct io_rsrc_put *prsrc, *tmp;
+
+       list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
+               list_del(&prsrc->list);
+
+               if (prsrc->tag) {
+                       /* IOPOLL rings additionally take ->uring_lock to post */
+                       if (ctx->flags & IORING_SETUP_IOPOLL)
+                               mutex_lock(&ctx->uring_lock);
+
+                       spin_lock(&ctx->completion_lock);
+                       io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);
+                       io_commit_cqring(ctx);
+                       spin_unlock(&ctx->completion_lock);
+                       io_cqring_ev_posted(ctx);
+
+                       if (ctx->flags & IORING_SETUP_IOPOLL)
+                               mutex_unlock(&ctx->uring_lock);
+               }
+
+               rsrc_data->do_put(ctx, prsrc);
+               kfree(prsrc);
+       }
+
+       io_rsrc_node_destroy(ref_node);
+       /* pairs with the atomic_inc() done when the node was queued for removal */
+       if (atomic_dec_and_test(&rsrc_data->refs))
+               complete(&rsrc_data->done);
+}
+
+/*
+ * Delayed-work handler: detach everything currently on ctx->rsrc_put_llist
+ * and run the put work for each node on it.
+ */
+void io_rsrc_put_work(struct work_struct *work)
+{
+       struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
+                                              rsrc_put_work.work);
+       struct llist_node *pos, *next;
+
+       /* next is sampled first: the put work frees the node embedding pos */
+       for (pos = llist_del_all(&ctx->rsrc_put_llist); pos; pos = next) {
+               struct io_rsrc_node *ref_node;
+
+               next = pos->next;
+               ref_node = llist_entry(pos, struct io_rsrc_node, llist);
+               __io_rsrc_put_work(ref_node);
+       }
+}
+
+/*
+ * Drop one reference on @data and, unless that was the last one, block until
+ * data->done is completed. A NULL @data is ignored.
+ */
+void io_wait_rsrc_data(struct io_rsrc_data *data)
+{
+       if (!data)
+               return;
+       if (atomic_dec_and_test(&data->refs))
+               return;
+
+       wait_for_completion(&data->done);
+}
+
+/* Tear down the node's percpu reference counter and free the node. */
+void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
+{
+       percpu_ref_exit(&ref_node->refs);
+       kfree(ref_node);
+}
+
+/*
+ * Release callback registered with percpu_ref_init() for a node's ->refs.
+ *
+ * Marks the node done, then moves every done node found at the head of
+ * ctx->rsrc_ref_list onto ctx->rsrc_put_llist and schedules the delayed put
+ * work. The work is delayed by HZ jiffies unless the node's rsrc data is
+ * being quiesced, in which case it is scheduled without delay.
+ */
+static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
+{
+       struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
+       struct io_ring_ctx *ctx = node->rsrc_data->ctx;
+       unsigned long flags;
+       bool first_add = false;
+       unsigned long delay = HZ;
+
+       spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
+       node->done = true;
+
+       /* if we are mid-quiesce then do not delay */
+       if (node->rsrc_data->quiesce)
+               delay = 0;
+
+       /* note: 'node' is reused below as the list cursor */
+       while (!list_empty(&ctx->rsrc_ref_list)) {
+               node = list_first_entry(&ctx->rsrc_ref_list,
+                                           struct io_rsrc_node, node);
+               /* recycle ref nodes in order */
+               if (!node->done)
+                       break;
+               list_del(&node->node);
+               first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
+       }
+       spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
+
+       /* only (re)arm the work if llist_add() returned true for some node */
+       if (first_add)
+               mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
+}
+
+/*
+ * Allocate a zeroed rsrc node with an initialised percpu ref (release
+ * callback io_rsrc_node_ref_zero) and empty lists.
+ *
+ * Returns the node, or NULL if either allocation fails.
+ */
+static struct io_rsrc_node *io_rsrc_node_alloc(void)
+{
+       struct io_rsrc_node *node = kzalloc(sizeof(*node), GFP_KERNEL);
+
+       if (!node)
+               return NULL;
+
+       if (percpu_ref_init(&node->refs, io_rsrc_node_ref_zero, 0,
+                           GFP_KERNEL))
+               goto free_node;
+
+       node->done = false;
+       INIT_LIST_HEAD(&node->rsrc_list);
+       INIT_LIST_HEAD(&node->node);
+       return node;
+
+free_node:
+       kfree(node);
+       return NULL;
+}
+
+/*
+ * Install the preallocated backup node as ctx->rsrc_node when there is no
+ * current node.
+ *
+ * If @data_to_kill is given, the current node is first bound to it, appended
+ * to ctx->rsrc_ref_list, its percpu ref is killed and ctx->rsrc_node is
+ * cleared, so that the backup node takes over. A reference on @data_to_kill
+ * is taken on behalf of the retired node.
+ *
+ * ctx->rsrc_backup_node is expected to be set beforehand (see
+ * io_rsrc_node_switch_start()); the WARN_ON_ONCE() below fires otherwise.
+ */
+void io_rsrc_node_switch(struct io_ring_ctx *ctx,
+                        struct io_rsrc_data *data_to_kill)
+       __must_hold(&ctx->uring_lock)
+{
+       WARN_ON_ONCE(!ctx->rsrc_backup_node);
+       WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
+
+       /* return cached references before the current node may be retired */
+       io_rsrc_refs_drop(ctx);
+
+       if (data_to_kill) {
+               struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
+
+               rsrc_node->rsrc_data = data_to_kill;
+               spin_lock_irq(&ctx->rsrc_ref_lock);
+               list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
+               spin_unlock_irq(&ctx->rsrc_ref_lock);
+
+               atomic_inc(&data_to_kill->refs);
+               percpu_ref_kill(&rsrc_node->refs);
+               ctx->rsrc_node = NULL;
+       }
+
+       if (!ctx->rsrc_node) {
+               ctx->rsrc_node = ctx->rsrc_backup_node;
+               ctx->rsrc_backup_node = NULL;
+       }
+}
+
+/*
+ * Make sure ctx->rsrc_backup_node is populated, allocating a node if it is
+ * not. Returns 0 on success, -ENOMEM if the allocation fails.
+ */
+int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
+{
+       if (!ctx->rsrc_backup_node) {
+               ctx->rsrc_backup_node = io_rsrc_node_alloc();
+               if (!ctx->rsrc_backup_node)
+                       return -ENOMEM;
+       }
+       return 0;
+}
+
+/*
+ * Retire the current rsrc node on behalf of @data and wait until all
+ * references on @data are gone.
+ *
+ * ctx->uring_lock is dropped while waiting and is held again on every return
+ * path, so the function expects to be entered with it held.
+ *
+ * Returns 0 once data->refs has reached zero, -ENXIO if a quiesce of @data
+ * is already in progress, the error from io_rsrc_node_switch_start(), or a
+ * negative value returned by io_run_task_work_sig().
+ */
+__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
+                                     struct io_ring_ctx *ctx)
+{
+       int ret;
+
+       /* As we may drop ->uring_lock, other task may have started quiesce */
+       if (data->quiesce)
+               return -ENXIO;
+
+       data->quiesce = true;
+       do {
+               ret = io_rsrc_node_switch_start(ctx);
+               if (ret)
+                       break;
+               io_rsrc_node_switch(ctx, data);
+
+               /* kill initial ref, already quiesced if zero */
+               if (atomic_dec_and_test(&data->refs))
+                       break;
+               mutex_unlock(&ctx->uring_lock);
+               flush_delayed_work(&ctx->rsrc_put_work);
+               ret = wait_for_completion_interruptible(&data->done);
+               if (!ret) {
+                       mutex_lock(&ctx->uring_lock);
+                       if (atomic_read(&data->refs) > 0) {
+                               /*
+                                * it has been revived by another thread while
+                                * we were unlocked
+                                */
+                               mutex_unlock(&ctx->uring_lock);
+                       } else {
+                               break;
+                       }
+               }
+
+               /* retry path: restore the initial ref dropped above */
+               atomic_inc(&data->refs);
+               /* wait for all works potentially completing data->done */
+               flush_delayed_work(&ctx->rsrc_put_work);
+               reinit_completion(&data->done);
+
+               /* the result of the interrupted wait is replaced here */
+               ret = io_run_task_work_sig();
+               mutex_lock(&ctx->uring_lock);
+       } while (ret >= 0);
+       data->quiesce = false;
+
+       return ret;
+}
+
+/*
+ * Free a table built by io_alloc_page_table(): one chunk per PAGE_SIZE of
+ * @size (rounded up), then the pointer array itself.
+ */
+static void io_free_page_table(void **table, size_t size)
+{
+       unsigned int nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
+       unsigned int idx = 0;
+
+       while (idx < nr_tables)
+               kfree(table[idx++]);
+       kfree(table);
+}
+
+/* Free the tag table of @data (if one was allocated) and @data itself. */
+static void io_rsrc_data_free(struct io_rsrc_data *data)
+{
+       if (data->tags) {
+               size_t bytes = data->nr * sizeof(data->tags[0][0]);
+
+               io_free_page_table((void **)data->tags, bytes);
+       }
+       kfree(data);
+}
+
+/*
+ * Allocate a two-level table covering @size bytes: an array of pointers and,
+ * behind each pointer, a zeroed chunk of at most PAGE_SIZE bytes (the last
+ * chunk holds the remainder).
+ *
+ * Returns the pointer array, or NULL on allocation failure after freeing
+ * everything allocated so far.
+ */
+static __cold void **io_alloc_page_table(size_t size)
+{
+       unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
+       size_t init_size = size;
+       void **table;
+
+       table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
+       if (!table)
+               return NULL;
+
+       for (i = 0; i < nr_tables; i++) {
+               unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
+
+               table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
+               if (!table[i]) {
+                       /* unfilled entries are still NULL from kcalloc() */
+                       io_free_page_table(table, init_size);
+                       return NULL;
+               }
+               size -= this_size;
+       }
+       return table;
+}
+
+/*
+ * Allocate and initialise an io_rsrc_data describing @nr resources.
+ *
+ * A tag table with one u64 per resource is allocated; if @utags is non-NULL
+ * the @nr tags are copied in from user space, otherwise they stay zero.
+ * On success the new object is stored in *pdata with ->refs set to 1.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, -EFAULT if copying a
+ * tag from user space fails.
+ */
+__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx,
+                                    rsrc_put_fn *do_put, u64 __user *utags,
+                                    unsigned nr, struct io_rsrc_data **pdata)
+{
+       struct io_rsrc_data *data;
+       int ret = -ENOMEM;
+       unsigned i;
+
+       data = kzalloc(sizeof(*data), GFP_KERNEL);
+       if (!data)
+               return -ENOMEM;
+       data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
+       if (!data->tags) {
+               kfree(data);
+               return -ENOMEM;
+       }
+
+       /* ->nr must be set before the fail path: io_rsrc_data_free() uses it */
+       data->nr = nr;
+       data->ctx = ctx;
+       data->do_put = do_put;
+       if (utags) {
+               ret = -EFAULT;
+               for (i = 0; i < nr; i++) {
+                       u64 *tag_slot = io_get_tag_slot(data, i);
+
+                       if (copy_from_user(tag_slot, &utags[i],
+                                          sizeof(*tag_slot)))
+                               goto fail;
+               }
+       }
+
+       atomic_set(&data->refs, 1);
+       init_completion(&data->done);
+       *pdata = data;
+       return 0;
+fail:
+       io_rsrc_data_free(data);
+       return ret;
+}
+
+/*
+ * Update @nr_args slots of the registered file table starting at up->offset.
+ *
+ * For each entry an fd (and, if up->tags is set, a tag) is read from user
+ * space:
+ *   - IORING_REGISTER_FILES_SKIP leaves the slot untouched,
+ *   - -1 only removes whatever the slot currently holds,
+ *   - any other value replaces the slot with that file.
+ * A non-zero tag combined with SKIP or -1 is rejected with -EINVAL.
+ * A file currently in the slot is queued for removal on the current rsrc
+ * node, and the node is switched at the end if anything was queued.
+ *
+ * Returns the number of entries processed before stopping if that is
+ * non-zero, otherwise the error (or 0) recorded in err.
+ */
+static int __io_sqe_files_update(struct io_ring_ctx *ctx,
+                                struct io_uring_rsrc_update2 *up,
+                                unsigned nr_args)
+{
+       u64 __user *tags = u64_to_user_ptr(up->tags);
+       __s32 __user *fds = u64_to_user_ptr(up->data);
+       struct io_rsrc_data *data = ctx->file_data;
+       struct io_fixed_file *file_slot;
+       struct file *file;
+       int fd, i, err = 0;
+       unsigned int done;
+       bool needs_switch = false;
+
+       if (!ctx->file_data)
+               return -ENXIO;
+       if (up->offset + nr_args > ctx->nr_user_files)
+               return -EINVAL;
+
+       for (done = 0; done < nr_args; done++) {
+               u64 tag = 0;
+
+               if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
+                   copy_from_user(&fd, &fds[done], sizeof(fd))) {
+                       err = -EFAULT;
+                       break;
+               }
+               if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
+                       err = -EINVAL;
+                       break;
+               }
+               if (fd == IORING_REGISTER_FILES_SKIP)
+                       continue;
+
+               /* clamp the index under speculation before touching tables */
+               i = array_index_nospec(up->offset + done, ctx->nr_user_files);
+               file_slot = io_fixed_file_slot(&ctx->file_table, i);
+
+               if (file_slot->file_ptr) {
+                       /* strip the flag bits kept alongside the pointer */
+                       file = (struct file *)(file_slot->file_ptr & FFS_MASK);
+                       err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
+                       if (err)
+                               break;
+                       file_slot->file_ptr = 0;
+                       io_file_bitmap_clear(&ctx->file_table, i);
+                       needs_switch = true;
+               }
+               if (fd != -1) {
+                       file = fget(fd);
+                       if (!file) {
+                               err = -EBADF;
+                               break;
+                       }
+                       /*
+                        * Don't allow io_uring instances to be registered. If
+                        * UNIX isn't enabled, then this causes a reference
+                        * cycle and this instance can never get freed. If UNIX
+                        * is enabled we'll handle it just fine, but there's
+                        * still no point in allowing a ring fd as it doesn't
+                        * support regular read/write anyway.
+                        */
+                       if (io_is_uring_fops(file)) {
+                               fput(file);
+                               err = -EBADF;
+                               break;
+                       }
+                       err = io_scm_file_account(ctx, file);
+                       if (err) {
+                               fput(file);
+                               break;
+                       }
+                       *io_get_tag_slot(data, i) = tag;
+                       io_fixed_file_set(file_slot, file);
+                       io_file_bitmap_set(&ctx->file_table, i);
+               }
+       }
+
+       if (needs_switch)
+               io_rsrc_node_switch(ctx, data);
+       return done ? done : err;
+}
+
+/*
+ * Update @nr_args slots of the registered buffer table starting at
+ * up->offset.
+ *
+ * For each entry an iovec (and, if up->tags is set, a tag) is read from user
+ * space, validated and registered. A slot that currently holds a real buffer
+ * (anything but ctx->dummy_ubuf) has that buffer queued for removal on the
+ * current rsrc node; the node is switched at the end if anything was queued.
+ * A non-zero tag together with a NULL iov_base is rejected with -EINVAL.
+ *
+ * Returns the number of entries processed before stopping if that is
+ * non-zero, otherwise the error (or 0) recorded in err.
+ */
+static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
+                                  struct io_uring_rsrc_update2 *up,
+                                  unsigned int nr_args)
+{
+       u64 __user *tags = u64_to_user_ptr(up->tags);
+       struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
+       struct page *last_hpage = NULL;
+       bool needs_switch = false;
+       __u32 done;
+       /* err starts at 0 so a zero-length call returns a defined value */
+       int i, err = 0;
+
+       if (!ctx->buf_data)
+               return -ENXIO;
+       if (up->offset + nr_args > ctx->nr_user_bufs)
+               return -EINVAL;
+
+       for (done = 0; done < nr_args; done++) {
+               struct io_mapped_ubuf *imu;
+               int offset = up->offset + done;
+               u64 tag = 0;
+
+               err = io_copy_iov(ctx, &iov, iovs, done);
+               if (err)
+                       break;
+               if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
+                       err = -EFAULT;
+                       break;
+               }
+               err = io_buffer_validate(&iov);
+               if (err)
+                       break;
+               if (!iov.iov_base && tag) {
+                       err = -EINVAL;
+                       break;
+               }
+               err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
+               if (err)
+                       break;
+
+               /* clamp the index under speculation before touching tables */
+               i = array_index_nospec(offset, ctx->nr_user_bufs);
+               if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
+                       err = io_queue_rsrc_removal(ctx->buf_data, i,
+                                                   ctx->rsrc_node, ctx->user_bufs[i]);
+                       if (unlikely(err)) {
+                               /* the freshly registered buffer is not installed */
+                               io_buffer_unmap(ctx, &imu);
+                               break;
+                       }
+                       ctx->user_bufs[i] = NULL;
+                       needs_switch = true;
+               }
+
+               ctx->user_bufs[i] = imu;
+               /*
+                * Index the tag table with the sanitised index as well, so
+                * the tag write is bounded the same way as the slot write.
+                */
+               *io_get_tag_slot(ctx->buf_data, i) = tag;
+       }
+
+       if (needs_switch)
+               io_rsrc_node_switch(ctx, ctx->buf_data);
+       return done ? done : err;
+}
+
+/*
+ * Common entry point for file and buffer table updates.
+ *
+ * Rejects ranges whose end (up->offset + nr_args) overflows 32 bits, makes
+ * sure a backup rsrc node is available and then dispatches on @type.
+ *
+ * Returns the handler's result, -EOVERFLOW for an overflowing range, the
+ * error from io_rsrc_node_switch_start(), or -EINVAL for an unknown @type.
+ */
+static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
+                                    struct io_uring_rsrc_update2 *up,
+                                    unsigned nr_args)
+{
+       __u32 end;
+       int ret;
+
+       if (check_add_overflow(up->offset, nr_args, &end))
+               return -EOVERFLOW;
+
+       ret = io_rsrc_node_switch_start(ctx);
+       if (ret)
+               return ret;
+
+       if (type == IORING_RSRC_FILE)
+               return __io_sqe_files_update(ctx, up, nr_args);
+       if (type == IORING_RSRC_BUFFER)
+               return __io_sqe_buffers_update(ctx, up, nr_args);
+       return -EINVAL;
+}
+
+/*
+ * Files update taking the smaller struct io_uring_rsrc_update from user
+ * space. It is copied into a zeroed io_uring_rsrc_update2, so the fields
+ * beyond the smaller layout stay zero.
+ *
+ * Returns the result of the update, -EINVAL for nr_args == 0 or non-zero
+ * reserved fields, -EFAULT if the user copy fails.
+ */
+int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
+                            unsigned nr_args)
+{
+       const size_t legacy_size = sizeof(struct io_uring_rsrc_update);
+       struct io_uring_rsrc_update2 up2;
+
+       if (!nr_args)
+               return -EINVAL;
+
+       memset(&up2, 0, sizeof(up2));
+       if (copy_from_user(&up2, arg, legacy_size))
+               return -EFAULT;
+       if (up2.resv2 || up2.resv)
+               return -EINVAL;
+
+       return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up2, nr_args);
+}
+
+/*
+ * Resource update taking a full struct io_uring_rsrc_update2 from user
+ * space; @size must match that structure exactly.
+ *
+ * Returns the result of the update, -EINVAL for a size mismatch, a zero
+ * entry count or non-zero reserved fields, -EFAULT if the user copy fails.
+ */
+int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
+                           unsigned size, unsigned type)
+{
+       struct io_uring_rsrc_update2 req;
+
+       if (size != sizeof(req))
+               return -EINVAL;
+       if (copy_from_user(&req, arg, sizeof(req)))
+               return -EFAULT;
+
+       /* at least one entry is required and reserved fields must be zero */
+       if (req.resv || req.resv2 || !req.nr)
+               return -EINVAL;
+
+       return __io_register_rsrc_update(ctx, type, &req, req.nr);
+}
+
+/*
+ * Register a file or buffer table described by a struct
+ * io_uring_rsrc_register in user space; @size must match that structure.
+ *
+ * Returns the result of the type-specific registration, -EFAULT if the user
+ * copy fails, and -EINVAL for a size mismatch, a zero entry count, a
+ * non-zero resv2, unknown flags, a sparse registration that also passes a
+ * data pointer, or an unknown @type.
+ */
+__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
+                           unsigned int size, unsigned int type)
+{
+       struct io_uring_rsrc_register rr;
+
+       /* keep it extendible */
+       if (size != sizeof(rr))
+               return -EINVAL;
+
+       memset(&rr, 0, sizeof(rr));
+       if (copy_from_user(&rr, arg, size))
+               return -EFAULT;
+       if (rr.resv2 || !rr.nr)
+               return -EINVAL;
+       if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
+               return -EINVAL;
+
+       /* a sparse registration must not pass a data array, for any type */
+       if ((rr.flags & IORING_RSRC_REGISTER_SPARSE) && rr.data)
+               return -EINVAL;
+
+       if (type == IORING_RSRC_FILE)
+               return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
+                                            rr.nr, u64_to_user_ptr(rr.tags));
+       if (type == IORING_RSRC_BUFFER)
+               return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
+                                              rr.nr, u64_to_user_ptr(rr.tags));
+       return -EINVAL;
+}
+
+/*
+ * Prepare a files-update request from its SQE: ->offset comes from
+ * sqe->off, ->nr_args from sqe->len and ->arg from sqe->addr.
+ *
+ * Returns 0 on success, or -EINVAL if the request uses a fixed file or
+ * buffer select, sets rw_flags or splice_fd_in, or has a zero length.
+ */
+int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+       struct io_rsrc_update *up = io_kiocb_to_cmd(req);
+       u32 nr;
+
+       if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
+               return -EINVAL;
+       if (sqe->rw_flags || sqe->splice_fd_in)
+               return -EINVAL;
+
+       up->offset = READ_ONCE(sqe->off);
+       nr = READ_ONCE(sqe->len);
+       up->nr_args = nr;
+       if (!nr)
+               return -EINVAL;
+
+       up->arg = READ_ONCE(sqe->addr);
+       return 0;
+}
+
+/*
+ * Files update variant used when the offset is IORING_FILE_INDEX_ALLOC.
+ *
+ * For each of the up->nr_args fds in the user array: look the file up,
+ * install it with io_fixed_fd_install() and IORING_FILE_INDEX_ALLOC, and
+ * write the non-negative return value of the install (presumably the
+ * allocated slot index - confirm against io_fixed_fd_install()) back into
+ * the same array element. If that write-back faults, the slot is closed
+ * again through __io_close_fixed().
+ *
+ * Returns the number of entries fully processed if that is non-zero,
+ * otherwise the error of the first entry.
+ */
+static int io_files_update_with_index_alloc(struct io_kiocb *req,
+                                           unsigned int issue_flags)
+{
+       struct io_rsrc_update *up = io_kiocb_to_cmd(req);
+       __s32 __user *fds = u64_to_user_ptr(up->arg);
+       unsigned int done;
+       struct file *file;
+       int ret, fd;
+
+       if (!req->ctx->file_data)
+               return -ENXIO;
+
+       for (done = 0; done < up->nr_args; done++) {
+               if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
+                       ret = -EFAULT;
+                       break;
+               }
+
+               file = fget(fd);
+               if (!file) {
+                       ret = -EBADF;
+                       break;
+               }
+               /* NOTE(review): no fput() on failure here - verify the callee drops the file */
+               ret = io_fixed_fd_install(req, issue_flags, file,
+                                         IORING_FILE_INDEX_ALLOC);
+               if (ret < 0)
+                       break;
+               if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
+                       __io_close_fixed(req, issue_flags, ret);
+                       ret = -EFAULT;
+                       break;
+               }
+       }
+
+       if (done)
+               return done;
+       return ret;
+}
+
+/*
+ * Issue handler for the files-update request.
+ *
+ * With an offset of IORING_FILE_INDEX_ALLOC the index-allocating variant is
+ * used; otherwise a regular file table update is run under the ring submit
+ * lock. The outcome is stored as the request result and the request is
+ * marked failed if it is negative.
+ *
+ * Always returns IOU_OK.
+ */
+int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
+{
+       struct io_rsrc_update *up = io_kiocb_to_cmd(req);
+       struct io_ring_ctx *ctx = req->ctx;
+       /* members not named here (nr, tags, resv, resv2) are zero */
+       struct io_uring_rsrc_update2 up2 = {
+               .offset = up->offset,
+               .data   = up->arg,
+       };
+       int ret;
+
+       if (up->offset != IORING_FILE_INDEX_ALLOC) {
+               io_ring_submit_lock(ctx, issue_flags);
+               ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
+                                               &up2, up->nr_args);
+               io_ring_submit_unlock(ctx, issue_flags);
+       } else {
+               ret = io_files_update_with_index_alloc(req, issue_flags);
+       }
+
+       if (ret < 0)
+               req_set_fail(req);
+       io_req_set_res(req, ret, 0);
+       return IOU_OK;
+}
+
+/*
+ * Queue @rsrc for deferred release on @node.
+ *
+ * The tag stored for slot @idx of @data moves into the queued entry and the
+ * slot's tag is reset to 0.
+ *
+ * Returns 0 on success, -ENOMEM if the entry cannot be allocated (the tag
+ * slot is left untouched in that case).
+ */
+int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
+                         struct io_rsrc_node *node, void *rsrc)
+{
+       u64 *tag_slot = io_get_tag_slot(data, idx);
+       struct io_rsrc_put *put;
+
+       put = kzalloc(sizeof(*put), GFP_KERNEL);
+       if (!put)
+               return -ENOMEM;
+
+       put->rsrc = rsrc;
+       put->tag = *tag_slot;
+       *tag_slot = 0;
+
+       list_add(&put->list, &node->rsrc_list);
+       return 0;
+}
+
+/*
+ * Tear down the registered file table of @ctx.
+ *
+ * Unless IO_URING_SCM_ALL is defined, every registered file whose slot is
+ * not flagged FFS_SCM is put directly here. With CONFIG_UNIX, all skbs
+ * queued on the ring socket's receive queue are freed. Finally the file
+ * tables and the file rsrc data are freed and the ctx fields are reset.
+ */
+void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
+{
+#if !defined(IO_URING_SCM_ALL)
+       int i;
+
+       for (i = 0; i < ctx->nr_user_files; i++) {
+               struct file *file = io_file_from_index(&ctx->file_table, i);
+
+               if (!file)
+                       continue;
+               /* FFS_SCM slots are skipped; they are not put in this loop */
+               if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM)
+                       continue;
+               io_file_bitmap_clear(&ctx->file_table, i);
+               fput(file);
+       }
+#endif
+
+#if defined(CONFIG_UNIX)
+       if (ctx->ring_sock) {
+               struct sock *sock = ctx->ring_sock->sk;
+               struct sk_buff *skb;
+
+               while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
+                       kfree_skb(skb);
+       }
+#endif
+       io_free_file_tables(&ctx->file_table);
+       io_rsrc_data_free(ctx->file_data);
+       ctx->file_data = NULL;
+       ctx->nr_user_files = 0;
+}
+
+/*
+ * Unregister the file table of @ctx after quiescing its rsrc data.
+ *
+ * Returns 0 on success, -ENXIO if no files are registered, or the error
+ * from io_rsrc_ref_quiesce() (in which case the table stays registered).
+ */
+int io_sqe_files_unregister(struct io_ring_ctx *ctx)
+{
+       unsigned int saved_nr;
+       int ret;
+
+       if (!ctx->file_data)
+               return -ENXIO;
+
+       /*
+        * The quiesce may drop ->uring_lock. Advertise an empty table for
+        * its duration so that no new request starts using it meanwhile.
+        */
+       saved_nr = ctx->nr_user_files;
+       ctx->nr_user_files = 0;
+       ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
+       ctx->nr_user_files = saved_nr;
+       if (ret)
+               return ret;
+
+       __io_sqe_files_unregister(ctx);
+       return 0;
+}
+
+/*
+ * Ensure the UNIX gc is aware of our file set, so we are certain that
+ * the io_uring can be safely unregistered on process exit, even if we have
+ * loops in the file referencing. We account only files that can hold other
+ * files because otherwise they can't form a loop and so are not interesting
+ * for GC.
+ *
+ * Without CONFIG_UNIX this is a no-op. Returns 0 on success or when the file
+ * needs no accounting, -ENOMEM if the fp list or skb cannot be allocated.
+ */
+int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
+{
+#if defined(CONFIG_UNIX)
+       struct sock *sk = ctx->ring_sock->sk;
+       struct sk_buff_head *head = &sk->sk_receive_queue;
+       struct scm_fp_list *fpl;
+       struct sk_buff *skb;
+
+       if (likely(!io_file_need_scm(file)))
+               return 0;
+
+       /*
+        * See if we can merge this file into an existing skb SCM_RIGHTS
+        * file set. If there's no room, fall back to allocating a new skb
+        * and filling it in.
+        */
+       spin_lock_irq(&head->lock);
+       skb = skb_peek(head);
+       if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
+               __skb_unlink(skb, head);
+       else
+               skb = NULL;
+       spin_unlock_irq(&head->lock);
+
+       if (!skb) {
+               fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
+               if (!fpl)
+                       return -ENOMEM;
+
+               skb = alloc_skb(0, GFP_KERNEL);
+               if (!skb) {
+                       kfree(fpl);
+                       return -ENOMEM;
+               }
+
+               fpl->user = get_uid(current_user());
+               fpl->max = SCM_MAX_FD;
+               fpl->count = 0;
+
+               UNIXCB(skb).fp = fpl;
+               skb->sk = sk;
+               skb->destructor = unix_destruct_scm;
+               refcount_add(skb->truesize, &sk->sk_wmem_alloc);
+       }
+
+       /* append the file to the skb's fp list and put the skb (back) at the head */
+       fpl = UNIXCB(skb).fp;
+       fpl->fp[fpl->count++] = get_file(file);
+       unix_inflight(fpl->user, file);
+       skb_queue_head(head, skb);
+       /* the fp list took its own reference via get_file() above */
+       fput(file);
+#endif
+       return 0;
+}
+
+/*
+ * ->do_put() callback for registered files (see io_sqe_files_register()).
+ *
+ * Without CONFIG_UNIX, or for a file that needs no SCM accounting, this is
+ * a plain fput(). Otherwise the ring socket's receive queue is searched for
+ * the skb whose SCM_RIGHTS list holds the file; the file is removed from
+ * that list, marked not-inflight and put, and the skb is freed once its
+ * list becomes empty. Skbs dequeued during the search are put back on the
+ * socket queue afterwards.
+ */
+static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
+{
+       struct file *file = prsrc->file;
+#if defined(CONFIG_UNIX)
+       struct sock *sock = ctx->ring_sock->sk;
+       struct sk_buff_head list, *head = &sock->sk_receive_queue;
+       struct sk_buff *skb;
+       int i;
+
+       if (!io_file_need_scm(file)) {
+               fput(file);
+               return;
+       }
+
+       /* local queue collecting the skbs taken off the socket queue */
+       __skb_queue_head_init(&list);
+
+       /*
+        * Find the skb that holds this file in its SCM_RIGHTS. When found,
+        * remove this entry and rearrange the file array.
+        */
+       skb = skb_dequeue(head);
+       while (skb) {
+               struct scm_fp_list *fp;
+
+               fp = UNIXCB(skb).fp;
+               for (i = 0; i < fp->count; i++) {
+                       int left;
+
+                       if (fp->fp[i] != file)
+                               continue;
+
+                       unix_notinflight(fp->user, fp->fp[i]);
+                       /* close the gap left by the removed entry */
+                       left = fp->count - 1 - i;
+                       if (left) {
+                               memmove(&fp->fp[i], &fp->fp[i + 1],
+                                               left * sizeof(struct file *));
+                       }
+                       fp->count--;
+                       if (!fp->count) {
+                               kfree_skb(skb);
+                               skb = NULL;
+                       } else {
+                               __skb_queue_tail(&list, skb);
+                       }
+                       fput(file);
+                       /* file == NULL doubles as the "found" marker below */
+                       file = NULL;
+                       break;
+               }
+
+               if (!file)
+                       break;
+
+               /* not in this skb: keep it aside and look at the next one */
+               __skb_queue_tail(&list, skb);
+
+               skb = skb_dequeue(head);
+       }
+
+       /* return the skbs that were set aside to the socket queue */
+       if (skb_peek(&list)) {
+               spin_lock_irq(&head->lock);
+               while ((skb = __skb_dequeue(&list)) != NULL)
+                       __skb_queue_tail(head, skb);
+               spin_unlock_irq(&head->lock);
+       }
+#else
+       fput(file);
+#endif
+}
+
+/*
+ * Register a fixed file table of @nr_args slots.
+ *
+ * @arg is a user array of fds, or NULL for a fully sparse table; an fd of
+ * -1 leaves its slot empty. @tags optionally supplies one u64 tag per slot;
+ * an empty slot must not carry a non-zero tag.
+ *
+ * Returns 0 on success. Errors: -EBUSY if a table is already registered,
+ * -EINVAL for nr_args == 0 or a tagged empty slot, -EMFILE if nr_args
+ * exceeds IORING_MAX_FIXED_FILES or RLIMIT_NOFILE, -ENOMEM on allocation
+ * failure, -EFAULT if reading an fd fails, -EBADF for an invalid fd or an
+ * io_uring fd, or whatever io_rsrc_data_alloc()/io_scm_file_account()
+ * return. On failure inside the loop the partially built table is torn
+ * down again.
+ */
+int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
+                         unsigned nr_args, u64 __user *tags)
+{
+       __s32 __user *fds = (__s32 __user *) arg;
+       struct file *file;
+       int fd, ret;
+       unsigned i;
+
+       if (ctx->file_data)
+               return -EBUSY;
+       if (!nr_args)
+               return -EINVAL;
+       if (nr_args > IORING_MAX_FIXED_FILES)
+               return -EMFILE;
+       if (nr_args > rlimit(RLIMIT_NOFILE))
+               return -EMFILE;
+       ret = io_rsrc_node_switch_start(ctx);
+       if (ret)
+               return ret;
+       ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
+                                &ctx->file_data);
+       if (ret)
+               return ret;
+
+       if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
+               io_rsrc_data_free(ctx->file_data);
+               ctx->file_data = NULL;
+               return -ENOMEM;
+       }
+
+       /* nr_user_files tracks i, so the fail path only sees populated slots */
+       for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
+               struct io_fixed_file *file_slot;
+
+               if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
+                       ret = -EFAULT;
+                       goto fail;
+               }
+               /* allow sparse sets */
+               if (!fds || fd == -1) {
+                       ret = -EINVAL;
+                       if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
+                               goto fail;
+                       continue;
+               }
+
+               file = fget(fd);
+               /* also the error returned for an io_uring fd just below */
+               ret = -EBADF;
+               if (unlikely(!file))
+                       goto fail;
+
+               /*
+                * Don't allow io_uring instances to be registered. If UNIX
+                * isn't enabled, then this causes a reference cycle and this
+                * instance can never get freed. If UNIX is enabled we'll
+                * handle it just fine, but there's still no point in allowing
+                * a ring fd as it doesn't support regular read/write anyway.
+                */
+               if (io_is_uring_fops(file)) {
+                       fput(file);
+                       goto fail;
+               }
+               ret = io_scm_file_account(ctx, file);
+               if (ret) {
+                       fput(file);
+                       goto fail;
+               }
+               file_slot = io_fixed_file_slot(&ctx->file_table, i);
+               io_fixed_file_set(file_slot, file);
+               io_file_bitmap_set(&ctx->file_table, i);
+       }
+
+       io_rsrc_node_switch(ctx, NULL);
+       return 0;
+fail:
+       __io_sqe_files_unregister(ctx);
+       return ret;
+}
+
+/* Put callback for buffer resources: unmap the buffer and clear the slot. */
+static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
+{
+       struct io_mapped_ubuf **slot = &prsrc->buf;
+
+       io_buffer_unmap(ctx, slot);
+       *slot = NULL;
+}
+
+/*
+ * Tear down the registered buffer table: unmap every entry, free the table
+ * and its rsrc data, then reset the ctx fields to the unregistered state.
+ */
+void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
+{
+       unsigned int idx = 0;
+
+       while (idx < ctx->nr_user_bufs) {
+               io_buffer_unmap(ctx, &ctx->user_bufs[idx]);
+               idx++;
+       }
+
+       kfree(ctx->user_bufs);
+       io_rsrc_data_free(ctx->buf_data);
+
+       ctx->nr_user_bufs = 0;
+       ctx->buf_data = NULL;
+       ctx->user_bufs = NULL;
+}
+
+/*
+ * Unregister the buffer table after quiescing its users.
+ *
+ * Returns -ENXIO if no buffer data is registered, the quiesce error if
+ * quiescing fails (the table is then left in place), or 0 once the table
+ * has been torn down.
+ */
+int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
+{
+       unsigned saved_nr;
+       int ret;
+
+       if (!ctx->buf_data)
+               return -ENXIO;
+
+       /*
+        * The quiesce may drop ->uring_lock. Hide the table by zeroing the
+        * count for that window so new requests cannot pick up a buffer,
+        * then put the real count back.
+        */
+       saved_nr = ctx->nr_user_bufs;
+       ctx->nr_user_bufs = 0;
+       ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
+       ctx->nr_user_bufs = saved_nr;
+       if (ret)
+               return ret;
+
+       __io_sqe_buffers_unregister(ctx);
+       return 0;
+}
+
+/*
+ * Has the compound page headed by @hpage already been seen, either in the
+ * first @nr_pages entries of @pages or in a buffer already present in
+ * ctx->user_bufs?
+ *
+ * This is a linear scan, which is tolerable because it only runs at
+ * registration time, and the caller caches the last compound head so the
+ * full search is only needed when that cache misses. Spotting repeats lets
+ * a huge page be accounted once at its full size instead of once per
+ * constituent page.
+ */
+static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
+                                 int nr_pages, struct page *hpage)
+{
+       int buf, seg;
+
+       /* pages of the buffer currently being registered */
+       for (seg = 0; seg < nr_pages; seg++) {
+               struct page *page = pages[seg];
+
+               if (PageCompound(page) && compound_head(page) == hpage)
+                       return true;
+       }
+
+       /* pages of buffers registered earlier */
+       for (buf = 0; buf < ctx->nr_user_bufs; buf++) {
+               struct io_mapped_ubuf *imu = ctx->user_bufs[buf];
+
+               for (seg = 0; seg < imu->nr_bvecs; seg++) {
+                       struct page *page = imu->bvec[seg].bv_page;
+
+                       if (PageCompound(page) && compound_head(page) == hpage)
+                               return true;
+               }
+       }
+
+       return false;
+}
+
+/*
+ * Work out how many pages @imu must be charged for and charge them.
+ *
+ * A non-compound page counts as one page. A compound page counts as the full
+ * size of its head page, but only the first time that head is seen:
+ * *@last_hpage short-circuits runs of pages sharing a head, and
+ * headpage_already_acct() catches heads seen earlier.
+ *
+ * Returns 0 on success or when nothing needs charging, otherwise the error
+ * from io_account_mem(), in which case imu->acct_pages is reset to 0.
+ */
+static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
+                                int nr_pages, struct io_mapped_ubuf *imu,
+                                struct page **last_hpage)
+{
+       int idx, ret;
+
+       imu->acct_pages = 0;
+       for (idx = 0; idx < nr_pages; idx++) {
+               struct page *hpage;
+
+               if (!PageCompound(pages[idx])) {
+                       imu->acct_pages++;
+                       continue;
+               }
+
+               hpage = compound_head(pages[idx]);
+               if (hpage == *last_hpage)
+                       continue;
+               *last_hpage = hpage;
+               if (!headpage_already_acct(ctx, pages, idx, hpage))
+                       imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
+       }
+
+       if (!imu->acct_pages)
+               return 0;
+
+       ret = io_account_mem(ctx, imu->acct_pages);
+       if (ret)
+               imu->acct_pages = 0;
+       return ret;
+}
+
+/*
+ * Long-term pin the user pages covering [@ubuf, @ubuf + @len) for writing.
+ *
+ * On success returns a kvmalloc'ed array of the pinned pages, which the
+ * caller must kvfree(), and stores the page count in *@npages. On failure
+ * returns an ERR_PTR(): -ENOMEM if an array cannot be allocated,
+ * -EOPNOTSUPP if a page sits in a file backed vma that is neither shmem nor
+ * hugetlbfs, the pin_user_pages() error if that is negative, or -EFAULT if
+ * fewer pages than requested were pinned. Any pages that were pinned are
+ * released again before an error is returned.
+ */
+struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
+{
+       unsigned long first_pg = ubuf >> PAGE_SHIFT;
+       unsigned long end_pg = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       unsigned long nr_pages = end_pg - first_pg;
+       struct vm_area_struct **vmas = NULL;
+       struct page **pages;
+       int idx, pinned, err = -ENOMEM;
+
+       pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
+       if (!pages)
+               goto out;
+
+       vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
+                             GFP_KERNEL);
+       if (!vmas)
+               goto out;
+
+       mmap_read_lock(current->mm);
+       pinned = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
+                               pages, vmas);
+       if (pinned != nr_pages) {
+               err = pinned < 0 ? pinned : -EFAULT;
+       } else {
+               err = 0;
+               /* file backed memory is not supported */
+               for (idx = 0; idx < nr_pages; idx++) {
+                       struct vm_area_struct *vma = vmas[idx];
+
+                       if (vma_is_shmem(vma) || !vma->vm_file ||
+                           is_file_hugepages(vma->vm_file))
+                               continue;
+                       err = -EOPNOTSUPP;
+                       break;
+               }
+               *npages = nr_pages;
+       }
+       mmap_read_unlock(current->mm);
+
+       /*
+        * After a partial pin, or when a file backed vma was found, give
+        * back whatever pages were pinned.
+        */
+       if (err && pinned > 0)
+               unpin_user_pages(pages, pinned);
+out:
+       kvfree(vmas);
+       if (err < 0) {
+               kvfree(pages);
+               pages = ERR_PTR(err);
+       }
+       return pages;
+}
+
+/*
+ * Pin the user memory described by @iov and build the io_mapped_ubuf for it,
+ * one bvec entry per pinned page.
+ *
+ * A NULL iov_base denotes a sparse slot: *@pimu is set to ctx->dummy_ubuf and
+ * nothing is pinned. Otherwise *@pimu receives the new mapping on success and
+ * is left NULL on failure. @last_hpage is the compound-head cache passed on
+ * to io_buffer_account_pin().
+ *
+ * Returns 0 on success, the io_pin_pages() error, -ENOMEM if the mapping
+ * cannot be allocated, or the accounting error. Every failure after a
+ * successful pin releases the pinned pages, including the allocation
+ * failure of the mapping itself.
+ */
+static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
+                                 struct io_mapped_ubuf **pimu,
+                                 struct page **last_hpage)
+{
+       struct io_mapped_ubuf *imu = NULL;
+       struct page **pages = NULL;
+       unsigned long off;
+       size_t size;
+       int ret, nr_pages, i;
+
+       if (!iov->iov_base) {
+               *pimu = ctx->dummy_ubuf;
+               return 0;
+       }
+
+       *pimu = NULL;
+       ret = -ENOMEM;
+
+       pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
+                               &nr_pages);
+       if (IS_ERR(pages)) {
+               ret = PTR_ERR(pages);
+               /* nothing is pinned; keep the error path from touching it */
+               pages = NULL;
+               goto done;
+       }
+
+       imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
+       if (!imu)
+               goto done;
+
+       ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
+       if (ret)
+               goto done;
+
+       /* only the first page starts at a non-zero offset */
+       off = (unsigned long) iov->iov_base & ~PAGE_MASK;
+       size = iov->iov_len;
+       for (i = 0; i < nr_pages; i++) {
+               size_t vec_len;
+
+               vec_len = min_t(size_t, size, PAGE_SIZE - off);
+               imu->bvec[i].bv_page = pages[i];
+               imu->bvec[i].bv_len = vec_len;
+               imu->bvec[i].bv_offset = off;
+               off = 0;
+               size -= vec_len;
+       }
+       /* store original address for later verification */
+       imu->ubuf = (unsigned long) iov->iov_base;
+       imu->ubuf_end = imu->ubuf + iov->iov_len;
+       imu->nr_bvecs = nr_pages;
+       *pimu = imu;
+       ret = 0;
+done:
+       if (ret) {
+               kvfree(imu);
+               /* pages is non-NULL only when io_pin_pages() succeeded */
+               if (pages)
+                       unpin_user_pages(pages, nr_pages);
+       }
+       kvfree(pages);
+       return ret;
+}
+
+/* Allocate a zeroed ctx->user_bufs array of @nr_args entries; 0 or -ENOMEM. */
+static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
+{
+       ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
+       if (!ctx->user_bufs)
+               return -ENOMEM;
+       return 0;
+}
+
+/*
+ * Register @nr_args fixed buffers for @ctx.
+ *
+ * @arg is the user iovec array read through io_copy_iov(). When it is NULL
+ * every slot is treated as a zeroed iovec, i.e. sparse. A slot with a NULL
+ * iov_base must carry a zero tag. @tags is handed to io_rsrc_data_alloc().
+ *
+ * Returns 0 on success, -EBUSY if ctx->user_bufs is already set, -EINVAL for
+ * zero or more than IORING_MAX_REG_BUFFERS entries or a tagged sparse slot,
+ * or the error of a failing helper. If a slot fails, the partially built
+ * table is torn down through __io_sqe_buffers_unregister().
+ */
+int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
+                           unsigned int nr_args, u64 __user *tags)
+{
+       struct io_rsrc_data *data;
+       struct page *last_hpage = NULL;
+       struct iovec iov;
+       int idx, ret;
+
+       BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
+
+       if (ctx->user_bufs)
+               return -EBUSY;
+       if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
+               return -EINVAL;
+
+       ret = io_rsrc_node_switch_start(ctx);
+       if (ret)
+               return ret;
+       ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
+       if (ret)
+               return ret;
+       ret = io_buffers_map_alloc(ctx, nr_args);
+       if (ret) {
+               io_rsrc_data_free(data);
+               return ret;
+       }
+
+       for (idx = 0; idx < nr_args; idx++, ctx->nr_user_bufs++) {
+               if (!arg) {
+                       memset(&iov, 0, sizeof(iov));
+               } else {
+                       ret = io_copy_iov(ctx, &iov, arg, idx);
+                       if (!ret)
+                               ret = io_buffer_validate(&iov);
+                       if (ret)
+                               break;
+               }
+
+               /* a sparse slot may not be tagged */
+               if (!iov.iov_base && *io_get_tag_slot(data, idx)) {
+                       ret = -EINVAL;
+                       break;
+               }
+
+               ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[idx],
+                                            &last_hpage);
+               if (ret)
+                       break;
+       }
+
+       WARN_ON_ONCE(ctx->buf_data);
+
+       /* publish the data first so the teardown below can free it */
+       ctx->buf_data = data;
+       if (ret) {
+               __io_sqe_buffers_unregister(ctx);
+               return ret;
+       }
+
+       io_rsrc_node_switch(ctx, NULL);
+       return 0;
+}
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
new file mode 100644 (file)
index 0000000..872c863
--- /dev/null
@@ -0,0 +1,155 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef IOU_RSRC_H
+#define IOU_RSRC_H
+
+#include <net/af_unix.h>
+
+#define IO_RSRC_TAG_TABLE_SHIFT        (PAGE_SHIFT - 3)        /* log2 of tag slots per table */
+#define IO_RSRC_TAG_TABLE_MAX  (1U << IO_RSRC_TAG_TABLE_SHIFT) /* tag slots per table */
+#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1)     /* index -> offset within a table */
+
+enum { /* NOTE(review): presumably the 'type' passed to io_register_rsrc*() - confirm */
+       IORING_RSRC_FILE                = 0,    /* file resources */
+       IORING_RSRC_BUFFER              = 1,    /* buffer resources */
+};
+
+struct io_rsrc_put {   /* argument handed to an rsrc_put_fn callback */
+       struct list_head list;  /* NOTE(review): presumably links into io_rsrc_node.rsrc_list - confirm */
+       u64 tag;                /* tag value recorded for this resource */
+       union {                 /* one pointer, viewed per resource kind */
+               void *rsrc;
+               struct file *file;
+               struct io_mapped_ubuf *buf;     /* unmapped and cleared by io_rsrc_buf_put() */
+       };
+};
+
+/* Put callback type stored in io_rsrc_data.do_put, e.g. io_rsrc_buf_put(). */
+typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
+
+struct io_rsrc_data {
+       struct io_ring_ctx              *ctx;
+
+       u64                             **tags;         /* two-level tag table, see io_get_tag_slot() */
+       unsigned int                    nr;             /* NOTE(review): presumably the number of slots - confirm */
+       rsrc_put_fn                     *do_put;        /* put callback for this resource kind */
+       atomic_t                        refs;
+       struct completion               done;
+       bool                            quiesce;        /* NOTE(review): presumably set while io_rsrc_ref_quiesce() runs - confirm */
+};
+
+struct io_rsrc_node {
+       struct percpu_ref               refs;           /* taken/dropped by io_req_set_rsrc_node()/io_rsrc_put_node() */
+       struct list_head                node;
+       struct list_head                rsrc_list;      /* NOTE(review): presumably the queued io_rsrc_put entries - confirm */
+       struct io_rsrc_data             *rsrc_data;
+       struct llist_node               llist;
+       bool                            done;
+};
+
+/* rsrc node and reference management */
+void io_rsrc_put_work(struct work_struct *work);
+void io_rsrc_refs_refill(struct io_ring_ctx *ctx);
+void io_wait_rsrc_data(struct io_rsrc_data *data);
+void io_rsrc_node_destroy(struct io_rsrc_node *ref_node);
+void io_rsrc_refs_drop(struct io_ring_ctx *ctx);
+int io_rsrc_node_switch_start(struct io_ring_ctx *ctx);
+int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
+                         struct io_rsrc_node *node, void *rsrc);
+void io_rsrc_node_switch(struct io_ring_ctx *ctx,
+                        struct io_rsrc_data *data_to_kill);
+
+
+/* registration and teardown of the fixed buffer and fixed file tables */
+void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
+int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
+int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
+                           unsigned int nr_args, u64 __user *tags);
+void __io_sqe_files_unregister(struct io_ring_ctx *ctx);
+int io_sqe_files_unregister(struct io_ring_ctx *ctx);
+int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
+                         unsigned nr_args, u64 __user *tags);
+
+/* out-of-line part of io_scm_file_account(), called when the file needs SCM */
+int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file);
+
+/*
+ * Does @filp need SCM accounting? Never without CONFIG_UNIX, always when
+ * IO_URING_SCM_ALL is defined, otherwise only if it has a unix socket.
+ */
+static inline bool io_file_need_scm(struct file *filp)
+{
+#if !defined(CONFIG_UNIX)
+       return false;
+#elif defined(IO_URING_SCM_ALL)
+       return true;
+#else
+       return !!unix_get_socket(filp);
+#endif
+}
+
+/*
+ * Account @file for SCM if it needs it. Returns 0 when no accounting is
+ * required, otherwise the result of __io_scm_file_account().
+ */
+static inline int io_scm_file_account(struct io_ring_ctx *ctx,
+                                     struct file *file)
+{
+       if (unlikely(io_file_need_scm(file)))
+               return __io_scm_file_account(ctx, file);
+       return 0;
+}
+
+/*
+ * Registration entry points taking a user argument block.
+ * NOTE(review): 'type' presumably takes IORING_RSRC_FILE or
+ * IORING_RSRC_BUFFER - confirm against the definitions.
+ */
+int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
+                            unsigned nr_args);
+int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
+                           unsigned size, unsigned type);
+int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
+                       unsigned int size, unsigned int type);
+
+/* Drop @nr references from @node's percpu refcount. */
+static inline void io_rsrc_put_node(struct io_rsrc_node *node, int nr)
+{
+       percpu_ref_put_many(&node->refs, nr);
+}
+
+/* Drop the single rsrc node reference held by @req, if it holds one. */
+static inline void io_req_put_rsrc(struct io_kiocb *req)
+{
+       struct io_rsrc_node *node = req->rsrc_node;
+
+       if (!node)
+               return;
+       io_rsrc_put_node(node, 1);
+}
+
+/*
+ * Release @req's rsrc node reference with ->uring_lock held. A reference on
+ * the ctx's current node goes back into ctx->rsrc_cached_refs; a reference
+ * on any other node is dropped on that node directly.
+ */
+static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
+                                         struct io_ring_ctx *ctx)
+       __must_hold(&ctx->uring_lock)
+{
+       struct io_rsrc_node *node = req->rsrc_node;
+
+       if (!node)
+               return;
+
+       if (node != ctx->rsrc_node) {
+               io_rsrc_put_node(node, 1);
+               return;
+       }
+       ctx->rsrc_cached_refs++;
+}
+
+/*
+ * Attach the ctx's current rsrc node to @req unless it already has one.
+ *
+ * When IO_URING_F_UNLOCKED is set a percpu reference is taken on the node
+ * directly. Otherwise ->uring_lock must be held and the reference comes out
+ * of ctx->rsrc_cached_refs, which is refilled once it drops below zero.
+ */
+static inline void io_req_set_rsrc_node(struct io_kiocb *req,
+                                       struct io_ring_ctx *ctx,
+                                       unsigned int issue_flags)
+{
+       if (req->rsrc_node)
+               return;
+
+       req->rsrc_node = ctx->rsrc_node;
+
+       if (issue_flags & IO_URING_F_UNLOCKED) {
+               percpu_ref_get(&req->rsrc_node->refs);
+               return;
+       }
+
+       lockdep_assert_held(&ctx->uring_lock);
+       ctx->rsrc_cached_refs--;
+       if (unlikely(ctx->rsrc_cached_refs < 0))
+               io_rsrc_refs_refill(ctx);
+}
+
+/*
+ * Return the address of the tag slot for resource index @idx. The upper bits
+ * of @idx select a table in data->tags, the low IO_RSRC_TAG_TABLE_SHIFT bits
+ * select the slot within that table.
+ */
+static inline u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
+{
+       u64 *table = data->tags[idx >> IO_RSRC_TAG_TABLE_SHIFT];
+
+       return &table[idx & IO_RSRC_TAG_TABLE_MASK];
+}
+
+/*
+ * NOTE(review): going by the signatures these look like the issue and prep
+ * handlers of the files-update request - confirm against the opcode table.
+ */
+int io_files_update(struct io_kiocb *req, unsigned int issue_flags);
+int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+#endif /* IOU_RSRC_H */