fs/io_uring.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Shared application/kernel submission and completion ring pairs, for
   4  * supporting fast/efficient IO.
   5  *
   6  * A note on the read/write ordering memory barriers that are matched between
   7  * the application and kernel side.
   8  *
   9  * After the application reads the CQ ring tail, it must use an
  10  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
  11  * before writing the tail (using smp_load_acquire to read the tail will
  12  * do). It also needs a smp_mb() before updating CQ head (ordering the
  13  * entry load(s) with the head store), pairing with an implicit barrier
  14  * through a control-dependency in io_get_cqe (smp_store_release to
  15  * store head will do). Failure to do so could lead to reading invalid
  16  * CQ entries.
  17  *
  18  * Likewise, the application must use an appropriate smp_wmb() before
  19  * writing the SQ tail (ordering SQ entry stores with the tail store),
  20  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
  21  * to store the tail will do). And it needs a barrier ordering the SQ
  22  * head load before writing new SQ entries (smp_load_acquire to read
  23  * head will do).
  24  *
  25  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
  26  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
  27  * updating the SQ tail; a full memory barrier smp_mb() is needed
  28  * between.
  29  *
  30  * Also see the examples in the liburing library:
  31  *
  32  *      git://git.kernel.dk/liburing
  33  *
   34  * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
   35  * on data shared between the kernel and application. This is done both
   36  * for ordering purposes and to ensure that once a value is loaded from
   37  * data that the application could potentially modify, it remains stable.
  38  *
  39  * Copyright (C) 2018-2019 Jens Axboe
  40  * Copyright (c) 2018-2019 Christoph Hellwig
  41  */
  42 #include <linux/kernel.h>
  43 #include <linux/init.h>
  44 #include <linux/errno.h>
  45 #include <linux/syscalls.h>
  46 #include <linux/compat.h>
  47 #include <net/compat.h>
  48 #include <linux/refcount.h>
  49 #include <linux/uio.h>
  50 #include <linux/bits.h>
  51
  52 #include <linux/sched/signal.h>
  53 #include <linux/fs.h>
  54 #include <linux/file.h>
  55 #include <linux/fdtable.h>
  56 #include <linux/mm.h>
  57 #include <linux/mman.h>
  58 #include <linux/percpu.h>
  59 #include <linux/slab.h>
  60 #include <linux/blk-mq.h>
  61 #include <linux/bvec.h>
  62 #include <linux/net.h>
  63 #include <net/sock.h>
  64 #include <net/af_unix.h>
  65 #include <net/scm.h>
  66 #include <linux/anon_inodes.h>
  67 #include <linux/sched/mm.h>
  68 #include <linux/uaccess.h>
  69 #include <linux/nospec.h>
  70 #include <linux/sizes.h>
  71 #include <linux/hugetlb.h>
  72 #include <linux/highmem.h>
  73 #include <linux/namei.h>
  74 #include <linux/fsnotify.h>
  75 #include <linux/fadvise.h>
  76 #include <linux/eventpoll.h>
  77 #include <linux/splice.h>
  78 #include <linux/task_work.h>
  79 #include <linux/pagemap.h>
  80 #include <linux/io_uring.h>
  81 #include <linux/audit.h>
  82 #include <linux/security.h>
  83
  84 #define CREATE_TRACE_POINTS
  85 #include <trace/events/io_uring.h>
  86
  87 #include <uapi/linux/io_uring.h>
  88
  89 #include "internal.h"
  90 #include "io-wq.h"
   91
      /*
       * Ring size limits. IORING_MAX_CQ_ENTRIES is defined as twice
       * IORING_MAX_ENTRIES (i.e. 65536).
       * NOTE(review): the users of IORING_SQPOLL_CAP_ENTRIES_VALUE are not in
       * this part of the file; by name it caps entries for SQPOLL - confirm.
       */
   92 #define IORING_MAX_ENTRIES      32768
   93 #define IORING_MAX_CQ_ENTRIES   (2 * IORING_MAX_ENTRIES)
   94 #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
   95
   96 /* only define max: 32768 fixed files; restrictions are bounded by the
       * sum of the three *_LAST constants below */
   97 #define IORING_MAX_FIXED_FILES  (1U << 15)
   98 #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
   99                                  IORING_REGISTER_LAST + IORING_OP_LAST)
  100
      /*
       * Resource tag table geometry: each table holds
       * 1 << (PAGE_SHIFT - 3) entries and IO_RSRC_TAG_TABLE_MASK extracts the
       * index within one table.
       * NOTE(review): with 8-byte (u64) tags this presumably makes one table
       * exactly one page - confirm against the allocation code.
       */
  101 #define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3)
  102 #define IO_RSRC_TAG_TABLE_MAX   (1U << IO_RSRC_TAG_TABLE_SHIFT)
  103 #define IO_RSRC_TAG_TABLE_MASK  (IO_RSRC_TAG_TABLE_MAX - 1)
  104
      /* Upper bound (16384) on registered buffers. */
  105 #define IORING_MAX_REG_BUFFERS  (1U << 14)
  106
      /* SQE flags grouped as "common"; SQE_VALID_FLAGS below extends this set. */
  107 #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
  108                           IOSQE_IO_HARDLINK | IOSQE_ASYNC)
  109
      /* The common flags plus buffer select, drain and CQE-skip-on-success. */
  110 #define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
  111                         IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)
  112
      /*
       * Request flags grouped under "clean".
       * NOTE(review): presumably the flags that require cleanup work when a
       * request is released - confirm at the users of this mask.
       */
  113 #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
  114                                 REQ_F_POLLED | REQ_F_CREDS | REQ_F_ASYNC_DATA)
  115
      /* Superset of IO_REQ_CLEAN_FLAGS that adds refcount and link flags. */
  116 #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
  117                                  IO_REQ_CLEAN_FLAGS)
  118
      /*
       * 1024. NOTE(review): by name, the batch size for cached task-context
       * references (cf. io_uring_task::cached_refs) - confirm.
       */
  119 #define IO_TCTX_REFS_CACHE_NR   (1U << 10)
  120
      /*
       * Head/tail index pair for one ring; struct io_rings embeds one instance
       * each for the SQ and the CQ. Both members are marked
       * ____cacheline_aligned_in_smp.
       */
  121 struct io_uring {
  122         u32 head ____cacheline_aligned_in_smp;
  123         u32 tail ____cacheline_aligned_in_smp;
  124 };
 125
  126 /*
  127  * This data is shared with the application through the mmap at offsets
  128  * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
  129  *
  130  * The offsets to the member fields are published through struct
  131  * io_sqring_offsets when calling io_uring_setup.
       *
       * Layout: the SQ/CQ head and tail pairs come first, followed by the
       * constant masks and sizes, the counters and flag words, and finally
       * the CQE array as a cacheline-aligned flexible array member.
  132  */
  133 struct io_rings {
  134         /*
  135          * Head and tail offsets into the ring; the offsets need to be
  136          * masked to get valid indices.
  137          *
  138          * The kernel controls head of the sq ring and the tail of the cq ring,
  139          * and the application controls tail of the sq ring and the head of the
  140          * cq ring.
  141          */
  142         struct io_uring         sq, cq;
  143         /*
  144          * Bitmasks to apply to head and tail offsets (constant, equals
  145          * ring_entries - 1)
  146          */
  147         u32                     sq_ring_mask, cq_ring_mask;
  148         /* Ring sizes (constant, power of 2) */
  149         u32                     sq_ring_entries, cq_ring_entries;
  150         /*
  151          * Number of invalid entries dropped by the kernel due to
  152          * invalid index stored in array
  153          *
  154          * Written by the kernel, shouldn't be modified by the
  155          * application (i.e. get number of "new events" by comparing to
  156          * cached value).
  157          *
  158          * After a new SQ head value was read by the application this
  159          * counter includes all submissions that were dropped reaching
  160          * the new SQ head (and possibly more).
  161          */
  162         u32                     sq_dropped;
  163         /*
  164          * Runtime SQ flags
  165          *
  166          * Written by the kernel, shouldn't be modified by the
  167          * application.
  168          *
  169          * The application needs a full memory barrier before checking
  170          * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
  171          */
  172         u32                     sq_flags;
  173         /*
  174          * Runtime CQ flags
  175          *
  176          * Written by the application, shouldn't be modified by the
  177          * kernel.
  178          */
  179         u32                     cq_flags;
  180         /*
  181          * Number of completion events lost because the queue was full;
  182          * this should be avoided by the application by making sure
  183          * there are not more requests pending than there is space in
  184          * the completion queue.
  185          *
  186          * Written by the kernel, shouldn't be modified by the
  187          * application (i.e. get number of "new events" by comparing to
  188          * cached value).
  189          *
  190          * As completion events come in out of order this counter is not
  191          * ordered with any other data.
  192          */
  193         u32                     cq_overflow;
  194         /*
  195          * Ring buffer of completion events.
  196          *
  197          * The kernel writes completion events fresh every time they are
  198          * produced, so the application is allowed to modify pending
  199          * entries.
  200          */
  201         struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
  202 };
  203
      /*
       * Flag bits with the IO_URING_F_ prefix. The first two use the low bits;
       * IO_URING_F_NONBLOCK is placed in the sign bit of an int (INT_MIN) so
       * that it can be tested with a sign check.
       * NOTE(review): the consumers of these flags are outside this view.
       */
  204 enum io_uring_cmd_flags {
  205         IO_URING_F_COMPLETE_DEFER       = 1,
  206         IO_URING_F_UNLOCKED             = 2,
  207         /* int's last bit, sign checks are usually faster than a bit test */
  208         IO_URING_F_NONBLOCK             = INT_MIN,
  209 };
  210
      /*
       * Descriptor for one mapped user buffer: an address range (ubuf,
       * ubuf_end) and nr_bvecs bio_vec entries stored in the trailing
       * flexible array.
       * NOTE(review): presumably the range is [ubuf, ubuf_end) and acct_pages
       * is the number of pages charged for the mapping - confirm.
       */
  211 struct io_mapped_ubuf {
  212         u64             ubuf;
  213         u64             ubuf_end;
  214         unsigned int    nr_bvecs;
  215         unsigned long   acct_pages;
  216         struct bio_vec  bvec[];
  217 };
  218
      /* Forward declaration; the full definition follows later in this file. */
  219 struct io_ring_ctx;
  220
      /*
       * A completion entry paired with list linkage.
       * NOTE(review): presumably queued on io_ring_ctx::cq_overflow_list when
       * the CQ ring has no room - confirm.
       */
  221 struct io_overflow_cqe {
  222         struct io_uring_cqe cqe;
  223         struct list_head list;
  224 };
  225
      /* One slot of a fixed file table (see struct io_file_table). */
  226 struct io_fixed_file {
  227         /* file * with additional FFS_* flags */
  228         unsigned long file_ptr;
  229 };
  230
      /*
       * Record describing one resource to be put; this is the argument type of
       * rsrc_put_fn. The union gives three views of the same pointer: opaque,
       * file, or mapped buffer.
       */
  231 struct io_rsrc_put {
  232         struct list_head list;
  233         u64 tag;
  234         union {
  235                 void *rsrc;
  236                 struct file *file;
  237                 struct io_mapped_ubuf *buf;
  238         };
  239 };
  240
      /*
       * Table of fixed file slots.
       * NOTE(review): the slot count is not stored here; presumably it is
       * io_ring_ctx::nr_user_files - confirm.
       */
  241 struct io_file_table {
  242         struct io_fixed_file *files;
  243 };
  244
      /*
       * Resource node: a percpu reference count, two list linkages (node,
       * rsrc_list), a lockless-list linkage (llist), a back-pointer to the
       * owning io_rsrc_data, and a 'done' flag.
       * NOTE(review): rsrc_list presumably collects io_rsrc_put entries and
       * llist presumably links into io_ring_ctx::rsrc_put_llist - confirm.
       */
  245 struct io_rsrc_node {
  246         struct percpu_ref               refs;
  247         struct list_head                node;
  248         struct list_head                rsrc_list;
  249         struct io_rsrc_data             *rsrc_data;
  250         struct llist_node               llist;
  251         bool                            done;
  252 };
  253
      /*
       * Function type for putting one resource: receives the ring context and
       * the io_rsrc_put record. Stored as io_rsrc_data::do_put.
       */
  254 typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
  255
      /*
       * Bookkeeping for one class of registered resources: owning context, a
       * two-level table of u64 tags, the entry count, the put callback, an
       * atomic reference count, a completion, and a 'quiesce' flag.
       * NOTE(review): tags is presumably indexed via the IO_RSRC_TAG_TABLE_*
       * macros - confirm.
       */
  256 struct io_rsrc_data {
  257         struct io_ring_ctx              *ctx;
  258
  259         u64                             **tags;
  260         unsigned int                    nr;
  261         rsrc_put_fn                     *do_put;
  262         atomic_t                        refs;
  263         struct completion               done;
  264         bool                            quiesce;
  265 };
  266
      /*
       * A list of buffers identified by a 16-bit group id (bgid).
       * NOTE(review): buf_list presumably heads the struct io_buffer entries
       * of this group, and 'list' links this object into a hash bucket of
       * io_ring_ctx::io_buffers - confirm.
       */
  267 struct io_buffer_list {
  268         struct list_head list;
  269         struct list_head buf_list;
  270         __u16 bgid;
  271 };
  272
      /*
       * One buffer: list linkage, address, length, buffer id (bid) and buffer
       * group id (bgid).
       * NOTE(review): addr is presumably a user-space address - confirm.
       */
  273 struct io_buffer {
  274         struct list_head list;
  275         __u64 addr;
  276         __u32 len;
  277         __u16 bid;
  278         __u16 bgid;
  279 };
  280
      /*
       * Restriction state: one bitmap with a bit per register opcode
       * (IORING_REGISTER_LAST bits), one with a bit per SQE opcode
       * (IORING_OP_LAST bits), two SQE flag masks, and a flag recording
       * whether restrictions have been registered.
       */
  281 struct io_restriction {
  282         DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
  283         DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
  284         u8 sqe_flags_allowed;
  285         u8 sqe_flags_required;
  286         bool registered;
  287 };
  288
      /*
       * Values 0 and 1.
       * NOTE(review): presumably bit numbers within io_sq_data::state -
       * confirm at the set_bit()/test_bit() call sites.
       */
  289 enum {
  290         IO_SQ_THREAD_SHOULD_STOP = 0,
  291         IO_SQ_THREAD_SHOULD_PARK,
  292 };
  293
      /*
       * State for an SQ thread ("sqd"). It is reference counted and keeps a
       * list of the ring contexts using it, so one instance can be associated
       * with more than one ctx.
       * NOTE(review): 'lock' presumably protects ctx_list and 'thread';
       * 'exited' is presumably completed when the thread terminates - confirm.
       */
  294 struct io_sq_data {
  295         refcount_t              refs;
  296         atomic_t                park_pending;
  297         struct mutex            lock;
  298
  299         /* ctx's that are using this sqd */
  300         struct list_head        ctx_list;
  301
  302         struct task_struct      *thread;
  303         struct wait_queue_head  wait;
  304
  305         unsigned                sq_thread_idle;
  306         int                     sq_cpu;
  307         pid_t                   task_pid;
  308         pid_t                   task_tgid;
  309
  310         unsigned long           state;
  311         struct completion       exited;
  312 };
  313
      /*
       * Batch and cache sizing constants.
       * NOTE(review): their users are outside this view; by name they size
       * completion batches, the request cache, and request allocation
       * batches - confirm.
       */
  314 #define IO_COMPL_BATCH                  32
  315 #define IO_REQ_CACHE_SIZE               32
  316 #define IO_REQ_ALLOC_BATCH              8
  317
      /*
       * Head and last request of a chain of io_kiocb.
       * NOTE(review): presumably the link chain currently being assembled
       * during submission (embedded in io_submit_state) - confirm.
       */
  318 struct io_submit_link {
  319         struct io_kiocb         *head;
  320         struct io_kiocb         *last;
  321 };
  322
      /*
       * Submission-side batching state, embedded in io_ring_ctx as
       * submit_state: a free list, a list of requests pending batched
       * completion, the current link, and block-plug bookkeeping.
       */
  323 struct io_submit_state {
  324         /* inline/task_work completion list, under ->uring_lock */
  325         struct io_wq_work_node  free_list;
  326         /* batch completion logic */
  327         struct io_wq_work_list  compl_reqs;
  328         struct io_submit_link   link;
  329
  330         bool                    plug_started;
  331         bool                    need_plug;
  332         bool                    flush_cqes;
  333         unsigned short          submit_nr;
  334         struct blk_plug         plug;
  335 };
  336
      /*
       * Eventfd registration: the eventfd context, a one-bit 'eventfd_async'
       * flag, and an rcu_head. io_ring_ctx refers to this object through an
       * __rcu pointer (io_ev_fd).
       * NOTE(review): the rcu_head is presumably used to defer freeing -
       * confirm.
       */
  337 struct io_ev_fd {
  338         struct eventfd_ctx      *cq_ev_fd;
  339         unsigned int            eventfd_async: 1;
  340         struct rcu_head         rcu;
  341 };
  342
      /*
       * NOTE(review): presumably log2 of the number of hash buckets behind
       * io_ring_ctx::io_buffers (5 bits = 32 buckets) - confirm.
       */
  343 #define IO_BUFFERS_HASH_BITS    5
  344
      /*
       * Per-ring context. Members are grouped into anonymous structs, most of
       * them ____cacheline_aligned_in_smp: read-mostly hot data, submission
       * data, completion-queue state, locks and poll/cancel state, then the
       * slow-path resource data and the rarely used teardown/accounting
       * members at the end.
       */
  345 struct io_ring_ctx {
  346         /* const or read-mostly hot data */
  347         struct {
  348                 struct percpu_ref       refs;
  349
  350                 struct io_rings         *rings;
  351                 unsigned int            flags;
  352                 unsigned int            compat: 1;
  353                 unsigned int            drain_next: 1;
  354                 unsigned int            restricted: 1;
  355                 unsigned int            off_timeout_used: 1;
  356                 unsigned int            drain_active: 1;
  357                 unsigned int            drain_disabled: 1;
  358                 unsigned int            has_evfd: 1;
  359                 unsigned int            syscall_iopoll: 1;
  360         } ____cacheline_aligned_in_smp;
  361
  362         /* submission data */
  363         struct {
  364                 struct mutex            uring_lock;
  365
  366                 /*
  367                  * Ring buffer of indices into array of io_uring_sqe, which is
  368                  * mmapped by the application using the IORING_OFF_SQES offset.
  369                  *
  370                  * This indirection could e.g. be used to assign fixed
  371                  * io_uring_sqe entries to operations and only submit them to
  372                  * the queue when needed.
  373                  *
  374                  * The kernel modifies neither the indices array nor the entries
  375                  * array.
  376                  */
  377                 u32                     *sq_array;
  378                 struct io_uring_sqe     *sq_sqes;
  379                 unsigned                cached_sq_head;
  380                 unsigned                sq_entries;
  381                 struct list_head        defer_list;
  382
  383                 /*
  384                  * Fixed resources fast path, should be accessed only under
  385                  * uring_lock, and updated through io_uring_register(2)
  386                  */
  387                 struct io_rsrc_node     *rsrc_node;
  388                 int                     rsrc_cached_refs;
  389                 struct io_file_table    file_table;
  390                 unsigned                nr_user_files;
  391                 unsigned                nr_user_bufs;
  392                 struct io_mapped_ubuf   **user_bufs;
  393
  394                 struct io_submit_state  submit_state;
  395                 struct list_head        timeout_list;
  396                 struct list_head        ltimeout_list;
  397                 struct list_head        cq_overflow_list;
  398                 struct list_head        *io_buffers;
  399                 struct list_head        io_buffers_cache;
  400                 struct list_head        apoll_cache;
  401                 struct xarray           personalities;
  402                 u32                     pers_next;
  403                 unsigned                sq_thread_idle;
  404         } ____cacheline_aligned_in_smp;
  405
  406         /* IRQ completion list, under ->completion_lock */
  407         struct io_wq_work_list  locked_free_list;
  408         unsigned int            locked_free_nr;
  409
  410         const struct cred       *sq_creds;      /* cred used for __io_sq_thread() */
  411         struct io_sq_data       *sq_data;       /* if using sq thread polling */
  412
  413         struct wait_queue_head  sqo_sq_wait;
  414         struct list_head        sqd_list;
  415
  416         unsigned long           check_cq_overflow;
  417
                /* completion queue state */
  418         struct {
  419                 unsigned                cached_cq_tail;
  420                 unsigned                cq_entries;
  421                 struct io_ev_fd __rcu   *io_ev_fd;
  422                 struct wait_queue_head  cq_wait;
  423                 unsigned                cq_extra;
  424                 atomic_t                cq_timeouts;
  425                 unsigned                cq_last_tm_flush;
  426         } ____cacheline_aligned_in_smp;
  427
                /* locks, plus the iopoll list and the cancel hash */
  428         struct {
  429                 spinlock_t              completion_lock;
  430
  431                 spinlock_t              timeout_lock;
  432
  433                 /*
  434                  * ->iopoll_list is protected by the ctx->uring_lock for
  435                  * io_uring instances that don't use IORING_SETUP_SQPOLL.
  436                  * For SQPOLL, only the single threaded io_sq_thread() will
  437                  * manipulate the list, hence no extra locking is needed there.
  438                  */
  439                 struct io_wq_work_list  iopoll_list;
  440                 struct hlist_head       *cancel_hash;
  441                 unsigned                cancel_hash_bits;
  442                 bool                    poll_multi_queue;
  443
  444                 struct list_head        io_buffers_comp;
  445         } ____cacheline_aligned_in_smp;
  446
  447         struct io_restriction           restrictions;
  448
  449         /* slow path rsrc auxiliary data, used by update/register */
  450         struct {
  451                 struct io_rsrc_node             *rsrc_backup_node;
  452                 struct io_mapped_ubuf           *dummy_ubuf;
  453                 struct io_rsrc_data             *file_data;
  454                 struct io_rsrc_data             *buf_data;
  455
  456                 struct delayed_work             rsrc_put_work;
  457                 struct llist_head               rsrc_put_llist;
  458                 struct list_head                rsrc_ref_list;
  459                 spinlock_t                      rsrc_ref_lock;
  460
  461                 struct list_head        io_buffers_pages;
  462         };
  463
  464         /* Keep this last, we don't need it for the fast path */
  465         struct {
  466                 #if defined(CONFIG_UNIX)
  467                         struct socket           *ring_sock;
  468                 #endif
  469                 /* hashed buffered write serialization */
  470                 struct io_wq_hash               *hash_map;
  471
  472                 /* Only used for accounting purposes */
  473                 struct user_struct              *user;
  474                 struct mm_struct                *mm_account;
  475
  476                 /* ctx exit and cancelation */
  477                 struct llist_head               fallback_llist;
  478                 struct delayed_work             fallback_work;
  479                 struct work_struct              exit_work;
  480                 struct list_head                tctx_list;
  481                 struct completion               ref_comp;
  482                 u32                             iowq_limits[2];
  483                 bool                            iowq_limits_set;
  484         };
  485 };
 486
  487 /*
  488  * Arbitrary limit, can be raised if need be
       *
       * NOTE(review): presumably the maximum number of entries in
       * io_uring_task::registered_rings - confirm.
  489  */
  490 #define IO_RINGFD_REG_MAX 16
  491
      /*
       * io_uring state kept per task: submission-side caches and counters, the
       * io_wq pointer, and the task_work lists with the spinlock declared next
       * to them.
       * NOTE(review): task_lock presumably protects task_list,
       * prior_task_list and task_running - confirm.
       */
  492 struct io_uring_task {
  493         /* submission side */
  494         int                     cached_refs;
  495         struct xarray           xa;
  496         struct wait_queue_head  wait;
  497         const struct io_ring_ctx *last;
  498         struct io_wq            *io_wq;
  499         struct percpu_counter   inflight;
  500         atomic_t                in_idle;
  501
  502         spinlock_t              task_lock;
  503         struct io_wq_work_list  task_list;
  504         struct io_wq_work_list  prior_task_list;
  505         struct callback_head    task_work;
  506         struct file             **registered_rings;
  507         bool                    task_running;
  508 };
 509
  510 /*
  511  * First field must be the file pointer in all the
  512  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
       *
       * Poll request data: the file, the wait queue head being waited on, the
       * requested event mask, and the wait queue entry itself.
  513  */
  514 struct io_poll_iocb {
  515         struct file                     *file;
  516         struct wait_queue_head          *head;
  517         __poll_t                        events;
  518         struct wait_queue_entry         wait;
  519 };
  520
      /*
       * Poll update request data: the user_data identifying the existing poll
       * request, a replacement user_data and/or event mask, and two flags
       * selecting which of them to apply. 'file' is the first member.
       */
  521 struct io_poll_update {
  522         struct file                     *file;
  523         u64                             old_user_data;
  524         u64                             new_user_data;
  525         __poll_t                        events;
  526         bool                            update_events;
  527         bool                            update_user_data;
  528 };
  529
      /*
       * Close request data: a descriptor number and a file slot.
       * NOTE(review): presumably fd names a regular descriptor and file_slot a
       * fixed-file index, only one being used per request - confirm.
       */
  530 struct io_close {
  531         struct file                     *file;
  532         int                             fd;
  533         u32                             file_slot;
  534 };
  535
      /*
       * Timer state for a timeout: a back-pointer to the request, the hrtimer,
       * the timespec, the hrtimer mode and flags. Unlike the iocb structs in
       * this file its first member is not a file pointer.
       * NOTE(review): presumably stored as the request's async data - confirm.
       */
  536 struct io_timeout_data {
  537         struct io_kiocb                 *req;
  538         struct hrtimer                  timer;
  539         struct timespec64               ts;
  540         enum hrtimer_mode               mode;
  541         u32                             flags;
  542 };
  543
      /*
       * Accept request data: user pointers for the peer address and its
       * length, flags, a file slot, and 'nofile'.
       * NOTE(review): nofile is presumably the RLIMIT_NOFILE value captured at
       * prep time - confirm.
       */
  544 struct io_accept {
  545         struct file                     *file;
  546         struct sockaddr __user          *addr;
  547         int __user                      *addr_len;
  548         int                             flags;
  549         u32                             file_slot;
  550         unsigned long                   nofile;
  551 };
  552
      /*
       * Request data carrying a byte range (off, len) plus flags and mode.
       * NOTE(review): presumably shared by the fsync, sync_file_range and
       * fallocate opcodes - confirm at the prep functions.
       */
  553 struct io_sync {
  554         struct file                     *file;
  555         loff_t                          len;
  556         loff_t                          off;
  557         int                             flags;
  558         int                             mode;
  559 };
  560
      /*
       * Cancel request data.
       * NOTE(review): addr is presumably the user_data of the request to
       * cancel - confirm.
       */
  561 struct io_cancel {
  562         struct file                     *file;
  563         u64                             addr;
  564 };
  565
      /*
       * Timeout request data: an offset, a target sequence number, list
       * linkage, and two request pointers used by linked timeouts.
       * NOTE(review): 'list' presumably links into io_ring_ctx::timeout_list
       * or ltimeout_list - confirm.
       */
  566 struct io_timeout {
  567         struct file                     *file;
  568         u32                             off;
  569         u32                             target_seq;
  570         struct list_head                list;
  571         /* head of the link, used by linked timeouts only */
  572         struct io_kiocb                 *head;
  573         /* for linked completions */
  574         struct io_kiocb                 *prev;
  575 };
  576
      /*
       * Timeout removal/update request data: 'addr' plus, for updates, a new
       * timespec, flags, and a flag marking linked timeouts.
       * NOTE(review): addr is presumably the user_data of the targeted
       * timeout - confirm.
       */
  577 struct io_timeout_rem {
  578         struct file                     *file;
  579         u64                             addr;
  580
  581         /* timeout update */
  582         struct timespec64               ts;
  583         u32                             flags;
  584         bool                            ltimeout;
  585 };
  586
      /*
       * Read/write request data: the embedded kiocb (which supplies the
       * leading file pointer), followed by an address, a length and flags.
       */
  587 struct io_rw {
  588         /* NOTE: kiocb has the file as the first member, so don't do it here */
  589         struct kiocb                    kiocb;
  590         u64                             addr;
  591         u32                             len;
  592         u32                             flags;
  593 };
  594
      /* Connect request data: user pointer to the address and its length. */
  595 struct io_connect {
  596         struct file                     *file;
  597         struct sockaddr __user          *addr;
  598         int                             addr_len;
  599 };
  600
      /*
       * Send/receive request data. The union holds one user pointer viewed as
       * a compat msghdr, a native msghdr, or a plain buffer.
       * NOTE(review): bgid is presumably the buffer group for buffer
       * selection and done_io the bytes transferred so far - confirm.
       */
  601 struct io_sr_msg {
  602         struct file                     *file;
  603         union {
  604                 struct compat_msghdr __user     *umsg_compat;
  605                 struct user_msghdr __user       *umsg;
  606                 void __user                     *buf;
  607         };
  608         int                             msg_flags;
  609         int                             bgid;
  610         size_t                          len;
  611         size_t                          done_io;
  612 };
  613
      /*
       * Open request data: directory fd, file slot, filename, the open_how
       * parameters, and 'nofile'.
       * NOTE(review): nofile is presumably the RLIMIT_NOFILE value captured at
       * prep time - confirm.
       */
  614 struct io_open {
  615         struct file                     *file;
  616         int                             dfd;
  617         u32                             file_slot;
  618         struct filename                 *filename;
  619         struct open_how                 how;
  620         unsigned long                   nofile;
  621 };
  622
      /*
       * Resource update request data: an argument value, a count (nr_args)
       * and an offset.
       * NOTE(review): arg is presumably a user pointer passed as u64 and
       * offset the first table index to update - confirm.
       */
  623 struct io_rsrc_update {
  624         struct file                     *file;
  625         u64                             arg;
  626         u32                             nr_args;
  627         u32                             offset;
  628 };
  629
      /* fadvise request data: file offset, length and the advice value. */
  630 struct io_fadvise {
  631         struct file                     *file;
  632         u64                             offset;
  633         u32                             len;
  634         u32                             advice;
  635 };
  636
      /* madvise request data: address, length and the advice value. */
  637 struct io_madvise {
  638         struct file                     *file;
  639         u64                             addr;
  640         u32                             len;
  641         u32                             advice;
  642 };
  643
      /*
       * epoll request data: the epoll fd, an operation code, the target fd
       * and the epoll_event.
       * NOTE(review): presumably mirrors the arguments of epoll_ctl(2) -
       * confirm.
       */
  644 struct io_epoll {
  645         struct file                     *file;
  646         int                             epfd;
  647         int                             op;
  648         int                             fd;
  649         struct epoll_event              event;
  650 };
  651
      /*
       * Splice request data. The leading file pointer is the output file
       * (file_out); the input side is carried as a descriptor number
       * (splice_fd_in) together with both offsets, the length and flags.
       */
  652 struct io_splice {
  653         struct file                     *file_out;
  654         loff_t                          off_out;
  655         loff_t                          off_in;
  656         u64                             len;
  657         int                             splice_fd_in;
  658         unsigned int                    flags;
  659 };
  660
      /*
       * Provide-buffers request data: start address, length, buffer group id,
       * number of buffers and a buffer id.
       * NOTE(review): presumably len is the size of each buffer and bid the
       * id of the first one - confirm.
       */
  661 struct io_provide_buf {
  662         struct file                     *file;
  663         __u64                           addr;
  664         __u32                           len;
  665         __u32                           bgid;
  666         __u16                           nbufs;
  667         __u16                           bid;
  668 };
  669
      /*
       * statx request data: directory fd, mask, flags, filename and the user
       * pointer to the result buffer.
       */
  670 struct io_statx {
  671         struct file                     *file;
  672         int                             dfd;
  673         unsigned int                    mask;
  674         unsigned int                    flags;
  675         struct filename                 *filename;
  676         struct statx __user             *buffer;
  677 };
  678
      /*
       * Shutdown request data.
       * NOTE(review): 'how' is presumably the shutdown(2) mode - confirm.
       */
  679 struct io_shutdown {
  680         struct file                     *file;
  681         int                             how;
  682 };
  683
      /*
       * Rename request data: old and new directory fds, old and new paths,
       * and flags.
       */
  684 struct io_rename {
  685         struct file                     *file;
  686         int                             old_dfd;
  687         int                             new_dfd;
  688         struct filename                 *oldpath;
  689         struct filename                 *newpath;
  690         int                             flags;
  691 };
  692
      /* Unlink request data: directory fd, flags and filename. */
  693 struct io_unlink {
  694         struct file                     *file;
  695         int                             dfd;
  696         int                             flags;
  697         struct filename                 *filename;
  698 };
  699
      /* mkdir request data: directory fd, mode and filename. */
  700 struct io_mkdir {
  701         struct file                     *file;
  702         int                             dfd;
  703         umode_t                         mode;
  704         struct filename                 *filename;
  705 };
  706
      /* Symlink request data: the new directory fd, old path and new path. */
  707 struct io_symlink {
  708         struct file                     *file;
  709         int                             new_dfd;
  710         struct filename                 *oldpath;
  711         struct filename                 *newpath;
  712 };
  713
      /*
       * Hard link request data: old and new directory fds, old and new paths,
       * and flags.
       */
  714 struct io_hardlink {
  715         struct file                     *file;
  716         int                             old_dfd;
  717         int                             new_dfd;
  718         struct filename                 *oldpath;
  719         struct filename                 *newpath;
  720         int                             flags;
  721 };
  722
      /*
       * Message request data: a user_data value and a length.
       * NOTE(review): presumably the values posted in a CQE on the target
       * ring referenced by 'file' - confirm.
       */
  723 struct io_msg {
  724         struct file                     *file;
  725         u64 user_data;
  726         u32 len;
  727 };
  728
      /*
       * Holds a socket address.
       * NOTE(review): presumably a kernel copy of the connect address kept
       * across an async retry - confirm.
       */
  729 struct io_async_connect {
  730         struct sockaddr_storage         address;
  731 };
  732
      /*
       * Message header state: an inline iovec array of UIO_FASTIOV entries, an
       * optional allocated iovec, the user address pointer, the msghdr, and
       * storage for a socket address.
       */
  733 struct io_async_msghdr {
  734         struct iovec                    fast_iov[UIO_FASTIOV];
  735         /* points to an allocated iov, if NULL we use fast_iov instead */
  736         struct iovec                    *free_iov;
  737         struct sockaddr __user          *uaddr;
  738         struct msghdr                   msg;
  739         struct sockaddr_storage         addr;
  740 };
  741
      /*
       * Iterator state for a read/write: the iov_iter, a saved
       * iov_iter_state, and an inline iovec array of UIO_FASTIOV entries.
       */
  742 struct io_rw_state {
  743         struct iov_iter                 iter;
  744         struct iov_iter_state           iter_state;
  745         struct iovec                    fast_iov[UIO_FASTIOV];
  746 };
  747
      /*
       * Read/write state: the io_rw_state, a pointer named free_iovec, a byte
       * counter (bytes_done), and a wait_page_queue.
       * NOTE(review): free_iovec is presumably an allocated iovec to free on
       * cleanup, analogous to io_async_msghdr::free_iov - confirm.
       */
  748 struct io_async_rw {
  749         struct io_rw_state              s;
  750         const struct iovec              *free_iovec;
  751         size_t                          bytes_done;
  752         struct wait_page_queue          wpq;
  753 };
  754
      /*
       * Bit positions for the REQ_F_* request flags. The first seven reuse the
       * IOSQE_*_BIT values; the remaining bits start at 8 so that they do not
       * overlap the user flag byte. __REQ_F_LAST_BIT is only a sentinel.
       */
  755 enum {
  756         REQ_F_FIXED_FILE_BIT    = IOSQE_FIXED_FILE_BIT,
  757         REQ_F_IO_DRAIN_BIT      = IOSQE_IO_DRAIN_BIT,
  758         REQ_F_LINK_BIT          = IOSQE_IO_LINK_BIT,
  759         REQ_F_HARDLINK_BIT      = IOSQE_IO_HARDLINK_BIT,
  760         REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
  761         REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
  762         REQ_F_CQE_SKIP_BIT      = IOSQE_CQE_SKIP_SUCCESS_BIT,
  763
  764         /* first byte is taken by user flags, shift it to not overlap */
  765         REQ_F_FAIL_BIT          = 8,
  766         REQ_F_INFLIGHT_BIT,
  767         REQ_F_CUR_POS_BIT,
  768         REQ_F_NOWAIT_BIT,
  769         REQ_F_LINK_TIMEOUT_BIT,
  770         REQ_F_NEED_CLEANUP_BIT,
  771         REQ_F_POLLED_BIT,
  772         REQ_F_BUFFER_SELECTED_BIT,
  773         REQ_F_COMPLETE_INLINE_BIT,
  774         REQ_F_REISSUE_BIT,
  775         REQ_F_CREDS_BIT,
  776         REQ_F_REFCOUNT_BIT,
  777         REQ_F_ARM_LTIMEOUT_BIT,
  778         REQ_F_ASYNC_DATA_BIT,
  779         REQ_F_SKIP_LINK_CQES_BIT,
  780         REQ_F_SINGLE_POLL_BIT,
  781         REQ_F_DOUBLE_POLL_BIT,
  782         REQ_F_PARTIAL_IO_BIT,
  783         /* keep async read/write and isreg together and in order */
  784         REQ_F_SUPPORT_NOWAIT_BIT,
  785         REQ_F_ISREG_BIT,
  786
  787         /* not a real bit, just to check we're not overflowing the space */
  788         __REQ_F_LAST_BIT,
  789 };
  790
      /*
       * Request flag masks, each BIT() of the matching REQ_F_*_BIT position.
       * The order of the entries here differs from the bit-position enum; the
       * values depend only on the *_BIT constants.
       */
  791 enum {
  792         /* ctx owns file */
  793         REQ_F_FIXED_FILE        = BIT(REQ_F_FIXED_FILE_BIT),
  794         /* drain existing IO first */
  795         REQ_F_IO_DRAIN          = BIT(REQ_F_IO_DRAIN_BIT),
  796         /* linked sqes */
  797         REQ_F_LINK              = BIT(REQ_F_LINK_BIT),
  798         /* doesn't sever on completion < 0 */
  799         REQ_F_HARDLINK          = BIT(REQ_F_HARDLINK_BIT),
  800         /* IOSQE_ASYNC */
  801         REQ_F_FORCE_ASYNC       = BIT(REQ_F_FORCE_ASYNC_BIT),
  802         /* IOSQE_BUFFER_SELECT */
  803         REQ_F_BUFFER_SELECT     = BIT(REQ_F_BUFFER_SELECT_BIT),
  804         /* IOSQE_CQE_SKIP_SUCCESS */
  805         REQ_F_CQE_SKIP          = BIT(REQ_F_CQE_SKIP_BIT),
  806
  807         /* fail rest of links */
  808         REQ_F_FAIL              = BIT(REQ_F_FAIL_BIT),
  809         /* on inflight list, should be cancelled and waited on exit reliably */
  810         REQ_F_INFLIGHT          = BIT(REQ_F_INFLIGHT_BIT),
  811         /* read/write uses file position */
  812         REQ_F_CUR_POS           = BIT(REQ_F_CUR_POS_BIT),
  813         /* must not punt to workers */
  814         REQ_F_NOWAIT            = BIT(REQ_F_NOWAIT_BIT),
  815         /* has or had linked timeout */
  816         REQ_F_LINK_TIMEOUT      = BIT(REQ_F_LINK_TIMEOUT_BIT),
  817         /* needs cleanup */
  818         REQ_F_NEED_CLEANUP      = BIT(REQ_F_NEED_CLEANUP_BIT),
  819         /* already went through poll handler */
  820         REQ_F_POLLED            = BIT(REQ_F_POLLED_BIT),
  821         /* buffer already selected */
  822         REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
  823         /* completion is deferred through io_comp_state */
  824         REQ_F_COMPLETE_INLINE   = BIT(REQ_F_COMPLETE_INLINE_BIT),
  825         /* caller should reissue async */
  826         REQ_F_REISSUE           = BIT(REQ_F_REISSUE_BIT),
  827         /* supports async reads/writes */
  828         REQ_F_SUPPORT_NOWAIT    = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
  829         /* regular file */
  830         REQ_F_ISREG             = BIT(REQ_F_ISREG_BIT),
  831         /* has creds assigned */
  832         REQ_F_CREDS             = BIT(REQ_F_CREDS_BIT),
  833         /* skip refcounting if not set */
  834         REQ_F_REFCOUNT          = BIT(REQ_F_REFCOUNT_BIT),
  835         /* there is a linked timeout that has to be armed */
  836         REQ_F_ARM_LTIMEOUT      = BIT(REQ_F_ARM_LTIMEOUT_BIT),
  837         /* ->async_data allocated */
  838         REQ_F_ASYNC_DATA        = BIT(REQ_F_ASYNC_DATA_BIT),
  839         /* don't post CQEs while failing linked requests */
  840         REQ_F_SKIP_LINK_CQES    = BIT(REQ_F_SKIP_LINK_CQES_BIT),
  841         /* single poll may be active */
  842         REQ_F_SINGLE_POLL       = BIT(REQ_F_SINGLE_POLL_BIT),
  843         /* double poll may be active */
  844         REQ_F_DOUBLE_POLL       = BIT(REQ_F_DOUBLE_POLL_BIT),
  845         /* request has already done partial IO */
  846         REQ_F_PARTIAL_IO        = BIT(REQ_F_PARTIAL_IO_BIT),
  847 };
 848
/*
 * Poll state referenced from io_kiocb->apoll: one embedded poll entry plus
 * a pointer to a second one.
 */
struct async_poll {
	struct io_poll_iocb	poll;
	struct io_poll_iocb	*double_poll;
};
 853
/*
 * Task-work callback. It is invoked with the request and a pointer to a
 * "locked" flag that the callee may update (see io_tw_lock()).
 */
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);

/* Per-request task-work state: list linkage plus the callback to run. */
struct io_task_work {
	union {
		struct io_wq_work_node	node;
		/* linkage walked by io_fallback_req_func() */
		struct llist_node	fallback_node;
	};
	io_req_tw_func_t		func;
};
 863
/* Resource kinds: files and buffers. */
enum {
	IORING_RSRC_FILE		= 0,
	IORING_RSRC_BUFFER		= 1,
};
 868
/* Completion data carried by each request; embedded in io_kiocb as ->cqe. */
struct io_cqe {
	__u64	user_data;
	__s32	res;
	/* fd initially, then cflags for completion */
	union {
		__u32	flags;
		int	fd;
	};
};
 878
/*
 * NOTE! Each of the iocb union members has the file pointer
 * as the first entry in their struct definition. So you can
 * access the file pointer through any of the sub-structs,
 * or directly as just 'file' in this struct.
 */
struct io_kiocb {
	/* per-opcode request state; all members alias ->file */
	union {
		struct file		*file;
		struct io_rw		rw;
		struct io_poll_iocb	poll;
		struct io_poll_update	poll_update;
		struct io_accept	accept;
		struct io_sync		sync;
		struct io_cancel	cancel;
		struct io_timeout	timeout;
		struct io_timeout_rem	timeout_rem;
		struct io_connect	connect;
		struct io_sr_msg	sr_msg;
		struct io_open		open;
		struct io_close		close;
		struct io_rsrc_update	rsrc_update;
		struct io_fadvise	fadvise;
		struct io_madvise	madvise;
		struct io_epoll		epoll;
		struct io_splice	splice;
		struct io_provide_buf	pbuf;
		struct io_statx		statx;
		struct io_shutdown	shutdown;
		struct io_rename	rename;
		struct io_unlink	unlink;
		struct io_mkdir		mkdir;
		struct io_symlink	symlink;
		struct io_hardlink	hardlink;
		struct io_msg		msg;
	};

	u8				opcode;
	/* polled IO has completed */
	u8				iopoll_completed;
	u16				buf_index;
	/* REQ_F_* flags */
	unsigned int			flags;

	struct io_cqe			cqe;

	struct io_ring_ctx		*ctx;
	struct task_struct		*task;

	/* rsrc node reference held by this request, NULL if none */
	struct percpu_ref		*fixed_rsrc_refs;
	/* store used ubuf, so we can prevent reloading */
	struct io_mapped_ubuf		*imu;

	union {
		/* used by request caches, completion batching and iopoll */
		struct io_wq_work_node	comp_list;
		/* cache ->apoll->events */
		int apoll_events;
	};
	/* reference count, only maintained when REQ_F_REFCOUNT is set */
	atomic_t			refs;
	atomic_t			poll_refs;
	struct io_task_work		io_task_work;
	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
	struct hlist_node		hash_node;
	/* internal polling, see IORING_FEAT_FAST_POLL */
	struct async_poll		*apoll;
	/* opcode allocated if it needs to store data for async defer */
	void				*async_data;
	/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
	struct io_buffer		*kbuf;
	/* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
	struct io_kiocb			*link;
	/* custom credentials, valid IFF REQ_F_CREDS is set */
	const struct cred		*creds;
	struct io_wq_work		work;
};
 954
/*
 * List node tying a task to a ring context.
 * NOTE(review): presumably linked on ctx->tctx_list -- confirm at the users.
 */
struct io_tctx_node {
	struct list_head	ctx_node;
	struct task_struct	*task;
	struct io_ring_ctx	*ctx;
};
 960
/*
 * List entry pairing a request with a sequence number.
 * NOTE(review): presumably queued on ctx->defer_list -- confirm at the users.
 */
struct io_defer_entry {
	struct list_head	list;
	struct io_kiocb		*req;
	u32			seq;
};
 966
/* Static properties of one opcode; io_op_defs[] holds one per IORING_OP_*. */
struct io_op_def {
	/* needs req->file assigned */
	unsigned		needs_file : 1;
	/* should block plug */
	unsigned		plug : 1;
	/* hash wq insertion if file is a regular file */
	unsigned		hash_reg_file : 1;
	/* unbound wq insertion if file is a non-regular file */
	unsigned		unbound_nonreg_file : 1;
	/* set if opcode supports polled "wait" */
	unsigned		pollin : 1;
	unsigned		pollout : 1;
	unsigned		poll_exclusive : 1;
	/* op supports buffer selection */
	unsigned		buffer_select : 1;
	/* do prep async if is going to be punted */
	unsigned		needs_async_setup : 1;
	/* opcode is not supported by this kernel */
	unsigned		not_supported : 1;
	/* skip auditing */
	unsigned		audit_skip : 1;
	/* size of async data needed, if any */
	unsigned short		async_size;
};
 991
/*
 * Per-opcode property table, indexed by IORING_OP_*. Fields that are not
 * named in an entry are zero.
 */
static const struct io_op_def io_op_defs[] = {
	[IORING_OP_NOP] = {},
	[IORING_OP_READV] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.needs_async_setup	= 1,
		.plug			= 1,
		.audit_skip		= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_WRITEV] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.needs_async_setup	= 1,
		.plug			= 1,
		.audit_skip		= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_FSYNC] = {
		.needs_file		= 1,
		.audit_skip		= 1,
	},
	[IORING_OP_READ_FIXED] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.plug			= 1,
		.audit_skip		= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_WRITE_FIXED] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.plug			= 1,
		.audit_skip		= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_POLL_ADD] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.audit_skip		= 1,
	},
	[IORING_OP_POLL_REMOVE] = {
		.audit_skip		= 1,
	},
	[IORING_OP_SYNC_FILE_RANGE] = {
		.needs_file		= 1,
		.audit_skip		= 1,
	},
	[IORING_OP_SENDMSG] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.needs_async_setup	= 1,
		.async_size		= sizeof(struct io_async_msghdr),
	},
	[IORING_OP_RECVMSG] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.needs_async_setup	= 1,
		.async_size		= sizeof(struct io_async_msghdr),
	},
	[IORING_OP_TIMEOUT] = {
		.audit_skip		= 1,
		.async_size		= sizeof(struct io_timeout_data),
	},
	[IORING_OP_TIMEOUT_REMOVE] = {
		/* used by timeout updates' prep() */
		.audit_skip		= 1,
	},
	[IORING_OP_ACCEPT] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.poll_exclusive		= 1,
	},
	[IORING_OP_ASYNC_CANCEL] = {
		.audit_skip		= 1,
	},
	[IORING_OP_LINK_TIMEOUT] = {
		.audit_skip		= 1,
		.async_size		= sizeof(struct io_timeout_data),
	},
	[IORING_OP_CONNECT] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.needs_async_setup	= 1,
		.async_size		= sizeof(struct io_async_connect),
	},
	[IORING_OP_FALLOCATE] = {
		.needs_file		= 1,
	},
	[IORING_OP_OPENAT] = {},
	[IORING_OP_CLOSE] = {},
	[IORING_OP_FILES_UPDATE] = {
		.audit_skip		= 1,
	},
	[IORING_OP_STATX] = {
		.audit_skip		= 1,
	},
	[IORING_OP_READ] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.plug			= 1,
		.audit_skip		= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_WRITE] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.plug			= 1,
		.audit_skip		= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_FADVISE] = {
		.needs_file		= 1,
		.audit_skip		= 1,
	},
	[IORING_OP_MADVISE] = {},
	[IORING_OP_SEND] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.audit_skip		= 1,
	},
	[IORING_OP_RECV] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.audit_skip		= 1,
	},
	[IORING_OP_OPENAT2] = {
	},
	[IORING_OP_EPOLL_CTL] = {
		.unbound_nonreg_file	= 1,
		.audit_skip		= 1,
	},
	[IORING_OP_SPLICE] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.audit_skip		= 1,
	},
	[IORING_OP_PROVIDE_BUFFERS] = {
		.audit_skip		= 1,
	},
	[IORING_OP_REMOVE_BUFFERS] = {
		.audit_skip		= 1,
	},
	[IORING_OP_TEE] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.audit_skip		= 1,
	},
	[IORING_OP_SHUTDOWN] = {
		.needs_file		= 1,
	},
	[IORING_OP_RENAMEAT] = {},
	[IORING_OP_UNLINKAT] = {},
	[IORING_OP_MKDIRAT] = {},
	[IORING_OP_SYMLINKAT] = {},
	[IORING_OP_LINKAT] = {},
	[IORING_OP_MSG_RING] = {
		.needs_file		= 1,
	},
};
1173
/* requests with any of those set should undergo io_disarm_next() */
#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)

/* Forward declarations for helpers that are used before their definition. */
static bool io_disarm_next(struct io_kiocb *req);
static void io_uring_del_tctx_node(unsigned long index);
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
					 struct task_struct *task,
					 bool cancel_all);
static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);

static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags);

static void io_put_req(struct io_kiocb *req);
static void io_put_req_deferred(struct io_kiocb *req);
static void io_dismantle_req(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args);
static void io_clean_op(struct io_kiocb *req);
static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
					     unsigned issue_flags);
static inline struct file *io_file_get_normal(struct io_kiocb *req, int fd);
static void io_drop_inflight_file(struct io_kiocb *req);
static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags);
static void __io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);

static void io_req_task_queue(struct io_kiocb *req);
static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
static int io_req_prep_async(struct io_kiocb *req);

static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
				 unsigned int issue_flags, u32 slot_index);
static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);

static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
static void io_eventfd_signal(struct io_ring_ctx *ctx);

/* NOTE(review): presumably the slab cache for struct io_kiocb -- confirm. */
static struct kmem_cache *req_cachep;

/* Compared against file->f_op to recognise io_uring files. */
static const struct file_operations io_uring_fops;
1216
1217 struct sock *io_uring_get_socket(struct file *file)
1218 {
1219 #if defined(CONFIG_UNIX)
1220         if (file->f_op == &io_uring_fops) {
1221                 struct io_ring_ctx *ctx = file->private_data;
1222
1223                 return ctx->ring_sock->sk;
1224         }
1225 #endif
1226         return NULL;
1227 }
1228 EXPORT_SYMBOL(io_uring_get_socket);
1229
1230 #if defined(CONFIG_UNIX)
1231 static inline bool io_file_need_scm(struct file *filp)
1232 {
1233         return !!unix_get_socket(filp);
1234 }
1235 #else
1236 static inline bool io_file_need_scm(struct file *filp)
1237 {
1238         return 0;
1239 }
1240 #endif
1241
1242 static void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags)
1243 {
1244         lockdep_assert_held(&ctx->uring_lock);
1245         if (issue_flags & IO_URING_F_UNLOCKED)
1246                 mutex_unlock(&ctx->uring_lock);
1247 }
1248
1249 static void io_ring_submit_lock(struct io_ring_ctx *ctx, unsigned issue_flags)
1250 {
1251         /*
1252          * "Normal" inline submissions always hold the uring_lock, since we
1253          * grab it from the system call. Same is true for the SQPOLL offload.
1254          * The only exception is when we've detached the request and issue it
1255          * from an async worker thread, grab the lock for that case.
1256          */
1257         if (issue_flags & IO_URING_F_UNLOCKED)
1258                 mutex_lock(&ctx->uring_lock);
1259         lockdep_assert_held(&ctx->uring_lock);
1260 }
1261
1262 static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
1263 {
1264         if (!*locked) {
1265                 mutex_lock(&ctx->uring_lock);
1266                 *locked = true;
1267         }
1268 }
1269
/* Walk a chain through ->link, starting at @head and stopping at NULL. */
#define io_for_each_link(pos, head)		\
	for ((pos) = (head); (pos); (pos) = (pos)->link)
1272
/*
 * Borrowed from the mm page reference checks, see commit f958d7b528b1.
 *
 * True when the counter, read as a signed value, lies in [-127, 0]: either
 * it is already zero or it is about to wrap.
 */
#define req_ref_zero_or_close_to_overflow(req)	\
	((unsigned int) atomic_read(&(req)->refs) + 127u <= 127u)
1279
1280 static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
1281 {
1282         WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
1283         return atomic_inc_not_zero(&req->refs);
1284 }
1285
1286 static inline bool req_ref_put_and_test(struct io_kiocb *req)
1287 {
1288         if (likely(!(req->flags & REQ_F_REFCOUNT)))
1289                 return true;
1290
1291         WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
1292         return atomic_dec_and_test(&req->refs);
1293 }
1294
1295 static inline void req_ref_get(struct io_kiocb *req)
1296 {
1297         WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
1298         WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
1299         atomic_inc(&req->refs);
1300 }
1301
1302 static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
1303 {
1304         if (!wq_list_empty(&ctx->submit_state.compl_reqs))
1305                 __io_submit_flush_completions(ctx);
1306 }
1307
1308 static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
1309 {
1310         if (!(req->flags & REQ_F_REFCOUNT)) {
1311                 req->flags |= REQ_F_REFCOUNT;
1312                 atomic_set(&req->refs, nr);
1313         }
1314 }
1315
1316 static inline void io_req_set_refcount(struct io_kiocb *req)
1317 {
1318         __io_req_set_refcount(req, 1);
1319 }
1320
/* number of rsrc node references io_rsrc_refs_refill() takes in one go */
#define IO_RSRC_REF_BATCH	100
1322
1323 static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
1324                                           struct io_ring_ctx *ctx)
1325         __must_hold(&ctx->uring_lock)
1326 {
1327         struct percpu_ref *ref = req->fixed_rsrc_refs;
1328
1329         if (ref) {
1330                 if (ref == &ctx->rsrc_node->refs)
1331                         ctx->rsrc_cached_refs++;
1332                 else
1333                         percpu_ref_put(ref);
1334         }
1335 }
1336
1337 static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx)
1338 {
1339         if (req->fixed_rsrc_refs)
1340                 percpu_ref_put(req->fixed_rsrc_refs);
1341 }
1342
1343 static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
1344         __must_hold(&ctx->uring_lock)
1345 {
1346         if (ctx->rsrc_cached_refs) {
1347                 percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs);
1348                 ctx->rsrc_cached_refs = 0;
1349         }
1350 }
1351
1352 static void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
1353         __must_hold(&ctx->uring_lock)
1354 {
1355         ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
1356         percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
1357 }
1358
1359 static inline void io_req_set_rsrc_node(struct io_kiocb *req,
1360                                         struct io_ring_ctx *ctx,
1361                                         unsigned int issue_flags)
1362 {
1363         if (!req->fixed_rsrc_refs) {
1364                 req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
1365
1366                 if (!(issue_flags & IO_URING_F_UNLOCKED)) {
1367                         lockdep_assert_held(&ctx->uring_lock);
1368                         ctx->rsrc_cached_refs--;
1369                         if (unlikely(ctx->rsrc_cached_refs < 0))
1370                                 io_rsrc_refs_refill(ctx);
1371                 } else {
1372                         percpu_ref_get(req->fixed_rsrc_refs);
1373                 }
1374         }
1375 }
1376
1377 static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list)
1378 {
1379         struct io_buffer *kbuf = req->kbuf;
1380         unsigned int cflags;
1381
1382         cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT);
1383         req->flags &= ~REQ_F_BUFFER_SELECTED;
1384         list_add(&kbuf->list, list);
1385         req->kbuf = NULL;
1386         return cflags;
1387 }
1388
1389 static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
1390 {
1391         lockdep_assert_held(&req->ctx->completion_lock);
1392
1393         if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
1394                 return 0;
1395         return __io_put_kbuf(req, &req->ctx->io_buffers_comp);
1396 }
1397
1398 static inline unsigned int io_put_kbuf(struct io_kiocb *req,
1399                                        unsigned issue_flags)
1400 {
1401         unsigned int cflags;
1402
1403         if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
1404                 return 0;
1405
1406         /*
1407          * We can add this buffer back to two lists:
1408          *
1409          * 1) The io_buffers_cache list. This one is protected by the
1410          *    ctx->uring_lock. If we already hold this lock, add back to this
1411          *    list as we can grab it from issue as well.
1412          * 2) The io_buffers_comp list. This one is protected by the
1413          *    ctx->completion_lock.
1414          *
1415          * We migrate buffers from the comp_list to the issue cache list
1416          * when we need one.
1417          */
1418         if (issue_flags & IO_URING_F_UNLOCKED) {
1419                 struct io_ring_ctx *ctx = req->ctx;
1420
1421                 spin_lock(&ctx->completion_lock);
1422                 cflags = __io_put_kbuf(req, &ctx->io_buffers_comp);
1423                 spin_unlock(&ctx->completion_lock);
1424         } else {
1425                 lockdep_assert_held(&req->ctx->uring_lock);
1426
1427                 cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache);
1428         }
1429
1430         return cflags;
1431 }
1432
1433 static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
1434                                                  unsigned int bgid)
1435 {
1436         struct list_head *hash_list;
1437         struct io_buffer_list *bl;
1438
1439         hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
1440         list_for_each_entry(bl, hash_list, list)
1441                 if (bl->bgid == bgid || bgid == -1U)
1442                         return bl;
1443
1444         return NULL;
1445 }
1446
1447 static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
1448 {
1449         struct io_ring_ctx *ctx = req->ctx;
1450         struct io_buffer_list *bl;
1451         struct io_buffer *buf;
1452
1453         if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
1454                 return;
1455         /* don't recycle if we already did IO to this buffer */
1456         if (req->flags & REQ_F_PARTIAL_IO)
1457                 return;
1458
1459         io_ring_submit_lock(ctx, issue_flags);
1460
1461         buf = req->kbuf;
1462         bl = io_buffer_get_list(ctx, buf->bgid);
1463         list_add(&buf->list, &bl->buf_list);
1464         req->flags &= ~REQ_F_BUFFER_SELECTED;
1465         req->kbuf = NULL;
1466
1467         io_ring_submit_unlock(ctx, issue_flags);
1468 }
1469
1470 static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
1471                           bool cancel_all)
1472         __must_hold(&req->ctx->timeout_lock)
1473 {
1474         if (task && head->task != task)
1475                 return false;
1476         return cancel_all;
1477 }
1478
1479 /*
1480  * As io_match_task() but protected against racing with linked timeouts.
1481  * User must not hold timeout_lock.
1482  */
1483 static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
1484                                bool cancel_all)
1485 {
1486         if (task && head->task != task)
1487                 return false;
1488         return cancel_all;
1489 }
1490
1491 static inline bool req_has_async_data(struct io_kiocb *req)
1492 {
1493         return req->flags & REQ_F_ASYNC_DATA;
1494 }
1495
1496 static inline void req_set_fail(struct io_kiocb *req)
1497 {
1498         req->flags |= REQ_F_FAIL;
1499         if (req->flags & REQ_F_CQE_SKIP) {
1500                 req->flags &= ~REQ_F_CQE_SKIP;
1501                 req->flags |= REQ_F_SKIP_LINK_CQES;
1502         }
1503 }
1504
1505 static inline void req_fail_link_node(struct io_kiocb *req, int res)
1506 {
1507         req_set_fail(req);
1508         req->cqe.res = res;
1509 }
1510
1511 static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
1512 {
1513         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
1514
1515         complete(&ctx->ref_comp);
1516 }
1517
1518 static inline bool io_is_timeout_noseq(struct io_kiocb *req)
1519 {
1520         return !req->timeout.off;
1521 }
1522
1523 static __cold void io_fallback_req_func(struct work_struct *work)
1524 {
1525         struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
1526                                                 fallback_work.work);
1527         struct llist_node *node = llist_del_all(&ctx->fallback_llist);
1528         struct io_kiocb *req, *tmp;
1529         bool locked = false;
1530
1531         percpu_ref_get(&ctx->refs);
1532         llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
1533                 req->io_task_work.func(req, &locked);
1534
1535         if (locked) {
1536                 io_submit_flush_completions(ctx);
1537                 mutex_unlock(&ctx->uring_lock);
1538         }
1539         percpu_ref_put(&ctx->refs);
1540 }
1541
1542 static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
1543 {
1544         struct io_ring_ctx *ctx;
1545         int i, hash_bits;
1546
1547         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1548         if (!ctx)
1549                 return NULL;
1550
1551         /*
1552          * Use 5 bits less than the max cq entries, that should give us around
1553          * 32 entries per hash list if totally full and uniformly spread.
1554          */
1555         hash_bits = ilog2(p->cq_entries);
1556         hash_bits -= 5;
1557         if (hash_bits <= 0)
1558                 hash_bits = 1;
1559         ctx->cancel_hash_bits = hash_bits;
1560         ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
1561                                         GFP_KERNEL);
1562         if (!ctx->cancel_hash)
1563                 goto err;
1564         __hash_init(ctx->cancel_hash, 1U << hash_bits);
1565
1566         ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
1567         if (!ctx->dummy_ubuf)
1568                 goto err;
1569         /* set invalid range, so io_import_fixed() fails meeting it */
1570         ctx->dummy_ubuf->ubuf = -1UL;
1571
1572         ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS,
1573                                         sizeof(struct list_head), GFP_KERNEL);
1574         if (!ctx->io_buffers)
1575                 goto err;
1576         for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++)
1577                 INIT_LIST_HEAD(&ctx->io_buffers[i]);
1578
1579         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
1580                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
1581                 goto err;
1582
1583         ctx->flags = p->flags;
1584         init_waitqueue_head(&ctx->sqo_sq_wait);
1585         INIT_LIST_HEAD(&ctx->sqd_list);
1586         INIT_LIST_HEAD(&ctx->cq_overflow_list);
1587         INIT_LIST_HEAD(&ctx->io_buffers_cache);
1588         INIT_LIST_HEAD(&ctx->apoll_cache);
1589         init_completion(&ctx->ref_comp);
1590         xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
1591         mutex_init(&ctx->uring_lock);
1592         init_waitqueue_head(&ctx->cq_wait);
1593         spin_lock_init(&ctx->completion_lock);
1594         spin_lock_init(&ctx->timeout_lock);
1595         INIT_WQ_LIST(&ctx->iopoll_list);
1596         INIT_LIST_HEAD(&ctx->io_buffers_pages);
1597         INIT_LIST_HEAD(&ctx->io_buffers_comp);
1598         INIT_LIST_HEAD(&ctx->defer_list);
1599         INIT_LIST_HEAD(&ctx->timeout_list);
1600         INIT_LIST_HEAD(&ctx->ltimeout_list);
1601         spin_lock_init(&ctx->rsrc_ref_lock);
1602         INIT_LIST_HEAD(&ctx->rsrc_ref_list);
1603         INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
1604         init_llist_head(&ctx->rsrc_put_llist);
1605         INIT_LIST_HEAD(&ctx->tctx_list);
1606         ctx->submit_state.free_list.next = NULL;
1607         INIT_WQ_LIST(&ctx->locked_free_list);
1608         INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
1609         INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
1610         return ctx;
1611 err:
1612         kfree(ctx->dummy_ubuf);
1613         kfree(ctx->cancel_hash);
1614         kfree(ctx->io_buffers);
1615         kfree(ctx);
1616         return NULL;
1617 }
1618
1619 static void io_account_cq_overflow(struct io_ring_ctx *ctx)
1620 {
1621         struct io_rings *r = ctx->rings;
1622
1623         WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
1624         ctx->cq_extra--;
1625 }
1626
1627 static bool req_need_defer(struct io_kiocb *req, u32 seq)
1628 {
1629         if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
1630                 struct io_ring_ctx *ctx = req->ctx;
1631
1632                 return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
1633         }
1634
1635         return false;
1636 }
1637
/*
 * Low-order flag bits and the mask that strips them.
 * NOTE(review): the FFS_ prefix presumably refers to fixed file slots, with
 * these bits tagged onto a stored pointer value -- confirm against the users.
 */
#define FFS_NOWAIT		0x1UL
#define FFS_ISREG		0x2UL
/* all bits except the flags above; fully parenthesized so it expands safely */
#define FFS_MASK		(~(FFS_NOWAIT|FFS_ISREG))
1641
1642 static inline bool io_req_ffs_set(struct io_kiocb *req)
1643 {
1644         return req->flags & REQ_F_FIXED_FILE;
1645 }
1646
1647 static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
1648 {
1649         if (WARN_ON_ONCE(!req->link))
1650                 return NULL;
1651
1652         req->flags &= ~REQ_F_ARM_LTIMEOUT;
1653         req->flags |= REQ_F_LINK_TIMEOUT;
1654
1655         /* linked timeouts should have two refs once prep'ed */
1656         io_req_set_refcount(req);
1657         __io_req_set_refcount(req->link, 2);
1658         return req->link;
1659 }
1660
1661 static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
1662 {
1663         if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
1664                 return NULL;
1665         return __io_prep_linked_timeout(req);
1666 }
1667
1668 static void io_prep_async_work(struct io_kiocb *req)
1669 {
1670         const struct io_op_def *def = &io_op_defs[req->opcode];
1671         struct io_ring_ctx *ctx = req->ctx;
1672
1673         if (!(req->flags & REQ_F_CREDS)) {
1674                 req->flags |= REQ_F_CREDS;
1675                 req->creds = get_current_cred();
1676         }
1677
1678         req->work.list.next = NULL;
1679         req->work.flags = 0;
1680         if (req->flags & REQ_F_FORCE_ASYNC)
1681                 req->work.flags |= IO_WQ_WORK_CONCURRENT;
1682
1683         if (req->flags & REQ_F_ISREG) {
1684                 if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
1685                         io_wq_hash_work(&req->work, file_inode(req->file));
1686         } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
1687                 if (def->unbound_nonreg_file)
1688                         req->work.flags |= IO_WQ_WORK_UNBOUND;
1689         }
1690 }
1691
1692 static void io_prep_async_link(struct io_kiocb *req)
1693 {
1694         struct io_kiocb *cur;
1695
1696         if (req->flags & REQ_F_LINK_TIMEOUT) {
1697                 struct io_ring_ctx *ctx = req->ctx;
1698
1699                 spin_lock_irq(&ctx->timeout_lock);
1700                 io_for_each_link(cur, req)
1701                         io_prep_async_work(cur);
1702                 spin_unlock_irq(&ctx->timeout_lock);
1703         } else {
1704                 io_for_each_link(cur, req)
1705                         io_prep_async_work(cur);
1706         }
1707 }
1708
1709 static inline void io_req_add_compl_list(struct io_kiocb *req)
1710 {
1711         struct io_submit_state *state = &req->ctx->submit_state;
1712
1713         if (!(req->flags & REQ_F_CQE_SKIP))
1714                 state->flush_cqes = true;
1715         wq_list_add_tail(&req->comp_list, &state->compl_reqs);
1716 }
1717
1718 static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
1719 {
1720         struct io_ring_ctx *ctx = req->ctx;
1721         struct io_kiocb *link = io_prep_linked_timeout(req);
1722         struct io_uring_task *tctx = req->task->io_uring;
1723
1724         BUG_ON(!tctx);
1725         BUG_ON(!tctx->io_wq);
1726
1727         /* init ->work of the whole link before punting */
1728         io_prep_async_link(req);
1729
1730         /*
1731          * Not expected to happen, but if we do have a bug where this _can_
1732          * happen, catch it here and ensure the request is marked as
1733          * canceled. That will make io-wq go through the usual work cancel
1734          * procedure rather than attempt to run this request (or create a new
1735          * worker for it).
1736          */
1737         if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
1738                 req->work.flags |= IO_WQ_WORK_CANCEL;
1739
1740         trace_io_uring_queue_async_work(ctx, req, req->cqe.user_data, req->opcode, req->flags,
1741                                         &req->work, io_wq_is_hashed(&req->work));
1742         io_wq_enqueue(tctx->io_wq, &req->work);
1743         if (link)
1744                 io_queue_linked_timeout(link);
1745 }
1746
1747 static void io_kill_timeout(struct io_kiocb *req, int status)
1748         __must_hold(&req->ctx->completion_lock)
1749         __must_hold(&req->ctx->timeout_lock)
1750 {
1751         struct io_timeout_data *io = req->async_data;
1752
1753         if (hrtimer_try_to_cancel(&io->timer) != -1) {
1754                 if (status)
1755                         req_set_fail(req);
1756                 atomic_set(&req->ctx->cq_timeouts,
1757                         atomic_read(&req->ctx->cq_timeouts) + 1);
1758                 list_del_init(&req->timeout.list);
1759                 io_fill_cqe_req(req, status, 0);
1760                 io_put_req_deferred(req);
1761         }
1762 }
1763
1764 static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
1765 {
1766         while (!list_empty(&ctx->defer_list)) {
1767                 struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
1768                                                 struct io_defer_entry, list);
1769
1770                 if (req_need_defer(de->req, de->seq))
1771                         break;
1772                 list_del_init(&de->list);
1773                 io_req_task_queue(de->req);
1774                 kfree(de);
1775         }
1776 }
1777
1778 static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
1779         __must_hold(&ctx->completion_lock)
1780 {
1781         u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
1782         struct io_kiocb *req, *tmp;
1783
1784         spin_lock_irq(&ctx->timeout_lock);
1785         list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
1786                 u32 events_needed, events_got;
1787
1788                 if (io_is_timeout_noseq(req))
1789                         break;
1790
1791                 /*
1792                  * Since seq can easily wrap around over time, subtract
1793                  * the last seq at which timeouts were flushed before comparing.
1794                  * Assuming not more than 2^31-1 events have happened since,
1795                  * these subtractions won't have wrapped, so we can check if
1796                  * target is in [last_seq, current_seq] by comparing the two.
1797                  */
1798                 events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
1799                 events_got = seq - ctx->cq_last_tm_flush;
1800                 if (events_got < events_needed)
1801                         break;
1802
1803                 io_kill_timeout(req, 0);
1804         }
1805         ctx->cq_last_tm_flush = seq;
1806         spin_unlock_irq(&ctx->timeout_lock);
1807 }
1808
1809 static inline void io_commit_cqring(struct io_ring_ctx *ctx)
1810 {
1811         /* order cqe stores with ring update */
1812         smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
1813 }
1814
1815 static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
1816 {
1817         if (ctx->off_timeout_used || ctx->drain_active) {
1818                 spin_lock(&ctx->completion_lock);
1819                 if (ctx->off_timeout_used)
1820                         io_flush_timeouts(ctx);
1821                 if (ctx->drain_active)
1822                         io_queue_deferred(ctx);
1823                 io_commit_cqring(ctx);
1824                 spin_unlock(&ctx->completion_lock);
1825         }
1826         if (ctx->has_evfd)
1827                 io_eventfd_signal(ctx);
1828 }
1829
1830 static inline bool io_sqring_full(struct io_ring_ctx *ctx)
1831 {
1832         struct io_rings *r = ctx->rings;
1833
1834         return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
1835 }
1836
1837 static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
1838 {
1839         return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
1840 }
1841
1842 static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
1843 {
1844         struct io_rings *rings = ctx->rings;
1845         unsigned tail, mask = ctx->cq_entries - 1;
1846
1847         /*
1848          * writes to the cq entry need to come after reading head; the
1849          * control dependency is enough as we're using WRITE_ONCE to
1850          * fill the cq entry
1851          */
1852         if (__io_cqring_events(ctx) == ctx->cq_entries)
1853                 return NULL;
1854
1855         tail = ctx->cached_cq_tail++;
1856         return &rings->cqes[tail & mask];
1857 }
1858
1859 static void io_eventfd_signal(struct io_ring_ctx *ctx)
1860 {
1861         struct io_ev_fd *ev_fd;
1862
1863         rcu_read_lock();
1864         /*
1865          * rcu_dereference ctx->io_ev_fd once and use it for both for checking
1866          * and eventfd_signal
1867          */
1868         ev_fd = rcu_dereference(ctx->io_ev_fd);
1869
1870         /*
1871          * Check again if ev_fd exists incase an io_eventfd_unregister call
1872          * completed between the NULL check of ctx->io_ev_fd at the start of
1873          * the function and rcu_read_lock.
1874          */
1875         if (unlikely(!ev_fd))
1876                 goto out;
1877         if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
1878                 goto out;
1879
1880         if (!ev_fd->eventfd_async || io_wq_current_is_worker())
1881                 eventfd_signal(ev_fd->cq_ev_fd, 1);
1882 out:
1883         rcu_read_unlock();
1884 }
1885
1886 static inline void io_cqring_wake(struct io_ring_ctx *ctx)
1887 {
1888         /*
1889          * wake_up_all() may seem excessive, but io_wake_function() and
1890          * io_should_wake() handle the termination of the loop and only
1891          * wake as many waiters as we need to.
1892          */
1893         if (wq_has_sleeper(&ctx->cq_wait))
1894                 wake_up_all(&ctx->cq_wait);
1895 }
1896
1897 /*
1898  * This should only get called when at least one event has been posted.
1899  * Some applications rely on the eventfd notification count only changing
1900  * IFF a new CQE has been added to the CQ ring. There's no depedency on
1901  * 1:1 relationship between how many times this function is called (and
1902  * hence the eventfd count) and number of CQEs posted to the CQ ring.
1903  */
1904 static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1905 {
1906         if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
1907                      ctx->has_evfd))
1908                 __io_commit_cqring_flush(ctx);
1909
1910         io_cqring_wake(ctx);
1911 }
1912
1913 static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
1914 {
1915         if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
1916                      ctx->has_evfd))
1917                 __io_commit_cqring_flush(ctx);
1918
1919         if (ctx->flags & IORING_SETUP_SQPOLL)
1920                 io_cqring_wake(ctx);
1921 }
1922
1923 /* Returns true if there are no backlogged entries after the flush */
1924 static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
1925 {
1926         bool all_flushed, posted;
1927
1928         if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
1929                 return false;
1930
1931         posted = false;
1932         spin_lock(&ctx->completion_lock);
1933         while (!list_empty(&ctx->cq_overflow_list)) {
1934                 struct io_uring_cqe *cqe = io_get_cqe(ctx);
1935                 struct io_overflow_cqe *ocqe;
1936
1937                 if (!cqe && !force)
1938                         break;
1939                 ocqe = list_first_entry(&ctx->cq_overflow_list,
1940                                         struct io_overflow_cqe, list);
1941                 if (cqe)
1942                         memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
1943                 else
1944                         io_account_cq_overflow(ctx);
1945
1946                 posted = true;
1947                 list_del(&ocqe->list);
1948                 kfree(ocqe);
1949         }
1950
1951         all_flushed = list_empty(&ctx->cq_overflow_list);
1952         if (all_flushed) {
1953                 clear_bit(0, &ctx->check_cq_overflow);
1954                 WRITE_ONCE(ctx->rings->sq_flags,
1955                            ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
1956         }
1957
1958         io_commit_cqring(ctx);
1959         spin_unlock(&ctx->completion_lock);
1960         if (posted)
1961                 io_cqring_ev_posted(ctx);
1962         return all_flushed;
1963 }
1964
1965 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
1966 {
1967         bool ret = true;
1968
1969         if (test_bit(0, &ctx->check_cq_overflow)) {
1970                 /* iopoll syncs against uring_lock, not completion_lock */
1971                 if (ctx->flags & IORING_SETUP_IOPOLL)
1972                         mutex_lock(&ctx->uring_lock);
1973                 ret = __io_cqring_overflow_flush(ctx, false);
1974                 if (ctx->flags & IORING_SETUP_IOPOLL)
1975                         mutex_unlock(&ctx->uring_lock);
1976         }
1977
1978         return ret;
1979 }
1980
1981 static void __io_put_task(struct task_struct *task, int nr)
1982 {
1983         struct io_uring_task *tctx = task->io_uring;
1984
1985         percpu_counter_sub(&tctx->inflight, nr);
1986         if (unlikely(atomic_read(&tctx->in_idle)))
1987                 wake_up(&tctx->wait);
1988         put_task_struct_many(task, nr);
1989 }
1990
1991 /* must to be called somewhat shortly after putting a request */
1992 static inline void io_put_task(struct task_struct *task, int nr)
1993 {
1994         if (likely(task == current))
1995                 task->io_uring->cached_refs += nr;
1996         else
1997                 __io_put_task(task, nr);
1998 }
1999
2000 static void io_task_refs_refill(struct io_uring_task *tctx)
2001 {
2002         unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
2003
2004         percpu_counter_add(&tctx->inflight, refill);
2005         refcount_add(refill, &current->usage);
2006         tctx->cached_refs += refill;
2007 }
2008
2009 static inline void io_get_task_refs(int nr)
2010 {
2011         struct io_uring_task *tctx = current->io_uring;
2012
2013         tctx->cached_refs -= nr;
2014         if (unlikely(tctx->cached_refs < 0))
2015                 io_task_refs_refill(tctx);
2016 }
2017
2018 static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
2019 {
2020         struct io_uring_task *tctx = task->io_uring;
2021         unsigned int refs = tctx->cached_refs;
2022
2023         if (refs) {
2024                 tctx->cached_refs = 0;
2025                 percpu_counter_sub(&tctx->inflight, refs);
2026                 put_task_struct_many(task, refs);
2027         }
2028 }
2029
2030 static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
2031                                      s32 res, u32 cflags)
2032 {
2033         struct io_overflow_cqe *ocqe;
2034
2035         ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
2036         if (!ocqe) {
2037                 /*
2038                  * If we're in ring overflow flush mode, or in task cancel mode,
2039                  * or cannot allocate an overflow entry, then we need to drop it
2040                  * on the floor.
2041                  */
2042                 io_account_cq_overflow(ctx);
2043                 return false;
2044         }
2045         if (list_empty(&ctx->cq_overflow_list)) {
2046                 set_bit(0, &ctx->check_cq_overflow);
2047                 WRITE_ONCE(ctx->rings->sq_flags,
2048                            ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
2049
2050         }
2051         ocqe->cqe.user_data = user_data;
2052         ocqe->cqe.res = res;
2053         ocqe->cqe.flags = cflags;
2054         list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
2055         return true;
2056 }
2057
2058 static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
2059                                  s32 res, u32 cflags)
2060 {
2061         struct io_uring_cqe *cqe;
2062
2063         /*
2064          * If we can't get a cq entry, userspace overflowed the
2065          * submission (by quite a lot). Increment the overflow count in
2066          * the ring.
2067          */
2068         cqe = io_get_cqe(ctx);
2069         if (likely(cqe)) {
2070                 WRITE_ONCE(cqe->user_data, user_data);
2071                 WRITE_ONCE(cqe->res, res);
2072                 WRITE_ONCE(cqe->flags, cflags);
2073                 return true;
2074         }
2075         return io_cqring_event_overflow(ctx, user_data, res, cflags);
2076 }
2077
2078 static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx,
2079                                             struct io_kiocb *req)
2080 {
2081         struct io_uring_cqe *cqe;
2082
2083         trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
2084                                 req->cqe.res, req->cqe.flags);
2085
2086         /*
2087          * If we can't get a cq entry, userspace overflowed the
2088          * submission (by quite a lot). Increment the overflow count in
2089          * the ring.
2090          */
2091         cqe = io_get_cqe(ctx);
2092         if (likely(cqe)) {
2093                 memcpy(cqe, &req->cqe, sizeof(*cqe));
2094                 return true;
2095         }
2096         return io_cqring_event_overflow(ctx, req->cqe.user_data,
2097                                         req->cqe.res, req->cqe.flags);
2098 }
2099
2100 static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
2101 {
2102         trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags);
2103         return __io_fill_cqe(req->ctx, req->cqe.user_data, res, cflags);
2104 }
2105
2106 static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
2107 {
2108         if (!(req->flags & REQ_F_CQE_SKIP))
2109                 __io_fill_cqe_req(req, res, cflags);
2110 }
2111
2112 static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
2113                                      s32 res, u32 cflags)
2114 {
2115         ctx->cq_extra++;
2116         trace_io_uring_complete(ctx, NULL, user_data, res, cflags);
2117         return __io_fill_cqe(ctx, user_data, res, cflags);
2118 }
2119
2120 static void __io_req_complete_post(struct io_kiocb *req, s32 res,
2121                                    u32 cflags)
2122 {
2123         struct io_ring_ctx *ctx = req->ctx;
2124
2125         if (!(req->flags & REQ_F_CQE_SKIP))
2126                 __io_fill_cqe_req(req, res, cflags);
2127         /*
2128          * If we're the last reference to this request, add to our locked
2129          * free_list cache.
2130          */
2131         if (req_ref_put_and_test(req)) {
2132                 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
2133                         if (req->flags & IO_DISARM_MASK)
2134                                 io_disarm_next(req);
2135                         if (req->link) {
2136                                 io_req_task_queue(req->link);
2137                                 req->link = NULL;
2138                         }
2139                 }
2140                 io_req_put_rsrc(req, ctx);
2141                 /*
2142                  * Selected buffer deallocation in io_clean_op() assumes that
2143                  * we don't hold ->completion_lock. Clean them here to avoid
2144                  * deadlocks.
2145                  */
2146                 io_put_kbuf_comp(req);
2147                 io_dismantle_req(req);
2148                 io_put_task(req->task, 1);
2149                 wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
2150                 ctx->locked_free_nr++;
2151         }
2152 }
2153
2154 static void io_req_complete_post(struct io_kiocb *req, s32 res,
2155                                  u32 cflags)
2156 {
2157         struct io_ring_ctx *ctx = req->ctx;
2158
2159         spin_lock(&ctx->completion_lock);
2160         __io_req_complete_post(req, res, cflags);
2161         io_commit_cqring(ctx);
2162         spin_unlock(&ctx->completion_lock);
2163         io_cqring_ev_posted(ctx);
2164 }
2165
2166 static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
2167                                          u32 cflags)
2168 {
2169         req->cqe.res = res;
2170         req->cqe.flags = cflags;
2171         req->flags |= REQ_F_COMPLETE_INLINE;
2172 }
2173
2174 static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
2175                                      s32 res, u32 cflags)
2176 {
2177         if (issue_flags & IO_URING_F_COMPLETE_DEFER)
2178                 io_req_complete_state(req, res, cflags);
2179         else
2180                 io_req_complete_post(req, res, cflags);
2181 }
2182
2183 static inline void io_req_complete(struct io_kiocb *req, s32 res)
2184 {
2185         __io_req_complete(req, 0, res, 0);
2186 }
2187
2188 static void io_req_complete_failed(struct io_kiocb *req, s32 res)
2189 {
2190         req_set_fail(req);
2191         io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
2192 }
2193
2194 static void io_req_complete_fail_submit(struct io_kiocb *req)
2195 {
2196         /*
2197          * We don't submit, fail them all, for that replace hardlinks with
2198          * normal links. Extra REQ_F_LINK is tolerated.
2199          */
2200         req->flags &= ~REQ_F_HARDLINK;
2201         req->flags |= REQ_F_LINK;
2202         io_req_complete_failed(req, req->cqe.res);
2203 }
2204
2205 /*
2206  * Don't initialise the fields below on every allocation, but do that in
2207  * advance and keep them valid across allocations.
2208  */
2209 static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
2210 {
2211         req->ctx = ctx;
2212         req->link = NULL;
2213         req->async_data = NULL;
2214         /* not necessary, but safer to zero */
2215         req->cqe.res = 0;
2216 }
2217
2218 static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
2219                                         struct io_submit_state *state)
2220 {
2221         spin_lock(&ctx->completion_lock);
2222         wq_list_splice(&ctx->locked_free_list, &state->free_list);
2223         ctx->locked_free_nr = 0;
2224         spin_unlock(&ctx->completion_lock);
2225 }
2226
2227 /* Returns true IFF there are requests in the cache */
2228 static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
2229 {
2230         struct io_submit_state *state = &ctx->submit_state;
2231
2232         /*
2233          * If we have more than a batch's worth of requests in our IRQ side
2234          * locked cache, grab the lock and move them over to our submission
2235          * side cache.
2236          */
2237         if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
2238                 io_flush_cached_locked_reqs(ctx, state);
2239         return !!state->free_list.next;
2240 }
2241
2242 /*
2243  * A request might get retired back into the request caches even before opcode
2244  * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
2245  * Because of that, io_alloc_req() should be called only under ->uring_lock
2246  * and with extra caution to not get a request that is still worked on.
2247  */
2248 static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
2249         __must_hold(&ctx->uring_lock)
2250 {
2251         struct io_submit_state *state = &ctx->submit_state;
2252         gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
2253         void *reqs[IO_REQ_ALLOC_BATCH];
2254         struct io_kiocb *req;
2255         int ret, i;
2256
2257         if (likely(state->free_list.next || io_flush_cached_reqs(ctx)))
2258                 return true;
2259
2260         ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
2261
2262         /*
2263          * Bulk alloc is all-or-nothing. If we fail to get a batch,
2264          * retry single alloc to be on the safe side.
2265          */
2266         if (unlikely(ret <= 0)) {
2267                 reqs[0] = kmem_cache_alloc(req_cachep, gfp);
2268                 if (!reqs[0])
2269                         return false;
2270                 ret = 1;
2271         }
2272
2273         percpu_ref_get_many(&ctx->refs, ret);
2274         for (i = 0; i < ret; i++) {
2275                 req = reqs[i];
2276
2277                 io_preinit_req(req, ctx);
2278                 wq_stack_add_head(&req->comp_list, &state->free_list);
2279         }
2280         return true;
2281 }
2282
2283 static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
2284 {
2285         if (unlikely(!ctx->submit_state.free_list.next))
2286                 return __io_alloc_req_refill(ctx);
2287         return true;
2288 }
2289
2290 static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
2291 {
2292         struct io_wq_work_node *node;
2293
2294         node = wq_stack_extract(&ctx->submit_state.free_list);
2295         return container_of(node, struct io_kiocb, comp_list);
2296 }
2297
/* fput() wrapper that accepts a NULL @file. */
static inline void io_put_file(struct file *file)
{
	if (!file)
		return;
	fput(file);
}
2303
2304 static inline void io_dismantle_req(struct io_kiocb *req)
2305 {
2306         unsigned int flags = req->flags;
2307
2308         if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
2309                 io_clean_op(req);
2310         if (!(flags & REQ_F_FIXED_FILE))
2311                 io_put_file(req->file);
2312 }
2313
2314 static __cold void __io_free_req(struct io_kiocb *req)
2315 {
2316         struct io_ring_ctx *ctx = req->ctx;
2317
2318         io_req_put_rsrc(req, ctx);
2319         io_dismantle_req(req);
2320         io_put_task(req->task, 1);
2321
2322         spin_lock(&ctx->completion_lock);
2323         wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
2324         ctx->locked_free_nr++;
2325         spin_unlock(&ctx->completion_lock);
2326 }
2327
2328 static inline void io_remove_next_linked(struct io_kiocb *req)
2329 {
2330         struct io_kiocb *nxt = req->link;
2331
2332         req->link = nxt->link;
2333         nxt->link = NULL;
2334 }
2335
2336 static bool io_kill_linked_timeout(struct io_kiocb *req)
2337         __must_hold(&req->ctx->completion_lock)
2338         __must_hold(&req->ctx->timeout_lock)
2339 {
2340         struct io_kiocb *link = req->link;
2341
2342         if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
2343                 struct io_timeout_data *io = link->async_data;
2344
2345                 io_remove_next_linked(req);
2346                 link->timeout.head = NULL;
2347                 if (hrtimer_try_to_cancel(&io->timer) != -1) {
2348                         list_del(&link->timeout.list);
2349                         /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */
2350                         io_fill_cqe_req(link, -ECANCELED, 0);
2351                         io_put_req_deferred(link);
2352                         return true;
2353                 }
2354         }
2355         return false;
2356 }
2357
2358 static void io_fail_links(struct io_kiocb *req)
2359         __must_hold(&req->ctx->completion_lock)
2360 {
2361         struct io_kiocb *nxt, *link = req->link;
2362         bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES;
2363
2364         req->link = NULL;
2365         while (link) {
2366                 long res = -ECANCELED;
2367
2368                 if (link->flags & REQ_F_FAIL)
2369                         res = link->cqe.res;
2370
2371                 nxt = link->link;
2372                 link->link = NULL;
2373
2374                 trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data,
2375                                         req->opcode, link);
2376
2377                 if (!ignore_cqes) {
2378                         link->flags &= ~REQ_F_CQE_SKIP;
2379                         io_fill_cqe_req(link, res, 0);
2380                 }
2381                 io_put_req_deferred(link);
2382                 link = nxt;
2383         }
2384 }
2385
2386 static bool io_disarm_next(struct io_kiocb *req)
2387         __must_hold(&req->ctx->completion_lock)
2388 {
2389         bool posted = false;
2390
2391         if (req->flags & REQ_F_ARM_LTIMEOUT) {
2392                 struct io_kiocb *link = req->link;
2393
2394                 req->flags &= ~REQ_F_ARM_LTIMEOUT;
2395                 if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
2396                         io_remove_next_linked(req);
2397                         /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */
2398                         io_fill_cqe_req(link, -ECANCELED, 0);
2399                         io_put_req_deferred(link);
2400                         posted = true;
2401                 }
2402         } else if (req->flags & REQ_F_LINK_TIMEOUT) {
2403                 struct io_ring_ctx *ctx = req->ctx;
2404
2405                 spin_lock_irq(&ctx->timeout_lock);
2406                 posted = io_kill_linked_timeout(req);
2407                 spin_unlock_irq(&ctx->timeout_lock);
2408         }
2409         if (unlikely((req->flags & REQ_F_FAIL) &&
2410                      !(req->flags & REQ_F_HARDLINK))) {
2411                 posted |= (req->link != NULL);
2412                 io_fail_links(req);
2413         }
2414         return posted;
2415 }
2416
2417 static void __io_req_find_next_prep(struct io_kiocb *req)
2418 {
2419         struct io_ring_ctx *ctx = req->ctx;
2420         bool posted;
2421
2422         spin_lock(&ctx->completion_lock);
2423         posted = io_disarm_next(req);
2424         io_commit_cqring(ctx);
2425         spin_unlock(&ctx->completion_lock);
2426         if (posted)
2427                 io_cqring_ev_posted(ctx);
2428 }
2429
2430 static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
2431 {
2432         struct io_kiocb *nxt;
2433
2434         /*
2435          * If LINK is set, we have dependent requests in this chain. If we
2436          * didn't fail this request, queue the first one up, moving any other
2437          * dependencies to the next request. In case of failure, fail the rest
2438          * of the chain.
2439          */
2440         if (unlikely(req->flags & IO_DISARM_MASK))
2441                 __io_req_find_next_prep(req);
2442         nxt = req->link;
2443         req->link = NULL;
2444         return nxt;
2445 }
2446
     /*
      * Drop one percpu reference on @ctx. If *@locked is set, batched completions
      * are flushed and ->uring_lock is released first, and *@locked is cleared.
      * A NULL @ctx is a no-op.
      */
2447 static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
2448 {
2449         if (!ctx)
2450                 return;
2451         if (*locked) {
2452                 io_submit_flush_completions(ctx);
2453                 mutex_unlock(&ctx->uring_lock);
2454                 *locked = false;
2455         }
2456         percpu_ref_put(&ctx->refs);
2457 }
2458
     /*
      * Commit the CQ ring, release ->completion_lock and then call
      * io_cqring_ev_posted(). The lock is released here, so it must be held on
      * entry.
      */
2459 static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx)
2460 {
2461         io_commit_cqring(ctx);
2462         spin_unlock(&ctx->completion_lock);
2463         io_cqring_ev_posted(ctx);
2464 }
2465
     /*
      * Run the chain of task_work items starting at @node. *@ctx tracks the ring
      * of the request being processed and *@uring_locked whether its ->uring_lock
      * is held; both are carried across calls by the caller.
      *
      * With ->uring_lock held, each request's ->io_task_work.func is invoked.
      * Otherwise the request is completed directly with __io_req_complete_post()
      * under ->completion_lock, which stays held across consecutive requests of
      * the same ring and is dropped through ctx_commit_and_unlock().
      */
2466 static void handle_prev_tw_list(struct io_wq_work_node *node,
2467                                 struct io_ring_ctx **ctx, bool *uring_locked)
2468 {
             /* continuing on a ring whose ->uring_lock we could not get earlier */
2469         if (*ctx && !*uring_locked)
2470                 spin_lock(&(*ctx)->completion_lock);
2471
2472         do {
2473                 struct io_wq_work_node *next = node->next;
2474                 struct io_kiocb *req = container_of(node, struct io_kiocb,
2475                                                     io_task_work.node);
2476
2477                 prefetch(container_of(next, struct io_kiocb, io_task_work.node));
2478
2479                 if (req->ctx != *ctx) {
                             /* switching rings: finish up with the previous one */
2480                         if (unlikely(!*uring_locked && *ctx))
2481                                 ctx_commit_and_unlock(*ctx);
2482
2483                         ctx_flush_and_put(*ctx, uring_locked);
2484                         *ctx = req->ctx;
2485                         /* if not contended, grab and improve batching */
2486                         *uring_locked = mutex_trylock(&(*ctx)->uring_lock);
2487                         percpu_ref_get(&(*ctx)->refs);
2488                         if (unlikely(!*uring_locked))
2489                                 spin_lock(&(*ctx)->completion_lock);
2490                 }
2491                 if (likely(*uring_locked))
2492                         req->io_task_work.func(req, uring_locked);
2493                 else
2494                         __io_req_complete_post(req, req->cqe.res,
2495                                                 io_put_kbuf_comp(req));
2496                 node = next;
2497         } while (node);
2498
             /* ->completion_lock is still held in the non-uring_locked case */
2499         if (unlikely(!*uring_locked))
2500                 ctx_commit_and_unlock(*ctx);
2501 }
2502
     /*
      * Run ->io_task_work.func for every request on the chain starting at @node.
      * Whenever the ring changes, the previous ring is flushed and released via
      * ctx_flush_and_put(), a reference is taken on the new one and its
      * ->uring_lock is tried; the outcome is passed to the callback through
      * *@locked. The last ring is left referenced (and possibly locked) for the
      * caller to release.
      */
2503 static void handle_tw_list(struct io_wq_work_node *node,
2504                            struct io_ring_ctx **ctx, bool *locked)
2505 {
2506         do {
2507                 struct io_wq_work_node *next = node->next;
2508                 struct io_kiocb *req = container_of(node, struct io_kiocb,
2509                                                     io_task_work.node);
2510
2511                 prefetch(container_of(next, struct io_kiocb, io_task_work.node));
2512
2513                 if (req->ctx != *ctx) {
2514                         ctx_flush_and_put(*ctx, locked);
2515                         *ctx = req->ctx;
2516                         /* if not contended, grab and improve batching */
2517                         *locked = mutex_trylock(&(*ctx)->uring_lock);
2518                         percpu_ref_get(&(*ctx)->refs);
2519                 }
2520                 req->io_task_work.func(req, locked);
2521                 node = next;
2522         } while (node);
2523 }
2524
     /*
      * task_work callback for an io_uring_task (@cb is embedded in it).
      *
      * Repeatedly takes both ->prior_task_list and ->task_list under ->task_lock,
      * resets them, and processes the prior list before the normal one. When
      * both lists are found empty, ->task_running is cleared under the same lock
      * and the loop ends. The ring reference (and ->uring_lock, if held) carried
      * in @ctx/@uring_locked is released once at the end.
      */
2525 static void tctx_task_work(struct callback_head *cb)
2526 {
2527         bool uring_locked = false;
2528         struct io_ring_ctx *ctx = NULL;
2529         struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
2530                                                   task_work);
2531
2532         while (1) {
2533                 struct io_wq_work_node *node1, *node2;
2534
2535                 spin_lock_irq(&tctx->task_lock);
2536                 node1 = tctx->prior_task_list.first;
2537                 node2 = tctx->task_list.first;
2538                 INIT_WQ_LIST(&tctx->task_list);
2539                 INIT_WQ_LIST(&tctx->prior_task_list);
2540                 if (!node2 && !node1)
2541                         tctx->task_running = false;
2542                 spin_unlock_irq(&tctx->task_lock);
2543                 if (!node2 && !node1)
2544                         break;
2545
2546                 if (node1)
2547                         handle_prev_tw_list(node1, &ctx, &uring_locked);
2548                 if (node2)
2549                         handle_tw_list(node2, &ctx, &uring_locked);
2550                 cond_resched();
2551
                     /*
                      * Unlocked peek at the lists: if nothing new was queued
                      * meanwhile, flush batched completions now while
                      * ->uring_lock is held.
                      */
2552                 if (!tctx->task_list.first &&
2553                     !tctx->prior_task_list.first && uring_locked)
2554                         io_submit_flush_completions(ctx);
2555         }
2556
2557         ctx_flush_and_put(ctx, &uring_locked);
2558
2559         /* relaxed read is enough as only the task itself sets ->in_idle */
2560         if (unlikely(atomic_read(&tctx->in_idle)))
2561                 io_uring_drop_tctx_refs(current);
2562 }
2563
     /*
      * Queue @req on its task's io_uring task_work lists and make sure
      * tctx_task_work() gets to run.
      *
      * @priority selects ->prior_task_list instead of ->task_list. Only the
      * caller that flips ->task_running from false to true goes on to call
      * task_work_add(). If that fails, ->task_running is reset and everything
      * queued so far is moved to the per-ring fallback list, with the ring's
      * fallback work scheduled when llist_add() returns true.
      */
2564 static void io_req_task_work_add(struct io_kiocb *req, bool priority)
2565 {
2566         struct task_struct *tsk = req->task;
2567         struct io_uring_task *tctx = tsk->io_uring;
2568         enum task_work_notify_mode notify;
2569         struct io_wq_work_node *node;
2570         unsigned long flags;
2571         bool running;
2572
2573         WARN_ON_ONCE(!tctx);
2574
2575         io_drop_inflight_file(req);
2576
2577         spin_lock_irqsave(&tctx->task_lock, flags);
2578         if (priority)
2579                 wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list);
2580         else
2581                 wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
2582         running = tctx->task_running;
2583         if (!running)
2584                 tctx->task_running = true;
2585         spin_unlock_irqrestore(&tctx->task_lock, flags);
2586
2587         /* task_work already pending, we're done */
2588         if (running)
2589                 return;
2590
2591         /*
2592          * SQPOLL kernel thread doesn't need notification, just a wakeup. For
2593          * all other cases, use TWA_SIGNAL unconditionally to ensure we're
2594          * processing task_work. There's no reliable way to tell if TWA_RESUME
2595          * will do the job.
2596          */
2597         notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
2598         if (likely(!task_work_add(tsk, &tctx->task_work, notify))) {
2599                 if (notify == TWA_NONE)
2600                         wake_up_process(tsk);
2601                 return;
2602         }
2603
             /* task_work_add() failed: take back everything that is queued */
2604         spin_lock_irqsave(&tctx->task_lock, flags);
2605         tctx->task_running = false;
2606         node = wq_list_merge(&tctx->prior_task_list, &tctx->task_list);
2607         spin_unlock_irqrestore(&tctx->task_lock, flags);
2608
2609         while (node) {
2610                 req = container_of(node, struct io_kiocb, io_task_work.node);
                     /* advance first: the request is about to sit on another list */
2611                 node = node->next;
2612                 if (llist_add(&req->io_task_work.fallback_node,
2613                               &req->ctx->fallback_llist))
2614                         schedule_delayed_work(&req->ctx->fallback_work, 1);
2615         }
2616 }
2617
     /*
      * task_work callback: fail @req with the result previously stored in
      * req->cqe.res, after io_tw_lock() has been called for its ring.
      */
2618 static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
2619 {
2620         struct io_ring_ctx *ctx = req->ctx;
2621
2622         /* not needed for normal modes, but SQPOLL depends on it */
2623         io_tw_lock(ctx, locked);
2624         io_req_complete_failed(req, req->cqe.res);
2625 }
2626
     /*
      * task_work callback: issue @req via __io_queue_sqe() after io_tw_lock().
      * If the task has PF_EXITING set the request is failed with -EFAULT
      * instead.
      */
2627 static void io_req_task_submit(struct io_kiocb *req, bool *locked)
2628 {
2629         struct io_ring_ctx *ctx = req->ctx;
2630
2631         io_tw_lock(ctx, locked);
2632         /* req->task == current here, checking PF_EXITING is safe */
2633         if (likely(!(req->task->flags & PF_EXITING)))
2634                 __io_queue_sqe(req);
2635         else
2636                 io_req_complete_failed(req, -EFAULT);
2637 }
2638
     /*
      * Schedule @req to be failed from task_work with result @ret. The result
      * and the io_req_task_cancel callback are stored before the request is
      * queued (non-priority).
      */
2639 static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
2640 {
2641         req->cqe.res = ret;
2642         req->io_task_work.func = io_req_task_cancel;
2643         io_req_task_work_add(req, false);
2644 }
2645
     /*
      * Schedule @req to be submitted from task_work: io_req_task_submit() is
      * installed as the callback and the request is queued (non-priority).
      */
2646 static void io_req_task_queue(struct io_kiocb *req)
2647 {
2648         req->io_task_work.func = io_req_task_submit;
2649         io_req_task_work_add(req, false);
2650 }
2651
     /*
      * Schedule @req for reissue: io_queue_async_work() is installed as the
      * task_work callback and the request is queued (non-priority).
      */
2652 static void io_req_task_queue_reissue(struct io_kiocb *req)
2653 {
2654         req->io_task_work.func = io_queue_async_work;
2655         io_req_task_work_add(req, false);
2656 }
2657
     /*
      * Detach the request linked after @req with io_req_find_next() and, if
      * there is one, hand it to io_req_task_queue().
      */
2658 static void io_queue_next(struct io_kiocb *req)
2659 {
2660         struct io_kiocb *nxt = io_req_find_next(req);
2661
2662         if (nxt)
2663                 io_req_task_queue(nxt);
2664 }
2665
     /*
      * Free @req. Its linked successor, if any, is queued through
      * io_queue_next() before __io_free_req() is called.
      */
2666 static void io_free_req(struct io_kiocb *req)
2667 {
2668         io_queue_next(req);
2669         __io_free_req(req);
2670 }
2671
     /* task_work callback wrapper around io_free_req(); @locked is not used. */
2672 static void io_free_req_work(struct io_kiocb *req, bool *locked)
2673 {
2674         io_free_req(req);
2675 }
2676
     /*
      * Release every request on the ->comp_list chain starting at @node and push
      * each one onto ctx->submit_state.free_list.
      *
      * Requests flagged with IO_REQ_CLEAN_SLOW_FLAGS get extra handling first:
      * a refcounted request is skipped unless this drops its last reference, a
      * polled request has its async_poll entry added to ctx->apoll_cache, linked
      * requests have their successor queued and io_clean_op() is run when
      * IO_REQ_CLEAN_FLAGS is set. Task references are dropped in batches, one
      * io_put_task() call per run of requests sharing the same task.
      *
      * @node must not be NULL: it is dereferenced before the loop condition is
      * first checked.
      */
2677 static void io_free_batch_list(struct io_ring_ctx *ctx,
2678                                 struct io_wq_work_node *node)
2679         __must_hold(&ctx->uring_lock)
2680 {
2681         struct task_struct *task = NULL;
2682         int task_refs = 0;
2683
2684         do {
2685                 struct io_kiocb *req = container_of(node, struct io_kiocb,
2686                                                     comp_list);
2687
2688                 if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
2689                         if (req->flags & REQ_F_REFCOUNT) {
                                     /*
                                      * Advance before the put: "continue" jumps
                                      * to the while (node) test below.
                                      */
2690                                 node = req->comp_list.next;
2691                                 if (!req_ref_put_and_test(req))
2692                                         continue;
2693                         }
2694                         if ((req->flags & REQ_F_POLLED) && req->apoll) {
2695                                 struct async_poll *apoll = req->apoll;
2696
2697                                 if (apoll->double_poll)
2698                                         kfree(apoll->double_poll);
2699                                 list_add(&apoll->poll.wait.entry,
2700                                                 &ctx->apoll_cache);
2701                                 req->flags &= ~REQ_F_POLLED;
2702                         }
2703                         if (req->flags & (REQ_F_LINK|REQ_F_HARDLINK))
2704                                 io_queue_next(req);
2705                         if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
2706                                 io_clean_op(req);
2707                 }
2708                 if (!(req->flags & REQ_F_FIXED_FILE))
2709                         io_put_file(req->file);
2710
2711                 io_req_put_rsrc_locked(req, ctx);
2712
                     /* task changed: drop the refs accumulated for the old one */
2713                 if (req->task != task) {
2714                         if (task)
2715                                 io_put_task(task, task_refs);
2716                         task = req->task;
2717                         task_refs = 0;
2718                 }
2719                 task_refs++;
2720                 node = req->comp_list.next;
2721                 wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
2722         } while (node);
2723
2724         if (task)
2725                 io_put_task(task, task_refs);
2726 }
2727
     /*
      * Flush the completions batched in ctx->submit_state.compl_reqs.
      *
      * If ->flush_cqes is set, a CQE is filled under ->completion_lock for every
      * request that does not carry REQ_F_CQE_SKIP, then the ring is committed and
      * ->flush_cqes is cleared. In all cases the requests are then released with
      * io_free_batch_list() and the list is reinitialised.
      */
2728 static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
2729         __must_hold(&ctx->uring_lock)
2730 {
2731         struct io_wq_work_node *node, *prev;
2732         struct io_submit_state *state = &ctx->submit_state;
2733
2734         if (state->flush_cqes) {
2735                 spin_lock(&ctx->completion_lock);
2736                 wq_list_for_each(node, prev, &state->compl_reqs) {
2737                         struct io_kiocb *req = container_of(node, struct io_kiocb,
2738                                                     comp_list);
2739
2740                         if (!(req->flags & REQ_F_CQE_SKIP))
2741                                 __io_fill_cqe_req_filled(ctx, req);
2742                 }
2743
2744                 io_commit_cqring(ctx);
2745                 spin_unlock(&ctx->completion_lock);
2746                 io_cqring_ev_posted(ctx);
2747                 state->flush_cqes = false;
2748         }
2749
2750         io_free_batch_list(ctx, state->compl_reqs.first);
2751         INIT_WQ_LIST(&state->compl_reqs);
2752 }
2753
2754 /*
2755  * Put one reference to @req. If it was the last one, free the request and
2756  * return the request linked after it (if any); otherwise return NULL.
2757  */
2758 static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
2759 {
2760         struct io_kiocb *next_req = NULL;
2761
             /* someone else still holds a reference, nothing to hand back */
2762         if (!req_ref_put_and_test(req))
2763                 return NULL;
             /* pick up the successor before the request goes away */
2764         if (unlikely(req->flags & (REQ_F_LINK|REQ_F_HARDLINK)))
2765                 next_req = io_req_find_next(req);
2766         __io_free_req(req);
2767         return next_req;
2768 }
2769
     /* Put one reference to @req and free it via io_free_req() if it was the last. */
2770 static inline void io_put_req(struct io_kiocb *req)
2771 {
2772         if (req_ref_put_and_test(req))
2773                 io_free_req(req);
2774 }
2775
     /*
      * Put one reference to @req. If it was the last one, the free is not done
      * here but queued as task_work (io_free_req_work, non-priority).
      */
2776 static inline void io_put_req_deferred(struct io_kiocb *req)
2777 {
2778         if (req_ref_put_and_test(req)) {
2779                 req->io_task_work.func = io_free_req_work;
2780                 io_req_task_work_add(req, false);
2781         }
2782 }
2783
     /*
      * Return __io_cqring_events() for @ctx, with an smp_rmb() issued before it
      * is evaluated.
      */
2784 static unsigned io_cqring_events(struct io_ring_ctx *ctx)
2785 {
2786         /* See comment at the top of this file */
2787         smp_rmb();
2788         return __io_cqring_events(ctx);
2789 }
2790
     /*
      * Return the distance between the shared SQ ring tail (read with acquire
      * semantics) and the kernel's cached SQ head. The subtraction is done on
      * unsigned values, so it is performed modulo the counter width.
      */
2791 static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
2792 {
2793         struct io_rings *rings = ctx->rings;
2794
2795         /* make sure SQ entry isn't read before tail */
2796         return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
2797 }
2798
     /*
      * Run pending task_work for the current task, if there is any indication
      * of it. Returns false when neither TIF_NOTIFY_SIGNAL is set nor task_work
      * is pending; otherwise the task state is set to TASK_RUNNING, the notify
      * signal is cleared, pending work (if any) is run, and true is returned.
      */
2799 static inline bool io_run_task_work(void)
2800 {
2801         if (!test_thread_flag(TIF_NOTIFY_SIGNAL) &&
2802             !task_work_pending(current))
2803                 return false;
2804
2805         __set_current_state(TASK_RUNNING);
2806         clear_notify_signal();
             /* the flag alone may have brought us here, so check again */
2807         if (task_work_pending(current))
2808                 task_work_run();
2809         return true;
2810 }
2811
     /*
      * Poll the requests on ctx->iopoll_list and reap those that have completed.
      *
      * The first loop calls ->iopoll() on requests until one is found completed
      * or the completion batch is non-empty. The second loop walks from the
      * start of the completed run, posts a CQE for each completed request (unless
      * REQ_F_CQE_SKIP is set) and stops at the first one not yet completed. The
      * reaped run is then cut out of the list and handed to io_free_batch_list().
      *
      * @force_nonspin adds BLK_POLL_ONESHOT to the poll flags.
      *
      * Returns the number of requests reaped, 0 if none, or a negative value
      * returned by ->iopoll().
      */
2812 static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
2813 {
2814         struct io_wq_work_node *pos, *start, *prev;
2815         unsigned int poll_flags = BLK_POLL_NOSLEEP;
2816         DEFINE_IO_COMP_BATCH(iob);
2817         int nr_events = 0;
2818
2819         /*
2820          * Only spin for completions if we don't have multiple devices hanging
2821          * off our complete list.
2822          */
2823         if (ctx->poll_multi_queue || force_nonspin)
2824                 poll_flags |= BLK_POLL_ONESHOT;
2825
2826         wq_list_for_each(pos, start, &ctx->iopoll_list) {
2827                 struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
2828                 struct kiocb *kiocb = &req->rw.kiocb;
2829                 int ret;
2830
2831                 /*
2832                  * Move completed and retryable entries to our local lists.
2833                  * If we find a request that requires polling, break out
2834                  * and complete those lists first, if we have entries there.
2835                  */
                     /*
                      * NOTE(review): as written, the scan stops at the first
                      * request that is already marked completed.
                      */
2836                 if (READ_ONCE(req->iopoll_completed))
2837                         break;
2838
2839                 ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
2840                 if (unlikely(ret < 0))
2841                         return ret;
2842                 else if (ret)
2843                         poll_flags |= BLK_POLL_ONESHOT;
2844
2845                 /* iopoll may have completed current req */
2846                 if (!rq_list_empty(iob.req_list) ||
2847                     READ_ONCE(req->iopoll_completed))
2848                         break;
2849         }
2850
2851         if (!rq_list_empty(iob.req_list))
2852                 iob.complete(&iob);
2853         else if (!pos)
2854                 return 0;
2855
             /* resume right after @start, the node preceding the completed run */
2856         prev = start;
2857         wq_list_for_each_resume(pos, prev) {
2858                 struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
2859
2860                 /* order with io_complete_rw_iopoll(), e.g. ->cqe.res updates */
2861                 if (!smp_load_acquire(&req->iopoll_completed))
2862                         break;
2863                 nr_events++;
2864                 if (unlikely(req->flags & REQ_F_CQE_SKIP))
2865                         continue;
2866                 __io_fill_cqe_req(req, req->cqe.res, io_put_kbuf(req, 0));
2867         }
2868
2869         if (unlikely(!nr_events))
2870                 return 0;
2871
2872         io_commit_cqring(ctx);
2873         io_cqring_ev_posted_iopoll(ctx);
             /* first reaped node; @start is NULL when the run begins at the head */
2874         pos = start ? start->next : ctx->iopoll_list.first;
2875         wq_list_cut(&ctx->iopoll_list, prev, start);
2876         io_free_batch_list(ctx, pos);
2877         return nr_events;
2878 }
2879
2880 /*
2881  * We can't just wait for polled events to come to us, we have to actively
2882  * find and complete them.
      *
      * Does nothing unless the ring was set up with IORING_SETUP_IOPOLL. Takes
      * ->uring_lock itself and keeps calling io_do_iopoll() until the iopoll list
      * is empty or a call returns 0.
2883  */
2884 static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
2885 {
2886         if (!(ctx->flags & IORING_SETUP_IOPOLL))
2887                 return;
2888
2889         mutex_lock(&ctx->uring_lock);
2890         while (!wq_list_empty(&ctx->iopoll_list)) {
2891                 /* let it sleep and repeat later if can't complete a request */
2892                 if (io_do_iopoll(ctx, true) == 0)
2893                         break;
2894                 /*
2895                  * Ensure we allow local-to-the-cpu processing to take place,
2896                  * in this case we need to ensure that we reap all events.
2897                  * Also let task_work, etc. to progress by releasing the mutex
2898                  */
2899                 if (need_resched()) {
2900                         mutex_unlock(&ctx->uring_lock);
2901                         cond_resched();
2902                         mutex_lock(&ctx->uring_lock);
2903                 }
2904         }
2905         mutex_unlock(&ctx->uring_lock);
2906 }
2907
     /*
      * Poll for completions until at least @min requests have been reaped,
      * a reschedule is needed, or there is nothing left to poll.
      *
      * ->uring_lock is dropped and retaken around io_run_task_work() below, so
      * it is expected to be held on entry.
      *
      * Returns 0, or the negative value returned by io_do_iopoll(). The number
      * of reaped requests is not reported.
      */
2908 static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
2909 {
2910         unsigned int nr_events = 0;
2911         int ret = 0;
2912
2913         /*
2914          * Don't enter poll loop if we already have events pending.
2915          * If we do, we can potentially be spinning for commands that
2916          * already triggered a CQE (eg in error).
2917          */
2918         if (test_bit(0, &ctx->check_cq_overflow))
2919                 __io_cqring_overflow_flush(ctx, false);
2920         if (io_cqring_events(ctx))
2921                 return 0;
2922         do {
2923                 /*
2924                  * If a submit got punted to a workqueue, we can have the
2925                  * application entering polling for a command before it gets
2926                  * issued. That app will hold the uring_lock for the duration
2927                  * of the poll right here, so we need to take a breather every
2928                  * now and then to ensure that the issue has a chance to add
2929                  * the poll to the issued list. Otherwise we can spin here
2930                  * forever, while the workqueue is stuck trying to acquire the
2931                  * very same mutex.
2932                  */
2933                 if (wq_list_empty(&ctx->iopoll_list)) {
2934                         u32 tail = ctx->cached_cq_tail;
2935
2936                         mutex_unlock(&ctx->uring_lock);
2937                         io_run_task_work();
2938                         mutex_lock(&ctx->uring_lock);
2939
2940                         /* some requests don't go through iopoll_list */
2941                         if (tail != ctx->cached_cq_tail ||
2942                             wq_list_empty(&ctx->iopoll_list))
2943                                 break;
2944                 }
                     /* !min as force_nonspin: don't spin when no events are required */
2945                 ret = io_do_iopoll(ctx, !min);
2946                 if (ret < 0)
2947                         break;
2948                 nr_events += ret;
2949                 ret = 0;
2950         } while (nr_events < min && !need_resched());
2951
2952         return ret;
2953 }
2954
     /*
      * End the superblock write protection for a write on @req. Only requests
      * flagged REQ_F_ISREG are affected; for anything else this does nothing.
      */
2955 static void kiocb_end_write(struct io_kiocb *req)
2956 {
2957         struct super_block *sb;
2958
2959         if (!(req->flags & REQ_F_ISREG))
2960                 return;
2961
             /*
              * Tell lockdep we inherited freeze protection from submission
              * thread.
              */
2962         sb = file_inode(req->file)->i_sb;
2963         __sb_writers_acquired(sb, SB_FREEZE_WRITE);
2964         sb_end_write(sb);
2965 }
2968
2969 #ifdef CONFIG_BLOCK
     /*
      * Prepare @req for being issued again. Without async data, the outcome of
      * io_req_prep_async() decides (true when it returns 0). With async data,
      * the saved iterator state is restored and true is returned.
      */
2970 static bool io_resubmit_prep(struct io_kiocb *req)
2971 {
2972         struct io_async_rw *rw = req->async_data;
2973
2974         if (!req_has_async_data(req))
2975                 return !io_req_prep_async(req);
2976         iov_iter_restore(&rw->s.iter, &rw->s.iter_state);
2977         return true;
2978 }
2979
     /*
      * Decide whether a read/write on @req may be re-issued. The completion
      * handlers in this file consult it for -EAGAIN (and -EOPNOTSUPP) results.
      * Returns true only if every check below passes.
      */
2980 static bool io_rw_should_reissue(struct io_kiocb *req)
2981 {
2982         struct io_ring_ctx *ctx = req->ctx;
2983         umode_t mode = file_inode(req->file)->i_mode;
2984
             /* only block devices and regular files qualify */
2985         if (!(S_ISBLK(mode) || S_ISREG(mode)))
2986                 return false;
2987         if (req->flags & REQ_F_NOWAIT)
2988                 return false;
             /* from an io-wq worker, only IOPOLL rings may reissue */
2989         if (io_wq_current_is_worker() && !(ctx->flags & IORING_SETUP_IOPOLL))
2990                 return false;
             /*
              * If ref is dying, we might be running poll reap from the exit work.
              * Don't attempt to reissue from that path, just let it fail with
              * -EAGAIN.
              */
2991         if (percpu_ref_is_dying(&ctx->refs))
2992                 return false;
             /*
              * Play it safe and assume not safe to re-import and reissue if we're
              * not in the original thread group (or in task context).
              */
2993         return same_thread_group(req->task, current) && in_task();
2994 }
3005 #else
     /* !CONFIG_BLOCK stub: resubmission can never be prepared. */
3006 static bool io_resubmit_prep(struct io_kiocb *req)
3007 {
3008         return false;
3009 }
     /* !CONFIG_BLOCK stub: requests are never reissued. */
3010 static bool io_rw_should_reissue(struct io_kiocb *req)
3011 {
3012         return false;
3013 }
3014 #endif
3015
     /*
      * Common completion handling for a read/write on @req that finished with
      * result @res.
      *
      * Writes end their write protection via kiocb_end_write() and raise
      * fsnotify_modify(); reads raise fsnotify_access(). If @res differs from
      * the value already stored in req->cqe.res, either the request is flagged
      * REQ_F_REISSUE (for -EAGAIN/-EOPNOTSUPP when io_rw_should_reissue() agrees)
      * or it is marked failed and @res is stored.
      *
      * Returns true if the request was flagged for reissue, in which case the
      * caller must not complete it; false otherwise.
      */
3016 static bool __io_complete_rw_common(struct io_kiocb *req, long res)
3017 {
3018         if (req->rw.kiocb.ki_flags & IOCB_WRITE) {
3019                 kiocb_end_write(req);
3020                 fsnotify_modify(req->file);
3021         } else {
3022                 fsnotify_access(req->file);
3023         }
3024         if (unlikely(res != req->cqe.res)) {
3025                 if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
3026                     io_rw_should_reissue(req)) {
3027                         req->flags |= REQ_F_REISSUE;
3028                         return true;
3029                 }
3030                 req_set_fail(req);
3031                 req->cqe.res = res;
3032         }
3033         return false;
3034 }
3035
     /*
      * task_work callback: complete @req with the result stored in
      * req->cqe.res. Without the lock (*@locked false) the completion is posted
      * right away; with it, the completion is recorded in the request and the
      * request is added to the completion list.
      */
3036 static inline void io_req_task_complete(struct io_kiocb *req, bool *locked)
3037 {
3038         const int result = req->cqe.res;
3039
3040         if (!*locked) {
3041                 io_req_complete_post(req, result,
3042                                      io_put_kbuf(req, IO_URING_F_UNLOCKED));
3043                 return;
3044         }
3045         io_req_complete_state(req, result, io_put_kbuf(req, 0));
3046         io_req_add_compl_list(req);
3047 }
3048
     /*
      * Complete a read/write on @req inline. Nothing is completed when
      * __io_complete_rw_common() flagged the request for reissue; otherwise the
      * request is completed with the value in req->cqe.res.
      */
3049 static void __io_complete_rw(struct io_kiocb *req, long res,
3050                              unsigned int issue_flags)
3051 {
3052         if (__io_complete_rw_common(req, res))
3053                 return;
3054         __io_req_complete(req, issue_flags, req->cqe.res,
3055                                 io_put_kbuf(req, issue_flags));
3056 }
3057
     /*
      * Completion callback taking the kiocb embedded in an io_kiocb. Unless the
      * request was flagged for reissue, @res is stored and the completion is
      * deferred to task_work (io_req_task_complete). The priority list is used
      * when the ring has IORING_SETUP_SQPOLL set.
      */
3058 static void io_complete_rw(struct kiocb *kiocb, long res)
3059 {
3060         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
3061
3062         if (__io_complete_rw_common(req, res))
3063                 return;
3064         req->cqe.res = res;
3065         req->io_task_work.func = io_req_task_complete;
3066         io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL));
3067 }
3068
     /*
      * Completion callback for polled IO. Ends write protection for writes,
      * records @res in req->cqe.res if it differs from the stored value, and
      * then publishes ->iopoll_completed with release semantics. A request
      * flagged for reissue (-EAGAIN with io_rw_should_reissue()) is not marked
      * completed.
      */
3069 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
3070 {
3071         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
3072
3073         if (kiocb->ki_flags & IOCB_WRITE)
3074                 kiocb_end_write(req);
3075         if (unlikely(res != req->cqe.res)) {
3076                 if (res == -EAGAIN && io_rw_should_reissue(req)) {
3077                         req->flags |= REQ_F_REISSUE;
3078                         return;
3079                 }
3080                 req->cqe.res = res;
3081         }
3082
3083         /* order with io_do_iopoll() checking ->iopoll_completed */
3084         smp_store_release(&req->iopoll_completed, 1);
3085 }
3086
3087 /*
3088  * After the iocb has been issued, it's safe to be found on the poll list.
3089  * Adding the kiocb to the list AFTER submission ensures that we don't
3090  * find it from a io_do_iopoll() thread before the issuer is done
3091  * accessing the kiocb cookie.
      *
      * ->uring_lock is taken here only when @issue_flags has
      * IO_URING_F_UNLOCKED set; otherwise the list is modified without taking it.
3092  */
3093 static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
3094 {
3095         struct io_ring_ctx *ctx = req->ctx;
3096         const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
3097
3098         /* workqueue context doesn't hold uring_lock, grab it now */
3099         if (unlikely(needs_lock))
3100                 mutex_lock(&ctx->uring_lock);
3101
3102         /*
3103          * Track whether we have multiple files in our lists. This will impact
3104          * how we do polling eventually, not spinning if we're on potentially
3105          * different devices.
3106          */
3107         if (wq_list_empty(&ctx->iopoll_list)) {
3108                 ctx->poll_multi_queue = false;
3109         } else if (!ctx->poll_multi_queue) {
3110                 struct io_kiocb *list_req;
3111
                     /* only the first listed request's file is compared */
3112                 list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
3113                                         comp_list);
3114                 if (list_req->file != req->file)
3115                         ctx->poll_multi_queue = true;
3116         }
3117
3118         /*
3119          * For fast devices, IO may have already completed. If it has, add
3120          * it to the front so we find it first.
3121          */
3122         if (READ_ONCE(req->iopoll_completed))
3123                 wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
3124         else
3125                 wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
3126
3127         if (unlikely(needs_lock)) {
3128                 /*
3129                  * If IORING_SETUP_SQPOLL is enabled, sqes are either handled
3130                  * in sq thread task context or in io worker task context. If
3131                  * current task context is sq thread, we don't need to check
3132                  * whether should wake up sq thread.
3133                  */
3134                 if ((ctx->flags & IORING_SETUP_SQPOLL) &&
3135                     wq_has_sleeper(&ctx->sq_data->wait))
3136                         wake_up(&ctx->sq_data->wait);
3137
3138                 mutex_unlock(&ctx->uring_lock);
3139         }
3140 }
3141
     /*
      * True if @bdev is NULL, or if blk_queue_nowait() holds for the request
      * queue of @bdev.
      */
3142 static bool io_bdev_nowait(struct block_device *bdev)
3143 {
3144         return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
3145 }
3146
3147 /*
3148  * Tell whether @file, whose inode mode is @mode, supports nowait operation.
3149  *
3150  * If we tracked the file through the SCM inflight mechanism, we could support
3151  * any file. For now, just ensure that anything potentially problematic is done
3152  * inline.
3153  */
3154 static bool __io_file_supports_nowait(struct file *file, umode_t mode)
3155 {
3156         if (S_ISBLK(mode))
3157                 return IS_ENABLED(CONFIG_BLOCK) &&
3158                        io_bdev_nowait(I_BDEV(file->f_mapping->host));
3159
3160         if (S_ISSOCK(mode))
3161                 return true;
3162
             /* regular files: backing device must allow it, and no io_uring fds */
3163         if (S_ISREG(mode))
3164                 return IS_ENABLED(CONFIG_BLOCK) &&
3165                        io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
3166                        file->f_op != &io_uring_fops;
3167
             /* any ->read/write should understand O_NONBLOCK */
3168         if (file->f_flags & O_NONBLOCK)
3169                 return true;
3170
3171         return file->f_mode & FMODE_NOWAIT;
3172 }
3175
3176 /*
3177  * Build the FFS_* flag word for @file: FFS_ISREG when its inode is a regular
3178  * file, FFS_NOWAIT when __io_file_supports_nowait() accepts it.
3179  */
3180 static unsigned int io_file_get_flags(struct file *file)
3181 {
3182         const umode_t mode = file_inode(file)->i_mode;
3183         unsigned int flags = S_ISREG(mode) ? FFS_ISREG : 0;
3184
3185         if (__io_file_supports_nowait(file, mode))
3186                 flags |= FFS_NOWAIT;
3187         return flags;
3188 }
3192
     /* True if @req has REQ_F_SUPPORT_NOWAIT set in its flags. */
3193 static inline bool io_file_supports_nowait(struct io_kiocb *req)
3194 {
3195         return req->flags & REQ_F_SUPPORT_NOWAIT;
3196 }
3197
     /*
      * Copy the read/write parameters of @sqe into @req. Every SQE field is read
      * exactly once with READ_ONCE().
      *
      * A non-zero sqe->ioprio is validated with ioprio_check_cap(); otherwise the
      * current task's IO priority is used.
      *
      * Returns 0 on success or the error returned by ioprio_check_cap().
      */
3198 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3199 {
3200         struct kiocb *kiocb = &req->rw.kiocb;
3201         unsigned ioprio;
3202         int ret;
3203
3204         kiocb->ki_pos = READ_ONCE(sqe->off);
3205
3206         ioprio = READ_ONCE(sqe->ioprio);
3207         if (ioprio) {
3208                 ret = ioprio_check_cap(ioprio);
3209                 if (ret)
3210                         return ret;
3211
3212                 kiocb->ki_ioprio = ioprio;
3213         } else {
3214                 kiocb->ki_ioprio = get_current_ioprio();
3215         }
3216
3217         req->imu = NULL;
3218         req->rw.addr = READ_ONCE(sqe->addr);
3219         req->rw.len = READ_ONCE(sqe->len);
3220         req->rw.flags = READ_ONCE(sqe->rw_flags);
3221         req->buf_index = READ_ONCE(sqe->buf_index);
3222         return 0;
3223 }
3224
     /*
      * Finish a read/write whose issue returned @ret. Nothing is done here for
      * -EIOCBQUEUED. Any other value is passed to ->ki_complete(), with the
      * restart codes converted to -EINTR first.
      */
3225 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
3226 {
3227         if (ret == -EIOCBQUEUED)
3228                 return;
3229
             /*
              * We can't just restart the syscall, since previously
              * submitted sqes may already be in progress. Just fail this
              * IO with EINTR.
              */
3230         if (ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
3231             ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK)
3232                 ret = -EINTR;
3233
3234         kiocb->ki_complete(kiocb, ret);
3235 }
3245
     /*
      * Resolve the file position to use for @req and return a pointer to it.
      *
      * An explicit position (ki_pos != -1) is used as is. Otherwise, for files
      * without FMODE_STREAM, the file's current f_pos is copied in and
      * REQ_F_CUR_POS is set; for FMODE_STREAM files ki_pos is set to 0 and NULL
      * is returned.
      */
3246 static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
3247 {
3248         struct kiocb *kiocb = &req->rw.kiocb;
3249         loff_t *ppos = &kiocb->ki_pos;
3250
3251         if (kiocb->ki_pos == -1) {
3252                 if (req->file->f_mode & FMODE_STREAM) {
3253                         kiocb->ki_pos = 0;
3254                         ppos = NULL;
3255                 } else {
3256                         req->flags |= REQ_F_CUR_POS;
3257                         kiocb->ki_pos = req->file->f_pos;
3258                 }
3259         }
3260         return ppos;
3261 }
3262
     /*
      * Finish a read/write on @req whose issue returned @ret.
      *
      * Bytes already transferred by an earlier attempt (io->bytes_done) are
      * folded into the result: added to a non-negative @ret, or used in place of
      * an error. With REQ_F_CUR_POS set, the file's f_pos is updated from ki_pos.
      * Non-negative results on kiocbs using io_complete_rw are completed inline
      * through __io_complete_rw(); everything else goes through io_rw_done().
      *
      * If completion flagged the request REQ_F_REISSUE, the flag is cleared and
      * the request is either queued for reissue or, when io_resubmit_prep()
      * fails, queued to be failed with @ret.
      */
3263 static void kiocb_done(struct io_kiocb *req, ssize_t ret,
3264                        unsigned int issue_flags)
3265 {
3266         struct io_async_rw *io = req->async_data;
3267
3268         /* add previously done IO, if any */
3269         if (req_has_async_data(req) && io->bytes_done > 0) {
3270                 if (ret < 0)
3271                         ret = io->bytes_done;
3272                 else
3273                         ret += io->bytes_done;
3274         }
3275
3276         if (req->flags & REQ_F_CUR_POS)
3277                 req->file->f_pos = req->rw.kiocb.ki_pos;
3278         if (ret >= 0 && (req->rw.kiocb.ki_complete == io_complete_rw))
3279                 __io_complete_rw(req, ret, issue_flags);
3280         else
3281                 io_rw_done(&req->rw.kiocb, ret);
3282
3283         if (req->flags & REQ_F_REISSUE) {
3284                 req->flags &= ~REQ_F_REISSUE;
3285                 if (io_resubmit_prep(req))
3286                         io_req_task_queue_reissue(req);
3287                 else
3288                         io_req_task_queue_fail(req, ret);
3289         }
3290 }
3291
3292 static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
3293                              struct io_mapped_ubuf *imu)
3294 {
3295         size_t len = req->rw.len;
3296         u64 buf_end, buf_addr = req->rw.addr;
3297         size_t offset;
3298
3299         if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
3300                 return -EFAULT;
3301         /* not inside the mapped region */
3302         if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
3303                 return -EFAULT;
3304
3305         /*
3306          * May not be a start of buffer, set size appropriately
3307          * and advance us to the beginning.
3308          */
3309         offset = buf_addr - imu->ubuf;
3310         iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
3311
3312         if (offset) {
3313                 /*
3314                  * Don't use iov_iter_advance() here, as it's really slow for
3315                  * using the latter parts of a big fixed buffer - it iterates
3316                  * over each segment manually. We can cheat a bit here, because
3317                  * we know that:
3318                  *
3319                  * 1) it's a BVEC iter, we set it up
3320                  * 2) all bvecs are PAGE_SIZE in size, except potentially the
3321                  *    first and last bvec
3322                  *
3323                  * So just find our index, and adjust the iterator afterwards.
3324                  * If the offset is within the first bvec (or the whole first
3325                  * bvec, just use iov_iter_advance(). This makes it easier
3326                  * since we can just skip the first segment, which may not
3327                  * be PAGE_SIZE aligned.
3328                  */
3329                 const struct bio_vec *bvec = imu->bvec;
3330
3331                 if (offset <= bvec->bv_len) {
3332                         iov_iter_advance(iter, offset);
3333                 } else {
3334                         unsigned long seg_skip;
3335
3336                         /* skip first vec */
3337                         offset -= bvec->bv_len;
3338                         seg_skip = 1 + (offset >> PAGE_SHIFT);
3339
3340                         iter->bvec = bvec + seg_skip;
3341                         iter->nr_segs -= seg_skip;
3342                         iter->count -= bvec->bv_len + offset;
3343                         iter->iov_offset = offset & ~PAGE_MASK;
3344                 }
3345         }
3346
3347         return 0;
3348 }
3349
3350 static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
3351                            unsigned int issue_flags)
3352 {
3353         struct io_mapped_ubuf *imu = req->imu;
3354         u16 index, buf_index = req->buf_index;
3355
3356         if (likely(!imu)) {
3357                 struct io_ring_ctx *ctx = req->ctx;
3358
3359                 if (unlikely(buf_index >= ctx->nr_user_bufs))
3360                         return -EFAULT;
3361                 io_req_set_rsrc_node(req, ctx, issue_flags);
3362                 index = array_index_nospec(buf_index, ctx->nr_user_bufs);
3363                 imu = READ_ONCE(ctx->user_bufs[index]);
3364                 req->imu = imu;
3365         }
3366         return __io_import_fixed(req, rw, iter, imu);
3367 }
3368
3369 static void io_buffer_add_list(struct io_ring_ctx *ctx,
3370                                struct io_buffer_list *bl, unsigned int bgid)
3371 {
3372         struct list_head *list;
3373
3374         list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
3375         INIT_LIST_HEAD(&bl->buf_list);
3376         bl->bgid = bgid;
3377         list_add(&bl->list, list);
3378 }
3379
3380 static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
3381                                           int bgid, unsigned int issue_flags)
3382 {
3383         struct io_buffer *kbuf = req->kbuf;
3384         struct io_ring_ctx *ctx = req->ctx;
3385         struct io_buffer_list *bl;
3386
3387         if (req->flags & REQ_F_BUFFER_SELECTED)
3388                 return kbuf;
3389
3390         io_ring_submit_lock(req->ctx, issue_flags);
3391
3392         bl = io_buffer_get_list(ctx, bgid);
3393         if (bl && !list_empty(&bl->buf_list)) {
3394                 kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
3395                 list_del(&kbuf->list);
3396                 if (*len > kbuf->len)
3397                         *len = kbuf->len;
3398                 req->flags |= REQ_F_BUFFER_SELECTED;
3399                 req->kbuf = kbuf;
3400         } else {
3401                 kbuf = ERR_PTR(-ENOBUFS);
3402         }
3403
3404         io_ring_submit_unlock(req->ctx, issue_flags);
3405         return kbuf;
3406 }
3407
3408 static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
3409                                         unsigned int issue_flags)
3410 {
3411         struct io_buffer *kbuf;
3412         u16 bgid;
3413
3414         bgid = req->buf_index;
3415         kbuf = io_buffer_select(req, len, bgid, issue_flags);
3416         if (IS_ERR(kbuf))
3417                 return kbuf;
3418         return u64_to_user_ptr(kbuf->addr);
3419 }
3420
3421 #ifdef CONFIG_COMPAT
3422 static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
3423                                 unsigned int issue_flags)
3424 {
3425         struct compat_iovec __user *uiov;
3426         compat_ssize_t clen;
3427         void __user *buf;
3428         ssize_t len;
3429
3430         uiov = u64_to_user_ptr(req->rw.addr);
3431         if (!access_ok(uiov, sizeof(*uiov)))
3432                 return -EFAULT;
3433         if (__get_user(clen, &uiov->iov_len))
3434                 return -EFAULT;
3435         if (clen < 0)
3436                 return -EINVAL;
3437
3438         len = clen;
3439         buf = io_rw_buffer_select(req, &len, issue_flags);
3440         if (IS_ERR(buf))
3441                 return PTR_ERR(buf);
3442         iov[0].iov_base = buf;
3443         iov[0].iov_len = (compat_size_t) len;
3444         return 0;
3445 }
3446 #endif
3447
3448 static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
3449                                       unsigned int issue_flags)
3450 {
3451         struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
3452         void __user *buf;
3453         ssize_t len;
3454
3455         if (copy_from_user(iov, uiov, sizeof(*uiov)))
3456                 return -EFAULT;
3457
3458         len = iov[0].iov_len;
3459         if (len < 0)
3460                 return -EINVAL;
3461         buf = io_rw_buffer_select(req, &len, issue_flags);
3462         if (IS_ERR(buf))
3463                 return PTR_ERR(buf);
3464         iov[0].iov_base = buf;
3465         iov[0].iov_len = len;
3466         return 0;
3467 }
3468
3469 static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
3470                                     unsigned int issue_flags)
3471 {
3472         if (req->flags & REQ_F_BUFFER_SELECTED) {
3473                 struct io_buffer *kbuf = req->kbuf;
3474
3475                 iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
3476                 iov[0].iov_len = kbuf->len;
3477                 return 0;
3478         }
3479         if (req->rw.len != 1)
3480                 return -EINVAL;
3481
3482 #ifdef CONFIG_COMPAT
3483         if (req->ctx->compat)
3484                 return io_compat_import(req, iov, issue_flags);
3485 #endif
3486
3487         return __io_iov_buffer_select(req, iov, issue_flags);
3488 }
3489
3490 static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
3491                                        struct io_rw_state *s,
3492                                        unsigned int issue_flags)
3493 {
3494         struct iov_iter *iter = &s->iter;
3495         u8 opcode = req->opcode;
3496         struct iovec *iovec;
3497         void __user *buf;
3498         size_t sqe_len;
3499         ssize_t ret;
3500
3501         if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
3502                 ret = io_import_fixed(req, rw, iter, issue_flags);
3503                 if (ret)
3504                         return ERR_PTR(ret);
3505                 return NULL;
3506         }
3507
3508         /* buffer index only valid with fixed read/write, or buffer select  */
3509         if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)))
3510                 return ERR_PTR(-EINVAL);
3511
3512         buf = u64_to_user_ptr(req->rw.addr);
3513         sqe_len = req->rw.len;
3514
3515         if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
3516                 if (req->flags & REQ_F_BUFFER_SELECT) {
3517                         buf = io_rw_buffer_select(req, &sqe_len, issue_flags);
3518                         if (IS_ERR(buf))
3519                                 return ERR_CAST(buf);
3520                         req->rw.len = sqe_len;
3521                 }
3522
3523                 ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter);
3524                 if (ret)
3525                         return ERR_PTR(ret);
3526                 return NULL;
3527         }
3528
3529         iovec = s->fast_iov;
3530         if (req->flags & REQ_F_BUFFER_SELECT) {
3531                 ret = io_iov_buffer_select(req, iovec, issue_flags);
3532                 if (ret)
3533                         return ERR_PTR(ret);
3534                 iov_iter_init(iter, rw, iovec, 1, iovec->iov_len);
3535                 return NULL;
3536         }
3537
3538         ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
3539                               req->ctx->compat);
3540         if (unlikely(ret < 0))
3541                 return ERR_PTR(ret);
3542         return iovec;
3543 }
3544
3545 static inline int io_import_iovec(int rw, struct io_kiocb *req,
3546                                   struct iovec **iovec, struct io_rw_state *s,
3547                                   unsigned int issue_flags)
3548 {
3549         *iovec = __io_import_iovec(rw, req, s, issue_flags);
3550         if (unlikely(IS_ERR(*iovec)))
3551                 return PTR_ERR(*iovec);
3552
3553         iov_iter_save_state(&s->iter, &s->iter_state);
3554         return 0;
3555 }
3556
3557 static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
3558 {
3559         return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
3560 }
3561
3562 /*
3563  * For files that don't have ->read_iter() and ->write_iter(), handle them
3564  * by looping over ->read() or ->write() manually.
3565  */
3566 static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
3567 {
3568         struct kiocb *kiocb = &req->rw.kiocb;
3569         struct file *file = req->file;
3570         ssize_t ret = 0;
3571         loff_t *ppos;
3572
3573         /*
3574          * Don't support polled IO through this interface, and we can't
3575          * support non-blocking either. For the latter, this just causes
3576          * the kiocb to be handled from an async context.
3577          */
3578         if (kiocb->ki_flags & IOCB_HIPRI)
3579                 return -EOPNOTSUPP;
3580         if ((kiocb->ki_flags & IOCB_NOWAIT) &&
3581             !(kiocb->ki_filp->f_flags & O_NONBLOCK))
3582                 return -EAGAIN;
3583
3584         ppos = io_kiocb_ppos(kiocb);
3585
3586         while (iov_iter_count(iter)) {
3587                 struct iovec iovec;
3588                 ssize_t nr;
3589
3590                 if (!iov_iter_is_bvec(iter)) {
3591                         iovec = iov_iter_iovec(iter);
3592                 } else {
3593                         iovec.iov_base = u64_to_user_ptr(req->rw.addr);
3594                         iovec.iov_len = req->rw.len;
3595                 }
3596
3597                 if (rw == READ) {
3598                         nr = file->f_op->read(file, iovec.iov_base,
3599                                               iovec.iov_len, ppos);
3600                 } else {
3601                         nr = file->f_op->write(file, iovec.iov_base,
3602                                                iovec.iov_len, ppos);
3603                 }
3604
3605                 if (nr < 0) {
3606                         if (!ret)
3607                                 ret = nr;
3608                         break;
3609                 }
3610                 ret += nr;
3611                 if (!iov_iter_is_bvec(iter)) {
3612                         iov_iter_advance(iter, nr);
3613                 } else {
3614                         req->rw.addr += nr;
3615                         req->rw.len -= nr;
3616                         if (!req->rw.len)
3617                                 break;
3618                 }
3619                 if (nr != iovec.iov_len)
3620                         break;
3621         }
3622
3623         return ret;
3624 }
3625
3626 static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
3627                           const struct iovec *fast_iov, struct iov_iter *iter)
3628 {
3629         struct io_async_rw *rw = req->async_data;
3630
3631         memcpy(&rw->s.iter, iter, sizeof(*iter));
3632         rw->free_iovec = iovec;
3633         rw->bytes_done = 0;
3634         /* can only be fixed buffers, no need to do anything */
3635         if (iov_iter_is_bvec(iter))
3636                 return;
3637         if (!iovec) {
3638                 unsigned iov_off = 0;
3639
3640                 rw->s.iter.iov = rw->s.fast_iov;
3641                 if (iter->iov != fast_iov) {
3642                         iov_off = iter->iov - fast_iov;
3643                         rw->s.iter.iov += iov_off;
3644                 }
3645                 if (rw->s.fast_iov != fast_iov)
3646                         memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off,
3647                                sizeof(struct iovec) * iter->nr_segs);
3648         } else {
3649                 req->flags |= REQ_F_NEED_CLEANUP;
3650         }
3651 }
3652
3653 static inline bool io_alloc_async_data(struct io_kiocb *req)
3654 {
3655         WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
3656         req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
3657         if (req->async_data) {
3658                 req->flags |= REQ_F_ASYNC_DATA;
3659                 return false;
3660         }
3661         return true;
3662 }
3663
3664 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
3665                              struct io_rw_state *s, bool force)
3666 {
3667         if (!force && !io_op_defs[req->opcode].needs_async_setup)
3668                 return 0;
3669         if (!req_has_async_data(req)) {
3670                 struct io_async_rw *iorw;
3671
3672                 if (io_alloc_async_data(req)) {
3673                         kfree(iovec);
3674                         return -ENOMEM;
3675                 }
3676
3677                 io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
3678                 iorw = req->async_data;
3679                 /* we've copied and mapped the iter, ensure state is saved */
3680                 iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
3681         }
3682         return 0;
3683 }
3684
3685 static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
3686 {
3687         struct io_async_rw *iorw = req->async_data;
3688         struct iovec *iov;
3689         int ret;
3690
3691         /* submission path, ->uring_lock should already be taken */
3692         ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
3693         if (unlikely(ret < 0))
3694                 return ret;
3695
3696         iorw->bytes_done = 0;
3697         iorw->free_iovec = iov;
3698         if (iov)
3699                 req->flags |= REQ_F_NEED_CLEANUP;
3700         return 0;
3701 }
3702
3703 /*
3704  * This is our waitqueue callback handler, registered through __folio_lock_async()
3705  * when we initially tried to do the IO with the iocb armed our waitqueue.
3706  * This gets called when the page is unlocked, and we generally expect that to
3707  * happen when the page IO is completed and the page is now uptodate. This will
3708  * queue a task_work based retry of the operation, attempting to copy the data
3709  * again. If the latter fails because the page was NOT uptodate, then we will
3710  * do a thread based blocking retry of the operation. That's the unexpected
3711  * slow path.
3712  */
3713 static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
3714                              int sync, void *arg)
3715 {
3716         struct wait_page_queue *wpq;
3717         struct io_kiocb *req = wait->private;
3718         struct wait_page_key *key = arg;
3719
3720         wpq = container_of(wait, struct wait_page_queue, wait);
3721
3722         if (!wake_page_match(wpq, key))
3723                 return 0;
3724
3725         req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
3726         list_del_init(&wait->entry);
3727         io_req_task_queue(req);
3728         return 1;
3729 }
3730
3731 /*
3732  * This controls whether a given IO request should be armed for async page
3733  * based retry. If we return false here, the request is handed to the async
3734  * worker threads for retry. If we're doing buffered reads on a regular file,
3735  * we prepare a private wait_page_queue entry and retry the operation. This
3736  * will either succeed because the page is now uptodate and unlocked, or it
3737  * will register a callback when the page is unlocked at IO completion. Through
3738  * that callback, io_uring uses task_work to setup a retry of the operation.
3739  * That retry will attempt the buffered read again. The retry will generally
3740  * succeed, or in rare cases where it fails, we then fall back to using the
3741  * async worker threads for a blocking retry.
3742  */
3743 static bool io_rw_should_retry(struct io_kiocb *req)
3744 {
3745         struct io_async_rw *rw = req->async_data;
3746         struct wait_page_queue *wait = &rw->wpq;
3747         struct kiocb *kiocb = &req->rw.kiocb;
3748
3749         /* never retry for NOWAIT, we just complete with -EAGAIN */
3750         if (req->flags & REQ_F_NOWAIT)
3751                 return false;
3752
3753         /* Only for buffered IO */
3754         if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
3755                 return false;
3756
3757         /*
3758          * just use poll if we can, and don't attempt if the fs doesn't
3759          * support callback based unlocks
3760          */
3761         if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
3762                 return false;
3763
3764         wait->wait.func = io_async_buf_func;
3765         wait->wait.private = req;
3766         wait->wait.flags = 0;
3767         INIT_LIST_HEAD(&wait->wait.entry);
3768         kiocb->ki_flags |= IOCB_WAITQ;
3769         kiocb->ki_flags &= ~IOCB_NOWAIT;
3770         kiocb->ki_waitq = wait;
3771         return true;
3772 }
3773
3774 static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
3775 {
3776         if (likely(req->file->f_op->read_iter))
3777                 return call_read_iter(req->file, &req->rw.kiocb, iter);
3778         else if (req->file->f_op->read)
3779                 return loop_rw_iter(READ, req, iter);
3780         else
3781                 return -EINVAL;
3782 }
3783
3784 static bool need_read_all(struct io_kiocb *req)
3785 {
3786         return req->flags & REQ_F_ISREG ||
3787                 S_ISBLK(file_inode(req->file)->i_mode);
3788 }
3789
3790 static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
3791 {
3792         struct kiocb *kiocb = &req->rw.kiocb;
3793         struct io_ring_ctx *ctx = req->ctx;
3794         struct file *file = req->file;
3795         int ret;
3796
3797         if (unlikely(!file || !(file->f_mode & mode)))
3798                 return -EBADF;
3799
3800         if (!io_req_ffs_set(req))
3801                 req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
3802
3803         kiocb->ki_flags = iocb_flags(file);
3804         ret = kiocb_set_rw_flags(kiocb, req->rw.flags);
3805         if (unlikely(ret))
3806                 return ret;
3807
3808         /*
3809          * If the file is marked O_NONBLOCK, still allow retry for it if it
3810          * supports async. Otherwise it's impossible to use O_NONBLOCK files
3811          * reliably. If not, or it IOCB_NOWAIT is set, don't retry.
3812          */
3813         if ((kiocb->ki_flags & IOCB_NOWAIT) ||
3814             ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
3815                 req->flags |= REQ_F_NOWAIT;
3816
3817         if (ctx->flags & IORING_SETUP_IOPOLL) {
3818                 if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
3819                         return -EOPNOTSUPP;
3820
3821                 kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
3822                 kiocb->ki_complete = io_complete_rw_iopoll;
3823                 req->iopoll_completed = 0;
3824         } else {
3825                 if (kiocb->ki_flags & IOCB_HIPRI)
3826                         return -EINVAL;
3827                 kiocb->ki_complete = io_complete_rw;
3828         }
3829
3830         return 0;
3831 }
3832
3833 static int io_read(struct io_kiocb *req, unsigned int issue_flags)
3834 {
3835         struct io_rw_state __s, *s = &__s;
3836         struct iovec *iovec;
3837         struct kiocb *kiocb = &req->rw.kiocb;
3838         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3839         struct io_async_rw *rw;
3840         ssize_t ret, ret2;
3841         loff_t *ppos;
3842
3843         if (!req_has_async_data(req)) {
3844                 ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
3845                 if (unlikely(ret < 0))
3846                         return ret;
3847         } else {
3848                 /*
3849                  * Safe and required to re-import if we're using provided
3850                  * buffers, as we dropped the selected one before retry.
3851                  */
3852                 if (req->flags & REQ_F_BUFFER_SELECT) {
3853                         ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
3854                         if (unlikely(ret < 0))
3855                                 return ret;
3856                 }
3857
3858                 rw = req->async_data;
3859                 s = &rw->s;
3860                 /*
3861                  * We come here from an earlier attempt, restore our state to
3862                  * match in case it doesn't. It's cheap enough that we don't
3863                  * need to make this conditional.
3864                  */
3865                 iov_iter_restore(&s->iter, &s->iter_state);
3866                 iovec = NULL;
3867         }
3868         ret = io_rw_init_file(req, FMODE_READ);
3869         if (unlikely(ret)) {
3870                 kfree(iovec);
3871                 return ret;
3872         }
3873         req->cqe.res = iov_iter_count(&s->iter);
3874
3875         if (force_nonblock) {
3876                 /* If the file doesn't support async, just async punt */
3877                 if (unlikely(!io_file_supports_nowait(req))) {
3878                         ret = io_setup_async_rw(req, iovec, s, true);
3879                         return ret ?: -EAGAIN;
3880                 }
3881                 kiocb->ki_flags |= IOCB_NOWAIT;
3882         } else {
3883                 /* Ensure we clear previously set non-block flag */
3884                 kiocb->ki_flags &= ~IOCB_NOWAIT;
3885         }
3886
3887         ppos = io_kiocb_update_pos(req);
3888
3889         ret = rw_verify_area(READ, req->file, ppos, req->cqe.res);
3890         if (unlikely(ret)) {
3891                 kfree(iovec);
3892                 return ret;
3893         }
3894
3895         ret = io_iter_do_read(req, &s->iter);
3896
3897         if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
3898                 req->flags &= ~REQ_F_REISSUE;
3899                 /* if we can poll, just do that */
3900                 if (req->opcode == IORING_OP_READ && file_can_poll(req->file))
3901                         return -EAGAIN;
3902                 /* IOPOLL retry should happen for io-wq threads */
3903                 if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
3904                         goto done;
3905                 /* no retry on NONBLOCK nor RWF_NOWAIT */
3906                 if (req->flags & REQ_F_NOWAIT)
3907                         goto done;
3908                 ret = 0;
3909         } else if (ret == -EIOCBQUEUED) {
3910                 goto out_free;
3911         } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
3912                    (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
3913                 /* read all, failed, already did sync or don't want to retry */
3914                 goto done;
3915         }
3916
3917         /*
3918          * Don't depend on the iter state matching what was consumed, or being
3919          * untouched in case of error. Restore it and we'll advance it
3920          * manually if we need to.
3921          */
3922         iov_iter_restore(&s->iter, &s->iter_state);
3923
3924         ret2 = io_setup_async_rw(req, iovec, s, true);
3925         if (ret2)
3926                 return ret2;
3927
3928         iovec = NULL;
3929         rw = req->async_data;
3930         s = &rw->s;
3931         /*
3932          * Now use our persistent iterator and state, if we aren't already.
3933          * We've restored and mapped the iter to match.
3934          */
3935
3936         do {
3937                 /*
3938                  * We end up here because of a partial read, either from
3939                  * above or inside this loop. Advance the iter by the bytes
3940                  * that were consumed.
3941                  */
3942                 iov_iter_advance(&s->iter, ret);
3943                 if (!iov_iter_count(&s->iter))
3944                         break;
3945                 rw->bytes_done += ret;
3946                 iov_iter_save_state(&s->iter, &s->iter_state);
3947
3948                 /* if we can retry, do so with the callbacks armed */
3949                 if (!io_rw_should_retry(req)) {
3950                         kiocb->ki_flags &= ~IOCB_WAITQ;
3951                         return -EAGAIN;
3952                 }
3953
3954                 /*
3955                  * Now retry read with the IOCB_WAITQ parts set in the iocb. If
3956                  * we get -EIOCBQUEUED, then we'll get a notification when the
3957                  * desired page gets unlocked. We can also get a partial read
3958                  * here, and if we do, then just retry at the new offset.
3959                  */
3960                 ret = io_iter_do_read(req, &s->iter);
3961                 if (ret == -EIOCBQUEUED)
3962                         return 0;
3963                 /* we got some bytes, but not all. retry. */
3964                 kiocb->ki_flags &= ~IOCB_WAITQ;
3965                 iov_iter_restore(&s->iter, &s->iter_state);
3966         } while (ret > 0);
3967 done:
3968         kiocb_done(req, ret, issue_flags);
3969 out_free:
3970         /* it's faster to check here then delegate to kfree */
3971         if (iovec)
3972                 kfree(iovec);
3973         return 0;
3974 }
3975
3976 static int io_write(struct io_kiocb *req, unsigned int issue_flags)
3977 {
3978         struct io_rw_state __s, *s = &__s;
3979         struct iovec *iovec;
3980         struct kiocb *kiocb = &req->rw.kiocb;
3981         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3982         ssize_t ret, ret2;
3983         loff_t *ppos;
3984
3985         if (!req_has_async_data(req)) {
3986                 ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
3987                 if (unlikely(ret < 0))
3988                         return ret;
3989         } else {
3990                 struct io_async_rw *rw = req->async_data;
3991
3992                 s = &rw->s;
3993                 iov_iter_restore(&s->iter, &s->iter_state);
3994                 iovec = NULL;
3995         }
3996         ret = io_rw_init_file(req, FMODE_WRITE);
3997         if (unlikely(ret)) {
3998                 kfree(iovec);
3999                 return ret;
4000         }
4001         req->cqe.res = iov_iter_count(&s->iter);
4002
4003         if (force_nonblock) {
4004                 /* If the file doesn't support async, just async punt */
4005                 if (unlikely(!io_file_supports_nowait(req)))
4006                         goto copy_iov;
4007
4008                 /* file path doesn't support NOWAIT for non-direct_IO */
4009                 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
4010                     (req->flags & REQ_F_ISREG))
4011                         goto copy_iov;
4012
4013                 kiocb->ki_flags |= IOCB_NOWAIT;
4014         } else {
4015                 /* Ensure we clear previously set non-block flag */
4016                 kiocb->ki_flags &= ~IOCB_NOWAIT;
4017         }
4018
4019         ppos = io_kiocb_update_pos(req);
4020
4021         ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res);
4022         if (unlikely(ret))
4023                 goto out_free;
4024
4025         /*
4026          * Open-code file_start_write here to grab freeze protection,
4027          * which will be released by another thread in
4028          * io_complete_rw().  Fool lockdep by telling it the lock got
4029          * released so that it doesn't complain about the held lock when
4030          * we return to userspace.
4031          */
4032         if (req->flags & REQ_F_ISREG) {
4033                 sb_start_write(file_inode(req->file)->i_sb);
4034                 __sb_writers_release(file_inode(req->file)->i_sb,
4035                                         SB_FREEZE_WRITE);
4036         }
4037         kiocb->ki_flags |= IOCB_WRITE;
4038
4039         if (likely(req->file->f_op->write_iter))
4040                 ret2 = call_write_iter(req->file, kiocb, &s->iter);
4041         else if (req->file->f_op->write)
4042                 ret2 = loop_rw_iter(WRITE, req, &s->iter);
4043         else
4044                 ret2 = -EINVAL;
4045
4046         if (req->flags & REQ_F_REISSUE) {
4047                 req->flags &= ~REQ_F_REISSUE;
4048                 ret2 = -EAGAIN;
4049         }
4050
4051         /*
4052          * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
4053          * retry them without IOCB_NOWAIT.
4054          */
4055         if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
4056                 ret2 = -EAGAIN;
4057         /* no retry on NONBLOCK nor RWF_NOWAIT */
4058         if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
4059                 goto done;
4060         if (!force_nonblock || ret2 != -EAGAIN) {
4061                 /* IOPOLL retry should happen for io-wq threads */
4062                 if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
4063                         goto copy_iov;
4064 done:
4065                 kiocb_done(req, ret2, issue_flags);
4066         } else {
4067 copy_iov:
4068                 iov_iter_restore(&s->iter, &s->iter_state);
4069                 ret = io_setup_async_rw(req, iovec, s, false);
4070                 return ret ?: -EAGAIN;
4071         }
4072 out_free:
4073         /* it's reportedly faster than delegating the null check to kfree() */
4074         if (iovec)
4075                 kfree(iovec);
4076         return ret;
4077 }
4078
4079 static int io_renameat_prep(struct io_kiocb *req,
4080                             const struct io_uring_sqe *sqe)
4081 {
4082         struct io_rename *ren = &req->rename;
4083         const char __user *oldf, *newf;
4084
4085         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4086                 return -EINVAL;
4087         if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
4088                 return -EINVAL;
4089         if (unlikely(req->flags & REQ_F_FIXED_FILE))
4090                 return -EBADF;
4091
4092         ren->old_dfd = READ_ONCE(sqe->fd);
4093         oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
4094         newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4095         ren->new_dfd = READ_ONCE(sqe->len);
4096         ren->flags = READ_ONCE(sqe->rename_flags);
4097
4098         ren->oldpath = getname(oldf);
4099         if (IS_ERR(ren->oldpath))
4100                 return PTR_ERR(ren->oldpath);
4101
4102         ren->newpath = getname(newf);
4103         if (IS_ERR(ren->newpath)) {
4104                 putname(ren->oldpath);
4105                 return PTR_ERR(ren->newpath);
4106         }
4107
4108         req->flags |= REQ_F_NEED_CLEANUP;
4109         return 0;
4110 }
4111
4112 static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
4113 {
4114         struct io_rename *ren = &req->rename;
4115         int ret;
4116
4117         if (issue_flags & IO_URING_F_NONBLOCK)
4118                 return -EAGAIN;
4119
4120         ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
4121                                 ren->newpath, ren->flags);
4122
4123         req->flags &= ~REQ_F_NEED_CLEANUP;
4124         if (ret < 0)
4125                 req_set_fail(req);
4126         io_req_complete(req, ret);
4127         return 0;
4128 }
4129
4130 static int io_unlinkat_prep(struct io_kiocb *req,
4131                             const struct io_uring_sqe *sqe)
4132 {
4133         struct io_unlink *un = &req->unlink;
4134         const char __user *fname;
4135
4136         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4137                 return -EINVAL;
4138         if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
4139             sqe->splice_fd_in)
4140                 return -EINVAL;
4141         if (unlikely(req->flags & REQ_F_FIXED_FILE))
4142                 return -EBADF;
4143
4144         un->dfd = READ_ONCE(sqe->fd);
4145
4146         un->flags = READ_ONCE(sqe->unlink_flags);
4147         if (un->flags & ~AT_REMOVEDIR)
4148                 return -EINVAL;
4149
4150         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
4151         un->filename = getname(fname);
4152         if (IS_ERR(un->filename))
4153                 return PTR_ERR(un->filename);
4154
4155         req->flags |= REQ_F_NEED_CLEANUP;
4156         return 0;
4157 }
4158
4159 static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
4160 {
4161         struct io_unlink *un = &req->unlink;
4162         int ret;
4163
4164         if (issue_flags & IO_URING_F_NONBLOCK)
4165                 return -EAGAIN;
4166
4167         if (un->flags & AT_REMOVEDIR)
4168                 ret = do_rmdir(un->dfd, un->filename);
4169         else
4170                 ret = do_unlinkat(un->dfd, un->filename);
4171
4172         req->flags &= ~REQ_F_NEED_CLEANUP;
4173         if (ret < 0)
4174                 req_set_fail(req);
4175         io_req_complete(req, ret);
4176         return 0;
4177 }
4178
4179 static int io_mkdirat_prep(struct io_kiocb *req,
4180                             const struct io_uring_sqe *sqe)
4181 {
4182         struct io_mkdir *mkd = &req->mkdir;
4183         const char __user *fname;
4184
4185         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4186                 return -EINVAL;
4187         if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index ||
4188             sqe->splice_fd_in)
4189                 return -EINVAL;
4190         if (unlikely(req->flags & REQ_F_FIXED_FILE))
4191                 return -EBADF;
4192
4193         mkd->dfd = READ_ONCE(sqe->fd);
4194         mkd->mode = READ_ONCE(sqe->len);
4195
4196         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
4197         mkd->filename = getname(fname);
4198         if (IS_ERR(mkd->filename))
4199                 return PTR_ERR(mkd->filename);
4200
4201         req->flags |= REQ_F_NEED_CLEANUP;
4202         return 0;
4203 }
4204
4205 static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
4206 {
4207         struct io_mkdir *mkd = &req->mkdir;
4208         int ret;
4209
4210         if (issue_flags & IO_URING_F_NONBLOCK)
4211                 return -EAGAIN;
4212
4213         ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);
4214
4215         req->flags &= ~REQ_F_NEED_CLEANUP;
4216         if (ret < 0)
4217                 req_set_fail(req);
4218         io_req_complete(req, ret);
4219         return 0;
4220 }
4221
4222 static int io_symlinkat_prep(struct io_kiocb *req,
4223                             const struct io_uring_sqe *sqe)
4224 {
4225         struct io_symlink *sl = &req->symlink;
4226         const char __user *oldpath, *newpath;
4227
4228         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4229                 return -EINVAL;
4230         if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index ||
4231             sqe->splice_fd_in)
4232                 return -EINVAL;
4233         if (unlikely(req->flags & REQ_F_FIXED_FILE))
4234                 return -EBADF;
4235
4236         sl->new_dfd = READ_ONCE(sqe->fd);
4237         oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr));
4238         newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4239
4240         sl->oldpath = getname(oldpath);
4241         if (IS_ERR(sl->oldpath))
4242                 return PTR_ERR(sl->oldpath);
4243
4244         sl->newpath = getname(newpath);
4245         if (IS_ERR(sl->newpath)) {
4246                 putname(sl->oldpath);
4247                 return PTR_ERR(sl->newpath);
4248         }
4249
4250         req->flags |= REQ_F_NEED_CLEANUP;
4251         return 0;
4252 }
4253
4254 static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
4255 {
4256         struct io_symlink *sl = &req->symlink;
4257         int ret;
4258
4259         if (issue_flags & IO_URING_F_NONBLOCK)
4260                 return -EAGAIN;
4261
4262         ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath);
4263
4264         req->flags &= ~REQ_F_NEED_CLEANUP;
4265         if (ret < 0)
4266                 req_set_fail(req);
4267         io_req_complete(req, ret);
4268         return 0;
4269 }
4270
4271 static int io_linkat_prep(struct io_kiocb *req,
4272                             const struct io_uring_sqe *sqe)
4273 {
4274         struct io_hardlink *lnk = &req->hardlink;
4275         const char __user *oldf, *newf;
4276
4277         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4278                 return -EINVAL;
4279         if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
4280                 return -EINVAL;
4281         if (unlikely(req->flags & REQ_F_FIXED_FILE))
4282                 return -EBADF;
4283
4284         lnk->old_dfd = READ_ONCE(sqe->fd);
4285         lnk->new_dfd = READ_ONCE(sqe->len);
4286         oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
4287         newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4288         lnk->flags = READ_ONCE(sqe->hardlink_flags);
4289
4290         lnk->oldpath = getname(oldf);
4291         if (IS_ERR(lnk->oldpath))
4292                 return PTR_ERR(lnk->oldpath);
4293
4294         lnk->newpath = getname(newf);
4295         if (IS_ERR(lnk->newpath)) {
4296                 putname(lnk->oldpath);
4297                 return PTR_ERR(lnk->newpath);
4298         }
4299
4300         req->flags |= REQ_F_NEED_CLEANUP;
4301         return 0;
4302 }
4303
4304 static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
4305 {
4306         struct io_hardlink *lnk = &req->hardlink;
4307         int ret;
4308
4309         if (issue_flags & IO_URING_F_NONBLOCK)
4310                 return -EAGAIN;
4311
4312         ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd,
4313                                 lnk->newpath, lnk->flags);
4314
4315         req->flags &= ~REQ_F_NEED_CLEANUP;
4316         if (ret < 0)
4317                 req_set_fail(req);
4318         io_req_complete(req, ret);
4319         return 0;
4320 }
4321
4322 static int io_shutdown_prep(struct io_kiocb *req,
4323                             const struct io_uring_sqe *sqe)
4324 {
4325 #if defined(CONFIG_NET)
4326         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4327                 return -EINVAL;
4328         if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
4329                      sqe->buf_index || sqe->splice_fd_in))
4330                 return -EINVAL;
4331
4332         req->shutdown.how = READ_ONCE(sqe->len);
4333         return 0;
4334 #else
4335         return -EOPNOTSUPP;
4336 #endif
4337 }
4338
4339 static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
4340 {
4341 #if defined(CONFIG_NET)
4342         struct socket *sock;
4343         int ret;
4344
4345         if (issue_flags & IO_URING_F_NONBLOCK)
4346                 return -EAGAIN;
4347
4348         sock = sock_from_file(req->file);
4349         if (unlikely(!sock))
4350                 return -ENOTSOCK;
4351
4352         ret = __sys_shutdown_sock(sock, req->shutdown.how);
4353         if (ret < 0)
4354                 req_set_fail(req);
4355         io_req_complete(req, ret);
4356         return 0;
4357 #else
4358         return -EOPNOTSUPP;
4359 #endif
4360 }
4361
4362 static int __io_splice_prep(struct io_kiocb *req,
4363                             const struct io_uring_sqe *sqe)
4364 {
4365         struct io_splice *sp = &req->splice;
4366         unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
4367
4368         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4369                 return -EINVAL;
4370
4371         sp->len = READ_ONCE(sqe->len);
4372         sp->flags = READ_ONCE(sqe->splice_flags);
4373         if (unlikely(sp->flags & ~valid_flags))
4374                 return -EINVAL;
4375         sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in);
4376         return 0;
4377 }
4378
4379 static int io_tee_prep(struct io_kiocb *req,
4380                        const struct io_uring_sqe *sqe)
4381 {
4382         if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
4383                 return -EINVAL;
4384         return __io_splice_prep(req, sqe);
4385 }
4386
4387 static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
4388 {
4389         struct io_splice *sp = &req->splice;
4390         struct file *out = sp->file_out;
4391         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
4392         struct file *in;
4393         long ret = 0;
4394
4395         if (issue_flags & IO_URING_F_NONBLOCK)
4396                 return -EAGAIN;
4397
4398         if (sp->flags & SPLICE_F_FD_IN_FIXED)
4399                 in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
4400         else
4401                 in = io_file_get_normal(req, sp->splice_fd_in);
4402         if (!in) {
4403                 ret = -EBADF;
4404                 goto done;
4405         }
4406
4407         if (sp->len)
4408                 ret = do_tee(in, out, sp->len, flags);
4409
4410         if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
4411                 io_put_file(in);
4412 done:
4413         if (ret != sp->len)
4414                 req_set_fail(req);
4415         io_req_complete(req, ret);
4416         return 0;
4417 }
4418
4419 static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4420 {
4421         struct io_splice *sp = &req->splice;
4422
4423         sp->off_in = READ_ONCE(sqe->splice_off_in);
4424         sp->off_out = READ_ONCE(sqe->off);
4425         return __io_splice_prep(req, sqe);
4426 }
4427
4428 static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
4429 {
4430         struct io_splice *sp = &req->splice;
4431         struct file *out = sp->file_out;
4432         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
4433         loff_t *poff_in, *poff_out;
4434         struct file *in;
4435         long ret = 0;
4436
4437         if (issue_flags & IO_URING_F_NONBLOCK)
4438                 return -EAGAIN;
4439
4440         if (sp->flags & SPLICE_F_FD_IN_FIXED)
4441                 in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
4442         else
4443                 in = io_file_get_normal(req, sp->splice_fd_in);
4444         if (!in) {
4445                 ret = -EBADF;
4446                 goto done;
4447         }
4448
4449         poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
4450         poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
4451
4452         if (sp->len)
4453                 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
4454
4455         if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
4456                 io_put_file(in);
4457 done:
4458         if (ret != sp->len)
4459                 req_set_fail(req);
4460         io_req_complete(req, ret);
4461         return 0;
4462 }
4463
4464 /*
4465  * IORING_OP_NOP just posts a completion event, nothing else.
4466  */
4467 static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
4468 {
4469         struct io_ring_ctx *ctx = req->ctx;
4470
4471         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4472                 return -EINVAL;
4473
4474         __io_req_complete(req, issue_flags, 0, 0);
4475         return 0;
4476 }
4477
4478 static int io_msg_ring_prep(struct io_kiocb *req,
4479                             const struct io_uring_sqe *sqe)
4480 {
4481         if (unlikely(sqe->addr || sqe->ioprio || sqe->rw_flags ||
4482                      sqe->splice_fd_in || sqe->buf_index || sqe->personality))
4483                 return -EINVAL;
4484
4485         req->msg.user_data = READ_ONCE(sqe->off);
4486         req->msg.len = READ_ONCE(sqe->len);
4487         return 0;
4488 }
4489
4490 static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
4491 {
4492         struct io_ring_ctx *target_ctx;
4493         struct io_msg *msg = &req->msg;
4494         bool filled;
4495         int ret;
4496
4497         ret = -EBADFD;
4498         if (req->file->f_op != &io_uring_fops)
4499                 goto done;
4500
4501         ret = -EOVERFLOW;
4502         target_ctx = req->file->private_data;
4503
4504         spin_lock(&target_ctx->completion_lock);
4505         filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, 0);
4506         io_commit_cqring(target_ctx);
4507         spin_unlock(&target_ctx->completion_lock);
4508
4509         if (filled) {
4510                 io_cqring_ev_posted(target_ctx);
4511                 ret = 0;
4512         }
4513
4514 done:
4515         if (ret < 0)
4516                 req_set_fail(req);
4517         __io_req_complete(req, issue_flags, ret, 0);
4518         return 0;
4519 }
4520
4521 static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4522 {
4523         struct io_ring_ctx *ctx = req->ctx;
4524
4525         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4526                 return -EINVAL;
4527         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
4528                      sqe->splice_fd_in))
4529                 return -EINVAL;
4530
4531         req->sync.flags = READ_ONCE(sqe->fsync_flags);
4532         if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
4533                 return -EINVAL;
4534
4535         req->sync.off = READ_ONCE(sqe->off);
4536         req->sync.len = READ_ONCE(sqe->len);
4537         return 0;
4538 }
4539
4540 static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
4541 {
4542         loff_t end = req->sync.off + req->sync.len;
4543         int ret;
4544
4545         /* fsync always requires a blocking context */
4546         if (issue_flags & IO_URING_F_NONBLOCK)
4547                 return -EAGAIN;
4548
4549         ret = vfs_fsync_range(req->file, req->sync.off,
4550                                 end > 0 ? end : LLONG_MAX,
4551                                 req->sync.flags & IORING_FSYNC_DATASYNC);
4552         if (ret < 0)
4553                 req_set_fail(req);
4554         io_req_complete(req, ret);
4555         return 0;
4556 }
4557
4558 static int io_fallocate_prep(struct io_kiocb *req,
4559                              const struct io_uring_sqe *sqe)
4560 {
4561         if (sqe->ioprio || sqe->buf_index || sqe->rw_flags ||
4562             sqe->splice_fd_in)
4563                 return -EINVAL;
4564         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4565                 return -EINVAL;
4566
4567         req->sync.off = READ_ONCE(sqe->off);
4568         req->sync.len = READ_ONCE(sqe->addr);
4569         req->sync.mode = READ_ONCE(sqe->len);
4570         return 0;
4571 }
4572
4573 static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
4574 {
4575         int ret;
4576
4577         /* fallocate always requiring blocking context */
4578         if (issue_flags & IO_URING_F_NONBLOCK)
4579                 return -EAGAIN;
4580         ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
4581                                 req->sync.len);
4582         if (ret < 0)
4583                 req_set_fail(req);
4584         else
4585                 fsnotify_modify(req->file);
4586         io_req_complete(req, ret);
4587         return 0;
4588 }
4589
4590 static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4591 {
4592         const char __user *fname;
4593         int ret;
4594
4595         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4596                 return -EINVAL;
4597         if (unlikely(sqe->ioprio || sqe->buf_index))
4598                 return -EINVAL;
4599         if (unlikely(req->flags & REQ_F_FIXED_FILE))
4600                 return -EBADF;
4601
4602         /* open.how should be already initialised */
4603         if (!(req->open.how.flags & O_PATH) && force_o_largefile())
4604                 req->open.how.flags |= O_LARGEFILE;
4605
4606         req->open.dfd = READ_ONCE(sqe->fd);
4607         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
4608         req->open.filename = getname(fname);
4609         if (IS_ERR(req->open.filename)) {
4610                 ret = PTR_ERR(req->open.filename);
4611                 req->open.filename = NULL;
4612                 return ret;
4613         }
4614
4615         req->open.file_slot = READ_ONCE(sqe->file_index);
4616         if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC))
4617                 return -EINVAL;
4618
4619         req->open.nofile = rlimit(RLIMIT_NOFILE);
4620         req->flags |= REQ_F_NEED_CLEANUP;
4621         return 0;
4622 }
4623
4624 static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4625 {
4626         u64 mode = READ_ONCE(sqe->len);
4627         u64 flags = READ_ONCE(sqe->open_flags);
4628
4629         req->open.how = build_open_how(flags, mode);
4630         return __io_openat_prep(req, sqe);
4631 }
4632
4633 static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4634 {
4635         struct open_how __user *how;
4636         size_t len;
4637         int ret;
4638
4639         how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4640         len = READ_ONCE(sqe->len);
4641         if (len < OPEN_HOW_SIZE_VER0)
4642                 return -EINVAL;
4643
4644         ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
4645                                         len);
4646         if (ret)
4647                 return ret;
4648
4649         return __io_openat_prep(req, sqe);
4650 }
4651
4652 static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
4653 {
4654         struct open_flags op;
4655         struct file *file;
4656         bool resolve_nonblock, nonblock_set;
4657         bool fixed = !!req->open.file_slot;
4658         int ret;
4659
4660         ret = build_open_flags(&req->open.how, &op);
4661         if (ret)
4662                 goto err;
4663         nonblock_set = op.open_flag & O_NONBLOCK;
4664         resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
4665         if (issue_flags & IO_URING_F_NONBLOCK) {
4666                 /*
4667                  * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
4668                  * it'll always -EAGAIN
4669                  */
4670                 if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
4671                         return -EAGAIN;
4672                 op.lookup_flags |= LOOKUP_CACHED;
4673                 op.open_flag |= O_NONBLOCK;
4674         }
4675
4676         if (!fixed) {
4677                 ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
4678                 if (ret < 0)
4679                         goto err;
4680         }
4681
4682         file = do_filp_open(req->open.dfd, req->open.filename, &op);
4683         if (IS_ERR(file)) {
4684                 /*
4685                  * We could hang on to this 'fd' on retrying, but seems like
4686                  * marginal gain for something that is now known to be a slower
4687                  * path. So just put it, and we'll get a new one when we retry.
4688                  */
4689                 if (!fixed)
4690                         put_unused_fd(ret);
4691
4692                 ret = PTR_ERR(file);
4693                 /* only retry if RESOLVE_CACHED wasn't already set by application */
4694                 if (ret == -EAGAIN &&
4695                     (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)))
4696                         return -EAGAIN;
4697                 goto err;
4698         }
4699
4700         if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
4701                 file->f_flags &= ~O_NONBLOCK;
4702         fsnotify_open(file);
4703
4704         if (!fixed)
4705                 fd_install(ret, file);
4706         else
4707                 ret = io_install_fixed_file(req, file, issue_flags,
4708                                             req->open.file_slot - 1);
4709 err:
4710         putname(req->open.filename);
4711         req->flags &= ~REQ_F_NEED_CLEANUP;
4712         if (ret < 0)
4713                 req_set_fail(req);
4714         __io_req_complete(req, issue_flags, ret, 0);
4715         return 0;
4716 }
4717
/*
 * Issue a prepared openat request. The prep step already converted the
 * arguments into req->open.how, so issuing is identical to openat2.
 */
static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
{
        return io_openat2(req, issue_flags);
}
4722
4723 static int io_remove_buffers_prep(struct io_kiocb *req,
4724                                   const struct io_uring_sqe *sqe)
4725 {
4726         struct io_provide_buf *p = &req->pbuf;
4727         u64 tmp;
4728
4729         if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
4730             sqe->splice_fd_in)
4731                 return -EINVAL;
4732
4733         tmp = READ_ONCE(sqe->fd);
4734         if (!tmp || tmp > USHRT_MAX)
4735                 return -EINVAL;
4736
4737         memset(p, 0, sizeof(*p));
4738         p->nbufs = tmp;
4739         p->bgid = READ_ONCE(sqe->buf_group);
4740         return 0;
4741 }
4742
4743 static int __io_remove_buffers(struct io_ring_ctx *ctx,
4744                                struct io_buffer_list *bl, unsigned nbufs)
4745 {
4746         unsigned i = 0;
4747
4748         /* shouldn't happen */
4749         if (!nbufs)
4750                 return 0;
4751
4752         /* the head kbuf is the list itself */
4753         while (!list_empty(&bl->buf_list)) {
4754                 struct io_buffer *nxt;
4755
4756                 nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
4757                 list_del(&nxt->list);
4758                 if (++i == nbufs)
4759                         return i;
4760                 cond_resched();
4761         }
4762         i++;
4763
4764         return i;
4765 }
4766
4767 static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
4768 {
4769         struct io_provide_buf *p = &req->pbuf;
4770         struct io_ring_ctx *ctx = req->ctx;
4771         struct io_buffer_list *bl;
4772         int ret = 0;
4773
4774         io_ring_submit_lock(ctx, issue_flags);
4775
4776         ret = -ENOENT;
4777         bl = io_buffer_get_list(ctx, p->bgid);
4778         if (bl)
4779                 ret = __io_remove_buffers(ctx, bl, p->nbufs);
4780         if (ret < 0)
4781                 req_set_fail(req);
4782
4783         /* complete before unlock, IOPOLL may need the lock */
4784         __io_req_complete(req, issue_flags, ret, 0);
4785         io_ring_submit_unlock(ctx, issue_flags);
4786         return 0;
4787 }
4788
4789 static int io_provide_buffers_prep(struct io_kiocb *req,
4790                                    const struct io_uring_sqe *sqe)
4791 {
4792         unsigned long size, tmp_check;
4793         struct io_provide_buf *p = &req->pbuf;
4794         u64 tmp;
4795
4796         if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
4797                 return -EINVAL;
4798
4799         tmp = READ_ONCE(sqe->fd);
4800         if (!tmp || tmp > USHRT_MAX)
4801                 return -E2BIG;
4802         p->nbufs = tmp;
4803         p->addr = READ_ONCE(sqe->addr);
4804         p->len = READ_ONCE(sqe->len);
4805
4806         if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
4807                                 &size))
4808                 return -EOVERFLOW;
4809         if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
4810                 return -EOVERFLOW;
4811
4812         size = (unsigned long)p->len * p->nbufs;
4813         if (!access_ok(u64_to_user_ptr(p->addr), size))
4814                 return -EFAULT;
4815
4816         p->bgid = READ_ONCE(sqe->buf_group);
4817         tmp = READ_ONCE(sqe->off);
4818         if (tmp > USHRT_MAX)
4819                 return -E2BIG;
4820         p->bid = tmp;
4821         return 0;
4822 }
4823
4824 static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
4825 {
4826         struct io_buffer *buf;
4827         struct page *page;
4828         int bufs_in_page;
4829
4830         /*
4831          * Completions that don't happen inline (eg not under uring_lock) will
4832          * add to ->io_buffers_comp. If we don't have any free buffers, check
4833          * the completion list and splice those entries first.
4834          */
4835         if (!list_empty_careful(&ctx->io_buffers_comp)) {
4836                 spin_lock(&ctx->completion_lock);
4837                 if (!list_empty(&ctx->io_buffers_comp)) {
4838                         list_splice_init(&ctx->io_buffers_comp,
4839                                                 &ctx->io_buffers_cache);
4840                         spin_unlock(&ctx->completion_lock);
4841                         return 0;
4842                 }
4843                 spin_unlock(&ctx->completion_lock);
4844         }
4845
4846         /*
4847          * No free buffers and no completion entries either. Allocate a new
4848          * page worth of buffer entries and add those to our freelist.
4849          */
4850         page = alloc_page(GFP_KERNEL_ACCOUNT);
4851         if (!page)
4852                 return -ENOMEM;
4853
4854         list_add(&page->lru, &ctx->io_buffers_pages);
4855
4856         buf = page_address(page);
4857         bufs_in_page = PAGE_SIZE / sizeof(*buf);
4858         while (bufs_in_page) {
4859                 list_add_tail(&buf->list, &ctx->io_buffers_cache);
4860                 buf++;
4861                 bufs_in_page--;
4862         }
4863
4864         return 0;
4865 }
4866
4867 static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
4868                           struct io_buffer_list *bl)
4869 {
4870         struct io_buffer *buf;
4871         u64 addr = pbuf->addr;
4872         int i, bid = pbuf->bid;
4873
4874         for (i = 0; i < pbuf->nbufs; i++) {
4875                 if (list_empty(&ctx->io_buffers_cache) &&
4876                     io_refill_buffer_cache(ctx))
4877                         break;
4878                 buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
4879                                         list);
4880                 list_move_tail(&buf->list, &bl->buf_list);
4881                 buf->addr = addr;
4882                 buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
4883                 buf->bid = bid;
4884                 buf->bgid = pbuf->bgid;
4885                 addr += pbuf->len;
4886                 bid++;
4887                 cond_resched();
4888         }
4889
4890         return i ? 0 : -ENOMEM;
4891 }
4892
4893 static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
4894 {
4895         struct io_provide_buf *p = &req->pbuf;
4896         struct io_ring_ctx *ctx = req->ctx;
4897         struct io_buffer_list *bl;
4898         int ret = 0;
4899
4900         io_ring_submit_lock(ctx, issue_flags);
4901
4902         bl = io_buffer_get_list(ctx, p->bgid);
4903         if (unlikely(!bl)) {
4904                 bl = kmalloc(sizeof(*bl), GFP_KERNEL);
4905                 if (!bl) {
4906                         ret = -ENOMEM;
4907                         goto err;
4908                 }
4909                 io_buffer_add_list(ctx, bl, p->bgid);
4910         }
4911
4912         ret = io_add_buffers(ctx, p, bl);
4913 err:
4914         if (ret < 0)
4915                 req_set_fail(req);
4916         /* complete before unlock, IOPOLL may need the lock */
4917         __io_req_complete(req, issue_flags, ret, 0);
4918         io_ring_submit_unlock(ctx, issue_flags);
4919         return 0;
4920 }
4921
4922 static int io_epoll_ctl_prep(struct io_kiocb *req,
4923                              const struct io_uring_sqe *sqe)
4924 {
4925 #if defined(CONFIG_EPOLL)
4926         if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
4927                 return -EINVAL;
4928         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4929                 return -EINVAL;
4930
4931         req->epoll.epfd = READ_ONCE(sqe->fd);
4932         req->epoll.op = READ_ONCE(sqe->len);
4933         req->epoll.fd = READ_ONCE(sqe->off);
4934
4935         if (ep_op_has_event(req->epoll.op)) {
4936                 struct epoll_event __user *ev;
4937
4938                 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
4939                 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
4940                         return -EFAULT;
4941         }
4942
4943         return 0;
4944 #else
4945         return -EOPNOTSUPP;
4946 #endif
4947 }
4948
4949 static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
4950 {
4951 #if defined(CONFIG_EPOLL)
4952         struct io_epoll *ie = &req->epoll;
4953         int ret;
4954         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4955
4956         ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
4957         if (force_nonblock && ret == -EAGAIN)
4958                 return -EAGAIN;
4959
4960         if (ret < 0)
4961                 req_set_fail(req);
4962         __io_req_complete(req, issue_flags, ret, 0);
4963         return 0;
4964 #else
4965         return -EOPNOTSUPP;
4966 #endif
4967 }
4968
4969 static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4970 {
4971 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4972         if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in)
4973                 return -EINVAL;
4974         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4975                 return -EINVAL;
4976
4977         req->madvise.addr = READ_ONCE(sqe->addr);
4978         req->madvise.len = READ_ONCE(sqe->len);
4979         req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
4980         return 0;
4981 #else
4982         return -EOPNOTSUPP;
4983 #endif
4984 }
4985
4986 static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
4987 {
4988 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4989         struct io_madvise *ma = &req->madvise;
4990         int ret;
4991
4992         if (issue_flags & IO_URING_F_NONBLOCK)
4993                 return -EAGAIN;
4994
4995         ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
4996         if (ret < 0)
4997                 req_set_fail(req);
4998         io_req_complete(req, ret);
4999         return 0;
5000 #else
5001         return -EOPNOTSUPP;
5002 #endif
5003 }
5004
5005 static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5006 {
5007         if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in)
5008                 return -EINVAL;
5009         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5010                 return -EINVAL;
5011
5012         req->fadvise.offset = READ_ONCE(sqe->off);
5013         req->fadvise.len = READ_ONCE(sqe->len);
5014         req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
5015         return 0;
5016 }
5017
5018 static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
5019 {
5020         struct io_fadvise *fa = &req->fadvise;
5021         int ret;
5022
5023         if (issue_flags & IO_URING_F_NONBLOCK) {
5024                 switch (fa->advice) {
5025                 case POSIX_FADV_NORMAL:
5026                 case POSIX_FADV_RANDOM:
5027                 case POSIX_FADV_SEQUENTIAL:
5028                         break;
5029                 default:
5030                         return -EAGAIN;
5031                 }
5032         }
5033
5034         ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
5035         if (ret < 0)
5036                 req_set_fail(req);
5037         __io_req_complete(req, issue_flags, ret, 0);
5038         return 0;
5039 }
5040
5041 static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5042 {
5043         const char __user *path;
5044
5045         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5046                 return -EINVAL;
5047         if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
5048                 return -EINVAL;
5049         if (req->flags & REQ_F_FIXED_FILE)
5050                 return -EBADF;
5051
5052         req->statx.dfd = READ_ONCE(sqe->fd);
5053         req->statx.mask = READ_ONCE(sqe->len);
5054         path = u64_to_user_ptr(READ_ONCE(sqe->addr));
5055         req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
5056         req->statx.flags = READ_ONCE(sqe->statx_flags);
5057
5058         req->statx.filename = getname_flags(path,
5059                                         getname_statx_lookup_flags(req->statx.flags),
5060                                         NULL);
5061
5062         if (IS_ERR(req->statx.filename)) {
5063                 int ret = PTR_ERR(req->statx.filename);
5064
5065                 req->statx.filename = NULL;
5066                 return ret;
5067         }
5068
5069         req->flags |= REQ_F_NEED_CLEANUP;
5070         return 0;
5071 }
5072
5073 static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
5074 {
5075         struct io_statx *ctx = &req->statx;
5076         int ret;
5077
5078         if (issue_flags & IO_URING_F_NONBLOCK)
5079                 return -EAGAIN;
5080
5081         ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
5082                        ctx->buffer);
5083
5084         if (ret < 0)
5085                 req_set_fail(req);
5086         io_req_complete(req, ret);
5087         return 0;
5088 }
5089
5090 static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5091 {
5092         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5093                 return -EINVAL;
5094         if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
5095             sqe->rw_flags || sqe->buf_index)
5096                 return -EINVAL;
5097         if (req->flags & REQ_F_FIXED_FILE)
5098                 return -EBADF;
5099
5100         req->close.fd = READ_ONCE(sqe->fd);
5101         req->close.file_slot = READ_ONCE(sqe->file_index);
5102         if (req->close.file_slot && req->close.fd)
5103                 return -EINVAL;
5104
5105         return 0;
5106 }
5107
5108 static int io_close(struct io_kiocb *req, unsigned int issue_flags)
5109 {
5110         struct files_struct *files = current->files;
5111         struct io_close *close = &req->close;
5112         struct fdtable *fdt;
5113         struct file *file = NULL;
5114         int ret = -EBADF;
5115
5116         if (req->close.file_slot) {
5117                 ret = io_close_fixed(req, issue_flags);
5118                 goto err;
5119         }
5120
5121         spin_lock(&files->file_lock);
5122         fdt = files_fdtable(files);
5123         if (close->fd >= fdt->max_fds) {
5124                 spin_unlock(&files->file_lock);
5125                 goto err;
5126         }
5127         file = fdt->fd[close->fd];
5128         if (!file || file->f_op == &io_uring_fops) {
5129                 spin_unlock(&files->file_lock);
5130                 file = NULL;
5131                 goto err;
5132         }
5133
5134         /* if the file has a flush method, be safe and punt to async */
5135         if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
5136                 spin_unlock(&files->file_lock);
5137                 return -EAGAIN;
5138         }
5139
5140         ret = __close_fd_get_file(close->fd, &file);
5141         spin_unlock(&files->file_lock);
5142         if (ret < 0) {
5143                 if (ret == -ENOENT)
5144                         ret = -EBADF;
5145                 goto err;
5146         }
5147
5148         /* No ->flush() or already async, safely close from here */
5149         ret = filp_close(file, current->files);
5150 err:
5151         if (ret < 0)
5152                 req_set_fail(req);
5153         if (file)
5154                 fput(file);
5155         __io_req_complete(req, issue_flags, ret, 0);
5156         return 0;
5157 }
5158
5159 static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5160 {
5161         struct io_ring_ctx *ctx = req->ctx;
5162
5163         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
5164                 return -EINVAL;
5165         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
5166                      sqe->splice_fd_in))
5167                 return -EINVAL;
5168
5169         req->sync.off = READ_ONCE(sqe->off);
5170         req->sync.len = READ_ONCE(sqe->len);
5171         req->sync.flags = READ_ONCE(sqe->sync_range_flags);
5172         return 0;
5173 }
5174
5175 static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
5176 {
5177         int ret;
5178
5179         /* sync_file_range always requires a blocking context */
5180         if (issue_flags & IO_URING_F_NONBLOCK)
5181                 return -EAGAIN;
5182
5183         ret = sync_file_range(req->file, req->sync.off, req->sync.len,
5184                                 req->sync.flags);
5185         if (ret < 0)
5186                 req_set_fail(req);
5187         io_req_complete(req, ret);
5188         return 0;
5189 }
5190
5191 #if defined(CONFIG_NET)
5192 static int io_setup_async_msg(struct io_kiocb *req,
5193                               struct io_async_msghdr *kmsg)
5194 {
5195         struct io_async_msghdr *async_msg = req->async_data;
5196
5197         if (async_msg)
5198                 return -EAGAIN;
5199         if (io_alloc_async_data(req)) {
5200                 kfree(kmsg->free_iov);
5201                 return -ENOMEM;
5202         }
5203         async_msg = req->async_data;
5204         req->flags |= REQ_F_NEED_CLEANUP;
5205         memcpy(async_msg, kmsg, sizeof(*kmsg));
5206         async_msg->msg.msg_name = &async_msg->addr;
5207         /* if were using fast_iov, set it to the new one */
5208         if (!async_msg->free_iov)
5209                 async_msg->msg.msg_iter.iov = async_msg->fast_iov;
5210
5211         return -EAGAIN;
5212 }
5213
5214 static int io_sendmsg_copy_hdr(struct io_kiocb *req,
5215                                struct io_async_msghdr *iomsg)
5216 {
5217         iomsg->msg.msg_name = &iomsg->addr;
5218         iomsg->free_iov = iomsg->fast_iov;
5219         return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
5220                                    req->sr_msg.msg_flags, &iomsg->free_iov);
5221 }
5222
5223 static int io_sendmsg_prep_async(struct io_kiocb *req)
5224 {
5225         int ret;
5226
5227         ret = io_sendmsg_copy_hdr(req, req->async_data);
5228         if (!ret)
5229                 req->flags |= REQ_F_NEED_CLEANUP;
5230         return ret;
5231 }
5232
5233 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5234 {
5235         struct io_sr_msg *sr = &req->sr_msg;
5236
5237         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5238                 return -EINVAL;
5239
5240         sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
5241         sr->len = READ_ONCE(sqe->len);
5242         sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
5243         if (sr->msg_flags & MSG_DONTWAIT)
5244                 req->flags |= REQ_F_NOWAIT;
5245
5246 #ifdef CONFIG_COMPAT
5247         if (req->ctx->compat)
5248                 sr->msg_flags |= MSG_CMSG_COMPAT;
5249 #endif
5250         return 0;
5251 }
5252
5253 static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
5254 {
5255         struct io_async_msghdr iomsg, *kmsg;
5256         struct socket *sock;
5257         unsigned flags;
5258         int min_ret = 0;
5259         int ret;
5260
5261         sock = sock_from_file(req->file);
5262         if (unlikely(!sock))
5263                 return -ENOTSOCK;
5264
5265         if (req_has_async_data(req)) {
5266                 kmsg = req->async_data;
5267         } else {
5268                 ret = io_sendmsg_copy_hdr(req, &iomsg);
5269                 if (ret)
5270                         return ret;
5271                 kmsg = &iomsg;
5272         }
5273
5274         flags = req->sr_msg.msg_flags;
5275         if (issue_flags & IO_URING_F_NONBLOCK)
5276                 flags |= MSG_DONTWAIT;
5277         if (flags & MSG_WAITALL)
5278                 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
5279
5280         ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
5281
5282         if (ret < min_ret) {
5283                 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
5284                         return io_setup_async_msg(req, kmsg);
5285                 if (ret == -ERESTARTSYS)
5286                         ret = -EINTR;
5287                 req_set_fail(req);
5288         }
5289         /* fast path, check for non-NULL to avoid function call */
5290         if (kmsg->free_iov)
5291                 kfree(kmsg->free_iov);
5292         req->flags &= ~REQ_F_NEED_CLEANUP;
5293         __io_req_complete(req, issue_flags, ret, 0);
5294         return 0;
5295 }
5296
5297 static int io_send(struct io_kiocb *req, unsigned int issue_flags)
5298 {
5299         struct io_sr_msg *sr = &req->sr_msg;
5300         struct msghdr msg;
5301         struct iovec iov;
5302         struct socket *sock;
5303         unsigned flags;
5304         int min_ret = 0;
5305         int ret;
5306
5307         sock = sock_from_file(req->file);
5308         if (unlikely(!sock))
5309                 return -ENOTSOCK;
5310
5311         ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
5312         if (unlikely(ret))
5313                 return ret;
5314
5315         msg.msg_name = NULL;
5316         msg.msg_control = NULL;
5317         msg.msg_controllen = 0;
5318         msg.msg_namelen = 0;
5319
5320         flags = req->sr_msg.msg_flags;
5321         if (issue_flags & IO_URING_F_NONBLOCK)
5322                 flags |= MSG_DONTWAIT;
5323         if (flags & MSG_WAITALL)
5324                 min_ret = iov_iter_count(&msg.msg_iter);
5325
5326         msg.msg_flags = flags;
5327         ret = sock_sendmsg(sock, &msg);
5328         if (ret < min_ret) {
5329                 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
5330                         return -EAGAIN;
5331                 if (ret == -ERESTARTSYS)
5332                         ret = -EINTR;
5333                 req_set_fail(req);
5334         }
5335         __io_req_complete(req, issue_flags, ret, 0);
5336         return 0;
5337 }
5338
5339 static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
5340                                  struct io_async_msghdr *iomsg)
5341 {
5342         struct io_sr_msg *sr = &req->sr_msg;
5343         struct iovec __user *uiov;
5344         size_t iov_len;
5345         int ret;
5346
5347         ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
5348                                         &iomsg->uaddr, &uiov, &iov_len);
5349         if (ret)
5350                 return ret;
5351
5352         if (req->flags & REQ_F_BUFFER_SELECT) {
5353                 if (iov_len > 1)
5354                         return -EINVAL;
5355                 if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
5356                         return -EFAULT;
5357                 sr->len = iomsg->fast_iov[0].iov_len;
5358                 iomsg->free_iov = NULL;
5359         } else {
5360                 iomsg->free_iov = iomsg->fast_iov;
5361                 ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
5362                                      &iomsg->free_iov, &iomsg->msg.msg_iter,
5363                                      false);
5364                 if (ret > 0)
5365                         ret = 0;
5366         }
5367
5368         return ret;
5369 }
5370
5371 #ifdef CONFIG_COMPAT
5372 static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
5373                                         struct io_async_msghdr *iomsg)
5374 {
5375         struct io_sr_msg *sr = &req->sr_msg;
5376         struct compat_iovec __user *uiov;
5377         compat_uptr_t ptr;
5378         compat_size_t len;
5379         int ret;
5380
5381         ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
5382                                   &ptr, &len);
5383         if (ret)
5384                 return ret;
5385
5386         uiov = compat_ptr(ptr);
5387         if (req->flags & REQ_F_BUFFER_SELECT) {
5388                 compat_ssize_t clen;
5389
5390                 if (len > 1)
5391                         return -EINVAL;
5392                 if (!access_ok(uiov, sizeof(*uiov)))
5393                         return -EFAULT;
5394                 if (__get_user(clen, &uiov->iov_len))
5395                         return -EFAULT;
5396                 if (clen < 0)
5397                         return -EINVAL;
5398                 sr->len = clen;
5399                 iomsg->free_iov = NULL;
5400         } else {
5401                 iomsg->free_iov = iomsg->fast_iov;
5402                 ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
5403                                    UIO_FASTIOV, &iomsg->free_iov,
5404                                    &iomsg->msg.msg_iter, true);
5405                 if (ret < 0)
5406                         return ret;
5407         }
5408
5409         return 0;
5410 }
5411 #endif
5412
5413 static int io_recvmsg_copy_hdr(struct io_kiocb *req,
5414                                struct io_async_msghdr *iomsg)
5415 {
5416         iomsg->msg.msg_name = &iomsg->addr;
5417
5418 #ifdef CONFIG_COMPAT
5419         if (req->ctx->compat)
5420                 return __io_compat_recvmsg_copy_hdr(req, iomsg);
5421 #endif
5422
5423         return __io_recvmsg_copy_hdr(req, iomsg);
5424 }
5425
5426 static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
5427                                                unsigned int issue_flags)
5428 {
5429         struct io_sr_msg *sr = &req->sr_msg;
5430
5431         return io_buffer_select(req, &sr->len, sr->bgid, issue_flags);
5432 }
5433
5434 static int io_recvmsg_prep_async(struct io_kiocb *req)
5435 {
5436         int ret;
5437
5438         ret = io_recvmsg_copy_hdr(req, req->async_data);
5439         if (!ret)
5440                 req->flags |= REQ_F_NEED_CLEANUP;
5441         return ret;
5442 }
5443
5444 static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5445 {
5446         struct io_sr_msg *sr = &req->sr_msg;
5447
5448         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5449                 return -EINVAL;
5450
5451         sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
5452         sr->len = READ_ONCE(sqe->len);
5453         sr->bgid = READ_ONCE(sqe->buf_group);
5454         sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
5455         if (sr->msg_flags & MSG_DONTWAIT)
5456                 req->flags |= REQ_F_NOWAIT;
5457
5458 #ifdef CONFIG_COMPAT
5459         if (req->ctx->compat)
5460                 sr->msg_flags |= MSG_CMSG_COMPAT;
5461 #endif
5462         sr->done_io = 0;
5463         return 0;
5464 }
5465
5466 static bool io_net_retry(struct socket *sock, int flags)
5467 {
5468         if (!(flags & MSG_WAITALL))
5469                 return false;
5470         return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
5471 }
5472
5473 static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
5474 {
5475         struct io_async_msghdr iomsg, *kmsg;
5476         struct io_sr_msg *sr = &req->sr_msg;
5477         struct socket *sock;
5478         struct io_buffer *kbuf;
5479         unsigned flags;
5480         int ret, min_ret = 0;
5481         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
5482
5483         sock = sock_from_file(req->file);
5484         if (unlikely(!sock))
5485                 return -ENOTSOCK;
5486
5487         if (req_has_async_data(req)) {
5488                 kmsg = req->async_data;
5489         } else {
5490                 ret = io_recvmsg_copy_hdr(req, &iomsg);
5491                 if (ret)
5492                         return ret;
5493                 kmsg = &iomsg;
5494         }
5495
5496         if (req->flags & REQ_F_BUFFER_SELECT) {
5497                 kbuf = io_recv_buffer_select(req, issue_flags);
5498                 if (IS_ERR(kbuf))
5499                         return PTR_ERR(kbuf);
5500                 kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
5501                 kmsg->fast_iov[0].iov_len = req->sr_msg.len;
5502                 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
5503                                 1, req->sr_msg.len);
5504         }
5505
5506         flags = req->sr_msg.msg_flags;
5507         if (force_nonblock)
5508                 flags |= MSG_DONTWAIT;
5509         if (flags & MSG_WAITALL)
5510                 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
5511
5512         ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
5513                                         kmsg->uaddr, flags);
5514         if (ret < min_ret) {
5515                 if (ret == -EAGAIN && force_nonblock)
5516                         return io_setup_async_msg(req, kmsg);
5517                 if (ret == -ERESTARTSYS)
5518                         ret = -EINTR;
5519                 if (ret > 0 && io_net_retry(sock, flags)) {
5520                         sr->done_io += ret;
5521                         req->flags |= REQ_F_PARTIAL_IO;
5522                         return io_setup_async_msg(req, kmsg);
5523                 }
5524                 req_set_fail(req);
5525         } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
5526                 req_set_fail(req);
5527         }
5528
5529         /* fast path, check for non-NULL to avoid function call */
5530         if (kmsg->free_iov)
5531                 kfree(kmsg->free_iov);
5532         req->flags &= ~REQ_F_NEED_CLEANUP;
5533         if (ret >= 0)
5534                 ret += sr->done_io;
5535         else if (sr->done_io)
5536                 ret = sr->done_io;
5537         __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
5538         return 0;
5539 }
5540
5541 static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
5542 {
5543         struct io_buffer *kbuf;
5544         struct io_sr_msg *sr = &req->sr_msg;
5545         struct msghdr msg;
5546         void __user *buf = sr->buf;
5547         struct socket *sock;
5548         struct iovec iov;
5549         unsigned flags;
5550         int ret, min_ret = 0;
5551         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
5552
5553         sock = sock_from_file(req->file);
5554         if (unlikely(!sock))
5555                 return -ENOTSOCK;
5556
5557         if (req->flags & REQ_F_BUFFER_SELECT) {
5558                 kbuf = io_recv_buffer_select(req, issue_flags);
5559                 if (IS_ERR(kbuf))
5560                         return PTR_ERR(kbuf);
5561                 buf = u64_to_user_ptr(kbuf->addr);
5562         }
5563
5564         ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
5565         if (unlikely(ret))
5566                 goto out_free;
5567
5568         msg.msg_name = NULL;
5569         msg.msg_control = NULL;
5570         msg.msg_controllen = 0;
5571         msg.msg_namelen = 0;
5572         msg.msg_iocb = NULL;
5573         msg.msg_flags = 0;
5574
5575         flags = req->sr_msg.msg_flags;
5576         if (force_nonblock)
5577                 flags |= MSG_DONTWAIT;
5578         if (flags & MSG_WAITALL)
5579                 min_ret = iov_iter_count(&msg.msg_iter);
5580
5581         ret = sock_recvmsg(sock, &msg, flags);
5582         if (ret < min_ret) {
5583                 if (ret == -EAGAIN && force_nonblock)
5584                         return -EAGAIN;
5585                 if (ret == -ERESTARTSYS)
5586                         ret = -EINTR;
5587                 if (ret > 0 && io_net_retry(sock, flags)) {
5588                         sr->len -= ret;
5589                         sr->buf += ret;
5590                         sr->done_io += ret;
5591                         req->flags |= REQ_F_PARTIAL_IO;
5592                         return -EAGAIN;
5593                 }
5594                 req_set_fail(req);
5595         } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
5596 out_free:
5597                 req_set_fail(req);
5598         }
5599
5600         if (ret >= 0)
5601                 ret += sr->done_io;
5602         else if (sr->done_io)
5603                 ret = sr->done_io;
5604         __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
5605         return 0;
5606 }
5607
5608 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5609 {
5610         struct io_accept *accept = &req->accept;
5611
5612         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5613                 return -EINVAL;
5614         if (sqe->ioprio || sqe->len || sqe->buf_index)
5615                 return -EINVAL;
5616
5617         accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
5618         accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
5619         accept->flags = READ_ONCE(sqe->accept_flags);
5620         accept->nofile = rlimit(RLIMIT_NOFILE);
5621
5622         accept->file_slot = READ_ONCE(sqe->file_index);
5623         if (accept->file_slot && (accept->flags & SOCK_CLOEXEC))
5624                 return -EINVAL;
5625         if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
5626                 return -EINVAL;
5627         if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
5628                 accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
5629         return 0;
5630 }
5631
5632 static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
5633 {
5634         struct io_accept *accept = &req->accept;
5635         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
5636         unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
5637         bool fixed = !!accept->file_slot;
5638         struct file *file;
5639         int ret, fd;
5640
5641         if (!fixed) {
5642                 fd = __get_unused_fd_flags(accept->flags, accept->nofile);
5643                 if (unlikely(fd < 0))
5644                         return fd;
5645         }
5646         file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
5647                          accept->flags);
5648         if (IS_ERR(file)) {
5649                 if (!fixed)
5650                         put_unused_fd(fd);
5651                 ret = PTR_ERR(file);
5652                 if (ret == -EAGAIN && force_nonblock)
5653                         return -EAGAIN;
5654                 if (ret == -ERESTARTSYS)
5655                         ret = -EINTR;
5656                 req_set_fail(req);
5657         } else if (!fixed) {
5658                 fd_install(fd, file);
5659                 ret = fd;
5660         } else {
5661                 ret = io_install_fixed_file(req, file, issue_flags,
5662                                             accept->file_slot - 1);
5663         }
5664         __io_req_complete(req, issue_flags, ret, 0);
5665         return 0;
5666 }
5667
5668 static int io_connect_prep_async(struct io_kiocb *req)
5669 {
5670         struct io_async_connect *io = req->async_data;
5671         struct io_connect *conn = &req->connect;
5672
5673         return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
5674 }
5675
5676 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5677 {
5678         struct io_connect *conn = &req->connect;
5679
5680         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5681                 return -EINVAL;
5682         if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags ||
5683             sqe->splice_fd_in)
5684                 return -EINVAL;
5685
5686         conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
5687         conn->addr_len =  READ_ONCE(sqe->addr2);
5688         return 0;
5689 }
5690
5691 static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
5692 {
5693         struct io_async_connect __io, *io;
5694         unsigned file_flags;
5695         int ret;
5696         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
5697
5698         if (req_has_async_data(req)) {
5699                 io = req->async_data;
5700         } else {
5701                 ret = move_addr_to_kernel(req->connect.addr,
5702                                                 req->connect.addr_len,
5703                                                 &__io.address);
5704                 if (ret)
5705                         goto out;
5706                 io = &__io;
5707         }
5708
5709         file_flags = force_nonblock ? O_NONBLOCK : 0;
5710
5711         ret = __sys_connect_file(req->file, &io->address,
5712                                         req->connect.addr_len, file_flags);
5713         if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
5714                 if (req_has_async_data(req))
5715                         return -EAGAIN;
5716                 if (io_alloc_async_data(req)) {
5717                         ret = -ENOMEM;
5718                         goto out;
5719                 }
5720                 memcpy(req->async_data, &__io, sizeof(__io));
5721                 return -EAGAIN;
5722         }
5723         if (ret == -ERESTARTSYS)
5724                 ret = -EINTR;
5725 out:
5726         if (ret < 0)
5727                 req_set_fail(req);
5728         __io_req_complete(req, issue_flags, ret, 0);
5729         return 0;
5730 }
5731 #else /* !CONFIG_NET */
5732 #define IO_NETOP_FN(op)                                                 \
5733 static int io_##op(struct io_kiocb *req, unsigned int issue_flags)      \
5734 {                                                                       \
5735         return -EOPNOTSUPP;                                             \
5736 }
5737
5738 #define IO_NETOP_PREP(op)                                               \
5739 IO_NETOP_FN(op)                                                         \
5740 static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
5741 {                                                                       \
5742         return -EOPNOTSUPP;                                             \
5743 }                                                                       \
5744
5745 #define IO_NETOP_PREP_ASYNC(op)                                         \
5746 IO_NETOP_PREP(op)                                                       \
5747 static int io_##op##_prep_async(struct io_kiocb *req)                   \
5748 {                                                                       \
5749         return -EOPNOTSUPP;                                             \
5750 }
5751
5752 IO_NETOP_PREP_ASYNC(sendmsg);
5753 IO_NETOP_PREP_ASYNC(recvmsg);
5754 IO_NETOP_PREP_ASYNC(connect);
5755 IO_NETOP_PREP(accept);
5756 IO_NETOP_FN(send);
5757 IO_NETOP_FN(recv);
5758 #endif /* CONFIG_NET */
5759
5760 struct io_poll_table {
5761         struct poll_table_struct pt;
5762         struct io_kiocb *req;
5763         int nr_entries;
5764         int error;
5765 };
5766
5767 #define IO_POLL_CANCEL_FLAG     BIT(31)
5768 #define IO_POLL_REF_MASK        GENMASK(30, 0)
5769
5770 /*
5771  * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can
5772  * bump it and acquire ownership. It's disallowed to modify requests while not
5773  * owning it, that prevents from races for enqueueing task_work's and b/w
5774  * arming poll and wakeups.
5775  */
5776 static inline bool io_poll_get_ownership(struct io_kiocb *req)
5777 {
5778         return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
5779 }
5780
5781 static void io_poll_mark_cancelled(struct io_kiocb *req)
5782 {
5783         atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs);
5784 }
5785
5786 static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
5787 {
5788         /* pure poll stashes this in ->async_data, poll driven retry elsewhere */
5789         if (req->opcode == IORING_OP_POLL_ADD)
5790                 return req->async_data;
5791         return req->apoll->double_poll;
5792 }
5793
5794 static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
5795 {
5796         if (req->opcode == IORING_OP_POLL_ADD)
5797                 return &req->poll;
5798         return &req->apoll->poll;
5799 }
5800
5801 static void io_poll_req_insert(struct io_kiocb *req)
5802 {
5803         struct io_ring_ctx *ctx = req->ctx;
5804         struct hlist_head *list;
5805
5806         list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)];
5807         hlist_add_head(&req->hash_node, list);
5808 }
5809
5810 static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
5811                               wait_queue_func_t wake_func)
5812 {
5813         poll->head = NULL;
5814 #define IO_POLL_UNMASK  (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
5815         /* mask in events that we always want/need */
5816         poll->events = events | IO_POLL_UNMASK;
5817         INIT_LIST_HEAD(&poll->wait.entry);
5818         init_waitqueue_func_entry(&poll->wait, wake_func);
5819 }
5820
5821 static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
5822 {
5823         struct wait_queue_head *head = smp_load_acquire(&poll->head);
5824
5825         if (head) {
5826                 spin_lock_irq(&head->lock);
5827                 list_del_init(&poll->wait.entry);
5828                 poll->head = NULL;
5829                 spin_unlock_irq(&head->lock);
5830         }
5831 }
5832
5833 static void io_poll_remove_entries(struct io_kiocb *req)
5834 {
5835         /*
5836          * Nothing to do if neither of those flags are set. Avoid dipping
5837          * into the poll/apoll/double cachelines if we can.
5838          */
5839         if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL)))
5840                 return;
5841
5842         /*
5843          * While we hold the waitqueue lock and the waitqueue is nonempty,
5844          * wake_up_pollfree() will wait for us.  However, taking the waitqueue
5845          * lock in the first place can race with the waitqueue being freed.
5846          *
5847          * We solve this as eventpoll does: by taking advantage of the fact that
5848          * all users of wake_up_pollfree() will RCU-delay the actual free.  If
5849          * we enter rcu_read_lock() and see that the pointer to the queue is
5850          * non-NULL, we can then lock it without the memory being freed out from
5851          * under us.
5852          *
5853          * Keep holding rcu_read_lock() as long as we hold the queue lock, in
5854          * case the caller deletes the entry from the queue, leaving it empty.
5855          * In that case, only RCU prevents the queue memory from being freed.
5856          */
5857         rcu_read_lock();
5858         if (req->flags & REQ_F_SINGLE_POLL)
5859                 io_poll_remove_entry(io_poll_get_single(req));
5860         if (req->flags & REQ_F_DOUBLE_POLL)
5861                 io_poll_remove_entry(io_poll_get_double(req));
5862         rcu_read_unlock();
5863 }
5864
5865 /*
5866  * All poll tw should go through this. Checks for poll events, manages
5867  * references, does rewait, etc.
5868  *
5869  * Returns a negative error on failure. >0 when no action require, which is
5870  * either spurious wakeup or multishot CQE is served. 0 when it's done with
5871  * the request, then the mask is stored in req->cqe.res.
5872  */
5873 static int io_poll_check_events(struct io_kiocb *req, bool locked)
5874 {
5875         struct io_ring_ctx *ctx = req->ctx;
5876         int v;
5877
5878         /* req->task == current here, checking PF_EXITING is safe */
5879         if (unlikely(req->task->flags & PF_EXITING))
5880                 io_poll_mark_cancelled(req);
5881
5882         do {
5883                 v = atomic_read(&req->poll_refs);
5884
5885                 /* tw handler should be the owner, and so have some references */
5886                 if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK)))
5887                         return 0;
5888                 if (v & IO_POLL_CANCEL_FLAG)
5889                         return -ECANCELED;
5890
5891                 if (!req->cqe.res) {
5892                         struct poll_table_struct pt = { ._key = req->apoll_events };
5893                         unsigned flags = locked ? 0 : IO_URING_F_UNLOCKED;
5894
5895                         if (unlikely(!io_assign_file(req, flags)))
5896                                 return -EBADF;
5897                         req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events;
5898                 }
5899
5900                 /* multishot, just fill an CQE and proceed */
5901                 if (req->cqe.res && !(req->apoll_events & EPOLLONESHOT)) {
5902                         __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events);
5903                         bool filled;
5904
5905                         spin_lock(&ctx->completion_lock);
5906                         filled = io_fill_cqe_aux(ctx, req->cqe.user_data, mask,
5907                                                  IORING_CQE_F_MORE);
5908                         io_commit_cqring(ctx);
5909                         spin_unlock(&ctx->completion_lock);
5910                         if (unlikely(!filled))
5911                                 return -ECANCELED;
5912                         io_cqring_ev_posted(ctx);
5913                 } else if (req->cqe.res) {
5914                         return 0;
5915                 }
5916
5917                 /*
5918                  * Release all references, retry if someone tried to restart
5919                  * task_work while we were executing it.
5920                  */
5921         } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs));
5922
5923         return 1;
5924 }
5925
5926 static void io_poll_task_func(struct io_kiocb *req, bool *locked)
5927 {
5928         struct io_ring_ctx *ctx = req->ctx;
5929         int ret;
5930
5931         ret = io_poll_check_events(req, *locked);
5932         if (ret > 0)
5933                 return;
5934
5935         if (!ret) {
5936                 req->cqe.res = mangle_poll(req->cqe.res & req->poll.events);
5937         } else {
5938                 req->cqe.res = ret;
5939                 req_set_fail(req);
5940         }
5941
5942         io_poll_remove_entries(req);
5943         spin_lock(&ctx->completion_lock);
5944         hash_del(&req->hash_node);
5945         __io_req_complete_post(req, req->cqe.res, 0);
5946         io_commit_cqring(ctx);
5947         spin_unlock(&ctx->completion_lock);
5948         io_cqring_ev_posted(ctx);
5949 }
5950
5951 static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
5952 {
5953         struct io_ring_ctx *ctx = req->ctx;
5954         int ret;
5955
5956         ret = io_poll_check_events(req, *locked);
5957         if (ret > 0)
5958                 return;
5959
5960         io_poll_remove_entries(req);
5961         spin_lock(&ctx->completion_lock);
5962         hash_del(&req->hash_node);
5963         spin_unlock(&ctx->completion_lock);
5964
5965         if (!ret)
5966                 io_req_task_submit(req, locked);
5967         else
5968                 io_req_complete_failed(req, ret);
5969 }
5970
5971 static void __io_poll_execute(struct io_kiocb *req, int mask, int events)
5972 {
5973         req->cqe.res = mask;
5974         /*
5975          * This is useful for poll that is armed on behalf of another
5976          * request, and where the wakeup path could be on a different
5977          * CPU. We want to avoid pulling in req->apoll->events for that
5978          * case.
5979          */
5980         req->apoll_events = events;
5981         if (req->opcode == IORING_OP_POLL_ADD)
5982                 req->io_task_work.func = io_poll_task_func;
5983         else
5984                 req->io_task_work.func = io_apoll_task_func;
5985
5986         trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask);
5987         io_req_task_work_add(req, false);
5988 }
5989
/* Queue the poll task_work for @req, but only if ownership can be taken. */
static inline void io_poll_execute(struct io_kiocb *req, int res, int events)
{
	/* already owned; the reference we just added stays behind */
	if (!io_poll_get_ownership(req))
		return;
	__io_poll_execute(req, res, events);
}
5995
/* Cancel @req: flag it first, then make sure task_work gets to see the flag. */
static void io_poll_cancel_req(struct io_kiocb *req)
{
	io_poll_mark_cancelled(req);
	/* kick tw, which should complete the request */
	io_poll_execute(req, 0, 0);
}
6002
/*
 * ->private of a poll wait entry holds the io_kiocb pointer, with bit 0 set
 * when the entry is the request's second ("double") poll entry.
 */
#define wqe_to_req(wait)	((void *)((unsigned long) (wait)->private & ~1))
#define wqe_is_double(wait)	((unsigned long) (wait)->private & 1)
6005
6006 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
6007                         void *key)
6008 {
6009         struct io_kiocb *req = wqe_to_req(wait);
6010         struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
6011                                                  wait);
6012         __poll_t mask = key_to_poll(key);
6013
6014         if (unlikely(mask & POLLFREE)) {
6015                 io_poll_mark_cancelled(req);
6016                 /* we have to kick tw in case it's not already */
6017                 io_poll_execute(req, 0, poll->events);
6018
6019                 /*
6020                  * If the waitqueue is being freed early but someone is already
6021                  * holds ownership over it, we have to tear down the request as
6022                  * best we can. That means immediately removing the request from
6023                  * its waitqueue and preventing all further accesses to the
6024                  * waitqueue via the request.
6025                  */
6026                 list_del_init(&poll->wait.entry);
6027
6028                 /*
6029                  * Careful: this *must* be the last step, since as soon
6030                  * as req->head is NULL'ed out, the request can be
6031                  * completed and freed, since aio_poll_complete_work()
6032                  * will no longer need to take the waitqueue lock.
6033                  */
6034                 smp_store_release(&poll->head, NULL);
6035                 return 1;
6036         }
6037
6038         /* for instances that support it check for an event match first */
6039         if (mask && !(mask & poll->events))
6040                 return 0;
6041
6042         if (io_poll_get_ownership(req)) {
6043                 /* optional, saves extra locking for removal in tw handler */
6044                 if (mask && poll->events & EPOLLONESHOT) {
6045                         list_del_init(&poll->wait.entry);
6046                         poll->head = NULL;
6047                         if (wqe_is_double(wait))
6048                                 req->flags &= ~REQ_F_DOUBLE_POLL;
6049                         else
6050                                 req->flags &= ~REQ_F_SINGLE_POLL;
6051                 }
6052                 __io_poll_execute(req, mask, poll->events);
6053         }
6054         return 1;
6055 }
6056
6057 static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
6058                             struct wait_queue_head *head,
6059                             struct io_poll_iocb **poll_ptr)
6060 {
6061         struct io_kiocb *req = pt->req;
6062         unsigned long wqe_private = (unsigned long) req;
6063
6064         /*
6065          * The file being polled uses multiple waitqueues for poll handling
6066          * (e.g. one for read, one for write). Setup a separate io_poll_iocb
6067          * if this happens.
6068          */
6069         if (unlikely(pt->nr_entries)) {
6070                 struct io_poll_iocb *first = poll;
6071
6072                 /* double add on the same waitqueue head, ignore */
6073                 if (first->head == head)
6074                         return;
6075                 /* already have a 2nd entry, fail a third attempt */
6076                 if (*poll_ptr) {
6077                         if ((*poll_ptr)->head == head)
6078                                 return;
6079                         pt->error = -EINVAL;
6080                         return;
6081                 }
6082
6083                 poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
6084                 if (!poll) {
6085                         pt->error = -ENOMEM;
6086                         return;
6087                 }
6088                 /* mark as double wq entry */
6089                 wqe_private |= 1;
6090                 req->flags |= REQ_F_DOUBLE_POLL;
6091                 io_init_poll_iocb(poll, first->events, first->wait.func);
6092                 *poll_ptr = poll;
6093                 if (req->opcode == IORING_OP_POLL_ADD)
6094                         req->flags |= REQ_F_ASYNC_DATA;
6095         }
6096
6097         req->flags |= REQ_F_SINGLE_POLL;
6098         pt->nr_entries++;
6099         poll->head = head;
6100         poll->wait.private = (void *) wqe_private;
6101
6102         if (poll->events & EPOLLEXCLUSIVE)
6103                 add_wait_queue_exclusive(head, &poll->wait);
6104         else
6105                 add_wait_queue(head, &poll->wait);
6106 }
6107
6108 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
6109                                struct poll_table_struct *p)
6110 {
6111         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
6112
6113         __io_queue_proc(&pt->req->poll, pt, head,
6114                         (struct io_poll_iocb **) &pt->req->async_data);
6115 }
6116
6117 static int __io_arm_poll_handler(struct io_kiocb *req,
6118                                  struct io_poll_iocb *poll,
6119                                  struct io_poll_table *ipt, __poll_t mask)
6120 {
6121         struct io_ring_ctx *ctx = req->ctx;
6122         int v;
6123
6124         INIT_HLIST_NODE(&req->hash_node);
6125         io_init_poll_iocb(poll, mask, io_poll_wake);
6126         poll->file = req->file;
6127
6128         ipt->pt._key = mask;
6129         ipt->req = req;
6130         ipt->error = 0;
6131         ipt->nr_entries = 0;
6132
6133         /*
6134          * Take the ownership to delay any tw execution up until we're done
6135          * with poll arming. see io_poll_get_ownership().
6136          */
6137         atomic_set(&req->poll_refs, 1);
6138         mask = vfs_poll(req->file, &ipt->pt) & poll->events;
6139
6140         if (mask && (poll->events & EPOLLONESHOT)) {
6141                 io_poll_remove_entries(req);
6142                 /* no one else has access to the req, forget about the ref */
6143                 return mask;
6144         }
6145         if (!mask && unlikely(ipt->error || !ipt->nr_entries)) {
6146                 io_poll_remove_entries(req);
6147                 if (!ipt->error)
6148                         ipt->error = -EINVAL;
6149                 return 0;
6150         }
6151
6152         spin_lock(&ctx->completion_lock);
6153         io_poll_req_insert(req);
6154         spin_unlock(&ctx->completion_lock);
6155
6156         if (mask) {
6157                 /* can't multishot if failed, just queue the event we've got */
6158                 if (unlikely(ipt->error || !ipt->nr_entries))
6159                         poll->events |= EPOLLONESHOT;
6160                 __io_poll_execute(req, mask, poll->events);
6161                 return 0;
6162         }
6163
6164         /*
6165          * Release ownership. If someone tried to queue a tw while it was
6166          * locked, kick it off for them.
6167          */
6168         v = atomic_dec_return(&req->poll_refs);
6169         if (unlikely(v & IO_POLL_REF_MASK))
6170                 __io_poll_execute(req, 0, poll->events);
6171         return 0;
6172 }
6173
6174 static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
6175                                struct poll_table_struct *p)
6176 {
6177         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
6178         struct async_poll *apoll = pt->req->apoll;
6179
6180         __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
6181 }
6182
/* Return values of io_arm_poll_handler() */
enum {
	IO_APOLL_OK,		/* poll armed */
	IO_APOLL_ABORTED,	/* poll can't be used or arming failed */
	IO_APOLL_READY		/* events were already pending */
};
6188
6189 static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
6190 {
6191         const struct io_op_def *def = &io_op_defs[req->opcode];
6192         struct io_ring_ctx *ctx = req->ctx;
6193         struct async_poll *apoll;
6194         struct io_poll_table ipt;
6195         __poll_t mask = EPOLLONESHOT | POLLERR | POLLPRI;
6196         int ret;
6197
6198         if (!def->pollin && !def->pollout)
6199                 return IO_APOLL_ABORTED;
6200         if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED))
6201                 return IO_APOLL_ABORTED;
6202
6203         if (def->pollin) {
6204                 mask |= POLLIN | POLLRDNORM;
6205
6206                 /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
6207                 if ((req->opcode == IORING_OP_RECVMSG) &&
6208                     (req->sr_msg.msg_flags & MSG_ERRQUEUE))
6209                         mask &= ~POLLIN;
6210         } else {
6211                 mask |= POLLOUT | POLLWRNORM;
6212         }
6213         if (def->poll_exclusive)
6214                 mask |= EPOLLEXCLUSIVE;
6215         if (!(issue_flags & IO_URING_F_UNLOCKED) &&
6216             !list_empty(&ctx->apoll_cache)) {
6217                 apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
6218                                                 poll.wait.entry);
6219                 list_del_init(&apoll->poll.wait.entry);
6220         } else {
6221                 apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
6222                 if (unlikely(!apoll))
6223                         return IO_APOLL_ABORTED;
6224         }
6225         apoll->double_poll = NULL;
6226         req->apoll = apoll;
6227         req->flags |= REQ_F_POLLED;
6228         ipt.pt._qproc = io_async_queue_proc;
6229
6230         io_kbuf_recycle(req, issue_flags);
6231
6232         ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask);
6233         if (ret || ipt.error)
6234                 return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
6235
6236         trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode,
6237                                 mask, apoll->poll.events);
6238         return IO_APOLL_OK;
6239 }
6240
6241 /*
6242  * Returns true if we found and killed one or more poll requests
6243  */
6244 static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
6245                                       struct task_struct *tsk, bool cancel_all)
6246 {
6247         struct hlist_node *tmp;
6248         struct io_kiocb *req;
6249         bool found = false;
6250         int i;
6251
6252         spin_lock(&ctx->completion_lock);
6253         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
6254                 struct hlist_head *list;
6255
6256                 list = &ctx->cancel_hash[i];
6257                 hlist_for_each_entry_safe(req, tmp, list, hash_node) {
6258                         if (io_match_task_safe(req, tsk, cancel_all)) {
6259                                 hlist_del_init(&req->hash_node);
6260                                 io_poll_cancel_req(req);
6261                                 found = true;
6262                         }
6263                 }
6264         }
6265         spin_unlock(&ctx->completion_lock);
6266         return found;
6267 }
6268
6269 static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
6270                                      bool poll_only)
6271         __must_hold(&ctx->completion_lock)
6272 {
6273         struct hlist_head *list;
6274         struct io_kiocb *req;
6275
6276         list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
6277         hlist_for_each_entry(req, list, hash_node) {
6278                 if (sqe_addr != req->cqe.user_data)
6279                         continue;
6280                 if (poll_only && req->opcode != IORING_OP_POLL_ADD)
6281                         continue;
6282                 return req;
6283         }
6284         return NULL;
6285 }
6286
6287 static bool io_poll_disarm(struct io_kiocb *req)
6288         __must_hold(&ctx->completion_lock)
6289 {
6290         if (!io_poll_get_ownership(req))
6291                 return false;
6292         io_poll_remove_entries(req);
6293         hash_del(&req->hash_node);
6294         return true;
6295 }
6296
6297 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
6298                           bool poll_only)
6299         __must_hold(&ctx->completion_lock)
6300 {
6301         struct io_kiocb *req = io_poll_find(ctx, sqe_addr, poll_only);
6302
6303         if (!req)
6304                 return -ENOENT;
6305         io_poll_cancel_req(req);
6306         return 0;
6307 }
6308
6309 static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
6310                                      unsigned int flags)
6311 {
6312         u32 events;
6313
6314         events = READ_ONCE(sqe->poll32_events);
6315 #ifdef __BIG_ENDIAN
6316         events = swahw32(events);
6317 #endif
6318         if (!(flags & IORING_POLL_ADD_MULTI))
6319                 events |= EPOLLONESHOT;
6320         return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
6321 }
6322
6323 static int io_poll_update_prep(struct io_kiocb *req,
6324                                const struct io_uring_sqe *sqe)
6325 {
6326         struct io_poll_update *upd = &req->poll_update;
6327         u32 flags;
6328
6329         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
6330                 return -EINVAL;
6331         if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
6332                 return -EINVAL;
6333         flags = READ_ONCE(sqe->len);
6334         if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
6335                       IORING_POLL_ADD_MULTI))
6336                 return -EINVAL;
6337         /* meaningless without update */
6338         if (flags == IORING_POLL_ADD_MULTI)
6339                 return -EINVAL;
6340
6341         upd->old_user_data = READ_ONCE(sqe->addr);
6342         upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
6343         upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
6344
6345         upd->new_user_data = READ_ONCE(sqe->off);
6346         if (!upd->update_user_data && upd->new_user_data)
6347                 return -EINVAL;
6348         if (upd->update_events)
6349                 upd->events = io_poll_parse_events(sqe, flags);
6350         else if (sqe->poll32_events)
6351                 return -EINVAL;
6352
6353         return 0;
6354 }
6355
6356 static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
6357 {
6358         struct io_poll_iocb *poll = &req->poll;
6359         u32 flags;
6360
6361         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
6362                 return -EINVAL;
6363         if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
6364                 return -EINVAL;
6365         flags = READ_ONCE(sqe->len);
6366         if (flags & ~IORING_POLL_ADD_MULTI)
6367                 return -EINVAL;
6368         if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP))
6369                 return -EINVAL;
6370
6371         io_req_set_refcount(req);
6372         req->apoll_events = poll->events = io_poll_parse_events(sqe, flags);
6373         return 0;
6374 }
6375
6376 static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
6377 {
6378         struct io_poll_iocb *poll = &req->poll;
6379         struct io_poll_table ipt;
6380         int ret;
6381
6382         ipt.pt._qproc = io_poll_queue_proc;
6383
6384         ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events);
6385         ret = ret ?: ipt.error;
6386         if (ret)
6387                 __io_req_complete(req, issue_flags, ret, 0);
6388         return 0;
6389 }
6390
6391 static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
6392 {
6393         struct io_ring_ctx *ctx = req->ctx;
6394         struct io_kiocb *preq;
6395         int ret2, ret = 0;
6396         bool locked;
6397
6398         spin_lock(&ctx->completion_lock);
6399         preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
6400         if (!preq || !io_poll_disarm(preq)) {
6401                 spin_unlock(&ctx->completion_lock);
6402                 ret = preq ? -EALREADY : -ENOENT;
6403                 goto out;
6404         }
6405         spin_unlock(&ctx->completion_lock);
6406
6407         if (req->poll_update.update_events || req->poll_update.update_user_data) {
6408                 /* only mask one event flags, keep behavior flags */
6409                 if (req->poll_update.update_events) {
6410                         preq->poll.events &= ~0xffff;
6411                         preq->poll.events |= req->poll_update.events & 0xffff;
6412                         preq->poll.events |= IO_POLL_UNMASK;
6413                 }
6414                 if (req->poll_update.update_user_data)
6415                         preq->cqe.user_data = req->poll_update.new_user_data;
6416
6417                 ret2 = io_poll_add(preq, issue_flags);
6418                 /* successfully updated, don't complete poll request */
6419                 if (!ret2)
6420                         goto out;
6421         }
6422
6423         req_set_fail(preq);
6424         preq->cqe.res = -ECANCELED;
6425         locked = !(issue_flags & IO_URING_F_UNLOCKED);
6426         io_req_task_complete(preq, &locked);
6427 out:
6428         if (ret < 0)
6429                 req_set_fail(req);
6430         /* complete update request, we're done with it */
6431         __io_req_complete(req, issue_flags, ret, 0);
6432         return 0;
6433 }
6434
6435 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
6436 {
6437         struct io_timeout_data *data = container_of(timer,
6438                                                 struct io_timeout_data, timer);
6439         struct io_kiocb *req = data->req;
6440         struct io_ring_ctx *ctx = req->ctx;
6441         unsigned long flags;
6442
6443         spin_lock_irqsave(&ctx->timeout_lock, flags);
6444         list_del_init(&req->timeout.list);
6445         atomic_set(&req->ctx->cq_timeouts,
6446                 atomic_read(&req->ctx->cq_timeouts) + 1);
6447         spin_unlock_irqrestore(&ctx->timeout_lock, flags);
6448
6449         if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
6450                 req_set_fail(req);
6451
6452         req->cqe.res = -ETIME;
6453         req->io_task_work.func = io_req_task_complete;
6454         io_req_task_work_add(req, false);
6455         return HRTIMER_NORESTART;
6456 }
6457
6458 static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
6459                                            __u64 user_data)
6460         __must_hold(&ctx->timeout_lock)
6461 {
6462         struct io_timeout_data *io;
6463         struct io_kiocb *req;
6464         bool found = false;
6465
6466         list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
6467                 found = user_data == req->cqe.user_data;
6468                 if (found)
6469                         break;
6470         }
6471         if (!found)
6472                 return ERR_PTR(-ENOENT);
6473
6474         io = req->async_data;
6475         if (hrtimer_try_to_cancel(&io->timer) == -1)
6476                 return ERR_PTR(-EALREADY);
6477         list_del_init(&req->timeout.list);
6478         return req;
6479 }
6480
6481 static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
6482         __must_hold(&ctx->completion_lock)
6483         __must_hold(&ctx->timeout_lock)
6484 {
6485         struct io_kiocb *req = io_timeout_extract(ctx, user_data);
6486
6487         if (IS_ERR(req))
6488                 return PTR_ERR(req);
6489         io_req_task_queue_fail(req, -ECANCELED);
6490         return 0;
6491 }
6492
6493 static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
6494 {
6495         switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
6496         case IORING_TIMEOUT_BOOTTIME:
6497                 return CLOCK_BOOTTIME;
6498         case IORING_TIMEOUT_REALTIME:
6499                 return CLOCK_REALTIME;
6500         default:
6501                 /* can't happen, vetted at prep time */
6502                 WARN_ON_ONCE(1);
6503                 fallthrough;
6504         case 0:
6505                 return CLOCK_MONOTONIC;
6506         }
6507 }
6508
6509 static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
6510                                     struct timespec64 *ts, enum hrtimer_mode mode)
6511         __must_hold(&ctx->timeout_lock)
6512 {
6513         struct io_timeout_data *io;
6514         struct io_kiocb *req;
6515         bool found = false;
6516
6517         list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) {
6518                 found = user_data == req->cqe.user_data;
6519                 if (found)
6520                         break;
6521         }
6522         if (!found)
6523                 return -ENOENT;
6524
6525         io = req->async_data;
6526         if (hrtimer_try_to_cancel(&io->timer) == -1)
6527                 return -EALREADY;
6528         hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
6529         io->timer.function = io_link_timeout_fn;
6530         hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
6531         return 0;
6532 }
6533
6534 static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
6535                              struct timespec64 *ts, enum hrtimer_mode mode)
6536         __must_hold(&ctx->timeout_lock)
6537 {
6538         struct io_kiocb *req = io_timeout_extract(ctx, user_data);
6539         struct io_timeout_data *data;
6540
6541         if (IS_ERR(req))
6542                 return PTR_ERR(req);
6543
6544         req->timeout.off = 0; /* noseq */
6545         data = req->async_data;
6546         list_add_tail(&req->timeout.list, &ctx->timeout_list);
6547         hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
6548         data->timer.function = io_timeout_fn;
6549         hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
6550         return 0;
6551 }
6552
6553 static int io_timeout_remove_prep(struct io_kiocb *req,
6554                                   const struct io_uring_sqe *sqe)
6555 {
6556         struct io_timeout_rem *tr = &req->timeout_rem;
6557
6558         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
6559                 return -EINVAL;
6560         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6561                 return -EINVAL;
6562         if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in)
6563                 return -EINVAL;
6564
6565         tr->ltimeout = false;
6566         tr->addr = READ_ONCE(sqe->addr);
6567         tr->flags = READ_ONCE(sqe->timeout_flags);
6568         if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) {
6569                 if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
6570                         return -EINVAL;
6571                 if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
6572                         tr->ltimeout = true;
6573                 if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
6574                         return -EINVAL;
6575                 if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
6576                         return -EFAULT;
6577                 if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0)
6578                         return -EINVAL;
6579         } else if (tr->flags) {
6580                 /* timeout removal doesn't support flags */
6581                 return -EINVAL;
6582         }
6583
6584         return 0;
6585 }
6586
6587 static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
6588 {
6589         return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
6590                                             : HRTIMER_MODE_REL;
6591 }
6592
6593 /*
6594  * Remove or update an existing timeout command
6595  */
6596 static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
6597 {
6598         struct io_timeout_rem *tr = &req->timeout_rem;
6599         struct io_ring_ctx *ctx = req->ctx;
6600         int ret;
6601
6602         if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) {
6603                 spin_lock(&ctx->completion_lock);
6604                 spin_lock_irq(&ctx->timeout_lock);
6605                 ret = io_timeout_cancel(ctx, tr->addr);
6606                 spin_unlock_irq(&ctx->timeout_lock);
6607                 spin_unlock(&ctx->completion_lock);
6608         } else {
6609                 enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);
6610
6611                 spin_lock_irq(&ctx->timeout_lock);
6612                 if (tr->ltimeout)
6613                         ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode);
6614                 else
6615                         ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
6616                 spin_unlock_irq(&ctx->timeout_lock);
6617         }
6618
6619         if (ret < 0)
6620                 req_set_fail(req);
6621         io_req_complete_post(req, ret, 0);
6622         return 0;
6623 }
6624
6625 static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
6626                            bool is_timeout_link)
6627 {
6628         struct io_timeout_data *data;
6629         unsigned flags;
6630         u32 off = READ_ONCE(sqe->off);
6631
6632         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
6633                 return -EINVAL;
6634         if (sqe->ioprio || sqe->buf_index || sqe->len != 1 ||
6635             sqe->splice_fd_in)
6636                 return -EINVAL;
6637         if (off && is_timeout_link)
6638                 return -EINVAL;
6639         flags = READ_ONCE(sqe->timeout_flags);
6640         if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
6641                       IORING_TIMEOUT_ETIME_SUCCESS))
6642                 return -EINVAL;
6643         /* more than one clock specified is invalid, obviously */
6644         if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
6645                 return -EINVAL;
6646
6647         INIT_LIST_HEAD(&req->timeout.list);
6648         req->timeout.off = off;
6649         if (unlikely(off && !req->ctx->off_timeout_used))
6650                 req->ctx->off_timeout_used = true;
6651
6652         if (WARN_ON_ONCE(req_has_async_data(req)))
6653                 return -EFAULT;
6654         if (io_alloc_async_data(req))
6655                 return -ENOMEM;
6656
6657         data = req->async_data;
6658         data->req = req;
6659         data->flags = flags;
6660
6661         if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
6662                 return -EFAULT;
6663
6664         if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0)
6665                 return -EINVAL;
6666
6667         INIT_LIST_HEAD(&req->timeout.list);
6668         data->mode = io_translate_timeout_mode(flags);
6669         hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
6670
6671         if (is_timeout_link) {
6672                 struct io_submit_link *link = &req->ctx->submit_state.link;
6673
6674                 if (!link->head)
6675                         return -EINVAL;
6676                 if (link->last->opcode == IORING_OP_LINK_TIMEOUT)
6677                         return -EINVAL;
6678                 req->timeout.head = link->last;
6679                 link->last->flags |= REQ_F_ARM_LTIMEOUT;
6680         }
6681         return 0;
6682 }
6683
6684 static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
6685 {
6686         struct io_ring_ctx *ctx = req->ctx;
6687         struct io_timeout_data *data = req->async_data;
6688         struct list_head *entry;
6689         u32 tail, off = req->timeout.off;
6690
6691         spin_lock_irq(&ctx->timeout_lock);
6692
6693         /*
6694          * sqe->off holds how many events that need to occur for this
6695          * timeout event to be satisfied. If it isn't set, then this is
6696          * a pure timeout request, sequence isn't used.
6697          */
6698         if (io_is_timeout_noseq(req)) {
6699                 entry = ctx->timeout_list.prev;
6700                 goto add;
6701         }
6702
6703         tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
6704         req->timeout.target_seq = tail + off;
6705
6706         /* Update the last seq here in case io_flush_timeouts() hasn't.
6707          * This is safe because ->completion_lock is held, and submissions
6708          * and completions are never mixed in the same ->completion_lock section.
6709          */
6710         ctx->cq_last_tm_flush = tail;
6711
6712         /*
6713          * Insertion sort, ensuring the first entry in the list is always
6714          * the one we need first.
6715          */
6716         list_for_each_prev(entry, &ctx->timeout_list) {
6717                 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
6718                                                   timeout.list);
6719
6720                 if (io_is_timeout_noseq(nxt))
6721                         continue;
6722                 /* nxt.seq is behind @tail, otherwise would've been completed */
6723                 if (off >= nxt->timeout.target_seq - tail)
6724                         break;
6725         }
6726 add:
6727         list_add(&req->timeout.list, entry);
6728         data->timer.function = io_timeout_fn;
6729         hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
6730         spin_unlock_irq(&ctx->timeout_lock);
6731         return 0;
6732 }
6733
/*
 * Match key passed as the opaque data argument of io_cancel_cb(): a work
 * item matches when its request has the same ctx and the same user_data.
 */
6734 struct io_cancel_data {
6735         struct io_ring_ctx *ctx;	/* compared against req->ctx */
6736         u64 user_data;	/* compared against req->cqe.user_data */
6737 };
6738
6739 static bool io_cancel_cb(struct io_wq_work *work, void *data)
6740 {
6741         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6742         struct io_cancel_data *cd = data;
6743
6744         return req->ctx == cd->ctx && req->cqe.user_data == cd->user_data;
6745 }
6746
6747 static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
6748                                struct io_ring_ctx *ctx)
6749 {
6750         struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
6751         enum io_wq_cancel cancel_ret;
6752         int ret = 0;
6753
6754         if (!tctx || !tctx->io_wq)
6755                 return -ENOENT;
6756
6757         cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false);
6758         switch (cancel_ret) {
6759         case IO_WQ_CANCEL_OK:
6760                 ret = 0;
6761                 break;
6762         case IO_WQ_CANCEL_RUNNING:
6763                 ret = -EALREADY;
6764                 break;
6765         case IO_WQ_CANCEL_NOTFOUND:
6766                 ret = -ENOENT;
6767                 break;
6768         }
6769
6770         return ret;
6771 }
6772
6773 static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
6774 {
6775         struct io_ring_ctx *ctx = req->ctx;
6776         int ret;
6777
6778         WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
6779
6780         ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
6781         /*
6782          * Fall-through even for -EALREADY, as we may have poll armed
6783          * that need unarming.
6784          */
6785         if (!ret)
6786                 return 0;
6787
6788         spin_lock(&ctx->completion_lock);
6789         ret = io_poll_cancel(ctx, sqe_addr, false);
6790         if (ret != -ENOENT)
6791                 goto out;
6792
6793         spin_lock_irq(&ctx->timeout_lock);
6794         ret = io_timeout_cancel(ctx, sqe_addr);
6795         spin_unlock_irq(&ctx->timeout_lock);
6796 out:
6797         spin_unlock(&ctx->completion_lock);
6798         return ret;
6799 }
6800
6801 static int io_async_cancel_prep(struct io_kiocb *req,
6802                                 const struct io_uring_sqe *sqe)
6803 {
6804         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
6805                 return -EINVAL;
6806         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6807                 return -EINVAL;
6808         if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags ||
6809             sqe->splice_fd_in)
6810                 return -EINVAL;
6811
6812         req->cancel.addr = READ_ONCE(sqe->addr);
6813         return 0;
6814 }
6815
6816 static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
6817 {
6818         struct io_ring_ctx *ctx = req->ctx;
6819         u64 sqe_addr = req->cancel.addr;
6820         struct io_tctx_node *node;
6821         int ret;
6822
6823         ret = io_try_cancel_userdata(req, sqe_addr);
6824         if (ret != -ENOENT)
6825                 goto done;
6826
6827         /* slow path, try all io-wq's */
6828         io_ring_submit_lock(ctx, issue_flags);
6829         ret = -ENOENT;
6830         list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
6831                 struct io_uring_task *tctx = node->task->io_uring;
6832
6833                 ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
6834                 if (ret != -ENOENT)
6835                         break;
6836         }
6837         io_ring_submit_unlock(ctx, issue_flags);
6838 done:
6839         if (ret < 0)
6840                 req_set_fail(req);
6841         io_req_complete_post(req, ret, 0);
6842         return 0;
6843 }
6844
6845 static int io_rsrc_update_prep(struct io_kiocb *req,
6846                                 const struct io_uring_sqe *sqe)
6847 {
6848         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6849                 return -EINVAL;
6850         if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
6851                 return -EINVAL;
6852
6853         req->rsrc_update.offset = READ_ONCE(sqe->off);
6854         req->rsrc_update.nr_args = READ_ONCE(sqe->len);
6855         if (!req->rsrc_update.nr_args)
6856                 return -EINVAL;
6857         req->rsrc_update.arg = READ_ONCE(sqe->addr);
6858         return 0;
6859 }
6860
6861 static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
6862 {
6863         struct io_ring_ctx *ctx = req->ctx;
6864         struct io_uring_rsrc_update2 up;
6865         int ret;
6866
6867         up.offset = req->rsrc_update.offset;
6868         up.data = req->rsrc_update.arg;
6869         up.nr = 0;
6870         up.tags = 0;
6871         up.resv = 0;
6872         up.resv2 = 0;
6873
6874         io_ring_submit_lock(ctx, issue_flags);
6875         ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
6876                                         &up, req->rsrc_update.nr_args);
6877         io_ring_submit_unlock(ctx, issue_flags);
6878
6879         if (ret < 0)
6880                 req_set_fail(req);
6881         __io_req_complete(req, issue_flags, ret, 0);
6882         return 0;
6883 }
6884
/*
 * Per-opcode preparation dispatch.
 *
 * Hands @req and @sqe to the prep handler listed for req->opcode and returns
 * that handler's result. IORING_OP_NOP needs no preparation and returns 0.
 * Several opcodes share one handler (all read/write variants, SENDMSG/SEND,
 * RECVMSG/RECV); TIMEOUT and LINK_TIMEOUT share io_timeout_prep() and differ
 * only in its is_timeout_link argument.
 *
 * An opcode with no case below is logged once and rejected with -EINVAL.
 */
6885 static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
6886 {
6887         switch (req->opcode) {
6888         case IORING_OP_NOP:
6889                 return 0;
6890         case IORING_OP_READV:
6891         case IORING_OP_READ_FIXED:
6892         case IORING_OP_READ:
6893         case IORING_OP_WRITEV:
6894         case IORING_OP_WRITE_FIXED:
6895         case IORING_OP_WRITE:
6896                 return io_prep_rw(req, sqe);
6897         case IORING_OP_POLL_ADD:
6898                 return io_poll_add_prep(req, sqe);
6899         case IORING_OP_POLL_REMOVE:
6900                 return io_poll_update_prep(req, sqe);
6901         case IORING_OP_FSYNC:
6902                 return io_fsync_prep(req, sqe);
6903         case IORING_OP_SYNC_FILE_RANGE:
6904                 return io_sfr_prep(req, sqe);
6905         case IORING_OP_SENDMSG:
6906         case IORING_OP_SEND:
6907                 return io_sendmsg_prep(req, sqe);
6908         case IORING_OP_RECVMSG:
6909         case IORING_OP_RECV:
6910                 return io_recvmsg_prep(req, sqe);
6911         case IORING_OP_CONNECT:
6912                 return io_connect_prep(req, sqe);
6913         case IORING_OP_TIMEOUT:
6914                 return io_timeout_prep(req, sqe, false);
6915         case IORING_OP_TIMEOUT_REMOVE:
6916                 return io_timeout_remove_prep(req, sqe);
6917         case IORING_OP_ASYNC_CANCEL:
6918                 return io_async_cancel_prep(req, sqe);
6919         case IORING_OP_LINK_TIMEOUT:
6920                 return io_timeout_prep(req, sqe, true);
6921         case IORING_OP_ACCEPT:
6922                 return io_accept_prep(req, sqe);
6923         case IORING_OP_FALLOCATE:
6924                 return io_fallocate_prep(req, sqe);
6925         case IORING_OP_OPENAT:
6926                 return io_openat_prep(req, sqe);
6927         case IORING_OP_CLOSE:
6928                 return io_close_prep(req, sqe);
6929         case IORING_OP_FILES_UPDATE:
6930                 return io_rsrc_update_prep(req, sqe);
6931         case IORING_OP_STATX:
6932                 return io_statx_prep(req, sqe);
6933         case IORING_OP_FADVISE:
6934                 return io_fadvise_prep(req, sqe);
6935         case IORING_OP_MADVISE:
6936                 return io_madvise_prep(req, sqe);
6937         case IORING_OP_OPENAT2:
6938                 return io_openat2_prep(req, sqe);
6939         case IORING_OP_EPOLL_CTL:
6940                 return io_epoll_ctl_prep(req, sqe);
6941         case IORING_OP_SPLICE:
6942                 return io_splice_prep(req, sqe);
6943         case IORING_OP_PROVIDE_BUFFERS:
6944                 return io_provide_buffers_prep(req, sqe);
6945         case IORING_OP_REMOVE_BUFFERS:
6946                 return io_remove_buffers_prep(req, sqe);
6947         case IORING_OP_TEE:
6948                 return io_tee_prep(req, sqe);
6949         case IORING_OP_SHUTDOWN:
6950                 return io_shutdown_prep(req, sqe);
6951         case IORING_OP_RENAMEAT:
6952                 return io_renameat_prep(req, sqe);
6953         case IORING_OP_UNLINKAT:
6954                 return io_unlinkat_prep(req, sqe);
6955         case IORING_OP_MKDIRAT:
6956                 return io_mkdirat_prep(req, sqe);
6957         case IORING_OP_SYMLINKAT:
6958                 return io_symlinkat_prep(req, sqe);
6959         case IORING_OP_LINKAT:
6960                 return io_linkat_prep(req, sqe);
6961         case IORING_OP_MSG_RING:
6962                 return io_msg_ring_prep(req, sqe);
6963         }
6964
             /* no case matched: warn once, then fail the request */
6965         printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
6966                         req->opcode);
6967         return -EINVAL;
6968 }
6969
6970 static int io_req_prep_async(struct io_kiocb *req)
6971 {
6972         if (!io_op_defs[req->opcode].needs_async_setup)
6973                 return 0;
6974         if (WARN_ON_ONCE(req_has_async_data(req)))
6975                 return -EFAULT;
6976         if (io_alloc_async_data(req))
6977                 return -EAGAIN;
6978
6979         switch (req->opcode) {
6980         case IORING_OP_READV:
6981                 return io_rw_prep_async(req, READ);
6982         case IORING_OP_WRITEV:
6983                 return io_rw_prep_async(req, WRITE);
6984         case IORING_OP_SENDMSG:
6985                 return io_sendmsg_prep_async(req);
6986         case IORING_OP_RECVMSG:
6987                 return io_recvmsg_prep_async(req);
6988         case IORING_OP_CONNECT:
6989                 return io_connect_prep_async(req);
6990         }
6991         printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
6992                     req->opcode);
6993         return -EFAULT;
6994 }
6995
6996 static u32 io_get_sequence(struct io_kiocb *req)
6997 {
6998         u32 seq = req->ctx->cached_sq_head;
6999         struct io_kiocb *cur;
7000
7001         /* need original cached_sq_head, but it was increased for each req */
7002         io_for_each_link(cur, req)
7003                 seq--;
7004         return seq;
7005 }
7006
7007 static __cold void io_drain_req(struct io_kiocb *req)
7008 {
7009         struct io_ring_ctx *ctx = req->ctx;
7010         struct io_defer_entry *de;
7011         int ret;
7012         u32 seq = io_get_sequence(req);
7013
7014         /* Still need defer if there is pending req in defer list. */
7015         spin_lock(&ctx->completion_lock);
7016         if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
7017                 spin_unlock(&ctx->completion_lock);
7018 queue:
7019                 ctx->drain_active = false;
7020                 io_req_task_queue(req);
7021                 return;
7022         }
7023         spin_unlock(&ctx->completion_lock);
7024
7025         ret = io_req_prep_async(req);
7026         if (ret) {
7027 fail:
7028                 io_req_complete_failed(req, ret);
7029                 return;
7030         }
7031         io_prep_async_link(req);
7032         de = kmalloc(sizeof(*de), GFP_KERNEL);
7033         if (!de) {
7034                 ret = -ENOMEM;
7035                 goto fail;
7036         }
7037
7038         spin_lock(&ctx->completion_lock);
7039         if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
7040                 spin_unlock(&ctx->completion_lock);
7041                 kfree(de);
7042                 goto queue;
7043         }
7044
7045         trace_io_uring_defer(ctx, req, req->cqe.user_data, req->opcode);
7046         de->req = req;
7047         de->seq = seq;
7048         list_add_tail(&de->list, &ctx->defer_list);
7049         spin_unlock(&ctx->completion_lock);
7050 }
7051
7052 static void io_clean_op(struct io_kiocb *req)
7053 {
7054         if (req->flags & REQ_F_BUFFER_SELECTED) {
7055                 spin_lock(&req->ctx->completion_lock);
7056                 io_put_kbuf_comp(req);
7057                 spin_unlock(&req->ctx->completion_lock);
7058         }
7059
7060         if (req->flags & REQ_F_NEED_CLEANUP) {
7061                 switch (req->opcode) {
7062                 case IORING_OP_READV:
7063                 case IORING_OP_READ_FIXED:
7064                 case IORING_OP_READ:
7065                 case IORING_OP_WRITEV:
7066                 case IORING_OP_WRITE_FIXED:
7067                 case IORING_OP_WRITE: {
7068                         struct io_async_rw *io = req->async_data;
7069
7070                         kfree(io->free_iovec);
7071                         break;
7072                         }
7073                 case IORING_OP_RECVMSG:
7074                 case IORING_OP_SENDMSG: {
7075                         struct io_async_msghdr *io = req->async_data;
7076
7077                         kfree(io->free_iov);
7078                         break;
7079                         }
7080                 case IORING_OP_OPENAT:
7081                 case IORING_OP_OPENAT2:
7082                         if (req->open.filename)
7083                                 putname(req->open.filename);
7084                         break;
7085                 case IORING_OP_RENAMEAT:
7086                         putname(req->rename.oldpath);
7087                         putname(req->rename.newpath);
7088                         break;
7089                 case IORING_OP_UNLINKAT:
7090                         putname(req->unlink.filename);
7091                         break;
7092                 case IORING_OP_MKDIRAT:
7093                         putname(req->mkdir.filename);
7094                         break;
7095                 case IORING_OP_SYMLINKAT:
7096                         putname(req->symlink.oldpath);
7097                         putname(req->symlink.newpath);
7098                         break;
7099                 case IORING_OP_LINKAT:
7100                         putname(req->hardlink.oldpath);
7101                         putname(req->hardlink.newpath);
7102                         break;
7103                 case IORING_OP_STATX:
7104                         if (req->statx.filename)
7105                                 putname(req->statx.filename);
7106                         break;
7107                 }
7108         }
7109         if ((req->flags & REQ_F_POLLED) && req->apoll) {
7110                 kfree(req->apoll->double_poll);
7111                 kfree(req->apoll);
7112                 req->apoll = NULL;
7113         }
7114         if (req->flags & REQ_F_CREDS)
7115                 put_cred(req->creds);
7116         if (req->flags & REQ_F_ASYNC_DATA) {
7117                 kfree(req->async_data);
7118                 req->async_data = NULL;
7119         }
7120         req->flags &= ~IO_REQ_CLEAN_FLAGS;
7121 }
7122
7123 static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags)
7124 {
7125         if (req->file || !io_op_defs[req->opcode].needs_file)
7126                 return true;
7127
7128         if (req->flags & REQ_F_FIXED_FILE)
7129                 req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags);
7130         else
7131                 req->file = io_file_get_normal(req, req->cqe.fd);
7132         if (req->file)
7133                 return true;
7134
7135         req_set_fail(req);
7136         req->cqe.res = -EBADF;
7137         return false;
7138 }
7139
/*
 * Issue a prepared request by running the handler for req->opcode.
 *
 * The file is resolved first if the request still needs one; failure there
 * returns -EBADF before anything else happens. If the request carries its
 * own credentials (REQ_F_CREDS) and they differ from current_cred(), they
 * are installed around the handler call and reverted afterwards. Unless the
 * opcode's io_op_defs entry sets audit_skip, the handler call is bracketed
 * by audit_uring_entry() and audit_uring_exit().
 *
 * Returns the handler's result, or -EINVAL for an opcode with no case below.
 * When the result is 0 on an IORING_SETUP_IOPOLL ring and the request has a
 * file, io_iopoll_req_issued() is called before returning.
 */
7140 static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
7141 {
7142         const struct cred *creds = NULL;
7143         int ret;
7144
7145         if (unlikely(!io_assign_file(req, issue_flags)))
7146                 return -EBADF;
7147
             /* creds stays NULL unless an override was actually installed */
7148         if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
7149                 creds = override_creds(req->creds);
7150
7151         if (!io_op_defs[req->opcode].audit_skip)
7152                 audit_uring_entry(req->opcode);
7153
7154         switch (req->opcode) {
7155         case IORING_OP_NOP:
7156                 ret = io_nop(req, issue_flags);
7157                 break;
7158         case IORING_OP_READV:
7159         case IORING_OP_READ_FIXED:
7160         case IORING_OP_READ:
7161                 ret = io_read(req, issue_flags);
7162                 break;
7163         case IORING_OP_WRITEV:
7164         case IORING_OP_WRITE_FIXED:
7165         case IORING_OP_WRITE:
7166                 ret = io_write(req, issue_flags);
7167                 break;
7168         case IORING_OP_FSYNC:
7169                 ret = io_fsync(req, issue_flags);
7170                 break;
7171         case IORING_OP_POLL_ADD:
7172                 ret = io_poll_add(req, issue_flags);
7173                 break;
7174         case IORING_OP_POLL_REMOVE:
7175                 ret = io_poll_update(req, issue_flags);
7176                 break;
7177         case IORING_OP_SYNC_FILE_RANGE:
7178                 ret = io_sync_file_range(req, issue_flags);
7179                 break;
7180         case IORING_OP_SENDMSG:
7181                 ret = io_sendmsg(req, issue_flags);
7182                 break;
7183         case IORING_OP_SEND:
7184                 ret = io_send(req, issue_flags);
7185                 break;
7186         case IORING_OP_RECVMSG:
7187                 ret = io_recvmsg(req, issue_flags);
7188                 break;
7189         case IORING_OP_RECV:
7190                 ret = io_recv(req, issue_flags);
7191                 break;
7192         case IORING_OP_TIMEOUT:
7193                 ret = io_timeout(req, issue_flags);
7194                 break;
7195         case IORING_OP_TIMEOUT_REMOVE:
7196                 ret = io_timeout_remove(req, issue_flags);
7197                 break;
7198         case IORING_OP_ACCEPT:
7199                 ret = io_accept(req, issue_flags);
7200                 break;
7201         case IORING_OP_CONNECT:
7202                 ret = io_connect(req, issue_flags);
7203                 break;
7204         case IORING_OP_ASYNC_CANCEL:
7205                 ret = io_async_cancel(req, issue_flags);
7206                 break;
7207         case IORING_OP_FALLOCATE:
7208                 ret = io_fallocate(req, issue_flags);
7209                 break;
7210         case IORING_OP_OPENAT:
7211                 ret = io_openat(req, issue_flags);
7212                 break;
7213         case IORING_OP_CLOSE:
7214                 ret = io_close(req, issue_flags);
7215                 break;
7216         case IORING_OP_FILES_UPDATE:
7217                 ret = io_files_update(req, issue_flags);
7218                 break;
7219         case IORING_OP_STATX:
7220                 ret = io_statx(req, issue_flags);
7221                 break;
7222         case IORING_OP_FADVISE:
7223                 ret = io_fadvise(req, issue_flags);
7224                 break;
7225         case IORING_OP_MADVISE:
7226                 ret = io_madvise(req, issue_flags);
7227                 break;
7228         case IORING_OP_OPENAT2:
7229                 ret = io_openat2(req, issue_flags);
7230                 break;
7231         case IORING_OP_EPOLL_CTL:
7232                 ret = io_epoll_ctl(req, issue_flags);
7233                 break;
7234         case IORING_OP_SPLICE:
7235                 ret = io_splice(req, issue_flags);
7236                 break;
7237         case IORING_OP_PROVIDE_BUFFERS:
7238                 ret = io_provide_buffers(req, issue_flags);
7239                 break;
7240         case IORING_OP_REMOVE_BUFFERS:
7241                 ret = io_remove_buffers(req, issue_flags);
7242                 break;
7243         case IORING_OP_TEE:
7244                 ret = io_tee(req, issue_flags);
7245                 break;
7246         case IORING_OP_SHUTDOWN:
7247                 ret = io_shutdown(req, issue_flags);
7248                 break;
7249         case IORING_OP_RENAMEAT:
7250                 ret = io_renameat(req, issue_flags);
7251                 break;
7252         case IORING_OP_UNLINKAT:
7253                 ret = io_unlinkat(req, issue_flags);
7254                 break;
7255         case IORING_OP_MKDIRAT:
7256                 ret = io_mkdirat(req, issue_flags);
7257                 break;
7258         case IORING_OP_SYMLINKAT:
7259                 ret = io_symlinkat(req, issue_flags);
7260                 break;
7261         case IORING_OP_LINKAT:
7262                 ret = io_linkat(req, issue_flags);
7263                 break;
7264         case IORING_OP_MSG_RING:
7265                 ret = io_msg_ring(req, issue_flags);
7266                 break;
7267         default:
7268                 ret = -EINVAL;
7269                 break;
7270         }
7271
             /* mirrors the audit_uring_entry() call made before the switch */
7272         if (!io_op_defs[req->opcode].audit_skip)
7273                 audit_uring_exit(!ret, ret);
7274
7275         if (creds)
7276                 revert_creds(creds);
7277         if (ret)
7278                 return ret;
7279         /* If the op doesn't have a file, we're not polling for it */
7280         if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
7281                 io_iopoll_req_issued(req, issue_flags);
7282
7283         return 0;
7284 }
7285
7286 static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
7287 {
7288         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
7289
7290         req = io_put_req_find_next(req);
7291         return req ? &req->work : NULL;
7292 }
7293
7294 static void io_wq_submit_work(struct io_wq_work *work)
7295 {
7296         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
7297         const struct io_op_def *def = &io_op_defs[req->opcode];
7298         unsigned int issue_flags = IO_URING_F_UNLOCKED;
7299         bool needs_poll = false;
7300         struct io_kiocb *timeout;
7301         int ret = 0, err = -ECANCELED;
7302
7303         /* one will be dropped by ->io_free_work() after returning to io-wq */
7304         if (!(req->flags & REQ_F_REFCOUNT))
7305                 __io_req_set_refcount(req, 2);
7306         else
7307                 req_ref_get(req);
7308
7309         timeout = io_prep_linked_timeout(req);
7310         if (timeout)
7311                 io_queue_linked_timeout(timeout);
7312
7313
7314         /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
7315         if (work->flags & IO_WQ_WORK_CANCEL) {
7316 fail:
7317                 io_req_task_queue_fail(req, err);
7318                 return;
7319         }
7320         if (!io_assign_file(req, issue_flags)) {
7321                 err = -EBADF;
7322                 work->flags |= IO_WQ_WORK_CANCEL;
7323                 goto fail;
7324         }
7325
7326         if (req->flags & REQ_F_FORCE_ASYNC) {
7327                 bool opcode_poll = def->pollin || def->pollout;
7328
7329                 if (opcode_poll && file_can_poll(req->file)) {
7330                         needs_poll = true;
7331                         issue_flags |= IO_URING_F_NONBLOCK;
7332                 }
7333         }
7334
7335         do {
7336                 ret = io_issue_sqe(req, issue_flags);
7337                 if (ret != -EAGAIN)
7338                         break;
7339                 /*
7340                  * We can get EAGAIN for iopolled IO even though we're
7341                  * forcing a sync submission from here, since we can't
7342                  * wait for request slots on the block side.
7343                  */
7344                 if (!needs_poll) {
7345                         cond_resched();
7346                         continue;
7347                 }
7348
7349                 if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
7350                         return;
7351                 /* aborted or ready, in either case retry blocking */
7352                 needs_poll = false;
7353                 issue_flags &= ~IO_URING_F_NONBLOCK;
7354         } while (1);
7355
7356         /* avoid locking problems by failing it from a clean context */
7357         if (ret)
7358                 io_req_task_queue_fail(req, ret);
7359 }
7360
7361 static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
7362                                                        unsigned i)
7363 {
7364         return &table->files[i];
7365 }
7366
7367 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
7368                                               int index)
7369 {
7370         struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);
7371
7372         return (struct file *) (slot->file_ptr & FFS_MASK);
7373 }
7374
7375 static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
7376 {
7377         unsigned long file_ptr = (unsigned long) file;
7378
7379         file_ptr |= io_file_get_flags(file);
7380         file_slot->file_ptr = file_ptr;
7381 }
7382
7383 static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
7384                                              unsigned int issue_flags)
7385 {
7386         struct io_ring_ctx *ctx = req->ctx;
7387         struct file *file = NULL;
7388         unsigned long file_ptr;
7389
7390         if (issue_flags & IO_URING_F_UNLOCKED)
7391                 mutex_lock(&ctx->uring_lock);
7392
7393         if (unlikely((unsigned int)fd >= ctx->nr_user_files))
7394                 goto out;
7395         fd = array_index_nospec(fd, ctx->nr_user_files);
7396         file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
7397         file = (struct file *) (file_ptr & FFS_MASK);
7398         file_ptr &= ~FFS_MASK;
7399         /* mask in overlapping REQ_F and FFS bits */
7400         req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
7401         io_req_set_rsrc_node(req, ctx, 0);
7402 out:
7403         if (issue_flags & IO_URING_F_UNLOCKED)
7404                 mutex_unlock(&ctx->uring_lock);
7405         return file;
7406 }
7407
7408 /*
7409  * Drop the file for requeue operations. Only used of req->file is the
7410  * io_uring descriptor itself.
7411  */
7412 static void io_drop_inflight_file(struct io_kiocb *req)
7413 {
7414         if (unlikely(req->flags & REQ_F_INFLIGHT)) {
7415                 fput(req->file);
7416                 req->file = NULL;
7417                 req->flags &= ~REQ_F_INFLIGHT;
7418         }
7419 }
7420
7421 static struct file *io_file_get_normal(struct io_kiocb *req, int fd)
7422 {
7423         struct file *file = fget(fd);
7424
7425         trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd);
7426
7427         /* we don't allow fixed io_uring files */
7428         if (file && file->f_op == &io_uring_fops)
7429                 req->flags |= REQ_F_INFLIGHT;
7430         return file;
7431 }
7432
7433 static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
7434 {
7435         struct io_kiocb *prev = req->timeout.prev;
7436         int ret = -ENOENT;
7437
7438         if (prev) {
7439                 if (!(req->task->flags & PF_EXITING))
7440                         ret = io_try_cancel_userdata(req, prev->cqe.user_data);
7441                 io_req_complete_post(req, ret ?: -ETIME, 0);
7442                 io_put_req(prev);
7443         } else {
7444                 io_req_complete_post(req, -ETIME, 0);
7445         }
7446 }
7447
7448 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
7449 {
7450         struct io_timeout_data *data = container_of(timer,
7451                                                 struct io_timeout_data, timer);
7452         struct io_kiocb *prev, *req = data->req;
7453         struct io_ring_ctx *ctx = req->ctx;
7454         unsigned long flags;
7455
7456         spin_lock_irqsave(&ctx->timeout_lock, flags);
7457         prev = req->timeout.head;
7458         req->timeout.head = NULL;
7459
7460         /*
7461          * We don't expect the list to be empty, that will only happen if we
7462          * race with the completion of the linked work.
7463          */
7464         if (prev) {
7465                 io_remove_next_linked(prev);
7466                 if (!req_ref_inc_not_zero(prev))
7467                         prev = NULL;
7468         }
7469         list_del(&req->timeout.list);
7470         req->timeout.prev = prev;
7471         spin_unlock_irqrestore(&ctx->timeout_lock, flags);
7472
7473         req->io_task_work.func = io_req_task_link_timeout;
7474         io_req_task_work_add(req, false);
7475         return HRTIMER_NORESTART;
7476 }
7477
7478 static void io_queue_linked_timeout(struct io_kiocb *req)
7479 {
7480         struct io_ring_ctx *ctx = req->ctx;
7481
7482         spin_lock_irq(&ctx->timeout_lock);
7483         /*
7484          * If the back reference is NULL, then our linked request finished
7485          * before we got a chance to setup the timer
7486          */
7487         if (req->timeout.head) {
7488                 struct io_timeout_data *data = req->async_data;
7489
7490                 data->timer.function = io_link_timeout_fn;
7491                 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
7492                                 data->mode);
7493                 list_add_tail(&req->timeout.list, &ctx->ltimeout_list);
7494         }
7495         spin_unlock_irq(&ctx->timeout_lock);
7496         /* drop submission reference */
7497         io_put_req(req);
7498 }
7499
7500 static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
7501         __must_hold(&req->ctx->uring_lock)
7502 {
7503         struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
7504
7505         switch (io_arm_poll_handler(req, 0)) {
7506         case IO_APOLL_READY:
7507                 io_req_task_queue(req);
7508                 break;
7509         case IO_APOLL_ABORTED:
7510                 /*
7511                  * Queued up for async execution, worker will release
7512                  * submit reference when the iocb is actually submitted.
7513                  */
7514                 io_queue_async_work(req, NULL);
7515                 break;
7516         case IO_APOLL_OK:
7517                 break;
7518         }
7519
7520         if (linked_timeout)
7521                 io_queue_linked_timeout(linked_timeout);
7522 }
7523
7524 static inline void __io_queue_sqe(struct io_kiocb *req)
7525         __must_hold(&req->ctx->uring_lock)
7526 {
7527         struct io_kiocb *linked_timeout;
7528         int ret;
7529
7530         ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
7531
7532         if (req->flags & REQ_F_COMPLETE_INLINE) {
7533                 io_req_add_compl_list(req);
7534                 return;
7535         }
7536         /*
7537          * We async punt it if the file wasn't marked NOWAIT, or if the file
7538          * doesn't support non-blocking read/write attempts
7539          */
7540         if (likely(!ret)) {
7541                 linked_timeout = io_prep_linked_timeout(req);
7542                 if (linked_timeout)
7543                         io_queue_linked_timeout(linked_timeout);
7544         } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
7545                 io_queue_sqe_arm_apoll(req);
7546         } else {
7547                 io_req_complete_failed(req, ret);
7548         }
7549 }
7550
7551 static void io_queue_sqe_fallback(struct io_kiocb *req)
7552         __must_hold(&req->ctx->uring_lock)
7553 {
7554         if (req->flags & REQ_F_FAIL) {
7555                 io_req_complete_fail_submit(req);
7556         } else if (unlikely(req->ctx->drain_active)) {
7557                 io_drain_req(req);
7558         } else {
7559                 int ret = io_req_prep_async(req);
7560
7561                 if (unlikely(ret))
7562                         io_req_complete_failed(req, ret);
7563                 else
7564                         io_queue_async_work(req, NULL);
7565         }
7566 }
7567
7568 static inline void io_queue_sqe(struct io_kiocb *req)
7569         __must_hold(&req->ctx->uring_lock)
7570 {
7571         if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))))
7572                 __io_queue_sqe(req);
7573         else
7574                 io_queue_sqe_fallback(req);
7575 }
7576
7577 /*
7578  * Check SQE restrictions (opcode and flags).
7579  *
7580  * Returns 'true' if SQE is allowed, 'false' otherwise.
7581  */
7582 static inline bool io_check_restriction(struct io_ring_ctx *ctx,
7583                                         struct io_kiocb *req,
7584                                         unsigned int sqe_flags)
7585 {
7586         if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
7587                 return false;
7588
7589         if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
7590             ctx->restrictions.sqe_flags_required)
7591                 return false;
7592
7593         if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
7594                           ctx->restrictions.sqe_flags_required))
7595                 return false;
7596
7597         return true;
7598 }
7599
7600 static void io_init_req_drain(struct io_kiocb *req)
7601 {
7602         struct io_ring_ctx *ctx = req->ctx;
7603         struct io_kiocb *head = ctx->submit_state.link.head;
7604
7605         ctx->drain_active = true;
7606         if (head) {
7607                 /*
7608                  * If we need to drain a request in the middle of a link, drain
7609                  * the head request and the next request/link after the current
7610                  * link. Considering sequential execution of links,
7611                  * REQ_F_IO_DRAIN will be maintained for every request of our
7612                  * link.
7613                  */
7614                 head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
7615                 ctx->drain_next = true;
7616         }
7617 }
7618
7619 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
7620                        const struct io_uring_sqe *sqe)
7621         __must_hold(&ctx->uring_lock)
7622 {
7623         unsigned int sqe_flags;
7624         int personality;
7625         u8 opcode;
7626
7627         /* req is partially pre-initialised, see io_preinit_req() */
7628         req->opcode = opcode = READ_ONCE(sqe->opcode);
7629         /* same numerical values with corresponding REQ_F_*, safe to copy */
7630         req->flags = sqe_flags = READ_ONCE(sqe->flags);
7631         req->cqe.user_data = READ_ONCE(sqe->user_data);
7632         req->file = NULL;
7633         req->fixed_rsrc_refs = NULL;
7634         req->task = current;
7635
7636         if (unlikely(opcode >= IORING_OP_LAST)) {
7637                 req->opcode = 0;
7638                 return -EINVAL;
7639         }
7640         if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
7641                 /* enforce forwards compatibility on users */
7642                 if (sqe_flags & ~SQE_VALID_FLAGS)
7643                         return -EINVAL;
7644                 if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
7645                     !io_op_defs[opcode].buffer_select)
7646                         return -EOPNOTSUPP;
7647                 if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
7648                         ctx->drain_disabled = true;
7649                 if (sqe_flags & IOSQE_IO_DRAIN) {
7650                         if (ctx->drain_disabled)
7651                                 return -EOPNOTSUPP;
7652                         io_init_req_drain(req);
7653                 }
7654         }
7655         if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
7656                 if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
7657                         return -EACCES;
7658                 /* knock it to the slow queue path, will be drained there */
7659                 if (ctx->drain_active)
7660                         req->flags |= REQ_F_FORCE_ASYNC;
7661                 /* if there is no link, we're at "next" request and need to drain */
7662                 if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
7663                         ctx->drain_next = false;
7664                         ctx->drain_active = true;
7665                         req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
7666                 }
7667         }
7668
7669         if (io_op_defs[opcode].needs_file) {
7670                 struct io_submit_state *state = &ctx->submit_state;
7671
7672                 req->cqe.fd = READ_ONCE(sqe->fd);
7673
7674                 /*
7675                  * Plug now if we have more than 2 IO left after this, and the
7676                  * target is potentially a read/write to block based storage.
7677                  */
7678                 if (state->need_plug && io_op_defs[opcode].plug) {
7679                         state->plug_started = true;
7680                         state->need_plug = false;
7681                         blk_start_plug_nr_ios(&state->plug, state->submit_nr);
7682                 }
7683         }
7684
7685         personality = READ_ONCE(sqe->personality);
7686         if (personality) {
7687                 int ret;
7688
7689                 req->creds = xa_load(&ctx->personalities, personality);
7690                 if (!req->creds)
7691                         return -EINVAL;
7692                 get_cred(req->creds);
7693                 ret = security_uring_override_creds(req->creds);
7694                 if (ret) {
7695                         put_cred(req->creds);
7696                         return ret;
7697                 }
7698                 req->flags |= REQ_F_CREDS;
7699         }
7700
7701         return io_req_prep(req, sqe);
7702 }
7703
7704 static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
7705                          const struct io_uring_sqe *sqe)
7706         __must_hold(&ctx->uring_lock)
7707 {
7708         struct io_submit_link *link = &ctx->submit_state.link;
7709         int ret;
7710
7711         ret = io_init_req(ctx, req, sqe);
7712         if (unlikely(ret)) {
7713                 trace_io_uring_req_failed(sqe, ctx, req, ret);
7714
7715                 /* fail even hard links since we don't submit */
7716                 if (link->head) {
7717                         /*
7718                          * we can judge a link req is failed or cancelled by if
7719                          * REQ_F_FAIL is set, but the head is an exception since
7720                          * it may be set REQ_F_FAIL because of other req's failure
7721                          * so let's leverage req->cqe.res to distinguish if a head
7722                          * is set REQ_F_FAIL because of its failure or other req's
7723                          * failure so that we can set the correct ret code for it.
7724                          * init result here to avoid affecting the normal path.
7725                          */
7726                         if (!(link->head->flags & REQ_F_FAIL))
7727                                 req_fail_link_node(link->head, -ECANCELED);
7728                 } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
7729                         /*
7730                          * the current req is a normal req, we should return
7731                          * error and thus break the submittion loop.
7732                          */
7733                         io_req_complete_failed(req, ret);
7734                         return ret;
7735                 }
7736                 req_fail_link_node(req, ret);
7737         }
7738
7739         /* don't need @sqe from now on */
7740         trace_io_uring_submit_sqe(ctx, req, req->cqe.user_data, req->opcode,
7741                                   req->flags, true,
7742                                   ctx->flags & IORING_SETUP_SQPOLL);
7743
7744         /*
7745          * If we already have a head request, queue this one for async
7746          * submittal once the head completes. If we don't have a head but
7747          * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
7748          * submitted sync once the chain is complete. If none of those
7749          * conditions are true (normal request), then just queue it.
7750          */
7751         if (link->head) {
7752                 struct io_kiocb *head = link->head;
7753
7754                 if (!(req->flags & REQ_F_FAIL)) {
7755                         ret = io_req_prep_async(req);
7756                         if (unlikely(ret)) {
7757                                 req_fail_link_node(req, ret);
7758                                 if (!(head->flags & REQ_F_FAIL))
7759                                         req_fail_link_node(head, -ECANCELED);
7760                         }
7761                 }
7762                 trace_io_uring_link(ctx, req, head);
7763                 link->last->link = req;
7764                 link->last = req;
7765
7766                 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
7767                         return 0;
7768                 /* last request of a link, enqueue the link */
7769                 link->head = NULL;
7770                 req = head;
7771         } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
7772                 link->head = req;
7773                 link->last = req;
7774                 return 0;
7775         }
7776
7777         io_queue_sqe(req);
7778         return 0;
7779 }
7780
7781 /*
7782  * Batched submission is done, ensure local IO is flushed out.
7783  */
7784 static void io_submit_state_end(struct io_ring_ctx *ctx)
7785 {
7786         struct io_submit_state *state = &ctx->submit_state;
7787
7788         if (state->link.head)
7789                 io_queue_sqe(state->link.head);
7790         /* flush only after queuing links as they can generate completions */
7791         io_submit_flush_completions(ctx);
7792         if (state->plug_started)
7793                 blk_finish_plug(&state->plug);
7794 }
7795
7796 /*
7797  * Start submission side cache.
7798  */
7799 static void io_submit_state_start(struct io_submit_state *state,
7800                                   unsigned int max_ios)
7801 {
7802         state->plug_started = false;
7803         state->need_plug = max_ios > 2;
7804         state->submit_nr = max_ios;
7805         /* set only head, no need to init link_last in advance */
7806         state->link.head = NULL;
7807 }
7808
7809 static void io_commit_sqring(struct io_ring_ctx *ctx)
7810 {
7811         struct io_rings *rings = ctx->rings;
7812
7813         /*
7814          * Ensure any loads from the SQEs are done at this point,
7815          * since once we write the new head, the application could
7816          * write new data to them.
7817          */
7818         smp_store_release(&rings->sq.head, ctx->cached_sq_head);
7819 }
7820
7821 /*
7822  * Fetch an sqe, if one is available. Note this returns a pointer to memory
7823  * that is mapped by userspace. This means that care needs to be taken to
7824  * ensure that reads are stable, as we cannot rely on userspace always
7825  * being a good citizen. If members of the sqe are validated and then later
7826  * used, it's important that those reads are done through READ_ONCE() to
7827  * prevent a re-load down the line.
7828  */
7829 static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
7830 {
7831         unsigned head, mask = ctx->sq_entries - 1;
7832         unsigned sq_idx = ctx->cached_sq_head++ & mask;
7833
7834         /*
7835          * The cached sq head (or cq tail) serves two purposes:
7836          *
7837          * 1) allows us to batch the cost of updating the user visible
7838          *    head updates.
7839          * 2) allows the kernel side to track the head on its own, even
7840          *    though the application is the one updating it.
7841          */
7842         head = READ_ONCE(ctx->sq_array[sq_idx]);
7843         if (likely(head < ctx->sq_entries))
7844                 return &ctx->sq_sqes[head];
7845
7846         /* drop invalid entries */
7847         ctx->cq_extra--;
7848         WRITE_ONCE(ctx->rings->sq_dropped,
7849                    READ_ONCE(ctx->rings->sq_dropped) + 1);
7850         return NULL;
7851 }
7852
7853 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
7854         __must_hold(&ctx->uring_lock)
7855 {
7856         unsigned int entries = io_sqring_entries(ctx);
7857         int submitted = 0;
7858
7859         if (unlikely(!entries))
7860                 return 0;
7861         /* make sure SQ entry isn't read before tail */
7862         nr = min3(nr, ctx->sq_entries, entries);
7863         io_get_task_refs(nr);
7864
7865         io_submit_state_start(&ctx->submit_state, nr);
7866         do {
7867                 const struct io_uring_sqe *sqe;
7868                 struct io_kiocb *req;
7869
7870                 if (unlikely(!io_alloc_req_refill(ctx))) {
7871                         if (!submitted)
7872                                 submitted = -EAGAIN;
7873                         break;
7874                 }
7875                 req = io_alloc_req(ctx);
7876                 sqe = io_get_sqe(ctx);
7877                 if (unlikely(!sqe)) {
7878                         wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
7879                         break;
7880                 }
7881                 /* will complete beyond this point, count as submitted */
7882                 submitted++;
7883                 if (io_submit_sqe(ctx, req, sqe)) {
7884                         /*
7885                          * Continue submitting even for sqe failure if the
7886                          * ring was setup with IORING_SETUP_SUBMIT_ALL
7887                          */
7888                         if (!(ctx->flags & IORING_SETUP_SUBMIT_ALL))
7889                                 break;
7890                 }
7891         } while (submitted < nr);
7892
7893         if (unlikely(submitted != nr)) {
7894                 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
7895                 int unused = nr - ref_used;
7896
7897                 current->io_uring->cached_refs += unused;
7898         }
7899
7900         io_submit_state_end(ctx);
7901          /* Commit SQ ring head once we've consumed and submitted all SQEs */
7902         io_commit_sqring(ctx);
7903
7904         return submitted;
7905 }
7906
7907 static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
7908 {
7909         return READ_ONCE(sqd->state);
7910 }
7911
7912 static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
7913 {
7914         /* Tell userspace we may need a wakeup call */
7915         spin_lock(&ctx->completion_lock);
7916         WRITE_ONCE(ctx->rings->sq_flags,
7917                    ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
7918         spin_unlock(&ctx->completion_lock);
7919 }
7920
7921 static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
7922 {
7923         spin_lock(&ctx->completion_lock);
7924         WRITE_ONCE(ctx->rings->sq_flags,
7925                    ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
7926         spin_unlock(&ctx->completion_lock);
7927 }
7928
7929 static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
7930 {
7931         unsigned int to_submit;
7932         int ret = 0;
7933
7934         to_submit = io_sqring_entries(ctx);
7935         /* if we're handling multiple rings, cap submit size for fairness */
7936         if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
7937                 to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
7938
7939         if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
7940                 const struct cred *creds = NULL;
7941
7942                 if (ctx->sq_creds != current_cred())
7943                         creds = override_creds(ctx->sq_creds);
7944
7945                 mutex_lock(&ctx->uring_lock);
7946                 if (!wq_list_empty(&ctx->iopoll_list))
7947                         io_do_iopoll(ctx, true);
7948
7949                 /*
7950                  * Don't submit if refs are dying, good for io_uring_register(),
7951                  * but also it is relied upon by io_ring_exit_work()
7952                  */
7953                 if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
7954                     !(ctx->flags & IORING_SETUP_R_DISABLED))
7955                         ret = io_submit_sqes(ctx, to_submit);
7956                 mutex_unlock(&ctx->uring_lock);
7957
7958                 if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
7959                         wake_up(&ctx->sqo_sq_wait);
7960                 if (creds)
7961                         revert_creds(creds);
7962         }
7963
7964         return ret;
7965 }
7966
7967 static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd)
7968 {
7969         struct io_ring_ctx *ctx;
7970         unsigned sq_thread_idle = 0;
7971
7972         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
7973                 sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
7974         sqd->sq_thread_idle = sq_thread_idle;
7975 }
7976
7977 static bool io_sqd_handle_event(struct io_sq_data *sqd)
7978 {
7979         bool did_sig = false;
7980         struct ksignal ksig;
7981
7982         if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
7983             signal_pending(current)) {
7984                 mutex_unlock(&sqd->lock);
7985                 if (signal_pending(current))
7986                         did_sig = get_signal(&ksig);
7987                 cond_resched();
7988                 mutex_lock(&sqd->lock);
7989         }
7990         return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
7991 }
7992
7993 static int io_sq_thread(void *data)
7994 {
7995         struct io_sq_data *sqd = data;
7996         struct io_ring_ctx *ctx;
7997         unsigned long timeout = 0;
7998         char buf[TASK_COMM_LEN];
7999         DEFINE_WAIT(wait);
8000
8001         snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
8002         set_task_comm(current, buf);
8003
8004         if (sqd->sq_cpu != -1)
8005                 set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
8006         else
8007                 set_cpus_allowed_ptr(current, cpu_online_mask);
8008         current->flags |= PF_NO_SETAFFINITY;
8009
8010         audit_alloc_kernel(current);
8011
8012         mutex_lock(&sqd->lock);
8013         while (1) {
8014                 bool cap_entries, sqt_spin = false;
8015
8016                 if (io_sqd_events_pending(sqd) || signal_pending(current)) {
8017                         if (io_sqd_handle_event(sqd))
8018                                 break;
8019                         timeout = jiffies + sqd->sq_thread_idle;
8020                 }
8021
8022                 cap_entries = !list_is_singular(&sqd->ctx_list);
8023                 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
8024                         int ret = __io_sq_thread(ctx, cap_entries);
8025
8026                         if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
8027                                 sqt_spin = true;
8028                 }
8029                 if (io_run_task_work())
8030                         sqt_spin = true;
8031
8032                 if (sqt_spin || !time_after(jiffies, timeout)) {
8033                         cond_resched();
8034                         if (sqt_spin)
8035                                 timeout = jiffies + sqd->sq_thread_idle;
8036                         continue;
8037                 }
8038
8039                 prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
8040                 if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) {
8041                         bool needs_sched = true;
8042
8043                         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
8044                                 io_ring_set_wakeup_flag(ctx);
8045
8046                                 if ((ctx->flags & IORING_SETUP_IOPOLL) &&
8047                                     !wq_list_empty(&ctx->iopoll_list)) {
8048                                         needs_sched = false;
8049                                         break;
8050                                 }
8051
8052                                 /*
8053                                  * Ensure the store of the wakeup flag is not
8054                                  * reordered with the load of the SQ tail
8055                                  */
8056                                 smp_mb();
8057
8058                                 if (io_sqring_entries(ctx)) {
8059                                         needs_sched = false;
8060                                         break;
8061                                 }
8062                         }
8063
8064                         if (needs_sched) {
8065                                 mutex_unlock(&sqd->lock);
8066                                 schedule();
8067                                 mutex_lock(&sqd->lock);
8068                         }
8069                         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
8070                                 io_ring_clear_wakeup_flag(ctx);
8071                 }
8072
8073                 finish_wait(&sqd->wait, &wait);
8074                 timeout = jiffies + sqd->sq_thread_idle;
8075         }
8076
8077         io_uring_cancel_generic(true, sqd);
8078         sqd->thread = NULL;
8079         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
8080                 io_ring_set_wakeup_flag(ctx);
8081         io_run_task_work();
8082         mutex_unlock(&sqd->lock);
8083
8084         audit_free(current);
8085
8086         complete(&sqd->exited);
8087         do_exit(0);
8088 }
8089
8090 struct io_wait_queue {
8091         struct wait_queue_entry wq;
8092         struct io_ring_ctx *ctx;
8093         unsigned cq_tail;
8094         unsigned nr_timeouts;
8095 };
8096
8097 static inline bool io_should_wake(struct io_wait_queue *iowq)
8098 {
8099         struct io_ring_ctx *ctx = iowq->ctx;
8100         int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;
8101
8102         /*
8103          * Wake up if we have enough events, or if a timeout occurred since we
8104          * started waiting. For timeouts, we always want to return to userspace,
8105          * regardless of event count.
8106          */
8107         return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
8108 }
8109
8110 static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
8111                             int wake_flags, void *key)
8112 {
8113         struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
8114                                                         wq);
8115
8116         /*
8117          * Cannot safely flush overflowed CQEs from here, ensure we wake up
8118          * the task, and the next invocation will do it.
8119          */
8120         if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow))
8121                 return autoremove_wake_function(curr, mode, wake_flags, key);
8122         return -1;
8123 }
8124
8125 static int io_run_task_work_sig(void)
8126 {
8127         if (io_run_task_work())
8128                 return 1;
8129         if (test_thread_flag(TIF_NOTIFY_SIGNAL))
8130                 return -ERESTARTSYS;
8131         if (task_sigpending(current))
8132                 return -EINTR;
8133         return 0;
8134 }
8135
8136 /* when returns >0, the caller should retry */
8137 static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
8138                                           struct io_wait_queue *iowq,
8139                                           ktime_t timeout)
8140 {
8141         int ret;
8142
8143         /* make sure we run task_work before checking for signals */
8144         ret = io_run_task_work_sig();
8145         if (ret || io_should_wake(iowq))
8146                 return ret;
8147         /* let the caller flush overflows, retry */
8148         if (test_bit(0, &ctx->check_cq_overflow))
8149                 return 1;
8150
8151         if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
8152                 return -ETIME;
8153         return 1;
8154 }
8155
8156 /*
8157  * Wait until events become available, if we don't already have some. The
8158  * application must reap them itself, as they reside on the shared cq ring.
8159  */
8160 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
8161                           const sigset_t __user *sig, size_t sigsz,
8162                           struct __kernel_timespec __user *uts)
8163 {
8164         struct io_wait_queue iowq;
8165         struct io_rings *rings = ctx->rings;
8166         ktime_t timeout = KTIME_MAX;
8167         int ret;
8168
8169         do {
8170                 io_cqring_overflow_flush(ctx);
8171                 if (io_cqring_events(ctx) >= min_events)
8172                         return 0;
8173                 if (!io_run_task_work())
8174                         break;
8175         } while (1);
8176
8177         if (sig) {
8178 #ifdef CONFIG_COMPAT
8179                 if (in_compat_syscall())
8180                         ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
8181                                                       sigsz);
8182                 else
8183 #endif
8184                         ret = set_user_sigmask(sig, sigsz);
8185
8186                 if (ret)
8187                         return ret;
8188         }
8189
8190         if (uts) {
8191                 struct timespec64 ts;
8192
8193                 if (get_timespec64(&ts, uts))
8194                         return -EFAULT;
8195                 timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
8196         }
8197
8198         init_waitqueue_func_entry(&iowq.wq, io_wake_function);
8199         iowq.wq.private = current;
8200         INIT_LIST_HEAD(&iowq.wq.entry);
8201         iowq.ctx = ctx;
8202         iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
8203         iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
8204
8205         trace_io_uring_cqring_wait(ctx, min_events);
8206         do {
8207                 /* if we can't even flush overflow, don't wait for more */
8208                 if (!io_cqring_overflow_flush(ctx)) {
8209                         ret = -EBUSY;
8210                         break;
8211                 }
8212                 prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
8213                                                 TASK_INTERRUPTIBLE);
8214                 ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
8215                 cond_resched();
8216         } while (ret > 0);
8217
8218         finish_wait(&ctx->cq_wait, &iowq.wq);
8219         restore_saved_sigmask_unless(ret == -EINTR);
8220
8221         return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
8222 }
8223
8224 static void io_free_page_table(void **table, size_t size)
8225 {
8226         unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
8227
8228         for (i = 0; i < nr_tables; i++)
8229                 kfree(table[i]);
8230         kfree(table);
8231 }
8232
8233 static __cold void **io_alloc_page_table(size_t size)
8234 {
8235         unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
8236         size_t init_size = size;
8237         void **table;
8238
8239         table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
8240         if (!table)
8241                 return NULL;
8242
8243         for (i = 0; i < nr_tables; i++) {
8244                 unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
8245
8246                 table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
8247                 if (!table[i]) {
8248                         io_free_page_table(table, init_size);
8249                         return NULL;
8250                 }
8251                 size -= this_size;
8252         }
8253         return table;
8254 }
8255
8256 static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
8257 {
8258         percpu_ref_exit(&ref_node->refs);
8259         kfree(ref_node);
8260 }
8261
8262 static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
8263 {
8264         struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
8265         struct io_ring_ctx *ctx = node->rsrc_data->ctx;
8266         unsigned long flags;
8267         bool first_add = false;
8268         unsigned long delay = HZ;
8269
8270         spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
8271         node->done = true;
8272
8273         /* if we are mid-quiesce then do not delay */
8274         if (node->rsrc_data->quiesce)
8275                 delay = 0;
8276
8277         while (!list_empty(&ctx->rsrc_ref_list)) {
8278                 node = list_first_entry(&ctx->rsrc_ref_list,
8279                                             struct io_rsrc_node, node);
8280                 /* recycle ref nodes in order */
8281                 if (!node->done)
8282                         break;
8283                 list_del(&node->node);
8284                 first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
8285         }
8286         spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
8287
8288         if (first_add)
8289                 mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
8290 }
8291
8292 static struct io_rsrc_node *io_rsrc_node_alloc(void)
8293 {
8294         struct io_rsrc_node *ref_node;
8295
8296         ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
8297         if (!ref_node)
8298                 return NULL;
8299
8300         if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
8301                             0, GFP_KERNEL)) {
8302                 kfree(ref_node);
8303                 return NULL;
8304         }
8305         INIT_LIST_HEAD(&ref_node->node);
8306         INIT_LIST_HEAD(&ref_node->rsrc_list);
8307         ref_node->done = false;
8308         return ref_node;
8309 }
8310
8311 static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
8312                                 struct io_rsrc_data *data_to_kill)
8313         __must_hold(&ctx->uring_lock)
8314 {
8315         WARN_ON_ONCE(!ctx->rsrc_backup_node);
8316         WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
8317
8318         io_rsrc_refs_drop(ctx);
8319
8320         if (data_to_kill) {
8321                 struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
8322
8323                 rsrc_node->rsrc_data = data_to_kill;
8324                 spin_lock_irq(&ctx->rsrc_ref_lock);
8325                 list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
8326                 spin_unlock_irq(&ctx->rsrc_ref_lock);
8327
8328                 atomic_inc(&data_to_kill->refs);
8329                 percpu_ref_kill(&rsrc_node->refs);
8330                 ctx->rsrc_node = NULL;
8331         }
8332
8333         if (!ctx->rsrc_node) {
8334                 ctx->rsrc_node = ctx->rsrc_backup_node;
8335                 ctx->rsrc_backup_node = NULL;
8336         }
8337 }
8338
8339 static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
8340 {
8341         if (ctx->rsrc_backup_node)
8342                 return 0;
8343         ctx->rsrc_backup_node = io_rsrc_node_alloc();
8344         return ctx->rsrc_backup_node ? 0 : -ENOMEM;
8345 }
8346
/*
 * Wait until no references to @data remain.
 *
 * Each round retires the current rsrc node against @data, drops the initial
 * reference and, if that was not the last one, sleeps on data->done with
 * ->uring_lock released. ->uring_lock is unlocked and re-locked inside, so
 * the caller is expected to hold it; it is held again on every return path
 * that went through the loop.
 *
 * Returns 0 once the references are gone, -ENXIO if a quiesce of @data is
 * already in progress, the error from io_rsrc_node_switch_start(), or a
 * negative value from io_run_task_work_sig().
 */
static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
                                      struct io_ring_ctx *ctx)
{
        int ret;

        /* As we may drop ->uring_lock, other task may have started quiesce */
        if (data->quiesce)
                return -ENXIO;

        data->quiesce = true;
        do {
                /* a backup node is needed for the switch below */
                ret = io_rsrc_node_switch_start(ctx);
                if (ret)
                        break;
                io_rsrc_node_switch(ctx, data);

                /* kill initial ref, already quiesced if zero */
                if (atomic_dec_and_test(&data->refs))
                        break;
                mutex_unlock(&ctx->uring_lock);
                flush_delayed_work(&ctx->rsrc_put_work);
                ret = wait_for_completion_interruptible(&data->done);
                if (!ret) {
                        mutex_lock(&ctx->uring_lock);
                        if (atomic_read(&data->refs) > 0) {
                                /*
                                 * it has been revived by another thread while
                                 * we were unlocked
                                 */
                                mutex_unlock(&ctx->uring_lock);
                        } else {
                                break;
                        }
                }

                /*
                 * Not quiesced (interrupted or revived): take the initial
                 * reference back before retrying or bailing out.
                 */
                atomic_inc(&data->refs);
                /* wait for all works potentially completing data->done */
                flush_delayed_work(&ctx->rsrc_put_work);
                reinit_completion(&data->done);

                /* a negative result ends the loop and is returned */
                ret = io_run_task_work_sig();
                mutex_lock(&ctx->uring_lock);
        } while (ret >= 0);
        data->quiesce = false;

        return ret;
}
8394
8395 static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
8396 {
8397         unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
8398         unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
8399
8400         return &data->tags[table_idx][off];
8401 }
8402
8403 static void io_rsrc_data_free(struct io_rsrc_data *data)
8404 {
8405         size_t size = data->nr * sizeof(data->tags[0][0]);
8406
8407         if (data->tags)
8408                 io_free_page_table((void **)data->tags, size);
8409         kfree(data);
8410 }
8411
8412 static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
8413                                      u64 __user *utags, unsigned nr,
8414                                      struct io_rsrc_data **pdata)
8415 {
8416         struct io_rsrc_data *data;
8417         int ret = -ENOMEM;
8418         unsigned i;
8419
8420         data = kzalloc(sizeof(*data), GFP_KERNEL);
8421         if (!data)
8422                 return -ENOMEM;
8423         data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
8424         if (!data->tags) {
8425                 kfree(data);
8426                 return -ENOMEM;
8427         }
8428
8429         data->nr = nr;
8430         data->ctx = ctx;
8431         data->do_put = do_put;
8432         if (utags) {
8433                 ret = -EFAULT;
8434                 for (i = 0; i < nr; i++) {
8435                         u64 *tag_slot = io_get_tag_slot(data, i);
8436
8437                         if (copy_from_user(tag_slot, &utags[i],
8438                                            sizeof(*tag_slot)))
8439                                 goto fail;
8440                 }
8441         }
8442
8443         atomic_set(&data->refs, 1);
8444         init_completion(&data->done);
8445         *pdata = data;
8446         return 0;
8447 fail:
8448         io_rsrc_data_free(data);
8449         return ret;
8450 }
8451
8452 static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
8453 {
8454         table->files = kvcalloc(nr_files, sizeof(table->files[0]),
8455                                 GFP_KERNEL_ACCOUNT);
8456         return !!table->files;
8457 }
8458
8459 static void io_free_file_tables(struct io_file_table *table)
8460 {
8461         kvfree(table->files);
8462         table->files = NULL;
8463 }
8464
8465 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
8466 {
8467         int i;
8468
8469         for (i = 0; i < ctx->nr_user_files; i++) {
8470                 struct file *file = io_file_from_index(ctx, i);
8471
8472                 if (!file || io_file_need_scm(file))
8473                         continue;
8474                 io_fixed_file_slot(&ctx->file_table, i)->file_ptr = 0;
8475                 fput(file);
8476         }
8477
8478 #if defined(CONFIG_UNIX)
8479         if (ctx->ring_sock) {
8480                 struct sock *sock = ctx->ring_sock->sk;
8481                 struct sk_buff *skb;
8482
8483                 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
8484                         kfree_skb(skb);
8485         }
8486 #endif
8487         io_free_file_tables(&ctx->file_table);
8488         io_rsrc_data_free(ctx->file_data);
8489         ctx->file_data = NULL;
8490         ctx->nr_user_files = 0;
8491 }
8492
8493 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
8494 {
8495         int ret;
8496
8497         if (!ctx->file_data)
8498                 return -ENXIO;
8499         ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
8500         if (!ret)
8501                 __io_sqe_files_unregister(ctx);
8502         return ret;
8503 }
8504
8505 static void io_sq_thread_unpark(struct io_sq_data *sqd)
8506         __releases(&sqd->lock)
8507 {
8508         WARN_ON_ONCE(sqd->thread == current);
8509
8510         /*
8511          * Do the dance but not conditional clear_bit() because it'd race with
8512          * other threads incrementing park_pending and setting the bit.
8513          */
8514         clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
8515         if (atomic_dec_return(&sqd->park_pending))
8516                 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
8517         mutex_unlock(&sqd->lock);
8518 }
8519
8520 static void io_sq_thread_park(struct io_sq_data *sqd)
8521         __acquires(&sqd->lock)
8522 {
8523         WARN_ON_ONCE(sqd->thread == current);
8524
8525         atomic_inc(&sqd->park_pending);
8526         set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
8527         mutex_lock(&sqd->lock);
8528         if (sqd->thread)
8529                 wake_up_process(sqd->thread);
8530 }
8531
8532 static void io_sq_thread_stop(struct io_sq_data *sqd)
8533 {
8534         WARN_ON_ONCE(sqd->thread == current);
8535         WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
8536
8537         set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
8538         mutex_lock(&sqd->lock);
8539         if (sqd->thread)
8540                 wake_up_process(sqd->thread);
8541         mutex_unlock(&sqd->lock);
8542         wait_for_completion(&sqd->exited);
8543 }
8544
8545 static void io_put_sq_data(struct io_sq_data *sqd)
8546 {
8547         if (refcount_dec_and_test(&sqd->refs)) {
8548                 WARN_ON_ONCE(atomic_read(&sqd->park_pending));
8549
8550                 io_sq_thread_stop(sqd);
8551                 kfree(sqd);
8552         }
8553 }
8554
8555 static void io_sq_thread_finish(struct io_ring_ctx *ctx)
8556 {
8557         struct io_sq_data *sqd = ctx->sq_data;
8558
8559         if (sqd) {
8560                 io_sq_thread_park(sqd);
8561                 list_del_init(&ctx->sqd_list);
8562                 io_sqd_update_thread_idle(sqd);
8563                 io_sq_thread_unpark(sqd);
8564
8565                 io_put_sq_data(sqd);
8566                 ctx->sq_data = NULL;
8567         }
8568 }
8569
8570 static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
8571 {
8572         struct io_ring_ctx *ctx_attach;
8573         struct io_sq_data *sqd;
8574         struct fd f;
8575
8576         f = fdget(p->wq_fd);
8577         if (!f.file)
8578                 return ERR_PTR(-ENXIO);
8579         if (f.file->f_op != &io_uring_fops) {
8580                 fdput(f);
8581                 return ERR_PTR(-EINVAL);
8582         }
8583
8584         ctx_attach = f.file->private_data;
8585         sqd = ctx_attach->sq_data;
8586         if (!sqd) {
8587                 fdput(f);
8588                 return ERR_PTR(-EINVAL);
8589         }
8590         if (sqd->task_tgid != current->tgid) {
8591                 fdput(f);
8592                 return ERR_PTR(-EPERM);
8593         }
8594
8595         refcount_inc(&sqd->refs);
8596         fdput(f);
8597         return sqd;
8598 }
8599
8600 static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
8601                                          bool *attached)
8602 {
8603         struct io_sq_data *sqd;
8604
8605         *attached = false;
8606         if (p->flags & IORING_SETUP_ATTACH_WQ) {
8607                 sqd = io_attach_sq_data(p);
8608                 if (!IS_ERR(sqd)) {
8609                         *attached = true;
8610                         return sqd;
8611                 }
8612                 /* fall through for EPERM case, setup new sqd/task */
8613                 if (PTR_ERR(sqd) != -EPERM)
8614                         return sqd;
8615         }
8616
8617         sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
8618         if (!sqd)
8619                 return ERR_PTR(-ENOMEM);
8620
8621         atomic_set(&sqd->park_pending, 0);
8622         refcount_set(&sqd->refs, 1);
8623         INIT_LIST_HEAD(&sqd->ctx_list);
8624         mutex_init(&sqd->lock);
8625         init_waitqueue_head(&sqd->wait);
8626         init_completion(&sqd->exited);
8627         return sqd;
8628 }
8629
/*
 * Make the UNIX garbage collector aware of a registered file, so that the
 * io_uring can be safely unregistered on process exit even when the file
 * references form a loop. Only files that can themselves hold other files
 * are accounted; anything else cannot be part of a loop and is of no
 * interest to the GC.
 *
 * On success, for a file that needs accounting, the caller's reference is
 * dropped and replaced by one owned by the SCM_RIGHTS skb. Returns 0 or
 * -ENOMEM; without CONFIG_UNIX this is a no-op returning 0.
 */
static int io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
        struct sock *sk = ctx->ring_sock->sk;
        struct sk_buff_head *head = &sk->sk_receive_queue;
        struct scm_fp_list *fpl;
        struct sk_buff *skb;

        if (likely(!io_file_need_scm(file)))
                return 0;

        /*
         * Try to reuse the skb at the head of the queue if its SCM_RIGHTS
         * file set still has room; otherwise a new skb is set up below.
         */
        spin_lock_irq(&head->lock);
        skb = skb_peek(head);
        if (!skb || UNIXCB(skb).fp->count >= SCM_MAX_FD)
                skb = NULL;
        else
                __skb_unlink(skb, head);
        spin_unlock_irq(&head->lock);

        if (skb) {
                fpl = UNIXCB(skb).fp;
        } else {
                fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
                if (!fpl)
                        return -ENOMEM;

                skb = alloc_skb(0, GFP_KERNEL);
                if (!skb) {
                        kfree(fpl);
                        return -ENOMEM;
                }

                fpl->user = get_uid(current_user());
                fpl->max = SCM_MAX_FD;
                fpl->count = 0;

                UNIXCB(skb).fp = fpl;
                skb->sk = sk;
                skb->destructor = unix_destruct_scm;
                refcount_add(skb->truesize, &sk->sk_wmem_alloc);
        }

        fpl->fp[fpl->count++] = get_file(file);
        unix_inflight(fpl->user, file);
        skb_queue_head(head, skb);
        /* the skb holds its own reference now, drop the caller's */
        fput(file);
#endif
        return 0;
}
8690
/*
 * rsrc put callback for fixed files: release the file carried by @prsrc.
 *
 * Files without SCM accounting (and all files when CONFIG_UNIX is off) are
 * simply fput(). Otherwise the file is looked up in the SCM_RIGHTS skbs
 * queued on the ring socket, removed from the skb that holds it, and then
 * released.
 */
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
        struct file *file = prsrc->file;
#if defined(CONFIG_UNIX)
        struct sock *sock = ctx->ring_sock->sk;
        struct sk_buff_head list, *head = &sock->sk_receive_queue;
        struct sk_buff *skb;
        int i;

        if (!io_file_need_scm(file)) {
                fput(file);
                return;
        }

        /* private list collecting the skbs taken off the socket queue */
        __skb_queue_head_init(&list);

        /*
         * Find the skb that holds this file in its SCM_RIGHTS. When found,
         * remove this entry and rearrange the file array.
         */
        skb = skb_dequeue(head);
        while (skb) {
                struct scm_fp_list *fp;

                fp = UNIXCB(skb).fp;
                for (i = 0; i < fp->count; i++) {
                        int left;

                        if (fp->fp[i] != file)
                                continue;

                        unix_notinflight(fp->user, fp->fp[i]);
                        /* close the gap left by the removed entry */
                        left = fp->count - 1 - i;
                        if (left) {
                                memmove(&fp->fp[i], &fp->fp[i + 1],
                                                left * sizeof(struct file *));
                        }
                        fp->count--;
                        if (!fp->count) {
                                /* that was the last file in this skb */
                                kfree_skb(skb);
                                skb = NULL;
                        } else {
                                __skb_queue_tail(&list, skb);
                        }
                        fput(file);
                        /* file == NULL doubles as the "found" marker */
                        file = NULL;
                        break;
                }

                if (!file)
                        break;

                /* not in this skb: park it and look at the next one */
                __skb_queue_tail(&list, skb);

                skb = skb_dequeue(head);
        }

        /* put the skbs we pulled off back on the socket queue */
        if (skb_peek(&list)) {
                spin_lock_irq(&head->lock);
                while ((skb = __skb_dequeue(&list)) != NULL)
                        __skb_queue_tail(head, skb);
                spin_unlock_irq(&head->lock);
        }
#else
        fput(file);
#endif
}
8758
8759 static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
8760 {
8761         struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
8762         struct io_ring_ctx *ctx = rsrc_data->ctx;
8763         struct io_rsrc_put *prsrc, *tmp;
8764
8765         list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
8766                 list_del(&prsrc->list);
8767
8768                 if (prsrc->tag) {
8769                         if (ctx->flags & IORING_SETUP_IOPOLL)
8770                                 mutex_lock(&ctx->uring_lock);
8771
8772                         spin_lock(&ctx->completion_lock);
8773                         io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);
8774                         io_commit_cqring(ctx);
8775                         spin_unlock(&ctx->completion_lock);
8776                         io_cqring_ev_posted(ctx);
8777
8778                         if (ctx->flags & IORING_SETUP_IOPOLL)
8779                                 mutex_unlock(&ctx->uring_lock);
8780                 }
8781
8782                 rsrc_data->do_put(ctx, prsrc);
8783                 kfree(prsrc);
8784         }
8785
8786         io_rsrc_node_destroy(ref_node);
8787         if (atomic_dec_and_test(&rsrc_data->refs))
8788                 complete(&rsrc_data->done);
8789 }
8790
8791 static void io_rsrc_put_work(struct work_struct *work)
8792 {
8793         struct io_ring_ctx *ctx;
8794         struct llist_node *node;
8795
8796         ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
8797         node = llist_del_all(&ctx->rsrc_put_llist);
8798
8799         while (node) {
8800                 struct io_rsrc_node *ref_node;
8801                 struct llist_node *next = node->next;
8802
8803                 ref_node = llist_entry(node, struct io_rsrc_node, llist);
8804                 __io_rsrc_put_work(ref_node);
8805                 node = next;
8806         }
8807 }
8808
8809 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
8810                                  unsigned nr_args, u64 __user *tags)
8811 {
8812         __s32 __user *fds = (__s32 __user *) arg;
8813         struct file *file;
8814         int fd, ret;
8815         unsigned i;
8816
8817         if (ctx->file_data)
8818                 return -EBUSY;
8819         if (!nr_args)
8820                 return -EINVAL;
8821         if (nr_args > IORING_MAX_FIXED_FILES)
8822                 return -EMFILE;
8823         if (nr_args > rlimit(RLIMIT_NOFILE))
8824                 return -EMFILE;
8825         ret = io_rsrc_node_switch_start(ctx);
8826         if (ret)
8827                 return ret;
8828         ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
8829                                  &ctx->file_data);
8830         if (ret)
8831                 return ret;
8832
8833         if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
8834                 io_rsrc_data_free(ctx->file_data);
8835                 ctx->file_data = NULL;
8836                 return -ENOMEM;
8837         }
8838
8839         for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
8840                 struct io_fixed_file *file_slot;
8841
8842                 if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
8843                         ret = -EFAULT;
8844                         goto fail;
8845                 }
8846                 /* allow sparse sets */
8847                 if (fd == -1) {
8848                         ret = -EINVAL;
8849                         if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
8850                                 goto fail;
8851                         continue;
8852                 }
8853
8854                 file = fget(fd);
8855                 ret = -EBADF;
8856                 if (unlikely(!file))
8857                         goto fail;
8858
8859                 /*
8860                  * Don't allow io_uring instances to be registered. If UNIX
8861                  * isn't enabled, then this causes a reference cycle and this
8862                  * instance can never get freed. If UNIX is enabled we'll
8863                  * handle it just fine, but there's still no point in allowing
8864                  * a ring fd as it doesn't support regular read/write anyway.
8865                  */
8866                 if (file->f_op == &io_uring_fops) {
8867                         fput(file);
8868                         goto fail;
8869                 }
8870                 ret = io_scm_file_account(ctx, file);
8871                 if (ret) {
8872                         fput(file);
8873                         goto fail;
8874                 }
8875                 file_slot = io_fixed_file_slot(&ctx->file_table, i);
8876                 io_fixed_file_set(file_slot, file);
8877         }
8878
8879         io_rsrc_node_switch(ctx, NULL);
8880         return 0;
8881 fail:
8882         __io_sqe_files_unregister(ctx);
8883         return ret;
8884 }
8885
8886 static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
8887                                  struct io_rsrc_node *node, void *rsrc)
8888 {
8889         u64 *tag_slot = io_get_tag_slot(data, idx);
8890         struct io_rsrc_put *prsrc;
8891
8892         prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
8893         if (!prsrc)
8894                 return -ENOMEM;
8895
8896         prsrc->tag = *tag_slot;
8897         *tag_slot = 0;
8898         prsrc->rsrc = rsrc;
8899         list_add(&prsrc->list, &node->rsrc_list);
8900         return 0;
8901 }
8902
8903 static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
8904                                  unsigned int issue_flags, u32 slot_index)
8905 {
8906         struct io_ring_ctx *ctx = req->ctx;
8907         bool needs_switch = false;
8908         struct io_fixed_file *file_slot;
8909         int ret = -EBADF;
8910
8911         io_ring_submit_lock(ctx, issue_flags);
8912         if (file->f_op == &io_uring_fops)
8913                 goto err;
8914         ret = -ENXIO;
8915         if (!ctx->file_data)
8916                 goto err;
8917         ret = -EINVAL;
8918         if (slot_index >= ctx->nr_user_files)
8919                 goto err;
8920
8921         slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
8922         file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
8923
8924         if (file_slot->file_ptr) {
8925                 struct file *old_file;
8926
8927                 ret = io_rsrc_node_switch_start(ctx);
8928                 if (ret)
8929                         goto err;
8930
8931                 old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
8932                 ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
8933                                             ctx->rsrc_node, old_file);
8934                 if (ret)
8935                         goto err;
8936                 file_slot->file_ptr = 0;
8937                 needs_switch = true;
8938         }
8939
8940         ret = io_scm_file_account(ctx, file);
8941         if (!ret) {
8942                 *io_get_tag_slot(ctx->file_data, slot_index) = 0;
8943                 io_fixed_file_set(file_slot, file);
8944         }
8945 err:
8946         if (needs_switch)
8947                 io_rsrc_node_switch(ctx, ctx->file_data);
8948         io_ring_submit_unlock(ctx, issue_flags);
8949         if (ret)
8950                 fput(file);
8951         return ret;
8952 }
8953
8954 static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
8955 {
8956         unsigned int offset = req->close.file_slot - 1;
8957         struct io_ring_ctx *ctx = req->ctx;
8958         struct io_fixed_file *file_slot;
8959         struct file *file;
8960         int ret;
8961
8962         io_ring_submit_lock(ctx, issue_flags);
8963         ret = -ENXIO;
8964         if (unlikely(!ctx->file_data))
8965                 goto out;
8966         ret = -EINVAL;
8967         if (offset >= ctx->nr_user_files)
8968                 goto out;
8969         ret = io_rsrc_node_switch_start(ctx);
8970         if (ret)
8971                 goto out;
8972
8973         offset = array_index_nospec(offset, ctx->nr_user_files);
8974         file_slot = io_fixed_file_slot(&ctx->file_table, offset);
8975         ret = -EBADF;
8976         if (!file_slot->file_ptr)
8977                 goto out;
8978
8979         file = (struct file *)(file_slot->file_ptr & FFS_MASK);
8980         ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
8981         if (ret)
8982                 goto out;
8983
8984         file_slot->file_ptr = 0;
8985         io_rsrc_node_switch(ctx, ctx->file_data);
8986         ret = 0;
8987 out:
8988         io_ring_submit_unlock(ctx, issue_flags);
8989         return ret;
8990 }
8991
8992 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
8993                                  struct io_uring_rsrc_update2 *up,
8994                                  unsigned nr_args)
8995 {
8996         u64 __user *tags = u64_to_user_ptr(up->tags);
8997         __s32 __user *fds = u64_to_user_ptr(up->data);
8998         struct io_rsrc_data *data = ctx->file_data;
8999         struct io_fixed_file *file_slot;
9000         struct file *file;
9001         int fd, i, err = 0;
9002         unsigned int done;
9003         bool needs_switch = false;
9004
9005         if (!ctx->file_data)
9006                 return -ENXIO;
9007         if (up->offset + nr_args > ctx->nr_user_files)
9008                 return -EINVAL;
9009
9010         for (done = 0; done < nr_args; done++) {
9011                 u64 tag = 0;
9012
9013                 if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
9014                     copy_from_user(&fd, &fds[done], sizeof(fd))) {
9015                         err = -EFAULT;
9016                         break;
9017                 }
9018                 if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
9019                         err = -EINVAL;
9020                         break;
9021                 }
9022                 if (fd == IORING_REGISTER_FILES_SKIP)
9023                         continue;
9024
9025                 i = array_index_nospec(up->offset + done, ctx->nr_user_files);
9026                 file_slot = io_fixed_file_slot(&ctx->file_table, i);
9027
9028                 if (file_slot->file_ptr) {
9029                         file = (struct file *)(file_slot->file_ptr & FFS_MASK);
9030                         err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
9031                         if (err)
9032                                 break;
9033                         file_slot->file_ptr = 0;
9034                         needs_switch = true;
9035                 }
9036                 if (fd != -1) {
9037                         file = fget(fd);
9038                         if (!file) {
9039                                 err = -EBADF;
9040                                 break;
9041                         }
9042                         /*
9043                          * Don't allow io_uring instances to be registered. If
9044                          * UNIX isn't enabled, then this causes a reference
9045                          * cycle and this instance can never get freed. If UNIX
9046                          * is enabled we'll handle it just fine, but there's
9047                          * still no point in allowing a ring fd as it doesn't
9048                          * support regular read/write anyway.
9049                          */
9050                         if (file->f_op == &io_uring_fops) {
9051                                 fput(file);
9052                                 err = -EBADF;
9053                                 break;
9054                         }
9055                         err = io_scm_file_account(ctx, file);
9056                         if (err) {
9057                                 fput(file);
9058                                 break;
9059                         }
9060                         *io_get_tag_slot(data, i) = tag;
9061                         io_fixed_file_set(file_slot, file);
9062                 }
9063         }
9064
9065         if (needs_switch)
9066                 io_rsrc_node_switch(ctx, data);
9067         return done ? done : err;
9068 }
9069
9070 static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
9071                                         struct task_struct *task)
9072 {
9073         struct io_wq_hash *hash;
9074         struct io_wq_data data;
9075         unsigned int concurrency;
9076
9077         mutex_lock(&ctx->uring_lock);
9078         hash = ctx->hash_map;
9079         if (!hash) {
9080                 hash = kzalloc(sizeof(*hash), GFP_KERNEL);
9081                 if (!hash) {
9082                         mutex_unlock(&ctx->uring_lock);
9083                         return ERR_PTR(-ENOMEM);
9084                 }
9085                 refcount_set(&hash->refs, 1);
9086                 init_waitqueue_head(&hash->wait);
9087                 ctx->hash_map = hash;
9088         }
9089         mutex_unlock(&ctx->uring_lock);
9090
9091         data.hash = hash;
9092         data.task = task;
9093         data.free_work = io_wq_free_work;
9094         data.do_work = io_wq_submit_work;
9095
9096         /* Do QD, or 4 * CPUS, whatever is smallest */
9097         concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
9098
9099         return io_wq_create(concurrency, &data);
9100 }
9101
9102 static __cold int io_uring_alloc_task_context(struct task_struct *task,
9103                                               struct io_ring_ctx *ctx)
9104 {
9105         struct io_uring_task *tctx;
9106         int ret;
9107
9108         tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
9109         if (unlikely(!tctx))
9110                 return -ENOMEM;
9111
9112         tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX,
9113                                          sizeof(struct file *), GFP_KERNEL);
9114         if (unlikely(!tctx->registered_rings)) {
9115                 kfree(tctx);
9116                 return -ENOMEM;
9117         }
9118
9119         ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
9120         if (unlikely(ret)) {
9121                 kfree(tctx->registered_rings);
9122                 kfree(tctx);
9123                 return ret;
9124         }
9125
9126         tctx->io_wq = io_init_wq_offload(ctx, task);
9127         if (IS_ERR(tctx->io_wq)) {
9128                 ret = PTR_ERR(tctx->io_wq);
9129                 percpu_counter_destroy(&tctx->inflight);
9130                 kfree(tctx->registered_rings);
9131                 kfree(tctx);
9132                 return ret;
9133         }
9134
9135         xa_init(&tctx->xa);
9136         init_waitqueue_head(&tctx->wait);
9137         atomic_set(&tctx->in_idle, 0);
9138         task->io_uring = tctx;
9139         spin_lock_init(&tctx->task_lock);
9140         INIT_WQ_LIST(&tctx->task_list);
9141         INIT_WQ_LIST(&tctx->prior_task_list);
9142         init_task_work(&tctx->task_work, tctx_task_work);
9143         return 0;
9144 }
9145
9146 void __io_uring_free(struct task_struct *tsk)
9147 {
9148         struct io_uring_task *tctx = tsk->io_uring;
9149
9150         WARN_ON_ONCE(!xa_empty(&tctx->xa));
9151         WARN_ON_ONCE(tctx->io_wq);
9152         WARN_ON_ONCE(tctx->cached_refs);
9153
9154         kfree(tctx->registered_rings);
9155         percpu_counter_destroy(&tctx->inflight);
9156         kfree(tctx);
9157         tsk->io_uring = NULL;
9158 }
9159
/*
 * Set up submission-side offload for a new ring according to ctx->flags and
 * the user supplied parameters @p.
 *
 *  - IORING_SETUP_ATTACH_WQ without IORING_SETUP_SQPOLL: only checks that
 *    p->wq_fd names an io_uring file (-ENXIO if the fd is invalid, -EINVAL
 *    if it is some other kind of file).
 *  - IORING_SETUP_SQPOLL: obtains the SQ poll data through io_get_sq_data(),
 *    links the ctx into it and, unless it attached to existing data, creates
 *    and wakes the poll thread.
 *  - IORING_SETUP_SQ_AFF without SQPOLL is rejected with -EINVAL.
 *
 * Returns 0 on success or a negative errno. Every failure taken through the
 * err/err_sqpoll labels calls io_sq_thread_finish() before returning.
 */
9160 static __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
9161                                        struct io_uring_params *p)
9162 {
9163         int ret;
9164
9165         /* Retain compatibility with failing for an invalid attach attempt */
9166         if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
9167                                 IORING_SETUP_ATTACH_WQ) {
9168                 struct fd f;
9169
                     /* the fd is only inspected; its reference is dropped on every path */
9170                 f = fdget(p->wq_fd);
9171                 if (!f.file)
9172                         return -ENXIO;
9173                 if (f.file->f_op != &io_uring_fops) {
9174                         fdput(f);
9175                         return -EINVAL;
9176                 }
9177                 fdput(f);
9178         }
9179         if (ctx->flags & IORING_SETUP_SQPOLL) {
9180                 struct task_struct *tsk;
9181                 struct io_sq_data *sqd;
9182                 bool attached;
9183
9184                 ret = security_uring_sqpoll();
9185                 if (ret)
9186                         return ret;
9187
                     /* @attached is filled in by io_get_sq_data() */
9188                 sqd = io_get_sq_data(p, &attached);
9189                 if (IS_ERR(sqd)) {
9190                         ret = PTR_ERR(sqd);
9191                         goto err;
9192                 }
9193
9194                 ctx->sq_creds = get_current_cred();
9195                 ctx->sq_data = sqd;
9196                 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
                     /* an idle time of zero falls back to HZ jiffies */
9197                 if (!ctx->sq_thread_idle)
9198                         ctx->sq_thread_idle = HZ;
9199
                     /* link this ctx into the sqd between park and unpark */
9200                 io_sq_thread_park(sqd);
9201                 list_add(&ctx->sqd_list, &sqd->ctx_list);
9202                 io_sqd_update_thread_idle(sqd);
9203                 /* don't attach to a dying SQPOLL thread, would be racy */
9204                 ret = (attached && !sqd->thread) ? -ENXIO : 0;
9205                 io_sq_thread_unpark(sqd);
9206
9207                 if (ret < 0)
9208                         goto err;
                     /* attached to existing sq data: no new thread is created here */
9209                 if (attached)
9210                         return 0;
9211
9212                 if (p->flags & IORING_SETUP_SQ_AFF) {
9213                         int cpu = p->sq_thread_cpu;
9214
                             /* requested CPU must be in range and online */
9215                         ret = -EINVAL;
9216                         if (cpu >= nr_cpu_ids || !cpu_online(cpu))
9217                                 goto err_sqpoll;
9218                         sqd->sq_cpu = cpu;
9219                 } else {
9220                         sqd->sq_cpu = -1;
9221                 }
9222
9223                 sqd->task_pid = current->pid;
9224                 sqd->task_tgid = current->tgid;
9225                 tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
9226                 if (IS_ERR(tsk)) {
9227                         ret = PTR_ERR(tsk);
9228                         goto err_sqpoll;
9229                 }
9230
                     /*
                      * The task context is allocated before the new thread is
                      * woken; the wake-up happens whether or not that
                      * allocation succeeded, and a failure is handled afterwards.
                      */
9231                 sqd->thread = tsk;
9232                 ret = io_uring_alloc_task_context(tsk, ctx);
9233                 wake_up_new_task(tsk);
9234                 if (ret)
9235                         goto err;
9236         } else if (p->flags & IORING_SETUP_SQ_AFF) {
9237                 /* Can't have SQ_AFF without SQPOLL */
9238                 ret = -EINVAL;
9239                 goto err;
9240         }
9241
9242         return 0;
     /* no poll thread was started: signal ->exited here, then fall through to err */
9243 err_sqpoll:
9244         complete(&ctx->sq_data->exited);
9245 err:
9246         io_sq_thread_finish(ctx);
9247         return ret;
9248 }
9249
9250 static inline void __io_unaccount_mem(struct user_struct *user,
9251                                       unsigned long nr_pages)
9252 {
9253         atomic_long_sub(nr_pages, &user->locked_vm);
9254 }
9255
9256 static inline int __io_account_mem(struct user_struct *user,
9257                                    unsigned long nr_pages)
9258 {
9259         unsigned long page_limit, cur_pages, new_pages;
9260
9261         /* Don't allow more pages than we can safely lock */
9262         page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
9263
9264         do {
9265                 cur_pages = atomic_long_read(&user->locked_vm);
9266                 new_pages = cur_pages + nr_pages;
9267                 if (new_pages > page_limit)
9268                         return -ENOMEM;
9269         } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
9270                                         new_pages) != cur_pages);
9271
9272         return 0;
9273 }
9274
9275 static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
9276 {
9277         if (ctx->user)
9278                 __io_unaccount_mem(ctx->user, nr_pages);
9279
9280         if (ctx->mm_account)
9281                 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
9282 }
9283
9284 static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
9285 {
9286         int ret;
9287
9288         if (ctx->user) {
9289                 ret = __io_account_mem(ctx->user, nr_pages);
9290                 if (ret)
9291                         return ret;
9292         }
9293
9294         if (ctx->mm_account)
9295                 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
9296
9297         return 0;
9298 }
9299
/*
 * Drop one reference on the (compound) page backing @ptr and free the
 * compound page once that was the last reference. A NULL @ptr is ignored.
 */
static void io_mem_free(void *ptr)
{
	struct page *head;

	if (ptr) {
		head = virt_to_head_page(ptr);
		if (put_page_testzero(head))
			free_compound_page(head);
	}
}
9311
9312 static void *io_mem_alloc(size_t size)
9313 {
9314         gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
9315
9316         return (void *) __get_free_pages(gfp, get_order(size));
9317 }
9318
9319 static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
9320                                 size_t *sq_offset)
9321 {
9322         struct io_rings *rings;
9323         size_t off, sq_array_size;
9324
9325         off = struct_size(rings, cqes, cq_entries);
9326         if (off == SIZE_MAX)
9327                 return SIZE_MAX;
9328
9329 #ifdef CONFIG_SMP
9330         off = ALIGN(off, SMP_CACHE_BYTES);
9331         if (off == 0)
9332                 return SIZE_MAX;
9333 #endif
9334
9335         if (sq_offset)
9336                 *sq_offset = off;
9337
9338         sq_array_size = array_size(sizeof(u32), sq_entries);
9339         if (sq_array_size == SIZE_MAX)
9340                 return SIZE_MAX;
9341
9342         if (check_add_overflow(off, sq_array_size, &off))
9343                 return SIZE_MAX;
9344
9345         return off;
9346 }
9347
9348 static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
9349 {
9350         struct io_mapped_ubuf *imu = *slot;
9351         unsigned int i;
9352
9353         if (imu != ctx->dummy_ubuf) {
9354                 for (i = 0; i < imu->nr_bvecs; i++)
9355                         unpin_user_page(imu->bvec[i].bv_page);
9356                 if (imu->acct_pages)
9357                         io_unaccount_mem(ctx, imu->acct_pages);
9358                 kvfree(imu);
9359         }
9360         *slot = NULL;
9361 }
9362
9363 static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
9364 {
9365         io_buffer_unmap(ctx, &prsrc->buf);
9366         prsrc->buf = NULL;
9367 }
9368
9369 static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
9370 {
9371         unsigned int i;
9372
9373         for (i = 0; i < ctx->nr_user_bufs; i++)
9374                 io_buffer_unmap(ctx, &ctx->user_bufs[i]);
9375         kfree(ctx->user_bufs);
9376         io_rsrc_data_free(ctx->buf_data);
9377         ctx->user_bufs = NULL;
9378         ctx->buf_data = NULL;
9379         ctx->nr_user_bufs = 0;
9380 }
9381
9382 static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
9383 {
9384         int ret;
9385
9386         if (!ctx->buf_data)
9387                 return -ENXIO;
9388
9389         ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
9390         if (!ret)
9391                 __io_sqe_buffers_unregister(ctx);
9392         return ret;
9393 }
9394
9395 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
9396                        void __user *arg, unsigned index)
9397 {
9398         struct iovec __user *src;
9399
9400 #ifdef CONFIG_COMPAT
9401         if (ctx->compat) {
9402                 struct compat_iovec __user *ciovs;
9403                 struct compat_iovec ciov;
9404
9405                 ciovs = (struct compat_iovec __user *) arg;
9406                 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
9407                         return -EFAULT;
9408
9409                 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
9410                 dst->iov_len = ciov.iov_len;
9411                 return 0;
9412         }
9413 #endif
9414         src = (struct iovec __user *) arg;
9415         if (copy_from_user(dst, &src[index], sizeof(*dst)))
9416                 return -EFAULT;
9417         return 0;
9418 }
9419
9420 /*
9421  * Not super efficient, but this is just a registration time. And we do cache
9422  * the last compound head, so generally we'll only do a full search if we don't
9423  * match that one.
9424  *
9425  * We check if the given compound head page has already been accounted, to
9426  * avoid double accounting it. This allows us to account the full size of the
9427  * page, not just the constituent pages of a huge page.
9428  */
9429 static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
9430                                   int nr_pages, struct page *hpage)
9431 {
9432         int i, j;
9433
9434         /* check current page array */
9435         for (i = 0; i < nr_pages; i++) {
9436                 if (!PageCompound(pages[i]))
9437                         continue;
9438                 if (compound_head(pages[i]) == hpage)
9439                         return true;
9440         }
9441
9442         /* check previously registered pages */
9443         for (i = 0; i < ctx->nr_user_bufs; i++) {
9444                 struct io_mapped_ubuf *imu = ctx->user_bufs[i];
9445
9446                 for (j = 0; j < imu->nr_bvecs; j++) {
9447                         if (!PageCompound(imu->bvec[j].bv_page))
9448                                 continue;
9449                         if (compound_head(imu->bvec[j].bv_page) == hpage)
9450                                 return true;
9451                 }
9452         }
9453
9454         return false;
9455 }
9456
9457 static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
9458                                  int nr_pages, struct io_mapped_ubuf *imu,
9459                                  struct page **last_hpage)
9460 {
9461         int i, ret;
9462
9463         imu->acct_pages = 0;
9464         for (i = 0; i < nr_pages; i++) {
9465                 if (!PageCompound(pages[i])) {
9466                         imu->acct_pages++;
9467                 } else {
9468                         struct page *hpage;
9469
9470                         hpage = compound_head(pages[i]);
9471                         if (hpage == *last_hpage)
9472                                 continue;
9473                         *last_hpage = hpage;
9474                         if (headpage_already_acct(ctx, pages, i, hpage))
9475                                 continue;
9476                         imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
9477                 }
9478         }
9479
9480         if (!imu->acct_pages)
9481                 return 0;
9482
9483         ret = io_account_mem(ctx, imu->acct_pages);
9484         if (ret)
9485                 imu->acct_pages = 0;
9486         return ret;
9487 }
9488
9489 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
9490                                   struct io_mapped_ubuf **pimu,
9491                                   struct page **last_hpage)
9492 {
9493         struct io_mapped_ubuf *imu = NULL;
9494         struct vm_area_struct **vmas = NULL;
9495         struct page **pages = NULL;
9496         unsigned long off, start, end, ubuf;
9497         size_t size;
9498         int ret, pret, nr_pages, i;
9499
9500         if (!iov->iov_base) {
9501                 *pimu = ctx->dummy_ubuf;
9502                 return 0;
9503         }
9504
9505         ubuf = (unsigned long) iov->iov_base;
9506         end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
9507         start = ubuf >> PAGE_SHIFT;
9508         nr_pages = end - start;
9509
9510         *pimu = NULL;
9511         ret = -ENOMEM;
9512
9513         pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
9514         if (!pages)
9515                 goto done;
9516
9517         vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
9518                               GFP_KERNEL);
9519         if (!vmas)
9520                 goto done;
9521
9522         imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
9523         if (!imu)
9524                 goto done;
9525
9526         ret = 0;
9527         mmap_read_lock(current->mm);
9528         pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
9529                               pages, vmas);
9530         if (pret == nr_pages) {
9531                 /* don't support file backed memory */
9532                 for (i = 0; i < nr_pages; i++) {
9533                         struct vm_area_struct *vma = vmas[i];
9534
9535                         if (vma_is_shmem(vma))
9536                                 continue;
9537                         if (vma->vm_file &&
9538                             !is_file_hugepages(vma->vm_file)) {
9539                                 ret = -EOPNOTSUPP;
9540                                 break;
9541                         }
9542                 }
9543         } else {
9544                 ret = pret < 0 ? pret : -EFAULT;
9545         }
9546         mmap_read_unlock(current->mm);
9547         if (ret) {
9548                 /*
9549                  * if we did partial map, or found file backed vmas,
9550                  * release any pages we did get
9551                  */
9552                 if (pret > 0)
9553                         unpin_user_pages(pages, pret);
9554                 goto done;
9555         }
9556
9557         ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
9558         if (ret) {
9559                 unpin_user_pages(pages, pret);
9560                 goto done;
9561         }
9562
9563         off = ubuf & ~PAGE_MASK;
9564         size = iov->iov_len;
9565         for (i = 0; i < nr_pages; i++) {
9566                 size_t vec_len;
9567
9568                 vec_len = min_t(size_t, size, PAGE_SIZE - off);
9569                 imu->bvec[i].bv_page = pages[i];
9570                 imu->bvec[i].bv_len = vec_len;
9571                 imu->bvec[i].bv_offset = off;
9572                 off = 0;
9573                 size -= vec_len;
9574         }
9575         /* store original address for later verification */
9576         imu->ubuf = ubuf;
9577         imu->ubuf_end = ubuf + iov->iov_len;
9578         imu->nr_bvecs = nr_pages;
9579         *pimu = imu;
9580         ret = 0;
9581 done:
9582         if (ret)
9583                 kvfree(imu);
9584         kvfree(pages);
9585         kvfree(vmas);
9586         return ret;
9587 }
9588
9589 static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
9590 {
9591         ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
9592         return ctx->user_bufs ? 0 : -ENOMEM;
9593 }
9594
9595 static int io_buffer_validate(struct iovec *iov)
9596 {
9597         unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
9598
9599         /*
9600          * Don't impose further limits on the size and buffer
9601          * constraints here, we'll -EINVAL later when IO is
9602          * submitted if they are wrong.
9603          */
9604         if (!iov->iov_base)
9605                 return iov->iov_len ? -EFAULT : 0;
9606         if (!iov->iov_len)
9607                 return -EFAULT;
9608
9609         /* arbitrary limit, but we need something */
9610         if (iov->iov_len > SZ_1G)
9611                 return -EFAULT;
9612
9613         if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
9614                 return -EOVERFLOW;
9615
9616         return 0;
9617 }
9618
9619 static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
9620                                    unsigned int nr_args, u64 __user *tags)
9621 {
9622         struct page *last_hpage = NULL;
9623         struct io_rsrc_data *data;
9624         int i, ret;
9625         struct iovec iov;
9626
9627         if (ctx->user_bufs)
9628                 return -EBUSY;
9629         if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
9630                 return -EINVAL;
9631         ret = io_rsrc_node_switch_start(ctx);
9632         if (ret)
9633                 return ret;
9634         ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
9635         if (ret)
9636                 return ret;
9637         ret = io_buffers_map_alloc(ctx, nr_args);
9638         if (ret) {
9639                 io_rsrc_data_free(data);
9640                 return ret;
9641         }
9642
9643         for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
9644                 ret = io_copy_iov(ctx, &iov, arg, i);
9645                 if (ret)
9646                         break;
9647                 ret = io_buffer_validate(&iov);
9648                 if (ret)
9649                         break;
9650                 if (!iov.iov_base && *io_get_tag_slot(data, i)) {
9651                         ret = -EINVAL;
9652                         break;
9653                 }
9654
9655                 ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
9656                                              &last_hpage);
9657                 if (ret)
9658                         break;
9659         }
9660
9661         WARN_ON_ONCE(ctx->buf_data);
9662
9663         ctx->buf_data = data;
9664         if (ret)
9665                 __io_sqe_buffers_unregister(ctx);
9666         else
9667                 io_rsrc_node_switch(ctx, NULL);
9668         return ret;
9669 }
9670
9671 static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
9672                                    struct io_uring_rsrc_update2 *up,
9673                                    unsigned int nr_args)
9674 {
9675         u64 __user *tags = u64_to_user_ptr(up->tags);
9676         struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
9677         struct page *last_hpage = NULL;
9678         bool needs_switch = false;
9679         __u32 done;
9680         int i, err;
9681
9682         if (!ctx->buf_data)
9683                 return -ENXIO;
9684         if (up->offset + nr_args > ctx->nr_user_bufs)
9685                 return -EINVAL;
9686
9687         for (done = 0; done < nr_args; done++) {
9688                 struct io_mapped_ubuf *imu;
9689                 int offset = up->offset + done;
9690                 u64 tag = 0;
9691
9692                 err = io_copy_iov(ctx, &iov, iovs, done);
9693                 if (err)
9694                         break;
9695                 if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
9696                         err = -EFAULT;
9697                         break;
9698                 }
9699                 err = io_buffer_validate(&iov);
9700                 if (err)
9701                         break;
9702                 if (!iov.iov_base && tag) {
9703                         err = -EINVAL;
9704                         break;
9705                 }
9706                 err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
9707                 if (err)
9708                         break;
9709
9710                 i = array_index_nospec(offset, ctx->nr_user_bufs);
9711                 if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
9712                         err = io_queue_rsrc_removal(ctx->buf_data, i,
9713                                                     ctx->rsrc_node, ctx->user_bufs[i]);
9714                         if (unlikely(err)) {
9715                                 io_buffer_unmap(ctx, &imu);
9716                                 break;
9717                         }
9718                         ctx->user_bufs[i] = NULL;
9719                         needs_switch = true;
9720                 }
9721
9722                 ctx->user_bufs[i] = imu;
9723                 *io_get_tag_slot(ctx->buf_data, offset) = tag;
9724         }
9725
9726         if (needs_switch)
9727                 io_rsrc_node_switch(ctx, ctx->buf_data);
9728         return done ? done : err;
9729 }
9730
9731 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
9732                                unsigned int eventfd_async)
9733 {
9734         struct io_ev_fd *ev_fd;
9735         __s32 __user *fds = arg;
9736         int fd;
9737
9738         ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
9739                                         lockdep_is_held(&ctx->uring_lock));
9740         if (ev_fd)
9741                 return -EBUSY;
9742
9743         if (copy_from_user(&fd, fds, sizeof(*fds)))
9744                 return -EFAULT;
9745
9746         ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
9747         if (!ev_fd)
9748                 return -ENOMEM;
9749
9750         ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
9751         if (IS_ERR(ev_fd->cq_ev_fd)) {
9752                 int ret = PTR_ERR(ev_fd->cq_ev_fd);
9753                 kfree(ev_fd);
9754                 return ret;
9755         }
9756         ev_fd->eventfd_async = eventfd_async;
9757         ctx->has_evfd = true;
9758         rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
9759         return 0;
9760 }
9761
9762 static void io_eventfd_put(struct rcu_head *rcu)
9763 {
9764         struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
9765
9766         eventfd_ctx_put(ev_fd->cq_ev_fd);
9767         kfree(ev_fd);
9768 }
9769
9770 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
9771 {
9772         struct io_ev_fd *ev_fd;
9773
9774         ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
9775                                         lockdep_is_held(&ctx->uring_lock));
9776         if (ev_fd) {
9777                 ctx->has_evfd = false;
9778                 rcu_assign_pointer(ctx->io_ev_fd, NULL);
9779                 call_rcu(&ev_fd->rcu, io_eventfd_put);
9780                 return 0;
9781         }
9782
9783         return -ENXIO;
9784 }
9785
9786 static void io_destroy_buffers(struct io_ring_ctx *ctx)
9787 {
9788         int i;
9789
9790         for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) {
9791                 struct list_head *list = &ctx->io_buffers[i];
9792
9793                 while (!list_empty(list)) {
9794                         struct io_buffer_list *bl;
9795
9796                         bl = list_first_entry(list, struct io_buffer_list, list);
9797                         __io_remove_buffers(ctx, bl, -1U);
9798                         list_del(&bl->list);
9799                         kfree(bl);
9800                 }
9801         }
9802
9803         while (!list_empty(&ctx->io_buffers_pages)) {
9804                 struct page *page;
9805
9806                 page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
9807                 list_del_init(&page->lru);
9808                 __free_page(page);
9809         }
9810 }
9811
9812 static void io_req_caches_free(struct io_ring_ctx *ctx)
9813 {
9814         struct io_submit_state *state = &ctx->submit_state;
9815         int nr = 0;
9816
9817         mutex_lock(&ctx->uring_lock);
9818         io_flush_cached_locked_reqs(ctx, state);
9819
9820         while (state->free_list.next) {
9821                 struct io_wq_work_node *node;
9822                 struct io_kiocb *req;
9823
9824                 node = wq_stack_extract(&state->free_list);
9825                 req = container_of(node, struct io_kiocb, comp_list);
9826                 kmem_cache_free(req_cachep, req);
9827                 nr++;
9828         }
9829         if (nr)
9830                 percpu_ref_put_many(&ctx->refs, nr);
9831         mutex_unlock(&ctx->uring_lock);
9832 }
9833
9834 static void io_wait_rsrc_data(struct io_rsrc_data *data)
9835 {
9836         if (data && !atomic_dec_and_test(&data->refs))
9837                 wait_for_completion(&data->done);
9838 }
9839
9840 static void io_flush_apoll_cache(struct io_ring_ctx *ctx)
9841 {
9842         struct async_poll *apoll;
9843
9844         while (!list_empty(&ctx->apoll_cache)) {
9845                 apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
9846                                                 poll.wait.entry);
9847                 list_del(&apoll->poll.wait.entry);
9848                 kfree(apoll);
9849         }
9850 }
9851
/*
 * Final teardown of a ring context; @ctx itself is freed at the end.
 *
 * The steps run in this order: finish the SQ thread, drop the accounted mm,
 * drop rsrc refs and wait for the buffer/file rsrc data without uring_lock,
 * then under uring_lock unregister buffers and files, flush CQ overflow,
 * unregister the eventfd and flush the apoll cache. After that the
 * remaining buffers, credentials, rsrc nodes, delayed work, ring socket,
 * ring memory, references and allocations are released.
 */
9852 static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
9853 {
9854         io_sq_thread_finish(ctx);
9855
             /* drop the mm used for pinned_vm accounting, if one was recorded */
9856         if (ctx->mm_account) {
9857                 mmdrop(ctx->mm_account);
9858                 ctx->mm_account = NULL;
9859         }
9860
9861         io_rsrc_refs_drop(ctx);
9862         /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
9863         io_wait_rsrc_data(ctx->buf_data);
9864         io_wait_rsrc_data(ctx->file_data);
9865
             /* the unregister helpers below run with uring_lock held */
9866         mutex_lock(&ctx->uring_lock);
9867         if (ctx->buf_data)
9868                 __io_sqe_buffers_unregister(ctx);
9869         if (ctx->file_data)
9870                 __io_sqe_files_unregister(ctx);
9871         if (ctx->rings)
9872                 __io_cqring_overflow_flush(ctx, true);
             /* return value ignored: -ENXIO just means no eventfd was registered */
9873         io_eventfd_unregister(ctx);
9874         io_flush_apoll_cache(ctx);
9875         mutex_unlock(&ctx->uring_lock);
9876         io_destroy_buffers(ctx);
9877         if (ctx->sq_creds)
9878                 put_cred(ctx->sq_creds);
9879
9880         /* there are no registered resources left, nobody uses it */
9881         if (ctx->rsrc_node)
9882                 io_rsrc_node_destroy(ctx->rsrc_node);
9883         if (ctx->rsrc_backup_node)
9884                 io_rsrc_node_destroy(ctx->rsrc_backup_node);
9885         flush_delayed_work(&ctx->rsrc_put_work);
9886         flush_delayed_work(&ctx->fallback_work);
9887
             /* after the flushes both rsrc lists are expected to be empty */
9888         WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
9889         WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
9890
9891 #if defined(CONFIG_UNIX)
9892         if (ctx->ring_sock) {
9893                 ctx->ring_sock->file = NULL; /* so that iput() is called */
9894                 sock_release(ctx->ring_sock);
9895         }
9896 #endif
9897         WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
9898
             /* io_mem_free() ignores NULL, so unset rings/sq_sqes are fine here */
9899         io_mem_free(ctx->rings);
9900         io_mem_free(ctx->sq_sqes);
9901
9902         percpu_ref_exit(&ctx->refs);
9903         free_uid(ctx->user);
9904         io_req_caches_free(ctx);
9905         if (ctx->hash_map)
9906                 io_wq_put_hash(ctx->hash_map);
9907         kfree(ctx->cancel_hash);
9908         kfree(ctx->dummy_ubuf);
9909         kfree(ctx->io_buffers);
9910         kfree(ctx);
9911 }
9912
9913 static __poll_t io_uring_poll(struct file *file, poll_table *wait)
9914 {
9915         struct io_ring_ctx *ctx = file->private_data;
9916         __poll_t mask = 0;
9917
9918         poll_wait(file, &ctx->cq_wait, wait);
9919         /*
9920          * synchronizes with barrier from wq_has_sleeper call in
9921          * io_commit_cqring
9922          */
9923         smp_rmb();
9924         if (!io_sqring_full(ctx))
9925                 mask |= EPOLLOUT | EPOLLWRNORM;
9926
9927         /*
9928          * Don't flush cqring overflow list here, just do a simple check.
9929          * Otherwise there could possible be ABBA deadlock:
9930          *      CPU0                    CPU1
9931          *      ----                    ----
9932          * lock(&ctx->uring_lock);
9933          *                              lock(&ep->mtx);
9934          *                              lock(&ctx->uring_lock);
9935          * lock(&ep->mtx);
9936          *
9937          * Users may get EPOLLIN meanwhile seeing nothing in cqring, this
9938          * pushs them to do the flush.
9939          */
9940         if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow))
9941                 mask |= EPOLLIN | EPOLLRDNORM;
9942
9943         return mask;
9944 }
9945
9946 static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
9947 {
9948         const struct cred *creds;
9949
9950         creds = xa_erase(&ctx->personalities, id);
9951         if (creds) {
9952                 put_cred(creds);
9953                 return 0;
9954         }
9955
9956         return -EINVAL;
9957 }
9958
/*
 * Argument block for io_tctx_exit_cb(). io_ring_exit_work() fills one in on
 * its stack, queues ->task_work on a task and sleeps on ->completion.
 */
struct io_tctx_exit {
        /* task_work entry; the callback recovers this struct via container_of() */
        struct callback_head            task_work;
        /* completed by the callback once it has run */
        struct completion               completion;
        /* ring whose tctx node the target task should drop */
        struct io_ring_ctx              *ctx;
};
9964
9965 static __cold void io_tctx_exit_cb(struct callback_head *cb)
9966 {
9967         struct io_uring_task *tctx = current->io_uring;
9968         struct io_tctx_exit *work;
9969
9970         work = container_of(cb, struct io_tctx_exit, task_work);
9971         /*
9972          * When @in_idle, we're in cancellation and it's racy to remove the
9973          * node. It'll be removed by the end of cancellation, just ignore it.
9974          */
9975         if (!atomic_read(&tctx->in_idle))
9976                 io_uring_del_tctx_node((unsigned long)work->ctx);
9977         complete(&work->completion);
9978 }
9979
9980 static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
9981 {
9982         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
9983
9984         return req->ctx == data;
9985 }
9986
/*
 * Work item that finishes tearing down a ring; queued by
 * io_ring_ctx_wait_and_kill().
 *
 * Phase 1: keep cancelling requests until ctx->ref_comp completes.
 * Phase 2: for every node left on ctx->tctx_list, queue io_tctx_exit_cb() on
 *          the owning task and wait for it to run.
 * Phase 3: free the ctx.
 */
static __cold void io_ring_exit_work(struct work_struct *work)
{
        struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
        /* five minutes from now; once passed we warn and back off */
        unsigned long timeout = jiffies + HZ * 60 * 5;
        /* how long to wait on ->ref_comp between cancel attempts */
        unsigned long interval = HZ / 20;
        struct io_tctx_exit exit;
        struct io_tctx_node *node;
        int ret;

        /*
         * If we're doing polled IO and end up having requests being
         * submitted async (out-of-line), then completions can come in while
         * we're waiting for refs to drop. We need to reap these manually,
         * as nobody else will be looking for them.
         */
        do {
                io_uring_try_cancel_requests(ctx, NULL, true);
                if (ctx->sq_data) {
                        struct io_sq_data *sqd = ctx->sq_data;
                        struct task_struct *tsk;

                        /* sqd->thread is only looked at while the SQ thread is parked */
                        io_sq_thread_park(sqd);
                        tsk = sqd->thread;
                        if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
                                io_wq_cancel_cb(tsk->io_uring->io_wq,
                                                io_cancel_ctx_cb, ctx, true);
                        io_sq_thread_unpark(sqd);
                }

                io_req_caches_free(ctx);

                if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
                        /* there is little hope left, don't run it too often */
                        interval = HZ * 60;
                }
        } while (!wait_for_completion_timeout(&ctx->ref_comp, interval));

        init_completion(&exit.completion);
        init_task_work(&exit.task_work, io_tctx_exit_cb);
        exit.ctx = ctx;
        /*
         * Some may use context even when all refs and requests have been put,
         * and they are free to do so while still holding uring_lock or
         * completion_lock, see io_req_task_submit(). Apart from other work,
         * this lock/unlock section also waits them to finish.
         */
        mutex_lock(&ctx->uring_lock);
        while (!list_empty(&ctx->tctx_list)) {
                WARN_ON_ONCE(time_after(jiffies, timeout));

                node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
                                        ctx_node);
                /* don't spin on a single task if cancellation failed */
                list_rotate_left(&ctx->tctx_list);
                ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
                if (WARN_ON_ONCE(ret))
                        continue;

                /*
                 * Drop uring_lock while waiting: io_uring_del_tctx_node(),
                 * called from the callback, takes it to unlink the node.
                 */
                mutex_unlock(&ctx->uring_lock);
                wait_for_completion(&exit.completion);
                mutex_lock(&ctx->uring_lock);
        }
        mutex_unlock(&ctx->uring_lock);
        /* empty critical section: wait out any current completion_lock holder */
        spin_lock(&ctx->completion_lock);
        spin_unlock(&ctx->completion_lock);

        io_ring_ctx_free(ctx);
}
10055
10056 /* Returns true if we found and killed one or more timeouts */
10057 static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
10058                                     struct task_struct *tsk, bool cancel_all)
10059 {
10060         struct io_kiocb *req, *tmp;
10061         int canceled = 0;
10062
10063         spin_lock(&ctx->completion_lock);
10064         spin_lock_irq(&ctx->timeout_lock);
10065         list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
10066                 if (io_match_task(req, tsk, cancel_all)) {
10067                         io_kill_timeout(req, -ECANCELED);
10068                         canceled++;
10069                 }
10070         }
10071         spin_unlock_irq(&ctx->timeout_lock);
10072         io_commit_cqring(ctx);
10073         spin_unlock(&ctx->completion_lock);
10074         if (canceled != 0)
10075                 io_cqring_ev_posted(ctx);
10076         return canceled != 0;
10077 }
10078
10079 static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
10080 {
10081         unsigned long index;
10082         struct creds *creds;
10083
10084         mutex_lock(&ctx->uring_lock);
10085         percpu_ref_kill(&ctx->refs);
10086         if (ctx->rings)
10087                 __io_cqring_overflow_flush(ctx, true);
10088         xa_for_each(&ctx->personalities, index, creds)
10089                 io_unregister_personality(ctx, index);
10090         mutex_unlock(&ctx->uring_lock);
10091
10092         /* failed during ring init, it couldn't have issued any requests */
10093         if (ctx->rings) {
10094                 io_kill_timeouts(ctx, NULL, true);
10095                 io_poll_remove_all(ctx, NULL, true);
10096                 /* if we failed setting up the ctx, we might not have any rings */
10097                 io_iopoll_try_reap_events(ctx);
10098         }
10099
10100         INIT_WORK(&ctx->exit_work, io_ring_exit_work);
10101         /*
10102          * Use system_unbound_wq to avoid spawning tons of event kworkers
10103          * if we're exiting a ton of rings at the same time. It just adds
10104          * noise and overhead, there's no discernable change in runtime
10105          * over using system_wq.
10106          */
10107         queue_work(system_unbound_wq, &ctx->exit_work);
10108 }
10109
/*
 * ->release() handler for an io_uring file: detaches the ctx from the file
 * and starts ring teardown. Always returns 0.
 */
static int io_uring_release(struct inode *inode, struct file *file)
{
        struct io_ring_ctx *ctx = file->private_data;

        /* clear the back-pointer before the ctx starts going away */
        file->private_data = NULL;
        io_ring_ctx_wait_and_kill(ctx);
        return 0;
}
10118
/* Match criteria passed to io_cancel_task_cb() through io_wq_cancel_cb(). */
struct io_task_cancel {
        /* task whose requests should be matched */
        struct task_struct *task;
        /* forwarded as the cancel_all argument of io_match_task_safe() */
        bool all;
};
10123
10124 static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
10125 {
10126         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
10127         struct io_task_cancel *cancel = data;
10128
10129         return io_match_task_safe(req, cancel->task, cancel->all);
10130 }
10131
10132 static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
10133                                          struct task_struct *task,
10134                                          bool cancel_all)
10135 {
10136         struct io_defer_entry *de;
10137         LIST_HEAD(list);
10138
10139         spin_lock(&ctx->completion_lock);
10140         list_for_each_entry_reverse(de, &ctx->defer_list, list) {
10141                 if (io_match_task_safe(de->req, task, cancel_all)) {
10142                         list_cut_position(&list, &ctx->defer_list, &de->list);
10143                         break;
10144                 }
10145         }
10146         spin_unlock(&ctx->completion_lock);
10147         if (list_empty(&list))
10148                 return false;
10149
10150         while (!list_empty(&list)) {
10151                 de = list_first_entry(&list, struct io_defer_entry, list);
10152                 list_del_init(&de->list);
10153                 io_req_complete_failed(de->req, -ECANCELED);
10154                 kfree(de);
10155         }
10156         return true;
10157 }
10158
10159 static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
10160 {
10161         struct io_tctx_node *node;
10162         enum io_wq_cancel cret;
10163         bool ret = false;
10164
10165         mutex_lock(&ctx->uring_lock);
10166         list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
10167                 struct io_uring_task *tctx = node->task->io_uring;
10168
10169                 /*
10170                  * io_wq will stay alive while we hold uring_lock, because it's
10171                  * killed after ctx nodes, which requires to take the lock.
10172                  */
10173                 if (!tctx || !tctx->io_wq)
10174                         continue;
10175                 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
10176                 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
10177         }
10178         mutex_unlock(&ctx->uring_lock);
10179
10180         return ret;
10181 }
10182
/*
 * Cancel requests of @ctx, restricted to @task when it is non-NULL.
 *
 * Runs every cancellation source (io_wq work, iopoll list, deferred
 * requests, poll requests, timeouts and, for a given @task, pending
 * task_work) in a loop and stops after the first pass in which none of them
 * reported progress.
 */
static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
                                                struct task_struct *task,
                                                bool cancel_all)
{
        struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
        struct io_uring_task *tctx = task ? task->io_uring : NULL;

        /* failed during ring init, it couldn't have issued any requests */
        if (!ctx->rings)
                return;

        while (1) {
                enum io_wq_cancel cret;
                /* set when this pass cancelled or reaped anything */
                bool ret = false;

                if (!task) {
                        /* no specific task: go through every task attached to @ctx */
                        ret |= io_uring_try_cancel_iowq(ctx);
                } else if (tctx && tctx->io_wq) {
                        /*
                         * Cancels requests of all rings, not only @ctx, but
                         * it's fine as the task is in exit/exec.
                         */
                        cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
                                               &cancel, true);
                        ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
                }

                /* SQPOLL thread does its own polling */
                if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
                    (ctx->sq_data && ctx->sq_data->thread == current)) {
                        while (!wq_list_empty(&ctx->iopoll_list)) {
                                io_iopoll_try_reap_events(ctx);
                                ret = true;
                        }
                }

                ret |= io_cancel_defer_files(ctx, task, cancel_all);
                ret |= io_poll_remove_all(ctx, task, cancel_all);
                ret |= io_kill_timeouts(ctx, task, cancel_all);
                if (task)
                        ret |= io_run_task_work();
                /* a pass without progress means there is nothing left to do */
                if (!ret)
                        break;
                cond_resched();
        }
}
10229
/*
 * Slow path of io_uring_add_tctx_node(): make sure the current task has an
 * io_uring task context and that it holds a node linking it to @ctx.
 *
 * Takes ctx->uring_lock itself to link a new node, so it must be called
 * without that lock held.
 *
 * Returns 0 on success, -ENOMEM if the node cannot be allocated, or the
 * error from io_uring_alloc_task_context(), io_wq_max_workers() or
 * xa_store().
 */
static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
{
        struct io_uring_task *tctx = current->io_uring;
        struct io_tctx_node *node;
        int ret;

        /* first io_uring use by this task: set up its task context */
        if (unlikely(!tctx)) {
                ret = io_uring_alloc_task_context(current, ctx);
                if (unlikely(ret))
                        return ret;

                tctx = current->io_uring;
                /* apply worker limits that were set on the ring to the new io_wq */
                if (ctx->iowq_limits_set) {
                        unsigned int limits[2] = { ctx->iowq_limits[0],
                                                   ctx->iowq_limits[1], };

                        ret = io_wq_max_workers(tctx->io_wq, limits);
                        if (ret)
                                return ret;
                }
        }
        /* nodes are keyed by the ctx pointer; add one only if none exists yet */
        if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
                node = kmalloc(sizeof(*node), GFP_KERNEL);
                if (!node)
                        return -ENOMEM;
                node->ctx = ctx;
                node->task = current;

                ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
                                        node, GFP_KERNEL));
                if (ret) {
                        kfree(node);
                        return ret;
                }

                mutex_lock(&ctx->uring_lock);
                list_add(&node->ctx_node, &ctx->tctx_list);
                mutex_unlock(&ctx->uring_lock);
        }
        /* cache the ring so io_uring_add_tctx_node() can take its fast path */
        tctx->last = ctx;
        return 0;
}
10272
10273 /*
10274  * Note that this task has used io_uring. We use it for cancelation purposes.
10275  */
10276 static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
10277 {
10278         struct io_uring_task *tctx = current->io_uring;
10279
10280         if (likely(tctx && tctx->last == ctx))
10281                 return 0;
10282         return __io_uring_add_tctx_node(ctx);
10283 }
10284
/*
 * Remove this io_uring_file -> task mapping.
 *
 * @index is the key the node was stored under in the current task's xarray
 * (the ctx pointer cast to unsigned long). A missing task context or a
 * missing node is silently ignored.
 */
static __cold void io_uring_del_tctx_node(unsigned long index)
{
        struct io_uring_task *tctx = current->io_uring;
        struct io_tctx_node *node;

        if (!tctx)
                return;
        node = xa_erase(&tctx->xa, index);
        if (!node)
                return;

        WARN_ON_ONCE(current != node->task);
        WARN_ON_ONCE(list_empty(&node->ctx_node));

        /* ctx->tctx_list is protected by uring_lock */
        mutex_lock(&node->ctx->uring_lock);
        list_del(&node->ctx_node);
        mutex_unlock(&node->ctx->uring_lock);

        /* don't leave the fast-path cache pointing at a ring we just dropped */
        if (tctx->last == node->ctx)
                tctx->last = NULL;
        kfree(node);
}
10310
10311 static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
10312 {
10313         struct io_wq *wq = tctx->io_wq;
10314         struct io_tctx_node *node;
10315         unsigned long index;
10316
10317         xa_for_each(&tctx->xa, index, node) {
10318                 io_uring_del_tctx_node(index);
10319                 cond_resched();
10320         }
10321         if (wq) {
10322                 /*
10323                  * Must be after io_uring_del_tctx_node() (removes nodes under
10324                  * uring_lock) to avoid race with io_uring_try_cancel_iowq().
10325                  */
10326                 io_wq_put_and_exit(wq);
10327                 tctx->io_wq = NULL;
10328         }
10329 }
10330
10331 static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
10332 {
10333         if (tracked)
10334                 return 0;
10335         return percpu_counter_sum(&tctx->inflight);
10336 }
10337
/*
 * Find any io_uring ctx that this task has registered or done IO on, and cancel
 * requests. @sqd should be non-NULL IFF it's an SQPOLL thread cancellation.
 *
 * Loops, cancelling and then sleeping on tctx->wait, until tctx_inflight()
 * reports zero; afterwards the task's ring nodes and io_wq are cleaned up.
 * With @cancel_all the task context itself is freed as well.
 */
static __cold void io_uring_cancel_generic(bool cancel_all,
                                           struct io_sq_data *sqd)
{
        struct io_uring_task *tctx = current->io_uring;
        struct io_ring_ctx *ctx;
        s64 inflight;
        DEFINE_WAIT(wait);

        WARN_ON_ONCE(sqd && sqd->thread != current);

        /* task never used io_uring: nothing to cancel */
        if (!current->io_uring)
                return;
        if (tctx->io_wq)
                io_wq_exit_start(tctx->io_wq);

        /* mark the task as cancelling; io_tctx_exit_cb() checks this */
        atomic_inc(&tctx->in_idle);
        do {
                io_uring_drop_tctx_refs(current);
                /* read completions before cancelations */
                inflight = tctx_inflight(tctx, !cancel_all);
                if (!inflight)
                        break;

                if (!sqd) {
                        struct io_tctx_node *node;
                        unsigned long index;

                        xa_for_each(&tctx->xa, index, node) {
                                /* sqpoll task will cancel all its requests */
                                if (node->ctx->sq_data)
                                        continue;
                                io_uring_try_cancel_requests(node->ctx, current,
                                                             cancel_all);
                        }
                } else {
                        /* SQPOLL thread: cancel on every ring attached to @sqd */
                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                                io_uring_try_cancel_requests(ctx, current,
                                                             cancel_all);
                }

                prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
                io_run_task_work();
                io_uring_drop_tctx_refs(current);

                /*
                 * If we've seen completions, retry without waiting. This
                 * avoids a race where a completion comes in before we did
                 * prepare_to_wait().
                 */
                if (inflight == tctx_inflight(tctx, !cancel_all))
                        schedule();
                finish_wait(&tctx->wait, &wait);
        } while (1);

        io_uring_clean_tctx(tctx);
        if (cancel_all) {
                /*
                 * We shouldn't run task_works after cancel, so just leave
                 * ->in_idle set for normal exit.
                 */
                atomic_dec(&tctx->in_idle);
                /* for exec all current's requests should be gone, kill tctx */
                __io_uring_free(current);
        }
}
10407
/*
 * Cancel the current task's io_uring requests; the non-SQPOLL entry point
 * into io_uring_cancel_generic() (no io_sq_data is passed).
 */
void __io_uring_cancel(bool cancel_all)
{
        io_uring_cancel_generic(cancel_all, NULL);
}
10412
10413 void io_uring_unreg_ringfd(void)
10414 {
10415         struct io_uring_task *tctx = current->io_uring;
10416         int i;
10417
10418         for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
10419                 if (tctx->registered_rings[i]) {
10420                         fput(tctx->registered_rings[i]);
10421                         tctx->registered_rings[i] = NULL;
10422                 }
10423         }
10424 }
10425
10426 static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
10427                                      int start, int end)
10428 {
10429         struct file *file;
10430         int offset;
10431
10432         for (offset = start; offset < end; offset++) {
10433                 offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
10434                 if (tctx->registered_rings[offset])
10435                         continue;
10436
10437                 file = fget(fd);
10438                 if (!file) {
10439                         return -EBADF;
10440                 } else if (file->f_op != &io_uring_fops) {
10441                         fput(file);
10442                         return -EOPNOTSUPP;
10443                 }
10444                 tctx->registered_rings[offset] = file;
10445                 return offset;
10446         }
10447
10448         return -EBUSY;
10449 }
10450
/*
 * Register a ring fd to avoid fdget/fdput for each io_uring_enter()
 * invocation. User passes in an array of struct io_uring_rsrc_update
 * with ->data set to the ring_fd, and ->offset given for the desired
 * index. If no index is desired, application may set ->offset == -1U
 * and we'll find an available index. Returns number of entries
 * successfully processed, or < 0 on error if none were processed.
 *
 * The index actually used is written back to ->offset of each entry.
 */
static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
                              unsigned nr_args)
{
        struct io_uring_rsrc_update __user *arg = __arg;
        struct io_uring_rsrc_update reg;
        struct io_uring_task *tctx;
        int ret, i;

        if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
                return -EINVAL;

        /*
         * uring_lock is dropped around this call because
         * __io_uring_add_tctx_node() takes it itself. Presumably the caller
         * holds uring_lock on entry - confirm against the register path.
         */
        mutex_unlock(&ctx->uring_lock);
        ret = io_uring_add_tctx_node(ctx);
        mutex_lock(&ctx->uring_lock);
        if (ret)
                return ret;

        tctx = current->io_uring;
        for (i = 0; i < nr_args; i++) {
                int start, end;

                if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
                        ret = -EFAULT;
                        break;
                }

                if (reg.resv) {
                        ret = -EINVAL;
                        break;
                }

                if (reg.offset == -1U) {
                        /* no index requested: search the whole table */
                        start = 0;
                        end = IO_RINGFD_REG_MAX;
                } else {
                        if (reg.offset >= IO_RINGFD_REG_MAX) {
                                ret = -EINVAL;
                                break;
                        }
                        /* a specific index: a one-slot range */
                        start = reg.offset;
                        end = start + 1;
                }

                ret = io_ring_add_registered_fd(tctx, reg.data, start, end);
                if (ret < 0)
                        break;

                /* report the chosen index back; undo the registration on failure */
                reg.offset = ret;
                if (copy_to_user(&arg[i], &reg, sizeof(reg))) {
                        fput(tctx->registered_rings[reg.offset]);
                        tctx->registered_rings[reg.offset] = NULL;
                        ret = -EFAULT;
                        break;
                }
        }

        /* partial success wins over the error that stopped the loop */
        return i ? i : ret;
}
10517
10518 static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
10519                                 unsigned nr_args)
10520 {
10521         struct io_uring_rsrc_update __user *arg = __arg;
10522         struct io_uring_task *tctx = current->io_uring;
10523         struct io_uring_rsrc_update reg;
10524         int ret = 0, i;
10525
10526         if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
10527                 return -EINVAL;
10528         if (!tctx)
10529                 return 0;
10530
10531         for (i = 0; i < nr_args; i++) {
10532                 if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
10533                         ret = -EFAULT;
10534                         break;
10535                 }
10536                 if (reg.resv || reg.offset >= IO_RINGFD_REG_MAX) {
10537                         ret = -EINVAL;
10538                         break;
10539                 }
10540
10541                 reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX);
10542                 if (tctx->registered_rings[reg.offset]) {
10543                         fput(tctx->registered_rings[reg.offset]);
10544                         tctx->registered_rings[reg.offset] = NULL;
10545                 }
10546         }
10547
10548         return i ? i : ret;
10549 }
10550
10551 static void *io_uring_validate_mmap_request(struct file *file,
10552                                             loff_t pgoff, size_t sz)
10553 {
10554         struct io_ring_ctx *ctx = file->private_data;
10555         loff_t offset = pgoff << PAGE_SHIFT;
10556         struct page *page;
10557         void *ptr;
10558
10559         switch (offset) {
10560         case IORING_OFF_SQ_RING:
10561         case IORING_OFF_CQ_RING:
10562                 ptr = ctx->rings;
10563                 break;
10564         case IORING_OFF_SQES:
10565                 ptr = ctx->sq_sqes;
10566                 break;
10567         default:
10568                 return ERR_PTR(-EINVAL);
10569         }
10570
10571         page = virt_to_head_page(ptr);
10572         if (sz > page_size(page))
10573                 return ERR_PTR(-EINVAL);
10574
10575         return ptr;
10576 }
10577
10578 #ifdef CONFIG_MMU
10579
10580 static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
10581 {
10582         size_t sz = vma->vm_end - vma->vm_start;
10583         unsigned long pfn;
10584         void *ptr;
10585
10586         ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
10587         if (IS_ERR(ptr))
10588                 return PTR_ERR(ptr);
10589
10590         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
10591         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
10592 }
10593
10594 #else /* !CONFIG_MMU */
10595
10596 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
10597 {
10598         return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
10599 }
10600
/*
 * !MMU only: advertise that io_uring files support direct, readable and
 * writable mappings.
 */
static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
{
        return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
}
10605
/*
 * !MMU only: the "unmapped area" for a ring mapping is simply the kernel
 * address of the requested region. Returns that address, or the negative
 * error from io_uring_validate_mmap_request() converted to unsigned long.
 */
static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
        unsigned long addr, unsigned long len,
        unsigned long pgoff, unsigned long flags)
{
        void *kaddr = io_uring_validate_mmap_request(file, pgoff, len);

        return IS_ERR(kaddr) ? PTR_ERR(kaddr) : (unsigned long) kaddr;
}
10618
10619 #endif /* !CONFIG_MMU */
10620
/*
 * Sleep on ctx->sqo_sq_wait until the SQ ring is no longer full or a signal
 * is pending for the current task. Always returns 0.
 */
static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
{
        DEFINE_WAIT(wait);

        do {
                if (!io_sqring_full(ctx))
                        break;
                prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);

                /* re-check after queueing ourselves so a wakeup in between isn't lost */
                if (!io_sqring_full(ctx))
                        break;
                schedule();
        } while (!signal_pending(current));

        finish_wait(&ctx->sqo_sq_wait, &wait);
        return 0;
}
10638
10639 static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
10640 {
10641         if (flags & IORING_ENTER_EXT_ARG) {
10642                 struct io_uring_getevents_arg arg;
10643
10644                 if (argsz != sizeof(arg))
10645                         return -EINVAL;
10646                 if (copy_from_user(&arg, argp, sizeof(arg)))
10647                         return -EFAULT;
10648         }
10649         return 0;
10650 }
10651
10652 static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
10653                           struct __kernel_timespec __user **ts,
10654                           const sigset_t __user **sig)
10655 {
10656         struct io_uring_getevents_arg arg;
10657
10658         /*
10659          * If EXT_ARG isn't set, then we have no timespec and the argp pointer
10660          * is just a pointer to the sigset_t.
10661          */
10662         if (!(flags & IORING_ENTER_EXT_ARG)) {
10663                 *sig = (const sigset_t __user *) argp;
10664                 *ts = NULL;
10665                 return 0;
10666         }
10667
10668         /*
10669          * EXT_ARG is set - ensure we agree on the size of it and copy in our
10670          * timespec and sigset_t pointers if good.
10671          */
10672         if (*argsz != sizeof(arg))
10673                 return -EINVAL;
10674         if (copy_from_user(&arg, argp, sizeof(arg)))
10675                 return -EFAULT;
10676         if (arg.pad)
10677                 return -EINVAL;
10678         *sig = u64_to_user_ptr(arg.sigmask);
10679         *argsz = arg.sigmask_sz;
10680         *ts = u64_to_user_ptr(arg.ts);
10681         return 0;
10682 }
10683
/*
 * io_uring_enter(2): submit SQEs and/or wait for completions on a ring.
 *
 * @fd:           ring file descriptor, or an index into the task's
 *                registered rings when IORING_ENTER_REGISTERED_RING is set
 * @to_submit:    number of SQEs to submit
 * @min_complete: completions to wait for with IORING_ENTER_GETEVENTS
 *                (clamped to ctx->cq_entries)
 * @flags:        IORING_ENTER_* flags; unknown bits yield -EINVAL
 * @argp/@argsz:  sigset_t pointer and size, or a struct
 *                io_uring_getevents_arg when IORING_ENTER_EXT_ARG is set
 *
 * Returns the number of submitted SQEs if that is non-zero, otherwise the
 * result of the wait step or a negative error.
 */
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
                u32, min_complete, u32, flags, const void __user *, argp,
                size_t, argsz)
{
        struct io_ring_ctx *ctx;
        int submitted = 0;
        struct fd f;
        long ret;

        io_run_task_work();

        if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
                               IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
                               IORING_ENTER_REGISTERED_RING)))
                return -EINVAL;

        /*
         * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
         * need only dereference our task private array to find it.
         */
        if (flags & IORING_ENTER_REGISTERED_RING) {
                struct io_uring_task *tctx = current->io_uring;

                if (!tctx || fd >= IO_RINGFD_REG_MAX)
                        return -EINVAL;
                fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
                f.file = tctx->registered_rings[fd];
                if (unlikely(!f.file))
                        return -EBADF;
        } else {
                f = fdget(fd);
                if (unlikely(!f.file))
                        return -EBADF;
        }

        /* the file must actually be an io_uring instance */
        ret = -EOPNOTSUPP;
        if (unlikely(f.file->f_op != &io_uring_fops))
                goto out_fput;

        /* pin the ctx; fails once the ring's refs have been killed */
        ret = -ENXIO;
        ctx = f.file->private_data;
        if (unlikely(!percpu_ref_tryget(&ctx->refs)))
                goto out_fput;

        ret = -EBADFD;
        if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
                goto out;

        /*
         * For SQ polling, the thread will do all submissions and completions.
         * Just return the requested submit count, and wake the thread if
         * we were asked to.
         */
        ret = 0;
        if (ctx->flags & IORING_SETUP_SQPOLL) {
                io_cqring_overflow_flush(ctx);

                /* the SQ thread is gone, nobody would pick up the submissions */
                if (unlikely(ctx->sq_data->thread == NULL)) {
                        ret = -EOWNERDEAD;
                        goto out;
                }
                if (flags & IORING_ENTER_SQ_WAKEUP)
                        wake_up(&ctx->sq_data->wait);
                if (flags & IORING_ENTER_SQ_WAIT) {
                        ret = io_sqpoll_wait_sq(ctx);
                        if (ret)
                                goto out;
                }
                submitted = to_submit;
        } else if (to_submit) {
                ret = io_uring_add_tctx_node(ctx);
                if (unlikely(ret))
                        goto out;

                mutex_lock(&ctx->uring_lock);
                submitted = io_submit_sqes(ctx, to_submit);
                /* short submit: report it without waiting for events */
                if (submitted != to_submit) {
                        mutex_unlock(&ctx->uring_lock);
                        goto out;
                }
                /* keep uring_lock held and jump straight into the iopoll wait */
                if ((flags & IORING_ENTER_GETEVENTS) && ctx->syscall_iopoll)
                        goto iopoll_locked;
                mutex_unlock(&ctx->uring_lock);
        }
        if (flags & IORING_ENTER_GETEVENTS) {
                if (ctx->syscall_iopoll) {
                        /*
                         * We disallow the app entering submit/complete with
                         * polling, but we still need to lock the ring to
                         * prevent racing with polled issue that got punted to
                         * a workqueue.
                         */
                        mutex_lock(&ctx->uring_lock);
iopoll_locked:
                        ret = io_validate_ext_arg(flags, argp, argsz);
                        if (likely(!ret)) {
                                min_complete = min(min_complete, ctx->cq_entries);
                                ret = io_iopoll_check(ctx, min_complete);
                        }
                        mutex_unlock(&ctx->uring_lock);
                } else {
                        const sigset_t __user *sig;
                        struct __kernel_timespec __user *ts;

                        ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
                        if (unlikely(ret))
                                goto out;
                        min_complete = min(min_complete, ctx->cq_entries);
                        ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
                }
        }

out:
        percpu_ref_put(&ctx->refs);
out_fput:
        /* a registered ring was not looked up with fdget(), so nothing to put */
        if (!(flags & IORING_ENTER_REGISTERED_RING))
                fdput(f);
        /* a non-zero submit count takes precedence over the wait result */
        return submitted ? submitted : ret;
}
10803
10804 #ifdef CONFIG_PROC_FS
10805 static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
10806                 const struct cred *cred)
10807 {
10808         struct user_namespace *uns = seq_user_ns(m);
10809         struct group_info *gi;
10810         kernel_cap_t cap;
10811         unsigned __capi;
10812         int g;
10813
10814         seq_printf(m, "%5d\n", id);
10815         seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
10816         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
10817         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
10818         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
10819         seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
10820         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
10821         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
10822         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
10823         seq_puts(m, "\n\tGroups:\t");
10824         gi = cred->group_info;
10825         for (g = 0; g < gi->ngroups; g++) {
10826                 seq_put_decimal_ull(m, g ? " " : "",
10827                                         from_kgid_munged(uns, gi->gid[g]));
10828         }
10829         seq_puts(m, "\n\tCapEff:\t");
10830         cap = cred->cap_effective;
10831         CAP_FOR_EACH_U32(__capi)
10832                 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
10833         seq_putc(m, '\n');
10834         return 0;
10835 }
10836
10837 static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
10838                                           struct seq_file *m)
10839 {
10840         struct io_sq_data *sq = NULL;
10841         struct io_overflow_cqe *ocqe;
10842         struct io_rings *r = ctx->rings;
10843         unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
10844         unsigned int sq_head = READ_ONCE(r->sq.head);
10845         unsigned int sq_tail = READ_ONCE(r->sq.tail);
10846         unsigned int cq_head = READ_ONCE(r->cq.head);
10847         unsigned int cq_tail = READ_ONCE(r->cq.tail);
10848         unsigned int sq_entries, cq_entries;
10849         bool has_lock;
10850         unsigned int i;
10851
10852         /*
10853          * we may get imprecise sqe and cqe info if uring is actively running
10854          * since we get cached_sq_head and cached_cq_tail without uring_lock
10855          * and sq_tail and cq_head are changed by userspace. But it's ok since
10856          * we usually use these info when it is stuck.
10857          */
10858         seq_printf(m, "SqMask:\t0x%x\n", sq_mask);
10859         seq_printf(m, "SqHead:\t%u\n", sq_head);
10860         seq_printf(m, "SqTail:\t%u\n", sq_tail);
10861         seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head);
10862         seq_printf(m, "CqMask:\t0x%x\n", cq_mask);
10863         seq_printf(m, "CqHead:\t%u\n", cq_head);
10864         seq_printf(m, "CqTail:\t%u\n", cq_tail);
10865         seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
10866         seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head);
10867         sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
10868         for (i = 0; i < sq_entries; i++) {
10869                 unsigned int entry = i + sq_head;
10870                 unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
10871                 struct io_uring_sqe *sqe;
10872
10873                 if (sq_idx > sq_mask)
10874                         continue;
10875                 sqe = &ctx->sq_sqes[sq_idx];
10876                 seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
10877                            sq_idx, sqe->opcode, sqe->fd, sqe->flags,
10878                            sqe->user_data);
10879         }
10880         seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
10881         cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
10882         for (i = 0; i < cq_entries; i++) {
10883                 unsigned int entry = i + cq_head;
10884                 struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask];
10885
10886                 seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
10887                            entry & cq_mask, cqe->user_data, cqe->res,
10888                            cqe->flags);
10889         }
10890
10891         /*
10892          * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
10893          * since fdinfo case grabs it in the opposite direction of normal use
10894          * cases. If we fail to get the lock, we just don't iterate any
10895          * structures that could be going away outside the io_uring mutex.
10896          */
10897         has_lock = mutex_trylock(&ctx->uring_lock);
10898
10899         if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
10900                 sq = ctx->sq_data;
10901                 if (!sq->thread)
10902                         sq = NULL;
10903         }
10904
10905         seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
10906         seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
10907         seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
10908         for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
10909                 struct file *f = io_file_from_index(ctx, i);
10910
10911                 if (f)
10912                         seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
10913                 else
10914                         seq_printf(m, "%5u: <none>\n", i);
10915         }
10916         seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
10917         for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
10918                 struct io_mapped_ubuf *buf = ctx->user_bufs[i];
10919                 unsigned int len = buf->ubuf_end - buf->ubuf;
10920
10921                 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
10922         }
10923         if (has_lock && !xa_empty(&ctx->personalities)) {
10924                 unsigned long index;
10925                 const struct cred *cred;
10926
10927                 seq_printf(m, "Personalities:\n");
10928                 xa_for_each(&ctx->personalities, index, cred)
10929                         io_uring_show_cred(m, index, cred);
10930         }
10931         if (has_lock)
10932                 mutex_unlock(&ctx->uring_lock);
10933
10934         seq_puts(m, "PollList:\n");
10935         spin_lock(&ctx->completion_lock);
10936         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
10937                 struct hlist_head *list = &ctx->cancel_hash[i];
10938                 struct io_kiocb *req;
10939
10940                 hlist_for_each_entry(req, list, hash_node)
10941                         seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
10942                                         task_work_pending(req->task));
10943         }
10944
10945         seq_puts(m, "CqOverflowList:\n");
10946         list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
10947                 struct io_uring_cqe *cqe = &ocqe->cqe;
10948
10949                 seq_printf(m, "  user_data=%llu, res=%d, flags=%x\n",
10950                            cqe->user_data, cqe->res, cqe->flags);
10951
10952         }
10953
10954         spin_unlock(&ctx->completion_lock);
10955 }
10956
10957 static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
10958 {
10959         struct io_ring_ctx *ctx = f->private_data;
10960
10961         if (percpu_ref_tryget(&ctx->refs)) {
10962                 __io_uring_show_fdinfo(ctx, m);
10963                 percpu_ref_put(&ctx->refs);
10964         }
10965 }
10966 #endif
10967
10968 static const struct file_operations io_uring_fops = {
10969         .release        = io_uring_release,
10970         .mmap           = io_uring_mmap,
10971 #ifndef CONFIG_MMU
10972         .get_unmapped_area = io_uring_nommu_get_unmapped_area,
10973         .mmap_capabilities = io_uring_nommu_mmap_capabilities,
10974 #endif
10975         .poll           = io_uring_poll,
10976 #ifdef CONFIG_PROC_FS
10977         .show_fdinfo    = io_uring_show_fdinfo,
10978 #endif
10979 };
10980
10981 static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
10982                                          struct io_uring_params *p)
10983 {
10984         struct io_rings *rings;
10985         size_t size, sq_array_offset;
10986
10987         /* make sure these are sane, as we already accounted them */
10988         ctx->sq_entries = p->sq_entries;
10989         ctx->cq_entries = p->cq_entries;
10990
10991         size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
10992         if (size == SIZE_MAX)
10993                 return -EOVERFLOW;
10994
10995         rings = io_mem_alloc(size);
10996         if (!rings)
10997                 return -ENOMEM;
10998
10999         ctx->rings = rings;
11000         ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
11001         rings->sq_ring_mask = p->sq_entries - 1;
11002         rings->cq_ring_mask = p->cq_entries - 1;
11003         rings->sq_ring_entries = p->sq_entries;
11004         rings->cq_ring_entries = p->cq_entries;
11005
11006         size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
11007         if (size == SIZE_MAX) {
11008                 io_mem_free(ctx->rings);
11009                 ctx->rings = NULL;
11010                 return -EOVERFLOW;
11011         }
11012
11013         ctx->sq_sqes = io_mem_alloc(size);
11014         if (!ctx->sq_sqes) {
11015                 io_mem_free(ctx->rings);
11016                 ctx->rings = NULL;
11017                 return -ENOMEM;
11018         }
11019
11020         return 0;
11021 }
11022
11023 static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
11024 {
11025         int ret, fd;
11026
11027         fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
11028         if (fd < 0)
11029                 return fd;
11030
11031         ret = io_uring_add_tctx_node(ctx);
11032         if (ret) {
11033                 put_unused_fd(fd);
11034                 return ret;
11035         }
11036         fd_install(fd, file);
11037         return fd;
11038 }
11039
11040 /*
11041  * Allocate an anonymous fd, this is what constitutes the application
11042  * visible backing of an io_uring instance. The application mmaps this
11043  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
11044  * we have to tie this fd to a socket for file garbage collection purposes.
11045  */
11046 static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
11047 {
11048         struct file *file;
11049 #if defined(CONFIG_UNIX)
11050         int ret;
11051
11052         ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
11053                                 &ctx->ring_sock);
11054         if (ret)
11055                 return ERR_PTR(ret);
11056 #endif
11057
11058         file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
11059                                          O_RDWR | O_CLOEXEC, NULL);
11060 #if defined(CONFIG_UNIX)
11061         if (IS_ERR(file)) {
11062                 sock_release(ctx->ring_sock);
11063                 ctx->ring_sock = NULL;
11064         } else {
11065                 ctx->ring_sock->file = file;
11066         }
11067 #endif
11068         return file;
11069 }
11070
11071 static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
11072                                   struct io_uring_params __user *params)
11073 {
11074         struct io_ring_ctx *ctx;
11075         struct file *file;
11076         int ret;
11077
11078         if (!entries)
11079                 return -EINVAL;
11080         if (entries > IORING_MAX_ENTRIES) {
11081                 if (!(p->flags & IORING_SETUP_CLAMP))
11082                         return -EINVAL;
11083                 entries = IORING_MAX_ENTRIES;
11084         }
11085
11086         /*
11087          * Use twice as many entries for the CQ ring. It's possible for the
11088          * application to drive a higher depth than the size of the SQ ring,
11089          * since the sqes are only used at submission time. This allows for
11090          * some flexibility in overcommitting a bit. If the application has
11091          * set IORING_SETUP_CQSIZE, it will have passed in the desired number
11092          * of CQ ring entries manually.
11093          */
11094         p->sq_entries = roundup_pow_of_two(entries);
11095         if (p->flags & IORING_SETUP_CQSIZE) {
11096                 /*
11097                  * If IORING_SETUP_CQSIZE is set, we do the same roundup
11098                  * to a power-of-two, if it isn't already. We do NOT impose
11099                  * any cq vs sq ring sizing.
11100                  */
11101                 if (!p->cq_entries)
11102                         return -EINVAL;
11103                 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
11104                         if (!(p->flags & IORING_SETUP_CLAMP))
11105                                 return -EINVAL;
11106                         p->cq_entries = IORING_MAX_CQ_ENTRIES;
11107                 }
11108                 p->cq_entries = roundup_pow_of_two(p->cq_entries);
11109                 if (p->cq_entries < p->sq_entries)
11110                         return -EINVAL;
11111         } else {
11112                 p->cq_entries = 2 * p->sq_entries;
11113         }
11114
11115         ctx = io_ring_ctx_alloc(p);
11116         if (!ctx)
11117                 return -ENOMEM;
11118
11119         /*
11120          * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
11121          * space applications don't need to do io completion events
11122          * polling again, they can rely on io_sq_thread to do polling
11123          * work, which can reduce cpu usage and uring_lock contention.
11124          */
11125         if (ctx->flags & IORING_SETUP_IOPOLL &&
11126             !(ctx->flags & IORING_SETUP_SQPOLL))
11127                 ctx->syscall_iopoll = 1;
11128
11129         ctx->compat = in_compat_syscall();
11130         if (!capable(CAP_IPC_LOCK))
11131                 ctx->user = get_uid(current_user());
11132
11133         /*
11134          * This is just grabbed for accounting purposes. When a process exits,
11135          * the mm is exited and dropped before the files, hence we need to hang
11136          * on to this mm purely for the purposes of being able to unaccount
11137          * memory (locked/pinned vm). It's not used for anything else.
11138          */
11139         mmgrab(current->mm);
11140         ctx->mm_account = current->mm;
11141
11142         ret = io_allocate_scq_urings(ctx, p);
11143         if (ret)
11144                 goto err;
11145
11146         ret = io_sq_offload_create(ctx, p);
11147         if (ret)
11148                 goto err;
11149         /* always set a rsrc node */
11150         ret = io_rsrc_node_switch_start(ctx);
11151         if (ret)
11152                 goto err;
11153         io_rsrc_node_switch(ctx, NULL);
11154
11155         memset(&p->sq_off, 0, sizeof(p->sq_off));
11156         p->sq_off.head = offsetof(struct io_rings, sq.head);
11157         p->sq_off.tail = offsetof(struct io_rings, sq.tail);
11158         p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
11159         p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
11160         p->sq_off.flags = offsetof(struct io_rings, sq_flags);
11161         p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
11162         p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
11163
11164         memset(&p->cq_off, 0, sizeof(p->cq_off));
11165         p->cq_off.head = offsetof(struct io_rings, cq.head);
11166         p->cq_off.tail = offsetof(struct io_rings, cq.tail);
11167         p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
11168         p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
11169         p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
11170         p->cq_off.cqes = offsetof(struct io_rings, cqes);
11171         p->cq_off.flags = offsetof(struct io_rings, cq_flags);
11172
11173         p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
11174                         IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
11175                         IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
11176                         IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
11177                         IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
11178                         IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
11179                         IORING_FEAT_LINKED_FILE;
11180
11181         if (copy_to_user(params, p, sizeof(*p))) {
11182                 ret = -EFAULT;
11183                 goto err;
11184         }
11185
11186         file = io_uring_get_file(ctx);
11187         if (IS_ERR(file)) {
11188                 ret = PTR_ERR(file);
11189                 goto err;
11190         }
11191
11192         /*
11193          * Install ring fd as the very last thing, so we don't risk someone
11194          * having closed it before we finish setup
11195          */
11196         ret = io_uring_install_fd(ctx, file);
11197         if (ret < 0) {
11198                 /* fput will clean it up */
11199                 fput(file);
11200                 return ret;
11201         }
11202
11203         trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
11204         return ret;
11205 err:
11206         io_ring_ctx_wait_and_kill(ctx);
11207         return ret;
11208 }
11209
11210 /*
11211  * Sets up an aio uring context, and returns the fd. Applications asks for a
11212  * ring size, we return the actual sq/cq ring sizes (among other things) in the
11213  * params structure passed in.
11214  */
11215 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
11216 {
11217         struct io_uring_params p;
11218         int i;
11219
11220         if (copy_from_user(&p, params, sizeof(p)))
11221                 return -EFAULT;
11222         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
11223                 if (p.resv[i])
11224                         return -EINVAL;
11225         }
11226
11227         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
11228                         IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
11229                         IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
11230                         IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL))
11231                 return -EINVAL;
11232
11233         return  io_uring_create(entries, &p, params);
11234 }
11235
11236 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
11237                 struct io_uring_params __user *, params)
11238 {
11239         return io_uring_setup(entries, params);
11240 }
11241
11242 static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
11243                            unsigned nr_args)
11244 {
11245         struct io_uring_probe *p;
11246         size_t size;
11247         int i, ret;
11248
11249         size = struct_size(p, ops, nr_args);
11250         if (size == SIZE_MAX)
11251                 return -EOVERFLOW;
11252         p = kzalloc(size, GFP_KERNEL);
11253         if (!p)
11254                 return -ENOMEM;
11255
11256         ret = -EFAULT;
11257         if (copy_from_user(p, arg, size))
11258                 goto out;
11259         ret = -EINVAL;
11260         if (memchr_inv(p, 0, size))
11261                 goto out;
11262
11263         p->last_op = IORING_OP_LAST - 1;
11264         if (nr_args > IORING_OP_LAST)
11265                 nr_args = IORING_OP_LAST;
11266
11267         for (i = 0; i < nr_args; i++) {
11268                 p->ops[i].op = i;
11269                 if (!io_op_defs[i].not_supported)
11270                         p->ops[i].flags = IO_URING_OP_SUPPORTED;
11271         }
11272         p->ops_len = i;
11273
11274         ret = 0;
11275         if (copy_to_user(arg, p, size))
11276                 ret = -EFAULT;
11277 out:
11278         kfree(p);
11279         return ret;
11280 }
11281
11282 static int io_register_personality(struct io_ring_ctx *ctx)
11283 {
11284         const struct cred *creds;
11285         u32 id;
11286         int ret;
11287
11288         creds = get_current_cred();
11289
11290         ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
11291                         XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
11292         if (ret < 0) {
11293                 put_cred(creds);
11294                 return ret;
11295         }
11296         return id;
11297 }
11298
11299 static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
11300                                            void __user *arg, unsigned int nr_args)
11301 {
11302         struct io_uring_restriction *res;
11303         size_t size;
11304         int i, ret;
11305
11306         /* Restrictions allowed only if rings started disabled */
11307         if (!(ctx->flags & IORING_SETUP_R_DISABLED))
11308                 return -EBADFD;
11309
11310         /* We allow only a single restrictions registration */
11311         if (ctx->restrictions.registered)
11312                 return -EBUSY;
11313
11314         if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
11315                 return -EINVAL;
11316
11317         size = array_size(nr_args, sizeof(*res));
11318         if (size == SIZE_MAX)
11319                 return -EOVERFLOW;
11320
11321         res = memdup_user(arg, size);
11322         if (IS_ERR(res))
11323                 return PTR_ERR(res);
11324
11325         ret = 0;
11326
11327         for (i = 0; i < nr_args; i++) {
11328                 switch (res[i].opcode) {
11329                 case IORING_RESTRICTION_REGISTER_OP:
11330                         if (res[i].register_op >= IORING_REGISTER_LAST) {
11331                                 ret = -EINVAL;
11332                                 goto out;
11333                         }
11334
11335                         __set_bit(res[i].register_op,
11336                                   ctx->restrictions.register_op);
11337                         break;
11338                 case IORING_RESTRICTION_SQE_OP:
11339                         if (res[i].sqe_op >= IORING_OP_LAST) {
11340                                 ret = -EINVAL;
11341                                 goto out;
11342                         }
11343
11344                         __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
11345                         break;
11346                 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
11347                         ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
11348                         break;
11349                 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
11350                         ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
11351                         break;
11352                 default:
11353                         ret = -EINVAL;
11354                         goto out;
11355                 }
11356         }
11357
11358 out:
11359         /* Reset all restrictions if an error happened */
11360         if (ret != 0)
11361                 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
11362         else
11363                 ctx->restrictions.registered = true;
11364
11365         kfree(res);
11366         return ret;
11367 }
11368
11369 static int io_register_enable_rings(struct io_ring_ctx *ctx)
11370 {
11371         if (!(ctx->flags & IORING_SETUP_R_DISABLED))
11372                 return -EBADFD;
11373
11374         if (ctx->restrictions.registered)
11375                 ctx->restricted = 1;
11376
11377         ctx->flags &= ~IORING_SETUP_R_DISABLED;
11378         if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
11379                 wake_up(&ctx->sq_data->wait);
11380         return 0;
11381 }
11382
11383 static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
11384                                      struct io_uring_rsrc_update2 *up,
11385                                      unsigned nr_args)
11386 {
11387         __u32 tmp;
11388         int err;
11389
11390         if (check_add_overflow(up->offset, nr_args, &tmp))
11391                 return -EOVERFLOW;
11392         err = io_rsrc_node_switch_start(ctx);
11393         if (err)
11394                 return err;
11395
11396         switch (type) {
11397         case IORING_RSRC_FILE:
11398                 return __io_sqe_files_update(ctx, up, nr_args);
11399         case IORING_RSRC_BUFFER:
11400                 return __io_sqe_buffers_update(ctx, up, nr_args);
11401         }
11402         return -EINVAL;
11403 }
11404
11405 static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
11406                                     unsigned nr_args)
11407 {
11408         struct io_uring_rsrc_update2 up;
11409
11410         if (!nr_args)
11411                 return -EINVAL;
11412         memset(&up, 0, sizeof(up));
11413         if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
11414                 return -EFAULT;
11415         if (up.resv || up.resv2)
11416                 return -EINVAL;
11417         return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
11418 }
11419
11420 static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
11421                                    unsigned size, unsigned type)
11422 {
11423         struct io_uring_rsrc_update2 up;
11424
11425         if (size != sizeof(up))
11426                 return -EINVAL;
11427         if (copy_from_user(&up, arg, sizeof(up)))
11428                 return -EFAULT;
11429         if (!up.nr || up.resv || up.resv2)
11430                 return -EINVAL;
11431         return __io_register_rsrc_update(ctx, type, &up, up.nr);
11432 }
11433
11434 static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
11435                             unsigned int size, unsigned int type)
11436 {
11437         struct io_uring_rsrc_register rr;
11438
11439         /* keep it extendible */
11440         if (size != sizeof(rr))
11441                 return -EINVAL;
11442
11443         memset(&rr, 0, sizeof(rr));
11444         if (copy_from_user(&rr, arg, size))
11445                 return -EFAULT;
11446         if (!rr.nr || rr.resv || rr.resv2)
11447                 return -EINVAL;
11448
11449         switch (type) {
11450         case IORING_RSRC_FILE:
11451                 return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
11452                                              rr.nr, u64_to_user_ptr(rr.tags));
11453         case IORING_RSRC_BUFFER:
11454                 return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
11455                                                rr.nr, u64_to_user_ptr(rr.tags));
11456         }
11457         return -EINVAL;
11458 }
11459
11460 static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
11461                                        void __user *arg, unsigned len)
11462 {
11463         struct io_uring_task *tctx = current->io_uring;
11464         cpumask_var_t new_mask;
11465         int ret;
11466
11467         if (!tctx || !tctx->io_wq)
11468                 return -EINVAL;
11469
11470         if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
11471                 return -ENOMEM;
11472
11473         cpumask_clear(new_mask);
11474         if (len > cpumask_size())
11475                 len = cpumask_size();
11476
11477         if (in_compat_syscall()) {
11478                 ret = compat_get_bitmap(cpumask_bits(new_mask),
11479                                         (const compat_ulong_t __user *)arg,
11480                                         len * 8 /* CHAR_BIT */);
11481         } else {
11482                 ret = copy_from_user(new_mask, arg, len);
11483         }
11484
11485         if (ret) {
11486                 free_cpumask_var(new_mask);
11487                 return -EFAULT;
11488         }
11489
11490         ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
11491         free_cpumask_var(new_mask);
11492         return ret;
11493 }
11494
11495 static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
11496 {
11497         struct io_uring_task *tctx = current->io_uring;
11498
11499         if (!tctx || !tctx->io_wq)
11500                 return -EINVAL;
11501
11502         return io_wq_cpu_affinity(tctx->io_wq, NULL);
11503 }
11504
11505 static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
11506                                                void __user *arg)
11507         __must_hold(&ctx->uring_lock)
11508 {
11509         struct io_tctx_node *node;
11510         struct io_uring_task *tctx = NULL;
11511         struct io_sq_data *sqd = NULL;
11512         __u32 new_count[2];
11513         int i, ret;
11514
11515         if (copy_from_user(new_count, arg, sizeof(new_count)))
11516                 return -EFAULT;
11517         for (i = 0; i < ARRAY_SIZE(new_count); i++)
11518                 if (new_count[i] > INT_MAX)
11519                         return -EINVAL;
11520
11521         if (ctx->flags & IORING_SETUP_SQPOLL) {
11522                 sqd = ctx->sq_data;
11523                 if (sqd) {
11524                         /*
11525                          * Observe the correct sqd->lock -> ctx->uring_lock
11526                          * ordering. Fine to drop uring_lock here, we hold
11527                          * a ref to the ctx.
11528                          */
11529                         refcount_inc(&sqd->refs);
11530                         mutex_unlock(&ctx->uring_lock);
11531                         mutex_lock(&sqd->lock);
11532                         mutex_lock(&ctx->uring_lock);
11533                         if (sqd->thread)
11534                                 tctx = sqd->thread->io_uring;
11535                 }
11536         } else {
11537                 tctx = current->io_uring;
11538         }
11539
11540         BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
11541
11542         for (i = 0; i < ARRAY_SIZE(new_count); i++)
11543                 if (new_count[i])
11544                         ctx->iowq_limits[i] = new_count[i];
11545         ctx->iowq_limits_set = true;
11546
11547         if (tctx && tctx->io_wq) {
11548                 ret = io_wq_max_workers(tctx->io_wq, new_count);
11549                 if (ret)
11550                         goto err;
11551         } else {
11552                 memset(new_count, 0, sizeof(new_count));
11553         }
11554
11555         if (sqd) {
11556                 mutex_unlock(&sqd->lock);
11557                 io_put_sq_data(sqd);
11558         }
11559
11560         if (copy_to_user(arg, new_count, sizeof(new_count)))
11561                 return -EFAULT;
11562
11563         /* that's it for SQPOLL, only the SQPOLL task creates requests */
11564         if (sqd)
11565                 return 0;
11566
11567         /* now propagate the restriction to all registered users */
11568         list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
11569                 struct io_uring_task *tctx = node->task->io_uring;
11570
11571                 if (WARN_ON_ONCE(!tctx->io_wq))
11572                         continue;
11573
11574                 for (i = 0; i < ARRAY_SIZE(new_count); i++)
11575                         new_count[i] = ctx->iowq_limits[i];
11576                 /* ignore errors, it always returns zero anyway */
11577                 (void)io_wq_max_workers(tctx->io_wq, new_count);
11578         }
11579         return 0;
11580 err:
11581         if (sqd) {
11582                 mutex_unlock(&sqd->lock);
11583                 io_put_sq_data(sqd);
11584         }
11585         return ret;
11586 }
11587
11588 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
11589                                void __user *arg, unsigned nr_args)
11590         __releases(ctx->uring_lock)
11591         __acquires(ctx->uring_lock)
11592 {
11593         int ret;
11594
11595         /*
11596          * We're inside the ring mutex, if the ref is already dying, then
11597          * someone else killed the ctx or is already going through
11598          * io_uring_register().
11599          */
11600         if (percpu_ref_is_dying(&ctx->refs))
11601                 return -ENXIO;
11602
11603         if (ctx->restricted) {
11604                 if (opcode >= IORING_REGISTER_LAST)
11605                         return -EINVAL;
11606                 opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
11607                 if (!test_bit(opcode, ctx->restrictions.register_op))
11608                         return -EACCES;
11609         }
11610
11611         switch (opcode) {
11612         case IORING_REGISTER_BUFFERS:
11613                 ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
11614                 break;
11615         case IORING_UNREGISTER_BUFFERS:
11616                 ret = -EINVAL;
11617                 if (arg || nr_args)
11618                         break;
11619                 ret = io_sqe_buffers_unregister(ctx);
11620                 break;
11621         case IORING_REGISTER_FILES:
11622                 ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
11623                 break;
11624         case IORING_UNREGISTER_FILES:
11625                 ret = -EINVAL;
11626                 if (arg || nr_args)
11627                         break;
11628                 ret = io_sqe_files_unregister(ctx);
11629                 break;
11630         case IORING_REGISTER_FILES_UPDATE:
11631                 ret = io_register_files_update(ctx, arg, nr_args);
11632                 break;
11633         case IORING_REGISTER_EVENTFD:
11634                 ret = -EINVAL;
11635                 if (nr_args != 1)
11636                         break;
11637                 ret = io_eventfd_register(ctx, arg, 0);
11638                 break;
11639         case IORING_REGISTER_EVENTFD_ASYNC:
11640                 ret = -EINVAL;
11641                 if (nr_args != 1)
11642                         break;
11643                 ret = io_eventfd_register(ctx, arg, 1);
11644                 break;
11645         case IORING_UNREGISTER_EVENTFD:
11646                 ret = -EINVAL;
11647                 if (arg || nr_args)
11648                         break;
11649                 ret = io_eventfd_unregister(ctx);
11650                 break;
11651         case IORING_REGISTER_PROBE:
11652                 ret = -EINVAL;
11653                 if (!arg || nr_args > 256)
11654                         break;
11655                 ret = io_probe(ctx, arg, nr_args);
11656                 break;
11657         case IORING_REGISTER_PERSONALITY:
11658                 ret = -EINVAL;
11659                 if (arg || nr_args)
11660                         break;
11661                 ret = io_register_personality(ctx);
11662                 break;
11663         case IORING_UNREGISTER_PERSONALITY:
11664                 ret = -EINVAL;
11665                 if (arg)
11666                         break;
11667                 ret = io_unregister_personality(ctx, nr_args);
11668                 break;
11669         case IORING_REGISTER_ENABLE_RINGS:
11670                 ret = -EINVAL;
11671                 if (arg || nr_args)
11672                         break;
11673                 ret = io_register_enable_rings(ctx);
11674                 break;
11675         case IORING_REGISTER_RESTRICTIONS:
11676                 ret = io_register_restrictions(ctx, arg, nr_args);
11677                 break;
11678         case IORING_REGISTER_FILES2:
11679                 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
11680                 break;
11681         case IORING_REGISTER_FILES_UPDATE2:
11682                 ret = io_register_rsrc_update(ctx, arg, nr_args,
11683                                               IORING_RSRC_FILE);
11684                 break;
11685         case IORING_REGISTER_BUFFERS2:
11686                 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
11687                 break;
11688         case IORING_REGISTER_BUFFERS_UPDATE:
11689                 ret = io_register_rsrc_update(ctx, arg, nr_args,
11690                                               IORING_RSRC_BUFFER);
11691                 break;
11692         case IORING_REGISTER_IOWQ_AFF:
11693                 ret = -EINVAL;
11694                 if (!arg || !nr_args)
11695                         break;
11696                 ret = io_register_iowq_aff(ctx, arg, nr_args);
11697                 break;
11698         case IORING_UNREGISTER_IOWQ_AFF:
11699                 ret = -EINVAL;
11700                 if (arg || nr_args)
11701                         break;
11702                 ret = io_unregister_iowq_aff(ctx);
11703                 break;
11704         case IORING_REGISTER_IOWQ_MAX_WORKERS:
11705                 ret = -EINVAL;
11706                 if (!arg || nr_args != 2)
11707                         break;
11708                 ret = io_register_iowq_max_workers(ctx, arg);
11709                 break;
11710         case IORING_REGISTER_RING_FDS:
11711                 ret = io_ringfd_register(ctx, arg, nr_args);
11712                 break;
11713         case IORING_UNREGISTER_RING_FDS:
11714                 ret = io_ringfd_unregister(ctx, arg, nr_args);
11715                 break;
11716         default:
11717                 ret = -EINVAL;
11718                 break;
11719         }
11720
11721         return ret;
11722 }
11723
11724 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
11725                 void __user *, arg, unsigned int, nr_args)
11726 {
11727         struct io_ring_ctx *ctx;
11728         long ret = -EBADF;
11729         struct fd f;
11730
11731         f = fdget(fd);
11732         if (!f.file)
11733                 return -EBADF;
11734
11735         ret = -EOPNOTSUPP;
11736         if (f.file->f_op != &io_uring_fops)
11737                 goto out_fput;
11738
11739         ctx = f.file->private_data;
11740
11741         io_run_task_work();
11742
11743         mutex_lock(&ctx->uring_lock);
11744         ret = __io_uring_register(ctx, opcode, arg, nr_args);
11745         mutex_unlock(&ctx->uring_lock);
11746         trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
11747 out_fput:
11748         fdput(f);
11749         return ret;
11750 }
11751
11752 static int __init io_uring_init(void)
11753 {
11754 #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
11755         BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
11756         BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
11757 } while (0)
11758
11759 #define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
11760         __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
11761         BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
11762         BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
11763         BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
11764         BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
11765         BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
11766         BUILD_BUG_SQE_ELEM(8,  __u64,  off);
11767         BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
11768         BUILD_BUG_SQE_ELEM(16, __u64,  addr);
11769         BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
11770         BUILD_BUG_SQE_ELEM(24, __u32,  len);
11771         BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
11772         BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
11773         BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
11774         BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
11775         BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
11776         BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
11777         BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
11778         BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
11779         BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
11780         BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
11781         BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
11782         BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
11783         BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
11784         BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
11785         BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
11786         BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
11787         BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
11788         BUILD_BUG_SQE_ELEM(40, __u16,  buf_group);
11789         BUILD_BUG_SQE_ELEM(42, __u16,  personality);
11790         BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
11791         BUILD_BUG_SQE_ELEM(44, __u32,  file_index);
11792
11793         BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
11794                      sizeof(struct io_uring_rsrc_update));
11795         BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
11796                      sizeof(struct io_uring_rsrc_update2));
11797
11798         /* ->buf_index is u16 */
11799         BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
11800
11801         /* should fit into one byte */
11802         BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
11803         BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
11804         BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
11805
11806         BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
11807         BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
11808
11809         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
11810                                 SLAB_ACCOUNT);
11811         return 0;
11812 };
11813 __initcall(io_uring_init);